inprogress
[jalview.git] / forester / java / src / org / forester / io / parsers / nexus / NexusPhylogeniesParser.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // All rights reserved
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 //
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
25
26 package org.forester.io.parsers.nexus;
27
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.forester.archaeopteryx.Constants;
import org.forester.io.parsers.IteratingPhylogenyParser;
import org.forester.io.parsers.PhylogenyParser;
import org.forester.io.parsers.nhx.NHXFormatException;
import org.forester.io.parsers.nhx.NHXParser;
import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
import org.forester.io.parsers.util.ParserUtils;
import org.forester.io.parsers.util.PhylogenyParserException;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.PhylogenyNode;
import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
import org.forester.util.ForesterUtil;
50
51 public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, PhylogenyParser {
52
53     final private static String  begin_trees               = NexusConstants.BEGIN_TREES.toLowerCase();
54     final private static String  taxlabels                 = NexusConstants.TAXLABELS.toLowerCase();
55     final private static String  translate                 = NexusConstants.TRANSLATE.toLowerCase();
56     final private static String  tree                      = NexusConstants.TREE.toLowerCase();
57     final private static String  utree                     = NexusConstants.UTREE.toLowerCase();
58     final private static String  end                       = NexusConstants.END.toLowerCase();
59     final private static String  endblock                  = "endblock";
60     final private static Pattern TREE_NAME_PATTERN         = Pattern.compile( "\\s*.?Tree\\s+(.+?)\\s*=.+",
61                                                                               Pattern.CASE_INSENSITIVE );
62     final private static Pattern ROOTEDNESS_PATTERN        = Pattern.compile( ".+=\\s*\\[&([R|U])\\].*" );
63     private Object               _nexus_source;
64     private List<String>         _taxlabels;
65     private Map<String, String>  _translate_map;
66     private boolean              _replace_underscores      = NHXParser.REPLACE_UNDERSCORES_DEFAULT;
67     private boolean              _ignore_quotes_in_nh_data = Constants.NH_PARSING_IGNORE_QUOTES_DEFAULT;
68     private TAXONOMY_EXTRACTION  _taxonomy_extraction      = NHXParser.TAXONOMY_EXTRACTION_DEFAULT;
69     private Phylogeny            _next;
70     private BufferedReader       _br;
71     private boolean              _in_trees_block;
72     private StringBuilder        _nh;
73     private String               _name;
74     private StringBuilder        _translate_sb;
75     private boolean              _in_taxalabels;
76     private boolean              _in_translate;
77     private boolean              _is_rooted;
78     private boolean              _rooted_info_present;
79     private boolean              _in_tree;
80
81     @Override
82     public final boolean hasNext() {
83         return _next != null;
84     }
85
86     @Override
87     public final Phylogeny next() throws NHXFormatException, IOException {
88         final Phylogeny phy = _next;
89         getNext();
90         return phy;
91     }
92
93     @Override
94     public final Phylogeny[] parse() throws IOException {
95         reset();
96         final List<Phylogeny> l = new ArrayList<Phylogeny>();
97         while ( hasNext() ) {
98             l.add( next() );
99         }
100         final Phylogeny[] p = new Phylogeny[ l.size() ];
101         for( int i = 0; i < l.size(); ++i ) {
102             p[ i ] = l.get( i );
103         }
104         return p;
105     }
106
107     @Override
108     public final void reset() throws FileNotFoundException, IOException {
109         _taxlabels = new ArrayList<String>();
110         _translate_map = new HashMap<String, String>();
111         _nh = new StringBuilder();
112         _name = "";
113         _translate_sb = new StringBuilder();
114         _next = null;
115         _in_trees_block = false;
116         _in_taxalabels = false;
117         _in_translate = false;
118         _in_tree = false;
119         _rooted_info_present = false;
120         _is_rooted = false;
121         _br = ParserUtils.createReader( _nexus_source );
122         getNext();
123     }
124
125     public final void setIgnoreQuotes( final boolean ignore_quotes_in_nh_data ) {
126         _ignore_quotes_in_nh_data = ignore_quotes_in_nh_data;
127     }
128
129     public final void setReplaceUnderscores( final boolean replace_underscores ) {
130         _replace_underscores = replace_underscores;
131     }
132
133     @Override
134     public final void setSource( final Object nexus_source ) throws PhylogenyParserException, IOException {
135         if ( nexus_source == null ) {
136             throw new PhylogenyParserException( "attempt to parse null object" );
137         }
138         _nexus_source = nexus_source;
139         reset();
140     }
141
142     public final void setTaxonomyExtraction( final TAXONOMY_EXTRACTION taxonomy_extraction ) {
143         _taxonomy_extraction = taxonomy_extraction;
144     }
145
146     private final void createPhylogeny( final String name,
147                                         final StringBuilder nhx,
148                                         final boolean rooted_info_present,
149                                         final boolean is_rooted ) throws IOException {
150         _next = null;
151         final NHXParser pars = new NHXParser();
152         if ( ( _taxlabels.size() < 1 ) && ( _translate_map.size() < 1 ) ) {
153             pars.setTaxonomyExtraction( _taxonomy_extraction );
154             pars.setReplaceUnderscores( _replace_underscores );
155             pars.setIgnoreQuotes( _ignore_quotes_in_nh_data );
156         }
157         else {
158             pars.setTaxonomyExtraction( TAXONOMY_EXTRACTION.NO );
159             pars.setReplaceUnderscores( false );
160             pars.setIgnoreQuotes( false );
161         }
162         if ( rooted_info_present ) {
163             pars.setGuessRootedness( false );
164         }
165         pars.setSource( nhx );
166         final Phylogeny p = pars.next();
167         if ( p == null ) {
168             throw new PhylogenyParserException( "failed to create phylogeny" );
169         }
170         p.setName( name );
171         if ( rooted_info_present ) {
172             p.setRooted( is_rooted );
173         }
174         if ( ( _taxlabels.size() > 0 ) || ( _translate_map.size() > 0 ) ) {
175             final PhylogenyNodeIterator it = p.iteratorExternalForward();
176             while ( it.hasNext() ) {
177                 final PhylogenyNode node = it.next();
178                 if ( ( _translate_map.size() > 0 ) && _translate_map.containsKey( node.getName() ) ) {
179                     node.setName( _translate_map.get( node.getName() ).replaceAll( "['\"]+", "" ) );
180                 }
181                 else if ( _taxlabels.size() > 0 ) {
182                     int i = -1;
183                     try {
184                         i = Integer.parseInt( node.getName() );
185                     }
186                     catch ( final NumberFormatException e ) {
187                         // Ignore.
188                     }
189                     if ( i > 0 ) {
190                         node.setName( _taxlabels.get( i - 1 ).replaceAll( "['\"]+", "" ) );
191                     }
192                 }
193                 if ( !_replace_underscores && ( ( _taxonomy_extraction != TAXONOMY_EXTRACTION.NO ) ) ) {
194                     ParserUtils.extractTaxonomyDataFromNodeName( node, _taxonomy_extraction );
195                     //                    final String tax = ParserUtils.extractTaxonomyCodeFromNodeName( node.getName(),
196                     //                                                                                    getTaxonomyExtraction() );
197                     //                    if ( !ForesterUtil.isEmpty( tax ) ) {
198                     //                        if ( !node.getNodeData().isHasTaxonomy() ) {
199                     //                            node.getNodeData().setTaxonomy( new Taxonomy() );
200                     //                        }
201                     //                        node.getNodeData().getTaxonomy().setTaxonomyCode( tax );
202                     //                    }
203                 }
204             }
205         }
206         _next = p;
207     }
208
209     private final void getNext() throws IOException, NHXFormatException {
210         _next = null;
211         String line;
212         while ( ( line = _br.readLine() ) != null ) {
213             line = line.trim();
214             if ( ( line.length() > 0 ) && !line.startsWith( "#" ) && !line.startsWith( ">" ) ) {
215                 line = ForesterUtil.collapseWhiteSpace( line );
216                 line = removeWhiteSpaceBeforeSemicolon( line );
217                 final String line_lc = line.toLowerCase();
218                 if ( line_lc.startsWith( begin_trees ) ) {
219                     _in_trees_block = true;
220                     _in_taxalabels = false;
221                     _in_translate = false;
222                 }
223                 else if ( line_lc.startsWith( taxlabels ) ) {
224                     _in_trees_block = false;
225                     _in_taxalabels = true;
226                     _in_translate = false;
227                 }
228                 else if ( line_lc.startsWith( translate ) ) {
229                     _in_taxalabels = false;
230                     _in_translate = true;
231                 }
232                 else if ( _in_trees_block ) {
233                     //FIXME TODO need to work on this "title" and "link"
234                     if ( line_lc.startsWith( "title" ) || line_lc.startsWith( "link" ) ) {
235                         // Do nothing.
236                     }
237                     else if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
238                         _in_trees_block = false;
239                         _in_tree = false;
240                         _in_translate = false;
241                         if ( _nh.length() > 0 ) {
242                             createPhylogeny( _name, _nh, _rooted_info_present, _is_rooted );
243                             _nh = new StringBuilder();
244                             _name = "";
245                             _rooted_info_present = false;
246                             _is_rooted = false;
247                             if ( _next != null ) {
248                                 return;
249                             }
250                         }
251                     }
252                     else if ( line_lc.startsWith( tree ) || ( line_lc.startsWith( utree ) ) ) {
253                         boolean might = false;
254                         if ( _nh.length() > 0 ) {
255                             might = true;
256                             createPhylogeny( _name, _nh, _rooted_info_present, _is_rooted );
257                             _nh = new StringBuilder();
258                             _name = "";
259                             _rooted_info_present = false;
260                             _is_rooted = false;
261                         }
262                         _in_tree = true;
263                         _nh.append( line.substring( line.indexOf( '=' ) ) );
264                         final Matcher name_matcher = TREE_NAME_PATTERN.matcher( line );
265                         if ( name_matcher.matches() ) {
266                             _name = name_matcher.group( 1 );
267                             _name = _name.replaceAll( "['\"]+", "" );
268                         }
269                         final Matcher rootedness_matcher = ROOTEDNESS_PATTERN.matcher( line );
270                         if ( rootedness_matcher.matches() ) {
271                             final String s = rootedness_matcher.group( 1 );
272                             line = line.replaceAll( "\\[\\&.\\]", "" );
273                             _rooted_info_present = true;
274                             if ( s.toUpperCase().equals( "R" ) ) {
275                                 _is_rooted = true;
276                             }
277                         }
278                         if ( might && ( _next != null ) ) {
279                             return;
280                         }
281                     }
282                     else if ( _in_tree && !_in_translate ) {
283                         _nh.append( line );
284                     }
285                     if ( !line_lc.startsWith( "title" ) && !line_lc.startsWith( "link" ) && !_in_translate
286                             && !line_lc.startsWith( end ) && !line_lc.startsWith( endblock ) && line_lc.endsWith( ";" ) ) {
287                         _in_tree = false;
288                         _in_translate = false;
289                         createPhylogeny( _name, _nh, _rooted_info_present, _is_rooted );
290                         _nh = new StringBuilder();
291                         _name = "";
292                         _rooted_info_present = false;
293                         _is_rooted = false;
294                         if ( _next != null ) {
295                             return;
296                         }
297                     }
298                 }
299                 if ( _in_taxalabels ) {
300                     if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
301                         _in_taxalabels = false;
302                     }
303                     else {
304                         final String[] labels = line.split( "\\s+" );
305                         for( String label : labels ) {
306                             if ( !label.toLowerCase().equals( taxlabels ) ) {
307                                 if ( label.endsWith( ";" ) ) {
308                                     _in_taxalabels = false;
309                                     label = label.substring( 0, label.length() - 1 );
310                                 }
311                                 if ( label.length() > 0 ) {
312                                     _taxlabels.add( label );
313                                 }
314                             }
315                         }
316                     }
317                 }
318                 if ( _in_translate ) {
319                     if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
320                         _in_translate = false;
321                     }
322                     else {
323                         _translate_sb.append( " " );
324                         _translate_sb.append( line.trim() );
325                         if ( line.endsWith( ";" ) ) {
326                             _in_translate = false;
327                             setTranslateKeyValuePairs( _translate_sb );
328                         }
329                     }
330                 }
331             }
332         }
333         if ( _nh.length() > 0 ) {
334             createPhylogeny( _name, _nh, _rooted_info_present, _is_rooted );
335             if ( _next != null ) {
336                 return;
337             }
338         }
339     }
340
341     private final void setTranslateKeyValuePairs( final StringBuilder translate_sb ) throws IOException {
342         String s = translate_sb.toString().trim();
343         if ( s.endsWith( ";" ) ) {
344             s = s.substring( 0, s.length() - 1 ).trim();
345         }
346         for( final String pair : s.split( "," ) ) {
347             final String[] kv = pair.trim().split( "\\s+" );
348             if ( ( kv.length < 2 ) || ( kv.length > 3 ) ) {
349                 throw new IOException( "ill-formatted translate values: " + translate_sb );
350             }
351             if ( ( kv.length == 3 ) && !kv[ 0 ].toLowerCase().trim().equals( translate ) ) {
352                 throw new IOException( "ill-formatted translate values: " + translate_sb );
353             }
354             String key = "";
355             String value = "";
356             if ( kv.length == 3 ) {
357                 key = kv[ 1 ];
358                 value = kv[ 2 ];
359             }
360             else {
361                 key = kv[ 0 ];
362                 value = kv[ 1 ];
363             }
364             if ( value.endsWith( ";" ) ) {
365                 value = value.substring( 0, value.length() - 1 );
366             }
367             _translate_map.put( key, value );
368         }
369     }
370
371     private final static String removeWhiteSpaceBeforeSemicolon( final String s ) {
372         return s.replaceAll( "\\s+;", ";" );
373     }
374
375     @Override
376     public String getName() {
377         return "Nexus Phylogenies Parser";
378     }
379 }