in progress
[jalview.git] / forester / java / src / org / forester / io / parsers / nexus / NexusPhylogeniesParser.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // All rights reserved
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 //
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
25
26 package org.forester.io.parsers.nexus;
27
28 import java.io.BufferedReader;
29 import java.io.FileNotFoundException;
30 import java.io.IOException;
31 import java.util.ArrayList;
32 import java.util.HashMap;
33 import java.util.List;
34 import java.util.Map;
35 import java.util.regex.Matcher;
36 import java.util.regex.Pattern;
37
38 import org.forester.archaeopteryx.AptxConstants;
39 import org.forester.io.parsers.IteratingPhylogenyParser;
40 import org.forester.io.parsers.PhylogenyParser;
41 import org.forester.io.parsers.nhx.NHXFormatException;
42 import org.forester.io.parsers.nhx.NHXParser;
43 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
44 import org.forester.io.parsers.util.ParserUtils;
45 import org.forester.io.parsers.util.PhylogenyParserException;
46 import org.forester.phylogeny.Phylogeny;
47 import org.forester.phylogeny.PhylogenyNode;
48 import org.forester.phylogeny.data.Sequence;
49 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
50 import org.forester.sequence.BasicSequence;
51 import org.forester.sequence.MolecularSequence;
52 import org.forester.util.ForesterConstants;
53 import org.forester.util.ForesterUtil;
54
55 public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, PhylogenyParser {
56
57    
58     final private static boolean DEBUG                               = false;
59     
60     final private static String            begin_trees               = NexusConstants.BEGIN_TREES.toLowerCase();
61     final private static String            end                       = NexusConstants.END.toLowerCase();
62     final private static String            endblock                  = "endblock";
63     final private static Pattern           ROOTEDNESS_PATTERN        = Pattern.compile( ".+=\\s*\\[&([R|U])\\].*" );
64     final private static String            taxlabels                 = NexusConstants.TAXLABELS.toLowerCase();
65     final private static Pattern           TITLE_PATTERN             = Pattern.compile( "TITLE.?\\s+([^;]+)",
66                                                                                         Pattern.CASE_INSENSITIVE );
67     final private static String            translate                 = NexusConstants.TRANSLATE.toLowerCase();
68     final private static String            data                      = NexusConstants.BEGIN_CHARACTERS.toLowerCase();
69     final private static String            characters                = NexusConstants.BEGIN_DATA.toLowerCase();
70     final private static String            tree                      = NexusConstants.TREE.toLowerCase();
71     final private static Pattern           TREE_NAME_PATTERN         = Pattern.compile( "\\s*.?Tree\\s+(.+?)\\s*=.+",
72                                                                                         Pattern.CASE_INSENSITIVE );
73     final private static Pattern           TRANSLATE_PATTERN         = Pattern.compile( "([0-9A-Za-z]+)\\s+(.+)" );
74     final private static Pattern           ALN_PATTERN               = Pattern.compile( "(.+)\\s+([A-Za-z-_\\*\\?]+)" );
75     final private static Pattern           DATATYPE_PATTERN          = Pattern.compile( "datatype\\s?.\\s?([a-z]+)" );
76     //final private static Pattern           LINK_TAXA_PATTERN         = Pattern.compile( "link\\s+taxa\\s?.\\s?([^;]+)",
77     //                                                                                    Pattern.CASE_INSENSITIVE );
78     final private static String            utree                     = NexusConstants.UTREE.toLowerCase();
79     private BufferedReader                 _br;
80     private boolean                        _ignore_quotes_in_nh_data = AptxConstants.NH_PARSING_IGNORE_QUOTES_DEFAULT;
81     private boolean                        _in_taxalabels;
82     private boolean                        _in_translate;
83     private boolean                        _in_tree;
84     private boolean                        _in_trees_block;
85     private boolean                        _in_data_block;
86     private boolean                        _is_rooted;
87     private String                         _datatype;
88     private String                         _name;
89     private Phylogeny                      _next;
90     private Object                         _nexus_source;
91     private StringBuilder                  _nh;
92     private boolean                        _replace_underscores      = NHXParser.REPLACE_UNDERSCORES_DEFAULT;
93     private boolean                        _rooted_info_present;
94     private List<String>                   _taxlabels;
95     private TAXONOMY_EXTRACTION            _taxonomy_extraction      = TAXONOMY_EXTRACTION.NO;
96     private String                         _title;
97     private Map<String, String>            _translate_map;
98     private StringBuilder                  _translate_sb;
99     private Map<String, MolecularSequence> _seqs;
100     private final boolean                  _add_sequences            = true;
101     private boolean                       _parse_beast_style_extended_tags           = false;
102            
103
104     @Override
105     public String getName() {
106         return "Nexus Phylogenies Parser";
107     }
108
109     @Override
110     public final boolean hasNext() {
111         return _next != null;
112     }
113
114     @Override
115     public final Phylogeny next() throws NHXFormatException, IOException {
116         final Phylogeny phy = _next;
117         getNext();
118         return phy;
119     }
120
121     @Override
122     public final Phylogeny[] parse() throws IOException {
123         final List<Phylogeny> l = new ArrayList<Phylogeny>();
124         while ( hasNext() ) {
125             l.add( next() );
126         }
127         final Phylogeny[] p = new Phylogeny[ l.size() ];
128         for( int i = 0; i < l.size(); ++i ) {
129             p[ i ] = l.get( i );
130         }
131         reset();
132         return p;
133     }
134
135     @Override
136     public final void reset() throws FileNotFoundException, IOException {
137         _taxlabels = new ArrayList<String>();
138         _translate_map = new HashMap<String, String>();
139         _nh = new StringBuilder();
140         _name = "";
141         _title = "";
142         _translate_sb = null;
143         _next = null;
144         _in_trees_block = false;
145         _in_taxalabels = false;
146         _in_translate = false;
147         _in_tree = false;
148         _rooted_info_present = false;
149         _is_rooted = false;
150         _seqs = new HashMap<String, MolecularSequence>();
151         _br = ParserUtils.createReader( _nexus_source, ForesterConstants.UTF_8 );
152         getNext();
153     }
154
155     public final void setIgnoreQuotes( final boolean ignore_quotes_in_nh_data ) {
156         _ignore_quotes_in_nh_data = ignore_quotes_in_nh_data;
157     }
158
159     public final void setReplaceUnderscores( final boolean replace_underscores ) {
160         _replace_underscores = replace_underscores;
161     }
162
163     @Override
164     public final void setSource( final Object nexus_source ) throws PhylogenyParserException, IOException {
165         if ( nexus_source == null ) {
166             throw new PhylogenyParserException( "attempt to parse null object" );
167         }
168         _nexus_source = nexus_source;
169         reset();
170     }
171
172     public final void setTaxonomyExtraction( final TAXONOMY_EXTRACTION taxonomy_extraction ) {
173         _taxonomy_extraction = taxonomy_extraction;
174     }
175
176     private final void createPhylogeny( final String title,
177                                         final String name,
178                                         final StringBuilder nhx,
179                                         final boolean rooted_info_present,
180                                         final boolean is_rooted ) throws IOException {
181         _next = null;
182         final NHXParser pars = new NHXParser();
183         pars.setTaxonomyExtraction( _taxonomy_extraction );
184         pars.setReplaceUnderscores( _replace_underscores );
185         pars.setIgnoreQuotes( _ignore_quotes_in_nh_data );
186         pars.setParseBeastStyleExtendedTags( _parse_beast_style_extended_tags );
187         if ( rooted_info_present ) {
188             pars.setGuessRootedness( false );
189         }
190         pars.setSource( nhx.toString() );
191         final Phylogeny p = pars.next();
192         if ( p == null ) {
193             throw new PhylogenyParserException( "failed to create phylogeny" );
194         }
195         String myname = null;
196         if ( !ForesterUtil.isEmpty( title ) && !ForesterUtil.isEmpty( name ) ) {
197             myname = title.replace( '_', ' ' ).trim() + " (" + name.trim() + ")";
198         }
199         else if ( !ForesterUtil.isEmpty( title ) ) {
200             myname = title.replace( '_', ' ' ).trim();
201         }
202         else if ( !ForesterUtil.isEmpty( name ) ) {
203             myname = name.trim();
204         }
205         if ( !ForesterUtil.isEmpty( myname ) ) {
206             p.setName( myname );
207         }
208         if ( rooted_info_present ) {
209             p.setRooted( is_rooted );
210         }
211         if ( ( _taxlabels.size() > 0 ) || ( _translate_map.size() > 0 ) ) {
212             final PhylogenyNodeIterator it = p.iteratorExternalForward();
213             while ( it.hasNext() ) {
214                 final PhylogenyNode node = it.next();
215                 if ( ( _translate_map.size() > 0 ) && _translate_map.containsKey( node.getName() ) ) {
216                     node.setName( _translate_map.get( node.getName() ).replaceAll( "['\"]+", "" ) );
217                 }
218                 else if ( _taxlabels.size() > 0 ) {
219                     int i = -1;
220                     try {
221                         i = Integer.parseInt( node.getName() );
222                     }
223                     catch ( final NumberFormatException e ) {
224                         // Ignore.
225                     }
226                     if ( i > 0 ) {
227                         node.setName( _taxlabels.get( i - 1 ).replaceAll( "['\"]+", "" ) );
228                     }
229                 }
230                 if ( !_replace_underscores && ( ( _taxonomy_extraction != TAXONOMY_EXTRACTION.NO ) ) ) {
231                     ParserUtils.extractTaxonomyDataFromNodeName( node, _taxonomy_extraction );
232                 }
233                 else if ( _replace_underscores ) {
234                     if ( !ForesterUtil.isEmpty( node.getName() ) ) {
235                         node.setName( node.getName().replace( '_', ' ' ).trim() );
236                     }
237                 }
238                 if ( _add_sequences ) {
239                     if ( _seqs.containsKey( node.getName() ) ) {
240                         final MolecularSequence s = _seqs.get( node.getName() );
241                         //TODO need to check for uniqueness when adding seqs....
242                         final Sequence ns = new Sequence( s );
243                         ns.setMolecularSequenceAligned( true ); //TODO need to check if all same length
244                         node.getNodeData().addSequence( ns );
245                     }
246                 }
247             }
248         }
249         _next = p;
250     }
251
252     private final void getNext() throws IOException, NHXFormatException {
253         _next = null;
254         String line;
255         while ( ( line = _br.readLine() ) != null ) {
256             if ( DEBUG ) {
257                 System.out.println( line );
258             }
259             line = line.trim();
260             if ( ( line.length() > 0 ) && !line.startsWith( "#" ) && !line.startsWith( ">" ) ) {
261                 line = ForesterUtil.collapseWhiteSpace( line );
262                 line = removeWhiteSpaceBeforeSemicolon( line );
263                 final String line_lc = line.toLowerCase();
264                 if ( line_lc.startsWith( begin_trees ) ) {
265                     _in_trees_block = true;
266                     _in_taxalabels = false;
267                     _in_translate = false;
268                     _in_data_block = false;
269                     _datatype = null;
270                     _title = "";
271                 }
272                 else if ( line_lc.startsWith( taxlabels ) ) {
273                     //TODO need to be taxa block instead
274                     _in_trees_block = false;
275                     _in_taxalabels = true;
276                     _in_translate = false;
277                     _in_data_block = false;
278                     _datatype = null;
279                 }
280                 else if ( line_lc.startsWith( translate ) ) {
281                     _translate_sb = new StringBuilder();
282                     _in_taxalabels = false;
283                     _in_translate = true;
284                     _in_data_block = false;
285                     _datatype = null;
286                 }
287                 else if ( line_lc.startsWith( characters ) || line_lc.startsWith( data ) ) {
288                     _in_taxalabels = false;
289                     _in_trees_block = false;
290                     _in_translate = false;
291                     _in_data_block = true;
292                     _datatype = null;
293                 }
294                 else if ( _in_trees_block ) {
295                     if ( line_lc.startsWith( "title" ) ) {
296                         final Matcher title_m = TITLE_PATTERN.matcher( line );
297                         if ( title_m.lookingAt() ) {
298                             _title = title_m.group( 1 );
299                         }
300                     }
301                     else if ( line_lc.startsWith( "link" ) ) {
302                         //final Matcher link_m = LINK_TAXA_PATTERN.matcher( line );
303                         //if ( link_m.lookingAt() ) {
304                             //final String link = link_m.group( 1 );  //TODO why?
305                        // }
306                     }
307                     else if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
308                         _in_trees_block = false;
309                         _in_tree = false;
310                         _in_translate = false;
311                         if ( _nh.length() > 0 ) {
312                             createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
313                             _nh = new StringBuilder();
314                             _name = "";
315                             _rooted_info_present = false;
316                             _is_rooted = false;
317                             if ( _next != null ) {
318                                 return;
319                             }
320                         }
321                     }
322                     else if ( line_lc.startsWith( tree ) || ( line_lc.startsWith( utree ) ) ) {
323                         boolean might = false;
324                         if ( _nh.length() > 0 ) {
325                             might = true;
326                             createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
327                             _nh = new StringBuilder();
328                             _name = "";
329                             _rooted_info_present = false;
330                             _is_rooted = false;
331                         }
332                         _in_tree = true;
333                         _nh.append( line.substring( line.indexOf( '=' ) ) );
334                         final Matcher name_matcher = TREE_NAME_PATTERN.matcher( line );
335                         if ( name_matcher.matches() ) {
336                             _name = name_matcher.group( 1 );
337                             _name = _name.replaceAll( "['\"]+", "" );
338                         }
339                         final Matcher rootedness_matcher = ROOTEDNESS_PATTERN.matcher( line );
340                         if ( rootedness_matcher.matches() ) {
341                             final String s = rootedness_matcher.group( 1 );
342                             line = line.replaceAll( "\\[\\&.\\]", "" );
343                             _rooted_info_present = true;
344                             if ( s.toUpperCase().equals( "R" ) ) {
345                                 _is_rooted = true;
346                             }
347                         }
348                         if ( might && ( _next != null ) ) {
349                             return;
350                         }
351                     }
352                     else if ( _in_tree && !_in_translate ) {
353                         _nh.append( line );
354                     }
355                     if ( !line_lc.startsWith( "title" ) && !line_lc.startsWith( "link" ) && !_in_translate
356                             && !line_lc.startsWith( end ) && !line_lc.startsWith( endblock ) && line_lc.endsWith( ";" ) ) {
357                         _in_tree = false;
358                         _in_translate = false;
359                         createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
360                         _nh = new StringBuilder();
361                         _name = "";
362                         _rooted_info_present = false;
363                         _is_rooted = false;
364                         if ( _next != null ) {
365                             return;
366                         }
367                     }
368                 }
369                 if ( _in_taxalabels ) {
370                     if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
371                         _in_taxalabels = false;
372                     }
373                     else {
374                         final String[] labels = line.split( "\\s+" );
375                         for( String label : labels ) {
376                             if ( !label.toLowerCase().equals( taxlabels ) ) {
377                                 if ( label.endsWith( ";" ) ) {
378                                     _in_taxalabels = false;
379                                     label = label.substring( 0, label.length() - 1 );
380                                 }
381                                 if ( label.length() > 0 ) {
382                                     _taxlabels.add( label );
383                                 }
384                             }
385                         }
386                     }
387                 }
388                 if ( _in_translate ) {
389                     if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
390                         _in_translate = false;
391                     }
392                     else {
393                         _translate_sb.append( " " );
394                         _translate_sb.append( line.trim() );
395                         if ( line.endsWith( ";" ) ) {
396                             _in_translate = false;
397                             setTranslateKeyValuePairs( _translate_sb );
398                         }
399                     }
400                 }
401                 if ( _in_data_block ) {
402                     if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
403                         _in_data_block = false;
404                         _datatype = null;
405                     }
406                     else if ( line_lc.startsWith( "link" ) ) {
407                      //   final Matcher link_m = LINK_TAXA_PATTERN.matcher( line );
408                      //   if ( link_m.lookingAt() ) {
409                      //       final String link = link_m.group( 1 );
410                      //   }
411                     }
412                     else {
413                         final Matcher datatype_matcher = DATATYPE_PATTERN.matcher( line_lc );
414                         if ( datatype_matcher.find() ) {
415                             _datatype = datatype_matcher.group( 1 );
416                         }
417                         else {
418                             if ( ( _datatype != null )
419                                     && ( _datatype.equals( "protein" ) || _datatype.equals( "dna" ) || _datatype
420                                             .equals( "rna" ) ) ) {
421                                 if ( line.endsWith( ";" ) ) {
422                                     _in_data_block = false;
423                                     line = line.substring( 0, line.length() - 1 );
424                                 }
425                                 final Matcher aln_matcher = ALN_PATTERN.matcher( line );
426                                 if ( aln_matcher.matches() ) {
427                                     final String id = aln_matcher.group( 1 );
428                                     final String seq = aln_matcher.group( 2 );
429                                     MolecularSequence s = null;
430                                     if ( _datatype.equals( "protein" ) ) {
431                                         s = BasicSequence.createAaSequence( id, seq );
432                                     }
433                                     else if ( _datatype.equals( "dna" ) ) {
434                                         s = BasicSequence.createDnaSequence( id, seq );
435                                     }
436                                     else {
437                                         s = BasicSequence.createRnaSequence( id, seq );
438                                     }
439                                     _seqs.put( id, s );
440                                 }
441                             }
442                         }
443                     }
444                 }
445             }
446         }
447         if ( _nh.length() > 0 ) {
448             createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
449             if ( _next != null ) {
450                 return;
451             }
452         }
453     }
454
455     private final void setTranslateKeyValuePairs( final StringBuilder translate_sb ) throws IOException {
456         String s = translate_sb.toString().trim();
457         if ( s.endsWith( ";" ) ) {
458             s = s.substring( 0, s.length() - 1 ).trim();
459         }
460         for( String pair : s.split( "," ) ) {
461             String key = "";
462             String value = "";
463             final int ti = pair.toLowerCase().indexOf( "translate" );
464             if ( ti > -1 ) {
465                 pair = pair.substring( ti + 9 );
466             }
467             final Matcher m = TRANSLATE_PATTERN.matcher( pair );
468             if ( m.find() ) {
469                 key = m.group( 1 );
470                 value = m.group( 2 ).replaceAll( "\'", "" ).replaceAll( "\"", "" ).trim();
471             }
472             else {
473                 throw new IOException( "ill-formatted translate values: " + pair );
474             }
475             if ( value.endsWith( ";" ) ) {
476                 value = value.substring( 0, value.length() - 1 );
477             }
478             _translate_map.put( key, value );
479         }
480     }
481     
482     public final void setParseBeastStyleExtendedTags( final boolean parse_beast_style_extended_tags ) {
483         _parse_beast_style_extended_tags = parse_beast_style_extended_tags;
484     }
485     
486     private final static String removeWhiteSpaceBeforeSemicolon( final String s ) {
487         return s.replaceAll( "\\s+;", ";" );
488     }
489 }