4617a8f47c1b52f217af97cebd02c524cc29eda3
[jalview.git] / forester / java / src / org / forester / io / parsers / nexus / NexusPhylogeniesParser.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // All rights reserved
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 //
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
25
26 package org.forester.io.parsers.nexus;
27
28 import java.io.BufferedReader;
29 import java.io.FileNotFoundException;
30 import java.io.IOException;
31 import java.util.ArrayList;
32 import java.util.HashMap;
33 import java.util.List;
34 import java.util.Map;
35 import java.util.regex.Matcher;
36 import java.util.regex.Pattern;
37
38 import org.forester.archaeopteryx.Constants;
39 import org.forester.io.parsers.IteratingPhylogenyParser;
40 import org.forester.io.parsers.PhylogenyParser;
41 import org.forester.io.parsers.nhx.NHXFormatException;
42 import org.forester.io.parsers.nhx.NHXParser;
43 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
44 import org.forester.io.parsers.util.ParserUtils;
45 import org.forester.io.parsers.util.PhylogenyParserException;
46 import org.forester.phylogeny.Phylogeny;
47 import org.forester.phylogeny.PhylogenyNode;
48 import org.forester.phylogeny.data.Sequence;
49 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
50 import org.forester.sequence.BasicSequence;
51 import org.forester.sequence.MolecularSequence;
52 import org.forester.util.ForesterUtil;
53
54 public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, PhylogenyParser {
55
56     final private static String            begin_trees               = NexusConstants.BEGIN_TREES.toLowerCase();
57     final private static String            end                       = NexusConstants.END.toLowerCase();
58     final private static String            endblock                  = "endblock";
59     final private static Pattern           ROOTEDNESS_PATTERN        = Pattern.compile( ".+=\\s*\\[&([R|U])\\].*" );
60     final private static String            taxlabels                 = NexusConstants.TAXLABELS.toLowerCase();
61     final private static Pattern           TITLE_PATTERN             = Pattern.compile( "TITLE.?\\s+([^;]+)",
62                                                                                         Pattern.CASE_INSENSITIVE );
63     final private static String            translate                 = NexusConstants.TRANSLATE.toLowerCase();
64     final private static String            data                      = NexusConstants.BEGIN_CHARACTERS.toLowerCase();
65     final private static String            characters                = NexusConstants.BEGIN_DATA.toLowerCase();
66     final private static String            tree                      = NexusConstants.TREE.toLowerCase();
67     final private static Pattern           TREE_NAME_PATTERN         = Pattern.compile( "\\s*.?Tree\\s+(.+?)\\s*=.+",
68                                                                                         Pattern.CASE_INSENSITIVE );
69     final private static Pattern           TRANSLATE_PATTERN         = Pattern.compile( "([0-9A-Za-z]+)\\s+(.+)" );
70     final private static Pattern           ALN_PATTERN               = Pattern.compile( "(.+)\\s+([A-Za-z-_\\*\\?]+)" );
71     final private static Pattern           DATATYPE_PATTERN          = Pattern.compile( "datatype\\s?.\\s?([a-z]+)" );
72     final private static Pattern           LINK_TAXA_PATTERN         = Pattern.compile( "link\\s+taxa\\s?.\\s?([^;]+)",
73                                                                                         Pattern.CASE_INSENSITIVE );
74     final private static String            utree                     = NexusConstants.UTREE.toLowerCase();
75     private BufferedReader                 _br;
76     private boolean                        _ignore_quotes_in_nh_data = Constants.NH_PARSING_IGNORE_QUOTES_DEFAULT;
77     private boolean                        _in_taxalabels;
78     private boolean                        _in_translate;
79     private boolean                        _in_tree;
80     private boolean                        _in_trees_block;
81     private boolean                        _in_data_block;
82     private boolean                        _is_rooted;
83     private String                         _datatype;
84     private String                         _name;
85     private Phylogeny                      _next;
86     private Object                         _nexus_source;
87     private StringBuilder                  _nh;
88     private boolean                        _replace_underscores      = NHXParser.REPLACE_UNDERSCORES_DEFAULT;
89     private boolean                        _rooted_info_present;
90     private List<String>                   _taxlabels;
91     private TAXONOMY_EXTRACTION            _taxonomy_extraction      = TAXONOMY_EXTRACTION.NO;
92     private String                         _title;
93     private Map<String, String>            _translate_map;
94     private StringBuilder                  _translate_sb;
95     private Map<String, MolecularSequence> _seqs;
96     private final boolean                  _add_sequences            = true;
97
98     @Override
99     public String getName() {
100         return "Nexus Phylogenies Parser";
101     }
102
103     @Override
104     public final boolean hasNext() {
105         return _next != null;
106     }
107
108     @Override
109     public final Phylogeny next() throws NHXFormatException, IOException {
110         final Phylogeny phy = _next;
111         getNext();
112         return phy;
113     }
114
115     @Override
116     public final Phylogeny[] parse() throws IOException {
117         final List<Phylogeny> l = new ArrayList<Phylogeny>();
118         while ( hasNext() ) {
119             l.add( next() );
120         }
121         final Phylogeny[] p = new Phylogeny[ l.size() ];
122         for( int i = 0; i < l.size(); ++i ) {
123             p[ i ] = l.get( i );
124         }
125         reset();
126         return p;
127     }
128
129     @Override
130     public final void reset() throws FileNotFoundException, IOException {
131         _taxlabels = new ArrayList<String>();
132         _translate_map = new HashMap<String, String>();
133         _nh = new StringBuilder();
134         _name = "";
135         _title = "";
136         _translate_sb = null;
137         _next = null;
138         _in_trees_block = false;
139         _in_taxalabels = false;
140         _in_translate = false;
141         _in_tree = false;
142         _rooted_info_present = false;
143         _is_rooted = false;
144         _seqs = new HashMap<String, MolecularSequence>();
145         _br = ParserUtils.createReader( _nexus_source );
146         getNext();
147     }
148
149     public final void setIgnoreQuotes( final boolean ignore_quotes_in_nh_data ) {
150         _ignore_quotes_in_nh_data = ignore_quotes_in_nh_data;
151     }
152
153     public final void setReplaceUnderscores( final boolean replace_underscores ) {
154         _replace_underscores = replace_underscores;
155     }
156
157     @Override
158     public final void setSource( final Object nexus_source ) throws PhylogenyParserException, IOException {
159         if ( nexus_source == null ) {
160             throw new PhylogenyParserException( "attempt to parse null object" );
161         }
162         _nexus_source = nexus_source;
163         reset();
164     }
165
166     public final void setTaxonomyExtraction( final TAXONOMY_EXTRACTION taxonomy_extraction ) {
167         _taxonomy_extraction = taxonomy_extraction;
168     }
169
170     private final void createPhylogeny( final String title,
171                                         final String name,
172                                         final StringBuilder nhx,
173                                         final boolean rooted_info_present,
174                                         final boolean is_rooted ) throws IOException {
175         _next = null;
176         final NHXParser pars = new NHXParser();
177         pars.setTaxonomyExtraction( _taxonomy_extraction );
178         pars.setReplaceUnderscores( _replace_underscores );
179         pars.setIgnoreQuotes( _ignore_quotes_in_nh_data );
180         if ( rooted_info_present ) {
181             pars.setGuessRootedness( false );
182         }
183         pars.setSource( nhx );
184         final Phylogeny p = pars.next();
185         if ( p == null ) {
186             throw new PhylogenyParserException( "failed to create phylogeny" );
187         }
188         String myname = null;
189         if ( !ForesterUtil.isEmpty( title ) && !ForesterUtil.isEmpty( name ) ) {
190             myname = title.replace( '_', ' ' ).trim() + " (" + name.trim() + ")";
191         }
192         else if ( !ForesterUtil.isEmpty( title ) ) {
193             myname = title.replace( '_', ' ' ).trim();
194         }
195         else if ( !ForesterUtil.isEmpty( name ) ) {
196             myname = name.trim();
197         }
198         if ( !ForesterUtil.isEmpty( myname ) ) {
199             p.setName( myname );
200         }
201         if ( rooted_info_present ) {
202             p.setRooted( is_rooted );
203         }
204         if ( ( _taxlabels.size() > 0 ) || ( _translate_map.size() > 0 ) ) {
205             final PhylogenyNodeIterator it = p.iteratorExternalForward();
206             while ( it.hasNext() ) {
207                 final PhylogenyNode node = it.next();
208                 if ( ( _translate_map.size() > 0 ) && _translate_map.containsKey( node.getName() ) ) {
209                     node.setName( _translate_map.get( node.getName() ).replaceAll( "['\"]+", "" ) );
210                 }
211                 else if ( _taxlabels.size() > 0 ) {
212                     int i = -1;
213                     try {
214                         i = Integer.parseInt( node.getName() );
215                     }
216                     catch ( final NumberFormatException e ) {
217                         // Ignore.
218                     }
219                     if ( i > 0 ) {
220                         node.setName( _taxlabels.get( i - 1 ).replaceAll( "['\"]+", "" ) );
221                     }
222                 }
223                 if ( !_replace_underscores && ( ( _taxonomy_extraction != TAXONOMY_EXTRACTION.NO ) ) ) {
224                     ParserUtils.extractTaxonomyDataFromNodeName( node, _taxonomy_extraction );
225                 }
226                 else if ( _replace_underscores ) {
227                     if ( !ForesterUtil.isEmpty( node.getName() ) ) {
228                         node.setName( node.getName().replace( '_', ' ' ).trim() );
229                     }
230                 }
231                 if ( _add_sequences ) {
232                     if ( _seqs.containsKey( node.getName() ) ) {
233                         final MolecularSequence s = _seqs.get( node.getName() );
234                         //TODO need to check for uniqueness when adding seqs....
235                         final Sequence ns = new Sequence( s );
236                         ns.setMolecularSequenceAligned( true ); //TODO need to check if all same length
237                         node.getNodeData().addSequence( ns );
238                     }
239                 }
240             }
241         }
242         _next = p;
243     }
244
245     private final void getNext() throws IOException, NHXFormatException {
246         _next = null;
247         String line;
248         while ( ( line = _br.readLine() ) != null ) {
249             line = line.trim();
250             if ( ( line.length() > 0 ) && !line.startsWith( "#" ) && !line.startsWith( ">" ) ) {
251                 line = ForesterUtil.collapseWhiteSpace( line );
252                 line = removeWhiteSpaceBeforeSemicolon( line );
253                 final String line_lc = line.toLowerCase();
254                 if ( line_lc.startsWith( begin_trees ) ) {
255                     _in_trees_block = true;
256                     _in_taxalabels = false;
257                     _in_translate = false;
258                     _in_data_block = false;
259                     _datatype = null;
260                     _title = "";
261                 }
262                 else if ( line_lc.startsWith( taxlabels ) ) {
263                     //TODO need to be taxa block instead
264                     _in_trees_block = false;
265                     _in_taxalabels = true;
266                     _in_translate = false;
267                     _in_data_block = false;
268                     _datatype = null;
269                 }
270                 else if ( line_lc.startsWith( translate ) ) {
271                     _translate_sb = new StringBuilder();
272                     _in_taxalabels = false;
273                     _in_translate = true;
274                     _in_data_block = false;
275                     _datatype = null;
276                 }
277                 else if ( line_lc.startsWith( characters ) || line_lc.startsWith( data ) ) {
278                     _in_taxalabels = false;
279                     _in_trees_block = false;
280                     _in_translate = false;
281                     _in_data_block = true;
282                     _datatype = null;
283                 }
284                 else if ( _in_trees_block ) {
285                     if ( line_lc.startsWith( "title" ) ) {
286                         final Matcher title_m = TITLE_PATTERN.matcher( line );
287                         if ( title_m.lookingAt() ) {
288                             _title = title_m.group( 1 );
289                         }
290                     }
291                     else if ( line_lc.startsWith( "link" ) ) {
292                         final Matcher link_m = LINK_TAXA_PATTERN.matcher( line );
293                         if ( link_m.lookingAt() ) {
294                             final String link = link_m.group( 1 );
295                             System.out.println( "link taxa:" + link );
296                         }
297                     }
298                     else if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
299                         _in_trees_block = false;
300                         _in_tree = false;
301                         _in_translate = false;
302                         if ( _nh.length() > 0 ) {
303                             createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
304                             _nh = new StringBuilder();
305                             _name = "";
306                             _rooted_info_present = false;
307                             _is_rooted = false;
308                             if ( _next != null ) {
309                                 return;
310                             }
311                         }
312                     }
313                     else if ( line_lc.startsWith( tree ) || ( line_lc.startsWith( utree ) ) ) {
314                         boolean might = false;
315                         if ( _nh.length() > 0 ) {
316                             might = true;
317                             createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
318                             _nh = new StringBuilder();
319                             _name = "";
320                             _rooted_info_present = false;
321                             _is_rooted = false;
322                         }
323                         _in_tree = true;
324                         _nh.append( line.substring( line.indexOf( '=' ) ) );
325                         final Matcher name_matcher = TREE_NAME_PATTERN.matcher( line );
326                         if ( name_matcher.matches() ) {
327                             _name = name_matcher.group( 1 );
328                             _name = _name.replaceAll( "['\"]+", "" );
329                         }
330                         final Matcher rootedness_matcher = ROOTEDNESS_PATTERN.matcher( line );
331                         if ( rootedness_matcher.matches() ) {
332                             final String s = rootedness_matcher.group( 1 );
333                             line = line.replaceAll( "\\[\\&.\\]", "" );
334                             _rooted_info_present = true;
335                             if ( s.toUpperCase().equals( "R" ) ) {
336                                 _is_rooted = true;
337                             }
338                         }
339                         if ( might && ( _next != null ) ) {
340                             return;
341                         }
342                     }
343                     else if ( _in_tree && !_in_translate ) {
344                         _nh.append( line );
345                     }
346                     if ( !line_lc.startsWith( "title" ) && !line_lc.startsWith( "link" ) && !_in_translate
347                             && !line_lc.startsWith( end ) && !line_lc.startsWith( endblock ) && line_lc.endsWith( ";" ) ) {
348                         _in_tree = false;
349                         _in_translate = false;
350                         createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
351                         _nh = new StringBuilder();
352                         _name = "";
353                         _rooted_info_present = false;
354                         _is_rooted = false;
355                         if ( _next != null ) {
356                             return;
357                         }
358                     }
359                 }
360                 if ( _in_taxalabels ) {
361                     if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
362                         _in_taxalabels = false;
363                     }
364                     else {
365                         final String[] labels = line.split( "\\s+" );
366                         for( String label : labels ) {
367                             if ( !label.toLowerCase().equals( taxlabels ) ) {
368                                 if ( label.endsWith( ";" ) ) {
369                                     _in_taxalabels = false;
370                                     label = label.substring( 0, label.length() - 1 );
371                                 }
372                                 if ( label.length() > 0 ) {
373                                     _taxlabels.add( label );
374                                 }
375                             }
376                         }
377                     }
378                 }
379                 if ( _in_translate ) {
380                     if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
381                         _in_translate = false;
382                     }
383                     else {
384                         _translate_sb.append( " " );
385                         _translate_sb.append( line.trim() );
386                         if ( line.endsWith( ";" ) ) {
387                             _in_translate = false;
388                             setTranslateKeyValuePairs( _translate_sb );
389                         }
390                     }
391                 }
392                 if ( _in_data_block ) {
393                     if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
394                         _in_data_block = false;
395                         _datatype = null;
396                     }
397                     else if ( line_lc.startsWith( "link" ) ) {
398                         final Matcher link_m = LINK_TAXA_PATTERN.matcher( line );
399                         if ( link_m.lookingAt() ) {
400                             final String link = link_m.group( 1 );
401                             System.out.println( "link taxa:" + link );
402                         }
403                     }
404                     else {
405                         final Matcher datatype_matcher = DATATYPE_PATTERN.matcher( line_lc );
406                         if ( datatype_matcher.find() ) {
407                             _datatype = datatype_matcher.group( 1 );
408                             System.out.println( _datatype );
409                         }
410                         else {
411                             if ( ( _datatype != null )
412                                     && ( _datatype.equals( "protein" ) || _datatype.equals( "dna" ) || _datatype
413                                             .equals( "rna" ) ) ) {
414                                 if ( line.endsWith( ";" ) ) {
415                                     _in_data_block = false;
416                                     line = line.substring( 0, line.length() - 1 );
417                                 }
418                                 final Matcher aln_matcher = ALN_PATTERN.matcher( line );
419                                 if ( aln_matcher.matches() ) {
420                                     final String id = aln_matcher.group( 1 );
421                                     final String seq = aln_matcher.group( 2 );
422                                     MolecularSequence s = null;
423                                     if ( _datatype.equals( "protein" ) ) {
424                                         s = BasicSequence.createAaSequence( id, seq );
425                                     }
426                                     else if ( _datatype.equals( "dna" ) ) {
427                                         s = BasicSequence.createDnaSequence( id, seq );
428                                     }
429                                     else {
430                                         s = BasicSequence.createRnaSequence( id, seq );
431                                     }
432                                     _seqs.put( id, s );
433                                     System.out.println( s );
434                                 }
435                             }
436                         }
437                     }
438                 }
439             }
440         }
441         if ( _nh.length() > 0 ) {
442             createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
443             if ( _next != null ) {
444                 return;
445             }
446         }
447     }
448
449     private final void setTranslateKeyValuePairs( final StringBuilder translate_sb ) throws IOException {
450         String s = translate_sb.toString().trim();
451         if ( s.endsWith( ";" ) ) {
452             s = s.substring( 0, s.length() - 1 ).trim();
453         }
454         for( String pair : s.split( "," ) ) {
455             String key = "";
456             String value = "";
457             final int ti = pair.toLowerCase().indexOf( "translate" );
458             if ( ti > -1 ) {
459                 pair = pair.substring( ti + 9 );
460             }
461             final Matcher m = TRANSLATE_PATTERN.matcher( pair );
462             if ( m.find() ) {
463                 key = m.group( 1 );
464                 value = m.group( 2 ).replaceAll( "\'", "" ).replaceAll( "\"", "" ).trim();
465             }
466             else {
467                 throw new IOException( "ill-formatted translate values: " + pair );
468             }
469             if ( value.endsWith( ";" ) ) {
470                 value = value.substring( 0, value.length() - 1 );
471             }
472             _translate_map.put( key, value );
473         }
474     }
475
476     private final static String removeWhiteSpaceBeforeSemicolon( final String s ) {
477         return s.replaceAll( "\\s+;", ";" );
478     }
479 }