2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26 package org.forester.io.parsers.nexus;
28 import java.io.BufferedReader;
29 import java.io.FileNotFoundException;
30 import java.io.IOException;
31 import java.util.ArrayList;
32 import java.util.HashMap;
33 import java.util.List;
35 import java.util.regex.Matcher;
36 import java.util.regex.Pattern;
38 import org.forester.archaeopteryx.AptxConstants;
39 import org.forester.io.parsers.IteratingPhylogenyParser;
40 import org.forester.io.parsers.PhylogenyParser;
41 import org.forester.io.parsers.nhx.NHXFormatException;
42 import org.forester.io.parsers.nhx.NHXParser;
43 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
44 import org.forester.io.parsers.util.ParserUtils;
45 import org.forester.io.parsers.util.PhylogenyParserException;
46 import org.forester.phylogeny.Phylogeny;
47 import org.forester.phylogeny.PhylogenyNode;
48 import org.forester.phylogeny.data.Sequence;
49 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
50 import org.forester.sequence.BasicSequence;
51 import org.forester.sequence.MolecularSequence;
52 import org.forester.util.ForesterConstants;
53 import org.forester.util.ForesterUtil;
55 public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, PhylogenyParser {
// Debug switch. NOTE(review): presumably guards the System.out.println( line ) call in getNext();
// the guarding condition is not visible in this view -- confirm.
58 final private static boolean DEBUG = false;
// Lower-cased Nexus keywords; getNext() compares them against the lower-cased input line.
60 final private static String begin_trees = NexusConstants.BEGIN_TREES.toLowerCase();
61 final private static String end = NexusConstants.END.toLowerCase();
62 final private static String endblock = "endblock";
// Captures the single character inside a [&X] annotation that follows '='.
// NOTE(review): the character class [R|U] also accepts a literal '|', not only 'R' and 'U'.
63 final private static Pattern ROOTEDNESS_PATTERN = Pattern.compile( ".+=\\s*\\[&([R|U])\\].*" );
64 final private static String taxlabels = NexusConstants.TAXLABELS.toLowerCase();
// Captures the text after TITLE (case-insensitive) up to, but excluding, the next ';'.
65 final private static Pattern TITLE_PATTERN = Pattern.compile( "TITLE.?\\s+([^;]+)",
66 Pattern.CASE_INSENSITIVE );
67 final private static String translate = NexusConstants.TRANSLATE.toLowerCase();
// NOTE(review): the two names below look swapped ('data' holds BEGIN_CHARACTERS, 'characters'
// holds BEGIN_DATA). The only visible use ORs both together, so the swap appears harmless -- confirm.
68 final private static String data = NexusConstants.BEGIN_CHARACTERS.toLowerCase();
69 final private static String characters = NexusConstants.BEGIN_DATA.toLowerCase();
70 final private static String tree = NexusConstants.TREE.toLowerCase();
// Captures the tree name found between the word Tree (case-insensitive) and the '=' sign.
71 final private static Pattern TREE_NAME_PATTERN = Pattern.compile( "\\s*.?Tree\\s+(.+?)\\s*=.+",
72 Pattern.CASE_INSENSITIVE );
// One translate entry: an alphanumeric key (group 1), whitespace, then the value (group 2).
73 final private static Pattern TRANSLATE_PATTERN = Pattern.compile( "([0-9A-Za-z]+)\\s+(.+)" );
// One alignment row: an identifier (group 1), whitespace, then a trailing run of sequence characters (group 2).
74 final private static Pattern ALN_PATTERN = Pattern.compile( "(.+)\\s+([A-Za-z-_\\*\\?]+)" );
// Captures the lower-case word following "datatype" and one separator character;
// getNext() applies it to the lower-cased line.
75 final private static Pattern DATATYPE_PATTERN = Pattern.compile( "datatype\\s?.\\s?([a-z]+)" );
76 //final private static Pattern LINK_TAXA_PATTERN = Pattern.compile( "link\\s+taxa\\s?.\\s?([^;]+)",
77 // Pattern.CASE_INSENSITIVE );
78 final private static String utree = NexusConstants.UTREE.toLowerCase();
// Reader over the Nexus source; created in reset().
79 private BufferedReader _br;
// Forwarded to the NHXParser in createPhylogeny().
80 private boolean _ignore_quotes_in_nh_data = AptxConstants.NH_PARSING_IGNORE_QUOTES_DEFAULT;
// State flags of the line-by-line scanner in getNext(): which block/section the current line belongs to.
81 private boolean _in_taxalabels;
82 private boolean _in_translate;
83 private boolean _in_tree;
84 private boolean _in_trees_block;
85 private boolean _in_data_block;
// Rootedness handed to createPhylogeny(); only applied there when _rooted_info_present is true.
86 private boolean _is_rooted;
// Data type captured by DATATYPE_PATTERN inside a data/characters block.
87 private String _datatype;
// Buffered phylogeny reported by hasNext() and read by next().
89 private Phylogeny _next;
// Source object handed to ParserUtils.createReader() in reset().
90 private Object _nexus_source;
// Accumulates the New Hampshire text of the tree currently being read.
91 private StringBuilder _nh;
// Forwarded to the NHXParser and also applied to external node names in createPhylogeny().
92 private boolean _replace_underscores = NHXParser.REPLACE_UNDERSCORES_DEFAULT;
93 private boolean _rooted_info_present;
// Labels collected from the taxlabels section; createPhylogeny() indexes them 1-based.
94 private List<String> _taxlabels;
95 private TAXONOMY_EXTRACTION _taxonomy_extraction = TAXONOMY_EXTRACTION.NO;
// Title captured by TITLE_PATTERN inside a trees block.
96 private String _title;
// Key -> label mapping filled by setTranslateKeyValuePairs().
97 private Map<String, String> _translate_map;
// Accumulates the raw text of a translate section until its terminating ';'.
98 private StringBuilder _translate_sb;
// Sequences looked up by node name in createPhylogeny().
99 private Map<String, MolecularSequence> _seqs;
// Constant true: createPhylogeny() always attempts to attach sequences to external nodes.
100 private final boolean _add_sequences = true;
// Forwarded to the NHXParser in createPhylogeny().
101 private boolean _parse_beast_style_extended_tags = false;
/**
 * Returns the fixed, human-readable name of this parser.
 *
 * @return the literal string "Nexus Phylogenies Parser"
 */
105 public String getName() {
106 return "Nexus Phylogenies Parser";
/**
 * Tells whether a phylogeny is currently buffered.
 *
 * @return true if the buffered phylogeny (_next) is not null
 */
110 public final boolean hasNext() {
111 return _next != null;
/**
 * Hands out the buffered phylogeny.
 *
 * Only the first statement is visible in this view: it captures the buffered phylogeny
 * (_next) in a local. NOTE(review): presumably the remainder advances the parser via
 * getNext() and returns the captured phylogeny -- confirm against the full file.
 *
 * @throws NHXFormatException declared by the signature
 * @throws IOException declared by the signature
 */
115 public final Phylogeny next() throws NHXFormatException, IOException {
// Capture the current tree before the parser state moves on.
116 final Phylogeny phy = _next;
/**
 * Collects phylogenies into a list for as long as hasNext() is true, then copies the list
 * into an array of the same size.
 *
 * NOTE(review): the loop body, the copy-loop body and the return statement are not visible
 * in this view; presumably each iteration adds next() to the list and the array is
 * returned -- confirm.
 *
 * @throws IOException declared by the signature
 */
122 public final Phylogeny[] parse() throws IOException {
123 final List<Phylogeny> l = new ArrayList<Phylogeny>();
124 while ( hasNext() ) {
// Array sized to exactly the number of collected phylogenies.
127 final Phylogeny[] p = new Phylogeny[ l.size() ];
128 for( int i = 0; i < l.size(); ++i ) {
/**
 * Re-initialises the parser state and opens a fresh reader on the current source.
 *
 * The visible statements replace the taxlabel list, the translate map, the New Hampshire
 * buffer and the sequence map with empty instances, clear the translate buffer and several
 * state flags, and create the reader from _nexus_source using ForesterConstants.UTF_8.
 * NOTE(review): several statements of this method are not visible in this view (other
 * fields are probably reset as well) -- confirm against the full file.
 *
 * @throws FileNotFoundException declared by the signature
 * @throws IOException declared by the signature
 */
136 public final void reset() throws FileNotFoundException, IOException {
137 _taxlabels = new ArrayList<String>();
138 _translate_map = new HashMap<String, String>();
139 _nh = new StringBuilder();
142 _translate_sb = null;
144 _in_trees_block = false;
145 _in_taxalabels = false;
146 _in_translate = false;
148 _rooted_info_present = false;
150 _seqs = new HashMap<String, MolecularSequence>();
// Reader over the stored source object.
151 _br = ParserUtils.createReader( _nexus_source, ForesterConstants.UTF_8 );
/**
 * Stores the ignore-quotes setting; createPhylogeny() forwards it to the NHXParser.
 *
 * @param ignore_quotes_in_nh_data value to store
 */
155 public final void setIgnoreQuotes( final boolean ignore_quotes_in_nh_data ) {
156 _ignore_quotes_in_nh_data = ignore_quotes_in_nh_data;
/**
 * Stores the replace-underscores setting; createPhylogeny() forwards it to the NHXParser
 * and also uses it when post-processing external node names.
 *
 * @param replace_underscores value to store
 */
159 public final void setReplaceUnderscores( final boolean replace_underscores ) {
160 _replace_underscores = replace_underscores;
/**
 * Sets the object to parse.
 *
 * NOTE(review): statements following the assignment are not visible in this view;
 * presumably the parser is reset and the first phylogeny is read -- confirm.
 *
 * @param nexus_source the source object; must not be null
 * @throws PhylogenyParserException if nexus_source is null
 * @throws IOException declared by the signature
 */
164 public final void setSource( final Object nexus_source ) throws PhylogenyParserException, IOException {
165 if ( nexus_source == null ) {
166 throw new PhylogenyParserException( "attempt to parse null object" );
168 _nexus_source = nexus_source;
/**
 * Stores the taxonomy-extraction mode; createPhylogeny() forwards it to the NHXParser and
 * uses it when post-processing external node names.
 *
 * @param taxonomy_extraction mode to store
 */
172 public final void setTaxonomyExtraction( final TAXONOMY_EXTRACTION taxonomy_extraction ) {
173 _taxonomy_extraction = taxonomy_extraction;
/**
 * Parses accumulated New Hampshire text with an NHXParser and post-processes the result:
 * composes a display name, applies rootedness, renames external nodes through the
 * translate map or the taxlabel list, and attaches known sequences.
 *
 * The body also reads a value called name whose declaration is not visible in this view.
 * NOTE(review): given the five-argument calls in getNext(), it is presumably a second
 * String parameter holding the tree name -- confirm. Where the finished phylogeny is
 * stored is likewise not visible here.
 *
 * @param title title text; underscores are replaced by spaces when it is used in the name
 * @param nhx buffer holding the New Hampshire text to parse
 * @param rooted_info_present whether explicit rootedness information accompanies the tree
 * @param is_rooted rootedness to apply; only used when rooted_info_present is true
 * @throws IOException declared by the signature; the visible body throws
 *         PhylogenyParserException with the message "failed to create phylogeny"
 *         (its guarding condition is not visible here)
 */
176 private final void createPhylogeny( final String title,
178 final StringBuilder nhx,
179 final boolean rooted_info_present,
180 final boolean is_rooted ) throws IOException {
// Configure a fresh NHX parser with this parser's settings.
182 final NHXParser pars = new NHXParser();
183 pars.setTaxonomyExtraction( _taxonomy_extraction );
184 pars.setReplaceUnderscores( _replace_underscores );
185 pars.setIgnoreQuotes( _ignore_quotes_in_nh_data );
186 pars.setParseBeastStyleExtendedTags( _parse_beast_style_extended_tags );
// Explicit rootedness is available, so the NHX parser is told not to guess it.
187 if ( rooted_info_present ) {
188 pars.setGuessRootedness( false );
190 pars.setSource( nhx.toString() );
191 final Phylogeny p = pars.next();
193 throw new PhylogenyParserException( "failed to create phylogeny" );
// Compose the name: "title (name)" when both are present, otherwise whichever one is non-empty.
195 String myname = null;
196 if ( !ForesterUtil.isEmpty( title ) && !ForesterUtil.isEmpty( name ) ) {
197 myname = title.replace( '_', ' ' ).trim() + " (" + name.trim() + ")";
199 else if ( !ForesterUtil.isEmpty( title ) ) {
200 myname = title.replace( '_', ' ' ).trim();
202 else if ( !ForesterUtil.isEmpty( name ) ) {
203 myname = name.trim();
205 if ( !ForesterUtil.isEmpty( myname ) ) {
208 if ( rooted_info_present ) {
209 p.setRooted( is_rooted );
// External nodes are only revisited when there is label information to apply.
211 if ( ( _taxlabels.size() > 0 ) || ( _translate_map.size() > 0 ) ) {
212 final PhylogenyNodeIterator it = p.iteratorExternalForward();
213 while ( it.hasNext() ) {
214 final PhylogenyNode node = it.next();
// A translate-map hit takes precedence; single and double quotes are stripped from the label.
215 if ( ( _translate_map.size() > 0 ) && _translate_map.containsKey( node.getName() ) ) {
216 node.setName( _translate_map.get( node.getName() ).replaceAll( "['\"]+", "" ) );
// Otherwise the node name is read as an integer index into the taxlabel list.
218 else if ( _taxlabels.size() > 0 ) {
221 i = Integer.parseInt( node.getName() );
223 catch ( final NumberFormatException e ) {
// Indices are 1-based, hence i - 1.
227 node.setName( _taxlabels.get( i - 1 ).replaceAll( "['\"]+", "" ) );
// Taxonomy extraction from the node name happens only when underscores are kept;
// otherwise underscores in the name are turned into spaces.
230 if ( !_replace_underscores && ( ( _taxonomy_extraction != TAXONOMY_EXTRACTION.NO ) ) ) {
231 ParserUtils.extractTaxonomyDataFromNodeName( node, _taxonomy_extraction );
233 else if ( _replace_underscores ) {
234 if ( !ForesterUtil.isEmpty( node.getName() ) ) {
235 node.setName( node.getName().replace( '_', ' ' ).trim() );
// Attach the sequence stored under the (possibly renamed) node name, marked as aligned.
238 if ( _add_sequences ) {
239 if ( _seqs.containsKey( node.getName() ) ) {
240 final MolecularSequence s = _seqs.get( node.getName() );
241 //TODO need to check for uniqueness when adding seqs....
242 final Sequence ns = new Sequence( s );
243 ns.setMolecularSequenceAligned( true ); //TODO need to check if all same length
244 node.getNodeData().addSequence( ns );
/**
 * Line-by-line scanner over the Nexus source. It reads from _br until end of input, tracks
 * which block or section the current line belongs to through the _in_* flags, accumulates
 * tree text, taxlabels, translate entries and alignment rows, and calls createPhylogeny()
 * whenever the accumulated text of a tree is complete.
 *
 * NOTE(review): many statements of this method are not visible in this view (for example
 * the declaration of the line variable, the statements guarded by the _next != null checks,
 * and whatever stores created sequences); comments below describe the visible lines only.
 *
 * @throws IOException declared by the signature
 * @throws NHXFormatException declared by the signature
 */
252 private final void getNext() throws IOException, NHXFormatException {
255 while ( ( line = _br.readLine() ) != null ) {
// NOTE(review): presumably guarded by the DEBUG flag on a line not visible here -- confirm.
257 System.out.println( line );
// Skip empty lines and lines starting with '#' or '>'.
260 if ( ( line.length() > 0 ) && !line.startsWith( "#" ) && !line.startsWith( ">" ) ) {
// Normalise whitespace, then work with a lower-cased copy for keyword tests.
261 line = ForesterUtil.collapseWhiteSpace( line );
262 line = removeWhiteSpaceBeforeSemicolon( line );
263 final String line_lc = line.toLowerCase();
// Start of a trees block.
264 if ( line_lc.startsWith( begin_trees ) ) {
265 _in_trees_block = true;
266 _in_taxalabels = false;
267 _in_translate = false;
268 _in_data_block = false;
// Start of a taxlabels section.
272 else if ( line_lc.startsWith( taxlabels ) ) {
273 //TODO need to be taxa block instead
274 _in_trees_block = false;
275 _in_taxalabels = true;
276 _in_translate = false;
277 _in_data_block = false;
// Start of a translate section: begin a fresh accumulation buffer.
280 else if ( line_lc.startsWith( translate ) ) {
281 _translate_sb = new StringBuilder();
282 _in_taxalabels = false;
283 _in_translate = true;
284 _in_data_block = false;
// Start of a data or characters block (both keywords are treated alike).
287 else if ( line_lc.startsWith( characters ) || line_lc.startsWith( data ) ) {
288 _in_taxalabels = false;
289 _in_trees_block = false;
290 _in_translate = false;
291 _in_data_block = true;
// Any other line while inside a trees block.
294 else if ( _in_trees_block ) {
// Title line: keep the captured title text.
295 if ( line_lc.startsWith( "title" ) ) {
296 final Matcher title_m = TITLE_PATTERN.matcher( line );
297 if ( title_m.lookingAt() ) {
298 _title = title_m.group( 1 );
// Link lines are recognised but their handling is commented out.
301 else if ( line_lc.startsWith( "link" ) ) {
302 //final Matcher link_m = LINK_TAXA_PATTERN.matcher( line );
303 //if ( link_m.lookingAt() ) {
304 //final String link = link_m.group( 1 ); //TODO why?
// End of the trees block: flush a pending tree, if any text has been accumulated.
307 else if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
308 _in_trees_block = false;
310 _in_translate = false;
311 if ( _nh.length() > 0 ) {
312 createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
313 _nh = new StringBuilder();
315 _rooted_info_present = false;
317 if ( _next != null ) {
// A new tree (or utree) statement: flush the previous tree first, then start accumulating this one.
322 else if ( line_lc.startsWith( tree ) || ( line_lc.startsWith( utree ) ) ) {
323 boolean might = false;
324 if ( _nh.length() > 0 ) {
326 createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
327 _nh = new StringBuilder();
329 _rooted_info_present = false;
// Appends the line from the '=' sign onwards (the '=' itself included).
// NOTE(review): indexOf returns -1 when the line has no '=', and substring( -1 ) throws
// StringIndexOutOfBoundsException -- confirm whether such input can reach this point.
333 _nh.append( line.substring( line.indexOf( '=' ) ) );
// Tree name with single and double quotes removed.
334 final Matcher name_matcher = TREE_NAME_PATTERN.matcher( line );
335 if ( name_matcher.matches() ) {
336 _name = name_matcher.group( 1 );
337 _name = _name.replaceAll( "['\"]+", "" );
// Explicit rootedness annotation after the '=' sign.
339 final Matcher rootedness_matcher = ROOTEDNESS_PATTERN.matcher( line );
340 if ( rootedness_matcher.matches() ) {
341 final String s = rootedness_matcher.group( 1 );
// NOTE(review): this rewrites line after it was already appended to _nh above; unless a
// statement not visible here reads line again, the stripped text is unused -- confirm.
342 line = line.replaceAll( "\\[\\&.\\]", "" );
343 _rooted_info_present = true;
344 if ( s.toUpperCase().equals( "R" ) ) {
348 if ( might && ( _next != null ) ) {
// Continuation line of a tree statement that spans several lines.
352 else if ( _in_tree && !_in_translate ) {
// A line ending in ';' that is not a title, link or end line completes the current tree.
355 if ( !line_lc.startsWith( "title" ) && !line_lc.startsWith( "link" ) && !_in_translate
356 && !line_lc.startsWith( end ) && !line_lc.startsWith( endblock ) && line_lc.endsWith( ";" ) ) {
358 _in_translate = false;
359 createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
360 _nh = new StringBuilder();
362 _rooted_info_present = false;
364 if ( _next != null ) {
// Taxlabels section: collect whitespace-separated labels.
369 if ( _in_taxalabels ) {
370 if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
371 _in_taxalabels = false;
374 final String[] labels = line.split( "\\s+" );
375 for( String label : labels ) {
// Skip the keyword itself.
376 if ( !label.toLowerCase().equals( taxlabels ) ) {
// A trailing ';' ends the section and is removed from the label.
377 if ( label.endsWith( ";" ) ) {
378 _in_taxalabels = false;
379 label = label.substring( 0, label.length() - 1 );
381 if ( label.length() > 0 ) {
382 _taxlabels.add( label );
// Translate section: accumulate raw text until the line ends in ';', then parse the pairs.
388 if ( _in_translate ) {
389 if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
390 _in_translate = false;
393 _translate_sb.append( " " );
394 _translate_sb.append( line.trim() );
395 if ( line.endsWith( ";" ) ) {
396 _in_translate = false;
397 setTranslateKeyValuePairs( _translate_sb );
// Data or characters block: detect the data type and read alignment rows.
401 if ( _in_data_block ) {
402 if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
403 _in_data_block = false;
406 else if ( line_lc.startsWith( "link" ) ) {
407 // final Matcher link_m = LINK_TAXA_PATTERN.matcher( line );
408 // if ( link_m.lookingAt() ) {
409 // final String link = link_m.group( 1 );
// Data type declaration, matched against the lower-cased line.
413 final Matcher datatype_matcher = DATATYPE_PATTERN.matcher( line_lc );
414 if ( datatype_matcher.find() ) {
415 _datatype = datatype_matcher.group( 1 );
// Rows are only read for the three recognised data types.
418 if ( ( _datatype != null )
419 && ( _datatype.equals( "protein" ) || _datatype.equals( "dna" ) || _datatype
420 .equals( "rna" ) ) ) {
// A trailing ';' ends the block and is removed before matching the row.
421 if ( line.endsWith( ";" ) ) {
422 _in_data_block = false;
423 line = line.substring( 0, line.length() - 1 );
// Group 1 is the identifier, group 2 the sequence text.
425 final Matcher aln_matcher = ALN_PATTERN.matcher( line );
426 if ( aln_matcher.matches() ) {
427 final String id = aln_matcher.group( 1 );
428 final String seq = aln_matcher.group( 2 );
429 MolecularSequence s = null;
430 if ( _datatype.equals( "protein" ) ) {
431 s = BasicSequence.createAaSequence( id, seq );
433 else if ( _datatype.equals( "dna" ) ) {
434 s = BasicSequence.createDnaSequence( id, seq );
437 s = BasicSequence.createRnaSequence( id, seq );
// End of input: flush a tree that is still pending.
447 if ( _nh.length() > 0 ) {
448 createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
449 if ( _next != null ) {
/**
 * Splits the accumulated text of a translate section into comma-separated entries and
 * stores each entry in _translate_map.
 *
 * NOTE(review): the declarations of key and value and the conditions guarding several of
 * the statements below are not visible in this view -- confirm against the full file.
 *
 * @param translate_sb raw text of the translate section
 * @throws IOException with the message "ill-formatted translate values: " followed by
 *         the offending entry (the guarding condition is not visible here; presumably
 *         raised when an entry does not match TRANSLATE_PATTERN)
 */
455 private final void setTranslateKeyValuePairs( final StringBuilder translate_sb ) throws IOException {
// Trim the text and drop a terminating ';'.
456 String s = translate_sb.toString().trim();
457 if ( s.endsWith( ";" ) ) {
458 s = s.substring( 0, s.length() - 1 ).trim();
460 for( String pair : s.split( "," ) ) {
// Cut away everything up to and including the keyword; 9 is the length of "translate".
463 final int ti = pair.toLowerCase().indexOf( "translate" );
465 pair = pair.substring( ti + 9 );
467 final Matcher m = TRANSLATE_PATTERN.matcher( pair );
// The value is group 2 with single and double quotes removed.
470 value = m.group( 2 ).replaceAll( "\'", "" ).replaceAll( "\"", "" ).trim();
473 throw new IOException( "ill-formatted translate values: " + pair );
// Drop a ';' that is still attached to the value.
475 if ( value.endsWith( ";" ) ) {
476 value = value.substring( 0, value.length() - 1 );
478 _translate_map.put( key, value );
/**
 * Stores the BEAST-style-extended-tags setting; createPhylogeny() forwards it to the NHXParser.
 *
 * @param parse_beast_style_extended_tags value to store
 */
482 public final void setParseBeastStyleExtendedTags( final boolean parse_beast_style_extended_tags ) {
483 _parse_beast_style_extended_tags = parse_beast_style_extended_tags;
/**
 * Removes every run of whitespace that directly precedes a semicolon.
 *
 * @param s text to process
 * @return s with each whitespace run followed by ';' replaced by a bare ';'
 */
486 private final static String removeWhiteSpaceBeforeSemicolon( final String s ) {
487 return s.replaceAll( "\\s+;", ";" );