// $Id:
// FORESTER -- software libraries and applications
// for evolutionary biology research and applications.
//
// Copyright (C) 2008-2009 Christian M. Zmasek
// Copyright (C) 2008-2009 Burnham Institute for Medical Research
// All rights reserved
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
//
// Contact: phylosoft @ gmail . com
// WWW: https://sites.google.com/site/cmzmasek/home/software/forester

package org.forester.io.parsers.nexus;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.forester.archaeopteryx.Constants;
import org.forester.io.parsers.IteratingPhylogenyParser;
import org.forester.io.parsers.PhylogenyParser;
import org.forester.io.parsers.nhx.NHXFormatException;
import org.forester.io.parsers.nhx.NHXParser;
import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
import org.forester.io.parsers.util.ParserUtils;
import org.forester.io.parsers.util.PhylogenyParserException;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.PhylogenyNode;
import org.forester.phylogeny.data.Sequence;
import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
import org.forester.sequence.BasicSequence;
import org.forester.sequence.MolecularSequence;
import org.forester.util.ForesterUtil;

/**
 * Line-oriented parser that turns the tree statements of a NEXUS source into
 * {@link Phylogeny} objects, one at a time ({@link #hasNext()} / {@link #next()})
 * or all at once ({@link #parse()}).
 * <p>
 * While scanning, it also collects TAXLABELS entries, TRANSLATE tables and the
 * sequences found in DATA/CHARACTERS blocks; these are applied to the external
 * nodes of every phylogeny created afterwards.
 * <p>
 * A source must be supplied with {@link #setSource(Object)} before iterating.
 */
public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, PhylogenyParser {

    // Keywords are lower-cased with Locale.ROOT so that they compare correctly
    // against lower-cased input lines regardless of the JVM default locale.
    final private static String              begin_trees           = NexusConstants.BEGIN_TREES
            .toLowerCase( Locale.ROOT );
    final private static String              end                   = NexusConstants.END.toLowerCase( Locale.ROOT );
    final private static String              endblock              = "endblock";
    final private static Pattern             ROOTEDNESS_PATTERN    = Pattern.compile( ".+=\\s*\\[&([RU])\\].*" );
    final private static String              taxlabels             = NexusConstants.TAXLABELS
            .toLowerCase( Locale.ROOT );
    final private static Pattern             TITLE_PATTERN         = Pattern
            .compile( "TITLE.?\\s+([^;]+)", Pattern.CASE_INSENSITIVE );
    final private static String              translate             = NexusConstants.TRANSLATE
            .toLowerCase( Locale.ROOT );
    final private static String              begin_characters      = NexusConstants.BEGIN_CHARACTERS
            .toLowerCase( Locale.ROOT );
    final private static String              begin_data            = NexusConstants.BEGIN_DATA
            .toLowerCase( Locale.ROOT );
    final private static String              tree                  = NexusConstants.TREE.toLowerCase( Locale.ROOT );
    final private static Pattern             TREE_NAME_PATTERN     = Pattern
            .compile( "\\s*.?Tree\\s+(.+?)\\s*=.+", Pattern.CASE_INSENSITIVE );
    final private static Pattern             TRANSLATE_PATTERN     = Pattern.compile( "([0-9A-Za-z]+)\\s+(.+)" );
    final private static Pattern             ALN_PATTERN           = Pattern
            .compile( "(.+)\\s+([A-Za-z-_\\*\\?]+)" );
    final private static Pattern             DATATYPE_PATTERN      = Pattern.compile( "datatype\\s?.\\s?([a-z]+)" );
    final private static Pattern             LINK_TAXA_PATTERN     = Pattern
            .compile( "link\\s+taxa\\s?.\\s?([^;]+)", Pattern.CASE_INSENSITIVE );
    final private static String              utree                 = NexusConstants.UTREE.toLowerCase( Locale.ROOT );
    private BufferedReader                   _br;
    private boolean                          _ignore_quotes_in_nh_data = Constants.NH_PARSING_IGNORE_QUOTES_DEFAULT;
    private boolean                          _in_taxalabels;
    private boolean                          _in_translate;
    private boolean                          _in_tree;
    private boolean                          _in_trees_block;
    private boolean                          _in_data_block;
    private boolean                          _is_rooted;
    private String                           _datatype;
    private String                           _name;
    private Phylogeny                        _next;
    private Object                           _nexus_source;
    private StringBuilder                    _nh;
    private boolean                          _replace_underscores  = NHXParser.REPLACE_UNDERSCORES_DEFAULT;
    private boolean                          _rooted_info_present;
    private List<String>                     _taxlabels;
    private TAXONOMY_EXTRACTION              _taxonomy_extraction  = TAXONOMY_EXTRACTION.NO;
    private String                           _title;
    private Map<String, String>              _translate_map;
    private StringBuilder                    _translate_sb;
    private Map<String, MolecularSequence>   _seqs;
    private final boolean                    _add_sequences        = true;

    @Override
    public String getName() {
        return "Nexus Phylogenies Parser";
    }

    /**
     * @return true if a phylogeny has been read ahead and is waiting to be
     *         returned by {@link #next()}
     */
    @Override
    public final boolean hasNext() {
        return _next != null;
    }

    /**
     * Returns the phylogeny that was read ahead and immediately reads the
     * following one.
     *
     * @return the pending phylogeny, or null if there is none
     */
    @Override
    public final Phylogeny next() throws NHXFormatException, IOException {
        final Phylogeny phy = _next;
        getNext();
        return phy;
    }

    /**
     * Drains the iterator into an array and then calls {@link #reset()}, so the
     * parser is positioned at the start of the source again afterwards.
     *
     * @return all remaining phylogenies, in source order
     */
    @Override
    public final Phylogeny[] parse() throws IOException {
        final List<Phylogeny> l = new ArrayList<Phylogeny>();
        while ( hasNext() ) {
            l.add( next() );
        }
        final Phylogeny[] p = l.toArray( new Phylogeny[ l.size() ] );
        reset();
        return p;
    }

    /**
     * Clears all parsing state, re-creates the reader from the current source
     * and reads ahead to the first phylogeny.
     */
    @Override
    public final void reset() throws FileNotFoundException, IOException {
        _taxlabels = new ArrayList<String>();
        _translate_map = new HashMap<String, String>();
        _nh = new StringBuilder();
        _name = "";
        _title = "";
        _translate_sb = null;
        _next = null;
        _in_trees_block = false;
        _in_taxalabels = false;
        _in_translate = false;
        _in_tree = false;
        // Data-block state must be cleared too, otherwise a re-read could start
        // out believing it is still inside a DATA/CHARACTERS block.
        _in_data_block = false;
        _datatype = null;
        _rooted_info_present = false;
        _is_rooted = false;
        _seqs = new HashMap<String, MolecularSequence>();
        _br = ParserUtils.createReader( _nexus_source );
        getNext();
    }

    public final void setIgnoreQuotes( final boolean ignore_quotes_in_nh_data ) {
        _ignore_quotes_in_nh_data = ignore_quotes_in_nh_data;
    }

    public final void setReplaceUnderscores( final boolean replace_underscores ) {
        _replace_underscores = replace_underscores;
    }

    /**
     * Sets the NEXUS source and positions the parser on its first phylogeny.
     *
     * @param nexus_source the source to read; must not be null
     * @throws PhylogenyParserException if nexus_source is null
     */
    @Override
    public final void setSource( final Object nexus_source ) throws PhylogenyParserException, IOException {
        if ( nexus_source == null ) {
            throw new PhylogenyParserException( "attempt to parse null object" );
        }
        _nexus_source = nexus_source;
        reset();
    }

    public final void setTaxonomyExtraction( final TAXONOMY_EXTRACTION taxonomy_extraction ) {
        _taxonomy_extraction = taxonomy_extraction;
    }

    /**
     * Parses the given NH(X) string into a phylogeny, names it, applies the
     * rootedness information, translates external node names and attaches
     * collected sequences; the result is stored in _next.
     *
     * @param title               title of the enclosing trees block, may be empty
     * @param name                name of the tree statement, may be empty
     * @param nhx                 the NH(X) data of one tree
     * @param rooted_info_present whether the tree statement carried a rootedness tag
     * @param is_rooted           the rootedness to set if rooted_info_present
     * @throws PhylogenyParserException if the NH(X) parser yields no phylogeny
     */
    private final void createPhylogeny( final String title,
                                        final String name,
                                        final StringBuilder nhx,
                                        final boolean rooted_info_present,
                                        final boolean is_rooted )
            throws IOException {
        _next = null;
        final NHXParser pars = new NHXParser();
        pars.setTaxonomyExtraction( _taxonomy_extraction );
        pars.setReplaceUnderscores( _replace_underscores );
        pars.setIgnoreQuotes( _ignore_quotes_in_nh_data );
        if ( rooted_info_present ) {
            pars.setGuessRootedness( false );
        }
        pars.setSource( nhx );
        final Phylogeny p = pars.next();
        if ( p == null ) {
            throw new PhylogenyParserException( "failed to create phylogeny" );
        }
        String myname = null;
        if ( !ForesterUtil.isEmpty( title ) && !ForesterUtil.isEmpty( name ) ) {
            myname = title.replace( '_', ' ' ).trim() + " (" + name.trim() + ")";
        }
        else if ( !ForesterUtil.isEmpty( title ) ) {
            myname = title.replace( '_', ' ' ).trim();
        }
        else if ( !ForesterUtil.isEmpty( name ) ) {
            myname = name.trim();
        }
        if ( !ForesterUtil.isEmpty( myname ) ) {
            p.setName( myname );
        }
        if ( rooted_info_present ) {
            p.setRooted( is_rooted );
        }
        if ( ( _taxlabels.size() > 0 ) || ( _translate_map.size() > 0 ) ) {
            final PhylogenyNodeIterator it = p.iteratorExternalForward();
            while ( it.hasNext() ) {
                final PhylogenyNode node = it.next();
                if ( ( _translate_map.size() > 0 ) && _translate_map.containsKey( node.getName() ) ) {
                    node.setName( _translate_map.get( node.getName() ).replaceAll( "['\"]+", "" ) );
                }
                else if ( _taxlabels.size() > 0 ) {
                    int i = -1;
                    try {
                        i = Integer.parseInt( node.getName() );
                    }
                    catch ( final NumberFormatException ignored ) {
                        // Not a number, hence not an index into the taxlabels.
                    }
                    // Only 1-based indices that actually exist are translated;
                    // anything else keeps its name instead of failing.
                    if ( ( i > 0 ) && ( i <= _taxlabels.size() ) ) {
                        node.setName( _taxlabels.get( i - 1 ).replaceAll( "['\"]+", "" ) );
                    }
                }
                if ( !_replace_underscores && ( _taxonomy_extraction != TAXONOMY_EXTRACTION.NO ) ) {
                    ParserUtils.extractTaxonomyDataFromNodeName( node, _taxonomy_extraction );
                }
                else if ( _replace_underscores ) {
                    if ( !ForesterUtil.isEmpty( node.getName() ) ) {
                        node.setName( node.getName().replace( '_', ' ' ).trim() );
                    }
                }
                if ( _add_sequences ) {
                    if ( _seqs.containsKey( node.getName() ) ) {
                        final MolecularSequence s = _seqs.get( node.getName() );
                        //TODO need to check for uniqueness when adding seqs....
                        final Sequence ns = new Sequence( s );
                        ns.setMolecularSequenceAligned( true ); //TODO need to check if all same length
                        node.getNodeData().addSequence( ns );
                    }
                }
            }
        }
        _next = p;
    }

    /**
     * Reads lines until the next phylogeny is complete (stored in _next) or the
     * source is exhausted (_next stays null). Empty lines and lines starting
     * with '#' or '>' are skipped.
     */
    private final void getNext() throws IOException, NHXFormatException {
        _next = null;
        String line;
        while ( ( line = _br.readLine() ) != null ) {
            line = line.trim();
            if ( ( line.length() == 0 ) || line.startsWith( "#" ) || line.startsWith( ">" ) ) {
                continue;
            }
            line = ForesterUtil.collapseWhiteSpace( line );
            line = removeWhiteSpaceBeforeSemicolon( line );
            final String line_lc = line.toLowerCase( Locale.ROOT );
            if ( line_lc.startsWith( begin_trees ) ) {
                _in_trees_block = true;
                _in_taxalabels = false;
                _in_translate = false;
                _in_data_block = false;
                _datatype = null;
                _title = "";
            }
            else if ( line_lc.startsWith( taxlabels ) ) {
                //TODO need to be taxa block instead
                _in_trees_block = false;
                _in_taxalabels = true;
                _in_translate = false;
                _in_data_block = false;
                _datatype = null;
            }
            else if ( line_lc.startsWith( translate ) ) {
                _translate_sb = new StringBuilder();
                _in_taxalabels = false;
                _in_translate = true;
                _in_data_block = false;
                _datatype = null;
            }
            else if ( line_lc.startsWith( begin_data ) || line_lc.startsWith( begin_characters ) ) {
                _in_taxalabels = false;
                _in_trees_block = false;
                _in_translate = false;
                _in_data_block = true;
                _datatype = null;
            }
            else if ( _in_trees_block ) {
                if ( processTreesBlockLine( line, line_lc ) ) {
                    return;
                }
            }
            // The keyword line that opened a section is also handed to that
            // section's handler, hence plain ifs rather than an else chain.
            if ( _in_taxalabels ) {
                processTaxlabelsLine( line, line_lc );
            }
            if ( _in_translate ) {
                processTranslateLine( line, line_lc );
            }
            if ( _in_data_block ) {
                processDataBlockLine( line, line_lc );
            }
        }
        if ( _nh.length() > 0 ) {
            // Source ended with an unterminated tree. The buffer has to be
            // cleared after emitting it, otherwise every later call would
            // re-create the same phylogeny and hasNext() would never be false.
            emitPendingTree();
        }
    }

    /**
     * Handles one line inside a trees block.
     *
     * @return true if a phylogeny is now available in _next and reading should
     *         pause
     */
    private boolean processTreesBlockLine( final String line, final String line_lc ) throws IOException {
        final boolean is_title = line_lc.startsWith( "title" );
        final boolean is_link = line_lc.startsWith( "link" );
        final boolean is_end = isEndOfBlock( line_lc );
        if ( is_title ) {
            final Matcher title_m = TITLE_PATTERN.matcher( line );
            if ( title_m.lookingAt() ) {
                _title = title_m.group( 1 );
            }
        }
        else if ( is_link ) {
            // LINK statements are recognized so that they are not mistaken for
            // tree data, but their content is not used.
        }
        else if ( is_end ) {
            _in_trees_block = false;
            _in_tree = false;
            _in_translate = false;
            if ( _nh.length() > 0 ) {
                emitPendingTree();
                if ( _next != null ) {
                    return true;
                }
            }
        }
        else if ( line_lc.startsWith( tree ) || line_lc.startsWith( utree ) ) {
            // A new tree statement first flushes a still-pending previous one.
            final boolean flushed_previous = _nh.length() > 0;
            if ( flushed_previous ) {
                emitPendingTree();
            }
            _in_tree = true;
            final int eq = line.indexOf( '=' );
            if ( eq < 0 ) {
                throw new PhylogenyParserException( "ill-formatted tree statement (missing '='): " + line );
            }
            _nh.append( line.substring( eq ) );
            final Matcher name_matcher = TREE_NAME_PATTERN.matcher( line );
            if ( name_matcher.matches() ) {
                _name = name_matcher.group( 1 ).replaceAll( "['\"]+", "" );
            }
            final Matcher rootedness_matcher = ROOTEDNESS_PATTERN.matcher( line );
            if ( rootedness_matcher.matches() ) {
                _rooted_info_present = true;
                if ( "R".equals( rootedness_matcher.group( 1 ) ) ) {
                    _is_rooted = true;
                }
            }
            if ( flushed_previous && ( _next != null ) ) {
                return true;
            }
        }
        else if ( _in_tree && !_in_translate ) {
            // Continuation line of a multi-line tree statement.
            _nh.append( line );
        }
        // A terminating semicolon completes the current tree statement.
        if ( !is_title && !is_link && !_in_translate && !is_end && line_lc.endsWith( ";" ) ) {
            _in_tree = false;
            _in_translate = false;
            emitPendingTree();
            if ( _next != null ) {
                return true;
            }
        }
        return false;
    }

    /** Collects whitespace-separated labels of a TAXLABELS statement; a ';' ends it. */
    private void processTaxlabelsLine( final String line, final String line_lc ) {
        if ( isEndOfBlock( line_lc ) ) {
            _in_taxalabels = false;
            return;
        }
        for( String label : line.split( "\\s+" ) ) {
            if ( label.toLowerCase( Locale.ROOT ).equals( taxlabels ) ) {
                continue;
            }
            if ( label.endsWith( ";" ) ) {
                _in_taxalabels = false;
                label = label.substring( 0, label.length() - 1 );
            }
            if ( label.length() > 0 ) {
                _taxlabels.add( label );
            }
        }
    }

    /** Accumulates a TRANSLATE statement and evaluates it once its ';' is seen. */
    private void processTranslateLine( final String line, final String line_lc ) throws IOException {
        if ( isEndOfBlock( line_lc ) ) {
            _in_translate = false;
            return;
        }
        _translate_sb.append( " " );
        _translate_sb.append( line.trim() );
        if ( line.endsWith( ";" ) ) {
            _in_translate = false;
            setTranslateKeyValuePairs( _translate_sb );
        }
    }

    /**
     * Handles one line of a DATA/CHARACTERS block: picks up the datatype and,
     * for protein, dna and rna data, stores "id sequence" rows in _seqs.
     */
    private void processDataBlockLine( final String line, final String line_lc ) {
        if ( isEndOfBlock( line_lc ) ) {
            _in_data_block = false;
            _datatype = null;
            return;
        }
        if ( line_lc.startsWith( "link" ) ) {
            // LINK statements are recognized but their content is not used.
            return;
        }
        final Matcher datatype_matcher = DATATYPE_PATTERN.matcher( line_lc );
        if ( datatype_matcher.find() ) {
            _datatype = datatype_matcher.group( 1 );
            return;
        }
        final boolean is_protein = "protein".equals( _datatype );
        final boolean is_dna = "dna".equals( _datatype );
        final boolean is_rna = "rna".equals( _datatype );
        if ( !is_protein && !is_dna && !is_rna ) {
            return;
        }
        String row = line;
        if ( row.endsWith( ";" ) ) {
            // The semicolon closes the matrix; the row before it may still be data.
            _in_data_block = false;
            row = row.substring( 0, row.length() - 1 );
        }
        final Matcher aln_matcher = ALN_PATTERN.matcher( row );
        if ( aln_matcher.matches() ) {
            final String id = aln_matcher.group( 1 );
            final String seq = aln_matcher.group( 2 );
            final MolecularSequence s;
            if ( is_protein ) {
                s = BasicSequence.createAaSequence( id, seq );
            }
            else if ( is_dna ) {
                s = BasicSequence.createDnaSequence( id, seq );
            }
            else {
                s = BasicSequence.createRnaSequence( id, seq );
            }
            _seqs.put( id, s );
        }
    }

    /** Builds a phylogeny from the buffered tree statement and clears the per-tree state. */
    private void emitPendingTree() throws IOException {
        createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
        _nh = new StringBuilder();
        _name = "";
        _rooted_info_present = false;
        _is_rooted = false;
    }

    /** @return true if the lower-cased line starts with one of the block-end keywords */
    private static boolean isEndOfBlock( final String line_lc ) {
        return line_lc.startsWith( end ) || line_lc.startsWith( endblock );
    }

    /**
     * Splits an accumulated TRANSLATE statement at commas and stores each
     * "key value" pair in _translate_map, with quotes removed from the value.
     *
     * @throws IOException if a pair does not have the form "key value"
     */
    private final void setTranslateKeyValuePairs( final StringBuilder translate_sb ) throws IOException {
        String s = translate_sb.toString().trim();
        if ( s.endsWith( ";" ) ) {
            s = s.substring( 0, s.length() - 1 ).trim();
        }
        for( String pair : s.split( "," ) ) {
            // The keyword itself was accumulated with the first pair; drop it.
            final int ti = pair.toLowerCase( Locale.ROOT ).indexOf( "translate" );
            if ( ti > -1 ) {
                pair = pair.substring( ti + "translate".length() );
            }
            final Matcher m = TRANSLATE_PATTERN.matcher( pair );
            if ( !m.find() ) {
                throw new IOException( "ill-formatted translate values: " + pair );
            }
            final String key = m.group( 1 );
            String value = m.group( 2 ).replaceAll( "\'", "" ).replaceAll( "\"", "" ).trim();
            if ( value.endsWith( ";" ) ) {
                value = value.substring( 0, value.length() - 1 );
            }
            _translate_map.put( key, value );
        }
    }

    private final static String removeWhiteSpaceBeforeSemicolon( final String s ) {
        return s.replaceAll( "\\s+;", ";" );
    }
}