2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26 package org.forester.io.parsers.nexus;
28 import java.io.BufferedReader;
29 import java.io.IOException;
30 import java.util.ArrayList;
31 import java.util.HashMap;
32 import java.util.List;
34 import java.util.regex.Matcher;
35 import java.util.regex.Pattern;
37 import org.forester.archaeopteryx.Constants;
38 import org.forester.io.parsers.PhylogenyParser;
39 import org.forester.io.parsers.nhx.NHXFormatException;
40 import org.forester.io.parsers.nhx.NHXParser;
41 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
42 import org.forester.io.parsers.util.ParserUtils;
43 import org.forester.io.parsers.util.PhylogenyParserException;
44 import org.forester.phylogeny.Phylogeny;
45 import org.forester.phylogeny.PhylogenyNode;
46 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
47 import org.forester.phylogeny.factories.PhylogenyFactory;
48 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
49 import org.forester.util.ForesterUtil;
51 public class NexusPhylogeniesParser implements PhylogenyParser {
53 final private static String begin_trees = NexusConstants.BEGIN_TREES.toLowerCase();
54 final private static String taxlabels = NexusConstants.TAXLABELS.toLowerCase();
55 final private static String translate = NexusConstants.TRANSLATE.toLowerCase();
56 final private static String tree = NexusConstants.TREE.toLowerCase();
57 final private static String utree = NexusConstants.UTREE.toLowerCase();
58 final private static String end = NexusConstants.END.toLowerCase();
59 final private static String endblock = "endblock";
60 final private static Pattern TREE_NAME_PATTERN = Pattern.compile( "\\s*.?Tree\\s+(.+?)\\s*=.+",
61 Pattern.CASE_INSENSITIVE );
62 final private static Pattern ROOTEDNESS_PATTERN = Pattern.compile( ".+=\\s*\\[&([R|U])\\].*" );
63 private Object _nexus_source;
64 private List<Phylogeny> _phylogenies;
65 private List<String> _taxlabels;
66 private Map<String, String> _translate_map;
67 private boolean _replace_underscores = NHXParser.REPLACE_UNDERSCORES_DEFAULT;
68 private boolean _ignore_quotes_in_nh_data = Constants.NH_PARSING_IGNORE_QUOTES_DEFAULT;
69 private TAXONOMY_EXTRACTION _taxonomy_extraction = NHXParser.TAXONOMY_EXTRACTION_DEFAULT;
// Reads the NEXUS source line by line, hands each completed tree to
// createPhylogeny(), and returns the result of getPhylogeniesAsArray().
// Declared to throw IOException and NHXFormatException; nothing in the
// visible body throws explicitly.
//
// NOTE(review): this listing lacks statements the method depends on: the
// declarations of `line` and `name`, every assignment that would set
// in_taxalabels / in_translate / in_tree / is_rooted to true, the bodies of
// some branches, and all closing braces. No call to reset() is visible
// either, although createPhylogeny() reads the collections that reset()
// creates. Confirm against the complete file before changing control flow.
72 public Phylogeny[] parse() throws IOException, NHXFormatException {
74 final BufferedReader reader = ParserUtils.createReader( getNexusSource() );
// Text of the tree currently being read; passed to createPhylogeny() and
// then replaced by a fresh builder.
77 StringBuilder nhx = new StringBuilder();
// Raw text of the TRANSLATE command, collected until a line ends in ';'.
78 final StringBuilder translate_sb = new StringBuilder();
79 boolean in_trees_block = false;
80 boolean in_taxalabels = false;
81 boolean in_translate = false;
82 boolean in_tree = false;
83 boolean rooted_info_present = false;
84 boolean is_rooted = false;
85 while ( ( line = reader.readLine() ) != null ) {
// Skip empty lines and lines starting with '#' or '>'.
87 if ( ( line.length() > 0 ) && !line.startsWith( "#" ) && !line.startsWith( ">" ) ) {
// Normalise the line: ForesterUtil.collapseWhiteSpace, then drop whitespace
// standing in front of ';'.
88 line = ForesterUtil.collapseWhiteSpace( line );
89 line = removeWhiteSpaceBeforeSemicolon( line );
// All keyword checks below use this lower-cased copy.
90 final String line_lc = line.toLowerCase();
// Start of a trees block.
91 if ( line_lc.startsWith( begin_trees ) ) {
92 in_trees_block = true;
93 in_taxalabels = false;
// Start of the taxon label list.
96 else if ( line_lc.startsWith( taxlabels ) ) {
97 in_trees_block = false;
// Start of the TRANSLATE command.
101 else if ( line_lc.startsWith( translate ) ) {
102 in_taxalabels = false;
105 else if ( in_trees_block ) {
106 //FIXME TODO need to work on this "title" and "link"
107 if ( line_lc.startsWith( "title" ) || line_lc.startsWith( "link" ) ) {
// End of the trees block: flush a tree that is still pending.
110 else if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
111 in_trees_block = false;
113 in_translate = false;
114 if ( nhx.length() > 0 ) {
115 createPhylogeny( name, nhx, rooted_info_present, is_rooted );
116 nhx = new StringBuilder();
118 rooted_info_present = false;
// A new TREE/UTREE statement: flush the previous tree first if one is pending.
122 else if ( line_lc.startsWith( tree ) || ( line_lc.startsWith( utree ) ) ) {
123 if ( nhx.length() > 0 ) {
124 createPhylogeny( name, nhx, rooted_info_present, is_rooted );
125 nhx = new StringBuilder();
127 rooted_info_present = false;
// Keep everything from '=' onward (the '=' included) as tree text.
// NOTE(review): indexOf returns -1 when the line holds no '=', in which case
// substring throws StringIndexOutOfBoundsException.
131 nhx.append( line.substring( line.indexOf( '=' ) ) );
// Tree name: group 1 of TREE_NAME_PATTERN with quote characters removed.
132 final Matcher name_matcher = TREE_NAME_PATTERN.matcher( line );
133 if ( name_matcher.matches() ) {
134 name = name_matcher.group( 1 );
135 name = name.replaceAll( "['\"]+", "" );
// Rootedness marker following '='; group 1 is the flag character.
137 final Matcher rootedness_matcher = ROOTEDNESS_PATTERN.matcher( line );
138 if ( rootedness_matcher.matches() ) {
139 final String s = rootedness_matcher.group( 1 );
// NOTE(review): `line` was already appended to nhx above, so this rewrite
// does not change the text stored in nhx.
140 line = line.replaceAll( "\\[\\&.\\]", "" );
141 rooted_info_present = true;
142 if ( s.toUpperCase().equals( "R" ) ) {
// Branch for further lines of a tree that is already open (its body is not
// part of this listing).
147 else if ( in_tree && !in_translate ) {
// A line ending in ';' that is not title/link/end/endblock and not inside
// TRANSLATE completes the current tree.
150 if ( !line_lc.startsWith( "title" ) && !line_lc.startsWith( "link" ) && !in_translate
151 && !line_lc.startsWith( end ) && !line_lc.startsWith( endblock ) && line_lc.endsWith( ";" ) ) {
153 in_translate = false;
154 createPhylogeny( name, nhx, rooted_info_present, is_rooted );
155 nhx = new StringBuilder();
157 rooted_info_present = false;
// Taxon labels: split the line on whitespace, skip the TAXLABELS keyword
// itself, and treat a token ending in ';' as the end of the list.
161 if ( in_taxalabels ) {
162 if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
163 in_taxalabels = false;
166 final String[] labels = line.split( "\\s+" );
167 for( String label : labels ) {
168 if ( !label.toLowerCase().equals( taxlabels ) ) {
169 if ( label.endsWith( ";" ) ) {
170 in_taxalabels = false;
171 label = label.substring( 0, label.length() - 1 );
173 if ( label.length() > 0 ) {
174 getTaxlabels().add( label );
// TRANSLATE: append each line (space separated) and evaluate the collected
// text once a line ends in ';'.
180 if ( in_translate ) {
181 if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
182 in_translate = false;
185 translate_sb.append( " " );
186 translate_sb.append( line.trim() );
187 if ( line.endsWith( ";" ) ) {
188 in_translate = false;
189 setTranslateKeyValuePairs( translate_sb );
// Flush tree text that is still pending when the input ends.
195 if ( nhx.length() > 0 ) {
196 createPhylogeny( name, nhx, rooted_info_present, is_rooted );
198 return getPhylogeniesAsArray();
201 public void setIgnoreQuotes( final boolean ignore_quotes_in_nh_data ) {
202 _ignore_quotes_in_nh_data = ignore_quotes_in_nh_data;
205 public void setReplaceUnderscores( final boolean replace_underscores ) {
206 _replace_underscores = replace_underscores;
210 public void setSource( final Object nexus_source ) throws PhylogenyParserException, IOException {
211 if ( nexus_source == null ) {
212 throw new PhylogenyParserException( getClass() + ": attempt to parse null object." );
214 _nexus_source = nexus_source;
217 public void setTaxonomyExtraction( final TAXONOMY_EXTRACTION taxonomy_extraction ) {
218 _taxonomy_extraction = taxonomy_extraction;
221 private void createPhylogeny( final String name,
222 final StringBuilder nhx,
223 final boolean rooted_info_present,
224 final boolean is_rooted ) throws IOException {
225 final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
226 final NHXParser pars = new NHXParser();
227 if ( ( getTaxlabels().size() < 1 ) && ( getTranslateMap().size() < 1 ) ) {
228 pars.setTaxonomyExtraction( getTaxonomyExtraction() );
229 pars.setReplaceUnderscores( isReplaceUnderscores() );
230 pars.setIgnoreQuotes( isIgnoreQuotes() );
233 pars.setTaxonomyExtraction( TAXONOMY_EXTRACTION.NO );
234 pars.setReplaceUnderscores( false );
235 pars.setIgnoreQuotes( false );
237 if ( rooted_info_present ) {
238 pars.setGuessRootedness( false );
240 final Phylogeny p = factory.create( nhx, pars )[ 0 ];
242 if ( rooted_info_present ) {
243 p.setRooted( is_rooted );
245 if ( ( getTaxlabels().size() > 0 ) || ( getTranslateMap().size() > 0 ) ) {
246 final PhylogenyNodeIterator it = p.iteratorExternalForward();
247 while ( it.hasNext() ) {
248 final PhylogenyNode node = it.next();
249 if ( ( getTranslateMap().size() > 0 ) && getTranslateMap().containsKey( node.getName() ) ) {
250 node.setName( getTranslateMap().get( node.getName() ).replaceAll( "['\"]+", "" ) );
252 else if ( getTaxlabels().size() > 0 ) {
255 i = Integer.parseInt( node.getName() );
257 catch ( final NumberFormatException e ) {
261 node.setName( getTaxlabels().get( i - 1 ).replaceAll( "['\"]+", "" ) );
264 if ( !isReplaceUnderscores() && ( ( getTaxonomyExtraction() != TAXONOMY_EXTRACTION.NO ) ) ) {
265 ParserUtils.extractTaxonomyDataFromNodeName( node, getTaxonomyExtraction() );
266 // final String tax = ParserUtils.extractTaxonomyCodeFromNodeName( node.getName(),
267 // getTaxonomyExtraction() );
268 // if ( !ForesterUtil.isEmpty( tax ) ) {
269 // if ( !node.getNodeData().isHasTaxonomy() ) {
270 // node.getNodeData().setTaxonomy( new Taxonomy() );
272 // node.getNodeData().getTaxonomy().setTaxonomyCode( tax );
277 getPhylogenies().add( p );
280 private Object getNexusSource() {
281 return _nexus_source;
284 private List<Phylogeny> getPhylogenies() {
288 private Phylogeny[] getPhylogeniesAsArray() {
289 final Phylogeny[] p = new Phylogeny[ getPhylogenies().size() ];
290 for( int i = 0; i < getPhylogenies().size(); ++i ) {
291 p[ i ] = getPhylogenies().get( i );
296 private List<String> getTaxlabels() {
300 private TAXONOMY_EXTRACTION getTaxonomyExtraction() {
301 return _taxonomy_extraction;
304 private Map<String, String> getTranslateMap() {
305 return _translate_map;
308 private boolean isIgnoreQuotes() {
309 return _ignore_quotes_in_nh_data;
312 private boolean isReplaceUnderscores() {
313 return _replace_underscores;
316 private void reset() {
317 setPhylogenies( new ArrayList<Phylogeny>() );
318 setTaxlabels( new ArrayList<String>() );
319 setTranslateMap( new HashMap<String, String>() );
322 private void setPhylogenies( final ArrayList<Phylogeny> phylogenies ) {
323 _phylogenies = phylogenies;
326 private void setTaxlabels( final List<String> taxlabels ) {
327 _taxlabels = taxlabels;
330 private void setTranslateKeyValuePairs( final StringBuilder translate_sb ) throws IOException {
331 String s = translate_sb.toString().trim();
332 if ( s.endsWith( ";" ) ) {
333 s = s.substring( 0, s.length() - 1 ).trim();
335 for( final String pair : s.split( "," ) ) {
336 final String[] kv = pair.trim().split( "\\s+" );
337 if ( ( kv.length < 2 ) || ( kv.length > 3 ) ) {
338 throw new IOException( "ill-formatted translate values: " + translate_sb );
340 if ( ( kv.length == 3 ) && !kv[ 0 ].toLowerCase().trim().equals( translate ) ) {
341 throw new IOException( "ill-formatted translate values: " + translate_sb );
345 if ( kv.length == 3 ) {
353 if ( value.endsWith( ";" ) ) {
354 value = value.substring( 0, value.length() - 1 );
356 getTranslateMap().put( key, value );
360 private void setTranslateMap( final Map<String, String> translate_map ) {
361 _translate_map = translate_map;
364 private static String removeWhiteSpaceBeforeSemicolon( final String s ) {
365 return s.replaceAll( "\\s+;", ";" );