2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26 package org.forester.io.parsers.nexus;
28 import java.io.BufferedReader;
29 import java.io.FileNotFoundException;
30 import java.io.IOException;
31 import java.util.ArrayList;
32 import java.util.HashMap;
33 import java.util.List;
35 import java.util.regex.Matcher;
36 import java.util.regex.Pattern;
38 import org.forester.archaeopteryx.Constants;
39 import org.forester.io.parsers.IteratingPhylogenyParser;
40 import org.forester.io.parsers.PhylogenyParser;
41 import org.forester.io.parsers.nhx.NHXFormatException;
42 import org.forester.io.parsers.nhx.NHXParser;
43 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
44 import org.forester.io.parsers.util.ParserUtils;
45 import org.forester.io.parsers.util.PhylogenyParserException;
46 import org.forester.phylogeny.Phylogeny;
47 import org.forester.phylogeny.PhylogenyNode;
48 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
49 import org.forester.util.ForesterUtil;
// Parser for the phylogenies found in the trees block of a Nexus source.
// Offers iterator-style access (hasNext()/next()) as well as parse(), which
// returns an array of phylogenies.
51 public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, PhylogenyParser {
// Lower-cased Nexus keywords; input lines are lower-cased before being
// compared against these with startsWith().
53 final private static String begin_trees = NexusConstants.BEGIN_TREES.toLowerCase();
54 final private static String taxlabels = NexusConstants.TAXLABELS.toLowerCase();
55 final private static String translate = NexusConstants.TRANSLATE.toLowerCase();
56 final private static String tree = NexusConstants.TREE.toLowerCase();
57 final private static String utree = NexusConstants.UTREE.toLowerCase();
58 final private static String end = NexusConstants.END.toLowerCase();
// Alternative terminator accepted wherever "end" is accepted.
59 final private static String endblock = "endblock";
// Captures (group 1) the tree name located between the "tree" keyword and
// the '='. The ".?" admits one optional leading character before "Tree"
// (e.g. the 'u' of "utree"); matching is case-insensitive.
60 final private static Pattern TREE_NAME_PATTERN = Pattern.compile( "\\s*.?Tree\\s+(.+?)\\s*=.+",
61 Pattern.CASE_INSENSITIVE );
// Captures (group 1) the rootedness flag of a "[&R]" / "[&U]" tag following
// the '='.
// NOTE(review): "[R|U]" is a character class, so a literal '|' is accepted
// as well; "[RU]" looks like the intent. The pattern is also case-sensitive
// even though the caller upper-cases the captured flag -- confirm whether
// "[&r]" / "[&u]" should match.
62 final private static Pattern ROOTEDNESS_PATTERN = Pattern.compile( ".+=\\s*\\[&([R|U])\\].*" );
// Source object passed to ParserUtils.createReader() in reset().
63 private Object _nexus_source;
// Labels collected from the taxlabels command; looked up with (n - 1) when
// an external node name parses as the integer n.
64 private List<String> _taxlabels;
// Key -> name pairs collected from the translate command.
65 private Map<String, String> _translate_map;
// User options; they are forwarded to the NHXParser only when neither
// taxlabels nor translate entries were found (see createPhylogeny()).
66 private boolean _replace_underscores = NHXParser.REPLACE_UNDERSCORES_DEFAULT;
67 private boolean _ignore_quotes_in_nh_data = Constants.NH_PARSING_IGNORE_QUOTES_DEFAULT;
68 private TAXONOMY_EXTRACTION _taxonomy_extraction = NHXParser.TAXONOMY_EXTRACTION_DEFAULT;
// Phylogeny handed out by next().
69 private Phylogeny _next;
// Reader over _nexus_source, (re)created in reset().
70 private BufferedReader _br;
// True while the lines being read belong to the trees block.
71 private boolean _in_trees_block;
// Accumulates the tree description text (from the '=' onwards) of the tree
// currently being read.
72 private StringBuilder _nh;
// Accumulates the text of the translate command until its terminating ';'.
74 private StringBuilder _translate_sb;
// True while taxlabels / translate entries are being collected.
75 private boolean _in_taxalabels;
76 private boolean _in_translate;
// Rootedness of the current tree; _is_rooted is passed on to the phylogeny
// only when _rooted_info_present is true.
77 private boolean _is_rooted;
78 private boolean _rooted_info_present;
// Consulted for lines inside the trees block that do not start with a
// recognized keyword.
79 private boolean _in_tree;
// NOTE(review): the body of this method is not visible in this excerpt;
// presumably it reports whether a phylogeny is currently buffered in _next
// -- confirm.
82 public final boolean hasNext() {
// Takes the phylogeny currently held in _next as the value to hand out.
// NOTE(review): the rest of the body is not visible in this excerpt;
// presumably it advances the parser before returning -- confirm.
87 public final Phylogeny next() throws NHXFormatException, IOException {
88 final Phylogeny phy = _next;
// Gathers phylogenies in a list and copies them into an array of the same
// size.
// NOTE(review): the statements that fill the list and the loop body are not
// visible in this excerpt.
94 public final Phylogeny[] parse() throws IOException {
96 final List<Phylogeny> l = new ArrayList<Phylogeny>();
100 final Phylogeny[] p = new Phylogeny[ l.size() ];
101 for( int i = 0; i < l.size(); ++i ) {
// Re-initializes the parsing state: fresh (empty) taxlabels list, translate
// map and text buffers, block flags cleared, and a new reader created on
// the configured source.
108 public final void reset() throws FileNotFoundException, IOException {
109 _taxlabels = new ArrayList<String>();
110 _translate_map = new HashMap<String, String>();
111 _nh = new StringBuilder();
113 _translate_sb = new StringBuilder();
115 _in_trees_block = false;
116 _in_taxalabels = false;
117 _in_translate = false;
119 _rooted_info_present = false;
// Any previously created reader is simply replaced here; no close() is
// visible in this excerpt.
121 _br = ParserUtils.createReader( _nexus_source );
// Sets whether quotes are to be ignored in the tree data. The value is
// forwarded to the NHXParser only when the source supplied neither
// taxlabels nor translate entries (see createPhylogeny()).
125 public final void setIgnoreQuotes( final boolean ignore_quotes_in_nh_data ) {
126 _ignore_quotes_in_nh_data = ignore_quotes_in_nh_data;
// Sets the underscore-replacement option. The value is forwarded to the
// NHXParser only when the source supplied neither taxlabels nor translate
// entries; it also gates taxonomy extraction after label translation (see
// createPhylogeny()).
129 public final void setReplaceUnderscores( final boolean replace_underscores ) {
130 _replace_underscores = replace_underscores;
// Stores the object to parse from.
// Throws PhylogenyParserException when nexus_source is null.
// NOTE(review): statements following the assignment are not visible in
// this excerpt.
134 public final void setSource( final Object nexus_source ) throws PhylogenyParserException, IOException {
135 if ( nexus_source == null ) {
136 throw new PhylogenyParserException( "attempt to parse null object" );
138 _nexus_source = nexus_source;
// Sets the taxonomy-extraction mode applied to node names (see
// createPhylogeny() for where it takes effect).
142 public final void setTaxonomyExtraction( final TAXONOMY_EXTRACTION taxonomy_extraction ) {
143 _taxonomy_extraction = taxonomy_extraction;
// Builds a phylogeny from the accumulated tree text using an NHXParser and
// then maps external node names through the translate map / taxlabels.
//
// name                - tree name (its use is not visible in this excerpt)
// nhx                 - tree description text handed to the NHXParser
// rooted_info_present - true when an explicit rootedness tag was seen
// is_rooted           - rootedness to apply when rooted_info_present is true
//
// Throws PhylogenyParserException ("failed to create phylogeny"); the
// guarding condition is not visible in this excerpt.
146 private final void createPhylogeny( final String name,
147 final StringBuilder nhx,
148 final boolean rooted_info_present,
149 final boolean is_rooted ) throws IOException {
151 final NHXParser pars = new NHXParser();
// Without taxlabels/translate entries the node names are final, so the
// user's options go straight to the NHX parser.
152 if ( ( _taxlabels.size() < 1 ) && ( _translate_map.size() < 1 ) ) {
153 pars.setTaxonomyExtraction( _taxonomy_extraction );
154 pars.setReplaceUnderscores( _replace_underscores );
155 pars.setIgnoreQuotes( _ignore_quotes_in_nh_data );
// Otherwise (presumably the else branch -- the keyword line is not
// visible) names are parsed untouched and post-processed below.
158 pars.setTaxonomyExtraction( TAXONOMY_EXTRACTION.NO );
159 pars.setReplaceUnderscores( false );
160 pars.setIgnoreQuotes( false );
// An explicit rootedness tag overrides the parser's own guess.
162 if ( rooted_info_present ) {
163 pars.setGuessRootedness( false );
165 pars.setSource( nhx );
166 final Phylogeny p = pars.next();
168 throw new PhylogenyParserException( "failed to create phylogeny" );
171 if ( rooted_info_present ) {
172 p.setRooted( is_rooted );
174 if ( ( _taxlabels.size() > 0 ) || ( _translate_map.size() > 0 ) ) {
175 final PhylogenyNodeIterator it = p.iteratorExternalForward();
176 while ( it.hasNext() ) {
177 final PhylogenyNode node = it.next();
// A translate entry takes precedence; single and double quotes are
// stripped from the translated name.
178 if ( ( _translate_map.size() > 0 ) && _translate_map.containsKey( node.getName() ) ) {
179 node.setName( _translate_map.get( node.getName() ).replaceAll( "['\"]+", "" ) );
181 else if ( _taxlabels.size() > 0 ) {
// The node name is interpreted as a 1-based index into the taxlabels.
184 i = Integer.parseInt( node.getName() );
186 catch ( final NumberFormatException e ) {
// NOTE(review): no check of i against _taxlabels.size() is visible in
// this excerpt; an index beyond the list would throw
// IndexOutOfBoundsException -- confirm a guard exists.
190 node.setName( _taxlabels.get( i - 1 ).replaceAll( "['\"]+", "" ) );
// Taxonomy extraction was disabled in the NHX parser above, so it is
// applied here on the translated name instead.
193 if ( !_replace_underscores && ( ( _taxonomy_extraction != TAXONOMY_EXTRACTION.NO ) ) ) {
194 ParserUtils.extractTaxonomyDataFromNodeName( node, _taxonomy_extraction );
195 // final String tax = ParserUtils.extractTaxonomyCodeFromNodeName( node.getName(),
196 // getTaxonomyExtraction() );
197 // if ( !ForesterUtil.isEmpty( tax ) ) {
198 // if ( !node.getNodeData().isHasTaxonomy() ) {
199 // node.getNodeData().setTaxonomy( new Taxonomy() );
201 // node.getNodeData().getTaxonomy().setTaxonomyCode( tax );
// Reads the source line by line and drives a small state machine over the
// Nexus commands: "begin trees", "taxlabels", "translate", "tree"/"utree"
// and "end"/"endblock". Tree text is accumulated in _nh and turned into a
// phylogeny via createPhylogeny() when the tree is complete; taxlabels and
// translate entries are collected along the way.
// NOTE(review): several statements (e.g. what follows each
// "if ( _next != null )" check) are not visible in this excerpt.
209 private final void getNext() throws IOException, NHXFormatException {
212 while ( ( line = _br.readLine() ) != null ) {
// Empty lines and lines starting with '#' or '>' are skipped.
214 if ( ( line.length() > 0 ) && !line.startsWith( "#" ) && !line.startsWith( ">" ) ) {
215 line = ForesterUtil.collapseWhiteSpace( line );
216 line = removeWhiteSpaceBeforeSemicolon( line );
// Keyword matching is done on a lower-cased copy; the original casing is
// kept in "line" for names and tree text.
217 final String line_lc = line.toLowerCase();
218 if ( line_lc.startsWith( begin_trees ) ) {
219 _in_trees_block = true;
220 _in_taxalabels = false;
221 _in_translate = false;
223 else if ( line_lc.startsWith( taxlabels ) ) {
224 _in_trees_block = false;
225 _in_taxalabels = true;
226 _in_translate = false;
228 else if ( line_lc.startsWith( translate ) ) {
229 _in_taxalabels = false;
230 _in_translate = true;
232 else if ( _in_trees_block ) {
233 //FIXME TODO need to work on this "title" and "link"
234 if ( line_lc.startsWith( "title" ) || line_lc.startsWith( "link" ) ) {
// End of the trees block: flush a pending tree, if any.
237 else if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
238 _in_trees_block = false;
240 _in_translate = false;
241 if ( _nh.length() > 0 ) {
242 createPhylogeny( _name, _nh, _rooted_info_present, _is_rooted );
243 _nh = new StringBuilder();
245 _rooted_info_present = false;
247 if ( _next != null ) {
// Start of a new tree: a tree still pending in _nh is finished first.
252 else if ( line_lc.startsWith( tree ) || ( line_lc.startsWith( utree ) ) ) {
253 boolean might = false;
254 if ( _nh.length() > 0 ) {
256 createPhylogeny( _name, _nh, _rooted_info_present, _is_rooted );
257 _nh = new StringBuilder();
259 _rooted_info_present = false;
// NOTE(review): indexOf('=') is -1 for a tree line without '=', and
// substring(-1) throws StringIndexOutOfBoundsException; no guard is
// visible in this excerpt.
263 _nh.append( line.substring( line.indexOf( '=' ) ) );
264 final Matcher name_matcher = TREE_NAME_PATTERN.matcher( line );
265 if ( name_matcher.matches() ) {
266 _name = name_matcher.group( 1 );
// Strip single and double quotes from the tree name.
267 _name = _name.replaceAll( "['\"]+", "" );
269 final Matcher rootedness_matcher = ROOTEDNESS_PATTERN.matcher( line );
270 if ( rootedness_matcher.matches() ) {
271 final String s = rootedness_matcher.group( 1 );
// NOTE(review): the tree text was already appended to _nh above, so in the
// visible code this replacement does not remove the tag from _nh.
272 line = line.replaceAll( "\\[\\&.\\]", "" );
273 _rooted_info_present = true;
274 if ( s.toUpperCase().equals( "R" ) ) {
278 if ( might && ( _next != null ) ) {
282 else if ( _in_tree && !_in_translate ) {
// A line ending in ';' that is not a title/link/end command completes the
// current tree.
285 if ( !line_lc.startsWith( "title" ) && !line_lc.startsWith( "link" ) && !_in_translate
286 && !line_lc.startsWith( end ) && !line_lc.startsWith( endblock ) && line_lc.endsWith( ";" ) ) {
288 _in_translate = false;
289 createPhylogeny( _name, _nh, _rooted_info_present, _is_rooted );
290 _nh = new StringBuilder();
292 _rooted_info_present = false;
294 if ( _next != null ) {
299 if ( _in_taxalabels ) {
300 if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
301 _in_taxalabels = false;
// Labels are whitespace-separated; the keyword itself is skipped and a
// trailing ';' ends the taxlabels command.
304 final String[] labels = line.split( "\\s+" );
305 for( String label : labels ) {
306 if ( !label.toLowerCase().equals( taxlabels ) ) {
307 if ( label.endsWith( ";" ) ) {
308 _in_taxalabels = false;
309 label = label.substring( 0, label.length() - 1 );
311 if ( label.length() > 0 ) {
312 _taxlabels.add( label );
318 if ( _in_translate ) {
319 if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
320 _in_translate = false;
// The translate command may span several lines; its text is buffered until
// a line ends with ';' and then parsed in one go.
323 _translate_sb.append( " " );
324 _translate_sb.append( line.trim() );
325 if ( line.endsWith( ";" ) ) {
326 _in_translate = false;
327 setTranslateKeyValuePairs( _translate_sb );
// Tree text still pending after the last processed line is turned into a
// phylogeny.
333 if ( _nh.length() > 0 ) {
334 createPhylogeny( _name, _nh, _rooted_info_present, _is_rooted );
335 if ( _next != null ) {
// Parses the buffered text of a translate command into _translate_map.
// The text is trimmed, a trailing ';' is dropped, and the remainder is
// split on ',' into pairs; each pair is split on whitespace.
//
// Throws IOException ("ill-formatted translate values: ...") when a pair
// has fewer than 2 or more than 3 tokens, or has 3 tokens and the first is
// not the translate keyword.
// NOTE(review): because the splitting is unconditional, a label containing
// a comma or whitespace (e.g. a quoted multi-word label) is mis-split or
// rejected.
341 private final void setTranslateKeyValuePairs( final StringBuilder translate_sb ) throws IOException {
342 String s = translate_sb.toString().trim();
343 if ( s.endsWith( ";" ) ) {
344 s = s.substring( 0, s.length() - 1 ).trim();
346 for( final String pair : s.split( "," ) ) {
347 final String[] kv = pair.trim().split( "\\s+" );
348 if ( ( kv.length < 2 ) || ( kv.length > 3 ) ) {
349 throw new IOException( "ill-formatted translate values: " + translate_sb );
// Three tokens are only legal for the first pair, which still carries the
// leading translate keyword.
351 if ( ( kv.length == 3 ) && !kv[ 0 ].toLowerCase().trim().equals( translate ) ) {
352 throw new IOException( "ill-formatted translate values: " + translate_sb );
356 if ( kv.length == 3 ) {
364 if ( value.endsWith( ";" ) ) {
365 value = value.substring( 0, value.length() - 1 );
367 _translate_map.put( key, value );
// Returns s with every run of whitespace that directly precedes a ';'
// removed, e.g. "a  ;" becomes "a;".
371 private final static String removeWhiteSpaceBeforeSemicolon( final String s ) {
372 return s.replaceAll( "\\s+;", ";" );
// Returns the fixed display name of this parser.
376 public String getName() {
377 return "Nexus Phylogenies Parser";