879a0e18e425c29cc5c91a52d02c98a4181324e0
[jalview.git] / forester / java / src / org / forester / io / parsers / nexus / NexusPhylogeniesParser.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // All rights reserved
8 // 
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 // 
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 //
23 // Contact: phylosoft @ gmail . com
24 // WWW: www.phylosoft.org/forester
25
26 package org.forester.io.parsers.nexus;
27
28 import java.io.BufferedReader;
29 import java.io.IOException;
30 import java.util.ArrayList;
31 import java.util.HashMap;
32 import java.util.List;
33 import java.util.Map;
34 import java.util.regex.Matcher;
35 import java.util.regex.Pattern;
36
37 import org.forester.archaeopteryx.Constants;
38 import org.forester.io.parsers.PhylogenyParser;
39 import org.forester.io.parsers.nhx.NHXFormatException;
40 import org.forester.io.parsers.nhx.NHXParser;
41 import org.forester.io.parsers.util.ParserUtils;
42 import org.forester.io.parsers.util.PhylogenyParserException;
43 import org.forester.phylogeny.Phylogeny;
44 import org.forester.phylogeny.PhylogenyNode;
45 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
46 import org.forester.phylogeny.factories.PhylogenyFactory;
47 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
48 import org.forester.util.ForesterUtil;
49
50 public class NexusPhylogeniesParser implements PhylogenyParser {
51
52     final private static String  begin_trees               = NexusConstants.BEGIN_TREES.toLowerCase();
53     final private static String  taxlabels                 = NexusConstants.TAXLABELS.toLowerCase();
54     final private static String  translate                 = NexusConstants.TRANSLATE.toLowerCase();
55     final private static String  tree                      = NexusConstants.TREE.toLowerCase();
56     final private static String  utree                     = NexusConstants.UTREE.toLowerCase();
57     final private static String  end                       = NexusConstants.END.toLowerCase();
58     final private static String  endblock                  = "endblock";
59     final private static Pattern TREE_NAME_PATTERN         = Pattern.compile( "\\s*.?Tree\\s+(.+?)\\s*=.+",
60                                                                               Pattern.CASE_INSENSITIVE );
61     final private static Pattern ROOTEDNESS_PATTERN        = Pattern.compile( ".+=\\s*\\[&([R|U])\\].*" );
62     private Object               _nexus_source;
63     private List<Phylogeny>      _phylogenies;
64     private List<String>         _taxlabels;
65     private Map<String, String>  _translate_map;
66     private boolean              _replace_underscores      = NHXParser.REPLACE_UNDERSCORES_DEFAULT;
67     private boolean              _ignore_quotes_in_nh_data = Constants.NH_PARSING_IGNORE_QUOTES_DEFAULT;
68
69     private void createPhylogeny( final String name,
70                                   final StringBuffer nhx,
71                                   final boolean rooted_info_present,
72                                   final boolean is_rooted ) throws IOException {
73         final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
74         final NHXParser pars = new NHXParser();
75         pars.setTaxonomyExtraction( ForesterUtil.TAXONOMY_EXTRACTION.NO );
76         pars.setReplaceUnderscores( isReplaceUnderscores() );
77         pars.setIgnoreQuotes( isIgnoreQuotes() );
78         if ( rooted_info_present ) {
79             pars.setGuessRootedness( false );
80         }
81         final Phylogeny p = factory.create( nhx, pars )[ 0 ];
82         p.setName( name );
83         if ( rooted_info_present ) {
84             p.setRooted( is_rooted );
85         }
86         if ( ( getTaxlabels().size() > 0 ) || ( getTranslateMap().size() > 0 ) ) {
87             final PhylogenyNodeIterator it = p.iteratorExternalForward();
88             while ( it.hasNext() ) {
89                 final PhylogenyNode node = it.next();
90                 if ( ( getTranslateMap().size() > 0 ) && getTranslateMap().containsKey( node.getName() ) ) {
91                     node.setName( getTranslateMap().get( node.getName() ).replaceAll( "['\"]+", "" ) );
92                 }
93                 else if ( getTaxlabels().size() > 0 ) {
94                     int i = -1;
95                     try {
96                         i = Integer.parseInt( node.getName() );
97                     }
98                     catch ( final NumberFormatException e ) {
99                         // Ignore.
100                     }
101                     if ( i > 0 ) {
102                         node.setName( getTaxlabels().get( i - 1 ).replaceAll( "['\"]+", "" ) );
103                     }
104                 }
105             }
106         }
107         getPhylogenies().add( p );
108     }
109
110     private Object getNexusSource() {
111         return _nexus_source;
112     }
113
114     private List<Phylogeny> getPhylogenies() {
115         return _phylogenies;
116     }
117
118     private Phylogeny[] getPhylogeniesAsArray() {
119         final Phylogeny[] p = new Phylogeny[ getPhylogenies().size() ];
120         for( int i = 0; i < getPhylogenies().size(); ++i ) {
121             p[ i ] = getPhylogenies().get( i );
122         }
123         return p;
124     }
125
126     private List<String> getTaxlabels() {
127         return _taxlabels;
128     }
129
130     private Map<String, String> getTranslateMap() {
131         return _translate_map;
132     }
133
134     private boolean isIgnoreQuotes() {
135         return _ignore_quotes_in_nh_data;
136     }
137
138     private boolean isReplaceUnderscores() {
139         return _replace_underscores;
140     }
141
142     public Phylogeny[] parse() throws IOException, NHXFormatException {
143         reset();
144         final BufferedReader reader = ParserUtils.createReader( getNexusSource() );
145         String line;
146         String name = "";
147         StringBuffer nhx = new StringBuffer();
148         final StringBuffer translate_sb = new StringBuffer();
149         boolean in_trees_block = false;
150         boolean in_taxalabels = false;
151         boolean in_translate = false;
152         final boolean in_comment = false;
153         boolean in_tree = false;
154         boolean rooted_info_present = false;
155         boolean is_rooted = false;
156         while ( ( line = reader.readLine() ) != null ) {
157             line = line.trim();
158             if ( ( line.length() > 0 ) && !line.startsWith( "#" ) && !line.startsWith( ">" ) ) {
159                 line = ForesterUtil.collapseWhiteSpace( line );
160                 line = removeWhiteSpaceBeforeSemicolon( line );
161                 final String line_lc = line.toLowerCase();
162                 if ( line_lc.startsWith( begin_trees ) ) {
163                     in_trees_block = true;
164                     in_taxalabels = false;
165                     in_translate = false;
166                 }
167                 else if ( line_lc.startsWith( taxlabels ) ) {
168                     in_trees_block = false;
169                     in_taxalabels = true;
170                     in_translate = false;
171                 }
172                 else if ( line_lc.startsWith( translate ) ) {
173                     in_taxalabels = false;
174                     in_translate = true;
175                 }
176                 else if ( in_trees_block ) {
177                     //FIXME TODO need to work on this "title" and "link"
178                     if ( line_lc.startsWith( "title" ) || line_lc.startsWith( "link" ) ) {
179                         // Do nothing.
180                     }
181                     else if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
182                         in_trees_block = false;
183                         in_tree = false;
184                         in_translate = false;
185                         if ( nhx.length() > 0 ) {
186                             createPhylogeny( name, nhx, rooted_info_present, is_rooted );
187                             nhx = new StringBuffer();
188                             name = "";
189                             rooted_info_present = false;
190                             is_rooted = false;
191                         }
192                     }
193                     else if ( line_lc.startsWith( tree ) || ( line_lc.startsWith( utree ) ) ) {
194                         if ( nhx.length() > 0 ) {
195                             createPhylogeny( name, nhx, rooted_info_present, is_rooted );
196                             nhx = new StringBuffer();
197                             name = "";
198                             rooted_info_present = false;
199                             is_rooted = false;
200                         }
201                         in_tree = true;
202                         nhx.append( line.substring( line.indexOf( '=' ) ) );
203                         final Matcher name_matcher = TREE_NAME_PATTERN.matcher( line );
204                         if ( name_matcher.matches() ) {
205                             name = name_matcher.group( 1 );
206                             name = name.replaceAll( "['\"]+", "" );
207                         }
208                         final Matcher rootedness_matcher = ROOTEDNESS_PATTERN.matcher( line );
209                         if ( rootedness_matcher.matches() ) {
210                             final String s = rootedness_matcher.group( 1 );
211                             line = line.replaceAll( "\\[\\&.\\]", "" );
212                             rooted_info_present = true;
213                             if ( s.toUpperCase().equals( "R" ) ) {
214                                 is_rooted = true;
215                             }
216                         }
217                     }
218                     else if ( in_tree && !in_translate ) {
219                         nhx.append( line );
220                     }
221                     if ( !line_lc.startsWith( "title" ) && !line_lc.startsWith( "link" ) && !in_translate
222                             && !line_lc.startsWith( end ) && !line_lc.startsWith( endblock ) && line_lc.endsWith( ";" ) ) {
223                         in_tree = false;
224                         in_translate = false;
225                         createPhylogeny( name, nhx, rooted_info_present, is_rooted );
226                         nhx = new StringBuffer();
227                         name = "";
228                         rooted_info_present = false;
229                         is_rooted = false;
230                     }
231                 }
232                 if ( in_taxalabels ) {
233                     if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
234                         in_taxalabels = false;
235                     }
236                     else {
237                         final String[] labels = line.split( "\\s+" );
238                         for( String label : labels ) {
239                             if ( !label.toLowerCase().equals( taxlabels ) ) {
240                                 if ( label.endsWith( ";" ) ) {
241                                     in_taxalabels = false;
242                                     label = label.substring( 0, label.length() - 1 );
243                                 }
244                                 if ( label.length() > 0 ) {
245                                     getTaxlabels().add( label );
246                                 }
247                             }
248                         }
249                     }
250                 }
251                 if ( in_translate ) {
252                     if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
253                         in_translate = false;
254                     }
255                     else {
256                         translate_sb.append( " " );
257                         translate_sb.append( line.trim() );
258                         if ( line.endsWith( ";" ) ) {
259                             in_translate = false;
260                             setTranslateKeyValuePairs( translate_sb );
261                         }
262                     }
263                 }
264             }
265         }
266         if ( nhx.length() > 0 ) {
267             createPhylogeny( name, nhx, rooted_info_present, is_rooted );
268         }
269         return getPhylogeniesAsArray();
270     }
271
272     private void reset() {
273         setPhylogenies( new ArrayList<Phylogeny>() );
274         setTaxlabels( new ArrayList<String>() );
275         setTranslateMap( new HashMap<String, String>() );
276     }
277
278     public void setIgnoreQuotes( final boolean ignore_quotes_in_nh_data ) {
279         _ignore_quotes_in_nh_data = ignore_quotes_in_nh_data;
280     }
281
282     private void setPhylogenies( final ArrayList<Phylogeny> phylogenies ) {
283         _phylogenies = phylogenies;
284     }
285
286     public void setReplaceUnderscores( final boolean replace_underscores ) {
287         _replace_underscores = replace_underscores;
288     }
289
290     public void setSource( final Object nexus_source ) throws PhylogenyParserException, IOException {
291         if ( nexus_source == null ) {
292             throw new PhylogenyParserException( getClass() + ": attempt to parse null object." );
293         }
294         _nexus_source = nexus_source;
295     }
296
297     private void setTaxlabels( final List<String> taxlabels ) {
298         _taxlabels = taxlabels;
299     }
300
301     private void setTranslateKeyValuePairs( final StringBuffer translate_sb ) throws IOException {
302         String s = translate_sb.toString().trim();
303         if ( s.endsWith( ";" ) ) {
304             s = s.substring( 0, s.length() - 1 ).trim();
305         }
306         for( final String pair : s.split( "," ) ) {
307             final String[] kv = pair.trim().split( "\\s+" );
308             if ( ( kv.length < 2 ) || ( kv.length > 3 ) ) {
309                 throw new IOException( "ill formatted translate values: " + translate_sb );
310             }
311             if ( ( kv.length == 3 ) && !kv[ 0 ].toLowerCase().trim().equals( translate ) ) {
312                 throw new IOException( "ill formatted translate values: " + translate_sb );
313             }
314             String key = "";
315             String value = "";
316             if ( kv.length == 3 ) {
317                 key = kv[ 1 ];
318                 value = kv[ 2 ];
319             }
320             else {
321                 key = kv[ 0 ];
322                 value = kv[ 1 ];
323             }
324             if ( value.endsWith( ";" ) ) {
325                 value = value.substring( 0, value.length() - 1 );
326             }
327             getTranslateMap().put( key, value );
328         }
329     }
330
331     private void setTranslateMap( final Map<String, String> translate_map ) {
332         _translate_map = translate_map;
333     }
334
335     private static String removeWhiteSpaceBeforeSemicolon( final String s ) {
336         return s.replaceAll( "\\s+;", ";" );
337     }
338 }