mb parsing
[jalview.git] / forester / java / src / org / forester / io / parsers / nexus / NexusPhylogeniesParser.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // All rights reserved
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 //
23 // Contact: phylosoft @ gmail . com
24 // WWW: www.phylosoft.org/forester
25
26 package org.forester.io.parsers.nexus;
27
28 import java.io.BufferedReader;
29 import java.io.IOException;
30 import java.util.ArrayList;
31 import java.util.HashMap;
32 import java.util.List;
33 import java.util.Map;
34 import java.util.regex.Matcher;
35 import java.util.regex.Pattern;
36
37 import org.forester.archaeopteryx.Constants;
38 import org.forester.io.parsers.PhylogenyParser;
39 import org.forester.io.parsers.nhx.NHXFormatException;
40 import org.forester.io.parsers.nhx.NHXParser;
41 import org.forester.io.parsers.util.ParserUtils;
42 import org.forester.io.parsers.util.PhylogenyParserException;
43 import org.forester.phylogeny.Phylogeny;
44 import org.forester.phylogeny.PhylogenyMethods;
45 import org.forester.phylogeny.PhylogenyNode;
46 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
47 import org.forester.phylogeny.factories.PhylogenyFactory;
48 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
49 import org.forester.util.ForesterUtil;
50
51 public class NexusPhylogeniesParser implements PhylogenyParser {
52
53     final private static String  begin_trees               = NexusConstants.BEGIN_TREES.toLowerCase();
54     final private static String  taxlabels                 = NexusConstants.TAXLABELS.toLowerCase();
55     final private static String  translate                 = NexusConstants.TRANSLATE.toLowerCase();
56     final private static String  tree                      = NexusConstants.TREE.toLowerCase();
57     final private static String  utree                     = NexusConstants.UTREE.toLowerCase();
58     final private static String  end                       = NexusConstants.END.toLowerCase();
59     final private static String  endblock                  = "endblock";
60     final private static Pattern TREE_NAME_PATTERN         = Pattern.compile( "\\s*.?Tree\\s+(.+?)\\s*=.+",
61                                                                               Pattern.CASE_INSENSITIVE );
62     final private static Pattern ROOTEDNESS_PATTERN        = Pattern.compile( ".+=\\s*\\[&([R|U])\\].*" );
63     private Object               _nexus_source;
64     private List<Phylogeny>      _phylogenies;
65     private List<String>         _taxlabels;
66     private Map<String, String>  _translate_map;
67     private boolean              _replace_underscores      = NHXParser.REPLACE_UNDERSCORES_DEFAULT;
68     private boolean              _ignore_quotes_in_nh_data = Constants.NH_PARSING_IGNORE_QUOTES_DEFAULT;
69
70     private void createPhylogeny( final String name,
71                                   final StringBuffer nhx,
72                                   final boolean rooted_info_present,
73                                   final boolean is_rooted ) throws IOException {
74         final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
75         final NHXParser pars = new NHXParser();
76         pars.setTaxonomyExtraction( PhylogenyMethods.TAXONOMY_EXTRACTION.NO );
77         pars.setReplaceUnderscores( isReplaceUnderscores() );
78         pars.setIgnoreQuotes( isIgnoreQuotes() );
79         if ( rooted_info_present ) {
80             pars.setGuessRootedness( false );
81         }
82         final Phylogeny p = factory.create( nhx, pars )[ 0 ];
83         p.setName( name );
84         if ( rooted_info_present ) {
85             p.setRooted( is_rooted );
86         }
87         if ( ( getTaxlabels().size() > 0 ) || ( getTranslateMap().size() > 0 ) ) {
88             final PhylogenyNodeIterator it = p.iteratorExternalForward();
89             while ( it.hasNext() ) {
90                 final PhylogenyNode node = it.next();
91                 if ( ( getTranslateMap().size() > 0 ) && getTranslateMap().containsKey( node.getName() ) ) {
92                     node.setName( getTranslateMap().get( node.getName() ).replaceAll( "['\"]+", "" ) );
93                 }
94                 else if ( getTaxlabels().size() > 0 ) {
95                     int i = -1;
96                     try {
97                         i = Integer.parseInt( node.getName() );
98                     }
99                     catch ( final NumberFormatException e ) {
100                         // Ignore.
101                     }
102                     if ( i > 0 ) {
103                         node.setName( getTaxlabels().get( i - 1 ).replaceAll( "['\"]+", "" ) );
104                     }
105                 }
106             }
107         }
108         getPhylogenies().add( p );
109     }
110
111     private Object getNexusSource() {
112         return _nexus_source;
113     }
114
115     private List<Phylogeny> getPhylogenies() {
116         return _phylogenies;
117     }
118
119     private Phylogeny[] getPhylogeniesAsArray() {
120         final Phylogeny[] p = new Phylogeny[ getPhylogenies().size() ];
121         for( int i = 0; i < getPhylogenies().size(); ++i ) {
122             p[ i ] = getPhylogenies().get( i );
123         }
124         return p;
125     }
126
127     private List<String> getTaxlabels() {
128         return _taxlabels;
129     }
130
131     private Map<String, String> getTranslateMap() {
132         return _translate_map;
133     }
134
135     private boolean isIgnoreQuotes() {
136         return _ignore_quotes_in_nh_data;
137     }
138
139     private boolean isReplaceUnderscores() {
140         return _replace_underscores;
141     }
142
143     @Override
144     public Phylogeny[] parse() throws IOException, NHXFormatException {
145         reset();
146         final BufferedReader reader = ParserUtils.createReader( getNexusSource() );
147         String line;
148         String name = "";
149         StringBuffer nhx = new StringBuffer();
150         final StringBuffer translate_sb = new StringBuffer();
151         boolean in_trees_block = false;
152         boolean in_taxalabels = false;
153         boolean in_translate = false;
154         final boolean in_comment = false;
155         boolean in_tree = false;
156         boolean rooted_info_present = false;
157         boolean is_rooted = false;
158         while ( ( line = reader.readLine() ) != null ) {
159             line = line.trim();
160             if ( ( line.length() > 0 ) && !line.startsWith( "#" ) && !line.startsWith( ">" ) ) {
161                 line = ForesterUtil.collapseWhiteSpace( line );
162                 line = removeWhiteSpaceBeforeSemicolon( line );
163                 final String line_lc = line.toLowerCase();
164                 if ( line_lc.startsWith( begin_trees ) ) {
165                     in_trees_block = true;
166                     in_taxalabels = false;
167                     in_translate = false;
168                 }
169                 else if ( line_lc.startsWith( taxlabels ) ) {
170                     in_trees_block = false;
171                     in_taxalabels = true;
172                     in_translate = false;
173                 }
174                 else if ( line_lc.startsWith( translate ) ) {
175                     in_taxalabels = false;
176                     in_translate = true;
177                 }
178                 else if ( in_trees_block ) {
179                     //FIXME TODO need to work on this "title" and "link"
180                     if ( line_lc.startsWith( "title" ) || line_lc.startsWith( "link" ) ) {
181                         // Do nothing.
182                     }
183                     else if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
184                         in_trees_block = false;
185                         in_tree = false;
186                         in_translate = false;
187                         if ( nhx.length() > 0 ) {
188                             createPhylogeny( name, nhx, rooted_info_present, is_rooted );
189                             nhx = new StringBuffer();
190                             name = "";
191                             rooted_info_present = false;
192                             is_rooted = false;
193                         }
194                     }
195                     else if ( line_lc.startsWith( tree ) || ( line_lc.startsWith( utree ) ) ) {
196                         if ( nhx.length() > 0 ) {
197                             createPhylogeny( name, nhx, rooted_info_present, is_rooted );
198                             nhx = new StringBuffer();
199                             name = "";
200                             rooted_info_present = false;
201                             is_rooted = false;
202                         }
203                         in_tree = true;
204                         nhx.append( line.substring( line.indexOf( '=' ) ) );
205                         final Matcher name_matcher = TREE_NAME_PATTERN.matcher( line );
206                         if ( name_matcher.matches() ) {
207                             name = name_matcher.group( 1 );
208                             name = name.replaceAll( "['\"]+", "" );
209                         }
210                         final Matcher rootedness_matcher = ROOTEDNESS_PATTERN.matcher( line );
211                         if ( rootedness_matcher.matches() ) {
212                             final String s = rootedness_matcher.group( 1 );
213                             line = line.replaceAll( "\\[\\&.\\]", "" );
214                             rooted_info_present = true;
215                             if ( s.toUpperCase().equals( "R" ) ) {
216                                 is_rooted = true;
217                             }
218                         }
219                     }
220                     else if ( in_tree && !in_translate ) {
221                         nhx.append( line );
222                     }
223                     if ( !line_lc.startsWith( "title" ) && !line_lc.startsWith( "link" ) && !in_translate
224                             && !line_lc.startsWith( end ) && !line_lc.startsWith( endblock ) && line_lc.endsWith( ";" ) ) {
225                         in_tree = false;
226                         in_translate = false;
227                         createPhylogeny( name, nhx, rooted_info_present, is_rooted );
228                         nhx = new StringBuffer();
229                         name = "";
230                         rooted_info_present = false;
231                         is_rooted = false;
232                     }
233                 }
234                 if ( in_taxalabels ) {
235                     if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
236                         in_taxalabels = false;
237                     }
238                     else {
239                         final String[] labels = line.split( "\\s+" );
240                         for( String label : labels ) {
241                             if ( !label.toLowerCase().equals( taxlabels ) ) {
242                                 if ( label.endsWith( ";" ) ) {
243                                     in_taxalabels = false;
244                                     label = label.substring( 0, label.length() - 1 );
245                                 }
246                                 if ( label.length() > 0 ) {
247                                     getTaxlabels().add( label );
248                                 }
249                             }
250                         }
251                     }
252                 }
253                 if ( in_translate ) {
254                     if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
255                         in_translate = false;
256                     }
257                     else {
258                         translate_sb.append( " " );
259                         translate_sb.append( line.trim() );
260                         if ( line.endsWith( ";" ) ) {
261                             in_translate = false;
262                             setTranslateKeyValuePairs( translate_sb );
263                         }
264                     }
265                 }
266             }
267         }
268         if ( nhx.length() > 0 ) {
269             createPhylogeny( name, nhx, rooted_info_present, is_rooted );
270         }
271         return getPhylogeniesAsArray();
272     }
273
274     private void reset() {
275         setPhylogenies( new ArrayList<Phylogeny>() );
276         setTaxlabels( new ArrayList<String>() );
277         setTranslateMap( new HashMap<String, String>() );
278     }
279
280     public void setIgnoreQuotes( final boolean ignore_quotes_in_nh_data ) {
281         _ignore_quotes_in_nh_data = ignore_quotes_in_nh_data;
282     }
283
284     private void setPhylogenies( final ArrayList<Phylogeny> phylogenies ) {
285         _phylogenies = phylogenies;
286     }
287
288     public void setReplaceUnderscores( final boolean replace_underscores ) {
289         _replace_underscores = replace_underscores;
290     }
291
292     @Override
293     public void setSource( final Object nexus_source ) throws PhylogenyParserException, IOException {
294         if ( nexus_source == null ) {
295             throw new PhylogenyParserException( getClass() + ": attempt to parse null object." );
296         }
297         _nexus_source = nexus_source;
298     }
299
300     private void setTaxlabels( final List<String> taxlabels ) {
301         _taxlabels = taxlabels;
302     }
303
304     private void setTranslateKeyValuePairs( final StringBuffer translate_sb ) throws IOException {
305         String s = translate_sb.toString().trim();
306         if ( s.endsWith( ";" ) ) {
307             s = s.substring( 0, s.length() - 1 ).trim();
308         }
309         for( final String pair : s.split( "," ) ) {
310             final String[] kv = pair.trim().split( "\\s+" );
311             if ( ( kv.length < 2 ) || ( kv.length > 3 ) ) {
312                 throw new IOException( "ill formatted translate values: " + translate_sb );
313             }
314             if ( ( kv.length == 3 ) && !kv[ 0 ].toLowerCase().trim().equals( translate ) ) {
315                 throw new IOException( "ill formatted translate values: " + translate_sb );
316             }
317             String key = "";
318             String value = "";
319             if ( kv.length == 3 ) {
320                 key = kv[ 1 ];
321                 value = kv[ 2 ];
322             }
323             else {
324                 key = kv[ 0 ];
325                 value = kv[ 1 ];
326             }
327             if ( value.endsWith( ";" ) ) {
328                 value = value.substring( 0, value.length() - 1 );
329             }
330             getTranslateMap().put( key, value );
331         }
332     }
333
334     private void setTranslateMap( final Map<String, String> translate_map ) {
335         _translate_map = translate_map;
336     }
337
338     private static String removeWhiteSpaceBeforeSemicolon( final String s ) {
339         return s.replaceAll( "\\s+;", ";" );
340     }
341 }