forester/java/src/org/forester/io/parsers/nexus/NexusPhylogeniesParser.java

   1 // $Id:
   2 // FORESTER -- software libraries and applications
   3 // for evolutionary biology research and applications.
   4 //
   5 // Copyright (C) 2008-2009 Christian M. Zmasek
   6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
   7 // All rights reserved
   8 //
   9 // This library is free software; you can redistribute it and/or
  10 // modify it under the terms of the GNU Lesser General Public
  11 // License as published by the Free Software Foundation; either
  12 // version 2.1 of the License, or (at your option) any later version.
  13 //
  14 // This library is distributed in the hope that it will be useful,
  15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17 // Lesser General Public License for more details.
  18 //
  19 // You should have received a copy of the GNU Lesser General Public
  20 // License along with this library; if not, write to the Free Software
  21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
  22 //
  23 // Contact: phylosoft @ gmail . com
  24 // WWW: www.phylosoft.org/forester
  25
  26 package org.forester.io.parsers.nexus;
  27
  28 import java.io.BufferedReader;
  29 import java.io.IOException;
  30 import java.util.ArrayList;
  31 import java.util.HashMap;
  32 import java.util.List;
  33 import java.util.Map;
  34 import java.util.regex.Matcher;
  35 import java.util.regex.Pattern;
  36
  37 import org.forester.archaeopteryx.Constants;
  38 import org.forester.io.parsers.PhylogenyParser;
  39 import org.forester.io.parsers.nhx.NHXFormatException;
  40 import org.forester.io.parsers.nhx.NHXParser;
  41 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
  42 import org.forester.io.parsers.util.ParserUtils;
  43 import org.forester.io.parsers.util.PhylogenyParserException;
  44 import org.forester.phylogeny.Phylogeny;
  45 import org.forester.phylogeny.PhylogenyNode;
  46 import org.forester.phylogeny.data.Taxonomy;
  47 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
  48 import org.forester.phylogeny.factories.PhylogenyFactory;
  49 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
  50 import org.forester.util.ForesterUtil;
  51
  52 public class NexusPhylogeniesParser implements PhylogenyParser {
  53
  54     final private static String  begin_trees               = NexusConstants.BEGIN_TREES.toLowerCase();
  55     final private static String  taxlabels                 = NexusConstants.TAXLABELS.toLowerCase();
  56     final private static String  translate                 = NexusConstants.TRANSLATE.toLowerCase();
  57     final private static String  tree                      = NexusConstants.TREE.toLowerCase();
  58     final private static String  utree                     = NexusConstants.UTREE.toLowerCase();
  59     final private static String  end                       = NexusConstants.END.toLowerCase();
  60     final private static String  endblock                  = "endblock";
  61     final private static Pattern TREE_NAME_PATTERN         = Pattern.compile( "\\s*.?Tree\\s+(.+?)\\s*=.+",
  62                                                                               Pattern.CASE_INSENSITIVE );
  63     final private static Pattern ROOTEDNESS_PATTERN        = Pattern.compile( ".+=\\s*\\[&([R|U])\\].*" );
  64     private Object               _nexus_source;
  65     private List<Phylogeny>      _phylogenies;
  66     private List<String>         _taxlabels;
  67     private Map<String, String>  _translate_map;
  68     private boolean              _replace_underscores      = NHXParser.REPLACE_UNDERSCORES_DEFAULT;
  69     private boolean              _ignore_quotes_in_nh_data = Constants.NH_PARSING_IGNORE_QUOTES_DEFAULT;
  70     private TAXONOMY_EXTRACTION  _taxonomy_extraction      = NHXParser.TAXONOMY_EXTRACTION_DEFAULT;
  71
  72     @Override
  73     public Phylogeny[] parse() throws IOException, NHXFormatException {
  74         reset();
  75         final BufferedReader reader = ParserUtils.createReader( getNexusSource() );
  76         String line;
  77         String name = "";
  78         StringBuffer nhx = new StringBuffer();
  79         final StringBuffer translate_sb = new StringBuffer();
  80         boolean in_trees_block = false;
  81         boolean in_taxalabels = false;
  82         boolean in_translate = false;
  83         boolean in_tree = false;
  84         boolean rooted_info_present = false;
  85         boolean is_rooted = false;
  86         while ( ( line = reader.readLine() ) != null ) {
  87             line = line.trim();
  88             if ( ( line.length() > 0 ) && !line.startsWith( "#" ) && !line.startsWith( ">" ) ) {
  89                 line = ForesterUtil.collapseWhiteSpace( line );
  90                 line = removeWhiteSpaceBeforeSemicolon( line );
  91                 final String line_lc = line.toLowerCase();
  92                 if ( line_lc.startsWith( begin_trees ) ) {
  93                     in_trees_block = true;
  94                     in_taxalabels = false;
  95                     in_translate = false;
  96                 }
  97                 else if ( line_lc.startsWith( taxlabels ) ) {
  98                     in_trees_block = false;
  99                     in_taxalabels = true;
 100                     in_translate = false;
 101                 }
 102                 else if ( line_lc.startsWith( translate ) ) {
 103                     in_taxalabels = false;
 104                     in_translate = true;
 105                 }
 106                 else if ( in_trees_block ) {
 107                     //FIXME TODO need to work on this "title" and "link"
 108                     if ( line_lc.startsWith( "title" ) || line_lc.startsWith( "link" ) ) {
 109                         // Do nothing.
 110                     }
 111                     else if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
 112                         in_trees_block = false;
 113                         in_tree = false;
 114                         in_translate = false;
 115                         if ( nhx.length() > 0 ) {
 116                             createPhylogeny( name, nhx, rooted_info_present, is_rooted );
 117                             nhx = new StringBuffer();
 118                             name = "";
 119                             rooted_info_present = false;
 120                             is_rooted = false;
 121                         }
 122                     }
 123                     else if ( line_lc.startsWith( tree ) || ( line_lc.startsWith( utree ) ) ) {
 124                         if ( nhx.length() > 0 ) {
 125                             createPhylogeny( name, nhx, rooted_info_present, is_rooted );
 126                             nhx = new StringBuffer();
 127                             name = "";
 128                             rooted_info_present = false;
 129                             is_rooted = false;
 130                         }
 131                         in_tree = true;
 132                         nhx.append( line.substring( line.indexOf( '=' ) ) );
 133                         final Matcher name_matcher = TREE_NAME_PATTERN.matcher( line );
 134                         if ( name_matcher.matches() ) {
 135                             name = name_matcher.group( 1 );
 136                             name = name.replaceAll( "['\"]+", "" );
 137                         }
 138                         final Matcher rootedness_matcher = ROOTEDNESS_PATTERN.matcher( line );
 139                         if ( rootedness_matcher.matches() ) {
 140                             final String s = rootedness_matcher.group( 1 );
 141                             line = line.replaceAll( "\\[\\&.\\]", "" );
 142                             rooted_info_present = true;
 143                             if ( s.toUpperCase().equals( "R" ) ) {
 144                                 is_rooted = true;
 145                             }
 146                         }
 147                     }
 148                     else if ( in_tree && !in_translate ) {
 149                         nhx.append( line );
 150                     }
 151                     if ( !line_lc.startsWith( "title" ) && !line_lc.startsWith( "link" ) && !in_translate
 152                             && !line_lc.startsWith( end ) && !line_lc.startsWith( endblock ) && line_lc.endsWith( ";" ) ) {
 153                         in_tree = false;
 154                         in_translate = false;
 155                         createPhylogeny( name, nhx, rooted_info_present, is_rooted );
 156                         nhx = new StringBuffer();
 157                         name = "";
 158                         rooted_info_present = false;
 159                         is_rooted = false;
 160                     }
 161                 }
 162                 if ( in_taxalabels ) {
 163                     if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
 164                         in_taxalabels = false;
 165                     }
 166                     else {
 167                         final String[] labels = line.split( "\\s+" );
 168                         for( String label : labels ) {
 169                             if ( !label.toLowerCase().equals( taxlabels ) ) {
 170                                 if ( label.endsWith( ";" ) ) {
 171                                     in_taxalabels = false;
 172                                     label = label.substring( 0, label.length() - 1 );
 173                                 }
 174                                 if ( label.length() > 0 ) {
 175                                     getTaxlabels().add( label );
 176                                 }
 177                             }
 178                         }
 179                     }
 180                 }
 181                 if ( in_translate ) {
 182                     if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
 183                         in_translate = false;
 184                     }
 185                     else {
 186                         translate_sb.append( " " );
 187                         translate_sb.append( line.trim() );
 188                         if ( line.endsWith( ";" ) ) {
 189                             in_translate = false;
 190                             setTranslateKeyValuePairs( translate_sb );
 191                         }
 192                     }
 193                 }
 194             }
 195         }
 196         if ( nhx.length() > 0 ) {
 197             createPhylogeny( name, nhx, rooted_info_present, is_rooted );
 198         }
 199         return getPhylogeniesAsArray();
 200     }
 201
 202     public void setIgnoreQuotes( final boolean ignore_quotes_in_nh_data ) {
 203         _ignore_quotes_in_nh_data = ignore_quotes_in_nh_data;
 204     }
 205
 206     public void setReplaceUnderscores( final boolean replace_underscores ) {
 207         _replace_underscores = replace_underscores;
 208     }
 209
 210     @Override
 211     public void setSource( final Object nexus_source ) throws PhylogenyParserException, IOException {
 212         if ( nexus_source == null ) {
 213             throw new PhylogenyParserException( getClass() + ": attempt to parse null object." );
 214         }
 215         _nexus_source = nexus_source;
 216     }
 217
 218     public void setTaxonomyExtraction( final TAXONOMY_EXTRACTION taxonomy_extraction ) {
 219         _taxonomy_extraction = taxonomy_extraction;
 220     }
 221
 222     private void createPhylogeny( final String name,
 223                                   final StringBuffer nhx,
 224                                   final boolean rooted_info_present,
 225                                   final boolean is_rooted ) throws IOException {
 226         final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
 227         final NHXParser pars = new NHXParser();
 228         pars.setTaxonomyExtraction( getTaxonomyExtraction() );
 229         pars.setReplaceUnderscores( isReplaceUnderscores() );
 230         pars.setIgnoreQuotes( isIgnoreQuotes() );
 231         if ( rooted_info_present ) {
 232             pars.setGuessRootedness( false );
 233         }
 234         final Phylogeny p = factory.create( nhx, pars )[ 0 ];
 235         p.setName( name );
 236         if ( rooted_info_present ) {
 237             p.setRooted( is_rooted );
 238         }
 239         if ( ( getTaxlabels().size() > 0 ) || ( getTranslateMap().size() > 0 ) ) {
 240             final PhylogenyNodeIterator it = p.iteratorExternalForward();
 241             while ( it.hasNext() ) {
 242                 final PhylogenyNode node = it.next();
 243                 if ( ( getTranslateMap().size() > 0 ) && getTranslateMap().containsKey( node.getName() ) ) {
 244                     node.setName( getTranslateMap().get( node.getName() ).replaceAll( "['\"]+", "" ) );
 245                 }
 246                 else if ( getTaxlabels().size() > 0 ) {
 247                     int i = -1;
 248                     try {
 249                         i = Integer.parseInt( node.getName() );
 250                     }
 251                     catch ( final NumberFormatException e ) {
 252                         // Ignore.
 253                     }
 254                     if ( i > 0 ) {
 255                         node.setName( getTaxlabels().get( i - 1 ).replaceAll( "['\"]+", "" ) );
 256                     }
 257                 }
 258                 if ( !isReplaceUnderscores() && ( ( getTaxonomyExtraction() != TAXONOMY_EXTRACTION.NO ) ) ) {
 259                     final String tax = ParserUtils.extractTaxonomyCodeFromNodeName( node.getName(),
 260                                                                                     getTaxonomyExtraction() );
 261                     if ( !ForesterUtil.isEmpty( tax ) ) {
 262                         if ( !node.getNodeData().isHasTaxonomy() ) {
 263                             node.getNodeData().setTaxonomy( new Taxonomy() );
 264                         }
 265                         node.getNodeData().getTaxonomy().setTaxonomyCode( tax );
 266                     }
 267                 }
 268             }
 269         }
 270         getPhylogenies().add( p );
 271     }
 272
 273     private Object getNexusSource() {
 274         return _nexus_source;
 275     }
 276
 277     private List<Phylogeny> getPhylogenies() {
 278         return _phylogenies;
 279     }
 280
 281     private Phylogeny[] getPhylogeniesAsArray() {
 282         final Phylogeny[] p = new Phylogeny[ getPhylogenies().size() ];
 283         for( int i = 0; i < getPhylogenies().size(); ++i ) {
 284             p[ i ] = getPhylogenies().get( i );
 285         }
 286         return p;
 287     }
 288
 289     private List<String> getTaxlabels() {
 290         return _taxlabels;
 291     }
 292
 293     private TAXONOMY_EXTRACTION getTaxonomyExtraction() {
 294         return _taxonomy_extraction;
 295     }
 296
 297     private Map<String, String> getTranslateMap() {
 298         return _translate_map;
 299     }
 300
 301     private boolean isIgnoreQuotes() {
 302         return _ignore_quotes_in_nh_data;
 303     }
 304
 305     private boolean isReplaceUnderscores() {
 306         return _replace_underscores;
 307     }
 308
 309     private void reset() {
 310         setPhylogenies( new ArrayList<Phylogeny>() );
 311         setTaxlabels( new ArrayList<String>() );
 312         setTranslateMap( new HashMap<String, String>() );
 313     }
 314
 315     private void setPhylogenies( final ArrayList<Phylogeny> phylogenies ) {
 316         _phylogenies = phylogenies;
 317     }
 318
 319     private void setTaxlabels( final List<String> taxlabels ) {
 320         _taxlabels = taxlabels;
 321     }
 322
 323     private void setTranslateKeyValuePairs( final StringBuffer translate_sb ) throws IOException {
 324         String s = translate_sb.toString().trim();
 325         if ( s.endsWith( ";" ) ) {
 326             s = s.substring( 0, s.length() - 1 ).trim();
 327         }
 328         for( final String pair : s.split( "," ) ) {
 329             final String[] kv = pair.trim().split( "\\s+" );
 330             if ( ( kv.length < 2 ) || ( kv.length > 3 ) ) {
 331                 throw new IOException( "ill formatted translate values: " + translate_sb );
 332             }
 333             if ( ( kv.length == 3 ) && !kv[ 0 ].toLowerCase().trim().equals( translate ) ) {
 334                 throw new IOException( "ill formatted translate values: " + translate_sb );
 335             }
 336             String key = "";
 337             String value = "";
 338             if ( kv.length == 3 ) {
 339                 key = kv[ 1 ];
 340                 value = kv[ 2 ];
 341             }
 342             else {
 343                 key = kv[ 0 ];
 344                 value = kv[ 1 ];
 345             }
 346             if ( value.endsWith( ";" ) ) {
 347                 value = value.substring( 0, value.length() - 1 );
 348             }
 349             getTranslateMap().put( key, value );
 350         }
 351     }
 352
 353     private void setTranslateMap( final Map<String, String> translate_map ) {
 354         _translate_map = translate_map;
 355     }
 356
 357     private static String removeWhiteSpaceBeforeSemicolon( final String s ) {
 358         return s.replaceAll( "\\s+;", ";" );
 359     }
 360 }