NexusPhylogeniesParser.java

   1 // $Id:
   2 // FORESTER -- software libraries and applications
   3 // for evolutionary biology research and applications.
   4 //
   5 // Copyright (C) 2008-2009 Christian M. Zmasek
   6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
   7 // All rights reserved
   8 //
   9 // This library is free software; you can redistribute it and/or
  10 // modify it under the terms of the GNU Lesser General Public
  11 // License as published by the Free Software Foundation; either
  12 // version 2.1 of the License, or (at your option) any later version.
  13 //
  14 // This library is distributed in the hope that it will be useful,
  15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17 // Lesser General Public License for more details.
  18 //
  19 // You should have received a copy of the GNU Lesser General Public
  20 // License along with this library; if not, write to the Free Software
  21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
  22 //
  23 // Contact: phylosoft @ gmail . com
  24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
  25
  26 package org.forester.io.parsers.nexus;
  27
  28 import java.io.BufferedReader;
  29 import java.io.FileNotFoundException;
  30 import java.io.IOException;
  31 import java.util.ArrayList;
  32 import java.util.HashMap;
  33 import java.util.List;
  34 import java.util.Map;
  35 import java.util.regex.Matcher;
  36 import java.util.regex.Pattern;
  37
  38 import org.forester.archaeopteryx.Constants;
  39 import org.forester.io.parsers.IteratingPhylogenyParser;
  40 import org.forester.io.parsers.PhylogenyParser;
  41 import org.forester.io.parsers.nhx.NHXFormatException;
  42 import org.forester.io.parsers.nhx.NHXParser;
  43 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
  44 import org.forester.io.parsers.util.ParserUtils;
  45 import org.forester.io.parsers.util.PhylogenyParserException;
  46 import org.forester.phylogeny.Phylogeny;
  47 import org.forester.phylogeny.PhylogenyNode;
  48 import org.forester.phylogeny.data.Sequence;
  49 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
  50 import org.forester.sequence.BasicSequence;
  51 import org.forester.sequence.MolecularSequence;
  52 import org.forester.util.ForesterUtil;
  53
  54 public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, PhylogenyParser {
  55
  56     final private static String            begin_trees               = NexusConstants.BEGIN_TREES.toLowerCase();
  57     final private static String            end                       = NexusConstants.END.toLowerCase();
  58     final private static String            endblock                  = "endblock";
  59     final private static Pattern           ROOTEDNESS_PATTERN        = Pattern.compile( ".+=\\s*\\[&([R|U])\\].*" );
  60     final private static String            taxlabels                 = NexusConstants.TAXLABELS.toLowerCase();
  61     final private static Pattern           TITLE_PATTERN             = Pattern.compile( "TITLE.?\\s+([^;]+)",
  62                                                                                         Pattern.CASE_INSENSITIVE );
  63     final private static String            translate                 = NexusConstants.TRANSLATE.toLowerCase();
  64     final private static String            data                      = NexusConstants.BEGIN_CHARACTERS.toLowerCase();
  65     final private static String            characters                = NexusConstants.BEGIN_DATA.toLowerCase();
  66     final private static String            tree                      = NexusConstants.TREE.toLowerCase();
  67     final private static Pattern           TREE_NAME_PATTERN         = Pattern.compile( "\\s*.?Tree\\s+(.+?)\\s*=.+",
  68                                                                                         Pattern.CASE_INSENSITIVE );
  69     final private static Pattern           TRANSLATE_PATTERN         = Pattern.compile( "([0-9A-Za-z]+)\\s+(.+)" );
  70     final private static Pattern           ALN_PATTERN               = Pattern.compile( "(.+)\\s+([A-Za-z-_\\*\\?]+)" );
  71     final private static Pattern           DATATYPE_PATTERN          = Pattern.compile( "datatype\\s?.\\s?([a-z]+)" );
  72     final private static Pattern           LINK_TAXA_PATTERN         = Pattern.compile( "link\\s+taxa\\s?.\\s?([^;]+)",
  73                                                                                         Pattern.CASE_INSENSITIVE );
  74     final private static String            utree                     = NexusConstants.UTREE.toLowerCase();
  75     private BufferedReader                 _br;
  76     private boolean                        _ignore_quotes_in_nh_data = Constants.NH_PARSING_IGNORE_QUOTES_DEFAULT;
  77     private boolean                        _in_taxalabels;
  78     private boolean                        _in_translate;
  79     private boolean                        _in_tree;
  80     private boolean                        _in_trees_block;
  81     private boolean                        _in_data_block;
  82     private boolean                        _is_rooted;
  83     private String                         _datatype;
  84     private String                         _name;
  85     private Phylogeny                      _next;
  86     private Object                         _nexus_source;
  87     private StringBuilder                  _nh;
  88     private boolean                        _replace_underscores      = NHXParser.REPLACE_UNDERSCORES_DEFAULT;
  89     private boolean                        _rooted_info_present;
  90     private List<String>                   _taxlabels;
  91     private TAXONOMY_EXTRACTION            _taxonomy_extraction      = TAXONOMY_EXTRACTION.NO;
  92     private String                         _title;
  93     private Map<String, String>            _translate_map;
  94     private StringBuilder                  _translate_sb;
  95     private Map<String, MolecularSequence> _seqs;
  96     private final boolean                  _add_sequences            = true;
  97
  98     @Override
  99     public String getName() {
 100         return "Nexus Phylogenies Parser";
 101     }
 102
 103     @Override
 104     public final boolean hasNext() {
 105         return _next != null;
 106     }
 107
 108     @Override
 109     public final Phylogeny next() throws NHXFormatException, IOException {
 110         final Phylogeny phy = _next;
 111         getNext();
 112         return phy;
 113     }
 114
 115     @Override
 116     public final Phylogeny[] parse() throws IOException {
 117         final List<Phylogeny> l = new ArrayList<Phylogeny>();
 118         while ( hasNext() ) {
 119             l.add( next() );
 120         }
 121         final Phylogeny[] p = new Phylogeny[ l.size() ];
 122         for( int i = 0; i < l.size(); ++i ) {
 123             p[ i ] = l.get( i );
 124         }
 125         reset();
 126         return p;
 127     }
 128
 129     @Override
 130     public final void reset() throws FileNotFoundException, IOException {
 131         _taxlabels = new ArrayList<String>();
 132         _translate_map = new HashMap<String, String>();
 133         _nh = new StringBuilder();
 134         _name = "";
 135         _title = "";
 136         _translate_sb = null;
 137         _next = null;
 138         _in_trees_block = false;
 139         _in_taxalabels = false;
 140         _in_translate = false;
 141         _in_tree = false;
 142         _rooted_info_present = false;
 143         _is_rooted = false;
 144         _seqs = new HashMap<String, MolecularSequence>();
 145         _br = ParserUtils.createReader( _nexus_source );
 146         getNext();
 147     }
 148
 149     public final void setIgnoreQuotes( final boolean ignore_quotes_in_nh_data ) {
 150         _ignore_quotes_in_nh_data = ignore_quotes_in_nh_data;
 151     }
 152
 153     public final void setReplaceUnderscores( final boolean replace_underscores ) {
 154         _replace_underscores = replace_underscores;
 155     }
 156
 157     @Override
 158     public final void setSource( final Object nexus_source ) throws PhylogenyParserException, IOException {
 159         if ( nexus_source == null ) {
 160             throw new PhylogenyParserException( "attempt to parse null object" );
 161         }
 162         _nexus_source = nexus_source;
 163         reset();
 164     }
 165
 166     public final void setTaxonomyExtraction( final TAXONOMY_EXTRACTION taxonomy_extraction ) {
 167         _taxonomy_extraction = taxonomy_extraction;
 168     }
 169
 170     private final void createPhylogeny( final String title,
 171                                         final String name,
 172                                         final StringBuilder nhx,
 173                                         final boolean rooted_info_present,
 174                                         final boolean is_rooted ) throws IOException {
 175         _next = null;
 176         final NHXParser pars = new NHXParser();
 177         pars.setTaxonomyExtraction( _taxonomy_extraction );
 178         pars.setReplaceUnderscores( _replace_underscores );
 179         pars.setIgnoreQuotes( _ignore_quotes_in_nh_data );
 180         if ( rooted_info_present ) {
 181             pars.setGuessRootedness( false );
 182         }
 183         pars.setSource( nhx );
 184         final Phylogeny p = pars.next();
 185         if ( p == null ) {
 186             throw new PhylogenyParserException( "failed to create phylogeny" );
 187         }
 188         String myname = null;
 189         if ( !ForesterUtil.isEmpty( title ) && !ForesterUtil.isEmpty( name ) ) {
 190             myname = title.replace( '_', ' ' ).trim() + " (" + name.trim() + ")";
 191         }
 192         else if ( !ForesterUtil.isEmpty( title ) ) {
 193             myname = title.replace( '_', ' ' ).trim();
 194         }
 195         else if ( !ForesterUtil.isEmpty( name ) ) {
 196             myname = name.trim();
 197         }
 198         if ( !ForesterUtil.isEmpty( myname ) ) {
 199             p.setName( myname );
 200         }
 201         if ( rooted_info_present ) {
 202             p.setRooted( is_rooted );
 203         }
 204         if ( ( _taxlabels.size() > 0 ) || ( _translate_map.size() > 0 ) ) {
 205             final PhylogenyNodeIterator it = p.iteratorExternalForward();
 206             while ( it.hasNext() ) {
 207                 final PhylogenyNode node = it.next();
 208                 if ( ( _translate_map.size() > 0 ) && _translate_map.containsKey( node.getName() ) ) {
 209                     node.setName( _translate_map.get( node.getName() ).replaceAll( "['\"]+", "" ) );
 210                 }
 211                 else if ( _taxlabels.size() > 0 ) {
 212                     int i = -1;
 213                     try {
 214                         i = Integer.parseInt( node.getName() );
 215                     }
 216                     catch ( final NumberFormatException e ) {
 217                         // Ignore.
 218                     }
 219                     if ( i > 0 ) {
 220                         node.setName( _taxlabels.get( i - 1 ).replaceAll( "['\"]+", "" ) );
 221                     }
 222                 }
 223                 if ( !_replace_underscores && ( ( _taxonomy_extraction != TAXONOMY_EXTRACTION.NO ) ) ) {
 224                     ParserUtils.extractTaxonomyDataFromNodeName( node, _taxonomy_extraction );
 225                 }
 226                 else if ( _replace_underscores ) {
 227                     if ( !ForesterUtil.isEmpty( node.getName() ) ) {
 228                         node.setName( node.getName().replace( '_', ' ' ).trim() );
 229                     }
 230                 }
 231                 if ( _add_sequences ) {
 232                     if ( _seqs.containsKey( node.getName() ) ) {
 233                         final MolecularSequence s = _seqs.get( node.getName() );
 234                         //TODO need to check for uniqueness when adding seqs....
 235                         final Sequence ns = new Sequence( s );
 236                         ns.setMolecularSequenceAligned( true ); //TODO need to check if all same length
 237                         node.getNodeData().addSequence( ns );
 238                     }
 239                 }
 240             }
 241         }
 242         _next = p;
 243     }
 244
 245     private final void getNext() throws IOException, NHXFormatException {
 246         _next = null;
 247         String line;
 248         while ( ( line = _br.readLine() ) != null ) {
 249             line = line.trim();
 250             if ( ( line.length() > 0 ) && !line.startsWith( "#" ) && !line.startsWith( ">" ) ) {
 251                 line = ForesterUtil.collapseWhiteSpace( line );
 252                 line = removeWhiteSpaceBeforeSemicolon( line );
 253                 final String line_lc = line.toLowerCase();
 254                 if ( line_lc.startsWith( begin_trees ) ) {
 255                     _in_trees_block = true;
 256                     _in_taxalabels = false;
 257                     _in_translate = false;
 258                     _in_data_block = false;
 259                     _datatype = null;
 260                     _title = "";
 261                 }
 262                 else if ( line_lc.startsWith( taxlabels ) ) {
 263                     //TODO need to be taxa block instead
 264                     _in_trees_block = false;
 265                     _in_taxalabels = true;
 266                     _in_translate = false;
 267                     _in_data_block = false;
 268                     _datatype = null;
 269                 }
 270                 else if ( line_lc.startsWith( translate ) ) {
 271                     _translate_sb = new StringBuilder();
 272                     _in_taxalabels = false;
 273                     _in_translate = true;
 274                     _in_data_block = false;
 275                     _datatype = null;
 276                 }
 277                 else if ( line_lc.startsWith( characters ) || line_lc.startsWith( data ) ) {
 278                     _in_taxalabels = false;
 279                     _in_trees_block = false;
 280                     _in_translate = false;
 281                     _in_data_block = true;
 282                     _datatype = null;
 283                 }
 284                 else if ( _in_trees_block ) {
 285                     if ( line_lc.startsWith( "title" ) ) {
 286                         final Matcher title_m = TITLE_PATTERN.matcher( line );
 287                         if ( title_m.lookingAt() ) {
 288                             _title = title_m.group( 1 );
 289                         }
 290                     }
 291                     else if ( line_lc.startsWith( "link" ) ) {
 292                         final Matcher link_m = LINK_TAXA_PATTERN.matcher( line );
 293                         if ( link_m.lookingAt() ) {
 294                             final String link = link_m.group( 1 );
 295                             //System.out.println( "link taxa:" + link );
 296                         }
 297                     }
 298                     else if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
 299                         _in_trees_block = false;
 300                         _in_tree = false;
 301                         _in_translate = false;
 302                         if ( _nh.length() > 0 ) {
 303                             createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
 304                             _nh = new StringBuilder();
 305                             _name = "";
 306                             _rooted_info_present = false;
 307                             _is_rooted = false;
 308                             if ( _next != null ) {
 309                                 return;
 310                             }
 311                         }
 312                     }
 313                     else if ( line_lc.startsWith( tree ) || ( line_lc.startsWith( utree ) ) ) {
 314                         boolean might = false;
 315                         if ( _nh.length() > 0 ) {
 316                             might = true;
 317                             createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
 318                             _nh = new StringBuilder();
 319                             _name = "";
 320                             _rooted_info_present = false;
 321                             _is_rooted = false;
 322                         }
 323                         _in_tree = true;
 324                         _nh.append( line.substring( line.indexOf( '=' ) ) );
 325                         final Matcher name_matcher = TREE_NAME_PATTERN.matcher( line );
 326                         if ( name_matcher.matches() ) {
 327                             _name = name_matcher.group( 1 );
 328                             _name = _name.replaceAll( "['\"]+", "" );
 329                         }
 330                         final Matcher rootedness_matcher = ROOTEDNESS_PATTERN.matcher( line );
 331                         if ( rootedness_matcher.matches() ) {
 332                             final String s = rootedness_matcher.group( 1 );
 333                             line = line.replaceAll( "\\[\\&.\\]", "" );
 334                             _rooted_info_present = true;
 335                             if ( s.toUpperCase().equals( "R" ) ) {
 336                                 _is_rooted = true;
 337                             }
 338                         }
 339                         if ( might && ( _next != null ) ) {
 340                             return;
 341                         }
 342                     }
 343                     else if ( _in_tree && !_in_translate ) {
 344                         _nh.append( line );
 345                     }
 346                     if ( !line_lc.startsWith( "title" ) && !line_lc.startsWith( "link" ) && !_in_translate
 347                             && !line_lc.startsWith( end ) && !line_lc.startsWith( endblock ) && line_lc.endsWith( ";" ) ) {
 348                         _in_tree = false;
 349                         _in_translate = false;
 350                         createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
 351                         _nh = new StringBuilder();
 352                         _name = "";
 353                         _rooted_info_present = false;
 354                         _is_rooted = false;
 355                         if ( _next != null ) {
 356                             return;
 357                         }
 358                     }
 359                 }
 360                 if ( _in_taxalabels ) {
 361                     if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
 362                         _in_taxalabels = false;
 363                     }
 364                     else {
 365                         final String[] labels = line.split( "\\s+" );
 366                         for( String label : labels ) {
 367                             if ( !label.toLowerCase().equals( taxlabels ) ) {
 368                                 if ( label.endsWith( ";" ) ) {
 369                                     _in_taxalabels = false;
 370                                     label = label.substring( 0, label.length() - 1 );
 371                                 }
 372                                 if ( label.length() > 0 ) {
 373                                     _taxlabels.add( label );
 374                                 }
 375                             }
 376                         }
 377                     }
 378                 }
 379                 if ( _in_translate ) {
 380                     if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
 381                         _in_translate = false;
 382                     }
 383                     else {
 384                         _translate_sb.append( " " );
 385                         _translate_sb.append( line.trim() );
 386                         if ( line.endsWith( ";" ) ) {
 387                             _in_translate = false;
 388                             setTranslateKeyValuePairs( _translate_sb );
 389                         }
 390                     }
 391                 }
 392                 if ( _in_data_block ) {
 393                     if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
 394                         _in_data_block = false;
 395                         _datatype = null;
 396                     }
 397                     else if ( line_lc.startsWith( "link" ) ) {
 398                         final Matcher link_m = LINK_TAXA_PATTERN.matcher( line );
 399                         if ( link_m.lookingAt() ) {
 400                             final String link = link_m.group( 1 );
 401                             //System.out.println( "link taxa:" + link );
 402                         }
 403                     }
 404                     else {
 405                         final Matcher datatype_matcher = DATATYPE_PATTERN.matcher( line_lc );
 406                         if ( datatype_matcher.find() ) {
 407                             _datatype = datatype_matcher.group( 1 );
 408                             //System.out.println( _datatype );
 409                         }
 410                         else {
 411                             if ( ( _datatype != null )
 412                                     && ( _datatype.equals( "protein" ) || _datatype.equals( "dna" ) || _datatype
 413                                             .equals( "rna" ) ) ) {
 414                                 if ( line.endsWith( ";" ) ) {
 415                                     _in_data_block = false;
 416                                     line = line.substring( 0, line.length() - 1 );
 417                                 }
 418                                 final Matcher aln_matcher = ALN_PATTERN.matcher( line );
 419                                 if ( aln_matcher.matches() ) {
 420                                     final String id = aln_matcher.group( 1 );
 421                                     final String seq = aln_matcher.group( 2 );
 422                                     MolecularSequence s = null;
 423                                     if ( _datatype.equals( "protein" ) ) {
 424                                         s = BasicSequence.createAaSequence( id, seq );
 425                                     }
 426                                     else if ( _datatype.equals( "dna" ) ) {
 427                                         s = BasicSequence.createDnaSequence( id, seq );
 428                                     }
 429                                     else {
 430                                         s = BasicSequence.createRnaSequence( id, seq );
 431                                     }
 432                                     _seqs.put( id, s );
 433                                     //System.out.println( s );
 434                                 }
 435                             }
 436                         }
 437                     }
 438                 }
 439             }
 440         }
 441         if ( _nh.length() > 0 ) {
 442             createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
 443             if ( _next != null ) {
 444                 return;
 445             }
 446         }
 447     }
 448
 449     private final void setTranslateKeyValuePairs( final StringBuilder translate_sb ) throws IOException {
 450         String s = translate_sb.toString().trim();
 451         if ( s.endsWith( ";" ) ) {
 452             s = s.substring( 0, s.length() - 1 ).trim();
 453         }
 454         for( String pair : s.split( "," ) ) {
 455             String key = "";
 456             String value = "";
 457             final int ti = pair.toLowerCase().indexOf( "translate" );
 458             if ( ti > -1 ) {
 459                 pair = pair.substring( ti + 9 );
 460             }
 461             final Matcher m = TRANSLATE_PATTERN.matcher( pair );
 462             if ( m.find() ) {
 463                 key = m.group( 1 );
 464                 value = m.group( 2 ).replaceAll( "\'", "" ).replaceAll( "\"", "" ).trim();
 465             }
 466             else {
 467                 throw new IOException( "ill-formatted translate values: " + pair );
 468             }
 469             if ( value.endsWith( ";" ) ) {
 470                 value = value.substring( 0, value.length() - 1 );
 471             }
 472             _translate_map.put( key, value );
 473         }
 474     }
 475
 476     private final static String removeWhiteSpaceBeforeSemicolon( final String s ) {
 477         return s.replaceAll( "\\s+;", ";" );
 478     }
 479 }