Fixed issue with reading from TreeBase
[jalview.git] / forester / java / src / org / forester / io / parsers / nexus / NexusPhylogeniesParser.java
index 4a25f4d..0f51cd7 100644 (file)
@@ -35,7 +35,7 @@ import java.util.Map;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
-import org.forester.archaeopteryx.Constants;
+import org.forester.archaeopteryx.AptxConstants;
 import org.forester.io.parsers.IteratingPhylogenyParser;
 import org.forester.io.parsers.PhylogenyParser;
 import org.forester.io.parsers.nhx.NHXFormatException;
@@ -45,41 +45,57 @@ import org.forester.io.parsers.util.ParserUtils;
 import org.forester.io.parsers.util.PhylogenyParserException;
 import org.forester.phylogeny.Phylogeny;
 import org.forester.phylogeny.PhylogenyNode;
+import org.forester.phylogeny.data.Sequence;
 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
+import org.forester.sequence.BasicSequence;
+import org.forester.sequence.MolecularSequence;
 import org.forester.util.ForesterUtil;
 
 public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, PhylogenyParser {
 
-    final private static String  begin_trees               = NexusConstants.BEGIN_TREES.toLowerCase();
-    final private static String  end                       = NexusConstants.END.toLowerCase();
-    final private static String  endblock                  = "endblock";
-    final private static Pattern ROOTEDNESS_PATTERN        = Pattern.compile( ".+=\\s*\\[&([R|U])\\].*" );
-    final private static String  taxlabels                 = NexusConstants.TAXLABELS.toLowerCase();
-    final private static Pattern TITLE_PATTERN             = Pattern.compile( "TITLE.?\\s+([^;]+)",
-                                                                              Pattern.CASE_INSENSITIVE );
-    final private static String  translate                 = NexusConstants.TRANSLATE.toLowerCase();
-    final private static String  tree                      = NexusConstants.TREE.toLowerCase();
-    final private static Pattern TREE_NAME_PATTERN         = Pattern.compile( "\\s*.?Tree\\s+(.+?)\\s*=.+",
-                                                                              Pattern.CASE_INSENSITIVE );
-    final private static String  utree                     = NexusConstants.UTREE.toLowerCase();
-    private BufferedReader       _br;
-    private boolean              _ignore_quotes_in_nh_data = Constants.NH_PARSING_IGNORE_QUOTES_DEFAULT;
-    private boolean              _in_taxalabels;
-    private boolean              _in_translate;
-    private boolean              _in_tree;
-    private boolean              _in_trees_block;
-    private boolean              _is_rooted;
-    private String               _name;
-    private Phylogeny            _next;
-    private Object               _nexus_source;
-    private StringBuilder        _nh;
-    private boolean              _replace_underscores      = NHXParser.REPLACE_UNDERSCORES_DEFAULT;
-    private boolean              _rooted_info_present;
-    private List<String>         _taxlabels;
-    private TAXONOMY_EXTRACTION  _taxonomy_extraction      = TAXONOMY_EXTRACTION.NO;
-    private String               _title;
-    private Map<String, String>  _translate_map;
-    private StringBuilder        _translate_sb;
+    final private static boolean DEBUG                               = false;
+    
+    final private static String            begin_trees               = NexusConstants.BEGIN_TREES.toLowerCase();
+    final private static String            end                       = NexusConstants.END.toLowerCase();
+    final private static String            endblock                  = "endblock";
+    final private static Pattern           ROOTEDNESS_PATTERN        = Pattern.compile( ".+=\\s*\\[&([R|U])\\].*" );
+    final private static String            taxlabels                 = NexusConstants.TAXLABELS.toLowerCase();
+    final private static Pattern           TITLE_PATTERN             = Pattern.compile( "TITLE.?\\s+([^;]+)",
+                                                                                        Pattern.CASE_INSENSITIVE );
+    final private static String            translate                 = NexusConstants.TRANSLATE.toLowerCase();
+    final private static String            data                      = NexusConstants.BEGIN_CHARACTERS.toLowerCase();
+    final private static String            characters                = NexusConstants.BEGIN_DATA.toLowerCase();
+    final private static String            tree                      = NexusConstants.TREE.toLowerCase();
+    final private static Pattern           TREE_NAME_PATTERN         = Pattern.compile( "\\s*.?Tree\\s+(.+?)\\s*=.+",
+                                                                                        Pattern.CASE_INSENSITIVE );
+    final private static Pattern           TRANSLATE_PATTERN         = Pattern.compile( "([0-9A-Za-z]+)\\s+(.+)" );
+    final private static Pattern           ALN_PATTERN               = Pattern.compile( "(.+)\\s+([A-Za-z-_\\*\\?]+)" );
+    final private static Pattern           DATATYPE_PATTERN          = Pattern.compile( "datatype\\s?.\\s?([a-z]+)" );
+    //final private static Pattern           LINK_TAXA_PATTERN         = Pattern.compile( "link\\s+taxa\\s?.\\s?([^;]+)",
+    //                                                                                    Pattern.CASE_INSENSITIVE );
+    final private static String            utree                     = NexusConstants.UTREE.toLowerCase();
+    private BufferedReader                 _br;
+    private boolean                        _ignore_quotes_in_nh_data = AptxConstants.NH_PARSING_IGNORE_QUOTES_DEFAULT;
+    private boolean                        _in_taxalabels;
+    private boolean                        _in_translate;
+    private boolean                        _in_tree;
+    private boolean                        _in_trees_block;
+    private boolean                        _in_data_block;
+    private boolean                        _is_rooted;
+    private String                         _datatype;
+    private String                         _name;
+    private Phylogeny                      _next;
+    private Object                         _nexus_source;
+    private StringBuilder                  _nh;
+    private boolean                        _replace_underscores      = NHXParser.REPLACE_UNDERSCORES_DEFAULT;
+    private boolean                        _rooted_info_present;
+    private List<String>                   _taxlabels;
+    private TAXONOMY_EXTRACTION            _taxonomy_extraction      = TAXONOMY_EXTRACTION.NO;
+    private String                         _title;
+    private Map<String, String>            _translate_map;
+    private StringBuilder                  _translate_sb;
+    private Map<String, MolecularSequence> _seqs;
+    private final boolean                  _add_sequences            = true;
 
     @Override
     public String getName() {
@@ -127,6 +143,7 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
         _in_tree = false;
         _rooted_info_present = false;
         _is_rooted = false;
+        _seqs = new HashMap<String, MolecularSequence>();
         _br = ParserUtils.createReader( _nexus_source );
         getNext();
     }
@@ -213,6 +230,15 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
                         node.setName( node.getName().replace( '_', ' ' ).trim() );
                     }
                 }
+                if ( _add_sequences ) {
+                    if ( _seqs.containsKey( node.getName() ) ) {
+                        final MolecularSequence s = _seqs.get( node.getName() );
+                        //TODO need to check for uniqueness when adding seqs....
+                        final Sequence ns = new Sequence( s );
+                        ns.setMolecularSequenceAligned( true ); //TODO need to check if all same length
+                        node.getNodeData().addSequence( ns );
+                    }
+                }
             }
         }
         _next = p;
@@ -222,6 +248,9 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
         _next = null;
         String line;
         while ( ( line = _br.readLine() ) != null ) {
+            if ( DEBUG ) {
+                System.out.println( line );
+            }
             line = line.trim();
             if ( ( line.length() > 0 ) && !line.startsWith( "#" ) && !line.startsWith( ">" ) ) {
                 line = ForesterUtil.collapseWhiteSpace( line );
@@ -231,17 +260,31 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
                     _in_trees_block = true;
                     _in_taxalabels = false;
                     _in_translate = false;
+                    _in_data_block = false;
+                    _datatype = null;
                     _title = "";
                 }
                 else if ( line_lc.startsWith( taxlabels ) ) {
+                    //TODO need to be taxa block instead
                     _in_trees_block = false;
                     _in_taxalabels = true;
                     _in_translate = false;
+                    _in_data_block = false;
+                    _datatype = null;
                 }
                 else if ( line_lc.startsWith( translate ) ) {
                     _translate_sb = new StringBuilder();
                     _in_taxalabels = false;
                     _in_translate = true;
+                    _in_data_block = false;
+                    _datatype = null;
+                }
+                else if ( line_lc.startsWith( characters ) || line_lc.startsWith( data ) ) {
+                    _in_taxalabels = false;
+                    _in_trees_block = false;
+                    _in_translate = false;
+                    _in_data_block = true;
+                    _datatype = null;
                 }
                 else if ( _in_trees_block ) {
                     if ( line_lc.startsWith( "title" ) ) {
@@ -251,6 +294,10 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
                         }
                     }
                     else if ( line_lc.startsWith( "link" ) ) {
+                        //final Matcher link_m = LINK_TAXA_PATTERN.matcher( line );
+                        //if ( link_m.lookingAt() ) {
+                            //final String link = link_m.group( 1 );  //TODO why?
+                       // }
                     }
                     else if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
                         _in_trees_block = false;
@@ -346,6 +393,50 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
                         }
                     }
                 }
+                if ( _in_data_block ) {
+                    if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
+                        _in_data_block = false;
+                        _datatype = null;
+                    }
+                    else if ( line_lc.startsWith( "link" ) ) {
+                     //   final Matcher link_m = LINK_TAXA_PATTERN.matcher( line );
+                     //   if ( link_m.lookingAt() ) {
+                     //       final String link = link_m.group( 1 );
+                     //   }
+                    }
+                    else {
+                        final Matcher datatype_matcher = DATATYPE_PATTERN.matcher( line_lc );
+                        if ( datatype_matcher.find() ) {
+                            _datatype = datatype_matcher.group( 1 );
+                        }
+                        else {
+                            if ( ( _datatype != null )
+                                    && ( _datatype.equals( "protein" ) || _datatype.equals( "dna" ) || _datatype
+                                            .equals( "rna" ) ) ) {
+                                if ( line.endsWith( ";" ) ) {
+                                    _in_data_block = false;
+                                    line = line.substring( 0, line.length() - 1 );
+                                }
+                                final Matcher aln_matcher = ALN_PATTERN.matcher( line );
+                                if ( aln_matcher.matches() ) {
+                                    final String id = aln_matcher.group( 1 );
+                                    final String seq = aln_matcher.group( 2 );
+                                    MolecularSequence s = null;
+                                    if ( _datatype.equals( "protein" ) ) {
+                                        s = BasicSequence.createAaSequence( id, seq );
+                                    }
+                                    else if ( _datatype.equals( "dna" ) ) {
+                                        s = BasicSequence.createDnaSequence( id, seq );
+                                    }
+                                    else {
+                                        s = BasicSequence.createRnaSequence( id, seq );
+                                    }
+                                    _seqs.put( id, s );
+                                }
+                            }
+                        }
+                    }
+                }
             }
         }
         if ( _nh.length() > 0 ) {
@@ -361,23 +452,20 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
         if ( s.endsWith( ";" ) ) {
             s = s.substring( 0, s.length() - 1 ).trim();
         }
-        for( final String pair : s.split( "," ) ) {
-            final String[] kv = pair.trim().split( "\\s+" );
-            if ( ( kv.length < 2 ) || ( kv.length > 3 ) ) {
-                throw new IOException( "ill-formatted translate values: " + pair );
-            }
-            if ( ( kv.length == 3 ) && !kv[ 0 ].toLowerCase().trim().equals( translate ) ) {
-                throw new IOException( "ill-formatted translate values: " + pair );
-            }
+        for( String pair : s.split( "," ) ) {
             String key = "";
             String value = "";
-            if ( kv.length == 3 ) {
-                key = kv[ 1 ];
-                value = kv[ 2 ];
+            final int ti = pair.toLowerCase().indexOf( "translate" );
+            if ( ti > -1 ) {
+                pair = pair.substring( ti + 9 );
+            }
+            final Matcher m = TRANSLATE_PATTERN.matcher( pair );
+            if ( m.find() ) {
+                key = m.group( 1 );
+                value = m.group( 2 ).replaceAll( "\'", "" ).replaceAll( "\"", "" ).trim();
             }
             else {
-                key = kv[ 0 ];
-                value = kv[ 1 ];
+                throw new IOException( "ill-formatted translate values: " + pair );
             }
             if ( value.endsWith( ";" ) ) {
                 value = value.substring( 0, value.length() - 1 );