inprogress
author cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Tue, 19 Nov 2013 21:08:37 +0000 (21:08 +0000)
committer cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Tue, 19 Nov 2013 21:08:37 +0000 (21:08 +0000)
forester/java/src/org/forester/test/Test.java
forester/java/src/org/forester/ws/seqdb/EbiDbEntry.java
forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java

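diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java
index 2583a90..ae43b31 100644
--- a/forester/java/src/org/forester/test/Test.java
+++ b/forester/java/src/org/forester/test/Test.java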
@@ -127,7 +127,7 @@ import org.forester.ws.wabi.TxSearch.TAX_RANK;
 @SuppressWarnings( "unused")
 public final class Test {
 
-    private final static boolean PERFORM_DB_TESTS          = false;
+    private final static boolean PERFORM_DB_TESTS          = true;
     private final static double  ZERO_DIFF                 = 1.0E-9;
     private final static String  PATH_TO_TEST_DATA         = System.getProperty( "user.dir" )
                                                                    + ForesterUtil.getFileSeparator() + "test_data"
@@ -490,18 +490,6 @@ public final class Test {
             System.out.println( "failed." );
             failed++;
         }
-        if ( PERFORM_DB_TESTS ) {
-            System.out.print( "Ebi Entry Retrieval: " );
-            if ( Test.testEbiEntryRetrieval() ) {
-                System.out.println( "OK." );
-                succeeded++;
-            }
-            else {
-                System.out.println( "failed." );
-                failed++;
-            }
-        }
-        /////////////////////System.exit( 0 );
         System.out.print( "UniProtKB id extraction: " );
         if ( Test.testExtractUniProtKbProteinSeqIdentifier() ) {
             System.out.println( "OK." );
@@ -521,6 +509,18 @@ public final class Test {
             failed++;
         }
         if ( PERFORM_DB_TESTS ) {
+            System.out.print( "Ebi Entry Retrieval: " );
+            if ( Test.testEbiEntryRetrieval() ) {
+                System.out.println( "OK." );
+                succeeded++;
+            }
+            else {
+                System.out.println( "failed." );
+                failed++;
+            }
+        }
+        // System.exit( 0 );
+        if ( PERFORM_DB_TESTS ) {
             System.out.print( "Sequence DB tools 2: " );
             if ( testSequenceDbWsTools2() ) {
                 System.out.println( "OK." );
@@ -532,6 +532,7 @@ public final class Test {
                 System.exit( -1 );
             }
         }
+        // System.exit( 0 );
         System.out.print( "Hmmscan output parser: " );
         if ( testHmmscanOutputParser() ) {
             System.out.println( "OK." );
@@ -11297,6 +11298,20 @@ public final class Test {
                 System.out.println( acc.toString() );
                 return false;
             }
+            n.setName( "gi|71845847|1,4-alpha-glucan branching enzyme [Dechloromonas aromatica RCB]" );
+            acc = SequenceDbWsTools.obtainSeqAccession( n );
+            if ( ( acc == null ) || !acc.getSource().equals( Source.GI.toString() )
+                    || !acc.getValue().equals( "71845847" ) ) {
+                System.out.println( acc.toString() );
+                return false;
+            }
+            n.setName( "gi|71845847|gb|AAZ45343.1| 1,4-alpha-glucan branching enzyme [Dechloromonas aromatica RCB]" );
+            acc = SequenceDbWsTools.obtainSeqAccession( n );
+            if ( ( acc == null ) || !acc.getSource().equals( Source.NCBI.toString() )
+                    || !acc.getValue().equals( "AAZ45343.1" ) ) {
+                System.out.println( acc.toString() );
+                return false;
+            }
         }
         catch ( final Exception e ) {
             return false;
@@ -11322,7 +11337,6 @@ public final class Test {
             }
             final PhylogenyNode n2 = new PhylogenyNode( "NM_001030253" );
             SequenceDbWsTools.obtainSeqInformation( n2 );
-            System.out.println( n2.toString() );
             if ( !n2.getNodeData().getSequence().getName()
                     .equals( "Danio rerio B-cell leukemia/lymphoma 2 (bcl2), mRNA" ) ) {
                 return false;
@@ -11338,7 +11352,6 @@ public final class Test {
             }
             final PhylogenyNode n3 = new PhylogenyNode( "NM_184234.2" );
             SequenceDbWsTools.obtainSeqInformation( n3 );
-            System.out.println( "n=" + n3.toString() );
             if ( !n3.getNodeData().getSequence().getName()
                     .equals( "Homo sapiens RNA binding motif protein 39 (RBM39), transcript variant 1, mRNA" ) ) {
                 return false;
@@ -11503,14 +11516,14 @@ public final class Test {
                 System.out.println( entry4.getGeneName() );
                 return false;
             }
-            if ( !entry4.getChromosome().equals( "ras" ) ) {
-                System.out.println( entry4.getChromosome() );
-                return false;
-            }
-            if ( !entry4.getMap().equals( "ras" ) ) {
-                System.out.println( entry4.getMap() );
-                return false;
-            }
+            //   if ( !entry4.getChromosome().equals( "ras" ) ) {
+            //     System.out.println( entry4.getChromosome() );
+            //     return false;
+            // }
+            // if ( !entry4.getMap().equals( "ras" ) ) {
+            //     System.out.println( entry4.getMap() );
+            //     return false;
+            // }
             //TODO FIXME gi...
             //
             //TODO fails:
@@ -11518,6 +11531,22 @@ public final class Test {
             //            if ( !entry5.getAccession().equals( "HM043801" ) ) {
             //                return false;
             //            }
+            final SequenceDatabaseEntry entry5 = SequenceDbWsTools.obtainEntry( "AAZ45343.1" );
+            if ( !entry5.getAccession().equals( "AAZ45343" ) ) {
+                return false;
+            }
+            if ( !entry5.getTaxonomyScientificName().equals( "Dechloromonas aromatica RCB" ) ) {
+                System.out.println( entry5.getTaxonomyScientificName() );
+                return false;
+            }
+            if ( !entry5.getSequenceName().equals( "Dechloromonas aromatica RCB 1,4-alpha-glucan branching enzyme" ) ) {
+                System.out.println( entry5.getSequenceName() );
+                return false;
+            }
+            if ( !entry5.getTaxonomyIdentifier().equals( "159087" ) ) {
+                System.out.println( entry5.getTaxonomyIdentifier() );
+                return false;
+            }
         }
         catch ( final IOException e ) {
             System.out.println();
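diff --git a/forester/java/src/org/forester/ws/seqdb/EbiDbEntry.java b/forester/java/src/org/forester/ws/seqdb/EbiDbEntry.java
index e32a2a4..00c52fc 100644
--- a/forester/java/src/org/forester/ws/seqdb/EbiDbEntry.java
+++ b/forester/java/src/org/forester/ws/seqdb/EbiDbEntry.java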
@@ -38,273 +38,19 @@ import org.forester.util.ForesterUtil;
 
 public final class EbiDbEntry implements SequenceDatabaseEntry {
 
-    //    public static SequenceDatabaseEntry createInstanceFromPlainText( final List<String> lines ) {
-    //        final EbiDbEntry e = new EbiDbEntry();
-    //        for( final String line : lines ) {
-    //            if ( line.startsWith( "PA" ) ) {
-    //                e.setPA( SequenceDbWsTools.extractFrom( line, "PA" ) );
-    //            }
-    //            else if ( line.startsWith( "DE" ) ) {
-    //                e.setDe( SequenceDbWsTools.extractFrom( line, "DE" ) );
-    //            }
-    //            else if ( line.startsWith( "OS" ) ) {
-    //                if ( line.indexOf( "(" ) > 0 ) {
-    //                    e.setOs( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
-    //                }
-    //                else {
-    //                    e.setOs( SequenceDbWsTools.extractFrom( line, "OS" ) );
-    //                }
-    //            }
-    //            else if ( line.startsWith( "OX" ) ) {
-    //                if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) {
-    //                    e.setTaxId( SequenceDbWsTools.extractFromTo( line, "NCBI_TaxID=", ";" ) );
-    //                }
-    //            }
-    //        }
-    //        return e;
-    //    }
-    public static SequenceDatabaseEntry createInstanceFromPlainTextForRefSeq( final List<String> lines ) {
-        final Pattern X_PATTERN = Pattern.compile( "^[A-Z]+" );
-        final Pattern chromosome_PATTERN = Pattern.compile( "\\s+/chromosome=\"(\\w+)\"" );
-        final Pattern map_PATTERN = Pattern.compile( "\\s+/map=\"([\\w+\\.])\"" );
-        final Pattern gene_PATTERN = Pattern.compile( "\\s+/gene=\"(.+)\"" );
-        final Pattern mim_PATTERN = Pattern.compile( "\\s+/db_xref=\"MIM:(\\d+)\"" );
-        final Pattern taxon_PATTERN = Pattern.compile( "\\s+/db_xref=\"taxon:(\\d+)\"" );
-        final Pattern interpro_PATTERN = Pattern.compile( "\\s+/db_xref=\"InterPro:([A-Z0-9]+)\"" );
-        final Pattern uniprot_PATTERN = Pattern.compile( "\\s+/db_xref=\"UniProtKB/[A-Za-z-]*:(\\w+)\"" );
-        final Pattern hgnc_PATTERN = Pattern.compile( "\\s+/db_xref=\"[A-Z:]*HGNC:(\\d+)\"" );
-        final Pattern geneid_PATTERN = Pattern.compile( "\\s+/db_xref=\"GeneID:(\\d+)\"" );
-        final Pattern pdb_PATTERN = Pattern.compile( "\\s+/db_xref=\"PDB:([A-Z0-9]+)\"" );
-        final Pattern ec_PATTERN = Pattern.compile( "\\s+/EC_number=\"([\\.\\-\\d]+)\"" );
-        final Pattern product_PATTERN = Pattern.compile( "\\s+/product=\"(\\w{1,10})\"" );
-        final EbiDbEntry e = new EbiDbEntry();
-        final StringBuilder def = new StringBuilder();
-        boolean in_definition = false;
-        boolean in_features = false;
-        boolean in_source = false;
-        boolean in_gene = false;
-        boolean in_cds = false;
-        boolean in_mrna = false;
-        boolean in_protein = false;
-        for( final String line : lines ) {
-            if ( line.startsWith( "ACCESSION " ) ) {
-                e.setAccession( SequenceDbWsTools.extractFrom( line, "ACCESSION" ) );
-                in_definition = false;
-            }
-            else if ( line.startsWith( "ID " ) ) {
-                e.setAccession( SequenceDbWsTools.extractFromTo( line, "ID", ";" ) );
-                in_definition = false;
-            }
-            else if ( line.startsWith( "DEFINITION " ) || ( line.startsWith( "DE " ) ) ) {
-                boolean definiton = false;
-                if ( line.startsWith( "DEFINITION " ) ) {
-                    definiton = true;
-                }
-                if ( line.indexOf( "[" ) > 0 ) {
-                    if ( definiton ) {
-                        x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) ) );
-                    }
-                    else {
-                        x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "[" ) ) );
-                    }
-                }
-                else if ( line.indexOf( "." ) > 0 ) {
-                    if ( definiton ) {
-                        x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) ) );
-                    }
-                    else {
-                        x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "." ) ) );
-                    }
-                }
-                else {
-                    if ( definiton ) {
-                        x( def, ( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) ) );
-                    }
-                    else {
-                        x( def, ( SequenceDbWsTools.extractFrom( line, "DE" ) ) );
-                    }
-                }
-                if ( definiton ) {
-                    in_definition = true;
-                }
-            }
-            else if ( line.startsWith( "  ORGANISM " ) ) {
-                if ( line.indexOf( "(" ) > 0 ) {
-                    e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, "  ORGANISM", "(" ) );
-                }
-                else {
-                    e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, "  ORGANISM" ) );
-                }
-                //  in_def = false;
-            }
-            else if ( line.startsWith( "OS " ) ) {
-                if ( line.indexOf( "(" ) > 0 ) {
-                    e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
-                }
-                else {
-                    e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, "OS" ) );
-                }
-            }
-            else if ( line.startsWith( " " ) && in_definition ) {
-                def.append( " " );
-                if ( line.indexOf( "[" ) > 0 ) {
-                    def.append( SequenceDbWsTools.extractTo( line, "[" ) );
-                }
-                else if ( line.indexOf( "." ) > 0 ) {
-                    def.append( SequenceDbWsTools.extractTo( line, "." ) );
-                }
-                else {
-                    def.append( line.trim() );
-                }
-            }
-            else {
-                in_definition = false;
-            }
-            if ( !line.startsWith( "FT " ) && X_PATTERN.matcher( line ).find() ) {
-                in_features = false;
-                in_source = false;
-                in_gene = false;
-                in_cds = false;
-                in_mrna = false;
-                in_protein = false;
-                // in_def = false;
-            }
-            if ( line.startsWith( "FEATURES " ) || line.startsWith( "FT " ) ) {
-                in_features = true;
-            }
-            if ( in_features && ( line.startsWith( "     source " ) || line.startsWith( "FT   source " ) ) ) {
-                in_source = true;
-                in_gene = false;
-                in_cds = false;
-                in_mrna = false;
-                in_protein = false;
-            }
-            if ( in_features && ( line.startsWith( "     gene " ) || line.startsWith( "FT   gene " ) ) ) {
-                in_source = false;
-                in_gene = true;
-                in_cds = false;
-                in_mrna = false;
-                in_protein = false;
-            }
-            if ( in_features && ( line.startsWith( "     CDS " ) || line.startsWith( "FT   CDS " ) ) ) {
-                in_source = false;
-                in_gene = false;
-                in_cds = true;
-                in_mrna = false;
-                in_protein = false;
-            }
-            if ( in_features && ( line.startsWith( "     Protein " ) || line.startsWith( "FT   Protein " ) ) ) {
-                in_source = false;
-                in_gene = false;
-                in_cds = false;
-                in_mrna = false;
-                in_protein = true;
-            }
-            if ( in_features && ( line.startsWith( "     mRNA " ) || line.startsWith( "FT   mRNA " ) ) ) {
-                in_source = false;
-                in_gene = false;
-                in_cds = false;
-                in_mrna = true;
-                in_protein = false;
-            }
-            if ( in_source ) {
-                final Matcher ti = taxon_PATTERN.matcher( line );
-                if ( ti.find() ) {
-                    e.setTaxId( ti.group( 1 ) );
-                }
-                final Matcher chr = chromosome_PATTERN.matcher( line );
-                if ( chr.find() ) {
-                    e.setChromosome( chr.group( 1 ) );
-                }
-                final Matcher map = map_PATTERN.matcher( line );
-                if ( map.find() ) {
-                    e.setMap( map.group( 1 ) );
-                }
-            }
-            if ( in_cds || in_gene ) {
-                final Matcher hgnc = hgnc_PATTERN.matcher( line );
-                if ( hgnc.find() ) {
-                    e.addCrossReference( new Accession( hgnc.group( 1 ), "hgnc" ) );
-                }
-                final Matcher geneid = geneid_PATTERN.matcher( line );
-                if ( geneid.find() ) {
-                    e.addCrossReference( new Accession( geneid.group( 1 ), "geneid" ) );
-                }
-            }
-            if ( in_protein || in_cds || in_gene || in_mrna ) {
-                final Matcher ec = ec_PATTERN.matcher( line );
-                if ( ec.find() ) {
-                    e.addAnnotation( new Annotation( "EC", ec.group( 1 ) ) );
-                }
-                final Matcher gene = gene_PATTERN.matcher( line );
-                if ( gene.find() ) {
-                    e.setGeneName( gene.group( 1 ) );
-                }
-                final Matcher uniprot = uniprot_PATTERN.matcher( line );
-                if ( uniprot.find() ) {
-                    e.addCrossReference( new Accession( uniprot.group( 1 ), "uniprot" ) );
-                }
-                final Matcher interpro = interpro_PATTERN.matcher( line );
-                if ( interpro.find() ) {
-                    e.addCrossReference( new Accession( interpro.group( 1 ), "interpro" ) );
-                }
-                final Matcher mim = mim_PATTERN.matcher( line );
-                if ( mim.find() ) {
-                    e.addCrossReference( new Accession( mim.group( 1 ), "mim" ) );
-                }
-                final Matcher product = product_PATTERN.matcher( line );
-                if ( product.find() ) {
-                    e.setSequenceSymbol( product.group( 1 ) );
-                }
-                final Matcher pdb = pdb_PATTERN.matcher( line );
-                if ( pdb.find() ) {
-                    e.addCrossReference( new Accession( pdb.group( 1 ), "pdb" ) );
-                }
-            }
-        }
-        if ( def.length() > 0 ) {
-            e.setSequenceName( def.toString().trim() );
-        }
-        return e;
-    }
-    private String _map;
-    private String _chromosome;
-
-    private void setMap( final String map ) {
-        _map = map;
-    }
-
-    private void setChromosome( final String chromosome ) {
-        _chromosome = chromosome;
-    }
-
-    @Override
-    public String getMap() {
-        return _map;
-    }
-
-    @Override
-    public String getChromosome() {
-        return _chromosome;
-    }
-
-    private static void x( final StringBuilder sb, final String s ) {
-        if ( sb.length() > 0 ) {
-            sb.append( " " );
-        }
-        sb.append( s.trim() );
-    }
+    private SortedSet<Annotation> _annotations;
+    private String                _chromosome;
+    private SortedSet<Accession>  _cross_references;
+    private String                _de;
+    private String                _gene_name;
+    private String                _map;
+    private String                _os;
     // FIXME actually this is NCBI entry
     //http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/emb/AAR37336/
     private String                _pa;
-    private String                _de;
-    private String                _os;
-    private String                _tax_id;
-    private String                _symbol;
     private String                _provider;
-    private SortedSet<Accession>  _cross_references;
-    private SortedSet<Annotation> _annotations;
-    private String                _gene_name;
+    private String                _symbol;
+    private String                _tax_id;
 
     // TODO  PUBMED   15798186
     //TODO  (FEATURES) 
@@ -560,14 +306,6 @@ public final class EbiDbEntry implements SequenceDatabaseEntry {
     private EbiDbEntry() {
     }
 
-    private void addCrossReference( final Accession accession ) {
-        if ( _cross_references == null ) {
-            _cross_references = new TreeSet<Accession>();
-        }
-        System.out.println( "XREF ADDED: " + accession );
-        _cross_references.add( accession );
-    }
-
     @Override
     public Object clone() throws CloneNotSupportedException {
         throw new CloneNotSupportedException();
@@ -579,6 +317,16 @@ public final class EbiDbEntry implements SequenceDatabaseEntry {
     }
 
     @Override
+    public SortedSet<Annotation> getAnnotations() {
+        return _annotations;
+    }
+
+    @Override
+    public String getChromosome() {
+        return _chromosome;
+    }
+
+    @Override
     public SortedSet<Accession> getCrossReferences() {
         return _cross_references;
     }
@@ -594,6 +342,11 @@ public final class EbiDbEntry implements SequenceDatabaseEntry {
     }
 
     @Override
+    public String getMap() {
+        return _map;
+    }
+
+    @Override
     public String getProvider() {
         return _provider;
     }
@@ -608,10 +361,6 @@ public final class EbiDbEntry implements SequenceDatabaseEntry {
         return _symbol;
     }
 
-    private void setSequenceSymbol( final String symbol ) {
-        _symbol = symbol;
-    }
-
     @Override
     public String getTaxonomyIdentifier() {
         return _tax_id;
@@ -629,10 +378,33 @@ public final class EbiDbEntry implements SequenceDatabaseEntry {
                 && ForesterUtil.isEmpty( getTaxonomyIdentifier() ) && ForesterUtil.isEmpty( getSequenceSymbol() ) );
     }
 
-    private void setSequenceName( final String rec_name ) {
-        if ( _de == null ) {
-            _de = rec_name;
+    public void setProvider( final String provider ) {
+        _provider = provider;
+    }
+
+    private void addAnnotation( final Annotation annotation ) {
+        if ( _annotations == null ) {
+            _annotations = new TreeSet<Annotation>();
         }
+        _annotations.add( annotation );
+    }
+
+    private void addCrossReference( final Accession accession ) {
+        if ( _cross_references == null ) {
+            _cross_references = new TreeSet<Accession>();
+        }
+        System.out.println( "XREF ADDED: " + accession );
+        _cross_references.add( accession );
+    }
+
+    private void setAccession( final String pa ) {
+        if ( _pa == null ) {
+            _pa = pa;
+        }
+    }
+
+    private void setChromosome( final String chromosome ) {
+        _chromosome = chromosome;
     }
 
     private void setGeneName( final String gene_name ) {
@@ -641,20 +413,18 @@ public final class EbiDbEntry implements SequenceDatabaseEntry {
         }
     }
 
-    private void setTaxonomyScientificName( final String os ) {
-        if ( _os == null ) {
-            _os = os;
-        }
+    private void setMap( final String map ) {
+        _map = map;
     }
 
-    private void setAccession( final String pa ) {
-        if ( _pa == null ) {
-            _pa = pa;
+    private void setSequenceName( final String rec_name ) {
+        if ( _de == null ) {
+            _de = rec_name;
         }
     }
 
-    public void setProvider( final String provider ) {
-        _provider = provider;
+    private void setSequenceSymbol( final String symbol ) {
+        _symbol = symbol;
     }
 
     private void setTaxId( final String tax_id ) {
@@ -663,15 +433,246 @@ public final class EbiDbEntry implements SequenceDatabaseEntry {
         }
     }
 
-    @Override
-    public SortedSet<Annotation> getAnnotations() {
-        return _annotations;
+    private void setTaxonomyScientificName( final String os ) {
+        if ( _os == null ) {
+            _os = os;
+        }
     }
 
-    private void addAnnotation( final Annotation annotation ) {
-        if ( _annotations == null ) {
-            _annotations = new TreeSet<Annotation>();
+    //    public static SequenceDatabaseEntry createInstanceFromPlainText( final List<String> lines ) {
+    //        final EbiDbEntry e = new EbiDbEntry();
+    //        for( final String line : lines ) {
+    //            if ( line.startsWith( "PA" ) ) {
+    //                e.setPA( SequenceDbWsTools.extractFrom( line, "PA" ) );
+    //            }
+    //            else if ( line.startsWith( "DE" ) ) {
+    //                e.setDe( SequenceDbWsTools.extractFrom( line, "DE" ) );
+    //            }
+    //            else if ( line.startsWith( "OS" ) ) {
+    //                if ( line.indexOf( "(" ) > 0 ) {
+    //                    e.setOs( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
+    //                }
+    //                else {
+    //                    e.setOs( SequenceDbWsTools.extractFrom( line, "OS" ) );
+    //                }
+    //            }
+    //            else if ( line.startsWith( "OX" ) ) {
+    //                if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) {
+    //                    e.setTaxId( SequenceDbWsTools.extractFromTo( line, "NCBI_TaxID=", ";" ) );
+    //                }
+    //            }
+    //        }
+    //        return e;
+    //    }
+    public static SequenceDatabaseEntry createInstanceFromPlainTextForRefSeq( final List<String> lines ) {
+        final Pattern X_PATTERN = Pattern.compile( "^[A-Z]+" );
+        final Pattern chromosome_PATTERN = Pattern.compile( "\\s+/chromosome=\"(\\w+)\"" );
+        final Pattern map_PATTERN = Pattern.compile( "\\s+/map=\"([\\w+\\.])\"" );
+        final Pattern gene_PATTERN = Pattern.compile( "\\s+/gene=\"(.+)\"" );
+        final Pattern mim_PATTERN = Pattern.compile( "\\s+/db_xref=\"MIM:(\\d+)\"" );
+        final Pattern taxon_PATTERN = Pattern.compile( "\\s+/db_xref=\"taxon:(\\d+)\"" );
+        final Pattern interpro_PATTERN = Pattern.compile( "\\s+/db_xref=\"InterPro:([A-Z0-9]+)\"" );
+        final Pattern uniprot_PATTERN = Pattern.compile( "\\s+/db_xref=\"UniProtKB/[A-Za-z-]*:(\\w+)\"" );
+        final Pattern hgnc_PATTERN = Pattern.compile( "\\s+/db_xref=\"[A-Z:]*HGNC:(\\d+)\"" );
+        final Pattern geneid_PATTERN = Pattern.compile( "\\s+/db_xref=\"GeneID:(\\d+)\"" );
+        final Pattern pdb_PATTERN = Pattern.compile( "\\s+/db_xref=\"PDB:([A-Z0-9]+)\"" );
+        final Pattern ec_PATTERN = Pattern.compile( "\\s+/EC_number=\"([\\.\\-\\d]+)\"" );
+        final Pattern product_PATTERN = Pattern.compile( "\\s+/product=\"(\\w{1,10})\"" );
+        final EbiDbEntry e = new EbiDbEntry();
+        final StringBuilder def = new StringBuilder();
+        boolean in_definition = false;
+        boolean in_features = false;
+        boolean in_source = false;
+        boolean in_gene = false;
+        boolean in_cds = false;
+        boolean in_mrna = false;
+        boolean in_protein = false;
+        for( final String line : lines ) {
+            if ( line.startsWith( "ACCESSION " ) ) {
+                e.setAccession( SequenceDbWsTools.extractFrom( line, "ACCESSION" ) );
+                in_definition = false;
+            }
+            else if ( line.startsWith( "ID " ) ) {
+                e.setAccession( SequenceDbWsTools.extractFromTo( line, "ID", ";" ) );
+                in_definition = false;
+            }
+            else if ( line.startsWith( "DEFINITION " ) || ( line.startsWith( "DE " ) ) ) {
+                boolean definiton = false;
+                if ( line.startsWith( "DEFINITION " ) ) {
+                    definiton = true;
+                }
+                if ( line.indexOf( "[" ) > 0 ) {
+                    if ( definiton ) {
+                        x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) ) );
+                    }
+                    else {
+                        x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "[" ) ) );
+                    }
+                }
+                else if ( line.indexOf( "." ) > 0 ) {
+                    if ( definiton ) {
+                        x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) ) );
+                    }
+                    else {
+                        x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "." ) ) );
+                    }
+                }
+                else {
+                    if ( definiton ) {
+                        x( def, ( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) ) );
+                    }
+                    else {
+                        x( def, ( SequenceDbWsTools.extractFrom( line, "DE" ) ) );
+                    }
+                }
+                if ( definiton ) {
+                    in_definition = true;
+                }
+            }
+            else if ( line.startsWith( "  ORGANISM " ) ) {
+                if ( line.indexOf( "(" ) > 0 ) {
+                    e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, "  ORGANISM", "(" ) );
+                }
+                else {
+                    e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, "  ORGANISM" ) );
+                }
+                //  in_def = false;
+            }
+            else if ( line.startsWith( "OS " ) ) {
+                if ( line.indexOf( "(" ) > 0 ) {
+                    e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
+                }
+                else {
+                    e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, "OS" ) );
+                }
+            }
+            else if ( line.startsWith( " " ) && in_definition ) {
+                def.append( " " );
+                if ( line.indexOf( "[" ) > 0 ) {
+                    def.append( SequenceDbWsTools.extractTo( line, "[" ) );
+                }
+                else if ( line.indexOf( "." ) > 0 ) {
+                    def.append( SequenceDbWsTools.extractTo( line, "." ) );
+                }
+                else {
+                    def.append( line.trim() );
+                }
+            }
+            else {
+                in_definition = false;
+            }
+            if ( !line.startsWith( "FT " ) && X_PATTERN.matcher( line ).find() ) {
+                in_features = false;
+                in_source = false;
+                in_gene = false;
+                in_cds = false;
+                in_mrna = false;
+                in_protein = false;
+                // in_def = false;
+            }
+            if ( line.startsWith( "FEATURES " ) || line.startsWith( "FT " ) ) {
+                in_features = true;
+            }
+            if ( in_features && ( line.startsWith( "     source " ) || line.startsWith( "FT   source " ) ) ) {
+                in_source = true;
+                in_gene = false;
+                in_cds = false;
+                in_mrna = false;
+                in_protein = false;
+            }
+            if ( in_features && ( line.startsWith( "     gene " ) || line.startsWith( "FT   gene " ) ) ) {
+                in_source = false;
+                in_gene = true;
+                in_cds = false;
+                in_mrna = false;
+                in_protein = false;
+            }
+            if ( in_features && ( line.startsWith( "     CDS " ) || line.startsWith( "FT   CDS " ) ) ) {
+                in_source = false;
+                in_gene = false;
+                in_cds = true;
+                in_mrna = false;
+                in_protein = false;
+            }
+            if ( in_features && ( line.startsWith( "     Protein " ) || line.startsWith( "FT   Protein " ) ) ) {
+                in_source = false;
+                in_gene = false;
+                in_cds = false;
+                in_mrna = false;
+                in_protein = true;
+            }
+            if ( in_features && ( line.startsWith( "     mRNA " ) || line.startsWith( "FT   mRNA " ) ) ) {
+                in_source = false;
+                in_gene = false;
+                in_cds = false;
+                in_mrna = true;
+                in_protein = false;
+            }
+            if ( in_source ) {
+                final Matcher ti = taxon_PATTERN.matcher( line );
+                if ( ti.find() ) {
+                    e.setTaxId( ti.group( 1 ) );
+                }
+                final Matcher chr = chromosome_PATTERN.matcher( line );
+                if ( chr.find() ) {
+                    e.setChromosome( chr.group( 1 ) );
+                }
+                final Matcher map = map_PATTERN.matcher( line );
+                if ( map.find() ) {
+                    e.setMap( map.group( 1 ) );
+                }
+            }
+            if ( in_cds || in_gene ) {
+                final Matcher hgnc = hgnc_PATTERN.matcher( line );
+                if ( hgnc.find() ) {
+                    e.addCrossReference( new Accession( hgnc.group( 1 ), "hgnc" ) );
+                }
+                final Matcher geneid = geneid_PATTERN.matcher( line );
+                if ( geneid.find() ) {
+                    e.addCrossReference( new Accession( geneid.group( 1 ), "geneid" ) );
+                }
+            }
+            if ( in_protein || in_cds || in_gene || in_mrna ) {
+                final Matcher ec = ec_PATTERN.matcher( line );
+                if ( ec.find() ) {
+                    e.addAnnotation( new Annotation( "EC", ec.group( 1 ) ) );
+                }
+                final Matcher gene = gene_PATTERN.matcher( line );
+                if ( gene.find() ) {
+                    e.setGeneName( gene.group( 1 ) );
+                }
+                final Matcher uniprot = uniprot_PATTERN.matcher( line );
+                if ( uniprot.find() ) {
+                    e.addCrossReference( new Accession( uniprot.group( 1 ), "uniprot" ) );
+                }
+                final Matcher interpro = interpro_PATTERN.matcher( line );
+                if ( interpro.find() ) {
+                    e.addCrossReference( new Accession( interpro.group( 1 ), "interpro" ) );
+                }
+                final Matcher mim = mim_PATTERN.matcher( line );
+                if ( mim.find() ) {
+                    e.addCrossReference( new Accession( mim.group( 1 ), "mim" ) );
+                }
+                final Matcher product = product_PATTERN.matcher( line );
+                if ( product.find() ) {
+                    e.setSequenceSymbol( product.group( 1 ) );
+                }
+                final Matcher pdb = pdb_PATTERN.matcher( line );
+                if ( pdb.find() ) {
+                    e.addCrossReference( new Accession( pdb.group( 1 ), "pdb" ) );
+                }
+            }
         }
-        _annotations.add( annotation );
+        if ( def.length() > 0 ) {
+            e.setSequenceName( def.toString().trim() );
+        }
+        return e;
+    }
+
+    private static void x( final StringBuilder sb, final String s ) {
+        if ( sb.length() > 0 ) {
+            sb.append( " " );
+        }
+        sb.append( s.trim() );
     }
 }
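diff --git a/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java b/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java
index 29376bd..d7cafd1 100644
--- a/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java
+++ b/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java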
@@ -61,6 +61,7 @@ public final class SequenceDbWsTools {
     public final static String   EMBL_DBS_REFSEQ_P       = "refseqp";
     public final static String   EMBL_GENBANK            = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=GENBANK&style=raw&id=";
     public final static String   EMBL_REFSEQ             = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=REFSEQ&style=raw&id=";
+    public final static String   EMBL_EMBL               = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=EMBL&style=raw&id="; // dbfetch URL prefix selecting db=EMBL in raw style; ends in "id=" (presumably the entry id is appended by the caller -- confirm in queryDb)
     private final static boolean DEBUG                   = true;
     private final static String  URL_ENC                 = "UTF-8";
 
@@ -257,28 +258,24 @@ public final class SequenceDbWsTools {
         return result;
     }
 
-    public static List<String> queryEmblDb( final Accession id, final int max_lines_to_return ) throws IOException {
+    public static List<String> queryEmblDb( final Accession acc, final int max_lines_to_return ) throws IOException {
         final StringBuilder url_sb = new StringBuilder();
         //  url_sb.append( BASE_EMBL_DB_URL );
-        if ( id.getSource().equals( Source.NCBI.toString() ) ) {
+        System.out.println( "source: " + acc.getSource() );
+        if ( acc.getSource().equals( Source.NCBI.toString() ) ) {
             url_sb.append( EMBL_GENBANK );
             //url_sb.append( '/' );
         }
-        else if ( id.getSource().equals( Source.REFSEQ.toString() ) ) {
+        else if ( acc.getSource().equals( Source.REFSEQ.toString() ) ) {
             url_sb.append( EMBL_REFSEQ );
-            //            if ( id.getValue().toUpperCase().indexOf( 'P' ) == 1 ) {
-            //                url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_P );
-            //                url_sb.append( '/' );
-            //            }
-            //            else {
-            //                url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_N );
-            //                url_sb.append( '/' );
-            //            }
+        }
+        else if ( acc.getSource().equals( Source.EMBL.toString() ) ) {
+            url_sb.append( EMBL_EMBL );
         }
         else {
-            throw new IllegalArgumentException( "unable to handle source: " + id.getSource() );
+            throw new IllegalArgumentException( "unable to handle source: " + acc.getSource() );
         }
-        return queryDb( id.getValue(), max_lines_to_return, url_sb.toString() );
+        return queryDb( acc.getValue(), max_lines_to_return, url_sb.toString() );
     }
 
     public static List<String> queryEmblDbForRefSeqEntry( final Accession id, final int max_lines_to_return )
@@ -330,20 +327,32 @@ public final class SequenceDbWsTools {
                 // Eat this, and move to next.
             }
         }
-        else if ( acc.getSource().equals( Source.EMBL.toString() ) ) {
+        else if ( acc.getSource().equals( Source.REFSEQ.toString() ) ) {
             if ( DEBUG ) {
-                System.out.println( "embl: " + query );
+                System.out.println( "refseq: " + query );
             }
             try {
-                db_entry = obtainEmblEntry( new Accession( query ), lines_to_return );
+                db_entry = obtainRefSeqEntryFromEmbl( new Accession( query ), lines_to_return );
             }
             catch ( final FileNotFoundException e ) {
                 // Eat this, and move to next.
             }
         }
-        else if ( acc.getSource().equals( Source.REFSEQ.toString() ) ) {
+        else if ( acc.getSource().equals( Source.EMBL.toString() ) || acc.getSource().equals( Source.NCBI.toString() )
+                || acc.getSource().equals( Source.EMBL.toString() ) ) {
             if ( DEBUG ) {
-                System.out.println( "refseq: " + query );
+                System.out.println( acc.toString() );
+            }
+            try {
+                db_entry = obtainEmblEntry( acc, lines_to_return );
+            }
+            catch ( final FileNotFoundException e ) {
+                // Eat this, and move to next.
+            }
+        }
+        else if ( acc.getSource().equals( Source.GI.toString() ) ) {
+            if ( DEBUG ) {
+                System.out.println( "gi: " + query );
             }
             try {
                 db_entry = obtainRefSeqEntryFromEmbl( new Accession( query ), lines_to_return );