inprogress
authorcmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Sat, 12 Oct 2013 03:37:19 +0000 (03:37 +0000)
committercmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Sat, 12 Oct 2013 03:37:19 +0000 (03:37 +0000)
forester/java/src/org/forester/test/Test.java
forester/java/src/org/forester/ws/seqdb/EbiDbEntry.java
forester/java/src/org/forester/ws/seqdb/SequenceDatabaseEntry.java
forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java
forester/java/src/org/forester/ws/seqdb/UniProtEntry.java

index 3de1c55..b073bb4 100644 (file)
@@ -237,6 +237,18 @@ public final class Test {
             System.out.println( "failed." );
             failed++;
         }
+        if ( PERFORM_DB_TESTS ) {
+            System.out.print( "Ebi Entry Retrieval: " );
+            if ( Test.testEbiEntryRetrieval() ) {
+                System.out.println( "OK." );
+                succeeded++;
+            }
+            else {
+                System.out.println( "failed." );
+                failed++;
+            }
+        }
+        // System.exit( 0 ); // debugging shortcut: left disabled so that the tests below are still executed
         System.out.print( "UniProtKB id extraction: " );
         if ( Test.testExtractUniProtKbProteinSeqIdentifier() ) {
             System.out.println( "OK." );
@@ -267,7 +279,6 @@ public final class Test {
                 System.exit( -1 );
             }
         }
-        // System.exit( 0 );
         System.out.print( "Hmmscan output parser: " );
         if ( testHmmscanOutputParser() ) {
             System.out.println( "OK." );
@@ -829,8 +840,8 @@ public final class Test {
             System.out.println( "failed." );
             failed++;
         }
-        System.out.print( "EMBL Entry Retrieval: " );
-        if ( Test.testEmblEntryRetrieval() ) {
+        System.out.print( "Genbank accessor parsing: " );
+        if ( Test.testGenbankAccessorParsing() ) {
             System.out.println( "OK." );
             succeeded++;
         }
@@ -3362,7 +3373,7 @@ public final class Test {
         return true;
     }
 
-    private static boolean testEmblEntryRetrieval() {
+    private static boolean testGenbankAccessorParsing() {
         //The format for GenBank Accession numbers are:
         //Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals
         //Protein:    3 letters + 5 numerals
@@ -11038,6 +11049,49 @@ public final class Test {
         return true;
     }
 
+    /**
+     * Obtains a SequenceDatabaseEntry for accession "AAK41263" (source NCBI)
+     * through SequenceDbWsTools.obtainEmblEntry and compares its accession,
+     * taxonomy scientific name, sequence name, gene name and taxonomy
+     * identifier against fixed expected values.
+     *
+     * @return false if a compared value differs (the actual value is printed
+     *         first) or if an exception other than IOException is thrown;
+     *         true if all values match, and also if an IOException occurs
+     */
+    private static boolean testEbiEntryRetrieval() {
+        try {
+            final SequenceDatabaseEntry entry = SequenceDbWsTools
+                    .obtainEmblEntry( new Accession( "AAK41263", Accession.Source.NCBI ) );
+            if ( !entry.getAccession().equals( "AAK41263" ) ) {
+                System.out.println( entry.getAccession() );
+                return false;
+            }
+            if ( !entry.getTaxonomyScientificName().equals( "Sulfolobus solfataricus P2" ) ) {
+                System.out.println( entry.getTaxonomyScientificName() );
+                return false;
+            }
+            if ( !entry.getSequenceName()
+                    .equals( "Sulfolobus solfataricus P2 Glycogen debranching enzyme, hypothetical (treX-like)" ) ) {
+                System.out.println( entry.getSequenceName() );
+                return false;
+            }
+            // The sequence symbol check below is currently disabled.
+            // if ( !entry.getSequenceSymbol().equals( "mAspAT" ) ) {
+            //     System.out.println( entry.getSequenceSymbol() );
+            //     return false;
+            // }
+            if ( !entry.getGeneName().equals( "treX-like" ) ) {
+                System.out.println( entry.getGeneName() );
+                return false;
+            }
+            if ( !entry.getTaxonomyIdentifier().equals( "273057" ) ) {
+                System.out.println( entry.getTaxonomyIdentifier() );
+                return false;
+            }
+        }
+        catch ( final IOException e ) {
+            // An IOException is reported but not counted as a failure,
+            // since it may only mean that no connection is available.
+            System.out.println();
+            System.out.println( "the following might be due to absence internet connection:" );
+            e.printStackTrace( System.out );
+            return true;
+        }
+        catch ( final Exception e ) {
+            // Any other exception (e.g. from a null field) fails the test.
+            e.printStackTrace();
+            return false;
+        }
+        return true;
+    }
+
     private static boolean testUniprotEntryRetrieval() {
         try {
             final SequenceDatabaseEntry entry = SequenceDbWsTools.obtainUniProtEntry( "P12345", 200 );
index 5f2cad2..c167259 100644 (file)
@@ -27,10 +27,12 @@ package org.forester.ws.seqdb;
 
 import java.util.ArrayList;
 import java.util.List;
+import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import org.forester.go.GoTerm;
 import org.forester.phylogeny.data.Accession;
+import org.forester.phylogeny.data.Annotation;
 import org.forester.util.ForesterUtil;
 
 public final class EbiDbEntry implements SequenceDatabaseEntry {
@@ -70,9 +72,10 @@ public final class EbiDbEntry implements SequenceDatabaseEntry {
         final Pattern taxon_xref_PATTERN = Pattern.compile( "\\s+/db_xref=\"taxon:(\\d+)\"" );
         final Pattern interpro_PATTERN = Pattern.compile( "\\s+/db_xref=\"InterPro:(IP\\d+)\"" );
         final Pattern uniprot_PATTERN = Pattern.compile( "\\s+/db_xref=\"UniProtKB/TrEMBL:(\\w+)\"" );
+        final Pattern ec_PATTERN = Pattern.compile( "\\s+/EC_number=\"([\\.\\-\\d]+)\"" ); // group 1 = the EC number, read via m.group( 1 )
         final EbiDbEntry e = new EbiDbEntry();
         final StringBuilder def = new StringBuilder();
-        boolean in_def = false;
+        boolean in_definition = false;
         boolean in_features = false;
         boolean in_source = false;
         boolean in_gene = false;
@@ -81,19 +84,44 @@ public final class EbiDbEntry implements SequenceDatabaseEntry {
         for( final String line : lines ) {
             if ( line.startsWith( "ACCESSION " ) ) {
                 e.setPA( SequenceDbWsTools.extractFrom( line, "ACCESSION" ) );
-                in_def = false;
+                in_definition = false;
             }
-            else if ( line.startsWith( "DEFINITION " ) ) {
+            else if ( line.startsWith( "ID " ) ) {
+                e.setPA( SequenceDbWsTools.extractFromTo( line, "ID", ";" ) );
+                in_definition = false;
+            }
+            else if ( line.startsWith( "DEFINITION " ) || ( line.startsWith( "DE " ) ) ) {
+                boolean definiton = false;
+                if ( line.startsWith( "DEFINITION " ) ) {
+                    definiton = true;
+                }
                 if ( line.indexOf( "[" ) > 0 ) {
-                    def.append( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) );
+                    if ( definiton ) {
+                        x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) ) );
+                    }
+                    else {
+                        x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "[" ) ) );
+                    }
                 }
                 else if ( line.indexOf( "." ) > 0 ) {
-                    def.append( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) );
+                    if ( definiton ) {
+                        x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) ) );
+                    }
+                    else {
+                        x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "." ) ) );
+                    }
                 }
                 else {
-                    def.append( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) );
+                    if ( definiton ) {
+                        x( def, ( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) ) );
+                    }
+                    else {
+                        x( def, ( SequenceDbWsTools.extractFrom( line, "DE" ) ) );
+                    }
+                }
+                if ( definiton ) {
+                    in_definition = true;
                 }
-                in_def = true;
             }
             else if ( line.startsWith( "  ORGANISM " ) ) {
                 if ( line.indexOf( "(" ) > 0 ) {
@@ -104,7 +132,15 @@ public final class EbiDbEntry implements SequenceDatabaseEntry {
                 }
                 //  in_def = false;
             }
-            else if ( line.startsWith( " " ) && in_def ) {
+            else if ( line.startsWith( "OS " ) ) {
+                if ( line.indexOf( "(" ) > 0 ) {
+                    e.setOs( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
+                }
+                else {
+                    e.setOs( SequenceDbWsTools.extractFrom( line, "OS" ) );
+                }
+            }
+            else if ( line.startsWith( " " ) && in_definition ) {
                 def.append( " " );
                 if ( line.indexOf( "[" ) > 0 ) {
                     def.append( SequenceDbWsTools.extractTo( line, "[" ) );
@@ -117,7 +153,7 @@ public final class EbiDbEntry implements SequenceDatabaseEntry {
                 }
             }
             else {
-                in_def = false;
+                in_definition = false;
             }
             if ( X_PATTERN.matcher( line ).find() ) {
                 in_features = false;
@@ -154,22 +190,36 @@ public final class EbiDbEntry implements SequenceDatabaseEntry {
                 in_cds = false;
                 in_protein = true;
             }
+            if ( in_protein || in_cds ) {
+                final Matcher m = ec_PATTERN.matcher( line );
+                if ( m.find() ) {
+                    e.addAnnotation( new Annotation( "EC", m.group( 1 ) ) );
+                }
+            }
         }
         if ( def.length() > 0 ) {
             e.setDe( def.toString().trim() );
         }
         return e;
     }
+
+    /** Appends the trimmed {@code s} to {@code sb}, preceded by a single space when {@code sb} is not empty. */
+    private static void x( final StringBuilder sb, final String s ) {
+        final boolean needs_separator = sb.length() > 0;
+        final String separator = needs_separator ? " " : "";
+        sb.append( separator ).append( s.trim() );
+    }
     // FIXME actually this is NCBI entry
     //http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/emb/AAR37336/
-    private String               _pa;
-    private String               _de;
-    private String               _os;
-    private String               _tax_id;
-    private String               _symbol;
-    private String               _provider;
-    private ArrayList<Accession> _cross_references;
-    private String               _gene_name;
+    private String           _pa;
+    private String           _de;
+    private String           _os;
+    private String           _tax_id;
+    private String           _symbol;
+    private String           _provider;
+    private List<Accession>  _cross_references;
+    private List<Annotation> _annotations;
+    private String           _gene_name;
 
     // TODO  PUBMED   15798186
     //TODO  (FEATURES) 
@@ -523,4 +573,16 @@ public final class EbiDbEntry implements SequenceDatabaseEntry {
             _tax_id = tax_id;
         }
     }
+
+    /**
+     * Returns the annotations stored for this entry.
+     *
+     * @return the annotation list; presumably null when nothing has been
+     *         added, since the list is only created in addAnnotation
+     *         (confirm that no constructor initializes it)
+     */
+    @Override
+    public List<Annotation> getAnnotations() {
+        return _annotations;
+    }
+
+    /**
+     * Appends an annotation to this entry.
+     *
+     * @param annotation the annotation to store
+     */
+    private void addAnnotation( final Annotation annotation ) {
+        // The backing list is created on first use.
+        if ( _annotations == null ) {
+            _annotations = new ArrayList<Annotation>();
+        }
+        _annotations.add( annotation );
+    }
 }
index 3a28d6a..b060d0d 100644 (file)
@@ -29,6 +29,7 @@ import java.util.List;
 
 import org.forester.go.GoTerm;
 import org.forester.phylogeny.data.Accession;
+import org.forester.phylogeny.data.Annotation;
 
 public interface SequenceDatabaseEntry {
 
@@ -38,6 +39,8 @@ public interface SequenceDatabaseEntry {
 
     public List<GoTerm> getGoTerms();
 
+    public List<Annotation> getAnnotations();
+
     public String getProvider();
 
     public String getSequenceName();
index cecef9f..209d284 100644 (file)
@@ -55,8 +55,9 @@ import org.forester.util.SequenceAccessionTools;
 public final class SequenceDbWsTools {
 
     public final static String   EMBL_REFSEQ             = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=REFSEQ&style=raw&id=";
+    public final static String   EMBL_GENBANK            = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=GENBANK&style=raw&id=";
     public final static String   BASE_UNIPROT_URL        = "http://www.uniprot.org/";
-    public final static String   EMBL_DBS_EMBL           = "embl";
+    //public final static String   EMBL_DBS_EMBL           = "embl";
     public final static String   EMBL_DBS_REFSEQ_N       = "refseqn";
     public final static String   EMBL_DBS_REFSEQ_P       = "refseqp";
     private final static boolean DEBUG                   = true;
@@ -141,10 +142,14 @@ public final class SequenceDbWsTools {
         return null;
     }
 
-    public static SequenceDatabaseEntry obtainEmblEntry( final Accession id, final int max_lines_to_return )
+    public static SequenceDatabaseEntry obtainEmblEntry( final Accession acc, final int max_lines_to_return )
             throws IOException {
-        final List<String> lines = queryEmblDb( id, max_lines_to_return );
-        return EbiDbEntry.createInstanceFromPlainText( lines );
+        final List<String> lines = queryEmblDb( acc, max_lines_to_return );
+        return EbiDbEntry.createInstanceFromPlainTextForRefSeq( lines );
+    }
+
+    /**
+     * Convenience overload of obtainEmblEntry( Accession, int ) that passes
+     * the class constant DEFAULT_LINES_TO_RETURN as the line limit.
+     *
+     * @param acc the accession to look up
+     * @return the entry produced by obtainEmblEntry( Accession, int )
+     * @throws IOException propagated from obtainEmblEntry( Accession, int )
+     */
+    public static SequenceDatabaseEntry obtainEmblEntry( final Accession acc ) throws IOException {
+        return obtainEmblEntry( acc, DEFAULT_LINES_TO_RETURN );
     }
 
     public final static Accession obtainSeqAccession( final PhylogenyNode node ) {
@@ -155,12 +160,16 @@ public final class SequenceDbWsTools {
         return acc;
     }
 
-    public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Accession id, final int max_lines_to_return )
+    public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Accession acc, final int max_lines_to_return )
             throws IOException {
-        final List<String> lines = queryEmblDbForRefSeqEntry( id, max_lines_to_return );
+        final List<String> lines = queryEmblDbForRefSeqEntry( acc, max_lines_to_return );
         return EbiDbEntry.createInstanceFromPlainTextForRefSeq( lines );
     }
 
+    /**
+     * Convenience overload of obtainRefSeqEntryFromEmbl( Accession, int ) that
+     * passes the class constant DEFAULT_LINES_TO_RETURN as the line limit.
+     *
+     * @param acc the accession to look up
+     * @return the entry produced by obtainRefSeqEntryFromEmbl( Accession, int )
+     * @throws IOException propagated from obtainRefSeqEntryFromEmbl( Accession, int )
+     */
+    public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Accession acc ) throws IOException {
+        return obtainRefSeqEntryFromEmbl( acc, DEFAULT_LINES_TO_RETURN );
+    }
+
     public final static void obtainSeqInformation( final boolean allow_to_set_taxonomic_data,
                                                    final int lines_to_return,
                                                    final SortedSet<String> not_found,
@@ -206,6 +215,10 @@ public final class SequenceDbWsTools {
         return UniProtEntry.createInstanceFromPlainText( lines );
     }
 
+    /**
+     * Convenience overload of obtainUniProtEntry( String, int ) that passes
+     * the class constant DEFAULT_LINES_TO_RETURN as the line limit.
+     *
+     * @param query the query string handed on unchanged
+     * @return the entry produced by obtainUniProtEntry( String, int )
+     * @throws IOException propagated from obtainUniProtEntry( String, int )
+     */
+    public static SequenceDatabaseEntry obtainUniProtEntry( final String query ) throws IOException {
+        return obtainUniProtEntry( query, DEFAULT_LINES_TO_RETURN );
+    }
+
     public static List<String> queryDb( final String query, int max_lines_to_return, final String base_url )
             throws IOException {
         if ( ForesterUtil.isEmpty( query ) ) {
@@ -252,9 +265,9 @@ public final class SequenceDbWsTools {
     public static List<String> queryEmblDb( final Accession id, final int max_lines_to_return ) throws IOException {
         final StringBuilder url_sb = new StringBuilder();
         //  url_sb.append( BASE_EMBL_DB_URL );
-        if ( ForesterUtil.isEmpty( id.getSource() ) || ( id.getSource().equals( Source.NCBI.toString() ) ) ) {
-            url_sb.append( EMBL_DBS_EMBL );
-            url_sb.append( '/' );
+        if ( id.getSource().equals( Source.NCBI.toString() ) ) {
+            url_sb.append( EMBL_GENBANK );
+            //url_sb.append( '/' );
         }
         else if ( id.getSource().equals( Source.REFSEQ.toString() ) ) {
             url_sb.append( EMBL_REFSEQ );
@@ -267,6 +280,9 @@ public final class SequenceDbWsTools {
             //                url_sb.append( '/' );
             //            }
         }
+        else {
+            throw new IllegalArgumentException( "unable to handle source: " + id.getSource() );
+        }
         return queryDb( id.getValue(), max_lines_to_return, url_sb.toString() );
     }
 
index 4ba10de..1345523 100644 (file)
@@ -34,6 +34,7 @@ import org.forester.go.BasicGoTerm;
 import org.forester.go.GoNameSpace;
 import org.forester.go.GoTerm;
 import org.forester.phylogeny.data.Accession;
+import org.forester.phylogeny.data.Annotation;
 import org.forester.util.ForesterUtil;
 
 public final class UniProtEntry implements SequenceDatabaseEntry {
@@ -124,7 +125,6 @@ public final class UniProtEntry implements SequenceDatabaseEntry {
         if ( _cross_references == null ) {
             _cross_references = new ArrayList<Accession>();
         }
-        System.out.println( "XREF ADDED: " + accession );
         _cross_references.add( accession );
     }
 
@@ -288,4 +288,9 @@ public final class UniProtEntry implements SequenceDatabaseEntry {
         }
         return e;
     }
+
+    /**
+     * Always returns null: this implementation does not supply annotations.
+     *
+     * @return null
+     */
+    @Override
+    public List<Annotation> getAnnotations() {
+        return null;
+    }
 }