in progress...
author cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Mon, 25 Apr 2011 01:06:43 +0000 (01:06 +0000)
committer cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Mon, 25 Apr 2011 01:06:43 +0000 (01:06 +0000)
forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java
forester/java/src/org/forester/development/Hello3d.java [new file with mode: 0644]
forester/java/src/org/forester/test/Test.java
forester/java/src/org/forester/ws/uniprot/DatabaseTools.java [new file with mode: 0644]
forester/java/src/org/forester/ws/uniprot/EbiDbEntry.java [new file with mode: 0644]
forester/java/src/org/forester/ws/uniprot/UniProtEntry.java
forester/java/src/org/forester/ws/uniprot/UniProtWsTools.java

index 0960425..168a163 100644 (file)
@@ -43,6 +43,7 @@ import org.forester.phylogeny.data.Sequence;
 import org.forester.phylogeny.data.Taxonomy;
 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
 import org.forester.util.ForesterUtil;
+import org.forester.ws.uniprot.DatabaseTools;
 import org.forester.ws.uniprot.SequenceDatabaseEntry;
 import org.forester.ws.uniprot.UniProtWsTools;
 
@@ -54,7 +55,7 @@ public final class SequenceDataRetriver implements Runnable {
     private final static boolean       DEBUG = true;
 
     private enum Db {
-        UNKNOWN, UNIPROT;
+        UNKNOWN, UNIPROT, EMBL;
     }
 
     public SequenceDataRetriver( final MainFrameApplication mf, final TreePanel treepanel, final Phylogeny phy ) {
@@ -173,11 +174,23 @@ public final class SequenceDataRetriver implements Runnable {
                 query = node.getNodeData().getSequence().getAccession().getValue();
                 db = Db.UNIPROT;
             }
+            else if ( node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null )
+                    && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
+                    && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
+                    && ( node.getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "embl" ) 
+                     || node.getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "ebi" )      
+                    ) ) {
+                query = node.getNodeData().getSequence().getAccession().getValue();
+                db = Db.EMBL;
+            }
             else if ( !ForesterUtil.isEmpty( node.getName() ) ) {
-                query = UniProtWsTools.parseUniProtAccessor( node.getName() );
-                if ( !ForesterUtil.isEmpty( query ) ) {
+               
+                if (  (query = UniProtWsTools.parseUniProtAccessor( node.getName() ))!=null  ) {
                     db = Db.UNIPROT;
                 }
+                else if (( query = DatabaseTools.parseGenbankAccessor( node.getName())) !=null ) {
+                    db = Db.EMBL;
+                }
             }
             if ( !ForesterUtil.isEmpty( query ) ) {
                 SequenceDatabaseEntry db_entry = null;
@@ -192,6 +205,17 @@ public final class SequenceDataRetriver implements Runnable {
                         // Ignore.
                     }
                 }
+                else if ( db == Db.EMBL ) {
+                    if ( DEBUG ) {
+                        System.out.println( "embl: " + query );
+                    }
+                    try {
+                        db_entry = UniProtWsTools.obtainEmblEntry( query, 200 );
+                    }
+                    catch ( final FileNotFoundException e ) {
+                        // Ignore.
+                    }
+                }
                 if ( db_entry != null ) {
                     if ( !ForesterUtil.isEmpty( db_entry.getAccession() ) ) {
                         seq.setAccession( new Accession( db_entry.getAccession(), "uniprot" ) );
diff --git a/forester/java/src/org/forester/development/Hello3d.java b/forester/java/src/org/forester/development/Hello3d.java
new file mode 100644 (file)
index 0000000..dc0d099
--- /dev/null
@@ -0,0 +1,31 @@
+// http://www.java3d.org/tutorial.html
+// http://download.java.net/media/java3d/builds/release/1.5.1/README-download.html
+//
+// /usr/lib/jvm/java-6-sun-1.6.0.24/jre
+//  lib/ext/vecmath.jar
+//  lib/ext/j3dcore.jar
+//  lib/ext/j3dutils.jar
+//
+
+package org.forester.development;
+import com.sun.j3d.utils.universe.SimpleUniverse;
+import com.sun.j3d.utils.geometry.ColorCube;
+import com.sun.j3d.utils.geometry.Cylinder;
+import javax.media.j3d.BranchGroup;
+/** Java3D experiment: adds a Cylinder and a ColorCube to a SimpleUniverse. */
+public class Hello3d {
+    /** Assembles the scene graph and attaches it to a new universe. */
+    public Hello3d() {
+        final SimpleUniverse simpleUniverse = new SimpleUniverse();
+        final BranchGroup sceneRoot = new BranchGroup();
+        // Both shapes are added before the branch is attached to the universe.
+        sceneRoot.addChild( new Cylinder( 0, 1 ) );
+        sceneRoot.addChild( new ColorCube( 0.3 ) );
+        simpleUniverse.getViewingPlatform().setNominalViewingTransform();
+        simpleUniverse.addBranchGraph( sceneRoot );
+    }
+
+    public static void main( final String[] args ) {
+        new Hello3d();
+    }
+}
index dc436c0..f01c864 100644 (file)
@@ -99,6 +99,7 @@ import org.forester.util.DescriptiveStatistics;
 import org.forester.util.ForesterConstants;
 import org.forester.util.ForesterUtil;
 import org.forester.util.GeneralTable;
+import org.forester.ws.uniprot.DatabaseTools;
 import org.forester.ws.uniprot.SequenceDatabaseEntry;
 import org.forester.ws.uniprot.UniProtTaxonomy;
 import org.forester.ws.uniprot.UniProtWsTools;
@@ -646,6 +647,17 @@ public final class Test {
             System.out.println( "failed." );
             failed++;
         }
+        
+        System.out.print( "EMBL Entry Retrieval: " );
+        if ( Test.testEmblEntryRetrieval() ) {
+            System.out.println( "OK." );
+            succeeded++;
+        }
+        else {
+            System.out.println( "failed." );
+            failed++;
+        }
+        
         System.out.print( "Uniprot Entry Retrieval: " );
         if ( Test.testUniprotEntryRetrieval() ) {
             System.out.println( "OK." );
@@ -7731,10 +7743,21 @@ public final class Test {
         return true;
     }
 
+    private static boolean testEmblEntryRetrieval() {
+        if ( !"AY423861".equals( DatabaseTools.parseGenbankAccessor( "AY423861" ) ) ) { // null-safe: the parser returns null on no match
+            System.out.println( DatabaseTools.parseGenbankAccessor( "AY423861" ) );
+            return false;
+        }
+        return true;
+    }
+    
     private static boolean testUniprotEntryRetrieval() {
         if ( !UniProtWsTools.parseUniProtAccessor( "P12345" ).equals( "P12345" ) ) {
             return false;
         }
+        if ( UniProtWsTools.parseUniProtAccessor( "EP12345" ) != null ) {
+            return false;
+        }
         if ( !UniProtWsTools.parseUniProtAccessor( "P1DDD5" ).equals( "P1DDD5" ) ) {
             return false;
         }
diff --git a/forester/java/src/org/forester/ws/uniprot/DatabaseTools.java b/forester/java/src/org/forester/ws/uniprot/DatabaseTools.java
new file mode 100644 (file)
index 0000000..3826e89
--- /dev/null
@@ -0,0 +1,86 @@
+package org.forester.ws.uniprot;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Static helpers for recognizing GenBank accessions in free text and for
+ * cutting fields out of flat-file database lines.
+ */
+public class DatabaseTools {
+
+    // GenBank accession formats (http://www.ncbi.nlm.nih.gov/Sequin/acc.html):
+    //   Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals
+    //   Protein:    3 letters + 5 numerals
+    //
+    // The lookarounds demand a non-alphanumeric neighbor (or the string edge) on
+    // both sides, so a short form can never match a slice of a longer accession
+    // such as "Y42386" inside "AY423861".
+    private final static Pattern   GENBANK_NUCLEOTIDE_AC_PATTERN_1 = Pattern
+            .compile( "(?<![a-zA-Z0-9])([A-Z]\\d{5})(?![a-zA-Z0-9])" );
+    private final static Pattern   GENBANK_NUCLEOTIDE_AC_PATTERN_2 = Pattern
+            .compile( "(?<![a-zA-Z0-9])([A-Z]{2}\\d{6})(?![a-zA-Z0-9])" );
+    private final static Pattern   GENBANK_PROTEIN_AC_PATTERN      = Pattern
+            .compile( "(?<![a-zA-Z0-9])([A-Z]{3}\\d{5})(?![a-zA-Z0-9])" );
+    // Order matters: the first pattern that matches wins.
+    private final static Pattern[] GENBANK_AC_PATTERNS             = { GENBANK_NUCLEOTIDE_AC_PATTERN_1,
+            GENBANK_NUCLEOTIDE_AC_PATTERN_2, GENBANK_PROTEIN_AC_PATTERN };
+
+    /**
+     * Finds a GenBank accession in a string. The patterns are tried in the order
+     * nucleotide (1 + 5), nucleotide (2 + 6), protein (3 + 5); the leftmost match
+     * of the first pattern that matches is returned.
+     *
+     * @param query the text to scan; may be null
+     * @return the accession, or null if there is no match
+     */
+    static public String parseGenbankAccessor( final String query ) {
+        if ( query == null ) {
+            return null;
+        }
+        for( final Pattern pattern : GENBANK_AC_PATTERNS ) {
+            final Matcher m = pattern.matcher( query );
+            if ( m.find() ) {
+                return m.group( 1 );
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Returns the trimmed text between the first occurrence of a and the first
+     * occurrence of b after it.
+     *
+     * @param target the line to cut from
+     * @param a the start marker (excluded from the result)
+     * @param b the end marker (excluded from the result)
+     * @return the trimmed text between the two markers
+     * @throws IllegalArgumentException if a is missing, or b does not occur after a
+     */
+    static String extract( final String target, final String a, final String b ) {
+        final int i_a = target.indexOf( a );
+        // Search for b only after a: a stray earlier b must not hide a valid field.
+        final int i_b = ( i_a < 0 ) ? -1 : target.indexOf( b, i_a + a.length() );
+        if ( i_b < 0 ) {
+            throw new IllegalArgumentException( "attempt to extract from [" + target + "] between [" + a + "] and ["
+                    + b + "]" );
+        }
+        return target.substring( i_a + a.length(), i_b ).trim();
+    }
+
+    /**
+     * Returns the trimmed text after the first occurrence of a.
+     *
+     * @param target the line to cut from
+     * @param a the start marker (excluded from the result)
+     * @return the trimmed remainder of the line
+     * @throws IllegalArgumentException if a is missing
+     */
+    static String extract( final String target, final String a ) {
+        final int i_a = target.indexOf( a );
+        if ( i_a < 0 ) {
+            throw new IllegalArgumentException( "attempt to extract from [" + target + "] after [" + a + "]" );
+        }
+        return target.substring( i_a + a.length() ).trim();
+    }
+}
diff --git a/forester/java/src/org/forester/ws/uniprot/EbiDbEntry.java b/forester/java/src/org/forester/ws/uniprot/EbiDbEntry.java
new file mode 100644 (file)
index 0000000..4f7b779
--- /dev/null
@@ -0,0 +1,148 @@
+// $Id:
+// forester -- software libraries and applications
+// for genomics and evolutionary biology research.
+//
+// Copyright (C) 2010 Christian M Zmasek
+// Copyright (C) 2010 Sanford-Burnham Medical Research Institute
+// All rights reserved
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+//
+// Contact: phylosoft @ gmail . com
+// WWW: www.phylosoft.org/forester
+
+package org.forester.ws.uniprot;
+
+import java.util.List;
+
+/**
+ * A sequence database entry parsed from the lines of a plain-text record.
+ * Only PA, DE, OS and OX lines are read; GN lines are not parsed, so the
+ * sequence symbol is always null.
+ */
+public final class EbiDbEntry implements SequenceDatabaseEntry {
+
+    // http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/emb/AAR37336/
+    private String _pa;
+    private String _de;
+    private String _os;
+    private String _tax_id;
+    private String _symbol;
+
+    private EbiDbEntry() {
+    }
+
+    /**
+     * Cloning is not supported.
+     *
+     * @throws UnsupportedOperationException always
+     */
+    @Override
+    public Object clone() {
+        // Standard java.lang exception instead of the JDK-internal
+        // sun.reflect.generics.reflectiveObjects.NotImplementedException.
+        throw new UnsupportedOperationException( "clone is not supported" );
+    }
+
+    /**
+     * Builds an entry from the lines of a plain-text record. For each field the
+     * first value seen wins; later lines with the same prefix are ignored.
+     *
+     * @param lines the lines of the record
+     * @return the populated entry
+     */
+    public static SequenceDatabaseEntry createInstanceFromPlainText( final List<String> lines ) {
+        final EbiDbEntry e = new EbiDbEntry();
+        for( final String line : lines ) {
+            if ( line.startsWith( "PA" ) ) {
+                e.setPA( DatabaseTools.extract( line, "PA", ";" ) );
+            }
+            else if ( line.startsWith( "DE" ) ) {
+                e.setDe( DatabaseTools.extract( line, "DE" ) );
+            }
+            else if ( line.startsWith( "OS" ) ) {
+                // The scientific name ends at the first parenthesis, if there is one.
+                if ( line.indexOf( "(" ) > 0 ) {
+                    e.setOs( DatabaseTools.extract( line, "OS", "(" ) );
+                }
+                else {
+                    e.setOs( DatabaseTools.extract( line, "OS" ) );
+                }
+            }
+            else if ( line.startsWith( "OX" ) ) {
+                if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) {
+                    e.setTaxId( DatabaseTools.extract( line, "NCBI_TaxID=", ";" ) );
+                }
+            }
+        }
+        return e;
+    }
+
+    @Override
+    public String getAccession() {
+        return _pa;
+    }
+
+    private void setPA( final String pa ) {
+        if ( _pa == null ) {
+            _pa = pa;
+        }
+    }
+
+    @Override
+    public String getSequenceName() {
+        return _de;
+    }
+
+    private void setDe( final String rec_name ) {
+        if ( _de == null ) {
+            _de = rec_name;
+        }
+    }
+
+    @Override
+    public String getTaxonomyScientificName() {
+        return _os;
+    }
+
+    private void setOs( final String os ) {
+        if ( _os == null ) {
+            _os = os;
+        }
+    }
+
+    @Override
+    public String getTaxonomyIdentifier() {
+        return _tax_id;
+    }
+
+    private void setTaxId( final String tax_id ) {
+        if ( _tax_id == null ) {
+            _tax_id = tax_id;
+        }
+    }
+
+    @Override
+    public String getSequenceSymbol() {
+        return _symbol;
+    }
+
+    // Not called yet: GN lines are not parsed.
+    private void setSymbol( final String symbol ) {
+        if ( _symbol == null ) {
+            _symbol = symbol;
+        }
+    }
+}
index f74b1c7..2b74883 100644 (file)
@@ -42,44 +42,36 @@ public final class UniProtEntry implements SequenceDatabaseEntry {
         final UniProtEntry e = new UniProtEntry();
         for( final String line : lines ) {
             if ( line.startsWith( "AC" ) ) {
-                e.setAc( extract( line, "AC", ";" ) );
+                e.setAc( DatabaseTools.extract( line, "AC", ";" ) );
             }
             else if ( line.startsWith( "DE" ) ) {
                 if ( ( line.indexOf( "RecName:" ) > 0 ) && ( line.indexOf( "Full=" ) > 0 ) ) {
-                    e.setRecName( extract( line, "Full=", ";" ) );
+                    e.setRecName( DatabaseTools.extract( line, "Full=", ";" ) );
                 }
             }
             else if ( line.startsWith( "GN" ) ) {
                 if ( ( line.indexOf( "Name=" ) > 0 ) ) {
-                    e.setSymbol( extract( line, "Name=", ";" ) );
+                    e.setSymbol( DatabaseTools.extract( line, "Name=", ";" ) );
                 }
             }
             else if ( line.startsWith( "OS" ) ) {
                 if ( line.indexOf( "(" ) > 0 ) {
-                    e.setOsScientificName( extract( line, "OS", "(" ) );
+                    e.setOsScientificName( DatabaseTools.extract( line, "OS", "(" ) );
                 }
                 else {
-                    e.setOsScientificName( extract( line, "OS", "." ) );
+                    e.setOsScientificName( DatabaseTools.extract( line, "OS", "." ) );
                 }
             }
             else if ( line.startsWith( "OX" ) ) {
                 if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) {
-                    e.setTaxId( extract( line, "NCBI_TaxID=", ";" ) );
+                    e.setTaxId( DatabaseTools.extract( line, "NCBI_TaxID=", ";" ) );
                 }
             }
         }
         return e;
     }
 
-    private static String extract( final String target, final String a, final String b ) {
-        final int i_a = target.indexOf( a );
-        final int i_b = target.indexOf( b );
-        if ( ( i_a < 0 ) || ( i_b < i_a ) ) {
-            throw new IllegalArgumentException( "attempt to extract from [" + target + "] between [" + a + "] and ["
-                    + b + "]" );
-        }
-        return target.substring( i_a + a.length(), i_b ).trim();
-    }
+   
 
     @Override
     public String getAccession() {
index cd8ab43..dd8d760 100644 (file)
@@ -45,6 +45,8 @@ public final class UniProtWsTools {
         UNKNOWN, UNIPROT;
     }
     public final static String   BASE_URL           = "http://www.uniprot.org/";
+    
+    public final static String   BASE_EMBL_DB_URL   = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/embl/";
     private final static String  URL_ENC            = "UTF-8";
     // uniprot/expasy accession number format (6 chars):
     // letter digit letter-or-digit letter-or-digit letter-or-digit digit
@@ -57,7 +59,7 @@ public final class UniProtWsTools {
     }
 
     /**
-     * Return null if no match.
+     * Returns null if no match.
      * 
      * @param query
      * @param db 
@@ -72,6 +74,8 @@ public final class UniProtWsTools {
             return null;
         }
     }
+    
+  
 
     public static List<UniProtTaxonomy> getTaxonomiesFromCommonName( final String cn, final int max_taxonomies_return )
             throws IOException {
@@ -208,14 +212,33 @@ public final class UniProtWsTools {
         return taxonomies;
     }
 
+    
+    public static List<String> queryEmblDb( final String query, int max_lines_to_return ) throws IOException {
+        // Thin wrapper: the generic fetcher does the work, with the EMBL base URL.
+        final List<String> lines = queryDb( query, max_lines_to_return, BASE_EMBL_DB_URL );
+        return lines;
+    }
+    
+    
+    
     public static List<String> queryUniprot( final String query, int max_lines_to_return ) throws IOException {
+        return queryDb( query,
+                max_lines_to_return,
+                BASE_URL ) ;
+        
+       
+    }
+
+    public static List<String> queryDb( final String query,
+                                        int max_lines_to_return,
+                                        final String base_url ) throws IOException {
         if ( ForesterUtil.isEmpty( query ) ) {
             throw new IllegalArgumentException( "illegal attempt to use empty query " );
         }
         if ( max_lines_to_return < 1 ) {
             max_lines_to_return = 1;
         }
-        final URL url = new URL( BASE_URL + query );
+        final URL url = new URL( base_url + query );
         if ( DEBUG ) {
             System.out.println( "url: " + url.toString() );
         }
@@ -232,10 +255,16 @@ public final class UniProtWsTools {
         in.close();
         return result;
     }
-
+    
+    
     public static SequenceDatabaseEntry obtainUniProtEntry( final String query, final int max_lines_to_return )
             throws IOException {
         final List<String> lines = queryUniprot( "uniprot/" + query + ".txt", max_lines_to_return );
         return UniProtEntry.createInstanceFromPlainText( lines );
     }
+
+    public static SequenceDatabaseEntry obtainEmblEntry( String query, int max_lines_to_return ) throws IOException {
+        // Look up the accession passed in by the caller (the variable, not a fixed string literal).
+        return EbiDbEntry.createInstanceFromPlainText( queryEmblDb( query, max_lines_to_return ) );
+    }
 }