From 0368d0ed99717796cff01aad68cf176338652354 Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Mon, 25 Apr 2011 01:06:43 +0000 Subject: [PATCH] in progress... --- .../archaeopteryx/tools/SequenceDataRetriver.java | 30 ++++- .../java/src/org/forester/development/Hello3d.java | 31 +++++ forester/java/src/org/forester/test/Test.java | 23 ++++ .../src/org/forester/ws/uniprot/DatabaseTools.java | 72 ++++++++++ .../src/org/forester/ws/uniprot/EbiDbEntry.java | 137 ++++++++++++++++++++ .../src/org/forester/ws/uniprot/UniProtEntry.java | 22 +--- .../org/forester/ws/uniprot/UniProtWsTools.java | 35 ++++- 7 files changed, 329 insertions(+), 21 deletions(-) create mode 100644 forester/java/src/org/forester/development/Hello3d.java create mode 100644 forester/java/src/org/forester/ws/uniprot/DatabaseTools.java create mode 100644 forester/java/src/org/forester/ws/uniprot/EbiDbEntry.java diff --git a/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java b/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java index 0960425..168a163 100644 --- a/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java +++ b/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java @@ -43,6 +43,7 @@ import org.forester.phylogeny.data.Sequence; import org.forester.phylogeny.data.Taxonomy; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.util.ForesterUtil; +import org.forester.ws.uniprot.DatabaseTools; import org.forester.ws.uniprot.SequenceDatabaseEntry; import org.forester.ws.uniprot.UniProtWsTools; @@ -54,7 +55,7 @@ public final class SequenceDataRetriver implements Runnable { private final static boolean DEBUG = true; private enum Db { - UNKNOWN, UNIPROT; + UNKNOWN, UNIPROT, EMBL; } public SequenceDataRetriver( final MainFrameApplication mf, final TreePanel treepanel, final Phylogeny phy ) { @@ -173,11 +174,23 @@ public final class SequenceDataRetriver implements 
Runnable { query = node.getNodeData().getSequence().getAccession().getValue(); db = Db.UNIPROT; } + else if ( node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null ) + && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) + && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) + && ( node.getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "embl" ) + || node.getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "ebi" ) + ) ) { + query = node.getNodeData().getSequence().getAccession().getValue(); + db = Db.EMBL; + } else if ( !ForesterUtil.isEmpty( node.getName() ) ) { - query = UniProtWsTools.parseUniProtAccessor( node.getName() ); - if ( !ForesterUtil.isEmpty( query ) ) { + + if ( (query = UniProtWsTools.parseUniProtAccessor( node.getName() ))!=null ) { db = Db.UNIPROT; } + else if (( query = DatabaseTools.parseGenbankAccessor( node.getName())) !=null ) { + db = Db.EMBL; + } } if ( !ForesterUtil.isEmpty( query ) ) { SequenceDatabaseEntry db_entry = null; @@ -192,6 +205,17 @@ public final class SequenceDataRetriver implements Runnable { // Ignore. } } + else if ( db == Db.EMBL ) { + if ( DEBUG ) { + System.out.println( "embl: " + query ); + } + try { + db_entry = UniProtWsTools.obtainEmblEntry( query, 200 ); + } + catch ( final FileNotFoundException e ) { + // Ignore. 
+ } + } if ( db_entry != null ) { if ( !ForesterUtil.isEmpty( db_entry.getAccession() ) ) { seq.setAccession( new Accession( db_entry.getAccession(), "uniprot" ) ); diff --git a/forester/java/src/org/forester/development/Hello3d.java b/forester/java/src/org/forester/development/Hello3d.java new file mode 100644 index 0000000..dc0d099 --- /dev/null +++ b/forester/java/src/org/forester/development/Hello3d.java @@ -0,0 +1,31 @@ +// http://www.java3d.org/tutorial.html +// http://download.java.net/media/java3d/builds/release/1.5.1/README-download.html +// +// /usr/lib/jvm/java-6-sun-1.6.0.24/jre +// lib/ext/vecmath.jar +// lib/ext/j3dcore.jar +// lib/ext/j3dutils.jar +// + +package org.forester.development; +import com.sun.j3d.utils.universe.SimpleUniverse; +import com.sun.j3d.utils.geometry.ColorCube; +import com.sun.j3d.utils.geometry.Cylinder; +import javax.media.j3d.BranchGroup; +public class Hello3d { + + public Hello3d() { + SimpleUniverse universe = new SimpleUniverse(); + BranchGroup group = new BranchGroup(); + //ColorCube cc0 = new ColorCube( 0.1); + // Appearance a = new Appearance(); + group.addChild( new Cylinder( 0,1)); + group.addChild( new ColorCube( 0.3 ) ); + universe.getViewingPlatform().setNominalViewingTransform(); + universe.addBranchGraph( group ); + } + + public static void main( String[] args ) { + new Hello3d(); + } +} // end of class Hello3d diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index dc436c0..f01c864 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -99,6 +99,7 @@ import org.forester.util.DescriptiveStatistics; import org.forester.util.ForesterConstants; import org.forester.util.ForesterUtil; import org.forester.util.GeneralTable; +import org.forester.ws.uniprot.DatabaseTools; import org.forester.ws.uniprot.SequenceDatabaseEntry; import org.forester.ws.uniprot.UniProtTaxonomy; import 
org.forester.ws.uniprot.UniProtWsTools; @@ -646,6 +647,17 @@ public final class Test { System.out.println( "failed." ); failed++; } + + System.out.print( "EMBL Entry Retrieval: " ); + if ( Test.testEmblEntryRetrieval() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Uniprot Entry Retrieval: " ); if ( Test.testUniprotEntryRetrieval() ) { System.out.println( "OK." ); @@ -7731,10 +7743,21 @@ public final class Test { return true; } + private static boolean testEmblEntryRetrieval() { + if ( !DatabaseTools.parseGenbankAccessor( "AY423861" ).equals( "AY423861" ) ) { + System.out.println( DatabaseTools.parseGenbankAccessor( "AY423861" )); + return false; + } + return true; + } + private static boolean testUniprotEntryRetrieval() { if ( !UniProtWsTools.parseUniProtAccessor( "P12345" ).equals( "P12345" ) ) { return false; } + if ( UniProtWsTools.parseUniProtAccessor( "EP12345" ) != null ) { + return false; + } if ( !UniProtWsTools.parseUniProtAccessor( "P1DDD5" ).equals( "P1DDD5" ) ) { return false; } diff --git a/forester/java/src/org/forester/ws/uniprot/DatabaseTools.java b/forester/java/src/org/forester/ws/uniprot/DatabaseTools.java new file mode 100644 index 0000000..3826e89 --- /dev/null +++ b/forester/java/src/org/forester/ws/uniprot/DatabaseTools.java @@ -0,0 +1,72 @@ +package org.forester.ws.uniprot; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + + +public class DatabaseTools { + //The formats for GenBank accession numbers are: + //Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals + //Protein: 3 letters + 5 numerals + //http://www.ncbi.nlm.nih.gov/Sequin/acc.html + + private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_1 = Pattern + .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d{5})(?:[^a-zA-Z0-9]|\\Z)" ); + + private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_2 = Pattern + .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6})(?:[^a-zA-Z0-9]|\\Z)"
); + + private final static Pattern GENBANK_PROTEIN_AC_PATTERN = Pattern + .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5})(?:[^a-zA-Z0-9]|\\Z)" ); + + + + private final static boolean DEBUG = false; + + /** + * Returns a GenBank accession delimited by non-alphanumerics (or the ends) in query, or null if no match. + * + * @param query text that may contain a GenBank accession (must not be null) + * @return the accession; patterns are tried in the order nucleotide 1+5, + * nucleotide 2+6, protein 3+5; null if none of them matches + */ + static public String parseGenbankAccessor( final String query ) { + Matcher m = GENBANK_NUCLEOTIDE_AC_PATTERN_1.matcher( query ); + if ( m.lookingAt() ) { + return m.group( 1 ); + } + else { + m = GENBANK_NUCLEOTIDE_AC_PATTERN_2.matcher( query ); + if ( m.lookingAt() ) { + return m.group( 1 ); + } + else { + m = GENBANK_PROTEIN_AC_PATTERN.matcher( query ); + if ( m.lookingAt() ) { + return m.group( 1 ); + } + else { + return null; + } + } + } + } + + static String extract( final String target, final String a, final String b ) { + final int i_a = target.indexOf( a ); + final int i_b = target.indexOf( b ); + if ( ( i_a < 0 ) || ( i_b < i_a ) ) { + throw new IllegalArgumentException( "attempt to extract from [" + target + "] between [" + a + "] and [" + + b + "]" ); + } + return target.substring( i_a + a.length(), i_b ).trim(); + } + + + + static String extract( final String target, final String a ) { + final int i_a = target.indexOf( a ); + return target.substring( i_a + a.length() ).trim(); + } + +} diff --git a/forester/java/src/org/forester/ws/uniprot/EbiDbEntry.java b/forester/java/src/org/forester/ws/uniprot/EbiDbEntry.java new file mode 100644 index 0000000..4f7b779 --- /dev/null +++ b/forester/java/src/org/forester/ws/uniprot/EbiDbEntry.java @@ -0,0 +1,137 @@ +// $Id: +// forester -- software libraries and applications +// for genomics and evolutionary biology research.
+// +// Copyright (C) 2010 Christian M Zmasek +// Copyright (C) 2010 Sanford-Burnham Medical Research Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.ws.uniprot; + +import java.util.List; + + + +public final class EbiDbEntry implements SequenceDatabaseEntry { +//http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/emb/AAR37336/ + + + private String _pa; + private String _de; + private String _os; + private String _tax_id; + private String _symbol; + + private EbiDbEntry() { + } + + // Cloning is not supported; throws the public java.lang exception instead of a JDK-internal sun.* class. + public Object clone() { + throw new UnsupportedOperationException(); + } + + public static SequenceDatabaseEntry createInstanceFromPlainText( final List lines ) { + final EbiDbEntry e = new EbiDbEntry(); + for( final String line : lines ) { + if ( line.startsWith( "PA" ) ) { + e.setPA( DatabaseTools.extract( line, "PA", ";" ) ); + } + else if ( line.startsWith( "DE" ) ) { + // if ( ( line.indexOf( "RecName:" ) > 0 ) && ( line.indexOf( "Full=" ) > 0 ) ) { + e.setDe( DatabaseTools.extract( line, "DE" ) ); + //} + } + // else if ( line.startsWith( "GN" ) ) { + // if ( ( line.indexOf( "Name=" ) > 0 ) ) { + //
e.setSymbol( extract( line, "Name=", ";" ) ); + // } + // } + else if ( line.startsWith( "OS" ) ) { + if ( line.indexOf( "(" ) > 0 ) { + e.setOs( DatabaseTools.extract( line, "OS", "(" ) ); + } + else { + e.setOs( DatabaseTools.extract( line, "OS" ) ); + } + } + else if ( line.startsWith( "OX" ) ) { + if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) { + e.setTaxId( DatabaseTools.extract( line, "NCBI_TaxID=", ";" ) ); + } + } + } + return e; + } + + @Override + public String getAccession() { + return _pa; + } + + private void setPA( final String pa ) { + if ( _pa == null ) { + _pa= pa; + } + } + + @Override + public String getSequenceName() { + return _de; + } + + private void setDe( final String rec_name ) { + if ( _de == null ) { + _de = rec_name; + } + } + + @Override + public String getTaxonomyScientificName() { + return _os; + } + + private void setOs( final String os ) { + if ( _os== null ) { + _os = os; + } + } + + @Override + public String getTaxonomyIdentifier() { + return _tax_id; + } + + private void setTaxId( final String tax_id ) { + if ( _tax_id == null ) { + _tax_id = tax_id; + } + } + + @Override + public String getSequenceSymbol() { + return _symbol; + } + + private void setSymbol( final String symbol ) { + if ( _symbol == null ) { + _symbol = symbol; + } + } +} diff --git a/forester/java/src/org/forester/ws/uniprot/UniProtEntry.java b/forester/java/src/org/forester/ws/uniprot/UniProtEntry.java index f74b1c7..2b74883 100644 --- a/forester/java/src/org/forester/ws/uniprot/UniProtEntry.java +++ b/forester/java/src/org/forester/ws/uniprot/UniProtEntry.java @@ -42,44 +42,36 @@ public final class UniProtEntry implements SequenceDatabaseEntry { final UniProtEntry e = new UniProtEntry(); for( final String line : lines ) { if ( line.startsWith( "AC" ) ) { - e.setAc( extract( line, "AC", ";" ) ); + e.setAc( DatabaseTools.extract( line, "AC", ";" ) ); } else if ( line.startsWith( "DE" ) ) { if ( ( line.indexOf( "RecName:" ) > 0 ) && ( line.indexOf( "Full=" ) > 0 ) ) 
{ - e.setRecName( extract( line, "Full=", ";" ) ); + e.setRecName( DatabaseTools.extract( line, "Full=", ";" ) ); } } else if ( line.startsWith( "GN" ) ) { if ( ( line.indexOf( "Name=" ) > 0 ) ) { - e.setSymbol( extract( line, "Name=", ";" ) ); + e.setSymbol( DatabaseTools.extract( line, "Name=", ";" ) ); } } else if ( line.startsWith( "OS" ) ) { if ( line.indexOf( "(" ) > 0 ) { - e.setOsScientificName( extract( line, "OS", "(" ) ); + e.setOsScientificName( DatabaseTools.extract( line, "OS", "(" ) ); } else { - e.setOsScientificName( extract( line, "OS", "." ) ); + e.setOsScientificName( DatabaseTools.extract( line, "OS", "." ) ); } } else if ( line.startsWith( "OX" ) ) { if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) { - e.setTaxId( extract( line, "NCBI_TaxID=", ";" ) ); + e.setTaxId( DatabaseTools.extract( line, "NCBI_TaxID=", ";" ) ); } } } return e; } - private static String extract( final String target, final String a, final String b ) { - final int i_a = target.indexOf( a ); - final int i_b = target.indexOf( b ); - if ( ( i_a < 0 ) || ( i_b < i_a ) ) { - throw new IllegalArgumentException( "attempt to extract from [" + target + "] between [" + a + "] and [" - + b + "]" ); - } - return target.substring( i_a + a.length(), i_b ).trim(); - } + @Override public String getAccession() { diff --git a/forester/java/src/org/forester/ws/uniprot/UniProtWsTools.java b/forester/java/src/org/forester/ws/uniprot/UniProtWsTools.java index cd8ab43..dd8d760 100644 --- a/forester/java/src/org/forester/ws/uniprot/UniProtWsTools.java +++ b/forester/java/src/org/forester/ws/uniprot/UniProtWsTools.java @@ -45,6 +45,8 @@ public final class UniProtWsTools { UNKNOWN, UNIPROT; } public final static String BASE_URL = "http://www.uniprot.org/"; + + public final static String BASE_EMBL_DB_URL = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/embl/"; private final static String URL_ENC = "UTF-8"; // uniprot/expasy accession number format (6 chars): // letter digit letter-or-digit letter-or-digit 
letter-or-digit digit @@ -57,7 +59,7 @@ public final class UniProtWsTools { } /** - * Return null if no match. + * Returns null if no match. * * @param query * @param db @@ -72,6 +74,8 @@ public final class UniProtWsTools { return null; } } + + public static List getTaxonomiesFromCommonName( final String cn, final int max_taxonomies_return ) throws IOException { @@ -208,14 +212,33 @@ public final class UniProtWsTools { return taxonomies; } + + public static List queryEmblDb( final String query, int max_lines_to_return ) throws IOException { + return queryDb( query, + max_lines_to_return, + BASE_EMBL_DB_URL ) ; + } + + + public static List queryUniprot( final String query, int max_lines_to_return ) throws IOException { + return queryDb( query, + max_lines_to_return, + BASE_URL ) ; + + + } + + public static List queryDb( final String query, + int max_lines_to_return, + final String base_url ) throws IOException { if ( ForesterUtil.isEmpty( query ) ) { throw new IllegalArgumentException( "illegal attempt to use empty query " ); } if ( max_lines_to_return < 1 ) { max_lines_to_return = 1; } - final URL url = new URL( BASE_URL + query ); + final URL url = new URL( base_url + query ); if ( DEBUG ) { System.out.println( "url: " + url.toString() ); } @@ -232,10 +255,16 @@ public final class UniProtWsTools { in.close(); return result; } - + + public static SequenceDatabaseEntry obtainUniProtEntry( final String query, final int max_lines_to_return ) throws IOException { final List lines = queryUniprot( "uniprot/" + query + ".txt", max_lines_to_return ); return UniProtEntry.createInstanceFromPlainText( lines ); } + + public static SequenceDatabaseEntry obtainEmblEntry( final String query, final int max_lines_to_return ) throws IOException { + final List lines = queryEmblDb( query, max_lines_to_return ); + return EbiDbEntry.createInstanceFromPlainText( lines ); + } } -- 1.7.10.2