From 6721c2fefe276eef3e17773812b05474d609ebc4 Mon Sep 17 00:00:00 2001 From: cmzmasek Date: Wed, 6 Jun 2012 20:57:02 +0000 Subject: [PATCH] phylotastic hackathon at NESCENT 120606 --- .../org/forester/analysis/TaxonomyDataManager.java | 3 +- .../archaeopteryx/MainFrameApplication.java | 4 +- .../tools/AncestralTaxonomyInferrer.java | 4 +- .../archaeopteryx/tools/SequenceDataRetriver.java | 69 ++++++++++---------- .../org/forester/phylogeny/data/Identifier.java | 5 ++ forester/java/src/org/forester/test/Test.java | 4 +- .../src/org/forester/util/SequenceIdParser.java | 10 +-- .../src/org/forester/ws/uniprot/EbiDbEntry.java | 35 ++++++++++ .../org/forester/ws/uniprot/UniProtWsTools.java | 44 +++++++++++-- 9 files changed, 127 insertions(+), 51 deletions(-) diff --git a/forester/java/src/org/forester/analysis/TaxonomyDataManager.java b/forester/java/src/org/forester/analysis/TaxonomyDataManager.java index 674c9c6..019df24 100644 --- a/forester/java/src/org/forester/analysis/TaxonomyDataManager.java +++ b/forester/java/src/org/forester/analysis/TaxonomyDataManager.java @@ -38,6 +38,7 @@ import javax.swing.JOptionPane; import org.forester.archaeopteryx.MainFrameApplication; import org.forester.archaeopteryx.TreePanel; +import org.forester.archaeopteryx.tools.AncestralTaxonomyInferrer; import org.forester.archaeopteryx.tools.RunnableProcess; import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException; import org.forester.phylogeny.Phylogeny; @@ -521,7 +522,7 @@ public final class TaxonomyDataManager extends RunnableProcess { } private final String getBaseUrl() { - return UniProtWsTools.BASE_URL; + return AncestralTaxonomyInferrer.getBaseUrl(); } @Override diff --git a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java index ac531ba..9a7619b 100644 --- a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java +++ b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java @@ -954,9 +954,9 @@ public final class MainFrameApplication extends MainFrame { _obtain_detailed_taxonomic_information_deleting_jmi .setToolTipText( "To add additional taxonomic information, deletes nodes for which taxonomy cannot found (from UniProt Taxonomy)" ); _tools_menu - .add( _obtain_uniprot_seq_information_jmi = new JMenuItem( "Obtain Sequence Information (from UniProt)" ) ); + .add( _obtain_uniprot_seq_information_jmi = new JMenuItem( "Obtain Sequence Information" ) ); customizeJMenuItem( _obtain_uniprot_seq_information_jmi ); - _obtain_uniprot_seq_information_jmi.setToolTipText( "To add additional sequence information (from UniProt)" ); + _obtain_uniprot_seq_information_jmi.setToolTipText( "To add additional sequence information" ); _tools_menu.addSeparator(); if ( !Constants.__RELEASE ) { _tools_menu.add( _function_analysis = new JMenuItem( "Add UniProtKB Annotations" ) ); diff --git a/forester/java/src/org/forester/archaeopteryx/tools/AncestralTaxonomyInferrer.java b/forester/java/src/org/forester/archaeopteryx/tools/AncestralTaxonomyInferrer.java index 7475d1d..a42d1ec 100644 --- a/forester/java/src/org/forester/archaeopteryx/tools/AncestralTaxonomyInferrer.java +++ b/forester/java/src/org/forester/archaeopteryx/tools/AncestralTaxonomyInferrer.java @@ -48,8 +48,8 @@ public class AncestralTaxonomyInferrer extends RunnableProcess { _treepanel = treepanel; } - private String getBaseUrl() { - return UniProtWsTools.BASE_URL; + public static String getBaseUrl() { + return UniProtWsTools.BASE_UNIPROT_URL; } private void inferTaxonomies() { diff --git a/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java b/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java index 21dcf1f..7423832 100644 --- a/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java +++ b/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java @@ -44,7 +44,6 @@ import org.forester.phylogeny.data.Taxonomy; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.util.ForesterUtil; import org.forester.util.SequenceIdParser; -import org.forester.ws.uniprot.DatabaseTools; import org.forester.ws.uniprot.SequenceDatabaseEntry; import org.forester.ws.uniprot.UniProtWsTools; @@ -53,7 +52,7 @@ public final class SequenceDataRetriver extends RunnableProcess { private final Phylogeny _phy; private final MainFrameApplication _mf; private final TreePanel _treepanel; - private final static boolean DEBUG = false; + private final static boolean DEBUG = true; private enum Db { UNKNOWN, UNIPROT, EMBL, NCBI; @@ -65,10 +64,6 @@ public final class SequenceDataRetriver extends RunnableProcess { _treepanel = treepanel; } - private String getBaseUrl() { - return UniProtWsTools.BASE_URL; - } - private void execute() { start( _mf, "sequence data" ); SortedSet not_found = null; @@ -76,8 +71,9 @@ public final class SequenceDataRetriver extends RunnableProcess { not_found = obtainSeqInformation( _phy ); } catch ( final UnknownHostException e ) { + final String what = "_"; //TODO FIXME JOptionPane.showMessageDialog( _mf, - "Could not connect to \"" + getBaseUrl() + "\"", + "Could not connect to \"" + what + "\"", "Network error during taxonomic information gathering", JOptionPane.ERROR_MESSAGE ); return; @@ -165,6 +161,7 @@ public final class SequenceDataRetriver extends RunnableProcess { tax = new Taxonomy(); } String query = null; + Identifier id = null; Db db = Db.UNKNOWN; if ( node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null ) && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) @@ -186,12 +183,12 @@ public final class SequenceDataRetriver extends RunnableProcess { if ( ( query = UniProtWsTools.parseUniProtAccessor( node.getName() ) ) != null ) { db = Db.UNIPROT; } - else if ( ( query = SequenceIdParser.parseGenbankAccessor( node.getName() ) ) != null ) { + else if ( ( id = SequenceIdParser.parse( node.getName() ) ) != null ) { db = Db.NCBI; } } + SequenceDatabaseEntry db_entry = null; if ( !ForesterUtil.isEmpty( query ) ) { - SequenceDatabaseEntry db_entry = null; if ( db == Db.UNIPROT ) { if ( DEBUG ) { System.out.println( "uniprot: " + query ); @@ -217,35 +214,39 @@ public final class SequenceDataRetriver extends RunnableProcess { db = Db.EMBL; } } - if ( ( db_entry != null ) && !db_entry.isEmpty() ) { - if ( !ForesterUtil.isEmpty( db_entry.getAccession() ) ) { - String type = null; - if ( db == Db.EMBL ) { - type = "embl"; - } - else if ( db == Db.UNIPROT ) { - type = "uniprot"; - } - seq.setAccession( new Accession( db_entry.getAccession(), type ) ); - } - if ( !ForesterUtil.isEmpty( db_entry.getSequenceName() ) ) { - seq.setName( db_entry.getSequenceName() ); - } - if ( !ForesterUtil.isEmpty( db_entry.getSequenceSymbol() ) ) { - seq.setSymbol( db_entry.getSequenceSymbol() ); - } - if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyScientificName() ) ) { - tax.setScientificName( db_entry.getTaxonomyScientificName() ); + } + else if ( ( db == Db.NCBI ) && ( id != null ) ) { + System.out.println( "db == Db.NCBI && id != null" ); + db_entry = UniProtWsTools.obtainrefSeqentryFromEmbl( id, 200 ); + } + if ( ( db_entry != null ) && !db_entry.isEmpty() ) { + if ( !ForesterUtil.isEmpty( db_entry.getAccession() ) ) { + String type = null; + if ( db == Db.EMBL ) { + type = "embl"; } - if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyIdentifier() ) ) { - tax.setIdentifier( new Identifier( db_entry.getTaxonomyIdentifier(), "uniprot" ) ); + else if ( db == Db.UNIPROT ) { + type = "uniprot"; } - node.getNodeData().setTaxonomy( tax ); - node.getNodeData().setSequence( seq ); + seq.setAccession( new Accession( db_entry.getAccession(), type ) ); + } + if ( !ForesterUtil.isEmpty( db_entry.getSequenceName() ) ) { + seq.setName( db_entry.getSequenceName() ); + } + if ( !ForesterUtil.isEmpty( db_entry.getSequenceSymbol() ) ) { + seq.setSymbol( db_entry.getSequenceSymbol() ); } - else { - not_found.add( node.getName() ); + if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyScientificName() ) ) { + tax.setScientificName( db_entry.getTaxonomyScientificName() ); } + if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyIdentifier() ) ) { + tax.setIdentifier( new Identifier( db_entry.getTaxonomyIdentifier(), "uniprot" ) ); + } + node.getNodeData().setTaxonomy( tax ); + node.getNodeData().setSequence( seq ); + } + else { + not_found.add( node.getName() ); } } return not_found; diff --git a/forester/java/src/org/forester/phylogeny/data/Identifier.java b/forester/java/src/org/forester/phylogeny/data/Identifier.java index cd58bad..555743e 100644 --- a/forester/java/src/org/forester/phylogeny/data/Identifier.java +++ b/forester/java/src/org/forester/phylogeny/data/Identifier.java @@ -34,9 +34,14 @@ import org.forester.util.ForesterUtil; public final class Identifier implements PhylogenyData { + final public static String NCBI = "ncbi"; + final public static String REFSEQ = "refseq"; + final private String _value; final private String _provider; final private String _value_provider; + + public Identifier() { _value = ""; diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index bbfa6dd..cdad772 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -9021,7 +9021,7 @@ public final class Test { || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() ) || !id.getValue().equals( "XP_002434188" ) - || !id.getProvider().equals( "ncbi" ) ) { + || !id.getProvider().equals( "refseq" ) ) { if ( id != null ) { System.out.println( "value =" + id.getValue() ); System.out.println( "provider=" + id.getProvider() ); @@ -9034,7 +9034,7 @@ public final class Test { || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() ) || !id.getValue().equals( "XP_002434188" ) - || !id.getProvider().equals( "ncbi" ) ) { + || !id.getProvider().equals( "refseq" ) ) { if ( id != null ) { System.out.println( "value =" + id.getValue() ); System.out.println( "provider=" + id.getProvider() ); diff --git a/forester/java/src/org/forester/util/SequenceIdParser.java b/forester/java/src/org/forester/util/SequenceIdParser.java index 2f50f63..767a310 100644 --- a/forester/java/src/org/forester/util/SequenceIdParser.java +++ b/forester/java/src/org/forester/util/SequenceIdParser.java @@ -67,6 +67,8 @@ public final class SequenceIdParser { private final static boolean DEBUG = true; + + /** @@ -76,11 +78,11 @@ public final class SequenceIdParser { public final static Identifier parse( final String s ) { String v = parseGenbankAccessor( s ); if ( !ForesterUtil.isEmpty( v ) ) { - return new Identifier( v, "ncbi" ); + return new Identifier( v, Identifier.NCBI ); } v = parseRefSeqAccessor( s ); if ( !ForesterUtil.isEmpty( v ) ) { - return new Identifier( v, "ncbi" ); + return new Identifier( v, Identifier.REFSEQ ); } return null; } @@ -89,7 +91,7 @@ public final class SequenceIdParser { * Returns null if no match. * */ - static public String parseGenbankAccessor( final String query ) { + public static String parseGenbankAccessor( final String query ) { Matcher m = GENBANK_NUCLEOTIDE_AC_PATTERN_1.matcher( query ); if ( m.lookingAt() ) { return m.group( 1 ); @@ -115,7 +117,7 @@ public final class SequenceIdParser { * Returns null if no match. * */ - public final static String parseRefSeqAccessor( final String query ) { + private final static String parseRefSeqAccessor( final String query ) { Matcher m = REFSEQ_PATTERN.matcher( query ); if ( m.lookingAt() ) { return m.group( 1 ); diff --git a/forester/java/src/org/forester/ws/uniprot/EbiDbEntry.java b/forester/java/src/org/forester/ws/uniprot/EbiDbEntry.java index 5f0d599..ef44f38 100644 --- a/forester/java/src/org/forester/ws/uniprot/EbiDbEntry.java +++ b/forester/java/src/org/forester/ws/uniprot/EbiDbEntry.java @@ -46,9 +46,44 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { throw new CloneNotSupportedException(); } + + public static SequenceDatabaseEntry createInstanceForRefSeq( final List lines ) { + final EbiDbEntry e = new EbiDbEntry(); + for( final String line : lines ) { + System.out.println( "-" + line ); + if ( line.startsWith( "ACCESSION" ) ) { + e.setPA( DatabaseTools.extract( line, "ACCESSION" ) ); + } + else if ( line.startsWith( "DEFINITION" ) ) { + if ( line.indexOf( "[" ) > 0 ) { + e.setDe( DatabaseTools.extract( line, "DEFINITIO", "[" ) ); + } + else { + e.setDe( DatabaseTools.extract( line, "DEFINITION" ) ); + } + + + } + + else if ( line.startsWith( "SOURCE" ) ) { + if ( line.indexOf( "(" ) > 0 ) { + e.setOs( DatabaseTools.extract( line, "SOURCE", "(" ) ); + } + else { + e.setOs( DatabaseTools.extract( line, "SOURCE" ) ); + } + } + + } + return e; + } + + + public static SequenceDatabaseEntry createInstanceFromPlainText( final List lines ) { final EbiDbEntry e = new EbiDbEntry(); for( final String line : lines ) { + System.out.println( "->" + line ); if ( line.startsWith( "PA" ) ) { e.setPA( DatabaseTools.extract( line, "PA" ) ); } diff --git a/forester/java/src/org/forester/ws/uniprot/UniProtWsTools.java b/forester/java/src/org/forester/ws/uniprot/UniProtWsTools.java index bec583f..65bea2d 100644 --- a/forester/java/src/org/forester/ws/uniprot/UniProtWsTools.java +++ b/forester/java/src/org/forester/ws/uniprot/UniProtWsTools.java @@ -37,6 +37,7 @@ import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.forester.phylogeny.data.Identifier; import org.forester.util.ForesterUtil; public final class UniProtWsTools { @@ -46,8 +47,12 @@ public final class UniProtWsTools { public enum Db { UNKNOWN, UNIPROT; } - public final static String BASE_URL = "http://www.uniprot.org/"; - public final static String BASE_EMBL_DB_URL = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/embl/"; + public final static String BASE_UNIPROT_URL = "http://www.uniprot.org/"; + public final static String BASE_EMBL_DB_URL = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/"; + public final static String EMBL_DBS_EMBL = "embl"; + public final static String EMBL_DBS_REFSEQ_P = "refseqp"; + public final static String EMBL_DBS_REFSEQ_N = "refseqn"; + private final static String URL_ENC = "UTF-8"; // uniprot/expasy accession number format (6 chars): // letter digit letter-or-digit letter-or-digit letter-or-digit digit @@ -322,12 +327,33 @@ public final class UniProtWsTools { return taxonomies; } - public static List queryEmblDb( final String query, final int max_lines_to_return ) throws IOException { - return queryDb( query, max_lines_to_return, BASE_EMBL_DB_URL ); + public static List queryEmblDb( final Identifier id, final int max_lines_to_return ) throws IOException { + + StringBuilder url_sb = new StringBuilder(); + url_sb.append( BASE_EMBL_DB_URL ); + + if ( ForesterUtil.isEmpty( id.getProvider() ) || id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) { + url_sb.append( '/'); + url_sb.append( UniProtWsTools.EMBL_DBS_EMBL ); + url_sb.append( '/'); + } + else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) { + if ( id.getValue().toUpperCase().indexOf( 'P' ) == 1 ) { + url_sb.append( '/'); + url_sb.append( UniProtWsTools.EMBL_DBS_REFSEQ_P ); + url_sb.append( '/'); + } + else { + url_sb.append( '/'); + url_sb.append( UniProtWsTools.EMBL_DBS_REFSEQ_N ); + url_sb.append( '/'); + } + } + return queryDb( id.getValue(), max_lines_to_return, url_sb.toString() ); } public static List queryUniprot( final String query, final int max_lines_to_return ) throws IOException { - return queryDb( query, max_lines_to_return, BASE_URL ); + return queryDb( query, max_lines_to_return, BASE_UNIPROT_URL ); } public static List queryDb( final String query, int max_lines_to_return, final String base_url ) @@ -365,9 +391,15 @@ public final class UniProtWsTools { return UniProtEntry.createInstanceFromPlainText( lines ); } + public static SequenceDatabaseEntry obtainrefSeqentryFromEmbl( final Identifier id, final int max_lines_to_return ) + throws IOException { + final List lines = queryEmblDb( id, max_lines_to_return ); + return EbiDbEntry.createInstanceForRefSeq( lines ); + } + public static SequenceDatabaseEntry obtainEmblEntry( final String query, final int max_lines_to_return ) throws IOException { - final List lines = queryEmblDb( query, max_lines_to_return ); + final List lines = queryEmblDb( new Identifier( query ), max_lines_to_return ); return EbiDbEntry.createInstanceFromPlainText( lines ); } } -- 1.7.10.2