From 8a0a03d1fe15e36142994518585a185fb0ea6543 Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Tue, 4 Dec 2012 01:26:41 +0000 Subject: [PATCH] in progress --- .../forester/application/gene_tree_preprocess.java | 26 +- .../src/org/forester/archaeopteryx/MainFrame.java | 2 +- .../archaeopteryx/MainFrameApplication.java | 12 +- .../archaeopteryx/tools/SequenceDataRetriver.java | 152 +------ .../org/forester/ws/seqdb/SequenceDbWsTools.java | 436 +++++++++++++------- 5 files changed, 314 insertions(+), 314 deletions(-) diff --git a/forester/java/src/org/forester/application/gene_tree_preprocess.java b/forester/java/src/org/forester/application/gene_tree_preprocess.java index f755d91..bcb93f4 100644 --- a/forester/java/src/org/forester/application/gene_tree_preprocess.java +++ b/forester/java/src/org/forester/application/gene_tree_preprocess.java @@ -32,7 +32,6 @@ import java.io.IOException; import java.util.SortedSet; import java.util.TreeSet; -import org.forester.archaeopteryx.tools.SequenceDataRetriver; import org.forester.io.parsers.util.ParserUtils; import org.forester.io.writers.PhylogenyWriter; import org.forester.phylogeny.Phylogeny; @@ -42,17 +41,19 @@ import org.forester.phylogeny.factories.PhylogenyFactory; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.util.CommandLineArguments; import org.forester.util.ForesterUtil; +import org.forester.ws.seqdb.SequenceDbWsTools; public class gene_tree_preprocess { - final static private String HELP_OPTION_1 = "help"; - final static private String HELP_OPTION_2 = "h"; - final static private String PRG_NAME = "gene_tree_preprocess"; - final static private String PRG_DESC = "gene tree preprocessing for SDI analysis"; - final static private String PRG_VERSION = "1.01"; - final static private String PRG_DATE = "2012.06.07"; - final static private String E_MAIL = "phylosoft@gmail.com"; - final static private String WWW = "www.phylosoft.org/forester"; + final static private String 
HELP_OPTION_1 = "help"; + final static private String HELP_OPTION_2 = "h"; + final static private String PRG_NAME = "gene_tree_preprocess"; + final static private String PRG_DESC = "gene tree preprocessing for SDI analysis"; + final static private String PRG_VERSION = "1.01"; + final static private String PRG_DATE = "2012.06.07"; + final static private String E_MAIL = "phylosoft@gmail.com"; + final static private String WWW = "www.phylosoft.org/forester"; + private final static int DEFAULT_LINES_TO_RETURN = 50; public static void main( final String[] args ) { try { @@ -82,7 +83,10 @@ public class gene_tree_preprocess { ForesterUtil.fatalError( PRG_NAME, "phylogeny has " + phy.getNumberOfExternalNodes() + " external node(s), aborting" ); } - final SortedSet not_found = SequenceDataRetriver.obtainSeqInformation( phy, true, false ); + final SortedSet not_found = SequenceDbWsTools.obtainSeqInformation( phy, + true, + false, + DEFAULT_LINES_TO_RETURN ); for( final String remove_me : not_found ) { phy.deleteSubtree( phy.getNode( remove_me ), true ); } @@ -144,7 +148,7 @@ public class gene_tree_preprocess { } } - public static void checkForOutputFileWriteability( final File outfile ) { + private static void checkForOutputFileWriteability( final File outfile ) { final String error = ForesterUtil.isWritableFile( outfile ); if ( !ForesterUtil.isEmpty( error ) ) { ForesterUtil.fatalError( PRG_NAME, error ); diff --git a/forester/java/src/org/forester/archaeopteryx/MainFrame.java b/forester/java/src/org/forester/archaeopteryx/MainFrame.java index 8b98088..a8c58ee 100644 --- a/forester/java/src/org/forester/archaeopteryx/MainFrame.java +++ b/forester/java/src/org/forester/archaeopteryx/MainFrame.java @@ -139,7 +139,7 @@ public abstract class MainFrame extends JFrame implements ActionListener { JMenuItem _collapse_below_threshold; //TODO implememt me JMenuItem _obtain_detailed_taxonomic_information_jmi; JMenuItem _obtain_detailed_taxonomic_information_deleting_jmi; - JMenuItem 
_obtain_uniprot_seq_information_jmi; + JMenuItem _obtain_seq_information_jmi; JMenuItem _move_node_names_to_tax_sn_jmi; JMenuItem _move_node_names_to_seq_names_jmi; JMenuItem _extract_tax_code_from_node_names_jmi; diff --git a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java index f5cda0b..80a315b 100644 --- a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java +++ b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java @@ -555,8 +555,8 @@ public final class MainFrameApplication extends MainFrame { } obtainDetailedTaxonomicInformationDelete(); } - else if ( o == _obtain_uniprot_seq_information_jmi ) { - obtainUniProtSequenceInformation(); + else if ( o == _obtain_seq_information_jmi ) { + obtainSequenceInformation(); } else if ( o == _read_values_jmi ) { if ( isSubtreeDisplayed() ) { @@ -951,9 +951,9 @@ public final class MainFrameApplication extends MainFrame { customizeJMenuItem( _obtain_detailed_taxonomic_information_deleting_jmi ); _obtain_detailed_taxonomic_information_deleting_jmi .setToolTipText( "To add additional taxonomic information, deletes nodes for which taxonomy cannot found (from UniProt Taxonomy)" ); - _tools_menu.add( _obtain_uniprot_seq_information_jmi = new JMenuItem( "Obtain Sequence Information" ) ); - customizeJMenuItem( _obtain_uniprot_seq_information_jmi ); - _obtain_uniprot_seq_information_jmi.setToolTipText( "To add additional sequence information" ); + _tools_menu.add( _obtain_seq_information_jmi = new JMenuItem( "Obtain Sequence Information" ) ); + customizeJMenuItem( _obtain_seq_information_jmi ); + _obtain_seq_information_jmi.setToolTipText( "To add additional sequence information" ); _tools_menu.addSeparator(); if ( !Constants.__RELEASE ) { _tools_menu.add( _function_analysis = new JMenuItem( "Add UniProtKB Annotations" ) ); @@ -1511,7 +1511,7 @@ public final class MainFrameApplication extends MainFrame { 
} } - private void obtainUniProtSequenceInformation() { + private void obtainSequenceInformation() { if ( getCurrentTreePanel() != null ) { final Phylogeny phy = getCurrentTreePanel().getPhylogeny(); if ( ( phy != null ) && !phy.isEmpty() ) { diff --git a/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java b/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java index 3f4ddda..454f139 100644 --- a/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java +++ b/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java @@ -28,35 +28,21 @@ package org.forester.archaeopteryx.tools; import java.io.IOException; import java.net.UnknownHostException; import java.util.SortedSet; -import java.util.TreeSet; import javax.swing.JOptionPane; import org.forester.archaeopteryx.MainFrameApplication; import org.forester.archaeopteryx.TreePanel; import org.forester.phylogeny.Phylogeny; -import org.forester.phylogeny.PhylogenyNode; -import org.forester.phylogeny.data.Accession; -import org.forester.phylogeny.data.Identifier; -import org.forester.phylogeny.data.Sequence; -import org.forester.phylogeny.data.Taxonomy; -import org.forester.phylogeny.iterators.PhylogenyNodeIterator; -import org.forester.util.ForesterUtil; -import org.forester.util.SequenceIdParser; -import org.forester.ws.seqdb.SequenceDatabaseEntry; import org.forester.ws.seqdb.SequenceDbWsTools; public final class SequenceDataRetriver extends RunnableProcess { - public final static int DEFAULT_LINES_TO_RETURN = 50; + private final static int DEFAULT_LINES_TO_RETURN = 50; private final Phylogeny _phy; private final MainFrameApplication _mf; private final TreePanel _treepanel; - private final static boolean DEBUG = false; - - private enum Db { - UNIPROT, EMBL, NCBI, NONE, REFSEQ; - } + public final static boolean DEBUG = false; public SequenceDataRetriver( final MainFrameApplication mf, final TreePanel treepanel, final Phylogeny phy ) { 
_phy = phy; @@ -64,17 +50,21 @@ public final class SequenceDataRetriver extends RunnableProcess { _treepanel = treepanel; } + @Override + public void run() { + execute(); + } + private void execute() { start( _mf, "sequence data" ); SortedSet not_found = null; try { - not_found = obtainSeqInformation( _phy, false, true ); + not_found = SequenceDbWsTools.obtainSeqInformation( _phy, false, true, DEFAULT_LINES_TO_RETURN ); } catch ( final UnknownHostException e ) { - final String what = "_"; //TODO FIXME JOptionPane.showMessageDialog( _mf, - "Could not connect to \"" + what + "\"", - "Network error during taxonomic information gathering", + e.getLocalizedMessage(), + "Network error during sequence data gathering", JOptionPane.ERROR_MESSAGE ); return; } @@ -82,7 +72,7 @@ public final class SequenceDataRetriver extends RunnableProcess { e.printStackTrace(); JOptionPane.showMessageDialog( _mf, e.toString(), - "Failed to obtain taxonomic information", + "Failed to obtain sequence data", JOptionPane.ERROR_MESSAGE ); return; } @@ -141,124 +131,4 @@ public final class SequenceDataRetriver extends RunnableProcess { } } } - - public static SortedSet obtainSeqInformation( final Phylogeny phy, - final boolean ext_nodes_only, - final boolean allow_to_set_taxonomic_data ) - throws IOException { - final SortedSet not_found = new TreeSet(); - for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) { - final PhylogenyNode node = iter.next(); - if ( ext_nodes_only && node.isInternal() ) { - continue; - } - final Sequence seq = node.getNodeData().isHasSequence() ? node.getNodeData().getSequence() : new Sequence(); - final Taxonomy tax = node.getNodeData().isHasTaxonomy() ? 
node.getNodeData().getTaxonomy() : new Taxonomy(); - String query = null; - Identifier id = null; - Db db = Db.NONE; - if ( node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null ) - && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) - && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) - && node.getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "uniprot" ) ) { - query = node.getNodeData().getSequence().getAccession().getValue(); - db = Db.UNIPROT; - } - else if ( node.getNodeData().isHasSequence() - && ( node.getNodeData().getSequence().getAccession() != null ) - && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) - && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) - && ( node.getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "embl" ) || node - .getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "ebi" ) ) ) { - query = node.getNodeData().getSequence().getAccession().getValue(); - db = Db.EMBL; - } - else if ( !ForesterUtil.isEmpty( node.getName() ) ) { - if ( ( query = SequenceDbWsTools.parseUniProtAccessor( node.getName() ) ) != null ) { - db = Db.UNIPROT; - } - else if ( ( id = SequenceIdParser.parse( node.getName() ) ) != null ) { - if ( id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) { - db = Db.NCBI; - } - else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) { - db = Db.REFSEQ; - } - } - } - if ( db == Db.NONE ) { - not_found.add( node.getName() ); - } - SequenceDatabaseEntry db_entry = null; - if ( !ForesterUtil.isEmpty( query ) ) { - if ( db == Db.UNIPROT ) { - if ( DEBUG ) { - System.out.println( "uniprot: " + query ); - } - db_entry = SequenceDbWsTools.obtainUniProtEntry( query, DEFAULT_LINES_TO_RETURN ); - } - if ( ( db == Db.EMBL ) || ( ( db == Db.UNIPROT ) && ( 
db_entry == null ) ) ) { - if ( DEBUG ) { - System.out.println( "embl: " + query ); - } - db_entry = SequenceDbWsTools.obtainEmblEntry( new Identifier( query ), DEFAULT_LINES_TO_RETURN ); - if ( ( db == Db.UNIPROT ) && ( db_entry != null ) ) { - db = Db.EMBL; - } - } - } - else if ( ( db == Db.REFSEQ ) && ( id != null ) ) { - db_entry = SequenceDbWsTools.obtainRefSeqEntryFromEmbl( id, DEFAULT_LINES_TO_RETURN ); - } - else if ( ( db == Db.NCBI ) && ( id != null ) ) { - db_entry = SequenceDbWsTools.obtainEmblEntry( id, DEFAULT_LINES_TO_RETURN ); - } - if ( ( db_entry != null ) && !db_entry.isEmpty() ) { - if ( !ForesterUtil.isEmpty( db_entry.getAccession() ) ) { - String type = null; - if ( db == Db.EMBL ) { - type = "embl"; - } - else if ( db == Db.UNIPROT ) { - type = "uniprot"; - } - else if ( db == Db.NCBI ) { - type = "ncbi"; - } - else if ( db == Db.REFSEQ ) { - type = "refseq"; - } - seq.setAccession( new Accession( db_entry.getAccession(), type ) ); - } - if ( !ForesterUtil.isEmpty( db_entry.getSequenceName() ) ) { - seq.setName( db_entry.getSequenceName() ); - } - if ( !ForesterUtil.isEmpty( db_entry.getSequenceSymbol() ) ) { - seq.setSymbol( db_entry.getSequenceSymbol() ); - } - if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyScientificName() ) ) { - tax.setScientificName( db_entry.getTaxonomyScientificName() ); - } - if ( allow_to_set_taxonomic_data && !ForesterUtil.isEmpty( db_entry.getTaxonomyIdentifier() ) ) { - tax.setIdentifier( new Identifier( db_entry.getTaxonomyIdentifier(), "uniprot" ) ); - } - node.getNodeData().setTaxonomy( tax ); - node.getNodeData().setSequence( seq ); - } - else if ( db != Db.NONE ) { - not_found.add( node.getName() ); - } - try { - Thread.sleep( 10 );// Sleep for 10 ms - } - catch ( final InterruptedException ie ) { - } - } - return not_found; - } - - @Override - public void run() { - execute(); - } } diff --git a/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java 
b/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java index b56a633..5004a47 100644 --- a/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java +++ b/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java @@ -34,54 +34,38 @@ import java.net.URLConnection; import java.net.URLEncoder; import java.util.ArrayList; import java.util.List; +import java.util.SortedSet; +import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.Accession; import org.forester.phylogeny.data.Identifier; +import org.forester.phylogeny.data.Sequence; +import org.forester.phylogeny.data.Taxonomy; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.util.ForesterUtil; +import org.forester.util.SequenceIdParser; public final class SequenceDbWsTools { - private static final boolean ALLOW_TAXONOMY_CODE_HACKS = true; //TODO turn off for final realease! - - public enum Db { - UNKNOWN, UNIPROT; - } - public final static String BASE_UNIPROT_URL = "http://www.uniprot.org/"; - public final static String BASE_EMBL_DB_URL = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/"; - public final static String EMBL_DBS_EMBL = "embl"; - public final static String EMBL_DBS_REFSEQ_P = "refseqp"; - public final static String EMBL_DBS_REFSEQ_N = "refseqn"; - private final static String URL_ENC = "UTF-8"; + private static final boolean ALLOW_TAXONOMY_CODE_HACKS = true; //TODO turn off for final realease! 
+ public final static String BASE_UNIPROT_URL = "http://www.uniprot.org/"; + public final static String BASE_EMBL_DB_URL = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/"; + public final static String EMBL_DBS_EMBL = "embl"; + public final static String EMBL_DBS_REFSEQ_P = "refseqp"; + public final static String EMBL_DBS_REFSEQ_N = "refseqn"; + private final static String URL_ENC = "UTF-8"; // uniprot/expasy accession number format (6 chars): // letter digit letter-or-digit letter-or-digit letter-or-digit digit // ?: => no back-reference // \A => begin of String // \Z => end of String - private final static Pattern UNIPROT_AC_PATTERN = Pattern - .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d[A-Z0-9]{3}\\d)(?:[^a-zA-Z0-9]|\\Z)" ); - private final static boolean DEBUG = false; - - private static String encode( final String str ) throws UnsupportedEncodingException { - return URLEncoder.encode( str.trim(), URL_ENC ); - } - - /** - * Returns null if no match. - * - * @param query - * @param db - * @return - */ - static public String parseUniProtAccessor( final String query ) { - final Matcher m = UNIPROT_AC_PATTERN.matcher( query ); - if ( m.lookingAt() ) { - return m.group( 1 ); - } - else { - return null; - } - } + private final static Pattern UNIPROT_AC_PATTERN = Pattern + .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d[A-Z0-9]{3}\\d)(?:[^a-zA-Z0-9]|\\Z)" ); + private final static boolean DEBUG = false; public static List getTaxonomiesFromCommonName( final String cn, final int max_taxonomies_return ) throws IOException { @@ -176,6 +160,266 @@ public final class SequenceDbWsTools { return null; } + public static SequenceDatabaseEntry obtainEmblEntry( final Identifier id, final int max_lines_to_return ) + throws IOException { + final List lines = queryEmblDb( id, max_lines_to_return ); + return EbiDbEntry.createInstanceFromPlainText( lines ); + } + + public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Identifier id, final int max_lines_to_return ) + throws 
IOException { + final List lines = queryEmblDb( id, max_lines_to_return ); + return EbiDbEntry.createInstanceFromPlainTextForRefSeq( lines ); + } + + public static SortedSet obtainSeqInformation( final Phylogeny phy, + final boolean ext_nodes_only, + final boolean allow_to_set_taxonomic_data, + final int lines_to_return ) throws IOException { + final SortedSet not_found = new TreeSet(); + for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) { + final PhylogenyNode node = iter.next(); + if ( ext_nodes_only && node.isInternal() ) { + continue; + } + String query = null; + Identifier id = null; + Db db = Db.NONE; + if ( node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null ) + && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) + && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) + && node.getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "uniprot" ) ) { + query = node.getNodeData().getSequence().getAccession().getValue(); + db = Db.UNIPROT; + } + else if ( node.getNodeData().isHasSequence() + && ( node.getNodeData().getSequence().getAccession() != null ) + && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) + && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) + && ( node.getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "embl" ) || node + .getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "ebi" ) ) ) { + query = node.getNodeData().getSequence().getAccession().getValue(); + db = Db.EMBL; + } + else if ( !ForesterUtil.isEmpty( node.getName() ) ) { + if ( ( query = parseUniProtAccessor( node.getName() ) ) != null ) { + db = Db.UNIPROT; + } + else if ( ( id = SequenceIdParser.parse( node.getName() ) ) != null ) { + if ( id.getProvider().equalsIgnoreCase( Identifier.NCBI 
) ) { + db = Db.NCBI; + } + else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) { + db = Db.REFSEQ; + } + } + } + if ( db == Db.NONE ) { + not_found.add( node.getName() ); + } + SequenceDatabaseEntry db_entry = null; + if ( !ForesterUtil.isEmpty( query ) ) { + if ( db == Db.UNIPROT ) { + if ( DEBUG ) { + System.out.println( "uniprot: " + query ); + } + db_entry = obtainUniProtEntry( query, lines_to_return ); + } + if ( ( db == Db.EMBL ) || ( ( db == Db.UNIPROT ) && ( db_entry == null ) ) ) { + if ( DEBUG ) { + System.out.println( "embl: " + query ); + } + db_entry = obtainEmblEntry( new Identifier( query ), lines_to_return ); + if ( ( db == Db.UNIPROT ) && ( db_entry != null ) ) { + db = Db.EMBL; + } + } + } + else if ( ( db == Db.REFSEQ ) && ( id != null ) ) { + db_entry = obtainRefSeqEntryFromEmbl( id, lines_to_return ); + } + else if ( ( db == Db.NCBI ) && ( id != null ) ) { + db_entry = obtainEmblEntry( id, lines_to_return ); + } // NCBI branch ends here, the result handling below has to run for every database + if ( ( db_entry != null ) && !db_entry.isEmpty() ) { + final Sequence seq = node.getNodeData().isHasSequence() ? node.getNodeData().getSequence() + : new Sequence(); + if ( !ForesterUtil.isEmpty( db_entry.getAccession() ) ) { + String type = null; + if ( db == Db.EMBL ) { + type = "embl"; + } + else if ( db == Db.UNIPROT ) { + type = "uniprot"; + } + else if ( db == Db.NCBI ) { + type = "ncbi"; + } + else if ( db == Db.REFSEQ ) { + type = "refseq"; + } + seq.setAccession( new Accession( db_entry.getAccession(), type ) ); + } + if ( !ForesterUtil.isEmpty( db_entry.getSequenceName() ) ) { + seq.setName( db_entry.getSequenceName() ); + } + if ( !ForesterUtil.isEmpty( db_entry.getSequenceSymbol() ) ) { + seq.setSymbol( db_entry.getSequenceSymbol() ); + } + final Taxonomy tax = node.getNodeData().isHasTaxonomy() ? 
node.getNodeData().getTaxonomy() + : new Taxonomy(); + if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyScientificName() ) ) { + tax.setScientificName( db_entry.getTaxonomyScientificName() ); + } + if ( allow_to_set_taxonomic_data && !ForesterUtil.isEmpty( db_entry.getTaxonomyIdentifier() ) ) { + tax.setIdentifier( new Identifier( db_entry.getTaxonomyIdentifier(), "uniprot" ) ); + } + node.getNodeData().setTaxonomy( tax ); + node.getNodeData().setSequence( seq ); + } + else if ( db != Db.NONE ) { + not_found.add( node.getName() ); + } + try { + Thread.sleep( 10 );// Sleep for 10 ms + } + catch ( final InterruptedException ie ) { + } + } + return not_found; + } + + public static SequenceDatabaseEntry obtainUniProtEntry( final String query, final int max_lines_to_return ) + throws IOException { + final List lines = queryUniprot( "uniprot/" + query + ".txt", max_lines_to_return ); + return UniProtEntry.createInstanceFromPlainText( lines ); + } + + /** + * Returns null if no match. + * + * @param query + * @param db + * @return + */ + static public String parseUniProtAccessor( final String query ) { + final Matcher m = UNIPROT_AC_PATTERN.matcher( query ); + if ( m.lookingAt() ) { + return m.group( 1 ); + } + else { + return null; + } + } + + public static List queryDb( final String query, int max_lines_to_return, final String base_url ) + throws IOException { + if ( ForesterUtil.isEmpty( query ) ) { + throw new IllegalArgumentException( "illegal attempt to use empty query " ); + } + if ( max_lines_to_return < 1 ) { + max_lines_to_return = 1; + } + final URL url = new URL( base_url + query ); + if ( DEBUG ) { + System.out.println( "url: " + url.toString() ); + } + final URLConnection urlc = url.openConnection(); + final BufferedReader in = new BufferedReader( new InputStreamReader( urlc.getInputStream() ) ); + String line; + final List result = new ArrayList(); + while ( ( line = in.readLine() ) != null ) { + if ( DEBUG ) { + System.out.println( line ); + } + 
result.add( line ); + if ( result.size() > max_lines_to_return ) { + break; + } + } + in.close(); + try { + // To prevent accessing online dbs in too quick succession. + Thread.sleep( 20 ); + } + catch ( final InterruptedException e ) { + e.printStackTrace(); + } + return result; + } + + public static List queryEmblDb( final Identifier id, final int max_lines_to_return ) throws IOException { + final StringBuilder url_sb = new StringBuilder(); + url_sb.append( BASE_EMBL_DB_URL ); + if ( ForesterUtil.isEmpty( id.getProvider() ) || id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) { + url_sb.append( SequenceDbWsTools.EMBL_DBS_EMBL ); + url_sb.append( '/' ); + } + else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) { + if ( id.getValue().toUpperCase().indexOf( 'P' ) == 1 ) { + url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_P ); + url_sb.append( '/' ); + } + else { + url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_N ); + url_sb.append( '/' ); + } + } + return queryDb( id.getValue(), max_lines_to_return, url_sb.toString() ); + } + + public static List queryUniprot( final String query, final int max_lines_to_return ) throws IOException { + return queryDb( query, max_lines_to_return, BASE_UNIPROT_URL ); + } + + private static String encode( final String str ) throws UnsupportedEncodingException { + return URLEncoder.encode( str.trim(), URL_ENC ); + } + + private static List getTaxonomyStringFromCommonName( final String cn, final int max_lines_to_return ) + throws IOException { + return queryUniprot( "taxonomy/?query=common%3a%22" + encode( cn ) + "%22&format=tab", max_lines_to_return ); + } + + private static List getTaxonomyStringFromId( final String id, final int max_lines_to_return ) + throws IOException { + return queryUniprot( "taxonomy/?query=id%3a%22" + encode( id ) + "%22&format=tab", max_lines_to_return ); + } + + private static List getTaxonomyStringFromScientificName( final String sn, final int max_lines_to_return ) + throws IOException { 
+ return queryUniprot( "taxonomy/?query=scientific%3a%22" + encode( sn ) + "%22&format=tab", max_lines_to_return ); + } + + private static List getTaxonomyStringFromTaxonomyCode( final String code, final int max_lines_to_return ) + throws IOException { + return queryUniprot( "taxonomy/?query=mnemonic%3a%22" + encode( code ) + "%22&format=tab", max_lines_to_return ); + } + + private static List parseUniProtTaxonomy( final List result ) throws IOException { + final List taxonomies = new ArrayList(); + for( final String line : result ) { + if ( ForesterUtil.isEmpty( line ) ) { + // Ignore empty lines. + } + else if ( line.startsWith( "Taxon" ) ) { + final String[] items = line.split( "\t" ); + if ( !( items[ 1 ].equalsIgnoreCase( "Mnemonic" ) && items[ 2 ].equalsIgnoreCase( "Scientific name" ) + && items[ 3 ].equalsIgnoreCase( "Common name" ) && items[ 4 ].equalsIgnoreCase( "Synonym" ) + && items[ 5 ].equalsIgnoreCase( "Other Names" ) && items[ 6 ].equalsIgnoreCase( "Reviewed" ) + && items[ 7 ].equalsIgnoreCase( "Rank" ) && items[ 8 ].equalsIgnoreCase( "Lineage" ) ) ) { + throw new IOException( "Unreconized UniProt Taxonomy format: " + line ); + } + } + else { + if ( line.split( "\t" ).length > 4 ) { + taxonomies.add( new UniProtTaxonomy( line ) ); + } + } + } + return taxonomies; + } + private static List resolveFakeTaxonomyCodes( final int max_taxonomies_return, final String code ) throws IOException { if ( code.equals( "CAP" ) ) { @@ -276,131 +520,13 @@ public final class SequenceDbWsTools { } } - private static List getTaxonomyStringFromCommonName( final String cn, final int max_lines_to_return ) - throws IOException { - return queryUniprot( "taxonomy/?query=common%3a%22" + encode( cn ) + "%22&format=tab", max_lines_to_return ); - } - - private static List getTaxonomyStringFromId( final String id, final int max_lines_to_return ) - throws IOException { - return queryUniprot( "taxonomy/?query=id%3a%22" + encode( id ) + "%22&format=tab", max_lines_to_return ); - } - 
- private static List getTaxonomyStringFromScientificName( final String sn, final int max_lines_to_return ) - throws IOException { - return queryUniprot( "taxonomy/?query=scientific%3a%22" + encode( sn ) + "%22&format=tab", max_lines_to_return ); - } - - private static List getTaxonomyStringFromTaxonomyCode( final String code, final int max_lines_to_return ) - throws IOException { - return queryUniprot( "taxonomy/?query=mnemonic%3a%22" + encode( code ) + "%22&format=tab", max_lines_to_return ); - } - private static List uniProtTaxonomyToList( final UniProtTaxonomy tax ) { final List l = new ArrayList(); l.add( tax ); return l; } - private static List parseUniProtTaxonomy( final List result ) throws IOException { - final List taxonomies = new ArrayList(); - for( final String line : result ) { - if ( ForesterUtil.isEmpty( line ) ) { - // Ignore empty lines. - } - else if ( line.startsWith( "Taxon" ) ) { - final String[] items = line.split( "\t" ); - if ( !( items[ 1 ].equalsIgnoreCase( "Mnemonic" ) && items[ 2 ].equalsIgnoreCase( "Scientific name" ) - && items[ 3 ].equalsIgnoreCase( "Common name" ) && items[ 4 ].equalsIgnoreCase( "Synonym" ) - && items[ 5 ].equalsIgnoreCase( "Other Names" ) && items[ 6 ].equalsIgnoreCase( "Reviewed" ) - && items[ 7 ].equalsIgnoreCase( "Rank" ) && items[ 8 ].equalsIgnoreCase( "Lineage" ) ) ) { - throw new IOException( "Unreconized UniProt Taxonomy format: " + line ); - } - } - else { - if ( line.split( "\t" ).length > 4 ) { - taxonomies.add( new UniProtTaxonomy( line ) ); - } - } - } - return taxonomies; - } - - public static List queryEmblDb( final Identifier id, final int max_lines_to_return ) throws IOException { - final StringBuilder url_sb = new StringBuilder(); - url_sb.append( BASE_EMBL_DB_URL ); - if ( ForesterUtil.isEmpty( id.getProvider() ) || id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) { - url_sb.append( SequenceDbWsTools.EMBL_DBS_EMBL ); - url_sb.append( '/' ); - } - else if ( id.getProvider().equalsIgnoreCase( 
Identifier.REFSEQ ) ) { - if ( id.getValue().toUpperCase().indexOf( 'P' ) == 1 ) { - url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_P ); - url_sb.append( '/' ); - } - else { - url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_N ); - url_sb.append( '/' ); - } - } - return queryDb( id.getValue(), max_lines_to_return, url_sb.toString() ); - } - - public static List queryUniprot( final String query, final int max_lines_to_return ) throws IOException { - return queryDb( query, max_lines_to_return, BASE_UNIPROT_URL ); - } - - public static List queryDb( final String query, int max_lines_to_return, final String base_url ) - throws IOException { - if ( ForesterUtil.isEmpty( query ) ) { - throw new IllegalArgumentException( "illegal attempt to use empty query " ); - } - if ( max_lines_to_return < 1 ) { - max_lines_to_return = 1; - } - final URL url = new URL( base_url + query ); - if ( DEBUG ) { - System.out.println( "url: " + url.toString() ); - } - final URLConnection urlc = url.openConnection(); - final BufferedReader in = new BufferedReader( new InputStreamReader( urlc.getInputStream() ) ); - String line; - final List result = new ArrayList(); - while ( ( line = in.readLine() ) != null ) { - if ( DEBUG ) { - System.out.println( line ); - } - result.add( line ); - if ( result.size() > max_lines_to_return ) { - break; - } - } - in.close(); - try { - // To prevent accessing online dbs in too quick succession. 
- Thread.sleep( 20 ); - } - catch ( final InterruptedException e ) { - e.printStackTrace(); - } - return result; - } - - public static SequenceDatabaseEntry obtainUniProtEntry( final String query, final int max_lines_to_return ) - throws IOException { - final List lines = queryUniprot( "uniprot/" + query + ".txt", max_lines_to_return ); - return UniProtEntry.createInstanceFromPlainText( lines ); - } - - public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Identifier id, final int max_lines_to_return ) - throws IOException { - final List lines = queryEmblDb( id, max_lines_to_return ); - return EbiDbEntry.createInstanceFromPlainTextForRefSeq( lines ); - } - - public static SequenceDatabaseEntry obtainEmblEntry( final Identifier id, final int max_lines_to_return ) - throws IOException { - final List lines = queryEmblDb( id, max_lines_to_return ); - return EbiDbEntry.createInstanceFromPlainText( lines ); + public enum Db { + UNIPROT, EMBL, NCBI, NONE, REFSEQ; } } -- 1.7.10.2