X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fws%2Fseqdb%2FSequenceDbWsTools.java;h=cecef9f27a251e6251eee06bbedbde48206d6202;hb=482c9a54ff2b83c947c43449c7b0d86dc9c8dafd;hp=b56a6330457c998553478de66be69d6a5477662b;hpb=4cdad3c78a8ce85b18a977a916ed7ed91a350f3a;p=jalview.git diff --git a/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java b/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java index b56a633..cecef9f 100644 --- a/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java +++ b/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java @@ -21,11 +21,12 @@ // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA // // Contact: phylosoft @ gmail . com -// WWW: www.phylosoft.org/forester +// WWW: https://sites.google.com/site/cmzmasek/home/software/forester package org.forester.ws.seqdb; import java.io.BufferedReader; +import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; @@ -34,62 +35,52 @@ import java.net.URLConnection; import java.net.URLEncoder; import java.util.ArrayList; import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; +import java.util.SortedSet; +import java.util.TreeSet; +import org.forester.go.GoTerm; +import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.Accession; +import org.forester.phylogeny.data.Accession.Source; +import org.forester.phylogeny.data.Annotation; import org.forester.phylogeny.data.Identifier; +import org.forester.phylogeny.data.Sequence; +import org.forester.phylogeny.data.Taxonomy; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.util.ForesterUtil; +import org.forester.util.SequenceAccessionTools; public final class SequenceDbWsTools { - private static final boolean ALLOW_TAXONOMY_CODE_HACKS = true; //TODO turn off for final realease! + public final static String EMBL_REFSEQ = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=REFSEQ&style=raw&id="; + public final static String BASE_UNIPROT_URL = "http://www.uniprot.org/"; + public final static String EMBL_DBS_EMBL = "embl"; + public final static String EMBL_DBS_REFSEQ_N = "refseqn"; + public final static String EMBL_DBS_REFSEQ_P = "refseqp"; + private final static boolean DEBUG = true; + private final static String URL_ENC = "UTF-8"; + public final static int DEFAULT_LINES_TO_RETURN = 4000; - public enum Db { - UNKNOWN, UNIPROT; + final static String extractFrom( final String target, final String a ) { + final int i_a = target.indexOf( a ); + return target.substring( i_a + a.length() ).trim(); } - public final static String BASE_UNIPROT_URL = "http://www.uniprot.org/"; - public final static String BASE_EMBL_DB_URL = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/"; - public final static String EMBL_DBS_EMBL = "embl"; - public final static String EMBL_DBS_REFSEQ_P = "refseqp"; - public final static String EMBL_DBS_REFSEQ_N = "refseqn"; - private final static String URL_ENC = "UTF-8"; - // uniprot/expasy accession number format (6 chars): - // letter digit letter-or-digit letter-or-digit letter-or-digit digit - // ?: => no back-reference - // \A => begin of String - // \Z => end of String - private final static Pattern UNIPROT_AC_PATTERN = Pattern - .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d[A-Z0-9]{3}\\d)(?:[^a-zA-Z0-9]|\\Z)" ); - private final static boolean DEBUG = false; - private static String encode( final String str ) throws UnsupportedEncodingException { - return URLEncoder.encode( str.trim(), URL_ENC ); - } - - /** - * Returns null if no match. - * - * @param query - * @param db - * @return - */ - static public String parseUniProtAccessor( final String query ) { - final Matcher m = UNIPROT_AC_PATTERN.matcher( query ); - if ( m.lookingAt() ) { - return m.group( 1 ); - } - else { - return null; - } + final static String extractFromTo( final String target, final String a, final String b ) { + final int i_a = target.indexOf( a ); + final int i_b = target.indexOf( b ); + if ( ( i_a < 0 ) || ( i_b < i_a ) ) { + throw new IllegalArgumentException( "attempt to extract from \"" + target + "\" between \"" + a + + "\" and \"" + b + "\"" ); + } + return target.substring( i_a + a.length(), i_b ).trim(); } - public static List getTaxonomiesFromCommonName( final String cn, final int max_taxonomies_return ) - throws IOException { - final List result = getTaxonomyStringFromCommonName( cn, max_taxonomies_return ); - if ( result.size() > 0 ) { - return parseUniProtTaxonomy( result ); - } - return null; + final static String extractTo( final String target, final String b ) { + final int i_b = target.indexOf( b ); + return target.substring( 0, i_b ).trim(); } public static List getTaxonomiesFromCommonNameStrict( final String cn, @@ -117,26 +108,6 @@ public final class SequenceDbWsTools { return null; } - public static List getTaxonomiesFromScientificName( final String sn, - final int max_taxonomies_return ) - throws IOException { - // Hack! Craniata? .. - if ( sn.equals( "Drosophila" ) ) { - return uniProtTaxonomyToList( UniProtTaxonomy.DROSOPHILA_GENUS ); - } - else if ( sn.equals( "Xenopus" ) ) { - return uniProtTaxonomyToList( UniProtTaxonomy.XENOPUS_GENUS ); - } - // else if ( sn.equals( "Nucleariidae and Fonticula group" ) ) { - // return hack( UniProtTaxonomy.NUCLEARIIDAE_AND_FONTICULA ); - // } - final List result = getTaxonomyStringFromScientificName( sn, max_taxonomies_return ); - if ( result.size() > 0 ) { - return parseUniProtTaxonomy( result ); - } - return null; - } - /** * Does not return "sub-types". * For example, for "Mus musculus" only returns "Mus musculus" @@ -163,12 +134,6 @@ public final class SequenceDbWsTools { final int max_taxonomies_return ) throws IOException { final String my_code = new String( code ); - if ( ALLOW_TAXONOMY_CODE_HACKS ) { - final List l = resolveFakeTaxonomyCodes( max_taxonomies_return, my_code ); - if ( l != null ) { - return l; - } - } final List result = getTaxonomyStringFromTaxonomyCode( my_code, max_taxonomies_return ); if ( result.size() > 0 ) { return parseUniProtTaxonomy( result ); @@ -176,104 +141,253 @@ public final class SequenceDbWsTools { return null; } - private static List resolveFakeTaxonomyCodes( final int max_taxonomies_return, final String code ) + public static SequenceDatabaseEntry obtainEmblEntry( final Accession id, final int max_lines_to_return ) throws IOException { - if ( code.equals( "CAP" ) ) { - return getTaxonomiesFromId( "283909", max_taxonomies_return ); - } - else if ( code.equals( "FUGRU" ) ) { - return getTaxonomiesFromId( "31033", max_taxonomies_return ); - } - else if ( code.equals( "GIALA" ) ) { - return getTaxonomiesFromId( "5741", max_taxonomies_return ); - } - else if ( code.equals( "TRIVE" ) ) { - return getTaxonomiesFromId( "413071", max_taxonomies_return ); - } - else if ( code.equals( "CAPOWC" ) ) { - return getTaxonomiesFromId( "192875", max_taxonomies_return ); - } - else if ( code.equals( "SPHARC" ) ) { - return getTaxonomiesFromId( "667725", max_taxonomies_return ); - } - else if ( code.equals( "THETRA" ) ) { - return getTaxonomiesFromId( "529818", max_taxonomies_return ); - } - else if ( code.equals( "CHLVUL" ) ) { - return getTaxonomiesFromId( "574566", max_taxonomies_return ); - } - else if ( code.equals( "CITCLE" ) ) { - return getTaxonomiesFromId( "85681", max_taxonomies_return ); - } - else if ( code.equals( "MYCPOP" ) ) { - return getTaxonomiesFromId( "85929", max_taxonomies_return ); - } - else if ( code.equals( "AGABB" ) ) { - return getTaxonomiesFromId( "597362", max_taxonomies_return ); - } - else if ( code.equals( "BAUCOM" ) ) { - return getTaxonomiesFromId( "430998", max_taxonomies_return ); + final List lines = queryEmblDb( id, max_lines_to_return ); + return EbiDbEntry.createInstanceFromPlainText( lines ); + } + + public final static Accession obtainSeqAccession( final PhylogenyNode node ) { + Accession acc = SequenceAccessionTools.obtainFromSeqAccession( node ); + if ( !isAccessionAcceptable( acc ) ) { + acc = SequenceAccessionTools.obtainAccessorFromDataFields( node ); } - else if ( code.equals( "DICSQU" ) ) { - return getTaxonomiesFromId( "114155", max_taxonomies_return ); + return acc; + } + + public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Accession id, final int max_lines_to_return ) + throws IOException { + final List lines = queryEmblDbForRefSeqEntry( id, max_lines_to_return ); + return EbiDbEntry.createInstanceFromPlainTextForRefSeq( lines ); + } + + public final static void obtainSeqInformation( final boolean allow_to_set_taxonomic_data, + final int lines_to_return, + final SortedSet not_found, + final PhylogenyNode node ) throws IOException { + final Accession acc = obtainSeqAccession( node ); + if ( !isAccessionAcceptable( acc ) ) { + if ( node.isExternal() || !node.isEmpty() ) { + not_found.add( node.toString() ); + } } - else if ( code.equals( "FOMPIN" ) ) { - return getTaxonomiesFromId( "40483", max_taxonomies_return ); + else { + addDataFromDbToNode( allow_to_set_taxonomic_data, lines_to_return, not_found, node, acc ); } - else if ( code.equals( "HYDMA" ) ) { - return getTaxonomiesFromId( "6085", max_taxonomies_return ); + } + + public final static void obtainSeqInformation( final boolean allow_to_set_taxonomic_data, + final SortedSet not_found, + final PhylogenyNode node ) throws IOException { + obtainSeqInformation( allow_to_set_taxonomic_data, DEFAULT_LINES_TO_RETURN, not_found, node ); + } + + public final static void obtainSeqInformation( final PhylogenyNode node ) throws IOException { + obtainSeqInformation( true, DEFAULT_LINES_TO_RETURN, new TreeSet(), node ); + } + + public final static SortedSet obtainSeqInformation( final Phylogeny phy, + final boolean ext_nodes_only, + final boolean allow_to_set_taxonomic_data, + final int lines_to_return ) throws IOException { + final SortedSet not_found = new TreeSet(); + for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) { + final PhylogenyNode node = iter.next(); + if ( node.isExternal() || !ext_nodes_only ) { + obtainSeqInformation( allow_to_set_taxonomic_data, lines_to_return, not_found, node ); + } } - else if ( code.equals( "MYCFI" ) ) { - return getTaxonomiesFromId( "83344", max_taxonomies_return ); + return not_found; + } + + public static SequenceDatabaseEntry obtainUniProtEntry( final String query, final int max_lines_to_return ) + throws IOException { + final List lines = queryUniprot( "uniprot/" + query + ".txt", max_lines_to_return ); + return UniProtEntry.createInstanceFromPlainText( lines ); + } + + public static List queryDb( final String query, int max_lines_to_return, final String base_url ) + throws IOException { + if ( ForesterUtil.isEmpty( query ) ) { + throw new IllegalArgumentException( "illegal attempt to use empty query " ); } - else if ( code.equals( "OIDMAI" ) ) { - return getTaxonomiesFromId( "78148", max_taxonomies_return ); + if ( max_lines_to_return < 1 ) { + max_lines_to_return = 1; } - else if ( code.equals( "OSTRC" ) ) { - return getTaxonomiesFromId( "385169", max_taxonomies_return ); + final URL url = new URL( base_url + query ); + if ( DEBUG ) { + System.out.println( "url: " + url.toString() ); } - else if ( code.equals( "POSPL" ) ) { - return getTaxonomiesFromId( "104341", max_taxonomies_return ); + final URLConnection urlc = url.openConnection(); + final BufferedReader in = new BufferedReader( new InputStreamReader( urlc.getInputStream() ) ); + String line; + final List result = new ArrayList(); + while ( ( line = in.readLine() ) != null ) { + if ( DEBUG ) { + System.out.println( line ); + } + result.add( line ); + if ( result.size() > max_lines_to_return ) { + break; + } } - else if ( code.equals( "SAICOM" ) ) { - return getTaxonomiesFromId( "5606", max_taxonomies_return ); + in.close(); + try { + // To prevent accessing online dbs in too quick succession. + Thread.sleep( 20 ); } - else if ( code.equals( "SERLA" ) ) { - return getTaxonomiesFromId( "85982", max_taxonomies_return ); + catch ( final InterruptedException e ) { + e.printStackTrace(); } - else if ( code.equals( "SPORO" ) ) { - return getTaxonomiesFromId( "40563", max_taxonomies_return ); + return result; + } + + public static List queryEmblDbForRefSeqEntry( final Accession id, final int max_lines_to_return ) + throws IOException { + final StringBuilder url_sb = new StringBuilder(); + url_sb.append( EMBL_REFSEQ ); + return queryDb( id.getValue(), max_lines_to_return, url_sb.toString() ); + } + + public static List queryEmblDb( final Accession id, final int max_lines_to_return ) throws IOException { + final StringBuilder url_sb = new StringBuilder(); + // url_sb.append( BASE_EMBL_DB_URL ); + if ( ForesterUtil.isEmpty( id.getSource() ) || ( id.getSource().equals( Source.NCBI.toString() ) ) ) { + url_sb.append( EMBL_DBS_EMBL ); + url_sb.append( '/' ); } - else if ( code.equals( "ACRALC" ) ) { - return getTaxonomiesFromId( "398408", max_taxonomies_return ); + else if ( id.getSource().equals( Source.REFSEQ.toString() ) ) { + url_sb.append( EMBL_REFSEQ ); + // if ( id.getValue().toUpperCase().indexOf( 'P' ) == 1 ) { + // url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_P ); + // url_sb.append( '/' ); + // } + // else { + // url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_N ); + // url_sb.append( '/' ); + // } } - else if ( code.equals( "THITER" ) ) { - return getTaxonomiesFromId( "35720", max_taxonomies_return ); + return queryDb( id.getValue(), max_lines_to_return, url_sb.toString() ); + } + + public static List queryUniprot( final String query, final int max_lines_to_return ) throws IOException { + return queryDb( query, max_lines_to_return, BASE_UNIPROT_URL ); + } + + private static void addDataFromDbToNode( final boolean allow_to_set_taxonomic_data, + final int lines_to_return, + final SortedSet not_found, + final PhylogenyNode node, + final Accession acc ) throws IOException { + SequenceDatabaseEntry db_entry = null; + final String query = acc.getValue(); + if ( acc.getSource().equals( Source.UNIPROT.toString() ) ) { + if ( DEBUG ) { + System.out.println( "uniprot: " + query ); + } + try { + db_entry = obtainUniProtEntry( query, lines_to_return ); + } + catch ( final FileNotFoundException e ) { + // Eat this, and move to next. + } } - else if ( code.equals( "MYCTHE" ) ) { - return getTaxonomiesFromId( "78579", max_taxonomies_return ); + else if ( acc.getSource().equals( Source.EMBL.toString() ) ) { + if ( DEBUG ) { + System.out.println( "embl: " + query ); + } + try { + db_entry = obtainEmblEntry( new Accession( query ), lines_to_return ); + } + catch ( final FileNotFoundException e ) { + // Eat this, and move to next. + } } - else if ( code.equals( "CONPUT" ) ) { - return getTaxonomiesFromId( "80637", max_taxonomies_return ); + else if ( acc.getSource().equals( Source.REFSEQ.toString() ) ) { + if ( DEBUG ) { + System.out.println( "refseq: " + query ); + } + try { + db_entry = obtainRefSeqEntryFromEmbl( new Accession( query ), lines_to_return ); + } + catch ( final FileNotFoundException e ) { + // Eat this, and move to next. + } } - else if ( code.equals( "WOLCOC" ) ) { - return getTaxonomiesFromId( "81056", max_taxonomies_return ); + if ( ( db_entry != null ) && !db_entry.isEmpty() ) { + final Sequence seq = node.getNodeData().isHasSequence() ? node.getNodeData().getSequence() : new Sequence(); + if ( !ForesterUtil.isEmpty( db_entry.getAccession() ) ) { + seq.setAccession( new Accession( db_entry.getAccession(), acc.getSource() ) ); + } + if ( !ForesterUtil.isEmpty( db_entry.getSequenceName() ) ) { + seq.setName( db_entry.getSequenceName() ); + } + if ( !ForesterUtil.isEmpty( db_entry.getGeneName() ) ) { + seq.setGeneName( db_entry.getGeneName() ); + } + if ( !ForesterUtil.isEmpty( db_entry.getSequenceSymbol() ) ) { + try { + seq.setSymbol( db_entry.getSequenceSymbol() ); + } + catch ( final PhyloXmlDataFormatException e ) { + // Eat this exception. + } + } + if ( ( db_entry.getGoTerms() != null ) && !db_entry.getGoTerms().isEmpty() ) { + for( final GoTerm go : db_entry.getGoTerms() ) { + final Annotation ann = new Annotation( go.getGoId().getId() ); + ann.setDesc( go.getName() ); + seq.addAnnotation( ann ); + } + } + if ( ( db_entry.getCrossReferences() != null ) && !db_entry.getCrossReferences().isEmpty() ) { + for( final Accession x : db_entry.getCrossReferences() ) { + seq.addCrossReference( x ); + } + } + final Taxonomy tax = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy() : new Taxonomy(); + if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyScientificName() ) ) { + tax.setScientificName( db_entry.getTaxonomyScientificName() ); + } + if ( allow_to_set_taxonomic_data && !ForesterUtil.isEmpty( db_entry.getTaxonomyIdentifier() ) ) { + tax.setIdentifier( new Identifier( db_entry.getTaxonomyIdentifier(), "uniprot" ) ); + } + node.getNodeData().setTaxonomy( tax ); + node.getNodeData().setSequence( seq ); } - else if ( code.equals( "CLAGRA" ) ) { - return getTaxonomiesFromId( "27339", max_taxonomies_return ); + else { + if ( node.isExternal() || !node.isEmpty() ) { + not_found.add( node.toString() ); + } } - else if ( code.equals( "XANPAR" ) ) { - return getTaxonomiesFromId( "107463", max_taxonomies_return ); + try { + Thread.sleep( 10 );// Sleep for 10 ms } - else if ( code.equals( "HYDPIN" ) ) { - return getTaxonomiesFromId( "388859", max_taxonomies_return ); + catch ( final InterruptedException ie ) { } - else if ( code.equals( "SERLAC" ) ) { - return getTaxonomiesFromId( "85982", max_taxonomies_return ); + } + + private static String encode( final String str ) throws UnsupportedEncodingException { + return URLEncoder.encode( str.trim(), URL_ENC ); + } + + private static List getTaxonomiesFromCommonName( final String cn, final int max_taxonomies_return ) + throws IOException { + final List result = getTaxonomyStringFromCommonName( cn, max_taxonomies_return ); + if ( result.size() > 0 ) { + return parseUniProtTaxonomy( result ); } - else { - return null; + return null; + } + + private static List getTaxonomiesFromScientificName( final String sn, + final int max_taxonomies_return ) + throws IOException { + final List result = getTaxonomyStringFromScientificName( sn, max_taxonomies_return ); + if ( result.size() > 0 ) { + return parseUniProtTaxonomy( result ); } + return null; } private static List getTaxonomyStringFromCommonName( final String cn, final int max_lines_to_return ) @@ -296,10 +410,11 @@ public final class SequenceDbWsTools { return queryUniprot( "taxonomy/?query=mnemonic%3a%22" + encode( code ) + "%22&format=tab", max_lines_to_return ); } - private static List uniProtTaxonomyToList( final UniProtTaxonomy tax ) { - final List l = new ArrayList(); - l.add( tax ); - return l; + private final static boolean isAccessionAcceptable( final Accession acc ) { + return ( !( ( acc == null ) || ForesterUtil.isEmpty( acc.getSource() ) || ForesterUtil.isEmpty( acc.getValue() ) || ( ( acc + .getSource().equals( Source.UNIPROT.toString() ) ) + && ( acc.getSource().toString().equals( Source.EMBL.toString() ) ) && ( acc.getSource().toString() + .equals( Source.REFSEQ.toString() ) ) ) ) ); } private static List parseUniProtTaxonomy( final List result ) throws IOException { @@ -325,82 +440,4 @@ public final class SequenceDbWsTools { } return taxonomies; } - - public static List queryEmblDb( final Identifier id, final int max_lines_to_return ) throws IOException { - final StringBuilder url_sb = new StringBuilder(); - url_sb.append( BASE_EMBL_DB_URL ); - if ( ForesterUtil.isEmpty( id.getProvider() ) || id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) { - url_sb.append( SequenceDbWsTools.EMBL_DBS_EMBL ); - url_sb.append( '/' ); - } - else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) { - if ( id.getValue().toUpperCase().indexOf( 'P' ) == 1 ) { - url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_P ); - url_sb.append( '/' ); - } - else { - url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_N ); - url_sb.append( '/' ); - } - } - return queryDb( id.getValue(), max_lines_to_return, url_sb.toString() ); - } - - public static List queryUniprot( final String query, final int max_lines_to_return ) throws IOException { - return queryDb( query, max_lines_to_return, BASE_UNIPROT_URL ); - } - - public static List queryDb( final String query, int max_lines_to_return, final String base_url ) - throws IOException { - if ( ForesterUtil.isEmpty( query ) ) { - throw new IllegalArgumentException( "illegal attempt to use empty query " ); - } - if ( max_lines_to_return < 1 ) { - max_lines_to_return = 1; - } - final URL url = new URL( base_url + query ); - if ( DEBUG ) { - System.out.println( "url: " + url.toString() ); - } - final URLConnection urlc = url.openConnection(); - final BufferedReader in = new BufferedReader( new InputStreamReader( urlc.getInputStream() ) ); - String line; - final List result = new ArrayList(); - while ( ( line = in.readLine() ) != null ) { - if ( DEBUG ) { - System.out.println( line ); - } - result.add( line ); - if ( result.size() > max_lines_to_return ) { - break; - } - } - in.close(); - try { - // To prevent accessing online dbs in too quick succession. - Thread.sleep( 20 ); - } - catch ( final InterruptedException e ) { - e.printStackTrace(); - } - return result; - } - - public static SequenceDatabaseEntry obtainUniProtEntry( final String query, final int max_lines_to_return ) - throws IOException { - final List lines = queryUniprot( "uniprot/" + query + ".txt", max_lines_to_return ); - return UniProtEntry.createInstanceFromPlainText( lines ); - } - - public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Identifier id, final int max_lines_to_return ) - throws IOException { - final List lines = queryEmblDb( id, max_lines_to_return ); - return EbiDbEntry.createInstanceFromPlainTextForRefSeq( lines ); - } - - public static SequenceDatabaseEntry obtainEmblEntry( final Identifier id, final int max_lines_to_return ) - throws IOException { - final List lines = queryEmblDb( id, max_lines_to_return ); - return EbiDbEntry.createInstanceFromPlainText( lines ); - } }