X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fws%2Fseqdb%2FSequenceDbWsTools.java;h=a54a3b92bf727975379f04c6df65b3995c3a6303;hb=0c6757717df4ae39a92fcea587461481dc2cde1d;hp=f5f83e49343878a0f73bd18634cb512a9b4401b4;hpb=f8ecfc1d254f7f413ee5c47fbb012b609f7f4fd1;p=jalview.git diff --git a/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java b/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java index f5f83e4..a54a3b9 100644 --- a/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java +++ b/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java @@ -26,6 +26,7 @@ package org.forester.ws.seqdb; import java.io.BufferedReader; +import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; @@ -42,32 +43,29 @@ import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.data.Accession; +import org.forester.phylogeny.data.Accession.Source; import org.forester.phylogeny.data.Annotation; import org.forester.phylogeny.data.Identifier; import org.forester.phylogeny.data.Sequence; import org.forester.phylogeny.data.Taxonomy; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.sequence.MolecularSequence.TYPE; import org.forester.util.ForesterUtil; -import org.forester.util.SequenceIdParser; +import org.forester.util.SequenceAccessionTools; public final class SequenceDbWsTools { - public final static String BASE_UNIPROT_URL = "http://www.uniprot.org/"; - public final static String BASE_EMBL_DB_URL = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/"; - public final static String EMBL_DBS_EMBL = "embl"; - public final static String EMBL_DBS_REFSEQ_P = "refseqp"; - public final static String EMBL_DBS_REFSEQ_N = "refseqn"; - private final static String URL_ENC = "UTF-8"; - private final static boolean DEBUG = false; - - private static List getTaxonomiesFromCommonName( final String cn, final int max_taxonomies_return ) - throws IOException { - final List result = getTaxonomyStringFromCommonName( cn, max_taxonomies_return ); - if ( result.size() > 0 ) { - return parseUniProtTaxonomy( result ); - } - return null; - } + public final static String BASE_UNIPROT_URL = "http://www.uniprot.org/"; + public final static int DEFAULT_LINES_TO_RETURN = 4000; + public final static String EMBL_DBS_REFSEQ_N = "refseqn"; + public final static String EMBL_DBS_REFSEQ_P = "refseqp"; + public final static String EMBL_GENBANK = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=GENBANK&style=raw&id="; + public final static String EMBL_REFSEQ = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=REFSEQ&style=raw&id="; + public final static String EMBL_EMBL = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=EMBL&style=raw&id="; + private final static boolean DEBUG = false; + private final static String URL_ENC = "UTF-8"; + private final static int SLEEP = 200; + private static final boolean ALLOW_TO_OVERWRITE_MOL_SEQ = false; public static List getTaxonomiesFromCommonNameStrict( final String cn, final int max_taxonomies_return ) @@ -94,21 +92,11 @@ public final class SequenceDbWsTools { return null; } - private static List getTaxonomiesFromScientificName( final String sn, - final int max_taxonomies_return ) - throws IOException { - final List result = getTaxonomyStringFromScientificName( sn, max_taxonomies_return ); - if ( result.size() > 0 ) { - return parseUniProtTaxonomy( result ); - } - return null; - } - /** * Does not return "sub-types". * For example, for "Mus musculus" only returns "Mus musculus" * and not "Mus musculus", "Mus musculus bactrianus", ... - * + * */ public static List getTaxonomiesFromScientificNameStrict( final String sn, final int max_taxonomies_return ) @@ -137,219 +125,103 @@ public final class SequenceDbWsTools { return null; } - public static SequenceDatabaseEntry obtainEmblEntry( final Identifier id, final int max_lines_to_return ) + public static SequenceDatabaseEntry obtainEmblEntry( final Accession acc ) throws IOException { + return obtainEmblEntry( acc, DEFAULT_LINES_TO_RETURN ); + } + + public static SequenceDatabaseEntry obtainEmblEntry( final Accession acc, final int max_lines_to_return ) throws IOException { - final List lines = queryEmblDb( id, max_lines_to_return ); - return EbiDbEntry.createInstanceFromPlainText( lines ); + final List lines = queryEmblDb( acc, max_lines_to_return ); + return EbiDbEntry.createInstance( lines ); + } + + public static SequenceDatabaseEntry obtainEntry( final String acc_str ) throws IOException { + if ( ForesterUtil.isEmpty( acc_str ) ) { + throw new IllegalArgumentException( "cannot not extract sequence db accessor from null or empty string" ); + } + final Accession acc = SequenceAccessionTools.parseAccessorFromString( acc_str ); + if ( acc == null ) { + throw new IllegalArgumentException( "could not extract acceptable sequence db accessor from \"" + acc_str + + "\"" ); + } + if ( acc.getSource().equals( Source.REFSEQ.toString() ) || acc.getSource().equals( Source.EMBL.toString() ) + || acc.getSource().equals( Source.NCBI.toString() ) ) { + return obtainEmblEntry( acc, DEFAULT_LINES_TO_RETURN ); + } + else if ( acc.getSource().equals( Source.UNIPROT.toString() ) ) { + return obtainUniProtEntry( acc.getValue(), DEFAULT_LINES_TO_RETURN ); + } + else { + throw new IllegalArgumentException( "don't know how to handle request for source \"" + acc.getSource() + + "\"" ); + } + } + + public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Accession acc ) throws IOException { + return obtainRefSeqEntryFromEmbl( acc, DEFAULT_LINES_TO_RETURN ); } - public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Identifier id, final int max_lines_to_return ) + public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Accession acc, final int max_lines_to_return ) throws IOException { - final List lines = queryEmblDb( id, max_lines_to_return ); - return EbiDbEntry.createInstanceFromPlainTextForRefSeq( lines ); + final List lines = queryEmblDbForRefSeqEntry( acc, max_lines_to_return ); + return EbiDbEntry.createInstance( lines ); + } + + public final static Accession obtainSeqAccession( final PhylogenyNode node ) { + Accession acc = SequenceAccessionTools.obtainFromSeqAccession( node ); + if ( !isAccessionAcceptable( acc ) ) { + acc = SequenceAccessionTools.obtainAccessorFromDataFields( node ); + } + return acc; + } + + public final static void obtainSeqInformation( final boolean allow_to_set_taxonomic_data, + final int lines_to_return, + final SortedSet not_found, + final PhylogenyNode node ) throws IOException { + final Accession acc = obtainSeqAccession( node ); + if ( !isAccessionAcceptable( acc ) ) { + if ( node.isExternal() || !node.isEmpty() ) { + not_found.add( node.toString() ); + } + } + else { + addDataFromDbToNode( allow_to_set_taxonomic_data, lines_to_return, not_found, node, acc ); + } + } + + public final static void obtainSeqInformation( final boolean allow_to_set_taxonomic_data, + final SortedSet not_found, + final PhylogenyNode node ) throws IOException { + obtainSeqInformation( allow_to_set_taxonomic_data, DEFAULT_LINES_TO_RETURN, not_found, node ); } - public static SortedSet obtainSeqInformation( final Phylogeny phy, - final boolean ext_nodes_only, - final boolean allow_to_set_taxonomic_data, - final int lines_to_return ) throws IOException { + public final static SortedSet obtainSeqInformation( final Phylogeny phy, + final boolean ext_nodes_only, + final boolean allow_to_set_taxonomic_data, + final int lines_to_return ) throws IOException { final SortedSet not_found = new TreeSet(); for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) { final PhylogenyNode node = iter.next(); - if ( ext_nodes_only && node.isInternal() ) { - continue; - } - String query = null; - Identifier id = null; - Db db = Db.NONE; - if ( node.getNodeData().isHasSequence() - && ( node.getNodeData().getSequence().getAccession() != null ) - && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) - && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) - && ( node.getNodeData().getSequence().getAccession().getValue().toLowerCase() - .startsWith( "uniprot" ) - || node.getNodeData().getSequence().getAccession().getValue().toLowerCase() - .startsWith( "swissprot" ) - || node.getNodeData().getSequence().getAccession().getValue().toLowerCase() - .startsWith( "trembl" ) - || node.getNodeData().getSequence().getAccession().getValue().toLowerCase() - .startsWith( "sp" ) || node.getNodeData().getSequence().getAccession().getValue() - .toLowerCase().startsWith( "uniprotkb" ) ) ) { - query = node.getNodeData().getSequence().getAccession().getValue(); - db = Db.UNIPROT; - } - else if ( node.getNodeData().isHasSequence() - && ( node.getNodeData().getSequence().getAccession() != null ) - && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) - && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) - && ( node.getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "embl" ) || node - .getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "ebi" ) ) ) { - query = node.getNodeData().getSequence().getAccession().getValue(); - db = Db.EMBL; - } - else if ( node.getNodeData().isHasSequence() - && ( node.getNodeData().getSequence().getAccession() != null ) - && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) - && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) - && ( node.getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "ncbi" ) || node - .getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "genbank" ) ) ) { - query = node.getNodeData().getSequence().getAccession().getValue(); - // db = Db.NCBI; - } - else if ( node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null ) - && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) - && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) - && node.getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "refseq" ) ) { - query = node.getNodeData().getSequence().getAccession().getValue(); - db = Db.REFSEQ; - } - else { - if ( ( query = ForesterUtil.extractUniProtKbProteinSeqIdentifier( node ) ) != null ) { - db = Db.UNIPROT; - } - else if ( node.getNodeData().isHasSequence() ) { - if ( ( id = SequenceIdParser.parse( node.getName() ) ) != null ) { - if ( id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) { - // db = Db.NCBI; - } - else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) { - db = Db.REFSEQ; - } - } - else if ( ( id = SequenceIdParser.parse( node.getNodeData().getSequence().getName() ) ) != null ) { - if ( id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) { - // = Db.NCBI; - } - else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) { - db = Db.REFSEQ; - } - } - else if ( ( id = SequenceIdParser.parse( node.getNodeData().getSequence().getGeneName() ) ) != null ) { - if ( id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) { - // db = Db.NCBI; - } - else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) { - db = Db.REFSEQ; - } - } - else if ( ( id = SequenceIdParser.parse( node.getNodeData().getSequence().getSymbol() ) ) != null ) { - if ( id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) { - // db = Db.NCBI; - } - else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) { - db = Db.REFSEQ; - } - } - } - } - if ( db == Db.NONE ) { - not_found.add( node.toString() ); - } - SequenceDatabaseEntry db_entry = null; - if ( !ForesterUtil.isEmpty( query ) ) { - if ( db == Db.UNIPROT ) { - if ( DEBUG ) { - System.out.println( "uniprot: " + query ); - } - db_entry = obtainUniProtEntry( query, lines_to_return ); - } - else if ( db == Db.EMBL ) { - if ( DEBUG ) { - System.out.println( "embl: " + query ); - } - db_entry = obtainEmblEntry( new Identifier( query ), lines_to_return ); - } - else if ( db == Db.REFSEQ ) { - if ( DEBUG ) { - System.out.println( "refseq: " + query ); - } - db_entry = obtainRefSeqEntryFromEmbl( new Identifier( query ), lines_to_return ); - } - // else if ( db == Db.NCBI ) { - // if ( DEBUG ) { - // System.out.println( "ncbi: " + query ); - // } - // db_entry = obtainNcbiEntry( new Identifier( query ), lines_to_return ); - // } - } - else if ( ( db == Db.REFSEQ ) && ( id != null ) ) { - db_entry = obtainRefSeqEntryFromEmbl( id, lines_to_return ); - } - //else if ( ( db == Db.NCBI ) && ( id != null ) ) { - // db_entry = obtainNcbiEntry( id, lines_to_return ); - //} - if ( ( db_entry != null ) && !db_entry.isEmpty() ) { - final Sequence seq = node.getNodeData().isHasSequence() ? node.getNodeData().getSequence() - : new Sequence(); - if ( !ForesterUtil.isEmpty( db_entry.getAccession() ) ) { - String type = null; - if ( db == Db.EMBL ) { - type = "embl"; - } - else if ( db == Db.UNIPROT ) { - type = "uniprot"; - } - // else if ( db == Db.NCBI ) { - // type = "ncbi"; - // } - else if ( db == Db.REFSEQ ) { - type = "refseq"; - } - seq.setAccession( new Accession( db_entry.getAccession(), type ) ); - } - if ( !ForesterUtil.isEmpty( db_entry.getSequenceName() ) ) { - seq.setName( db_entry.getSequenceName() ); - } - if ( !ForesterUtil.isEmpty( db_entry.getGeneName() ) ) { - seq.setGeneName( db_entry.getGeneName() ); - } - if ( !ForesterUtil.isEmpty( db_entry.getSequenceSymbol() ) ) { - try { - seq.setSymbol( db_entry.getSequenceSymbol() ); - } - catch ( final PhyloXmlDataFormatException e ) { - // Eat this exception. - } - } - if ( ( db_entry.getGoTerms() != null ) && !db_entry.getGoTerms().isEmpty() ) { - for( final GoTerm go : db_entry.getGoTerms() ) { - final Annotation ann = new Annotation( go.getGoId().getId() ); - ann.setDesc( go.getName() ); - seq.addAnnotation( ann ); - } - } - if ( ( db_entry.getCrossReferences() != null ) && !db_entry.getCrossReferences().isEmpty() ) { - for( final Accession x : db_entry.getCrossReferences() ) { - seq.addCrossReference( x ); - } - } - final Taxonomy tax = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy() - : new Taxonomy(); - if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyScientificName() ) ) { - tax.setScientificName( db_entry.getTaxonomyScientificName() ); - } - if ( allow_to_set_taxonomic_data && !ForesterUtil.isEmpty( db_entry.getTaxonomyIdentifier() ) ) { - tax.setIdentifier( new Identifier( db_entry.getTaxonomyIdentifier(), "uniprot" ) ); - } - node.getNodeData().setTaxonomy( tax ); - node.getNodeData().setSequence( seq ); - } - else if ( db != Db.NONE ) { - not_found.add( node.getName() ); - } - try { - Thread.sleep( 10 );// Sleep for 10 ms - } - catch ( final InterruptedException ie ) { + if ( node.isExternal() || !ext_nodes_only ) { + obtainSeqInformation( allow_to_set_taxonomic_data, lines_to_return, not_found, node ); } } return not_found; } + public final static void obtainSeqInformation( final PhylogenyNode node ) throws IOException { + obtainSeqInformation( true, DEFAULT_LINES_TO_RETURN, new TreeSet(), node ); + } + + public static SequenceDatabaseEntry obtainUniProtEntry( final String query ) throws IOException { + return obtainUniProtEntry( query, DEFAULT_LINES_TO_RETURN ); + } + public static SequenceDatabaseEntry obtainUniProtEntry( final String query, final int max_lines_to_return ) throws IOException { final List lines = queryUniprot( "uniprot/" + query + ".txt", max_lines_to_return ); - return UniProtEntry.createInstanceFromPlainText( lines ); + return UniProtEntry.createInstance( lines ); } public static List queryDb( final String query, int max_lines_to_return, final String base_url ) @@ -379,8 +251,8 @@ public final class SequenceDbWsTools { } in.close(); try { - // To prevent accessing online dbs in too quick succession. - Thread.sleep( 20 ); + // To prevent accessing online dbs in too quick succession. + Thread.sleep( SLEEP ); } catch ( final InterruptedException e ) { e.printStackTrace(); @@ -388,23 +260,32 @@ public final class SequenceDbWsTools { return result; } - public static List queryEmblDb( final Identifier id, final int max_lines_to_return ) throws IOException { + public static List queryEmblDb( final Accession acc, final int max_lines_to_return ) throws IOException { final StringBuilder url_sb = new StringBuilder(); - url_sb.append( BASE_EMBL_DB_URL ); - if ( ForesterUtil.isEmpty( id.getProvider() ) || id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) { - url_sb.append( SequenceDbWsTools.EMBL_DBS_EMBL ); - url_sb.append( '/' ); + // url_sb.append( BASE_EMBL_DB_URL ); + if ( DEBUG ) { + System.out.println( "source: " + acc.getSource() ); } - else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) { - if ( id.getValue().toUpperCase().indexOf( 'P' ) == 1 ) { - url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_P ); - url_sb.append( '/' ); - } - else { - url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_N ); - url_sb.append( '/' ); - } + if ( acc.getSource().equals( Source.NCBI.toString() ) ) { + url_sb.append( EMBL_GENBANK ); + //url_sb.append( '/' ); + } + else if ( acc.getSource().equals( Source.REFSEQ.toString() ) ) { + url_sb.append( EMBL_REFSEQ ); + } + else if ( acc.getSource().equals( Source.EMBL.toString() ) ) { + url_sb.append( EMBL_EMBL ); } + else { + throw new IllegalArgumentException( "unable to handle source: " + acc.getSource() ); + } + return queryDb( acc.getValue(), max_lines_to_return, url_sb.toString() ); + } + + public static List queryEmblDbForRefSeqEntry( final Accession id, final int max_lines_to_return ) + throws IOException { + final StringBuilder url_sb = new StringBuilder(); + url_sb.append( EMBL_REFSEQ ); return queryDb( id.getValue(), max_lines_to_return, url_sb.toString() ); } @@ -412,10 +293,177 @@ public final class SequenceDbWsTools { return queryDb( query, max_lines_to_return, BASE_UNIPROT_URL ); } + final static String extractFrom( final String target, final String a ) { + final int i_a = target.indexOf( a ); + return target.substring( i_a + a.length() ).trim(); + } + + final static String extractFromTo( final String target, final String a, final String b ) { + final int i_a = target.indexOf( a ); + final int i_b = target.indexOf( b ); + if ( ( i_a < 0 ) || ( i_b < i_a ) ) { + return ""; + } + return target.substring( i_a + a.length(), i_b ).trim(); + } + + final static String extractTo( final String target, final String b ) { + final int i_b = target.indexOf( b ); + return target.substring( 0, i_b ).trim(); + } + + private static void addDataFromDbToNode( final boolean allow_to_set_taxonomic_data, + final int lines_to_return, + final SortedSet not_found, + final PhylogenyNode node, + final Accession acc ) throws IOException { + SequenceDatabaseEntry db_entry = null; + final String query = acc.getValue(); + if ( acc.getSource().equals( Source.UNIPROT.toString() ) ) { + if ( DEBUG ) { + System.out.println( "uniprot: " + query ); + } + try { + db_entry = obtainUniProtEntry( query, lines_to_return ); + } + catch ( final FileNotFoundException e ) { + // Eat this, and move to next. + } + } + else if ( acc.getSource().equals( Source.REFSEQ.toString() ) ) { + if ( DEBUG ) { + System.out.println( "refseq: " + query ); + } + try { + db_entry = obtainRefSeqEntryFromEmbl( new Accession( query ), lines_to_return ); + } + catch ( final FileNotFoundException e ) { + // Eat this, and move to next. + } + } + else if ( acc.getSource().equals( Source.EMBL.toString() ) || acc.getSource().equals( Source.NCBI.toString() ) + || acc.getSource().equals( Source.EMBL.toString() ) ) { + if ( DEBUG ) { + System.out.println( acc.toString() ); + } + try { + db_entry = obtainEmblEntry( acc, lines_to_return ); + } + catch ( final FileNotFoundException e ) { + // Eat this, and move to next. + } + } + else if ( acc.getSource().equals( Source.GI.toString() ) ) { + if ( DEBUG ) { + System.out.println( "gi: " + query ); + } + try { + db_entry = obtainRefSeqEntryFromEmbl( new Accession( query ), lines_to_return ); + } + catch ( final FileNotFoundException e ) { + // Eat this, and move to next. + } + } + if ( ( db_entry != null ) && !db_entry.isEmpty() ) { + final Sequence seq = node.getNodeData().isHasSequence() ? node.getNodeData().getSequence() : new Sequence(); + if ( !ForesterUtil.isEmpty( db_entry.getAccession() ) ) { + seq.setAccession( new Accession( db_entry.getAccession(), acc.getSource() ) ); + } + if ( !ForesterUtil.isEmpty( db_entry.getSequenceName() ) ) { + seq.setName( db_entry.getSequenceName() ); + } + if ( !ForesterUtil.isEmpty( db_entry.getGeneName() ) ) { + seq.setGeneName( db_entry.getGeneName() ); + } + if ( !ForesterUtil.isEmpty( db_entry.getSequenceSymbol() ) ) { + try { + seq.setSymbol( db_entry.getSequenceSymbol() ); + } + catch ( final PhyloXmlDataFormatException e ) { + // Eat this exception. + } + } + if ( ( db_entry.getMolecularSequence() != null ) + && !ForesterUtil.isEmpty( db_entry.getMolecularSequence().getMolecularSequenceAsString() ) + && ( ALLOW_TO_OVERWRITE_MOL_SEQ || seq.getMolecularSequence().isEmpty() ) ) { + seq.setMolecularSequence( db_entry.getMolecularSequence().getMolecularSequenceAsString() ); + seq.setMolecularSequenceAligned( false ); + if ( db_entry.getMolecularSequence().getType() == TYPE.AA ) { + seq.setType( "protein" ); + } + else if ( db_entry.getMolecularSequence().getType() == TYPE.DNA ) { + seq.setType( "dna" ); + } + else if ( db_entry.getMolecularSequence().getType() == TYPE.RNA ) { + seq.setType( "rna" ); + } + } + if ( ( db_entry.getGoTerms() != null ) && !db_entry.getGoTerms().isEmpty() ) { + for( final GoTerm go : db_entry.getGoTerms() ) { + final Annotation ann = new Annotation( go.getGoId().getId() ); + ann.setDesc( go.getName() ); + seq.addAnnotation( ann ); + } + } + if ( ( db_entry.getCrossReferences() != null ) && !db_entry.getCrossReferences().isEmpty() ) { + for( final Accession x : db_entry.getCrossReferences() ) { + seq.addCrossReference( x ); + } + } + if ( !ForesterUtil.isEmpty( db_entry.getChromosome() ) && !ForesterUtil.isEmpty( db_entry.getMap() ) ) { + seq.setLocation( "chr " + db_entry.getChromosome() + ", " + db_entry.getMap() ); + } + else if ( !ForesterUtil.isEmpty( db_entry.getChromosome() ) ) { + seq.setLocation( "chr " + db_entry.getChromosome() ); + } + else if ( !ForesterUtil.isEmpty( db_entry.getMap() ) ) { + seq.setLocation( db_entry.getMap() ); + } + final Taxonomy tax = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy() : new Taxonomy(); + if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyScientificName() ) ) { + tax.setScientificName( db_entry.getTaxonomyScientificName() ); + } + if ( allow_to_set_taxonomic_data && !ForesterUtil.isEmpty( db_entry.getTaxonomyIdentifier() ) ) { + tax.setIdentifier( new Identifier( db_entry.getTaxonomyIdentifier(), "uniprot" ) ); + } + node.getNodeData().setTaxonomy( tax ); + node.getNodeData().setSequence( seq ); + } + else { + if ( node.isExternal() || !node.isEmpty() ) { + not_found.add( node.toString() ); + } + } + try { + Thread.sleep( SLEEP ); + } + catch ( final InterruptedException ie ) { + } + } + private static String encode( final String str ) throws UnsupportedEncodingException { return URLEncoder.encode( str.trim(), URL_ENC ); } + private static List getTaxonomiesFromCommonName( final String cn, final int max_taxonomies_return ) + throws IOException { + final List result = getTaxonomyStringFromCommonName( cn, max_taxonomies_return ); + if ( result.size() > 0 ) { + return parseUniProtTaxonomy( result ); + } + return null; + } + + private static List getTaxonomiesFromScientificName( final String sn, + final int max_taxonomies_return ) + throws IOException { + final List result = getTaxonomyStringFromScientificName( sn, max_taxonomies_return ); + if ( result.size() > 0 ) { + return parseUniProtTaxonomy( result ); + } + return null; + } + private static List getTaxonomyStringFromCommonName( final String cn, final int max_lines_to_return ) throws IOException { return queryUniprot( "taxonomy/?query=common%3a%22" + encode( cn ) + "%22&format=tab", max_lines_to_return ); @@ -436,6 +484,13 @@ public final class SequenceDbWsTools { return queryUniprot( "taxonomy/?query=mnemonic%3a%22" + encode( code ) + "%22&format=tab", max_lines_to_return ); } + private final static boolean isAccessionAcceptable( final Accession acc ) { + return ( !( ( acc == null ) || ForesterUtil.isEmpty( acc.getSource() ) || ForesterUtil.isEmpty( acc.getValue() ) || ( ( acc + .getSource().equals( Source.UNIPROT.toString() ) ) + && ( acc.getSource().toString().equals( Source.EMBL.toString() ) ) && ( acc.getSource().toString() + .equals( Source.REFSEQ.toString() ) ) ) ) ); + } + private static List parseUniProtTaxonomy( final List result ) throws IOException { final List taxonomies = new ArrayList(); for( final String line : result ) { @@ -459,8 +514,4 @@ public final class SequenceDbWsTools { } return taxonomies; } - - public enum Db { - UNIPROT, EMBL, NCBI, NONE, REFSEQ; - } }