X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fws%2Fseqdb%2FSequenceDbWsTools.java;h=9106c38aae62aa1472ad384338ce31384a409a6c;hb=10297bd8b8a4b4ab198a17a42fc6ff24ae2ed49b;hp=209d28419f1296b7cd96edd019cbd7c668d60236;hpb=22b24f8bfc6470aae914f5c97c826542f2697e77;p=jalview.git diff --git a/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java b/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java index 209d284..9106c38 100644 --- a/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java +++ b/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java @@ -49,44 +49,27 @@ import org.forester.phylogeny.data.Identifier; import org.forester.phylogeny.data.Sequence; import org.forester.phylogeny.data.Taxonomy; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.sequence.MolecularSequence.TYPE; import org.forester.util.ForesterUtil; import org.forester.util.SequenceAccessionTools; public final class SequenceDbWsTools { - public final static String EMBL_REFSEQ = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=REFSEQ&style=raw&id="; - public final static String EMBL_GENBANK = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=GENBANK&style=raw&id="; - public final static String BASE_UNIPROT_URL = "http://www.uniprot.org/"; - //public final static String EMBL_DBS_EMBL = "embl"; - public final static String EMBL_DBS_REFSEQ_N = "refseqn"; - public final static String EMBL_DBS_REFSEQ_P = "refseqp"; - private final static boolean DEBUG = true; - private final static String URL_ENC = "UTF-8"; - public final static int DEFAULT_LINES_TO_RETURN = 4000; - - final static String extractFrom( final String target, final String a ) { - final int i_a = target.indexOf( a ); - return target.substring( i_a + a.length() ).trim(); - } - - final static String extractFromTo( final String target, final String a, final String b ) { - final int i_a = target.indexOf( a ); - final int i_b = target.indexOf( b ); - if ( ( i_a < 0 ) || ( i_b < i_a ) ) { - throw new IllegalArgumentException( "attempt to extract from \"" + target + "\" between \"" + a - + "\" and \"" + b + "\"" ); - } - return target.substring( i_a + a.length(), i_b ).trim(); - } - - final static String extractTo( final String target, final String b ) { - final int i_b = target.indexOf( b ); - return target.substring( 0, i_b ).trim(); - } + public final static String BASE_UNIPROT_URL = "http://www.uniprot.org/"; + public final static int DEFAULT_LINES_TO_RETURN = 4000; + public final static String EMBL_DBS_REFSEQ_N = "refseqn"; + public final static String EMBL_DBS_REFSEQ_P = "refseqp"; + public final static String EMBL_GENBANK = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=GENBANK&style=raw&id="; + public final static String EMBL_REFSEQ = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=REFSEQ&style=raw&id="; + public final static String EMBL_EMBL = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=EMBL&style=raw&id="; + private final static boolean DEBUG = true; + private final static String URL_ENC = "UTF-8"; + private final static int SLEEP = 200; + private static final boolean ALLOW_TO_OVERWRITE_MOL_SEQ = false; public static List getTaxonomiesFromCommonNameStrict( final String cn, final int max_taxonomies_return ) - throws IOException { + throws IOException { final List taxonomies = getTaxonomiesFromCommonName( cn, max_taxonomies_return ); if ( ( taxonomies != null ) && ( taxonomies.size() > 0 ) ) { final List filtered_taxonomies = new ArrayList(); @@ -113,11 +96,11 @@ public final class SequenceDbWsTools { * Does not return "sub-types". * For example, for "Mus musculus" only returns "Mus musculus" * and not "Mus musculus", "Mus musculus bactrianus", ... - * + * */ public static List getTaxonomiesFromScientificNameStrict( final String sn, final int max_taxonomies_return ) - throws IOException { + throws IOException { final List taxonomies = getTaxonomiesFromScientificName( sn, max_taxonomies_return ); if ( ( taxonomies != null ) && ( taxonomies.size() > 0 ) ) { final List filtered_taxonomies = new ArrayList(); @@ -133,7 +116,7 @@ public final class SequenceDbWsTools { public static List getTaxonomiesFromTaxonomyCode( final String code, final int max_taxonomies_return ) - throws IOException { + throws IOException { final String my_code = new String( code ); final List result = getTaxonomyStringFromTaxonomyCode( my_code, max_taxonomies_return ); if ( result.size() > 0 ) { @@ -142,22 +125,40 @@ public final class SequenceDbWsTools { return null; } + public static SequenceDatabaseEntry obtainEmblEntry( final Accession acc ) throws IOException { + return obtainEmblEntry( acc, DEFAULT_LINES_TO_RETURN ); + } + public static SequenceDatabaseEntry obtainEmblEntry( final Accession acc, final int max_lines_to_return ) throws IOException { final List lines = queryEmblDb( acc, max_lines_to_return ); return EbiDbEntry.createInstanceFromPlainTextForRefSeq( lines ); } - public static SequenceDatabaseEntry obtainEmblEntry( final Accession acc ) throws IOException { - return obtainEmblEntry( acc, DEFAULT_LINES_TO_RETURN ); + public static SequenceDatabaseEntry obtainEntry( final String acc_str ) throws IOException { + if ( ForesterUtil.isEmpty( acc_str ) ) { + throw new IllegalArgumentException( "cannot not extract sequence db accessor from null or empty string" ); + } + final Accession acc = SequenceAccessionTools.parseAccessorFromString( acc_str ); + if ( acc == null ) { + throw new IllegalArgumentException( "could not extract acceptable sequence db accessor from \"" + acc_str + + "\"" ); + } + if ( acc.getSource().equals( Source.REFSEQ.toString() ) || acc.getSource().equals( Source.EMBL.toString() ) + || acc.getSource().equals( Source.NCBI.toString() ) ) { + return obtainEmblEntry( acc, DEFAULT_LINES_TO_RETURN ); + } + else if ( acc.getSource().equals( Source.UNIPROT.toString() ) ) { + return obtainUniProtEntry( acc.getValue(), DEFAULT_LINES_TO_RETURN ); + } + else { + throw new IllegalArgumentException( "don't know how to handle request for source \"" + acc.getSource() + + "\"" ); + } } - public final static Accession obtainSeqAccession( final PhylogenyNode node ) { - Accession acc = SequenceAccessionTools.obtainFromSeqAccession( node ); - if ( !isAccessionAcceptable( acc ) ) { - acc = SequenceAccessionTools.obtainAccessorFromDataFields( node ); - } - return acc; + public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Accession acc ) throws IOException { + return obtainRefSeqEntryFromEmbl( acc, DEFAULT_LINES_TO_RETURN ); } public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Accession acc, final int max_lines_to_return ) @@ -166,8 +167,12 @@ public final class SequenceDbWsTools { return EbiDbEntry.createInstanceFromPlainTextForRefSeq( lines ); } - public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Accession acc ) throws IOException { - return obtainRefSeqEntryFromEmbl( acc, DEFAULT_LINES_TO_RETURN ); + public final static Accession obtainSeqAccession( final PhylogenyNode node ) { + Accession acc = SequenceAccessionTools.obtainFromSeqAccession( node ); + if ( !isAccessionAcceptable( acc ) ) { + acc = SequenceAccessionTools.obtainAccessorFromDataFields( node ); + } + return acc; } public final static void obtainSeqInformation( final boolean allow_to_set_taxonomic_data, @@ -191,10 +196,6 @@ public final class SequenceDbWsTools { obtainSeqInformation( allow_to_set_taxonomic_data, DEFAULT_LINES_TO_RETURN, not_found, node ); } - public final static void obtainSeqInformation( final PhylogenyNode node ) throws IOException { - obtainSeqInformation( true, DEFAULT_LINES_TO_RETURN, new TreeSet(), node ); - } - public final static SortedSet obtainSeqInformation( final Phylogeny phy, final boolean ext_nodes_only, final boolean allow_to_set_taxonomic_data, @@ -209,16 +210,20 @@ public final class SequenceDbWsTools { return not_found; } - public static SequenceDatabaseEntry obtainUniProtEntry( final String query, final int max_lines_to_return ) - throws IOException { - final List lines = queryUniprot( "uniprot/" + query + ".txt", max_lines_to_return ); - return UniProtEntry.createInstanceFromPlainText( lines ); + public final static void obtainSeqInformation( final PhylogenyNode node ) throws IOException { + obtainSeqInformation( true, DEFAULT_LINES_TO_RETURN, new TreeSet(), node ); } public static SequenceDatabaseEntry obtainUniProtEntry( final String query ) throws IOException { return obtainUniProtEntry( query, DEFAULT_LINES_TO_RETURN ); } + public static SequenceDatabaseEntry obtainUniProtEntry( final String query, final int max_lines_to_return ) + throws IOException { + final List lines = queryUniprot( "uniprot/" + query + ".txt", max_lines_to_return ); + return UniProtEntry.createInstanceFromPlainText( lines ); + } + public static List queryDb( final String query, int max_lines_to_return, final String base_url ) throws IOException { if ( ForesterUtil.isEmpty( query ) ) { @@ -246,8 +251,8 @@ public final class SequenceDbWsTools { } in.close(); try { - // To prevent accessing online dbs in too quick succession. - Thread.sleep( 20 ); + // To prevent accessing online dbs in too quick succession. + Thread.sleep( SLEEP ); } catch ( final InterruptedException e ) { e.printStackTrace(); @@ -255,34 +260,30 @@ public final class SequenceDbWsTools { return result; } - public static List queryEmblDbForRefSeqEntry( final Accession id, final int max_lines_to_return ) - throws IOException { - final StringBuilder url_sb = new StringBuilder(); - url_sb.append( EMBL_REFSEQ ); - return queryDb( id.getValue(), max_lines_to_return, url_sb.toString() ); - } - - public static List queryEmblDb( final Accession id, final int max_lines_to_return ) throws IOException { + public static List queryEmblDb( final Accession acc, final int max_lines_to_return ) throws IOException { final StringBuilder url_sb = new StringBuilder(); // url_sb.append( BASE_EMBL_DB_URL ); - if ( id.getSource().equals( Source.NCBI.toString() ) ) { + System.out.println( "source: " + acc.getSource() ); + if ( acc.getSource().equals( Source.NCBI.toString() ) ) { url_sb.append( EMBL_GENBANK ); //url_sb.append( '/' ); } - else if ( id.getSource().equals( Source.REFSEQ.toString() ) ) { + else if ( acc.getSource().equals( Source.REFSEQ.toString() ) ) { url_sb.append( EMBL_REFSEQ ); - // if ( id.getValue().toUpperCase().indexOf( 'P' ) == 1 ) { - // url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_P ); - // url_sb.append( '/' ); - // } - // else { - // url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_N ); - // url_sb.append( '/' ); - // } + } + else if ( acc.getSource().equals( Source.EMBL.toString() ) ) { + url_sb.append( EMBL_EMBL ); } else { - throw new IllegalArgumentException( "unable to handle source: " + id.getSource() ); + throw new IllegalArgumentException( "unable to handle source: " + acc.getSource() ); } + return queryDb( acc.getValue(), max_lines_to_return, url_sb.toString() ); + } + + public static List queryEmblDbForRefSeqEntry( final Accession id, final int max_lines_to_return ) + throws IOException { + final StringBuilder url_sb = new StringBuilder(); + url_sb.append( EMBL_REFSEQ ); return queryDb( id.getValue(), max_lines_to_return, url_sb.toString() ); } @@ -290,6 +291,26 @@ public final class SequenceDbWsTools { return queryDb( query, max_lines_to_return, BASE_UNIPROT_URL ); } + final static String extractFrom( final String target, final String a ) { + final int i_a = target.indexOf( a ); + return target.substring( i_a + a.length() ).trim(); + } + + final static String extractFromTo( final String target, final String a, final String b ) { + final int i_a = target.indexOf( a ); + final int i_b = target.indexOf( b ); + if ( ( i_a < 0 ) || ( i_b < i_a ) ) { + throw new IllegalArgumentException( "attempt to extract from \"" + target + "\" between \"" + a + + "\" and \"" + b + "\"" ); + } + return target.substring( i_a + a.length(), i_b ).trim(); + } + + final static String extractTo( final String target, final String b ) { + final int i_b = target.indexOf( b ); + return target.substring( 0, i_b ).trim(); + } + private static void addDataFromDbToNode( final boolean allow_to_set_taxonomic_data, final int lines_to_return, final SortedSet not_found, @@ -308,20 +329,32 @@ public final class SequenceDbWsTools { // Eat this, and move to next. } } - else if ( acc.getSource().equals( Source.EMBL.toString() ) ) { + else if ( acc.getSource().equals( Source.REFSEQ.toString() ) ) { if ( DEBUG ) { - System.out.println( "embl: " + query ); + System.out.println( "refseq: " + query ); } try { - db_entry = obtainEmblEntry( new Accession( query ), lines_to_return ); + db_entry = obtainRefSeqEntryFromEmbl( new Accession( query ), lines_to_return ); } catch ( final FileNotFoundException e ) { // Eat this, and move to next. } } - else if ( acc.getSource().equals( Source.REFSEQ.toString() ) ) { + else if ( acc.getSource().equals( Source.EMBL.toString() ) || acc.getSource().equals( Source.NCBI.toString() ) + || acc.getSource().equals( Source.EMBL.toString() ) ) { if ( DEBUG ) { - System.out.println( "refseq: " + query ); + System.out.println( acc.toString() ); + } + try { + db_entry = obtainEmblEntry( acc, lines_to_return ); + } + catch ( final FileNotFoundException e ) { + // Eat this, and move to next. + } + } + else if ( acc.getSource().equals( Source.GI.toString() ) ) { + if ( DEBUG ) { + System.out.println( "gi: " + query ); } try { db_entry = obtainRefSeqEntryFromEmbl( new Accession( query ), lines_to_return ); @@ -349,6 +382,21 @@ public final class SequenceDbWsTools { // Eat this exception. } } + if ( ( db_entry.getMolecularSequence() != null ) + && !ForesterUtil.isEmpty( db_entry.getMolecularSequence().getMolecularSequenceAsString() ) + && ( ALLOW_TO_OVERWRITE_MOL_SEQ || seq.getMolecularSequence().isEmpty() ) ) { + seq.setMolecularSequence( db_entry.getMolecularSequence().getMolecularSequenceAsString() ); + seq.setMolecularSequenceAligned( false ); + if ( db_entry.getMolecularSequence().getType() == TYPE.AA ) { + seq.setType( "protein" ); + } + else if ( db_entry.getMolecularSequence().getType() == TYPE.DNA ) { + seq.setType( "dna" ); + } + else if ( db_entry.getMolecularSequence().getType() == TYPE.RNA ) { + seq.setType( "rna" ); + } + } if ( ( db_entry.getGoTerms() != null ) && !db_entry.getGoTerms().isEmpty() ) { for( final GoTerm go : db_entry.getGoTerms() ) { final Annotation ann = new Annotation( go.getGoId().getId() ); @@ -361,6 +409,15 @@ public final class SequenceDbWsTools { seq.addCrossReference( x ); } } + if ( !ForesterUtil.isEmpty( db_entry.getChromosome() ) && !ForesterUtil.isEmpty( db_entry.getMap() ) ) { + seq.setLocation( "chr " + db_entry.getChromosome() + ", " + db_entry.getMap() ); + } + else if ( !ForesterUtil.isEmpty( db_entry.getChromosome() ) ) { + seq.setLocation( "chr " + db_entry.getChromosome() ); + } + else if ( !ForesterUtil.isEmpty( db_entry.getMap() ) ) { + seq.setLocation( db_entry.getMap() ); + } final Taxonomy tax = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy() : new Taxonomy(); if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyScientificName() ) ) { tax.setScientificName( db_entry.getTaxonomyScientificName() ); @@ -377,7 +434,7 @@ public final class SequenceDbWsTools { } } try { - Thread.sleep( 10 );// Sleep for 10 ms + Thread.sleep( SLEEP ); } catch ( final InterruptedException ie ) { } @@ -398,7 +455,7 @@ public final class SequenceDbWsTools { private static List getTaxonomiesFromScientificName( final String sn, final int max_taxonomies_return ) - throws IOException { + throws IOException { final List result = getTaxonomyStringFromScientificName( sn, max_taxonomies_return ); if ( result.size() > 0 ) { return parseUniProtTaxonomy( result ); @@ -430,7 +487,7 @@ public final class SequenceDbWsTools { return ( !( ( acc == null ) || ForesterUtil.isEmpty( acc.getSource() ) || ForesterUtil.isEmpty( acc.getValue() ) || ( ( acc .getSource().equals( Source.UNIPROT.toString() ) ) && ( acc.getSource().toString().equals( Source.EMBL.toString() ) ) && ( acc.getSource().toString() - .equals( Source.REFSEQ.toString() ) ) ) ) ); + .equals( Source.REFSEQ.toString() ) ) ) ) ); } private static List parseUniProtTaxonomy( final List result ) throws IOException {