2 // forester -- software libraries and applications
3 // for genomics and evolutionary biology research.
5 // Copyright (C) 2010 Christian M Zmasek
6 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26 package org.forester.ws.seqdb;
28 import java.io.BufferedReader;
29 import java.io.FileNotFoundException;
30 import java.io.IOException;
31 import java.io.InputStreamReader;
32 import java.io.UnsupportedEncodingException;
34 import java.net.URLConnection;
35 import java.net.URLEncoder;
36 import java.util.ArrayList;
37 import java.util.List;
38 import java.util.SortedSet;
39 import java.util.TreeSet;
41 import org.forester.go.GoTerm;
42 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
43 import org.forester.phylogeny.Phylogeny;
44 import org.forester.phylogeny.PhylogenyNode;
45 import org.forester.phylogeny.data.Accession;
46 import org.forester.phylogeny.data.Accession.Source;
47 import org.forester.phylogeny.data.Annotation;
48 import org.forester.phylogeny.data.Identifier;
49 import org.forester.phylogeny.data.Sequence;
50 import org.forester.phylogeny.data.Taxonomy;
51 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
52 import org.forester.util.ForesterUtil;
53 import org.forester.util.SequenceAccessionTools;
55 public final class SequenceDbWsTools {
57 public final static String EMBL_REFSEQ = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=REFSEQ&style=raw&id=";
58 public final static String BASE_UNIPROT_URL = "http://www.uniprot.org/";
59 public final static String EMBL_DBS_EMBL = "embl";
60 public final static String EMBL_DBS_REFSEQ_N = "refseqn";
61 public final static String EMBL_DBS_REFSEQ_P = "refseqp";
62 private final static boolean DEBUG = true;
63 private final static String URL_ENC = "UTF-8";
64 public final static int DEFAULT_LINES_TO_RETURN = 4000;
66 final static String extractFrom( final String target, final String a ) {
67 final int i_a = target.indexOf( a );
68 return target.substring( i_a + a.length() ).trim();
71 final static String extractFromTo( final String target, final String a, final String b ) {
72 final int i_a = target.indexOf( a );
73 final int i_b = target.indexOf( b );
74 if ( ( i_a < 0 ) || ( i_b < i_a ) ) {
75 throw new IllegalArgumentException( "attempt to extract from \"" + target + "\" between \"" + a
76 + "\" and \"" + b + "\"" );
78 return target.substring( i_a + a.length(), i_b ).trim();
81 final static String extractTo( final String target, final String b ) {
82 final int i_b = target.indexOf( b );
83 return target.substring( 0, i_b ).trim();
86 public static List<UniProtTaxonomy> getTaxonomiesFromCommonNameStrict( final String cn,
87 final int max_taxonomies_return )
89 final List<UniProtTaxonomy> taxonomies = getTaxonomiesFromCommonName( cn, max_taxonomies_return );
90 if ( ( taxonomies != null ) && ( taxonomies.size() > 0 ) ) {
91 final List<UniProtTaxonomy> filtered_taxonomies = new ArrayList<UniProtTaxonomy>();
92 for( final UniProtTaxonomy taxonomy : taxonomies ) {
93 if ( taxonomy.getCommonName().equalsIgnoreCase( cn ) ) {
94 filtered_taxonomies.add( taxonomy );
97 return filtered_taxonomies;
102 public static List<UniProtTaxonomy> getTaxonomiesFromId( final String id, final int max_taxonomies_return )
104 final List<String> result = getTaxonomyStringFromId( id, max_taxonomies_return );
105 if ( result.size() > 0 ) {
106 return parseUniProtTaxonomy( result );
112 * Does not return "sub-types".
113 * For example, for "Mus musculus" only returns "Mus musculus"
114 * and not "Mus musculus", "Mus musculus bactrianus", ...
117 public static List<UniProtTaxonomy> getTaxonomiesFromScientificNameStrict( final String sn,
118 final int max_taxonomies_return )
120 final List<UniProtTaxonomy> taxonomies = getTaxonomiesFromScientificName( sn, max_taxonomies_return );
121 if ( ( taxonomies != null ) && ( taxonomies.size() > 0 ) ) {
122 final List<UniProtTaxonomy> filtered_taxonomies = new ArrayList<UniProtTaxonomy>();
123 for( final UniProtTaxonomy taxonomy : taxonomies ) {
124 if ( taxonomy.getScientificName().equalsIgnoreCase( sn ) ) {
125 filtered_taxonomies.add( taxonomy );
128 return filtered_taxonomies;
133 public static List<UniProtTaxonomy> getTaxonomiesFromTaxonomyCode( final String code,
134 final int max_taxonomies_return )
136 final String my_code = new String( code );
137 final List<String> result = getTaxonomyStringFromTaxonomyCode( my_code, max_taxonomies_return );
138 if ( result.size() > 0 ) {
139 return parseUniProtTaxonomy( result );
144 public static SequenceDatabaseEntry obtainEmblEntry( final Accession id, final int max_lines_to_return )
146 final List<String> lines = queryEmblDb( id, max_lines_to_return );
147 return EbiDbEntry.createInstanceFromPlainText( lines );
150 public final static Accession obtainSeqAccession( final PhylogenyNode node ) {
151 Accession acc = SequenceAccessionTools.obtainFromSeqAccession( node );
152 if ( !isAccessionAcceptable( acc ) ) {
153 acc = SequenceAccessionTools.obtainAccessorFromDataFields( node );
158 public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Accession id, final int max_lines_to_return )
160 final List<String> lines = queryEmblDbForRefSeqEntry( id, max_lines_to_return );
161 return EbiDbEntry.createInstanceFromPlainTextForRefSeq( lines );
164 public final static void obtainSeqInformation( final boolean allow_to_set_taxonomic_data,
165 final int lines_to_return,
166 final SortedSet<String> not_found,
167 final PhylogenyNode node ) throws IOException {
168 final Accession acc = obtainSeqAccession( node );
169 if ( !isAccessionAcceptable( acc ) ) {
170 if ( node.isExternal() || !node.isEmpty() ) {
171 not_found.add( node.toString() );
175 addDataFromDbToNode( allow_to_set_taxonomic_data, lines_to_return, not_found, node, acc );
179 public final static void obtainSeqInformation( final boolean allow_to_set_taxonomic_data,
180 final SortedSet<String> not_found,
181 final PhylogenyNode node ) throws IOException {
182 obtainSeqInformation( allow_to_set_taxonomic_data, DEFAULT_LINES_TO_RETURN, not_found, node );
185 public final static void obtainSeqInformation( final PhylogenyNode node ) throws IOException {
186 obtainSeqInformation( true, DEFAULT_LINES_TO_RETURN, new TreeSet<String>(), node );
189 public final static SortedSet<String> obtainSeqInformation( final Phylogeny phy,
190 final boolean ext_nodes_only,
191 final boolean allow_to_set_taxonomic_data,
192 final int lines_to_return ) throws IOException {
193 final SortedSet<String> not_found = new TreeSet<String>();
194 for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) {
195 final PhylogenyNode node = iter.next();
196 if ( node.isExternal() || !ext_nodes_only ) {
197 obtainSeqInformation( allow_to_set_taxonomic_data, lines_to_return, not_found, node );
203 public static SequenceDatabaseEntry obtainUniProtEntry( final String query, final int max_lines_to_return )
205 final List<String> lines = queryUniprot( "uniprot/" + query + ".txt", max_lines_to_return );
206 return UniProtEntry.createInstanceFromPlainText( lines );
209 public static List<String> queryDb( final String query, int max_lines_to_return, final String base_url )
211 if ( ForesterUtil.isEmpty( query ) ) {
212 throw new IllegalArgumentException( "illegal attempt to use empty query " );
214 if ( max_lines_to_return < 1 ) {
215 max_lines_to_return = 1;
217 final URL url = new URL( base_url + query );
219 System.out.println( "url: " + url.toString() );
221 final URLConnection urlc = url.openConnection();
222 final BufferedReader in = new BufferedReader( new InputStreamReader( urlc.getInputStream() ) );
224 final List<String> result = new ArrayList<String>();
225 while ( ( line = in.readLine() ) != null ) {
227 System.out.println( line );
230 if ( result.size() > max_lines_to_return ) {
236 // To prevent accessing online dbs in too quick succession.
239 catch ( final InterruptedException e ) {
245 public static List<String> queryEmblDbForRefSeqEntry( final Accession id, final int max_lines_to_return )
247 final StringBuilder url_sb = new StringBuilder();
248 url_sb.append( EMBL_REFSEQ );
249 return queryDb( id.getValue(), max_lines_to_return, url_sb.toString() );
252 public static List<String> queryEmblDb( final Accession id, final int max_lines_to_return ) throws IOException {
253 final StringBuilder url_sb = new StringBuilder();
254 // url_sb.append( BASE_EMBL_DB_URL );
255 if ( ForesterUtil.isEmpty( id.getSource() ) || ( id.getSource().equals( Source.NCBI.toString() ) ) ) {
256 url_sb.append( EMBL_DBS_EMBL );
257 url_sb.append( '/' );
259 else if ( id.getSource().equals( Source.REFSEQ.toString() ) ) {
260 url_sb.append( EMBL_REFSEQ );
261 // if ( id.getValue().toUpperCase().indexOf( 'P' ) == 1 ) {
262 // url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_P );
263 // url_sb.append( '/' );
266 // url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_N );
267 // url_sb.append( '/' );
270 return queryDb( id.getValue(), max_lines_to_return, url_sb.toString() );
273 public static List<String> queryUniprot( final String query, final int max_lines_to_return ) throws IOException {
274 return queryDb( query, max_lines_to_return, BASE_UNIPROT_URL );
277 private static void addDataFromDbToNode( final boolean allow_to_set_taxonomic_data,
278 final int lines_to_return,
279 final SortedSet<String> not_found,
280 final PhylogenyNode node,
281 final Accession acc ) throws IOException {
282 SequenceDatabaseEntry db_entry = null;
283 final String query = acc.getValue();
284 if ( acc.getSource().equals( Source.UNIPROT.toString() ) ) {
286 System.out.println( "uniprot: " + query );
289 db_entry = obtainUniProtEntry( query, lines_to_return );
291 catch ( final FileNotFoundException e ) {
292 // Eat this, and move to next.
295 else if ( acc.getSource().equals( Source.EMBL.toString() ) ) {
297 System.out.println( "embl: " + query );
300 db_entry = obtainEmblEntry( new Accession( query ), lines_to_return );
302 catch ( final FileNotFoundException e ) {
303 // Eat this, and move to next.
306 else if ( acc.getSource().equals( Source.REFSEQ.toString() ) ) {
308 System.out.println( "refseq: " + query );
311 db_entry = obtainRefSeqEntryFromEmbl( new Accession( query ), lines_to_return );
313 catch ( final FileNotFoundException e ) {
314 // Eat this, and move to next.
317 if ( ( db_entry != null ) && !db_entry.isEmpty() ) {
318 final Sequence seq = node.getNodeData().isHasSequence() ? node.getNodeData().getSequence() : new Sequence();
319 if ( !ForesterUtil.isEmpty( db_entry.getAccession() ) ) {
320 seq.setAccession( new Accession( db_entry.getAccession(), acc.getSource() ) );
322 if ( !ForesterUtil.isEmpty( db_entry.getSequenceName() ) ) {
323 seq.setName( db_entry.getSequenceName() );
325 if ( !ForesterUtil.isEmpty( db_entry.getGeneName() ) ) {
326 seq.setGeneName( db_entry.getGeneName() );
328 if ( !ForesterUtil.isEmpty( db_entry.getSequenceSymbol() ) ) {
330 seq.setSymbol( db_entry.getSequenceSymbol() );
332 catch ( final PhyloXmlDataFormatException e ) {
333 // Eat this exception.
336 if ( ( db_entry.getGoTerms() != null ) && !db_entry.getGoTerms().isEmpty() ) {
337 for( final GoTerm go : db_entry.getGoTerms() ) {
338 final Annotation ann = new Annotation( go.getGoId().getId() );
339 ann.setDesc( go.getName() );
340 seq.addAnnotation( ann );
343 if ( ( db_entry.getCrossReferences() != null ) && !db_entry.getCrossReferences().isEmpty() ) {
344 for( final Accession x : db_entry.getCrossReferences() ) {
345 seq.addCrossReference( x );
348 final Taxonomy tax = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy() : new Taxonomy();
349 if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyScientificName() ) ) {
350 tax.setScientificName( db_entry.getTaxonomyScientificName() );
352 if ( allow_to_set_taxonomic_data && !ForesterUtil.isEmpty( db_entry.getTaxonomyIdentifier() ) ) {
353 tax.setIdentifier( new Identifier( db_entry.getTaxonomyIdentifier(), "uniprot" ) );
355 node.getNodeData().setTaxonomy( tax );
356 node.getNodeData().setSequence( seq );
359 if ( node.isExternal() || !node.isEmpty() ) {
360 not_found.add( node.toString() );
364 Thread.sleep( 10 );// Sleep for 10 ms
366 catch ( final InterruptedException ie ) {
370 private static String encode( final String str ) throws UnsupportedEncodingException {
371 return URLEncoder.encode( str.trim(), URL_ENC );
374 private static List<UniProtTaxonomy> getTaxonomiesFromCommonName( final String cn, final int max_taxonomies_return )
376 final List<String> result = getTaxonomyStringFromCommonName( cn, max_taxonomies_return );
377 if ( result.size() > 0 ) {
378 return parseUniProtTaxonomy( result );
383 private static List<UniProtTaxonomy> getTaxonomiesFromScientificName( final String sn,
384 final int max_taxonomies_return )
386 final List<String> result = getTaxonomyStringFromScientificName( sn, max_taxonomies_return );
387 if ( result.size() > 0 ) {
388 return parseUniProtTaxonomy( result );
393 private static List<String> getTaxonomyStringFromCommonName( final String cn, final int max_lines_to_return )
395 return queryUniprot( "taxonomy/?query=common%3a%22" + encode( cn ) + "%22&format=tab", max_lines_to_return );
398 private static List<String> getTaxonomyStringFromId( final String id, final int max_lines_to_return )
400 return queryUniprot( "taxonomy/?query=id%3a%22" + encode( id ) + "%22&format=tab", max_lines_to_return );
403 private static List<String> getTaxonomyStringFromScientificName( final String sn, final int max_lines_to_return )
405 return queryUniprot( "taxonomy/?query=scientific%3a%22" + encode( sn ) + "%22&format=tab", max_lines_to_return );
408 private static List<String> getTaxonomyStringFromTaxonomyCode( final String code, final int max_lines_to_return )
410 return queryUniprot( "taxonomy/?query=mnemonic%3a%22" + encode( code ) + "%22&format=tab", max_lines_to_return );
413 private final static boolean isAccessionAcceptable( final Accession acc ) {
414 return ( !( ( acc == null ) || ForesterUtil.isEmpty( acc.getSource() ) || ForesterUtil.isEmpty( acc.getValue() ) || ( ( acc
415 .getSource().equals( Source.UNIPROT.toString() ) )
416 && ( acc.getSource().toString().equals( Source.EMBL.toString() ) ) && ( acc.getSource().toString()
417 .equals( Source.REFSEQ.toString() ) ) ) ) );
420 private static List<UniProtTaxonomy> parseUniProtTaxonomy( final List<String> result ) throws IOException {
421 final List<UniProtTaxonomy> taxonomies = new ArrayList<UniProtTaxonomy>();
422 for( final String line : result ) {
423 if ( ForesterUtil.isEmpty( line ) ) {
424 // Ignore empty lines.
426 else if ( line.startsWith( "Taxon" ) ) {
427 final String[] items = line.split( "\t" );
428 if ( !( items[ 1 ].equalsIgnoreCase( "Mnemonic" ) && items[ 2 ].equalsIgnoreCase( "Scientific name" )
429 && items[ 3 ].equalsIgnoreCase( "Common name" ) && items[ 4 ].equalsIgnoreCase( "Synonym" )
430 && items[ 5 ].equalsIgnoreCase( "Other Names" ) && items[ 6 ].equalsIgnoreCase( "Reviewed" )
431 && items[ 7 ].equalsIgnoreCase( "Rank" ) && items[ 8 ].equalsIgnoreCase( "Lineage" ) ) ) {
432 throw new IOException( "Unreconized UniProt Taxonomy format: " + line );
436 if ( line.split( "\t" ).length > 4 ) {
437 taxonomies.add( new UniProtTaxonomy( line ) );