// $Id: // forester -- software libraries and applications // for genomics and evolutionary biology research. // // Copyright (C) 2010 Christian M Zmasek // Copyright (C) 2010 Sanford-Burnham Medical Research Institute // All rights reserved // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA // // Contact: phylosoft @ gmail . com // WWW: www.phylosoft.org/forester package org.forester.ws.uniprot; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.net.URL; import java.net.URLConnection; import java.net.URLEncoder; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.forester.util.ForesterUtil; public final class UniProtWsTools { public enum Db { UNKNOWN, UNIPROT; } public final static String BASE_URL = "http://www.uniprot.org/"; public final static String BASE_EMBL_DB_URL = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/embl/"; private final static String URL_ENC = "UTF-8"; // uniprot/expasy accession number format (6 chars): // letter digit letter-or-digit letter-or-digit letter-or-digit digit // ?: => no back-reference // \A => begin of String // \Z => end of String private final static Pattern UNIPROT_AC_PATTERN = Pattern .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d[A-Z0-9]{3}\\d)(?:[^a-zA-Z0-9]|\\Z)" ); private final static boolean DEBUG = true; private static String encode( final String str ) throws UnsupportedEncodingException { return URLEncoder.encode( str.trim(), URL_ENC ); } /** * Returns null if no match. * * @param query * @param db * @return */ static public String parseUniProtAccessor( final String query ) { final Matcher m = UNIPROT_AC_PATTERN.matcher( query ); if ( m.lookingAt() ) { return m.group( 1 ); } else { return null; } } public static List getTaxonomiesFromCommonName( final String cn, final int max_taxonomies_return ) throws IOException { final List result = getTaxonomyStringFromCommonName( cn, max_taxonomies_return ); if ( result.size() > 0 ) { return parseUniProtTaxonomy( result ); } return null; } public static List getTaxonomiesFromCommonNameStrict( final String cn, final int max_taxonomies_return ) throws IOException { final List taxonomies = getTaxonomiesFromCommonName( cn, max_taxonomies_return ); if ( ( taxonomies != null ) && ( taxonomies.size() > 0 ) ) { final List filtered_taxonomies = new ArrayList(); for( final UniProtTaxonomy taxonomy : taxonomies ) { if ( taxonomy.getCommonName().equalsIgnoreCase( cn ) ) { filtered_taxonomies.add( taxonomy ); } } return filtered_taxonomies; } return null; } public static List getTaxonomiesFromId( final String id, final int max_taxonomies_return ) throws IOException { final List result = getTaxonomyStringFromId( id, max_taxonomies_return ); if ( result.size() > 0 ) { return parseUniProtTaxonomy( result ); } return null; } public static List getTaxonomiesFromScientificName( final String sn, final int max_taxonomies_return ) throws IOException { // Hack! Craniata? .. if ( sn.equals( "Drosophila" ) ) { return hack( UniProtTaxonomy.DROSOPHILA_GENUS ); } else if ( sn.equals( "Xenopus" ) ) { return hack( UniProtTaxonomy.XENOPUS_GENUS ); } final List result = getTaxonomyStringFromScientificName( sn, max_taxonomies_return ); if ( result.size() > 0 ) { return parseUniProtTaxonomy( result ); } return null; } /** * Does not return "sub-types". * For example, for "Mus musculus" only returns "Mus musculus" * and not "Mus musculus", "Mus musculus bactrianus", ... * */ public static List getTaxonomiesFromScientificNameStrict( final String sn, final int max_taxonomies_return ) throws IOException { final List taxonomies = getTaxonomiesFromScientificName( sn, max_taxonomies_return ); if ( ( taxonomies != null ) && ( taxonomies.size() > 0 ) ) { final List filtered_taxonomies = new ArrayList(); for( final UniProtTaxonomy taxonomy : taxonomies ) { if ( taxonomy.getScientificName().equalsIgnoreCase( sn ) ) { filtered_taxonomies.add( taxonomy ); } } return filtered_taxonomies; } return null; } public static List getTaxonomiesFromTaxonomyCode( final String code, final int max_taxonomies_return ) throws IOException { String my_code = new String( code ); // Hacks! if ( my_code.equals( "FUGRU" ) ) { my_code = "TAKRU"; } else if ( my_code.equals( "CAP" ) ) { return hack( UniProtTaxonomy.CAPITELLA_TELATA_SPECIES ); } final List result = getTaxonomyStringFromTaxonomyCode( my_code, max_taxonomies_return ); if ( result.size() > 0 ) { return parseUniProtTaxonomy( result ); } return null; } private static List getTaxonomyStringFromCommonName( final String cn, final int max_lines_to_return ) throws IOException { return queryUniprot( "taxonomy/?query=common%3a%22" + encode( cn ) + "%22&format=tab", max_lines_to_return ); } private static List getTaxonomyStringFromId( final String id, final int max_lines_to_return ) throws IOException { return queryUniprot( "taxonomy/?query=id%3a%22" + encode( id ) + "%22&format=tab", max_lines_to_return ); } private static List getTaxonomyStringFromScientificName( final String sn, final int max_lines_to_return ) throws IOException { return queryUniprot( "taxonomy/?query=scientific%3a%22" + encode( sn ) + "%22&format=tab", max_lines_to_return ); } private static List getTaxonomyStringFromTaxonomyCode( final String code, final int max_lines_to_return ) throws IOException { return queryUniprot( "taxonomy/?query=mnemonic%3a%22" + encode( code ) + "%22&format=tab", max_lines_to_return ); } private static List hack( final UniProtTaxonomy tax ) { final List l = new ArrayList(); l.add( tax ); return l; } private static List parseUniProtTaxonomy( final List result ) throws IOException { final List taxonomies = new ArrayList(); for( final String line : result ) { if ( ForesterUtil.isEmpty( line ) ) { // Ignore empty lines. } else if ( line.startsWith( "Taxon" ) ) { //TODO next the check format FIXME } else { if ( line.split( "\t" ).length > 4 ) { taxonomies.add( new UniProtTaxonomy( line ) ); } } } return taxonomies; } public static List queryEmblDb( final String query, final int max_lines_to_return ) throws IOException { return queryDb( query, max_lines_to_return, BASE_EMBL_DB_URL ); } public static List queryUniprot( final String query, final int max_lines_to_return ) throws IOException { return queryDb( query, max_lines_to_return, BASE_URL ); } public static List queryDb( final String query, int max_lines_to_return, final String base_url ) throws IOException { if ( ForesterUtil.isEmpty( query ) ) { throw new IllegalArgumentException( "illegal attempt to use empty query " ); } if ( max_lines_to_return < 1 ) { max_lines_to_return = 1; } final URL url = new URL( base_url + query ); if ( DEBUG ) { System.out.println( "url: " + url.toString() ); } final URLConnection urlc = url.openConnection(); final BufferedReader in = new BufferedReader( new InputStreamReader( urlc.getInputStream() ) ); String line; final List result = new ArrayList(); while ( ( line = in.readLine() ) != null ) { System.out.println( line ); result.add( line ); if ( result.size() > max_lines_to_return ) { break; } } in.close(); return result; } public static SequenceDatabaseEntry obtainUniProtEntry( final String query, final int max_lines_to_return ) throws IOException { final List lines = queryUniprot( "uniprot/" + query + ".txt", max_lines_to_return ); return UniProtEntry.createInstanceFromPlainText( lines ); } public static SequenceDatabaseEntry obtainEmblEntry( final String query, final int max_lines_to_return ) throws IOException { final List lines = queryEmblDb( query, max_lines_to_return ); return EbiDbEntry.createInstanceFromPlainText( lines ); } }