/**
 * String-slicing helpers used by the sequence database entry parsers.
 * <p>
 * Package: org.forester.ws.seqdb (as declared in the original file header).
 */
public class DatabaseTools {

    /**
     * Returns the trimmed text located between the first occurrence of
     * {@code a} and the first occurrence of {@code b} that follows it.
     * <p>
     * The end marker is searched for only <em>after</em> the start marker, so
     * an earlier, unrelated occurrence of {@code b} (or one overlapping
     * {@code a}) no longer causes a failure.
     *
     * @param target the text to extract from
     * @param a the start marker
     * @param b the end marker
     * @return the trimmed text between the two markers
     * @throws IllegalArgumentException if {@code a} is not found, or if
     *         {@code b} does not occur after {@code a}
     */
    static String extract( final String target, final String a, final String b ) {
        final int i_a = target.indexOf( a );
        // Search for the end marker only behind the start marker.
        final int start = i_a + a.length();
        final int i_b = ( i_a < 0 ) ? -1 : target.indexOf( b, start );
        if ( ( i_a < 0 ) || ( i_b < 0 ) ) {
            throw new IllegalArgumentException( "attempt to extract from [" + target + "] between [" + a + "] and ["
                    + b + "]" );
        }
        return target.substring( start, i_b ).trim();
    }

    /**
     * Returns the trimmed text following the first occurrence of {@code a}.
     *
     * @param target the text to extract from
     * @param a the start marker
     * @return the trimmed remainder of {@code target} after {@code a}
     * @throws IllegalArgumentException if {@code a} is not found (previously
     *         this case silently returned an arbitrary tail of {@code target})
     */
    static String extract( final String target, final String a ) {
        final int i_a = target.indexOf( a );
        if ( i_a < 0 ) {
            throw new IllegalArgumentException( "attempt to extract from [" + target + "] after [" + a + "]" );
        }
        return target.substring( i_a + a.length() ).trim();
    }
}
+// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.ws.seqdb; + +import java.util.List; + +import org.forester.util.ForesterUtil; + +public final class EbiDbEntry implements SequenceDatabaseEntry { + + //http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/emb/AAR37336/ + private String _pa; + private String _de; + private String _os; + private String _tax_id; + private String _symbol; + private String _provider; + + private EbiDbEntry() { + } + + @Override + public Object clone() throws CloneNotSupportedException { + throw new CloneNotSupportedException(); + } + + + public static SequenceDatabaseEntry createInstanceFromPlainTextForRefSeq( final List lines ) { + final EbiDbEntry e = new EbiDbEntry(); + for( final String line : lines ) { + // System.out.println( "-" + line ); + if ( line.startsWith( "ACCESSION" ) ) { + e.setPA( DatabaseTools.extract( line, "ACCESSION" ) ); + } + else if ( line.startsWith( "DEFINITION" ) ) { + if ( line.indexOf( "[" ) > 0 ) { + e.setDe( DatabaseTools.extract( line, "DEFINITION", "[" ) ); + } + else { + e.setDe( DatabaseTools.extract( line, "DEFINITION" ) ); + } + + + } + + else if ( line.startsWith( "SOURCE" ) ) { + if ( line.indexOf( "(" ) > 0 ) { + e.setOs( DatabaseTools.extract( line, "SOURCE", "(" ) ); + } + else { + e.setOs( DatabaseTools.extract( line, "SOURCE" ) ); + } + } + + } + return e; + } + + + + public static SequenceDatabaseEntry createInstanceFromPlainText( final List lines ) { + final EbiDbEntry e = 
new EbiDbEntry(); + for( final String line : lines ) { + + if ( line.startsWith( "PA" ) ) { + e.setPA( DatabaseTools.extract( line, "PA" ) ); + } + else if ( line.startsWith( "DE" ) ) { + // if ( ( line.indexOf( "RecName:" ) > 0 ) && ( line.indexOf( "Full=" ) > 0 ) ) { + e.setDe( DatabaseTools.extract( line, "DE" ) ); + //} + } + // else if ( line.startsWith( "GN" ) ) { + // if ( ( line.indexOf( "Name=" ) > 0 ) ) { + // e.setSymbol( extract( line, "Name=", ";" ) ); + // } + // } + else if ( line.startsWith( "OS" ) ) { + if ( line.indexOf( "(" ) > 0 ) { + e.setOs( DatabaseTools.extract( line, "OS", "(" ) ); + } + else { + e.setOs( DatabaseTools.extract( line, "OS" ) ); + } + } + else if ( line.startsWith( "OX" ) ) { + if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) { + e.setTaxId( DatabaseTools.extract( line, "NCBI_TaxID=", ";" ) ); + } + } + } + return e; + } + + @Override + public String getAccession() { + return _pa; + } + + private void setPA( final String pa ) { + if ( _pa == null ) { + _pa = pa; + } + } + + @Override + public String getSequenceName() { + return _de; + } + + private void setDe( final String rec_name ) { + if ( _de == null ) { + _de = rec_name; + } + } + + @Override + public String getTaxonomyScientificName() { + return _os; + } + + private void setOs( final String os ) { + if ( _os == null ) { + _os = os; + } + } + + @Override + public String getTaxonomyIdentifier() { + return _tax_id; + } + + private void setTaxId( final String tax_id ) { + if ( _tax_id == null ) { + _tax_id = tax_id; + } + } + + @Override + public String getSequenceSymbol() { + return _symbol; + } + + private void setSymbol( final String symbol ) { + if ( _symbol == null ) { + _symbol = symbol; + } + } + + @Override + public boolean isEmpty() { + return ( ForesterUtil.isEmpty( getAccession() ) && ForesterUtil.isEmpty( getSequenceName() ) + && ForesterUtil.isEmpty( getTaxonomyScientificName() ) + && ForesterUtil.isEmpty( getTaxonomyIdentifier() ) && ForesterUtil.isEmpty( 
getSequenceSymbol() ) ); + } + + @Override + public String getProvider() { + return _provider; + } + + public void setProvider( final String provider ) { + _provider = provider; + } +} diff --git a/forester/java/src/org/forester/ws/seqdb/SequenceDatabaseEntry.java b/forester/java/src/org/forester/ws/seqdb/SequenceDatabaseEntry.java new file mode 100644 index 0000000..8025dfc --- /dev/null +++ b/forester/java/src/org/forester/ws/seqdb/SequenceDatabaseEntry.java @@ -0,0 +1,43 @@ +// $Id: +// forester -- software libraries and applications +// for genomics and evolutionary biology research. +// +// Copyright (C) 2010 Christian M Zmasek +// Copyright (C) 2010 Sanford-Burnham Medical Research Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
// $Id:
// forester -- software libraries and applications
// for genomics and evolutionary biology research.
//
// Copyright (C) 2010 Christian M Zmasek
// Copyright (C) 2010 Sanford-Burnham Medical Research Institute
// All rights reserved
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
//
// Contact: phylosoft @ gmail . com
// WWW: www.phylosoft.org/forester

/**
 * Read-only view of a single record retrieved from a sequence database.
 * <p>
 * Any accessor may return {@code null} when the corresponding information
 * was not present in the parsed record (the implementations in this package
 * leave unparsed fields unset).
 * <p>
 * Package: org.forester.ws.seqdb (as declared in the original file header).
 */
public interface SequenceDatabaseEntry {

    /**
     * @return {@code true} if the entry carries no information; the
     *         implementations in this package report {@code true} when
     *         accession, sequence name, scientific name, taxonomy identifier
     *         and symbol are all empty
     */
    boolean isEmpty();

    /** @return the accession of the entry */
    String getAccession();

    /** @return a label for the database the entry came from */
    String getProvider();

    /** @return the name (description) of the sequence */
    String getSequenceName();

    /** @return the scientific name of the source organism */
    String getTaxonomyScientificName();

    /** @return the taxonomy identifier of the source organism */
    String getTaxonomyIdentifier();

    /** @return the symbol of the sequence */
    String getSequenceSymbol();
}
com +// WWW: www.phylosoft.org/forester + +package org.forester.ws.seqdb; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.UnsupportedEncodingException; +import java.net.URL; +import java.net.URLConnection; +import java.net.URLEncoder; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.forester.phylogeny.data.Identifier; +import org.forester.util.ForesterUtil; + +public final class SequenceDbWsTools { + + private static final boolean ALLOW_TAXONOMY_CODE_HACKS = true; //TODO turn off for final realease! + + public enum Db { + UNKNOWN, UNIPROT; + } + public final static String BASE_UNIPROT_URL = "http://www.uniprot.org/"; + public final static String BASE_EMBL_DB_URL = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/"; + public final static String EMBL_DBS_EMBL = "embl"; + public final static String EMBL_DBS_REFSEQ_P = "refseqp"; + public final static String EMBL_DBS_REFSEQ_N = "refseqn"; + + private final static String URL_ENC = "UTF-8"; + // uniprot/expasy accession number format (6 chars): + // letter digit letter-or-digit letter-or-digit letter-or-digit digit + // ?: => no back-reference + // \A => begin of String + // \Z => end of String + private final static Pattern UNIPROT_AC_PATTERN = Pattern + .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d[A-Z0-9]{3}\\d)(?:[^a-zA-Z0-9]|\\Z)" ); + private final static boolean DEBUG = false; + + private static String encode( final String str ) throws UnsupportedEncodingException { + return URLEncoder.encode( str.trim(), URL_ENC ); + } + + /** + * Returns null if no match. 
+ * + * @param query + * @param db + * @return + */ + static public String parseUniProtAccessor( final String query ) { + final Matcher m = UNIPROT_AC_PATTERN.matcher( query ); + if ( m.lookingAt() ) { + return m.group( 1 ); + } + else { + return null; + } + } + + public static List getTaxonomiesFromCommonName( final String cn, final int max_taxonomies_return ) + throws IOException { + final List result = getTaxonomyStringFromCommonName( cn, max_taxonomies_return ); + if ( result.size() > 0 ) { + return parseUniProtTaxonomy( result ); + } + return null; + } + + public static List getTaxonomiesFromCommonNameStrict( final String cn, + final int max_taxonomies_return ) + throws IOException { + final List taxonomies = getTaxonomiesFromCommonName( cn, max_taxonomies_return ); + if ( ( taxonomies != null ) && ( taxonomies.size() > 0 ) ) { + final List filtered_taxonomies = new ArrayList(); + for( final UniProtTaxonomy taxonomy : taxonomies ) { + if ( taxonomy.getCommonName().equalsIgnoreCase( cn ) ) { + filtered_taxonomies.add( taxonomy ); + } + } + return filtered_taxonomies; + } + return null; + } + + public static List getTaxonomiesFromId( final String id, final int max_taxonomies_return ) + throws IOException { + final List result = getTaxonomyStringFromId( id, max_taxonomies_return ); + if ( result.size() > 0 ) { + return parseUniProtTaxonomy( result ); + } + return null; + } + + public static List getTaxonomiesFromScientificName( final String sn, + final int max_taxonomies_return ) + throws IOException { + // Hack! Craniata? .. 
+ if ( sn.equals( "Drosophila" ) ) { + return uniProtTaxonomyToList( UniProtTaxonomy.DROSOPHILA_GENUS ); + } + else if ( sn.equals( "Xenopus" ) ) { + return uniProtTaxonomyToList( UniProtTaxonomy.XENOPUS_GENUS ); + } + // else if ( sn.equals( "Nucleariidae and Fonticula group" ) ) { + // return hack( UniProtTaxonomy.NUCLEARIIDAE_AND_FONTICULA ); + // } + final List result = getTaxonomyStringFromScientificName( sn, max_taxonomies_return ); + if ( result.size() > 0 ) { + return parseUniProtTaxonomy( result ); + } + return null; + } + + /** + * Does not return "sub-types". + * For example, for "Mus musculus" only returns "Mus musculus" + * and not "Mus musculus", "Mus musculus bactrianus", ... + * + */ + public static List getTaxonomiesFromScientificNameStrict( final String sn, + final int max_taxonomies_return ) + throws IOException { + final List taxonomies = getTaxonomiesFromScientificName( sn, max_taxonomies_return ); + if ( ( taxonomies != null ) && ( taxonomies.size() > 0 ) ) { + final List filtered_taxonomies = new ArrayList(); + for( final UniProtTaxonomy taxonomy : taxonomies ) { + if ( taxonomy.getScientificName().equalsIgnoreCase( sn ) ) { + filtered_taxonomies.add( taxonomy ); + } + } + return filtered_taxonomies; + } + return null; + } + + public static List getTaxonomiesFromTaxonomyCode( final String code, + final int max_taxonomies_return ) + throws IOException { + final String my_code = new String( code ); + if ( ALLOW_TAXONOMY_CODE_HACKS ) { + final List l = resolveFakeTaxonomyCodes( max_taxonomies_return, my_code ); + if ( l != null ) { + return l; + } + } + final List result = getTaxonomyStringFromTaxonomyCode( my_code, max_taxonomies_return ); + if ( result.size() > 0 ) { + return parseUniProtTaxonomy( result ); + } + return null; + } + + private static List resolveFakeTaxonomyCodes( final int max_taxonomies_return, final String code ) + throws IOException { + if ( code.equals( "CAP" ) ) { + return getTaxonomiesFromId( "283909", 
max_taxonomies_return ); + } + else if ( code.equals( "FUGRU" ) ) { + return getTaxonomiesFromId( "31033", max_taxonomies_return ); + } + else if ( code.equals( "GIALA" ) ) { + return getTaxonomiesFromId( "5741", max_taxonomies_return ); + } + else if ( code.equals( "TRIVE" ) ) { + return getTaxonomiesFromId( "413071", max_taxonomies_return ); + } + else if ( code.equals( "CAPOWC" ) ) { + return getTaxonomiesFromId( "192875", max_taxonomies_return ); + } + else if ( code.equals( "SPHARC" ) ) { + return getTaxonomiesFromId( "667725", max_taxonomies_return ); + } + else if ( code.equals( "THETRA" ) ) { + return getTaxonomiesFromId( "529818", max_taxonomies_return ); + } + else if ( code.equals( "CHLVUL" ) ) { + return getTaxonomiesFromId( "574566", max_taxonomies_return ); + } + else if ( code.equals( "CITCLE" ) ) { + return getTaxonomiesFromId( "85681", max_taxonomies_return ); + } + else if ( code.equals( "MYCPOP" ) ) { + return getTaxonomiesFromId( "85929", max_taxonomies_return ); + } + else if ( code.equals( "AGABB" ) ) { + return getTaxonomiesFromId( "597362", max_taxonomies_return ); + } + else if ( code.equals( "BAUCOM" ) ) { + return getTaxonomiesFromId( "430998", max_taxonomies_return ); + } + else if ( code.equals( "DICSQU" ) ) { + return getTaxonomiesFromId( "114155", max_taxonomies_return ); + } + else if ( code.equals( "FOMPIN" ) ) { + return getTaxonomiesFromId( "40483", max_taxonomies_return ); + } + else if ( code.equals( "HYDMA" ) ) { + return getTaxonomiesFromId( "6085", max_taxonomies_return ); + } + else if ( code.equals( "MYCFI" ) ) { + return getTaxonomiesFromId( "83344", max_taxonomies_return ); + } + else if ( code.equals( "OIDMAI" ) ) { + return getTaxonomiesFromId( "78148", max_taxonomies_return ); + } + else if ( code.equals( "OSTRC" ) ) { + return getTaxonomiesFromId( "385169", max_taxonomies_return ); + } + else if ( code.equals( "POSPL" ) ) { + return getTaxonomiesFromId( "104341", max_taxonomies_return ); + } + else if ( code.equals( 
"SAICOM" ) ) { + return getTaxonomiesFromId( "5606", max_taxonomies_return ); + } + else if ( code.equals( "SERLA" ) ) { + return getTaxonomiesFromId( "85982", max_taxonomies_return ); + } + else if ( code.equals( "SPORO" ) ) { + return getTaxonomiesFromId( "40563", max_taxonomies_return ); + } + else if ( code.equals( "ACRALC" ) ) { + return getTaxonomiesFromId( "398408", max_taxonomies_return ); + } + else if ( code.equals( "THITER" ) ) { + return getTaxonomiesFromId( "35720", max_taxonomies_return ); + } + else if ( code.equals( "MYCTHE" ) ) { + return getTaxonomiesFromId( "78579", max_taxonomies_return ); + } + else if ( code.equals( "CONPUT" ) ) { + return getTaxonomiesFromId( "80637", max_taxonomies_return ); + } + else if ( code.equals( "WOLCOC" ) ) { + return getTaxonomiesFromId( "81056", max_taxonomies_return ); + } + else if ( code.equals( "CLAGRA" ) ) { + return getTaxonomiesFromId( "27339", max_taxonomies_return ); + } + else if ( code.equals( "XANPAR" ) ) { + return getTaxonomiesFromId( "107463", max_taxonomies_return ); + } + else if ( code.equals( "HYDPIN" ) ) { + return getTaxonomiesFromId( "388859", max_taxonomies_return ); + } + else if ( code.equals( "SERLAC" ) ) { + return getTaxonomiesFromId( "85982", max_taxonomies_return ); + } + else { + return null; + } + } + + private static List getTaxonomyStringFromCommonName( final String cn, final int max_lines_to_return ) + throws IOException { + return queryUniprot( "taxonomy/?query=common%3a%22" + encode( cn ) + "%22&format=tab", max_lines_to_return ); + } + + private static List getTaxonomyStringFromId( final String id, final int max_lines_to_return ) + throws IOException { + return queryUniprot( "taxonomy/?query=id%3a%22" + encode( id ) + "%22&format=tab", max_lines_to_return ); + } + + private static List getTaxonomyStringFromScientificName( final String sn, final int max_lines_to_return ) + throws IOException { + return queryUniprot( "taxonomy/?query=scientific%3a%22" + encode( sn ) + 
"%22&format=tab", max_lines_to_return ); + } + + private static List getTaxonomyStringFromTaxonomyCode( final String code, final int max_lines_to_return ) + throws IOException { + return queryUniprot( "taxonomy/?query=mnemonic%3a%22" + encode( code ) + "%22&format=tab", max_lines_to_return ); + } + + private static List uniProtTaxonomyToList( final UniProtTaxonomy tax ) { + final List l = new ArrayList(); + l.add( tax ); + return l; + } + + private static List parseUniProtTaxonomy( final List result ) throws IOException { + final List taxonomies = new ArrayList(); + for( final String line : result ) { + if ( ForesterUtil.isEmpty( line ) ) { + // Ignore empty lines. + } + else if ( line.startsWith( "Taxon" ) ) { + final String[] items = line.split( "\t" ); + if ( !( items[ 1 ].equalsIgnoreCase( "Mnemonic" ) && items[ 2 ].equalsIgnoreCase( "Scientific name" ) + && items[ 3 ].equalsIgnoreCase( "Common name" ) && items[ 4 ].equalsIgnoreCase( "Synonym" ) + && items[ 5 ].equalsIgnoreCase( "Other Names" ) && items[ 6 ].equalsIgnoreCase( "Reviewed" ) + && items[ 7 ].equalsIgnoreCase( "Rank" ) && items[ 8 ].equalsIgnoreCase( "Lineage" ) ) ) { + throw new IOException( "Unreconized UniProt Taxonomy format: " + line ); + } + } + else { + if ( line.split( "\t" ).length > 4 ) { + taxonomies.add( new UniProtTaxonomy( line ) ); + } + } + } + return taxonomies; + } + + public static List queryEmblDb( final Identifier id, final int max_lines_to_return ) throws IOException { + + StringBuilder url_sb = new StringBuilder(); + url_sb.append( BASE_EMBL_DB_URL ); + + if ( ForesterUtil.isEmpty( id.getProvider() ) || id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) { + + url_sb.append( SequenceDbWsTools.EMBL_DBS_EMBL ); + url_sb.append( '/'); + } + else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) { + if ( id.getValue().toUpperCase().indexOf( 'P' ) == 1 ) { + + url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_P ); + url_sb.append( '/'); + } + else { + + url_sb.append( 
SequenceDbWsTools.EMBL_DBS_REFSEQ_N ); + url_sb.append( '/'); + } + } + return queryDb( id.getValue(), max_lines_to_return, url_sb.toString() ); + } + + public static List queryUniprot( final String query, final int max_lines_to_return ) throws IOException { + return queryDb( query, max_lines_to_return, BASE_UNIPROT_URL ); + } + + public static List queryDb( final String query, int max_lines_to_return, final String base_url ) + throws IOException { + if ( ForesterUtil.isEmpty( query ) ) { + throw new IllegalArgumentException( "illegal attempt to use empty query " ); + } + if ( max_lines_to_return < 1 ) { + max_lines_to_return = 1; + } + final URL url = new URL( base_url + query ); + if ( DEBUG ) { + System.out.println( "url: " + url.toString() ); + } + final URLConnection urlc = url.openConnection(); + final BufferedReader in = new BufferedReader( new InputStreamReader( urlc.getInputStream() ) ); + String line; + final List result = new ArrayList(); + while ( ( line = in.readLine() ) != null ) { + if ( DEBUG ) { + System.out.println( line ); + } + result.add( line ); + if ( result.size() > max_lines_to_return ) { + break; + } + } + in.close(); + return result; + } + + public static SequenceDatabaseEntry obtainUniProtEntry( final String query, final int max_lines_to_return ) + throws IOException { + final List lines = queryUniprot( "uniprot/" + query + ".txt", max_lines_to_return ); + return UniProtEntry.createInstanceFromPlainText( lines ); + } + + public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Identifier id, final int max_lines_to_return ) + throws IOException { + final List lines = queryEmblDb( id, max_lines_to_return ); + return EbiDbEntry.createInstanceFromPlainTextForRefSeq( lines ); + } + + public static SequenceDatabaseEntry obtainEmblEntry( final Identifier id, final int max_lines_to_return ) + throws IOException { + final List lines = queryEmblDb( id , max_lines_to_return ); + return EbiDbEntry.createInstanceFromPlainText( lines ); + 
} +} diff --git a/forester/java/src/org/forester/ws/seqdb/UniProtEntry.java b/forester/java/src/org/forester/ws/seqdb/UniProtEntry.java new file mode 100644 index 0000000..fc85037 --- /dev/null +++ b/forester/java/src/org/forester/ws/seqdb/UniProtEntry.java @@ -0,0 +1,147 @@ +// $Id: +// forester -- software libraries and applications +// for genomics and evolutionary biology research. +// +// Copyright (C) 2010 Christian M Zmasek +// Copyright (C) 2010 Sanford-Burnham Medical Research Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.ws.seqdb; + +import java.util.List; + +import org.forester.util.ForesterUtil; + +public final class UniProtEntry implements SequenceDatabaseEntry { + + private String _ac; + private String _rec_name; + private String _os_scientific_name; + private String _tax_id; + private String _symbol; + + private UniProtEntry() { + } + + @Override + public Object clone() throws CloneNotSupportedException { + throw new CloneNotSupportedException(); + } + + public static SequenceDatabaseEntry createInstanceFromPlainText( final List lines ) { + final UniProtEntry e = new UniProtEntry(); + for( final String line : lines ) { + if ( line.startsWith( "AC" ) ) { + e.setAc( DatabaseTools.extract( line, "AC", ";" ) ); + } + else if ( line.startsWith( "DE" ) ) { + if ( ( line.indexOf( "RecName:" ) > 0 ) && ( line.indexOf( "Full=" ) > 0 ) ) { + e.setRecName( DatabaseTools.extract( line, "Full=", ";" ) ); + } + } + else if ( line.startsWith( "GN" ) ) { + if ( ( line.indexOf( "Name=" ) > 0 ) ) { + e.setSymbol( DatabaseTools.extract( line, "Name=", ";" ) ); + } + } + else if ( line.startsWith( "OS" ) ) { + if ( line.indexOf( "(" ) > 0 ) { + e.setOsScientificName( DatabaseTools.extract( line, "OS", "(" ) ); + } + else { + e.setOsScientificName( DatabaseTools.extract( line, "OS", "." 
) ); + } + } + else if ( line.startsWith( "OX" ) ) { + if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) { + e.setTaxId( DatabaseTools.extract( line, "NCBI_TaxID=", ";" ) ); + } + } + } + return e; + } + + @Override + public String getAccession() { + return _ac; + } + + private void setAc( final String ac ) { + if ( _ac == null ) { + _ac = ac; + } + } + + @Override + public String getSequenceName() { + return _rec_name; + } + + private void setRecName( final String rec_name ) { + if ( _rec_name == null ) { + _rec_name = rec_name; + } + } + + @Override + public String getTaxonomyScientificName() { + return _os_scientific_name; + } + + private void setOsScientificName( final String os_scientific_name ) { + if ( _os_scientific_name == null ) { + _os_scientific_name = os_scientific_name; + } + } + + @Override + public String getTaxonomyIdentifier() { + return _tax_id; + } + + private void setTaxId( final String tax_id ) { + if ( _tax_id == null ) { + _tax_id = tax_id; + } + } + + @Override + public String getSequenceSymbol() { + return _symbol; + } + + private void setSymbol( final String symbol ) { + if ( _symbol == null ) { + _symbol = symbol; + } + } + + @Override + public boolean isEmpty() { + return ( ForesterUtil.isEmpty( getAccession() ) && ForesterUtil.isEmpty( getSequenceName() ) + && ForesterUtil.isEmpty( getTaxonomyScientificName() ) + && ForesterUtil.isEmpty( getTaxonomyIdentifier() ) && ForesterUtil.isEmpty( getSequenceSymbol() ) ); + } + + @Override + public String getProvider() { + return "uniprot"; + } +} diff --git a/forester/java/src/org/forester/ws/seqdb/UniProtTaxonomy.java b/forester/java/src/org/forester/ws/seqdb/UniProtTaxonomy.java new file mode 100644 index 0000000..e9a2e88 --- /dev/null +++ b/forester/java/src/org/forester/ws/seqdb/UniProtTaxonomy.java @@ -0,0 +1,210 @@ +// $Id: +// forester -- software libraries and applications +// for genomics and evolutionary biology research. 
+// +// Copyright (C) 2010 Christian M Zmasek +// Copyright (C) 2010 Sanford-Burnham Medical Research Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.ws.seqdb; + +import java.util.ArrayList; +import java.util.List; + +import org.forester.util.ForesterUtil; + +public final class UniProtTaxonomy { + + private static final String ARCHAEA = "Archaea"; + private static final String BACTERIA = "Bacteria"; + private static final String EUKARYOTA = "Eukaryota"; + private final List _lineage; + private final String _code; + private final String _scientific_name; + private final String _common_name; + private final String _synonym; + private final String _rank; + private final String _id; + public final static String CELLULAR_ORGANISMS = "cellular organisms"; + public final static String VIRUSES = "Viruses"; + public final static UniProtTaxonomy DROSOPHILA_GENUS = new UniProtTaxonomy( new String[] { + CELLULAR_ORGANISMS, EUKARYOTA, "Metazoa", "Arthropoda", "Hexapoda", "Insecta", "Pterygota", "Neoptera", + "Endopterygota", "Diptera", "Brachycera", "Muscomorpha", "Ephydroidea", "Drosophilidae", "Drosophila" }, + "", + "fruit flies", + "Drosophila", + 
"", + "genus", + "7215" ); + public final static UniProtTaxonomy XENOPUS_GENUS = new UniProtTaxonomy( new String[] { + CELLULAR_ORGANISMS, EUKARYOTA, "Metazoa", "Chordata", "Craniata", "Vertebrata", "Euteleostomi", "Amphibia", + "Batrachia", "Anura", "Mesobatrachia", "Pipoidea", "Pipidae", "Xenopodinae", "Xenopus" }, + "", + "", + "Xenopus", + "", + "genus", + "8353" ); + public final static UniProtTaxonomy CAPITELLA_TELATA_SPECIES = new UniProtTaxonomy( new String[] { + CELLULAR_ORGANISMS, EUKARYOTA, "Metazoa", "Annelida", "Polychaeta", "Scolecida", "Capitellida", + "Capitellidae", "Capitella", "Capitella teleta" }, + "", + "", + "Capitella teleta", + "Capitella sp. I", + "species", + "283909" ); + + // public final static UniProtTaxonomy NUCLEARIIDAE_AND_FONTICULA = new UniProtTaxonomy( new String[] { + // CELLULAR_ORGANISMS, EUKARYOTA, "Nucleariidae and Fonticula group" }, "", "", "", "", "", "1001604" ); + public UniProtTaxonomy( final String line ) { + final String[] items = line.split( "\t" ); + if ( items.length < 5 ) { + throw new IllegalArgumentException( "cannot parse uniprot taxonomy from: " + line ); + } + _id = items[ 0 ].trim(); + _code = items[ 1 ].trim(); + _scientific_name = items[ 2 ].trim(); + _common_name = items[ 3 ].trim(); + _synonym = items[ 4 ].trim(); + if ( items.length > 6 ) { + _rank = items[ 7 ].trim(); + } + else { + _rank = ""; + } + String[] lin = null; + if ( items.length > 8 ) { + lin = items[ 8 ].split( "; " ); + } + _lineage = new ArrayList(); + if ( ( lin != null ) && ( lin.length > 0 ) ) { + final List temp = new ArrayList(); + for( final String t : lin ) { + if ( !ForesterUtil.isEmpty( t ) ) { + temp.add( t.trim() ); + } + } + for( int i = 0; i < temp.size(); ++i ) { + if ( ( i == 0 ) + && ( temp.get( i ).equalsIgnoreCase( EUKARYOTA ) || temp.get( i ).equalsIgnoreCase( BACTERIA ) || temp + .get( i ).equalsIgnoreCase( ARCHAEA ) ) ) { + _lineage.add( CELLULAR_ORGANISMS ); + } + _lineage.add( temp.get( i ) ); + } + } + if ( 
_lineage.isEmpty() + && ( _scientific_name.equalsIgnoreCase( EUKARYOTA ) || _scientific_name.equalsIgnoreCase( BACTERIA ) || _scientific_name + .equalsIgnoreCase( ARCHAEA ) ) ) { + _lineage.add( CELLULAR_ORGANISMS ); + } + _lineage.add( _scientific_name ); + if ( _lineage.isEmpty() ) { + throw new IllegalArgumentException( "lineage in a UniProt Taxonomy can not be empty\n: " + line ); + } + } + + public UniProtTaxonomy( final List lineage, + final String code, + final String common_name, + final String scientific_name, + final String synonym, + final String rank, + final String id ) { + _lineage = lineage; + _code = code; + _scientific_name = scientific_name; + _common_name = common_name; + _synonym = synonym; + _rank = rank; + _id = id; + if ( ( _lineage != null ) && !_lineage.get( _lineage.size() - 1 ).equalsIgnoreCase( _scientific_name ) ) { + _lineage.add( _scientific_name ); + } + } + + public UniProtTaxonomy( final String[] lineage, + final String code, + final String common_name, + final String scientific_name, + final String synonym, + final String rank, + final String id ) { + _lineage = new ArrayList(); + if ( lineage != null ) { + for( final String l : lineage ) { + _lineage.add( l ); + } + } + _code = code; + _scientific_name = scientific_name; + _common_name = common_name; + _synonym = synonym; + _rank = rank; + _id = id; + if ( ( _lineage != null ) && !_lineage.get( _lineage.size() - 1 ).equalsIgnoreCase( _scientific_name ) ) { + _lineage.add( _scientific_name ); + } + } + + /** + * Creates deep copy for all fields, except lineage. + * + * @return + */ + public UniProtTaxonomy copy() { + return new UniProtTaxonomy( getLineage(), + getCode() != null ? new String( getCode() ) : null, + getCommonName() != null ? new String( getCommonName() ) : null, + getScientificName() != null ? new String( getScientificName() ) : null, + getSynonym() != null ? new String( getSynonym() ) : null, + getRank() != null ? new String( getRank() ) : null, + getId() != null ? 
new String( getId() ) : null ); + } + + public String getCode() { + return _code; + } + + public String getCommonName() { + return _common_name; + } + + public String getId() { + return _id; + } + + public List getLineage() { + return _lineage; + } + + public String getRank() { + return _rank; + } + + public String getScientificName() { + return _scientific_name; + } + + public String getSynonym() { + return _synonym; + } +}