From: cmzmasek@gmail.com Date: Fri, 30 Nov 2012 05:54:38 +0000 (+0000) Subject: duplicate.... X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=6685886e16a34e09daecad72d9a38e56c7e687d8;p=jalview.git duplicate.... --- diff --git a/forester/java/src/org/forester/ws/uniprot/DatabaseTools.java b/forester/java/src/org/forester/ws/uniprot/DatabaseTools.java deleted file mode 100644 index 77421fc..0000000 --- a/forester/java/src/org/forester/ws/uniprot/DatabaseTools.java +++ /dev/null @@ -1,20 +0,0 @@ - -package org.forester.ws.uniprot; - -public class DatabaseTools { - - static String extract( final String target, final String a, final String b ) { - final int i_a = target.indexOf( a ); - final int i_b = target.indexOf( b ); - if ( ( i_a < 0 ) || ( i_b < i_a ) ) { - throw new IllegalArgumentException( "attempt to extract from [" + target + "] between [" + a + "] and [" - + b + "]" ); - } - return target.substring( i_a + a.length(), i_b ).trim(); - } - - static String extract( final String target, final String a ) { - final int i_a = target.indexOf( a ); - return target.substring( i_a + a.length() ).trim(); - } -} diff --git a/forester/java/src/org/forester/ws/uniprot/EbiDbEntry.java b/forester/java/src/org/forester/ws/uniprot/EbiDbEntry.java deleted file mode 100644 index 59f1246..0000000 --- a/forester/java/src/org/forester/ws/uniprot/EbiDbEntry.java +++ /dev/null @@ -1,180 +0,0 @@ -// $Id: -// forester -- software libraries and applications -// for genomics and evolutionary biology research. -// -// Copyright (C) 2010 Christian M Zmasek -// Copyright (C) 2010 Sanford-Burnham Medical Research Institute -// All rights reserved -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA -// -// Contact: phylosoft @ gmail . com -// WWW: www.phylosoft.org/forester - -package org.forester.ws.uniprot; - -import java.util.List; - -import org.forester.util.ForesterUtil; - -public final class EbiDbEntry implements SequenceDatabaseEntry { - - //http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/emb/AAR37336/ - private String _pa; - private String _de; - private String _os; - private String _tax_id; - private String _symbol; - private String _provider; - - private EbiDbEntry() { - } - - @Override - public Object clone() throws CloneNotSupportedException { - throw new CloneNotSupportedException(); - } - - public static SequenceDatabaseEntry createInstanceFromPlainTextForRefSeq( final List lines ) { - final EbiDbEntry e = new EbiDbEntry(); - for( final String line : lines ) { - // System.out.println( "-" + line ); - if ( line.startsWith( "ACCESSION" ) ) { - e.setPA( DatabaseTools.extract( line, "ACCESSION" ) ); - } - else if ( line.startsWith( "DEFINITION" ) ) { - if ( line.indexOf( "[" ) > 0 ) { - e.setDe( DatabaseTools.extract( line, "DEFINITION", "[" ) ); - } - else { - e.setDe( DatabaseTools.extract( line, "DEFINITION" ) ); - } - } - else if ( line.startsWith( "SOURCE" ) ) { - if ( line.indexOf( "(" ) > 0 ) { - e.setOs( DatabaseTools.extract( line, "SOURCE", "(" ) ); - } - else { - e.setOs( DatabaseTools.extract( line, "SOURCE" ) ); - } - } - } - return e; - } - - public static SequenceDatabaseEntry createInstanceFromPlainText( final List lines ) { - final EbiDbEntry e = new EbiDbEntry(); - for( final String line : lines ) { - if ( line.startsWith( "PA" ) ) { - e.setPA( DatabaseTools.extract( line, "PA" ) ); - } - else if ( line.startsWith( "DE" ) ) { - // if ( ( line.indexOf( "RecName:" ) > 0 ) && ( line.indexOf( "Full=" ) > 0 ) ) { - e.setDe( DatabaseTools.extract( line, "DE" ) ); - //} - } - // else if ( line.startsWith( "GN" ) ) { - // if ( ( line.indexOf( "Name=" ) > 0 ) ) { - // e.setSymbol( extract( line, "Name=", ";" ) ); - // } - // } - else if ( line.startsWith( "OS" ) ) { - if ( line.indexOf( "(" ) > 0 ) { - e.setOs( DatabaseTools.extract( line, "OS", "(" ) ); - } - else { - e.setOs( DatabaseTools.extract( line, "OS" ) ); - } - } - else if ( line.startsWith( "OX" ) ) { - if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) { - e.setTaxId( DatabaseTools.extract( line, "NCBI_TaxID=", ";" ) ); - } - } - } - return e; - } - - @Override - public String getAccession() { - return _pa; - } - - private void setPA( final String pa ) { - if ( _pa == null ) { - _pa = pa; - } - } - - @Override - public String getSequenceName() { - return _de; - } - - private void setDe( final String rec_name ) { - if ( _de == null ) { - _de = rec_name; - } - } - - @Override - public String getTaxonomyScientificName() { - return _os; - } - - private void setOs( final String os ) { - if ( _os == null ) { - _os = os; - } - } - - @Override - public String getTaxonomyIdentifier() { - return _tax_id; - } - - private void setTaxId( final String tax_id ) { - if ( _tax_id == null ) { - _tax_id = tax_id; - } - } - - @Override - public String getSequenceSymbol() { - return _symbol; - } - - private void setSymbol( final String symbol ) { - if ( _symbol == null ) { - _symbol = symbol; - } - } - - @Override - public boolean isEmpty() { - return ( ForesterUtil.isEmpty( getAccession() ) && ForesterUtil.isEmpty( getSequenceName() ) - && ForesterUtil.isEmpty( getTaxonomyScientificName() ) - && ForesterUtil.isEmpty( getTaxonomyIdentifier() ) && ForesterUtil.isEmpty( getSequenceSymbol() ) ); - } - - @Override - public String getProvider() { - return _provider; - } - - public void setProvider( final String provider ) { - _provider = provider; - } -} diff --git a/forester/java/src/org/forester/ws/uniprot/SequenceDatabaseEntry.java b/forester/java/src/org/forester/ws/uniprot/SequenceDatabaseEntry.java deleted file mode 100644 index fe3d975..0000000 --- a/forester/java/src/org/forester/ws/uniprot/SequenceDatabaseEntry.java +++ /dev/null @@ -1,43 +0,0 @@ -// $Id: -// forester -- software libraries and applications -// for genomics and evolutionary biology research. -// -// Copyright (C) 2010 Christian M Zmasek -// Copyright (C) 2010 Sanford-Burnham Medical Research Institute -// All rights reserved -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA -// -// Contact: phylosoft @ gmail . com -// WWW: www.phylosoft.org/forester - -package org.forester.ws.uniprot; - -public interface SequenceDatabaseEntry { - - public boolean isEmpty(); - - public String getAccession(); - - public String getProvider(); - - public String getSequenceName(); - - public String getTaxonomyScientificName(); - - public String getTaxonomyIdentifier(); - - public String getSequenceSymbol(); -} \ No newline at end of file diff --git a/forester/java/src/org/forester/ws/uniprot/SequenceDbWsTools.java b/forester/java/src/org/forester/ws/uniprot/SequenceDbWsTools.java deleted file mode 100644 index 64faed9..0000000 --- a/forester/java/src/org/forester/ws/uniprot/SequenceDbWsTools.java +++ /dev/null @@ -1,399 +0,0 @@ -// $Id: -// forester -- software libraries and applications -// for genomics and evolutionary biology research. -// -// Copyright (C) 2010 Christian M Zmasek -// Copyright (C) 2010 Sanford-Burnham Medical Research Institute -// All rights reserved -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA -// -// Contact: phylosoft @ gmail . com -// WWW: www.phylosoft.org/forester - -package org.forester.ws.uniprot; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.UnsupportedEncodingException; -import java.net.URL; -import java.net.URLConnection; -import java.net.URLEncoder; -import java.util.ArrayList; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.forester.phylogeny.data.Identifier; -import org.forester.util.ForesterUtil; - -public final class SequenceDbWsTools { - - private static final boolean ALLOW_TAXONOMY_CODE_HACKS = true; //TODO turn off for final realease! - - public enum Db { - UNKNOWN, UNIPROT; - } - public final static String BASE_UNIPROT_URL = "http://www.uniprot.org/"; - public final static String BASE_EMBL_DB_URL = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/"; - public final static String EMBL_DBS_EMBL = "embl"; - public final static String EMBL_DBS_REFSEQ_P = "refseqp"; - public final static String EMBL_DBS_REFSEQ_N = "refseqn"; - private final static String URL_ENC = "UTF-8"; - // uniprot/expasy accession number format (6 chars): - // letter digit letter-or-digit letter-or-digit letter-or-digit digit - // ?: => no back-reference - // \A => begin of String - // \Z => end of String - private final static Pattern UNIPROT_AC_PATTERN = Pattern - .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d[A-Z0-9]{3}\\d)(?:[^a-zA-Z0-9]|\\Z)" ); - private final static boolean DEBUG = false; - - private static String encode( final String str ) throws UnsupportedEncodingException { - return URLEncoder.encode( str.trim(), URL_ENC ); - } - - /** - * Returns null if no match. - * - * @param query - * @param db - * @return - */ - static public String parseUniProtAccessor( final String query ) { - final Matcher m = UNIPROT_AC_PATTERN.matcher( query ); - if ( m.lookingAt() ) { - return m.group( 1 ); - } - else { - return null; - } - } - - public static List getTaxonomiesFromCommonName( final String cn, final int max_taxonomies_return ) - throws IOException { - final List result = getTaxonomyStringFromCommonName( cn, max_taxonomies_return ); - if ( result.size() > 0 ) { - return parseUniProtTaxonomy( result ); - } - return null; - } - - public static List getTaxonomiesFromCommonNameStrict( final String cn, - final int max_taxonomies_return ) - throws IOException { - final List taxonomies = getTaxonomiesFromCommonName( cn, max_taxonomies_return ); - if ( ( taxonomies != null ) && ( taxonomies.size() > 0 ) ) { - final List filtered_taxonomies = new ArrayList(); - for( final UniProtTaxonomy taxonomy : taxonomies ) { - if ( taxonomy.getCommonName().equalsIgnoreCase( cn ) ) { - filtered_taxonomies.add( taxonomy ); - } - } - return filtered_taxonomies; - } - return null; - } - - public static List getTaxonomiesFromId( final String id, final int max_taxonomies_return ) - throws IOException { - final List result = getTaxonomyStringFromId( id, max_taxonomies_return ); - if ( result.size() > 0 ) { - return parseUniProtTaxonomy( result ); - } - return null; - } - - public static List getTaxonomiesFromScientificName( final String sn, - final int max_taxonomies_return ) - throws IOException { - // Hack! Craniata? .. - if ( sn.equals( "Drosophila" ) ) { - return uniProtTaxonomyToList( UniProtTaxonomy.DROSOPHILA_GENUS ); - } - else if ( sn.equals( "Xenopus" ) ) { - return uniProtTaxonomyToList( UniProtTaxonomy.XENOPUS_GENUS ); - } - // else if ( sn.equals( "Nucleariidae and Fonticula group" ) ) { - // return hack( UniProtTaxonomy.NUCLEARIIDAE_AND_FONTICULA ); - // } - final List result = getTaxonomyStringFromScientificName( sn, max_taxonomies_return ); - if ( result.size() > 0 ) { - return parseUniProtTaxonomy( result ); - } - return null; - } - - /** - * Does not return "sub-types". - * For example, for "Mus musculus" only returns "Mus musculus" - * and not "Mus musculus", "Mus musculus bactrianus", ... - * - */ - public static List getTaxonomiesFromScientificNameStrict( final String sn, - final int max_taxonomies_return ) - throws IOException { - final List taxonomies = getTaxonomiesFromScientificName( sn, max_taxonomies_return ); - if ( ( taxonomies != null ) && ( taxonomies.size() > 0 ) ) { - final List filtered_taxonomies = new ArrayList(); - for( final UniProtTaxonomy taxonomy : taxonomies ) { - if ( taxonomy.getScientificName().equalsIgnoreCase( sn ) ) { - filtered_taxonomies.add( taxonomy ); - } - } - return filtered_taxonomies; - } - return null; - } - - public static List getTaxonomiesFromTaxonomyCode( final String code, - final int max_taxonomies_return ) - throws IOException { - final String my_code = new String( code ); - if ( ALLOW_TAXONOMY_CODE_HACKS ) { - final List l = resolveFakeTaxonomyCodes( max_taxonomies_return, my_code ); - if ( l != null ) { - return l; - } - } - final List result = getTaxonomyStringFromTaxonomyCode( my_code, max_taxonomies_return ); - if ( result.size() > 0 ) { - return parseUniProtTaxonomy( result ); - } - return null; - } - - private static List resolveFakeTaxonomyCodes( final int max_taxonomies_return, final String code ) - throws IOException { - if ( code.equals( "CAP" ) ) { - return getTaxonomiesFromId( "283909", max_taxonomies_return ); - } - else if ( code.equals( "FUGRU" ) ) { - return getTaxonomiesFromId( "31033", max_taxonomies_return ); - } - else if ( code.equals( "GIALA" ) ) { - return getTaxonomiesFromId( "5741", max_taxonomies_return ); - } - else if ( code.equals( "TRIVE" ) ) { - return getTaxonomiesFromId( "413071", max_taxonomies_return ); - } - else if ( code.equals( "CAPOWC" ) ) { - return getTaxonomiesFromId( "192875", max_taxonomies_return ); - } - else if ( code.equals( "SPHARC" ) ) { - return getTaxonomiesFromId( "667725", max_taxonomies_return ); - } - else if ( code.equals( "THETRA" ) ) { - return getTaxonomiesFromId( "529818", max_taxonomies_return ); - } - else if ( code.equals( "CHLVUL" ) ) { - return getTaxonomiesFromId( "574566", max_taxonomies_return ); - } - else if ( code.equals( "CITCLE" ) ) { - return getTaxonomiesFromId( "85681", max_taxonomies_return ); - } - else if ( code.equals( "MYCPOP" ) ) { - return getTaxonomiesFromId( "85929", max_taxonomies_return ); - } - else if ( code.equals( "AGABB" ) ) { - return getTaxonomiesFromId( "597362", max_taxonomies_return ); - } - else if ( code.equals( "BAUCOM" ) ) { - return getTaxonomiesFromId( "430998", max_taxonomies_return ); - } - else if ( code.equals( "DICSQU" ) ) { - return getTaxonomiesFromId( "114155", max_taxonomies_return ); - } - else if ( code.equals( "FOMPIN" ) ) { - return getTaxonomiesFromId( "40483", max_taxonomies_return ); - } - else if ( code.equals( "HYDMA" ) ) { - return getTaxonomiesFromId( "6085", max_taxonomies_return ); - } - else if ( code.equals( "MYCFI" ) ) { - return getTaxonomiesFromId( "83344", max_taxonomies_return ); - } - else if ( code.equals( "OIDMAI" ) ) { - return getTaxonomiesFromId( "78148", max_taxonomies_return ); - } - else if ( code.equals( "OSTRC" ) ) { - return getTaxonomiesFromId( "385169", max_taxonomies_return ); - } - else if ( code.equals( "POSPL" ) ) { - return getTaxonomiesFromId( "104341", max_taxonomies_return ); - } - else if ( code.equals( "SAICOM" ) ) { - return getTaxonomiesFromId( "5606", max_taxonomies_return ); - } - else if ( code.equals( "SERLA" ) ) { - return getTaxonomiesFromId( "85982", max_taxonomies_return ); - } - else if ( code.equals( "SPORO" ) ) { - return getTaxonomiesFromId( "40563", max_taxonomies_return ); - } - else if ( code.equals( "ACRALC" ) ) { - return getTaxonomiesFromId( "398408", max_taxonomies_return ); - } - else if ( code.equals( "THITER" ) ) { - return getTaxonomiesFromId( "35720", max_taxonomies_return ); - } - else if ( code.equals( "MYCTHE" ) ) { - return getTaxonomiesFromId( "78579", max_taxonomies_return ); - } - else if ( code.equals( "CONPUT" ) ) { - return getTaxonomiesFromId( "80637", max_taxonomies_return ); - } - else if ( code.equals( "WOLCOC" ) ) { - return getTaxonomiesFromId( "81056", max_taxonomies_return ); - } - else if ( code.equals( "CLAGRA" ) ) { - return getTaxonomiesFromId( "27339", max_taxonomies_return ); - } - else if ( code.equals( "XANPAR" ) ) { - return getTaxonomiesFromId( "107463", max_taxonomies_return ); - } - else if ( code.equals( "HYDPIN" ) ) { - return getTaxonomiesFromId( "388859", max_taxonomies_return ); - } - else if ( code.equals( "SERLAC" ) ) { - return getTaxonomiesFromId( "85982", max_taxonomies_return ); - } - else { - return null; - } - } - - private static List getTaxonomyStringFromCommonName( final String cn, final int max_lines_to_return ) - throws IOException { - return queryUniprot( "taxonomy/?query=common%3a%22" + encode( cn ) + "%22&format=tab", max_lines_to_return ); - } - - private static List getTaxonomyStringFromId( final String id, final int max_lines_to_return ) - throws IOException { - return queryUniprot( "taxonomy/?query=id%3a%22" + encode( id ) + "%22&format=tab", max_lines_to_return ); - } - - private static List getTaxonomyStringFromScientificName( final String sn, final int max_lines_to_return ) - throws IOException { - return queryUniprot( "taxonomy/?query=scientific%3a%22" + encode( sn ) + "%22&format=tab", max_lines_to_return ); - } - - private static List getTaxonomyStringFromTaxonomyCode( final String code, final int max_lines_to_return ) - throws IOException { - return queryUniprot( "taxonomy/?query=mnemonic%3a%22" + encode( code ) + "%22&format=tab", max_lines_to_return ); - } - - private static List uniProtTaxonomyToList( final UniProtTaxonomy tax ) { - final List l = new ArrayList(); - l.add( tax ); - return l; - } - - private static List parseUniProtTaxonomy( final List result ) throws IOException { - final List taxonomies = new ArrayList(); - for( final String line : result ) { - if ( ForesterUtil.isEmpty( line ) ) { - // Ignore empty lines. - } - else if ( line.startsWith( "Taxon" ) ) { - final String[] items = line.split( "\t" ); - if ( !( items[ 1 ].equalsIgnoreCase( "Mnemonic" ) && items[ 2 ].equalsIgnoreCase( "Scientific name" ) - && items[ 3 ].equalsIgnoreCase( "Common name" ) && items[ 4 ].equalsIgnoreCase( "Synonym" ) - && items[ 5 ].equalsIgnoreCase( "Other Names" ) && items[ 6 ].equalsIgnoreCase( "Reviewed" ) - && items[ 7 ].equalsIgnoreCase( "Rank" ) && items[ 8 ].equalsIgnoreCase( "Lineage" ) ) ) { - throw new IOException( "Unreconized UniProt Taxonomy format: " + line ); - } - } - else { - if ( line.split( "\t" ).length > 4 ) { - taxonomies.add( new UniProtTaxonomy( line ) ); - } - } - } - return taxonomies; - } - - public static List queryEmblDb( final Identifier id, final int max_lines_to_return ) throws IOException { - final StringBuilder url_sb = new StringBuilder(); - url_sb.append( BASE_EMBL_DB_URL ); - if ( ForesterUtil.isEmpty( id.getProvider() ) || id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) { - url_sb.append( SequenceDbWsTools.EMBL_DBS_EMBL ); - url_sb.append( '/' ); - } - else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) { - if ( id.getValue().toUpperCase().indexOf( 'P' ) == 1 ) { - url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_P ); - url_sb.append( '/' ); - } - else { - url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_N ); - url_sb.append( '/' ); - } - } - return queryDb( id.getValue(), max_lines_to_return, url_sb.toString() ); - } - - public static List queryUniprot( final String query, final int max_lines_to_return ) throws IOException { - return queryDb( query, max_lines_to_return, BASE_UNIPROT_URL ); - } - - public static List queryDb( final String query, int max_lines_to_return, final String base_url ) - throws IOException { - if ( ForesterUtil.isEmpty( query ) ) { - throw new IllegalArgumentException( "illegal attempt to use empty query " ); - } - if ( max_lines_to_return < 1 ) { - max_lines_to_return = 1; - } - final URL url = new URL( base_url + query ); - if ( DEBUG ) { - System.out.println( "url: " + url.toString() ); - } - final URLConnection urlc = url.openConnection(); - final BufferedReader in = new BufferedReader( new InputStreamReader( urlc.getInputStream() ) ); - String line; - final List result = new ArrayList(); - while ( ( line = in.readLine() ) != null ) { - if ( DEBUG ) { - System.out.println( line ); - } - result.add( line ); - if ( result.size() > max_lines_to_return ) { - break; - } - } - in.close(); - return result; - } - - public static SequenceDatabaseEntry obtainUniProtEntry( final String query, final int max_lines_to_return ) - throws IOException { - final List lines = queryUniprot( "uniprot/" + query + ".txt", max_lines_to_return ); - return UniProtEntry.createInstanceFromPlainText( lines ); - } - - public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Identifier id, final int max_lines_to_return ) - throws IOException { - final List lines = queryEmblDb( id, max_lines_to_return ); - return EbiDbEntry.createInstanceFromPlainTextForRefSeq( lines ); - } - - public static SequenceDatabaseEntry obtainEmblEntry( final Identifier id, final int max_lines_to_return ) - throws IOException { - final List lines = queryEmblDb( id, max_lines_to_return ); - return EbiDbEntry.createInstanceFromPlainText( lines ); - } -} diff --git a/forester/java/src/org/forester/ws/uniprot/UniProtEntry.java b/forester/java/src/org/forester/ws/uniprot/UniProtEntry.java deleted file mode 100644 index d5056e6..0000000 --- a/forester/java/src/org/forester/ws/uniprot/UniProtEntry.java +++ /dev/null @@ -1,147 +0,0 @@ -// $Id: -// forester -- software libraries and applications -// for genomics and evolutionary biology research. -// -// Copyright (C) 2010 Christian M Zmasek -// Copyright (C) 2010 Sanford-Burnham Medical Research Institute -// All rights reserved -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA -// -// Contact: phylosoft @ gmail . com -// WWW: www.phylosoft.org/forester - -package org.forester.ws.uniprot; - -import java.util.List; - -import org.forester.util.ForesterUtil; - -public final class UniProtEntry implements SequenceDatabaseEntry { - - private String _ac; - private String _rec_name; - private String _os_scientific_name; - private String _tax_id; - private String _symbol; - - private UniProtEntry() { - } - - @Override - public Object clone() throws CloneNotSupportedException { - throw new CloneNotSupportedException(); - } - - public static SequenceDatabaseEntry createInstanceFromPlainText( final List lines ) { - final UniProtEntry e = new UniProtEntry(); - for( final String line : lines ) { - if ( line.startsWith( "AC" ) ) { - e.setAc( DatabaseTools.extract( line, "AC", ";" ) ); - } - else if ( line.startsWith( "DE" ) ) { - if ( ( line.indexOf( "RecName:" ) > 0 ) && ( line.indexOf( "Full=" ) > 0 ) ) { - e.setRecName( DatabaseTools.extract( line, "Full=", ";" ) ); - } - } - else if ( line.startsWith( "GN" ) ) { - if ( ( line.indexOf( "Name=" ) > 0 ) ) { - e.setSymbol( DatabaseTools.extract( line, "Name=", ";" ) ); - } - } - else if ( line.startsWith( "OS" ) ) { - if ( line.indexOf( "(" ) > 0 ) { - e.setOsScientificName( DatabaseTools.extract( line, "OS", "(" ) ); - } - else { - e.setOsScientificName( DatabaseTools.extract( line, "OS", "." ) ); - } - } - else if ( line.startsWith( "OX" ) ) { - if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) { - e.setTaxId( DatabaseTools.extract( line, "NCBI_TaxID=", ";" ) ); - } - } - } - return e; - } - - @Override - public String getAccession() { - return _ac; - } - - private void setAc( final String ac ) { - if ( _ac == null ) { - _ac = ac; - } - } - - @Override - public String getSequenceName() { - return _rec_name; - } - - private void setRecName( final String rec_name ) { - if ( _rec_name == null ) { - _rec_name = rec_name; - } - } - - @Override - public String getTaxonomyScientificName() { - return _os_scientific_name; - } - - private void setOsScientificName( final String os_scientific_name ) { - if ( _os_scientific_name == null ) { - _os_scientific_name = os_scientific_name; - } - } - - @Override - public String getTaxonomyIdentifier() { - return _tax_id; - } - - private void setTaxId( final String tax_id ) { - if ( _tax_id == null ) { - _tax_id = tax_id; - } - } - - @Override - public String getSequenceSymbol() { - return _symbol; - } - - private void setSymbol( final String symbol ) { - if ( _symbol == null ) { - _symbol = symbol; - } - } - - @Override - public boolean isEmpty() { - return ( ForesterUtil.isEmpty( getAccession() ) && ForesterUtil.isEmpty( getSequenceName() ) - && ForesterUtil.isEmpty( getTaxonomyScientificName() ) - && ForesterUtil.isEmpty( getTaxonomyIdentifier() ) && ForesterUtil.isEmpty( getSequenceSymbol() ) ); - } - - @Override - public String getProvider() { - return "uniprot"; - } -} diff --git a/forester/java/src/org/forester/ws/uniprot/UniProtTaxonomy.java b/forester/java/src/org/forester/ws/uniprot/UniProtTaxonomy.java deleted file mode 100644 index 4f62f77..0000000 --- a/forester/java/src/org/forester/ws/uniprot/UniProtTaxonomy.java +++ /dev/null @@ -1,210 +0,0 @@ -// $Id: -// forester -- software libraries and applications -// for genomics and evolutionary biology research. -// -// Copyright (C) 2010 Christian M Zmasek -// Copyright (C) 2010 Sanford-Burnham Medical Research Institute -// All rights reserved -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA -// -// Contact: phylosoft @ gmail . com -// WWW: www.phylosoft.org/forester - -package org.forester.ws.uniprot; - -import java.util.ArrayList; -import java.util.List; - -import org.forester.util.ForesterUtil; - -public final class UniProtTaxonomy { - - private static final String ARCHAEA = "Archaea"; - private static final String BACTERIA = "Bacteria"; - private static final String EUKARYOTA = "Eukaryota"; - private final List _lineage; - private final String _code; - private final String _scientific_name; - private final String _common_name; - private final String _synonym; - private final String _rank; - private final String _id; - public final static String CELLULAR_ORGANISMS = "cellular organisms"; - public final static String VIRUSES = "Viruses"; - public final static UniProtTaxonomy DROSOPHILA_GENUS = new UniProtTaxonomy( new String[] { - CELLULAR_ORGANISMS, EUKARYOTA, "Metazoa", "Arthropoda", "Hexapoda", "Insecta", "Pterygota", "Neoptera", - "Endopterygota", "Diptera", "Brachycera", "Muscomorpha", "Ephydroidea", "Drosophilidae", "Drosophila" }, - "", - "fruit flies", - "Drosophila", - "", - "genus", - "7215" ); - public final static UniProtTaxonomy XENOPUS_GENUS = new UniProtTaxonomy( new String[] { - CELLULAR_ORGANISMS, EUKARYOTA, "Metazoa", "Chordata", "Craniata", "Vertebrata", "Euteleostomi", "Amphibia", - "Batrachia", "Anura", "Mesobatrachia", "Pipoidea", "Pipidae", "Xenopodinae", "Xenopus" }, - "", - "", - "Xenopus", - "", - "genus", - "8353" ); - public final static UniProtTaxonomy CAPITELLA_TELATA_SPECIES = new UniProtTaxonomy( new String[] { - CELLULAR_ORGANISMS, EUKARYOTA, "Metazoa", "Annelida", "Polychaeta", "Scolecida", "Capitellida", - "Capitellidae", "Capitella", "Capitella teleta" }, - "", - "", - "Capitella teleta", - "Capitella sp. I", - "species", - "283909" ); - - // public final static UniProtTaxonomy NUCLEARIIDAE_AND_FONTICULA = new UniProtTaxonomy( new String[] { - // CELLULAR_ORGANISMS, EUKARYOTA, "Nucleariidae and Fonticula group" }, "", "", "", "", "", "1001604" ); - public UniProtTaxonomy( final String line ) { - final String[] items = line.split( "\t" ); - if ( items.length < 5 ) { - throw new IllegalArgumentException( "cannot parse uniprot taxonomy from: " + line ); - } - _id = items[ 0 ].trim(); - _code = items[ 1 ].trim(); - _scientific_name = items[ 2 ].trim(); - _common_name = items[ 3 ].trim(); - _synonym = items[ 4 ].trim(); - if ( items.length > 6 ) { - _rank = items[ 7 ].trim(); - } - else { - _rank = ""; - } - String[] lin = null; - if ( items.length > 8 ) { - lin = items[ 8 ].split( "; " ); - } - _lineage = new ArrayList(); - if ( ( lin != null ) && ( lin.length > 0 ) ) { - final List temp = new ArrayList(); - for( final String t : lin ) { - if ( !ForesterUtil.isEmpty( t ) ) { - temp.add( t.trim() ); - } - } - for( int i = 0; i < temp.size(); ++i ) { - if ( ( i == 0 ) - && ( temp.get( i ).equalsIgnoreCase( EUKARYOTA ) || temp.get( i ).equalsIgnoreCase( BACTERIA ) || temp - .get( i ).equalsIgnoreCase( ARCHAEA ) ) ) { - _lineage.add( CELLULAR_ORGANISMS ); - } - _lineage.add( temp.get( i ) ); - } - } - if ( _lineage.isEmpty() - && ( _scientific_name.equalsIgnoreCase( EUKARYOTA ) || _scientific_name.equalsIgnoreCase( BACTERIA ) || _scientific_name - .equalsIgnoreCase( ARCHAEA ) ) ) { - _lineage.add( CELLULAR_ORGANISMS ); - } - _lineage.add( _scientific_name ); - if ( _lineage.isEmpty() ) { - throw new IllegalArgumentException( "lineage in a UniProt Taxonomy can not be empty\n: " + line ); - } - } - - public UniProtTaxonomy( final List lineage, - final String code, - final String common_name, - final String scientific_name, - final String synonym, - final String rank, - final String id ) { - _lineage = lineage; - _code = code; - _scientific_name = scientific_name; - _common_name = common_name; - _synonym = synonym; - _rank = rank; - _id = id; - if ( ( _lineage != null ) && !_lineage.get( _lineage.size() - 1 ).equalsIgnoreCase( _scientific_name ) ) { - _lineage.add( _scientific_name ); - } - } - - public UniProtTaxonomy( final String[] lineage, - final String code, - final String common_name, - final String scientific_name, - final String synonym, - final String rank, - final String id ) { - _lineage = new ArrayList(); - if ( lineage != null ) { - for( final String l : lineage ) { - _lineage.add( l ); - } - } - _code = code; - _scientific_name = scientific_name; - _common_name = common_name; - _synonym = synonym; - _rank = rank; - _id = id; - if ( ( _lineage != null ) && !_lineage.get( _lineage.size() - 1 ).equalsIgnoreCase( _scientific_name ) ) { - _lineage.add( _scientific_name ); - } - } - - /** - * Creates deep copy for all fields, except lineage. - * - * @return - */ - public UniProtTaxonomy copy() { - return new UniProtTaxonomy( getLineage(), - getCode() != null ? new String( getCode() ) : null, - getCommonName() != null ? new String( getCommonName() ) : null, - getScientificName() != null ? new String( getScientificName() ) : null, - getSynonym() != null ? new String( getSynonym() ) : null, - getRank() != null ? new String( getRank() ) : null, - getId() != null ? new String( getId() ) : null ); - } - - public String getCode() { - return _code; - } - - public String getCommonName() { - return _common_name; - } - - public String getId() { - return _id; - } - - public List getLineage() { - return _lineage; - } - - public String getRank() { - return _rank; - } - - public String getScientificName() { - return _scientific_name; - } - - public String getSynonym() { - return _synonym; - } -}