From 45f26280b2ab3b14a640c942bc92a8f6caab4519 Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Thu, 7 Mar 2013 00:21:44 +0000 Subject: [PATCH] inprogress --- .../src/org/forester/archaeopteryx/Constants.java | 2 +- forester/java/src/org/forester/test/Test.java | 39 -------- .../org/forester/ws/seqdb/SequenceDbWsTools.java | 102 ++++++++------------ .../src/org/forester/ws/seqdb/UniProtEntry.java | 29 ++---- 4 files changed, 49 insertions(+), 123 deletions(-) diff --git a/forester/java/src/org/forester/archaeopteryx/Constants.java b/forester/java/src/org/forester/archaeopteryx/Constants.java index bc90822..500ec78 100644 --- a/forester/java/src/org/forester/archaeopteryx/Constants.java +++ b/forester/java/src/org/forester/archaeopteryx/Constants.java @@ -42,7 +42,7 @@ public final class Constants { public final static boolean __SYNTH_LF = false; // TODO remove me public final static boolean ALLOW_DDBJ_BLAST = false; public final static String PRG_NAME = "Archaeopteryx"; - final static String VERSION = "0.9805 A1ST"; + final static String VERSION = "0.9805+ A1ST"; final static String PRG_DATE = "130306"; final static String DEFAULT_CONFIGURATION_FILE_NAME = "_aptx_configuration_file"; final static String[] DEFAULT_FONT_CHOICES = { "Verdana", "Tahoma", diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index afab717..4a3e95b 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -9416,45 +9416,6 @@ public final class Test { } private static boolean testUniprotEntryRetrieval() { - if ( !SequenceDbWsTools.parseUniProtAccessor( "P12345" ).equals( "P12345" ) ) { - return false; - } - if ( SequenceDbWsTools.parseUniProtAccessor( "EP12345" ) != null ) { - return false; - } - if ( SequenceDbWsTools.parseUniProtAccessor( "3 4P12345" ) != null ) { - return false; - } - if ( SequenceDbWsTools.parseUniProtAccessor( "P12345E" ) != null ) { - return false; - } - if ( SequenceDbWsTools.parseUniProtAccessor( "P123455" ) != null ) { - return false; - } - if ( SequenceDbWsTools.parseUniProtAccessor( "EP12345E" ) != null ) { - return false; - } - if ( SequenceDbWsTools.parseUniProtAccessor( "AY423861" ) != null ) { - return false; - } - if ( !SequenceDbWsTools.parseUniProtAccessor( "P1DDD5" ).equals( "P1DDD5" ) ) { - return false; - } - if ( SequenceDbWsTools.parseUniProtAccessor( "P1DDDD" ) != null ) { - return false; - } - if ( !SequenceDbWsTools.parseUniProtAccessor( "P1234X/P12345/12-42" ).equals( "P12345" ) ) { - return false; - } - if ( !SequenceDbWsTools.parseUniProtAccessor( "P1234X P12345 12-42" ).equals( "P12345" ) ) { - return false; - } - if ( !SequenceDbWsTools.parseUniProtAccessor( "P12345/12-42" ).equals( "P12345" ) ) { - return false; - } - if ( !SequenceDbWsTools.parseUniProtAccessor( "P1234X/P12345" ).equals( "P12345" ) ) { - return false; - } try { final SequenceDatabaseEntry entry = SequenceDbWsTools.obtainUniProtEntry( "P12345", 200 ); if ( !entry.getAccession().equals( "P12345" ) ) { diff --git a/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java b/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java index c2258d6..e3b222b 100644 --- a/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java +++ b/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java @@ -36,8 +36,6 @@ import java.util.ArrayList; import java.util.List; import java.util.SortedSet; import java.util.TreeSet; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyNode; @@ -51,20 +49,13 @@ import org.forester.util.SequenceIdParser; public final class SequenceDbWsTools { - private static final boolean ALLOW_TAXONOMY_CODE_HACKS = true; //TODO turn off for final realease! + private static final boolean ALLOW_TAXONOMY_CODE_HACKS = true; //TODO turn off for final realease! public final static String BASE_UNIPROT_URL = "http://www.uniprot.org/"; public final static String BASE_EMBL_DB_URL = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/"; public final static String EMBL_DBS_EMBL = "embl"; public final static String EMBL_DBS_REFSEQ_P = "refseqp"; public final static String EMBL_DBS_REFSEQ_N = "refseqn"; private final static String URL_ENC = "UTF-8"; - // uniprot/expasy accession number format (6 chars): - // letter digit letter-or-digit letter-or-digit letter-or-digit digit - // ?: => no back-reference - // \A => begin of String - // \Z => end of String - private final static Pattern UNIPROT_AC_PATTERN = Pattern - .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d[A-Z0-9]{3}\\d)(?:[^a-zA-Z0-9]|\\Z)" ); private final static boolean DEBUG = false; public static List getTaxonomiesFromCommonName( final String cn, final int max_taxonomies_return ) @@ -202,7 +193,7 @@ public final class SequenceDbWsTools { db = Db.EMBL; } else if ( !ForesterUtil.isEmpty( node.getName() ) ) { - if ( ( query = parseUniProtAccessor( node.getName() ) ) != null ) { + if ( ( query = ForesterUtil.extractUniProtKbProteinSeqIdentifier( node ) ) != null ) { db = Db.UNIPROT; } else if ( ( id = SequenceIdParser.parse( node.getName() ) ) != null ) { @@ -240,50 +231,50 @@ public final class SequenceDbWsTools { } else if ( ( db == Db.NCBI ) && ( id != null ) ) { db_entry = obtainEmblEntry( id, lines_to_return ); - if ( ( db_entry != null ) && !db_entry.isEmpty() ) { - final Sequence seq = node.getNodeData().isHasSequence() ? node.getNodeData().getSequence() - : new Sequence(); - if ( !ForesterUtil.isEmpty( db_entry.getAccession() ) ) { - String type = null; - if ( db == Db.EMBL ) { - type = "embl"; - } - else if ( db == Db.UNIPROT ) { - type = "uniprot"; - } - else if ( db == Db.NCBI ) { - type = "ncbi"; - } - else if ( db == Db.REFSEQ ) { - type = "refseq"; - } - seq.setAccession( new Accession( db_entry.getAccession(), type ) ); - } - if ( !ForesterUtil.isEmpty( db_entry.getSequenceName() ) ) { - seq.setName( db_entry.getSequenceName() ); + } + if ( ( db_entry != null ) && !db_entry.isEmpty() ) { + final Sequence seq = node.getNodeData().isHasSequence() ? node.getNodeData().getSequence() + : new Sequence(); + if ( !ForesterUtil.isEmpty( db_entry.getAccession() ) ) { + String type = null; + if ( db == Db.EMBL ) { + type = "embl"; } - if ( !ForesterUtil.isEmpty( db_entry.getSequenceSymbol() ) ) { - seq.setSymbol( db_entry.getSequenceSymbol() ); + else if ( db == Db.UNIPROT ) { + type = "uniprot"; } - final Taxonomy tax = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy() - : new Taxonomy(); - if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyScientificName() ) ) { - tax.setScientificName( db_entry.getTaxonomyScientificName() ); + else if ( db == Db.NCBI ) { + type = "ncbi"; } - if ( allow_to_set_taxonomic_data && !ForesterUtil.isEmpty( db_entry.getTaxonomyIdentifier() ) ) { - tax.setIdentifier( new Identifier( db_entry.getTaxonomyIdentifier(), "uniprot" ) ); + else if ( db == Db.REFSEQ ) { + type = "refseq"; } - node.getNodeData().setTaxonomy( tax ); - node.getNodeData().setSequence( seq ); + seq.setAccession( new Accession( db_entry.getAccession(), type ) ); } - else if ( db != Db.NONE ) { - not_found.add( node.getName() ); + if ( !ForesterUtil.isEmpty( db_entry.getSequenceName() ) ) { + seq.setName( db_entry.getSequenceName() ); } - try { - Thread.sleep( 10 );// Sleep for 10 ms + if ( !ForesterUtil.isEmpty( db_entry.getSequenceSymbol() ) ) { + seq.setSymbol( db_entry.getSequenceSymbol() ); } - catch ( final InterruptedException ie ) { + final Taxonomy tax = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy() + : new Taxonomy(); + if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyScientificName() ) ) { + tax.setScientificName( db_entry.getTaxonomyScientificName() ); } + if ( allow_to_set_taxonomic_data && !ForesterUtil.isEmpty( db_entry.getTaxonomyIdentifier() ) ) { + tax.setIdentifier( new Identifier( db_entry.getTaxonomyIdentifier(), "uniprot" ) ); + } + node.getNodeData().setTaxonomy( tax ); + node.getNodeData().setSequence( seq ); + } + else if ( db != Db.NONE ) { + not_found.add( node.getName() ); + } + try { + Thread.sleep( 10 );// Sleep for 10 ms + } + catch ( final InterruptedException ie ) { } } return not_found; @@ -295,23 +286,6 @@ public final class SequenceDbWsTools { return UniProtEntry.createInstanceFromPlainText( lines ); } - /** - * Returns null if no match. - * - * @param query - * @param db - * @return - */ - static public String parseUniProtAccessor( final String query ) { - final Matcher m = UNIPROT_AC_PATTERN.matcher( query ); - if ( m.lookingAt() ) { - return m.group( 1 ); - } - else { - return null; - } - } - public static List queryDb( final String query, int max_lines_to_return, final String base_url ) throws IOException { if ( ForesterUtil.isEmpty( query ) ) { diff --git a/forester/java/src/org/forester/ws/seqdb/UniProtEntry.java b/forester/java/src/org/forester/ws/seqdb/UniProtEntry.java index 263a115..05e2e59 100644 --- a/forester/java/src/org/forester/ws/seqdb/UniProtEntry.java +++ b/forester/java/src/org/forester/ws/seqdb/UniProtEntry.java @@ -32,10 +32,9 @@ import org.forester.util.ForesterUtil; public final class UniProtEntry implements SequenceDatabaseEntry { private String _ac; - private String _rec_name; + private String _name; private String _os_scientific_name; private String _tax_id; - private String _symbol; private UniProtEntry() { } @@ -51,14 +50,12 @@ public final class UniProtEntry implements SequenceDatabaseEntry { if ( line.startsWith( "AC" ) ) { e.setAc( DatabaseTools.extract( line, "AC", ";" ) ); } - else if ( line.startsWith( "DE" ) ) { + else if ( line.startsWith( "DE" ) && ForesterUtil.isEmpty( e.getSequenceName() ) ) { if ( ( line.indexOf( "RecName:" ) > 0 ) && ( line.indexOf( "Full=" ) > 0 ) ) { - e.setRecName( DatabaseTools.extract( line, "Full=", ";" ) ); + e.setSequenceName( DatabaseTools.extract( line, "Full=", ";" ) ); } - } - else if ( line.startsWith( "GN" ) ) { - if ( ( line.indexOf( "Name=" ) > 0 ) ) { - e.setSymbol( DatabaseTools.extract( line, "Name=", ";" ) ); + else if ( ( line.indexOf( "SubName:" ) > 0 ) && ( line.indexOf( "Full=" ) > 0 ) ) { + e.setSequenceName( DatabaseTools.extract( line, "Full=", ";" ) ); } } else if ( line.startsWith( "OS" ) ) { @@ -91,12 +88,12 @@ public final class UniProtEntry implements SequenceDatabaseEntry { @Override public String getSequenceName() { - return _rec_name; + return _name; } - private void setRecName( final String rec_name ) { - if ( _rec_name == null ) { - _rec_name = rec_name; + private void setSequenceName( final String name ) { + if ( _name == null ) { + _name = name; } } @@ -124,13 +121,7 @@ public final class UniProtEntry implements SequenceDatabaseEntry { @Override public String getSequenceSymbol() { - return _symbol; - } - - private void setSymbol( final String symbol ) { - if ( _symbol == null ) { - _symbol = symbol; - } + return ""; } @Override -- 1.7.10.2