From c78e0c6ccc1b8c7f4e77db43be8d09e2d7c5b78e Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Fri, 22 Apr 2011 19:42:57 +0000 Subject: [PATCH] pattern match for up added.... --- .../archaeopteryx/tools/SequenceDataRetriver.java | 24 ++---- forester/java/src/org/forester/test/Test.java | 18 +++++ .../src/org/forester/ws/uniprot/UniProtEntry.java | 15 ---- .../org/forester/ws/uniprot/UniProtWsTools.java | 77 ++++++++++++-------- 4 files changed, 73 insertions(+), 61 deletions(-) diff --git a/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java b/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java index 7f0477e..0960425 100644 --- a/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java +++ b/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java @@ -30,7 +30,6 @@ import java.io.IOException; import java.net.UnknownHostException; import java.util.SortedSet; import java.util.TreeSet; -import java.util.regex.Pattern; import javax.swing.JOptionPane; @@ -49,13 +48,10 @@ import org.forester.ws.uniprot.UniProtWsTools; public final class SequenceDataRetriver implements Runnable { - // uniprot/expasy accession number format (6 chars): - // letter digit letter-or-digit letter-or-digit letter-or-digit digit - private final static Pattern UNIPROT_AC_PATTERN = Pattern.compile( "[A-NR-ZOPQ]\\d[A-Z0-9]{3}\\d" ); private final Phylogeny _phy; private final MainFrameApplication _mf; private final TreePanel _treepanel; - private final static boolean DEBUG = true; + private final static boolean DEBUG = true; private enum Db { UNKNOWN, UNIPROT; @@ -150,7 +146,7 @@ public final class SequenceDataRetriver implements Runnable { } } - synchronized public static SortedSet obtainSeqInformation( final Phylogeny phy ) throws IOException { + public static SortedSet obtainSeqInformation( final Phylogeny phy ) throws IOException { final SortedSet not_found = new TreeSet(); for( final 
PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) { final PhylogenyNode node = iter.next(); @@ -178,20 +174,14 @@ public final class SequenceDataRetriver implements Runnable { db = Db.UNIPROT; } else if ( !ForesterUtil.isEmpty( node.getName() ) ) { - query = node.getName(); + query = UniProtWsTools.parseUniProtAccessor( node.getName() ); + if ( !ForesterUtil.isEmpty( query ) ) { + db = Db.UNIPROT; + } } if ( !ForesterUtil.isEmpty( query ) ) { - if ( query.indexOf( '/' ) > 0 ) { - query = query.substring( 0, query.indexOf( '/' ) ); - } - if ( query.indexOf( '.' ) > 0 ) { - query = query.substring( 0, query.indexOf( '.' ) ); - } - if ( query.indexOf( '_' ) > 0 ) { - query = query.substring( 0, query.indexOf( '_' ) ); - } SequenceDatabaseEntry db_entry = null; - if ( ( db == Db.UNIPROT ) || UNIPROT_AC_PATTERN.matcher( query ).matches() ) { + if ( db == Db.UNIPROT ) { if ( DEBUG ) { System.out.println( "uniprot: " + query ); } diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index a17dbe7..dc436c0 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -7732,6 +7732,24 @@ public final class Test { } private static boolean testUniprotEntryRetrieval() { + if ( !UniProtWsTools.parseUniProtAccessor( "P12345" ).equals( "P12345" ) ) { + return false; + } + if ( !UniProtWsTools.parseUniProtAccessor( "P1DDD5" ).equals( "P1DDD5" ) ) { + return false; + } + if ( UniProtWsTools.parseUniProtAccessor( "P1DDDD" ) != null ) { + return false; + } + if ( !UniProtWsTools.parseUniProtAccessor( "P1234X/P12345/12-42" ).equals( "P12345" ) ) { + return false; + } + if ( !UniProtWsTools.parseUniProtAccessor( "P12345/12-42" ).equals( "P12345" ) ) { + return false; + } + if ( !UniProtWsTools.parseUniProtAccessor( "P1234X/P12345" ).equals( "P12345" ) ) { + return false; + } try { final SequenceDatabaseEntry entry = UniProtWsTools.obtainUniProtEntry( "P12345", 
200 ); if ( !entry.getAccession().equals( "P12345" ) ) { diff --git a/forester/java/src/org/forester/ws/uniprot/UniProtEntry.java b/forester/java/src/org/forester/ws/uniprot/UniProtEntry.java index 1ee4cf5..f74b1c7 100644 --- a/forester/java/src/org/forester/ws/uniprot/UniProtEntry.java +++ b/forester/java/src/org/forester/ws/uniprot/UniProtEntry.java @@ -81,9 +81,6 @@ public final class UniProtEntry implements SequenceDatabaseEntry { return target.substring( i_a + a.length(), i_b ).trim(); } - /* (non-Javadoc) - * @see org.forester.ws.uniprot.SequenceDatabaseEntry#getAc() - */ @Override public String getAccession() { return _ac; @@ -95,9 +92,6 @@ public final class UniProtEntry implements SequenceDatabaseEntry { } } - /* (non-Javadoc) - * @see org.forester.ws.uniprot.SequenceDatabaseEntry#getRecName() - */ @Override public String getSequenceName() { return _rec_name; @@ -109,9 +103,6 @@ public final class UniProtEntry implements SequenceDatabaseEntry { } } - /* (non-Javadoc) - * @see org.forester.ws.uniprot.SequenceDatabaseEntry#getOsScientificName() - */ @Override public String getTaxonomyScientificName() { return _os_scientific_name; @@ -123,9 +114,6 @@ public final class UniProtEntry implements SequenceDatabaseEntry { } } - /* (non-Javadoc) - * @see org.forester.ws.uniprot.SequenceDatabaseEntry#getTaxId() - */ @Override public String getTaxonomyIdentifier() { return _tax_id; @@ -137,9 +125,6 @@ public final class UniProtEntry implements SequenceDatabaseEntry { } } - /* (non-Javadoc) - * @see org.forester.ws.uniprot.SequenceDatabaseEntry#getSymbol() - */ @Override public String getSequenceSymbol() { return _symbol; diff --git a/forester/java/src/org/forester/ws/uniprot/UniProtWsTools.java b/forester/java/src/org/forester/ws/uniprot/UniProtWsTools.java index 8afb9eb..cd8ab43 100644 --- a/forester/java/src/org/forester/ws/uniprot/UniProtWsTools.java +++ b/forester/java/src/org/forester/ws/uniprot/UniProtWsTools.java @@ -34,21 +34,46 @@ import 
java.net.URLConnection; import java.net.URLEncoder; import java.util.ArrayList; import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.forester.util.ForesterUtil; public final class UniProtWsTools { - public final static String BASE_URL = "http://www.uniprot.org/"; - private final static String URL_ENC = "UTF-8"; - private final static boolean DEBUG = false; - - synchronized private static String encode( final String str ) throws UnsupportedEncodingException { + public enum Db { + UNKNOWN, UNIPROT; + } + public final static String BASE_URL = "http://www.uniprot.org/"; + private final static String URL_ENC = "UTF-8"; + // uniprot/expasy accession number format (6 chars): + // letter digit letter-or-digit letter-or-digit letter-or-digit digit + private final static Pattern UNIPROT_AC_PATTERN = Pattern + .compile( "^.*[^a-zA-Z0-9]?([A-NR-ZOPQ]\\d[A-Z0-9]{3}\\d)[^a-zA-Z0-9]?" ); + private final static boolean DEBUG = false; + + private static String encode( final String str ) throws UnsupportedEncodingException { return URLEncoder.encode( str.trim(), URL_ENC ); } - synchronized public static List getTaxonomiesFromCommonName( final String cn, - final int max_taxonomies_return ) + /** + * Return null if no match. 
+ * + * @param query text to search for a UniProt accession; must not be null + * @return the accession captured by UNIPROT_AC_PATTERN, + * or null if the pattern does not match + */ + static public String parseUniProtAccessor( final String query ) { + final Matcher m = UNIPROT_AC_PATTERN.matcher( query ); + if ( m.lookingAt() ) { + return m.group( 1 ); + } + else { + return null; + } + } + + public static List getTaxonomiesFromCommonName( final String cn, final int max_taxonomies_return ) throws IOException { final List result = getTaxonomyStringFromCommonName( cn, max_taxonomies_return ); if ( result.size() > 0 ) { @@ -57,8 +82,8 @@ public final class UniProtWsTools { return null; } - synchronized public static List getTaxonomiesFromCommonNameStrict( final String cn, - final int max_taxonomies_return ) + public static List getTaxonomiesFromCommonNameStrict( final String cn, + final int max_taxonomies_return ) throws IOException { final List taxonomies = getTaxonomiesFromCommonName( cn, max_taxonomies_return ); if ( ( taxonomies != null ) && ( taxonomies.size() > 0 ) ) { @@ -73,8 +98,7 @@ public final class UniProtWsTools { return null; } - synchronized public static List getTaxonomiesFromId( final String id, - final int max_taxonomies_return ) + public static List getTaxonomiesFromId( final String id, final int max_taxonomies_return ) throws IOException { final List result = getTaxonomyStringFromId( id, max_taxonomies_return ); if ( result.size() > 0 ) { @@ -83,8 +107,8 @@ public final class UniProtWsTools { return null; } - synchronized public static List getTaxonomiesFromScientificName( final String sn, - final int max_taxonomies_return ) + public static List getTaxonomiesFromScientificName( final String sn, + final int max_taxonomies_return ) throws IOException { // Hack! Craniata? .. if ( sn.equals( "Drosophila" ) ) { @@ -106,8 +130,8 @@ public final class UniProtWsTools { * and not "Mus musculus", "Mus musculus bactrianus", ... 
* */ - synchronized public static List getTaxonomiesFromScientificNameStrict( final String sn, - final int max_taxonomies_return ) + public static List getTaxonomiesFromScientificNameStrict( final String sn, + final int max_taxonomies_return ) throws IOException { final List taxonomies = getTaxonomiesFromScientificName( sn, max_taxonomies_return ); if ( ( taxonomies != null ) && ( taxonomies.size() > 0 ) ) { @@ -122,8 +146,8 @@ public final class UniProtWsTools { return null; } - synchronized public static List getTaxonomiesFromTaxonomyCode( final String code, - final int max_taxonomies_return ) + public static List getTaxonomiesFromTaxonomyCode( final String code, + final int max_taxonomies_return ) throws IOException { String my_code = new String( code ); // Hacks! @@ -140,37 +164,33 @@ public final class UniProtWsTools { return null; } - synchronized private static List getTaxonomyStringFromCommonName( final String cn, - final int max_lines_to_return ) + private static List getTaxonomyStringFromCommonName( final String cn, final int max_lines_to_return ) throws IOException { return queryUniprot( "taxonomy/?query=common%3a%22" + encode( cn ) + "%22&format=tab", max_lines_to_return ); } - synchronized private static List getTaxonomyStringFromId( final String id, final int max_lines_to_return ) + private static List getTaxonomyStringFromId( final String id, final int max_lines_to_return ) throws IOException { return queryUniprot( "taxonomy/?query=id%3a%22" + encode( id ) + "%22&format=tab", max_lines_to_return ); } - synchronized private static List getTaxonomyStringFromScientificName( final String sn, - final int max_lines_to_return ) + private static List getTaxonomyStringFromScientificName( final String sn, final int max_lines_to_return ) throws IOException { return queryUniprot( "taxonomy/?query=scientific%3a%22" + encode( sn ) + "%22&format=tab", max_lines_to_return ); } - synchronized private static List getTaxonomyStringFromTaxonomyCode( final String code, - 
final int max_lines_to_return ) + private static List getTaxonomyStringFromTaxonomyCode( final String code, final int max_lines_to_return ) throws IOException { return queryUniprot( "taxonomy/?query=mnemonic%3a%22" + encode( code ) + "%22&format=tab", max_lines_to_return ); } - synchronized private static List hack( final UniProtTaxonomy tax ) { + private static List hack( final UniProtTaxonomy tax ) { final List l = new ArrayList(); l.add( tax ); return l; } - synchronized private static List parseUniProtTaxonomy( final List result ) - throws IOException { + private static List parseUniProtTaxonomy( final List result ) throws IOException { final List taxonomies = new ArrayList(); for( final String line : result ) { if ( ForesterUtil.isEmpty( line ) ) { @@ -188,8 +208,7 @@ public final class UniProtWsTools { return taxonomies; } - synchronized public static List queryUniprot( final String query, int max_lines_to_return ) - throws IOException { + public static List queryUniprot( final String query, int max_lines_to_return ) throws IOException { if ( ForesterUtil.isEmpty( query ) ) { throw new IllegalArgumentException( "illegal attempt to use empty query " ); } -- 1.7.10.2