From 34e555e6895d06fa5b7de6a646c37b0817efafc6 Mon Sep 17 00:00:00 2001 From: cmzmasek Date: Wed, 6 Jun 2012 15:24:01 +0000 Subject: [PATCH] phylotastic hackathon at NESCENT 120606 --- .../archaeopteryx/tools/SequenceDataRetriver.java | 7 ++- forester/java/src/org/forester/test/Test.java | 40 ++++++------ .../src/org/forester/util/SequenceIdParser.java | 65 ++++++++++++++++++-- .../src/org/forester/ws/uniprot/DatabaseTools.java | 44 +------------ 4 files changed, 85 insertions(+), 71 deletions(-) diff --git a/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java b/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java index 4db8175..21dcf1f 100644 --- a/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java +++ b/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java @@ -43,6 +43,7 @@ import org.forester.phylogeny.data.Sequence; import org.forester.phylogeny.data.Taxonomy; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.util.ForesterUtil; +import org.forester.util.SequenceIdParser; import org.forester.ws.uniprot.DatabaseTools; import org.forester.ws.uniprot.SequenceDatabaseEntry; import org.forester.ws.uniprot.UniProtWsTools; @@ -55,7 +56,7 @@ public final class SequenceDataRetriver extends RunnableProcess { private final static boolean DEBUG = false; private enum Db { - UNKNOWN, UNIPROT, EMBL; + UNKNOWN, UNIPROT, EMBL, NCBI; } public SequenceDataRetriver( final MainFrameApplication mf, final TreePanel treepanel, final Phylogeny phy ) { @@ -185,8 +186,8 @@ public final class SequenceDataRetriver extends RunnableProcess { if ( ( query = UniProtWsTools.parseUniProtAccessor( node.getName() ) ) != null ) { db = Db.UNIPROT; } - else if ( ( query = DatabaseTools.parseGenbankAccessor( node.getName() ) ) != null ) { - db = Db.EMBL; + else if ( ( query = SequenceIdParser.parseGenbankAccessor( node.getName() ) ) != null ) { + db = Db.NCBI; } } if ( !ForesterUtil.isEmpty( query ) ) { diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index b1b35ee..90430aa 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -7959,43 +7959,43 @@ public final class Test { //Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals //Protein: 3 letters + 5 numerals //http://www.ncbi.nlm.nih.gov/Sequin/acc.html - if ( !DatabaseTools.parseGenbankAccessor( "AY423861" ).equals( "AY423861" ) ) { + if ( !SequenceIdParser.parseGenbankAccessor( "AY423861" ).equals( "AY423861" ) ) { return false; } - if ( !DatabaseTools.parseGenbankAccessor( ".AY423861." ).equals( "AY423861" ) ) { + if ( !SequenceIdParser.parseGenbankAccessor( ".AY423861." ).equals( "AY423861" ) ) { return false; } - if ( DatabaseTools.parseGenbankAccessor( "AAY423861" ) != null ) { + if ( SequenceIdParser.parseGenbankAccessor( "AAY423861" ) != null ) { return false; } - if ( DatabaseTools.parseGenbankAccessor( "AY4238612" ) != null ) { + if ( SequenceIdParser.parseGenbankAccessor( "AY4238612" ) != null ) { return false; } - if ( DatabaseTools.parseGenbankAccessor( "AAY4238612" ) != null ) { + if ( SequenceIdParser.parseGenbankAccessor( "AAY4238612" ) != null ) { return false; } - if ( DatabaseTools.parseGenbankAccessor( "Y423861" ) != null ) { + if ( SequenceIdParser.parseGenbankAccessor( "Y423861" ) != null ) { return false; } - if ( !DatabaseTools.parseGenbankAccessor( "S12345" ).equals( "S12345" ) ) { + if ( !SequenceIdParser.parseGenbankAccessor( "S12345" ).equals( "S12345" ) ) { return false; } - if ( !DatabaseTools.parseGenbankAccessor( "|S12345|" ).equals( "S12345" ) ) { + if ( !SequenceIdParser.parseGenbankAccessor( "|S12345|" ).equals( "S12345" ) ) { return false; } - if ( DatabaseTools.parseGenbankAccessor( "|S123456" ) != null ) { + if ( SequenceIdParser.parseGenbankAccessor( "|S123456" ) != null ) { return false; } - if ( DatabaseTools.parseGenbankAccessor( "ABC123456" ) != null ) { + if ( SequenceIdParser.parseGenbankAccessor( "ABC123456" ) != null ) { return false; } - if ( !DatabaseTools.parseGenbankAccessor( "ABC12345" ).equals( "ABC12345" ) ) { + if ( !SequenceIdParser.parseGenbankAccessor( "ABC12345" ).equals( "ABC12345" ) ) { return false; } - if ( !DatabaseTools.parseGenbankAccessor( "&ABC12345&" ).equals( "ABC12345" ) ) { + if ( !SequenceIdParser.parseGenbankAccessor( "&ABC12345&" ).equals( "ABC12345" ) ) { return false; } - if ( DatabaseTools.parseGenbankAccessor( "ABCD12345" ) != null ) { + if ( SequenceIdParser.parseGenbankAccessor( "ABCD12345" ) != null ) { return false; } return true; @@ -8929,7 +8929,7 @@ public final class Test { || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() ) || !id.getValue().equals( "ADF31344" ) - || !id.getProvider().equals( "genbank" ) ) { + || !id.getProvider().equals( "ncbi" ) ) { if ( id != null ) { System.out.println( "value =" + id.getValue() ); System.out.println( "provider=" + id.getProvider() ); @@ -8942,7 +8942,7 @@ public final class Test { || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() ) || !id.getValue().equals( "ADF31344" ) - || !id.getProvider().equals( "genbank" ) ) { + || !id.getProvider().equals( "ncbi" ) ) { if ( id != null ) { System.out.println( "value =" + id.getValue() ); System.out.println( "provider=" + id.getProvider() ); @@ -8955,7 +8955,7 @@ public final class Test { || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() ) || !id.getValue().equals( "ADF31344" ) - || !id.getProvider().equals( "genbank" ) ) { + || !id.getProvider().equals( "ncbi" ) ) { if ( id != null ) { System.out.println( "value =" + id.getValue() ); System.out.println( "provider=" + id.getProvider() ); @@ -8969,7 +8969,7 @@ public final class Test { || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() ) || !id.getValue().equals( "AAA96518" ) - || !id.getProvider().equals( "genbank" ) ) { + || !id.getProvider().equals( "ncbi" ) ) { if ( id != null ) { System.out.println( "value =" + id.getValue() ); System.out.println( "provider=" + id.getProvider() ); @@ -8982,7 +8982,7 @@ public final class Test { || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() ) || !id.getValue().equals( "EHB07727" ) - || !id.getProvider().equals( "genbank" ) ) { + || !id.getProvider().equals( "ncbi" ) ) { if ( id != null ) { System.out.println( "value =" + id.getValue() ); System.out.println( "provider=" + id.getProvider() ); @@ -8995,7 +8995,7 @@ public final class Test { || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() ) || !id.getValue().equals( "BAF37827" ) - || !id.getProvider().equals( "genbank" ) ) { + || !id.getProvider().equals( "ncbi" ) ) { if ( id != null ) { System.out.println( "value =" + id.getValue() ); System.out.println( "provider=" + id.getProvider() ); @@ -9008,7 +9008,7 @@ public final class Test { || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() ) || !id.getValue().equals( "CAA73223" ) - || !id.getProvider().equals( "genbank" ) ) { + || !id.getProvider().equals( "ncbi" ) ) { if ( id != null ) { System.out.println( "value =" + id.getValue() ); System.out.println( "provider=" + id.getProvider() ); diff --git a/forester/java/src/org/forester/util/SequenceIdParser.java b/forester/java/src/org/forester/util/SequenceIdParser.java index 7486d8a..8def260 100644 --- a/forester/java/src/org/forester/util/SequenceIdParser.java +++ b/forester/java/src/org/forester/util/SequenceIdParser.java @@ -58,20 +58,75 @@ public final class SequenceIdParser { .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6})(?:[^a-zA-Z0-9]|\\Z)" ); private final static Pattern GENBANK_PROTEIN_AC_PATTERN = Pattern .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5})(?:[^a-zA-Z0-9]|\\Z)" ); - private final static boolean DEBUG = false; + private final static boolean DEBUG = true; - + /** + * Returns null if no match. + * + */ public final static Identifier parse( final String s ) { - String v = DatabaseTools.parseGenbankAccessor( s ); + String v = parseGenbankAccessor( s ); + if ( !ForesterUtil.isEmpty( v ) ) { + return new Identifier( v, "ncbi" ); + } + v = parseRefSeqAccessor( s ); if ( !ForesterUtil.isEmpty( v ) ) { - return new Identifier( v, "genbank" ); + return new Identifier( v, "ncbi" ); } - return null; } + /** + * Returns null if no match. + * + * @param query + * @param db + * @return + */ + static public String parseGenbankAccessor( final String query ) { + Matcher m = GENBANK_NUCLEOTIDE_AC_PATTERN_1.matcher( query ); + if ( m.lookingAt() ) { + return m.group( 1 ); + } + else { + m = GENBANK_NUCLEOTIDE_AC_PATTERN_2.matcher( query ); + if ( m.lookingAt() ) { + return m.group( 1 ); + } + else { + m = GENBANK_PROTEIN_AC_PATTERN.matcher( query ); + if ( m.lookingAt() ) { + return m.group( 1 ); + } + else { + return null; + } + } + } + } + public final static String parseRefSeqAccessor( final String query ) { + Matcher m = GENBANK_NUCLEOTIDE_AC_PATTERN_1.matcher( query ); + if ( m.lookingAt() ) { + return m.group( 1 ); + } + else { + m = GENBANK_NUCLEOTIDE_AC_PATTERN_2.matcher( query ); + if ( m.lookingAt() ) { + return m.group( 1 ); + } + else { + m = GENBANK_PROTEIN_AC_PATTERN.matcher( query ); + if ( m.lookingAt() ) { + return m.group( 1 ); + } + else { + return null; + } + } + } + } diff --git a/forester/java/src/org/forester/ws/uniprot/DatabaseTools.java b/forester/java/src/org/forester/ws/uniprot/DatabaseTools.java index 77b317d..d30830c 100644 --- a/forester/java/src/org/forester/ws/uniprot/DatabaseTools.java +++ b/forester/java/src/org/forester/ws/uniprot/DatabaseTools.java @@ -1,52 +1,10 @@ package org.forester.ws.uniprot; -import java.util.regex.Matcher; -import java.util.regex.Pattern; public class DatabaseTools { - //The format for GenBank Accession numbers are: - //Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals - //Protein: 3 letters + 5 numerals - //http://www.ncbi.nlm.nih.gov/Sequin/acc.html - private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_1 = Pattern - .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d{5})(?:[^a-zA-Z0-9]|\\Z)" ); - private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_2 = Pattern - .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6})(?:[^a-zA-Z0-9]|\\Z)" ); - private final static Pattern GENBANK_PROTEIN_AC_PATTERN = Pattern - .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5})(?:[^a-zA-Z0-9]|\\Z)" ); - private final static boolean DEBUG = false; - - /** - * Returns null if no match. - * - * @param query - * @param db - * @return - */ - static public String parseGenbankAccessor( final String query ) { - Matcher m = GENBANK_NUCLEOTIDE_AC_PATTERN_1.matcher( query ); - if ( m.lookingAt() ) { - return m.group( 1 ); - } - else { - m = GENBANK_NUCLEOTIDE_AC_PATTERN_2.matcher( query ); - if ( m.lookingAt() ) { - return m.group( 1 ); - } - else { - m = GENBANK_PROTEIN_AC_PATTERN.matcher( query ); - if ( m.lookingAt() ) { - return m.group( 1 ); - } - else { - return null; - } - } - } - } - + static String extract( final String target, final String a, final String b ) { final int i_a = target.indexOf( a ); final int i_b = target.indexOf( b ); -- 1.7.10.2