From f76e0750c0720a74ce64db38fcc8cb0d38dd8071 Mon Sep 17 00:00:00 2001
From: cmzmasek
Date: Wed, 6 Jun 2012 16:05:52 +0000
Subject: [PATCH] phylotastic hackathon at NESCENT 120606

---
 forester/java/src/org/forester/test/Test.java      |   42 ++++++++++++++------
 .../src/org/forester/util/SequenceIdParser.java    |   33 +++++++--------
 2 files changed, 44 insertions(+), 31 deletions(-)

diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java
index 90430aa..bbfa6dd 100644
--- a/forester/java/src/org/forester/test/Test.java
+++ b/forester/java/src/org/forester/test/Test.java
@@ -9016,18 +9016,36 @@ public final class Test {
                 return false;
             }
             //
-//            id = SequenceIdParser.parse( "mites|ref_XP_002434188_1" );
-//            if ( id == null
-//                    || ForesterUtil.isEmpty( id.getValue() )
-//                    || ForesterUtil.isEmpty( id.getProvider() )
-//                    || !id.getValue().equals( "002434188" )
-//                    || !id.getProvider().equals( "genbank" ) ) {
-//                if ( id != null ) {
-//                    System.out.println( "value =" + id.getValue() );
-//                    System.out.println( "provider=" + id.getProvider() );
-//                }
-//                return false;
-//            }
+            id = SequenceIdParser.parse( "mites|ref_XP_002434188_1" );
+            if ( id == null
+                    || ForesterUtil.isEmpty( id.getValue() )
+                    || ForesterUtil.isEmpty( id.getProvider() )
+                    || !id.getValue().equals( "XP_002434188" )
+                    || !id.getProvider().equals( "ncbi" ) ) {
+                if ( id != null ) {
+                    System.out.println( "value =" + id.getValue() );
+                    System.out.println( "provider=" + id.getProvider() );
+                }
+                return false;
+            }
+            //
+            id = SequenceIdParser.parse( "mites_ref_XP_002434188_1_bla_XP_12345" );
+            if ( id == null
+                    || ForesterUtil.isEmpty( id.getValue() )
+                    || ForesterUtil.isEmpty( id.getProvider() )
+                    || !id.getValue().equals( "XP_002434188" )
+                    || !id.getProvider().equals( "ncbi" ) ) {
+                if ( id != null ) {
+                    System.out.println( "value =" + id.getValue() );
+                    System.out.println( "provider=" + id.getProvider() );
+                }
+                return false;
+            }
+            //
+            id = SequenceIdParser.parse( "XP_12345" );
+            if ( id != null ) {
+                return false;
+            }
             // lcl_91970_unknown_
 
         }
diff --git a/forester/java/src/org/forester/util/SequenceIdParser.java b/forester/java/src/org/forester/util/SequenceIdParser.java
index 8def260..2f50f63 100644
--- a/forester/java/src/org/forester/util/SequenceIdParser.java
+++ b/forester/java/src/org/forester/util/SequenceIdParser.java
@@ -58,6 +58,14 @@ public final class SequenceIdParser {
             .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6})(?:[^a-zA-Z0-9]|\\Z)" );
     private final static Pattern GENBANK_PROTEIN_AC_PATTERN = Pattern
             .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5})(?:[^a-zA-Z0-9]|\\Z)" );
+
+    // RefSeq accession numbers can be distinguished from GenBank accessions
+    // by their distinct prefix format of 2 characters followed by an
+    // underscore character ('_'). For example, a RefSeq protein accession is NP_015325.
+    private final static Pattern REFSEQ_PATTERN = Pattern
+            .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}_\\d{6,})(?:[^a-zA-Z0-9]|\\Z)" );
+
+
 
     private final static boolean DEBUG = true;
 
@@ -80,9 +88,6 @@ public final class SequenceIdParser {
     /**
      * Returns null if no match.
      *
-     * @param query
-     * @param db
-     * @return
      */
     static public String parseGenbankAccessor( final String query ) {
         Matcher m = GENBANK_NUCLEOTIDE_AC_PATTERN_1.matcher( query );
@@ -106,26 +111,16 @@ public final class SequenceIdParser {
         }
     }
 
+    /**
+     * Returns null if no match.
+     *
+     */
     public final static String parseRefSeqAccessor( final String query ) {
-        Matcher m = GENBANK_NUCLEOTIDE_AC_PATTERN_1.matcher( query );
+        Matcher m = REFSEQ_PATTERN.matcher( query );
         if ( m.lookingAt() ) {
             return m.group( 1 );
         }
-        else {
-            m = GENBANK_NUCLEOTIDE_AC_PATTERN_2.matcher( query );
-            if ( m.lookingAt() ) {
-                return m.group( 1 );
-            }
-            else {
-                m = GENBANK_PROTEIN_AC_PATTERN.matcher( query );
-                if ( m.lookingAt() ) {
-                    return m.group( 1 );
-                }
-                else {
-                    return null;
-                }
-            }
-        }
+        return null;
     }
 
 
-- 
1.7.10.2