Extension of the UniProtKB accession number format
author cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Thu, 15 May 2014 00:09:51 +0000 (00:09 +0000)
committer cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Thu, 15 May 2014 00:09:51 +0000 (00:09 +0000)
forester/java/src/org/forester/test/Test.java
forester/java/src/org/forester/util/SequenceAccessionTools.java

index 5864f61..59150d5 100644 (file)
@@ -11322,7 +11322,6 @@ public final class Test {
                 }
                 return false;
             }
-            //
             id = SequenceAccessionTools.parseAccessorFromString( "segmented worms|gb_ADF31344" );
             if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
                     || !id.getValue().equals( "ADF31344" ) || !id.getSource().equals( "ncbi" ) ) {
@@ -11332,7 +11331,6 @@ public final class Test {
                 }
                 return false;
             }
-            //
             id = SequenceAccessionTools.parseAccessorFromString( "segmented worms gb_ADF31344 and more" );
             if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
                     || !id.getValue().equals( "ADF31344" ) || !id.getSource().equals( "ncbi" ) ) {
@@ -11342,7 +11340,6 @@ public final class Test {
                 }
                 return false;
             }
-            // 
             id = SequenceAccessionTools.parseAccessorFromString( "gb_AAA96518_1" );
             if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
                     || !id.getValue().equals( "AAA96518" ) || !id.getSource().equals( "ncbi" ) ) {
@@ -11352,7 +11349,6 @@ public final class Test {
                 }
                 return false;
             }
-            // 
             id = SequenceAccessionTools.parseAccessorFromString( "gb_EHB07727_1_rodents_" );
             if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
                     || !id.getValue().equals( "EHB07727" ) || !id.getSource().equals( "ncbi" ) ) {
@@ -11362,7 +11358,6 @@ public final class Test {
                 }
                 return false;
             }
-            // 
             id = SequenceAccessionTools.parseAccessorFromString( "dbj_BAF37827_1_turtles_" );
             if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
                     || !id.getValue().equals( "BAF37827" ) || !id.getSource().equals( "ncbi" ) ) {
@@ -11372,7 +11367,6 @@ public final class Test {
                 }
                 return false;
             }
-            // 
             id = SequenceAccessionTools.parseAccessorFromString( "emb_CAA73223_1_primates_" );
             if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
                     || !id.getValue().equals( "CAA73223" ) || !id.getSource().equals( "ncbi" ) ) {
@@ -11382,7 +11376,6 @@ public final class Test {
                 }
                 return false;
             }
-            // 
             id = SequenceAccessionTools.parseAccessorFromString( "mites|ref_XP_002434188_1" );
             if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
                     || !id.getValue().equals( "XP_002434188" ) || !id.getSource().equals( "refseq" ) ) {
@@ -11392,7 +11385,6 @@ public final class Test {
                 }
                 return false;
             }
-            // 
             id = SequenceAccessionTools.parseAccessorFromString( "mites_ref_XP_002434188_1_bla_XP_12345" );
             if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
                     || !id.getValue().equals( "XP_002434188" ) || !id.getSource().equals( "refseq" ) ) {
@@ -11402,7 +11394,6 @@ public final class Test {
                 }
                 return false;
             }
-            // 
             id = SequenceAccessionTools.parseAccessorFromString( "P4A123" );
             if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
                     || !id.getValue().equals( "P4A123" ) || !id.getSource().equals( "uniprot" ) ) {
@@ -11418,6 +11409,40 @@ public final class Test {
                 System.out.println( "provider=" + id.getSource() );
                 return false;
             }
+            //
+            id = SequenceAccessionTools.parseAccessorFromString( "N3B004Z009" );
+            if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
+                    || !id.getValue().equals( "N3B004Z009" ) || !id.getSource().equals( "uniprot" ) ) {
+                if ( id != null ) {
+                    System.out.println( "value   =" + id.getValue() );
+                    System.out.println( "provider=" + id.getSource() );
+                }
+                return false;
+            }
+            id = SequenceAccessionTools.parseAccessorFromString( "A4CAA4ZBB9" );
+            if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
+                    || !id.getValue().equals( "A4CAA4ZBB9" ) || !id.getSource().equals( "uniprot" ) ) {
+                if ( id != null ) {
+                    System.out.println( "value   =" + id.getValue() );
+                    System.out.println( "provider=" + id.getSource() );
+                }
+                return false;
+            }
+            id = SequenceAccessionTools.parseAccessorFromString( "ecoli_A4CAA4ZBB9_rt" );
+            if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
+                    || !id.getValue().equals( "A4CAA4ZBB9" ) || !id.getSource().equals( "uniprot" ) ) {
+                if ( id != null ) {
+                    System.out.println( "value   =" + id.getValue() );
+                    System.out.println( "provider=" + id.getSource() );
+                }
+                return false;
+            }
+            id = SequenceAccessionTools.parseAccessorFromString( "Q4CAA4ZBB9" );
+            if ( id != null ) {
+                System.out.println( "value   =" + id.getValue() );
+                System.out.println( "provider=" + id.getSource() );
+                return false;
+            }
         }
         catch ( final Exception e ) {
             e.printStackTrace( System.out );
index 5ab000e..3efcb2b 100644 (file)
@@ -38,38 +38,33 @@ import org.forester.phylogeny.data.Sequence;
 \r
 public final class SequenceAccessionTools {\r
 \r
-    // gb_ADF31344_1_segmented_worms_\r
-    // gb_AAA96518_1\r
-    // gb_EHB07727_1_rodents_\r
-    // dbj_BAF37827_1_turtles_\r
-    // emb_CAA73223_1_primates_\r
-    // lcl_91970_unknown_\r
-    // mites|ref_XP_002434188_1\r
-    // ref_XP_002434188_1_mites___ticks_\r
-    // ref_NP_001121530_1_frogs___toads_\r
     //The format for GenBank Accession numbers are:\r
     //Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals\r
     //Protein:    3 letters + 5 numerals\r
     //http://www.ncbi.nlm.nih.gov/Sequin/acc.html\r
-    public final static Pattern  GENBANK_NUC_PATTERN_1 = Pattern\r
-                                                               .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
-    public final static Pattern  GENBANK_NUC_PATTERN_2 = Pattern\r
-                                                               .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
-    public final static Pattern  GENBANK_PROT_PATTERN  = Pattern\r
-                                                               .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
-    public final static Pattern  GI_PATTERN            = Pattern.compile( "(?:\\b|_)(?:GI|gi)[|_=:](\\d+)(?:\\b|_)" );\r
-    public final static Pattern  UNIPROT_KB_PATTERN_0  = Pattern\r
-                                                               .compile( "(?:\\b|_)([A-Z][0-9][A-Z0-9]{3}[0-9])(?:\\b|_)" );\r
-    public final static Pattern  UNIPROT_KB_PATTERN_1  = Pattern\r
-                                                               .compile( "(?:\\b|_)(?:sp|tr)[\\.|\\-_=/\\\\]([A-Z][0-9][A-Z0-9]{3}[0-9])(?:\\b|_)" );\r
-    public final static Pattern  UNIPROT_KB_PATTERN_2  = Pattern\r
-                                                               .compile( "(?:\\b|_)(?:[A-Z0-9]{2,5}|(?:[A-Z][0-9][A-Z0-9]{3}[0-9]))_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA)(?:\\b|_)" );\r
-    public final static Pattern  ENSEMBL_PATTERN       = Pattern.compile( "(?:\\b|_)(ENS[A-Z]*[0-9]+)(?:\\b|_)" );\r
+    public final static Pattern  GENBANK_NUC_PATTERN_1       = Pattern\r
+                                                                     .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
+    public final static Pattern  GENBANK_NUC_PATTERN_2       = Pattern\r
+                                                                     .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
+    public final static Pattern  GENBANK_PROT_PATTERN        = Pattern\r
+                                                                     .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
+    public final static Pattern  GI_PATTERN                  = Pattern\r
+                                                                     .compile( "(?:\\b|_)(?:GI|gi)[|_=:](\\d+)(?:\\b|_)" );\r
+    public final static String   UNIPROT_KB_BASE_PATTERN_STR = "((?:[OPQ][0-9][A-Z0-9]{3}[0-9])|(?:[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}))";\r
+    public final static Pattern  UNIPROT_KB_PATTERN_0        = Pattern.compile( "(?:\\b|_)"\r
+                                                                     + UNIPROT_KB_BASE_PATTERN_STR + "(?:\\b|_)" );\r
+    public final static Pattern  UNIPROT_KB_PATTERN_1        = Pattern.compile( "(?:\\b|_)(?:sp|tr)[\\.|\\-_=/\\\\]"\r
+                                                                     + UNIPROT_KB_BASE_PATTERN_STR + "(?:\\b|_)" );\r
+    public final static Pattern  UNIPROT_KB_PATTERN_2        = Pattern\r
+                                                                     .compile( "(?:\\b|_)(?:[A-Z0-9]{2,5}|"\r
+                                                                             + UNIPROT_KB_BASE_PATTERN_STR\r
+                                                                             + ")_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA)(?:\\b|_)" );\r
+    public final static Pattern  ENSEMBL_PATTERN             = Pattern.compile( "(?:\\b|_)(ENS[A-Z]*[0-9]+)(?:\\b|_)" );\r
     // RefSeq accession numbers can be distinguished from GenBank accessions \r
     // by their distinct prefix format of 2 characters followed by an\r
     // underscore character ('_'). For example, a RefSeq protein accession is NP_015325. \r
-    private final static Pattern REFSEQ_PATTERN        = Pattern\r
-                                                               .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}_\\d{6,})(?:[^a-zA-Z0-9]|\\Z)" );\r
+    private final static Pattern REFSEQ_PATTERN              = Pattern\r
+                                                                     .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}_\\d{6,})(?:[^a-zA-Z0-9]|\\Z)" );\r
 \r
     private SequenceAccessionTools() {\r
         // Hiding the constructor.\r