inprogress
author cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Thu, 3 Oct 2013 00:14:28 +0000 (00:14 +0000)
committer cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Thu, 3 Oct 2013 00:14:28 +0000 (00:14 +0000)
forester/java/src/org/forester/archaeopteryx/TreePanel.java
forester/java/src/org/forester/archaeopteryx/TreePanelUtil.java
forester/java/src/org/forester/archaeopteryx/tools/Blast.java
forester/java/src/org/forester/phylogeny/data/Accession.java
forester/java/src/org/forester/phylogeny/data/Identifier.java
forester/java/src/org/forester/surfacing/PrintableSpeciesSpecificDcData.java
forester/java/src/org/forester/test/Test.java
forester/java/src/org/forester/util/ForesterUtil.java
forester/java/src/org/forester/util/SequenceAccessionTools.java [new file with mode: 0644]
forester/java/src/org/forester/util/SequenceIdParser.java [deleted file]
forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java

index 6d793c8..ad59803 100644 (file)
@@ -106,6 +106,7 @@ import org.forester.phylogeny.Phylogeny;
 import org.forester.phylogeny.PhylogenyMethods;
 import org.forester.phylogeny.PhylogenyMethods.DESCENDANT_SORT_PRIORITY;
 import org.forester.phylogeny.PhylogenyNode;
+import org.forester.phylogeny.data.Accession;
 import org.forester.phylogeny.data.Annotation;
 import org.forester.phylogeny.data.BranchColor;
 import org.forester.phylogeny.data.Confidence;
@@ -127,7 +128,7 @@ import org.forester.util.BasicDescriptiveStatistics;
 import org.forester.util.DescriptiveStatistics;
 import org.forester.util.ForesterConstants;
 import org.forester.util.ForesterUtil;
-import org.forester.util.SequenceIdParser;
+import org.forester.util.SequenceAccessionTools;
 
 public final class TreePanel extends JPanel implements ActionListener, MouseWheelListener, Printable {
 
@@ -2272,7 +2273,7 @@ public final class TreePanel extends JPanel implements ActionListener, MouseWhee
                     }
                 }
                 if ( type == '?' ) {
-                    if ( SequenceIdParser.isProtein( query ) ) {
+                    if ( SequenceAccessionTools.isProtein( query ) ) {
                         type = 'p';
                     }
                     else {
@@ -2914,17 +2915,11 @@ public final class TreePanel extends JPanel implements ActionListener, MouseWhee
     }
 
     final private String isCanOpenSeqWeb( final PhylogenyNode node ) {
-        String v = ForesterUtil.extractUniProtKbProteinSeqIdentifier( node );
-        if ( ForesterUtil.isEmpty( v ) ) {
-            v = ForesterUtil.extractGenbankAccessor( node );
+        final Accession a = SequenceAccessionTools.parse( node );
+        if ( a != null ) {
+            return a.getValue();
         }
-        if ( ForesterUtil.isEmpty( v ) ) {
-            v = ForesterUtil.extractRefSeqAccessorAccessor( node );
-        }
-        if ( ForesterUtil.isEmpty( v ) ) {
-            v = ForesterUtil.extractGInumber( node );
-        }
-        return v;
+        return null;
     }
 
     final private boolean isCanOpenTaxWeb( final PhylogenyNode node ) {
index 7c1cbaf..3955c88 100644 (file)
@@ -31,7 +31,7 @@ import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
 import org.forester.phylogeny.iterators.PreorderTreeIterator;
 import org.forester.util.ForesterConstants;
 import org.forester.util.ForesterUtil;
-import org.forester.util.SequenceIdParser;
+import org.forester.util.SequenceAccessionTools;
 import org.forester.ws.seqdb.UniProtTaxonomy;
 
 public class TreePanelUtil {
@@ -80,7 +80,7 @@ public class TreePanelUtil {
                                                    final Configuration conf,
                                                    final TreePanel tp ) {
         String uri_str = null;
-        final String upkb = ForesterUtil.extractUniProtKbProteinSeqIdentifier( node );
+        final String upkb = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( node );
         if ( !ForesterUtil.isEmpty( upkb ) ) {
             try {
                 uri_str = ForesterUtil.UNIPROT_KB + URLEncoder.encode( upkb, ForesterConstants.UTF8 );
@@ -91,10 +91,10 @@ public class TreePanelUtil {
             }
         }
         if ( ForesterUtil.isEmpty( uri_str ) ) {
-            final String v = ForesterUtil.extractGenbankAccessor( node );
+            final String v = SequenceAccessionTools.extractGenbankAccessor( node );
             if ( !ForesterUtil.isEmpty( v ) ) {
                 try {
-                    if ( SequenceIdParser.isProtein( v ) ) {
+                    if ( SequenceAccessionTools.isProtein( v ) ) {
                         uri_str = ForesterUtil.NCBI_PROTEIN + URLEncoder.encode( v, ForesterConstants.UTF8 );
                     }
                     else {
@@ -108,10 +108,10 @@ public class TreePanelUtil {
             }
         }
         if ( ForesterUtil.isEmpty( uri_str ) ) {
-            final String v = ForesterUtil.extractRefSeqAccessorAccessor( node );
+            final String v = SequenceAccessionTools.extractRefSeqAccessor( node );
             if ( !ForesterUtil.isEmpty( v ) ) {
                 try {
-                    if ( SequenceIdParser.isProtein( v ) ) {
+                    if ( SequenceAccessionTools.isProtein( v ) ) {
                         uri_str = ForesterUtil.NCBI_PROTEIN + URLEncoder.encode( v, ForesterConstants.UTF8 );
                     }
                     else {
@@ -125,7 +125,7 @@ public class TreePanelUtil {
             }
         }
         if ( ForesterUtil.isEmpty( uri_str ) ) {
-            final String v = ForesterUtil.extractGInumber( node );
+            final String v = SequenceAccessionTools.extractGInumber( node );
             if ( !ForesterUtil.isEmpty( v ) ) {
                 try {
                     uri_str = ForesterUtil.NCBI_GI + URLEncoder.encode( v, ForesterConstants.UTF8 );
index 49e2841..bf3cb49 100644 (file)
@@ -38,9 +38,9 @@ import javax.swing.JApplet;
 import org.forester.archaeopteryx.AptxUtil;
 import org.forester.archaeopteryx.TreePanel;
 import org.forester.phylogeny.PhylogenyNode;
-import org.forester.phylogeny.data.Identifier;
+import org.forester.phylogeny.data.Accession;
 import org.forester.util.ForesterUtil;
-import org.forester.util.SequenceIdParser;
+import org.forester.util.SequenceAccessionTools;
 import org.forester.ws.wabi.RestUtil;
 
 public final class Blast {
@@ -81,34 +81,34 @@ public final class Blast {
             }
             if ( ForesterUtil.isEmpty( query ) && ( node.getNodeData().getSequence().getAccession() != null )
                     && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) ) {
-                final Identifier id = SequenceIdParser.parse( node.getNodeData().getSequence().getAccession()
+                final Accession id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getAccession()
                         .getValue() );
                 if ( id != null ) {
                     query = id.getValue();
                 }
             }
             if ( ForesterUtil.isEmpty( query ) && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getName() ) ) {
-                final Identifier id = SequenceIdParser.parse( node.getNodeData().getSequence().getName() );
+                final Accession id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getName() );
                 if ( id != null ) {
                     query = id.getValue();
                 }
             }
             if ( ForesterUtil.isEmpty( query ) && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getSymbol() ) ) {
-                final Identifier id = SequenceIdParser.parse( node.getNodeData().getSequence().getSymbol() );
+                final Accession id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getSymbol() );
                 if ( id != null ) {
                     query = id.getValue();
                 }
             }
             if ( ForesterUtil.isEmpty( query )
                     && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getGeneName() ) ) {
-                final Identifier id = SequenceIdParser.parse( node.getNodeData().getSequence().getGeneName() );
+                final Accession id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getGeneName() );
                 if ( id != null ) {
                     query = id.getValue();
                 }
             }
         }
         if ( ForesterUtil.isEmpty( query ) && !ForesterUtil.isEmpty( node.getName() ) ) {
-            final Identifier id = SequenceIdParser.parse( node.getName() );
+            final Accession id = SequenceAccessionTools.parse( node.getName() );
             if ( id != null ) {
                 query = id.getValue();
             }
index 5fb3afe..8d9739a 100644 (file)
@@ -34,10 +34,22 @@ import org.forester.util.ForesterUtil;
 
 public final class Accession implements PhylogenyData, Comparable<Accession> {
 
-    final private String _comment;
-    final private String _source;
-    final private String _source_value;
-    final private String _value;
+    final private String       _comment;
+    final private String       _source;
+    final private String       _source_value;
+    final private String       _value;
+    final public static String NCBI    = "ncbi";
+    final public static String REFSEQ  = "refseq";
+    final public static String UNIPROT = "uniprot";
+    final public static String GI      = "gi";
+    public static final String EMBL    = "embl";
+
+    public Accession( final String value ) {
+        _value = value;
+        _source = "";
+        _comment = "";
+        _source_value = value;
+    }
 
     public Accession( final String value, final String source ) {
         _value = value;
index 4c3e9b3..39997e6 100644 (file)
@@ -33,12 +33,9 @@ import org.forester.util.ForesterUtil;
 
 public final class Identifier implements PhylogenyData {
 
-    final public static String NCBI   = "ncbi";
-    final public static String REFSEQ = "refseq";
-    final public static String SP     = "sp";
-    final private String       _value;
-    final private String       _provider;
-    final private String       _value_provider;
+    final private String _value;
+    final private String _provider;
+    final private String _value_provider;
 
     public Identifier() {
         _value = "";
index 2cfbf0b..26cb433 100644 (file)
@@ -34,7 +34,7 @@ import java.util.TreeMap;
 import java.util.TreeSet;
 
 import org.forester.util.ForesterUtil;
-import org.forester.util.SequenceIdParser;
+import org.forester.util.SequenceAccessionTools;
 
 class PrintableSpeciesSpecificDcData implements SpeciesSpecificDcData {
 
@@ -168,19 +168,19 @@ class PrintableSpeciesSpecificDcData implements SpeciesSpecificDcData {
 
     private static String obtainSeqLink( final String p ) {
         String link;
-        final String up_id = ForesterUtil.extractUniProtKbProteinSeqIdentifier( p );
+        final String up_id = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( p );
         if ( !ForesterUtil.isEmpty( up_id ) ) {
             link = "<a class=\"pl\" href=\"" + ForesterUtil.UNIPROT_KB + up_id + "\" target=\"_up_window\">" + up_id
                     + "</a>";
         }
         else {
-            final String gb_id = SequenceIdParser.parseGenbankProteinAccessor( p );
+            final String gb_id = SequenceAccessionTools.parseGenbankProteinAccessor( p );
             if ( !ForesterUtil.isEmpty( gb_id ) ) {
                 link = "<a class=\"pl\" href=\"" + ForesterUtil.NCBI_PROTEIN + gb_id + "\" target=\"_up_window\">"
                         + gb_id + "</a>";
             }
             else {
-                final String gi = SequenceIdParser.parseGInumber( p );
+                final String gi = SequenceAccessionTools.parseGInumber( p );
                 if ( !ForesterUtil.isEmpty( gi ) ) {
                     link = "<a class=\"pl\" href=\"" + ForesterUtil.NCBI_GI + gi + "\" target=\"_up_window\">gi|" + gi
                             + "</a>";
index d1b94ed..d397f77 100644 (file)
@@ -114,7 +114,7 @@ import org.forester.util.DescriptiveStatistics;
 import org.forester.util.ForesterConstants;
 import org.forester.util.ForesterUtil;
 import org.forester.util.GeneralTable;
-import org.forester.util.SequenceIdParser;
+import org.forester.util.SequenceAccessionTools;
 import org.forester.ws.seqdb.SequenceDatabaseEntry;
 import org.forester.ws.seqdb.SequenceDbWsTools;
 import org.forester.ws.seqdb.UniProtTaxonomy;
@@ -2504,8 +2504,8 @@ public final class Test {
             if ( !TreePanelUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_PROTEIN + "AAA34956" ) ) {
                 return false;
             }
-            n.setName( "j40f4_Q06891.1_fndn2 fnr3" );
-            if ( !TreePanelUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_PROTEIN + "Q06891.1" ) ) {
+            n.setName( "AAA34956" );
+            if ( !TreePanelUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_PROTEIN + "AAA34956" ) ) {
                 return false;
             }
             n.setName( "GI:394892" );
@@ -2523,6 +2523,16 @@ public final class Test {
                 System.out.println( TreePanelUtil.createUriForSeqWeb( n, null, null ) );
                 return false;
             }
+            n.setName( "P12345" );
+            if ( !TreePanelUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.UNIPROT_KB + "P12345" ) ) {
+                System.out.println( TreePanelUtil.createUriForSeqWeb( n, null, null ) );
+                return false;
+            }
+            n.setName( "gi_fdgjmn-3jk5-243 mnefmn fg023-0 P12345 4395jtmnsrg02345m1ggi92450jrg890j4t0j240" );
+            if ( !TreePanelUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.UNIPROT_KB + "P12345" ) ) {
+                System.out.println( TreePanelUtil.createUriForSeqWeb( n, null, null ) );
+                return false;
+            }
         }
         catch ( final Exception e ) {
             e.printStackTrace( System.out );
@@ -3329,46 +3339,46 @@ public final class Test {
         //Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals
         //Protein:    3 letters + 5 numerals
         //http://www.ncbi.nlm.nih.gov/Sequin/acc.html
-        if ( !SequenceIdParser.parseGenbankAccessor( "AY423861" ).equals( "AY423861" ) ) {
+        if ( !SequenceAccessionTools.parseGenbankAccessor( "AY423861" ).equals( "AY423861" ) ) {
             return false;
         }
-        if ( !SequenceIdParser.parseGenbankAccessor( ".AY423861.2" ).equals( "AY423861.2" ) ) {
+        if ( !SequenceAccessionTools.parseGenbankAccessor( ".AY423861.2" ).equals( "AY423861.2" ) ) {
             return false;
         }
-        if ( !SequenceIdParser.parseGenbankAccessor( "345_.AY423861.24_345" ).equals( "AY423861.24" ) ) {
+        if ( !SequenceAccessionTools.parseGenbankAccessor( "345_.AY423861.24_345" ).equals( "AY423861.24" ) ) {
             return false;
         }
-        if ( SequenceIdParser.parseGenbankAccessor( "AAY423861" ) != null ) {
+        if ( SequenceAccessionTools.parseGenbankAccessor( "AAY423861" ) != null ) {
             return false;
         }
-        if ( SequenceIdParser.parseGenbankAccessor( "AY4238612" ) != null ) {
+        if ( SequenceAccessionTools.parseGenbankAccessor( "AY4238612" ) != null ) {
             return false;
         }
-        if ( SequenceIdParser.parseGenbankAccessor( "AAY4238612" ) != null ) {
+        if ( SequenceAccessionTools.parseGenbankAccessor( "AAY4238612" ) != null ) {
             return false;
         }
-        if ( SequenceIdParser.parseGenbankAccessor( "Y423861" ) != null ) {
+        if ( SequenceAccessionTools.parseGenbankAccessor( "Y423861" ) != null ) {
             return false;
         }
-        if ( !SequenceIdParser.parseGenbankAccessor( "S12345" ).equals( "S12345" ) ) {
+        if ( !SequenceAccessionTools.parseGenbankAccessor( "S12345" ).equals( "S12345" ) ) {
             return false;
         }
-        if ( !SequenceIdParser.parseGenbankAccessor( "|S12345|" ).equals( "S12345" ) ) {
+        if ( !SequenceAccessionTools.parseGenbankAccessor( "|S12345|" ).equals( "S12345" ) ) {
             return false;
         }
-        if ( SequenceIdParser.parseGenbankAccessor( "|S123456" ) != null ) {
+        if ( SequenceAccessionTools.parseGenbankAccessor( "|S123456" ) != null ) {
             return false;
         }
-        if ( SequenceIdParser.parseGenbankAccessor( "ABC123456" ) != null ) {
+        if ( SequenceAccessionTools.parseGenbankAccessor( "ABC123456" ) != null ) {
             return false;
         }
-        if ( !SequenceIdParser.parseGenbankAccessor( "ABC12345" ).equals( "ABC12345" ) ) {
+        if ( !SequenceAccessionTools.parseGenbankAccessor( "ABC12345" ).equals( "ABC12345" ) ) {
             return false;
         }
-        if ( !SequenceIdParser.parseGenbankAccessor( "&ABC12345&" ).equals( "ABC12345" ) ) {
+        if ( !SequenceAccessionTools.parseGenbankAccessor( "&ABC12345&" ).equals( "ABC12345" ) ) {
             return false;
         }
-        if ( SequenceIdParser.parseGenbankAccessor( "ABCD12345" ) != null ) {
+        if ( SequenceAccessionTools.parseGenbankAccessor( "ABCD12345" ) != null ) {
             return false;
         }
         return true;
@@ -3662,166 +3672,166 @@ public final class Test {
         try {
             PhylogenyNode n = new PhylogenyNode();
             n.setName( "tr|B3RJ64" );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( "tr.B3RJ64" );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( "tr=B3RJ64" );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( "tr-B3RJ64" );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( "tr/B3RJ64" );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( "tr\\B3RJ64" );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( "tr_B3RJ64" );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( " tr|B3RJ64 " );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( "-tr|B3RJ64-" );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( "-tr=B3RJ64-" );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( "_tr=B3RJ64_" );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( " tr_tr|B3RJ64_sp|123 " );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
-            n.setName( "sp|B3RJ64" );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            n.setName( "B3RJ64" );
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
-            n.setName( "ssp|B3RJ64" );
-            if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+            n.setName( "sp|B3RJ64" );
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( "sp|B3RJ64C" );
-            if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+            if ( SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
                 return false;
             }
             n.setName( "sp B3RJ64" );
-            if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( "sp|B3RJ6X" );
-            if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+            if ( SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
                 return false;
             }
             n.setName( "sp|B3RJ6" );
-            if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+            if ( SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
                 return false;
             }
             n.setName( "K1PYK7_CRAGI" );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
                 return false;
             }
             n.setName( "K1PYK7_PEA" );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PEA" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PEA" ) ) {
                 return false;
             }
             n.setName( "K1PYK7_RAT" );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_RAT" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_RAT" ) ) {
                 return false;
             }
             n.setName( "K1PYK7_PIG" );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PIG" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PIG" ) ) {
                 return false;
             }
             n.setName( "~K1PYK7_PIG~" );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PIG" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PIG" ) ) {
                 return false;
             }
             n.setName( "123456_ECOLI-K1PYK7_CRAGI-sp" );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
                 return false;
             }
             n.setName( "K1PYKX_CRAGI" );
-            if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+            if ( SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
                 return false;
             }
             n.setName( "XXXXX_CRAGI" );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "XXXXX_CRAGI" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "XXXXX_CRAGI" ) ) {
                 return false;
             }
             n.setName( "tr|H3IB65|H3IB65_STRPU~2-2" );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "H3IB65" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "H3IB65" ) ) {
                 return false;
             }
             n.setName( "jgi|Lacbi2|181470|Lacbi1.estExt_GeneWisePlus_human.C_10729~2-3" );
-            if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+            if ( SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
                 return false;
             }
             n.setName( "sp|Q86U06|RBM23_HUMAN~2-2" );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "Q86U06" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "Q86U06" ) ) {
                 return false;
             }
             n = new PhylogenyNode();
             org.forester.phylogeny.data.Sequence seq = new org.forester.phylogeny.data.Sequence();
             seq.setSymbol( "K1PYK7_CRAGI" );
             n.getNodeData().addSequence( seq );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
                 return false;
             }
             seq.setSymbol( "tr|B3RJ64" );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n = new PhylogenyNode();
             seq = new org.forester.phylogeny.data.Sequence();
             seq.setName( "K1PYK7_CRAGI" );
             n.getNodeData().addSequence( seq );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
                 return false;
             }
             seq.setName( "tr|B3RJ64" );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n = new PhylogenyNode();
             seq = new org.forester.phylogeny.data.Sequence();
             seq.setAccession( new Accession( "K1PYK8_CRAGI", "?" ) );
             n.getNodeData().addSequence( seq );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK8_CRAGI" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK8_CRAGI" ) ) {
                 return false;
             }
             n = new PhylogenyNode();
             seq = new org.forester.phylogeny.data.Sequence();
             seq.setAccession( new Accession( "tr|B3RJ64", "?" ) );
             n.getNodeData().addSequence( seq );
-            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             //
             n = new PhylogenyNode();
             n.setName( "ACP19736" );
-            if ( !ForesterUtil.extractGenbankAccessor( n ).equals( "ACP19736" ) ) {
+            if ( !SequenceAccessionTools.extractGenbankAccessor( n ).equals( "ACP19736" ) ) {
                 return false;
             }
             n = new PhylogenyNode();
             n.setName( "_ACP19736_" );
-            if ( !ForesterUtil.extractGenbankAccessor( n ).equals( "ACP19736" ) ) {
+            if ( !SequenceAccessionTools.extractGenbankAccessor( n ).equals( "ACP19736" ) ) {
                 return false;
             }
         }
@@ -9620,120 +9630,120 @@ public final class Test {
 
     private static boolean testSequenceIdParsing() {
         try {
-            Identifier id = SequenceIdParser.parse( "gb_ADF31344_segmented_worms_" );
-            if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() )
-                    || !id.getValue().equals( "ADF31344" ) || !id.getProvider().equals( "ncbi" ) ) {
+            Accession id = SequenceAccessionTools.parse( "gb_ADF31344_segmented_worms_" );
+            if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
+                    || !id.getValue().equals( "ADF31344" ) || !id.getSource().equals( "ncbi" ) ) {
                 if ( id != null ) {
                     System.out.println( "value   =" + id.getValue() );
-                    System.out.println( "provider=" + id.getProvider() );
+                    System.out.println( "provider=" + id.getSource() );
                 }
                 return false;
             }
             //
-            id = SequenceIdParser.parse( "segmented worms|gb_ADF31344" );
-            if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() )
-                    || !id.getValue().equals( "ADF31344" ) || !id.getProvider().equals( "ncbi" ) ) {
+            id = SequenceAccessionTools.parse( "segmented worms|gb_ADF31344" );
+            if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
+                    || !id.getValue().equals( "ADF31344" ) || !id.getSource().equals( "ncbi" ) ) {
                 if ( id != null ) {
                     System.out.println( "value   =" + id.getValue() );
-                    System.out.println( "provider=" + id.getProvider() );
+                    System.out.println( "provider=" + id.getSource() );
                 }
                 return false;
             }
             //
-            id = SequenceIdParser.parse( "segmented worms gb_ADF31344 and more" );
-            if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() )
-                    || !id.getValue().equals( "ADF31344" ) || !id.getProvider().equals( "ncbi" ) ) {
+            id = SequenceAccessionTools.parse( "segmented worms gb_ADF31344 and more" );
+            if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
+                    || !id.getValue().equals( "ADF31344" ) || !id.getSource().equals( "ncbi" ) ) {
                 if ( id != null ) {
                     System.out.println( "value   =" + id.getValue() );
-                    System.out.println( "provider=" + id.getProvider() );
+                    System.out.println( "provider=" + id.getSource() );
                 }
                 return false;
             }
             // 
-            id = SequenceIdParser.parse( "gb_AAA96518_1" );
-            if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() )
-                    || !id.getValue().equals( "AAA96518" ) || !id.getProvider().equals( "ncbi" ) ) {
+            id = SequenceAccessionTools.parse( "gb_AAA96518_1" );
+            if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
+                    || !id.getValue().equals( "AAA96518" ) || !id.getSource().equals( "ncbi" ) ) {
                 if ( id != null ) {
                     System.out.println( "value   =" + id.getValue() );
-                    System.out.println( "provider=" + id.getProvider() );
+                    System.out.println( "provider=" + id.getSource() );
                 }
                 return false;
             }
             // 
-            id = SequenceIdParser.parse( "gb_EHB07727_1_rodents_" );
-            if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() )
-                    || !id.getValue().equals( "EHB07727" ) || !id.getProvider().equals( "ncbi" ) ) {
+            id = SequenceAccessionTools.parse( "gb_EHB07727_1_rodents_" );
+            if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
+                    || !id.getValue().equals( "EHB07727" ) || !id.getSource().equals( "ncbi" ) ) {
                 if ( id != null ) {
                     System.out.println( "value   =" + id.getValue() );
-                    System.out.println( "provider=" + id.getProvider() );
+                    System.out.println( "provider=" + id.getSource() );
                 }
                 return false;
             }
             // 
-            id = SequenceIdParser.parse( "dbj_BAF37827_1_turtles_" );
-            if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() )
-                    || !id.getValue().equals( "BAF37827" ) || !id.getProvider().equals( "ncbi" ) ) {
+            id = SequenceAccessionTools.parse( "dbj_BAF37827_1_turtles_" );
+            if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
+                    || !id.getValue().equals( "BAF37827" ) || !id.getSource().equals( "ncbi" ) ) {
                 if ( id != null ) {
                     System.out.println( "value   =" + id.getValue() );
-                    System.out.println( "provider=" + id.getProvider() );
+                    System.out.println( "provider=" + id.getSource() );
                 }
                 return false;
             }
             // 
-            id = SequenceIdParser.parse( "emb_CAA73223_1_primates_" );
-            if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() )
-                    || !id.getValue().equals( "CAA73223" ) || !id.getProvider().equals( "ncbi" ) ) {
+            id = SequenceAccessionTools.parse( "emb_CAA73223_1_primates_" );
+            if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
+                    || !id.getValue().equals( "CAA73223" ) || !id.getSource().equals( "ncbi" ) ) {
                 if ( id != null ) {
                     System.out.println( "value   =" + id.getValue() );
-                    System.out.println( "provider=" + id.getProvider() );
+                    System.out.println( "provider=" + id.getSource() );
                 }
                 return false;
             }
             // 
-            id = SequenceIdParser.parse( "mites|ref_XP_002434188_1" );
-            if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() )
-                    || !id.getValue().equals( "XP_002434188" ) || !id.getProvider().equals( "refseq" ) ) {
+            id = SequenceAccessionTools.parse( "mites|ref_XP_002434188_1" );
+            if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
+                    || !id.getValue().equals( "XP_002434188" ) || !id.getSource().equals( "refseq" ) ) {
                 if ( id != null ) {
                     System.out.println( "value   =" + id.getValue() );
-                    System.out.println( "provider=" + id.getProvider() );
+                    System.out.println( "provider=" + id.getSource() );
                 }
                 return false;
             }
             // 
-            id = SequenceIdParser.parse( "mites_ref_XP_002434188_1_bla_XP_12345" );
-            if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() )
-                    || !id.getValue().equals( "XP_002434188" ) || !id.getProvider().equals( "refseq" ) ) {
+            id = SequenceAccessionTools.parse( "mites_ref_XP_002434188_1_bla_XP_12345" );
+            if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
+                    || !id.getValue().equals( "XP_002434188" ) || !id.getSource().equals( "refseq" ) ) {
                 if ( id != null ) {
                     System.out.println( "value   =" + id.getValue() );
-                    System.out.println( "provider=" + id.getProvider() );
+                    System.out.println( "provider=" + id.getSource() );
                 }
                 return false;
             }
             // 
-            id = SequenceIdParser.parse( "P4A123" );
-            if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() )
-                    || !id.getValue().equals( "P4A123" ) || !id.getProvider().equals( "sp" ) ) {
+            id = SequenceAccessionTools.parse( "P4A123" );
+            if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
+                    || !id.getValue().equals( "P4A123" ) || !id.getSource().equals( "uniprot" ) ) {
                 if ( id != null ) {
                     System.out.println( "value   =" + id.getValue() );
-                    System.out.println( "provider=" + id.getProvider() );
+                    System.out.println( "provider=" + id.getSource() );
                 }
                 return false;
             }
             // 
-            id = SequenceIdParser.parse( "pllf[pok P4A123_osdjfosnqo035-9233332904i000490 vf tmv x45" );
-            if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() )
-                    || !id.getValue().equals( "P4A123" ) || !id.getProvider().equals( "sp" ) ) {
-                if ( id != null ) {
-                    System.out.println( "value   =" + id.getValue() );
-                    System.out.println( "provider=" + id.getProvider() );
-                }
-                return false;
-            }
+            //            id = SequenceAccessionTools.parse( "pllf[pok P4A123_osdjfosnqo035-9233332904i000490 vf tmv x45" );
+            //            if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
+            //                    || !id.getValue().equals( "P4A123" ) || !id.getSource().equals( "sp" ) ) {
+            //                if ( id != null ) {
+            //                    System.out.println( "value   =" + id.getValue() );
+            //                    System.out.println( "provider=" + id.getSource() );
+            //                }
+            //                return false;
+            //            }
             // 
-            id = SequenceIdParser.parse( "XP_12345" );
+            id = SequenceAccessionTools.parse( "XP_12345" );
             if ( id != null ) {
                 System.out.println( "value   =" + id.getValue() );
-                System.out.println( "provider=" + id.getProvider() );
+                System.out.println( "provider=" + id.getSource() );
                 return false;
             }
             // lcl_91970_unknown_
index aed217b..43700ef 100644 (file)
@@ -87,10 +87,6 @@ public final class ForesterUtil {
     public static final String       NCBI_PROTEIN                     = "http://www.ncbi.nlm.nih.gov/protein/";
     public static final String       NCBI_NUCCORE                     = "http://www.ncbi.nlm.nih.gov/nuccore/";
     public final static String       UNIPROT_KB                       = "http://www.uniprot.org/uniprot/";
-    public final static Pattern      UNIPROT_KB_PATTERN_1             = Pattern
-                                                                              .compile( "(?:\\b|_)(?:sp|tr)[\\.|\\-_=/\\\\]([A-Z][0-9][A-Z0-9]{3}[0-9])(?:\\b|_)" );
-    public final static Pattern      UNIPROT_KB_PATTERN_2             = Pattern
-                                                                              .compile( "\\b(?:[A-Z0-9]{2,5}|(?:[A-Z][0-9][A-Z0-9]{3}[0-9]))_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA)\\b" );
     public static final String       NCBI_GI                          = "http://www.ncbi.nlm.nih.gov/protein/gi:";
     static {
         final DecimalFormatSymbols dfs = new DecimalFormatSymbols();
@@ -105,157 +101,6 @@ public final class ForesterUtil {
     private ForesterUtil() {
     }
 
-    public static String extractRefSeqAccessorAccessor( final PhylogenyNode node ) {
-        String v = null;
-        if ( node.getNodeData().isHasSequence() ) {
-            final Sequence seq = node.getNodeData().getSequence();
-            if ( !isEmpty( seq.getSymbol() ) ) {
-                v = SequenceIdParser.parseRefSeqAccessor( seq.getSymbol() );
-            }
-            if ( !isEmpty( seq.getGeneName() ) ) {
-                v = SequenceIdParser.parseRefSeqAccessor( seq.getGeneName() );
-            }
-            if ( isEmpty( v ) && !isEmpty( seq.getName() ) ) {
-                v = SequenceIdParser.parseRefSeqAccessor( seq.getName() );
-            }
-            if ( isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null )
-                    && !isEmpty( seq.getAccession().getValue() ) ) {
-                v = SequenceIdParser.parseRefSeqAccessor( seq.getAccession().getValue() );
-            }
-        }
-        if ( isEmpty( v ) && !isEmpty( node.getName() ) ) {
-            v = SequenceIdParser.parseRefSeqAccessor( node.getName() );
-        }
-        return v;
-    }
-
-    public static String extractGenbankAccessor( final PhylogenyNode node ) {
-        String v = null;
-        if ( node.getNodeData().isHasSequence() ) {
-            final Sequence seq = node.getNodeData().getSequence();
-            if ( !isEmpty( seq.getSymbol() ) ) {
-                v = SequenceIdParser.parseGenbankAccessor( seq.getSymbol() );
-            }
-            if ( !isEmpty( seq.getGeneName() ) ) {
-                v = SequenceIdParser.parseGenbankAccessor( seq.getGeneName() );
-            }
-            if ( isEmpty( v ) && !isEmpty( seq.getName() ) ) {
-                v = SequenceIdParser.parseGenbankAccessor( seq.getName() );
-            }
-            if ( isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null )
-                    && !isEmpty( seq.getAccession().getValue() ) ) {
-                v = SequenceIdParser.parseGenbankAccessor( seq.getAccession().getValue() );
-            }
-        }
-        if ( isEmpty( v ) && !isEmpty( node.getName() ) ) {
-            v = SequenceIdParser.parseGenbankAccessor( node.getName() );
-        }
-        return v;
-    }
-
-    public static String extractGInumber( final PhylogenyNode node ) {
-        String v = null;
-        if ( node.getNodeData().isHasSequence() ) {
-            final Sequence seq = node.getNodeData().getSequence();
-            if ( isEmpty( v ) && !isEmpty( seq.getName() ) ) {
-                v = SequenceIdParser.parseGInumber( seq.getName() );
-            }
-            if ( isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null )
-                    && !isEmpty( seq.getAccession().getValue() ) ) {
-                v = SequenceIdParser.parseGInumber( seq.getAccession().getValue() );
-            }
-        }
-        if ( isEmpty( v ) && !isEmpty( node.getName() ) ) {
-            v = SequenceIdParser.parseGInumber( node.getName() );
-        }
-        return v;
-    }
-
-    public static String extractUniProtKbProteinSeqIdentifier( final String str ) {
-        String upkb = null;
-        Matcher m = UNIPROT_KB_PATTERN_1.matcher( str );
-        if ( m.find() ) {
-            upkb = m.group( 1 );
-        }
-        else {
-            m = UNIPROT_KB_PATTERN_2.matcher( str );
-            if ( m.find() ) {
-                upkb = m.group();
-            }
-        }
-        return upkb;
-    }
-
-    public static String extractUniProtKbProteinSeqIdentifier( final PhylogenyNode node ) {
-        String upkb = null;
-        if ( node.getNodeData().isHasSequence() ) {
-            final Sequence seq = node.getNodeData().getSequence();
-            Matcher m;
-            if ( !isEmpty( seq.getSymbol() ) ) {
-                m = UNIPROT_KB_PATTERN_1.matcher( seq.getSymbol() );
-                if ( m.find() ) {
-                    upkb = m.group( 1 );
-                }
-                else {
-                    m = UNIPROT_KB_PATTERN_2.matcher( seq.getSymbol() );
-                    if ( m.find() ) {
-                        upkb = m.group();
-                    }
-                }
-            }
-            if ( isEmpty( upkb ) && !isEmpty( seq.getName() ) ) {
-                m = UNIPROT_KB_PATTERN_1.matcher( seq.getName() );
-                if ( m.find() ) {
-                    upkb = m.group( 1 );
-                }
-                else {
-                    m = UNIPROT_KB_PATTERN_2.matcher( seq.getName() );
-                    if ( m.find() ) {
-                        upkb = m.group();
-                    }
-                }
-            }
-            if ( isEmpty( upkb ) && !isEmpty( seq.getGeneName() ) ) {
-                m = UNIPROT_KB_PATTERN_1.matcher( seq.getGeneName() );
-                if ( m.find() ) {
-                    upkb = m.group( 1 );
-                }
-                else {
-                    m = UNIPROT_KB_PATTERN_2.matcher( seq.getGeneName() );
-                    if ( m.find() ) {
-                        upkb = m.group();
-                    }
-                }
-            }
-            if ( isEmpty( upkb ) && ( node.getNodeData().getSequence().getAccession() != null )
-                    && !isEmpty( seq.getAccession().getValue() ) ) {
-                m = UNIPROT_KB_PATTERN_1.matcher( seq.getAccession().getValue() );
-                if ( m.find() ) {
-                    upkb = m.group( 1 );
-                }
-                else {
-                    m = UNIPROT_KB_PATTERN_2.matcher( seq.getAccession().getValue() );
-                    if ( m.find() ) {
-                        upkb = m.group();
-                    }
-                }
-            }
-        }
-        if ( isEmpty( upkb ) && !isEmpty( node.getName() ) ) {
-            final Matcher m1 = UNIPROT_KB_PATTERN_1.matcher( node.getName() );
-            if ( m1.find() ) {
-                upkb = m1.group( 1 );
-            }
-            else {
-                final Matcher m2 = UNIPROT_KB_PATTERN_2.matcher( node.getName() );
-                if ( m2.find() ) {
-                    upkb = m2.group();
-                }
-            }
-        }
-        return upkb;
-    }
-
     final public static void appendSeparatorIfNotEmpty( final StringBuffer sb, final char separator ) {
         if ( sb.length() > 0 ) {
             sb.append( separator );
diff --git a/forester/java/src/org/forester/util/SequenceAccessionTools.java b/forester/java/src/org/forester/util/SequenceAccessionTools.java
new file mode 100644 (file)
index 0000000..4136049
--- /dev/null
@@ -0,0 +1,322 @@
+// $Id:\r
+// FORESTER -- software libraries and applications\r
+// for evolutionary biology research and applications.\r
+//\r
+// Copyright (C) 2008-2009 Christian M. Zmasek\r
+// Copyright (C) 2008-2009 Burnham Institute for Medical Research\r
+// Copyright (C) 2000-2001 Washington University School of Medicine\r
+// and Howard Hughes Medical Institute\r
+// Copyright (C) 2003-2007 Ethalinda K.S. Cannon\r
+// All rights reserved\r
+//\r
+// This library is free software; you can redistribute it and/or\r
+// modify it under the terms of the GNU Lesser General Public\r
+// License as published by the Free Software Foundation; either\r
+// version 2.1 of the License, or (at your option) any later version.\r
+//\r
+// This library is distributed in the hope that it will be useful,\r
+// but WITHOUT ANY WARRANTY; without even the implied warranty of\r
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\r
+// Lesser General Public License for more details.\r
+//\r
+// You should have received a copy of the GNU Lesser General Public\r
+// License along with this library; if not, write to the Free Software\r
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA\r
+//\r
+// Contact: phylosoft @ gmail . com\r
+// WWW: https://sites.google.com/site/cmzmasek/home/software/forester\r
+\r
+package org.forester.util;\r
+\r
+import java.util.regex.Matcher;\r
+import java.util.regex.Pattern;\r
+\r
+import org.forester.phylogeny.PhylogenyNode;\r
+import org.forester.phylogeny.data.Accession;\r
+import org.forester.phylogeny.data.Sequence;\r
+\r
+public final class SequenceAccessionTools {\r
+\r
+    public final static Pattern  UNIPROT_KB_PATTERN_0            = Pattern\r
+                                                                         .compile( "\\b([A-Z][0-9][A-Z0-9]{3}[0-9])\\b" );\r
+    public final static Pattern  UNIPROT_KB_PATTERN_1            = Pattern\r
+                                                                         .compile( "(?:\\b|_)(?:sp|tr)[\\.|\\-_=/\\\\]([A-Z][0-9][A-Z0-9]{3}[0-9])(?:\\b|_)" );\r
+    public final static Pattern  UNIPROT_KB_PATTERN_2            = Pattern\r
+                                                                         .compile( "(?:\\b|_)(?:[A-Z0-9]{2,5}|(?:[A-Z][0-9][A-Z0-9]{3}[0-9]))_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA)(?:\\b|_)" );\r
+    // gb_ADF31344_1_segmented_worms_\r
+    // gb_AAA96518_1\r
+    // gb_EHB07727_1_rodents_\r
+    // dbj_BAF37827_1_turtles_\r
+    // emb_CAA73223_1_primates_\r
+    // lcl_91970_unknown_\r
+    // mites|ref_XP_002434188_1\r
+    // ref_XP_002434188_1_mites___ticks_\r
+    // ref_NP_001121530_1_frogs___toads_\r
+    //The formats for GenBank Accession numbers are:\r
+    //Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals\r
+    //Protein:    3 letters + 5 numerals\r
+    //http://www.ncbi.nlm.nih.gov/Sequin/acc.html\r
+    private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_1 = Pattern\r
+                                                                         .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
+    private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_2 = Pattern\r
+                                                                         .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
+    private final static Pattern GENBANK_PROTEIN_AC_PATTERN      = Pattern\r
+                                                                         .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
+    private final static Pattern GI_PATTERN                      = Pattern\r
+                                                                         .compile( "(?:\\b|_)(?:GI|gi)[|_=:](\\d+)(?:\\b|_)" );\r
+    // RefSeq accession numbers can be distinguished from GenBank accessions \r
+    // by their distinct prefix format of 2 characters followed by an\r
+    // underscore character ('_'). For example, a RefSeq protein accession is NP_015325. \r
+    private final static Pattern REFSEQ_PATTERN                  = Pattern\r
+                                                                         .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}_\\d{6,})(?:[^a-zA-Z0-9]|\\Z)" );\r
+\r
+    private SequenceAccessionTools() {\r
+        // Static utility class: the private constructor prevents instantiation.\r
+    }\r
+\r
+    public static String extractGenbankAccessor( final PhylogenyNode node ) {\r
+        String v = null;\r
+        if ( node.getNodeData().isHasSequence() ) {\r
+            final Sequence seq = node.getNodeData().getSequence();\r
+            if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) {\r
+                v = parseGenbankAccessor( seq.getSymbol() );\r
+            }\r
+            if ( !ForesterUtil.isEmpty( seq.getGeneName() ) ) {\r
+                v = parseGenbankAccessor( seq.getGeneName() );\r
+            }\r
+            if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( seq.getName() ) ) {\r
+                v = parseGenbankAccessor( seq.getName() );\r
+            }\r
+            if ( ForesterUtil.isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null )\r
+                    && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) {\r
+                v = parseGenbankAccessor( seq.getAccession().getValue() );\r
+            }\r
+        }\r
+        if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( node.getName() ) ) {\r
+            v = parseGenbankAccessor( node.getName() );\r
+        }\r
+        return v;\r
+    }\r
+\r
+    public static String extractGInumber( final PhylogenyNode node ) {\r
+        String v = null;\r
+        if ( node.getNodeData().isHasSequence() ) {\r
+            final Sequence seq = node.getNodeData().getSequence();\r
+            if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( seq.getName() ) ) {\r
+                v = parseGInumber( seq.getName() );\r
+            }\r
+            if ( ForesterUtil.isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null )\r
+                    && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) {\r
+                v = parseGInumber( seq.getAccession().getValue() );\r
+            }\r
+        }\r
+        if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( node.getName() ) ) {\r
+            v = parseGInumber( node.getName() );\r
+        }\r
+        return v;\r
+    }\r
+\r
+    public static String extractRefSeqAccessor( final PhylogenyNode node ) {\r
+        String v = null;\r
+        if ( node.getNodeData().isHasSequence() ) {\r
+            final Sequence seq = node.getNodeData().getSequence();\r
+            if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) {\r
+                v = parseRefSeqAccessor( seq.getSymbol() );\r
+            }\r
+            if ( !ForesterUtil.isEmpty( seq.getGeneName() ) ) {\r
+                v = parseRefSeqAccessor( seq.getGeneName() );\r
+            }\r
+            if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( seq.getName() ) ) {\r
+                v = parseRefSeqAccessor( seq.getName() );\r
+            }\r
+            if ( ForesterUtil.isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null )\r
+                    && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) {\r
+                v = parseRefSeqAccessor( seq.getAccession().getValue() );\r
+            }\r
+        }\r
+        if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( node.getName() ) ) {\r
+            v = parseRefSeqAccessor( node.getName() );\r
+        }\r
+        return v;\r
+    }\r
+\r
+    public static String extractUniProtKbProteinSeqIdentifier( final PhylogenyNode node ) {\r
+        String a = null;\r
+        if ( node.getNodeData().isHasSequence() ) {\r
+            final Sequence seq = node.getNodeData().getSequence();\r
+            if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) {\r
+                a = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( seq.getSymbol() );\r
+            }\r
+            if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( seq.getName() ) ) {\r
+                a = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( seq.getName() );\r
+            }\r
+            if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( seq.getGeneName() ) ) {\r
+                a = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( seq.getGeneName() );\r
+            }\r
+            if ( ForesterUtil.isEmpty( a ) && ( node.getNodeData().getSequence().getAccession() != null )\r
+                    && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) {\r
+                a = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( seq.getAccession().getValue() );\r
+            }\r
+        }\r
+        if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( node.getName() ) ) {\r
+            a = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( node.getName() );\r
+        }\r
+        return a;\r
+    }\r
+\r
+    public static String extractUniProtKbProteinSeqIdentifier( final String str ) {\r
+        Matcher m = UNIPROT_KB_PATTERN_0.matcher( str );\r
+        if ( m.find() ) {\r
+            return m.group( 1 );\r
+        }\r
+        m = UNIPROT_KB_PATTERN_1.matcher( str );\r
+        if ( m.find() ) {\r
+            return m.group( 1 );\r
+        }\r
+        m = UNIPROT_KB_PATTERN_2.matcher( str );\r
+        if ( m.find() ) {\r
+            return m.group();\r
+        }\r
+        return null;\r
+    }\r
+\r
+    public final static boolean isProtein( final String query ) {\r
+        final String r1 = parseRefSeqAccessor( query );\r
+        if ( !ForesterUtil.isEmpty( r1 ) && ( r1.charAt( 1 ) == 'P' ) ) {\r
+            return true;\r
+        }\r
+        final String r2 = extractUniProtKbProteinSeqIdentifier( query );\r
+        if ( !ForesterUtil.isEmpty( r2 ) ) {\r
+            return true;\r
+        }\r
+        return GENBANK_PROTEIN_AC_PATTERN.matcher( query ).lookingAt();\r
+    }\r
+\r
+    public final static Accession parse( final PhylogenyNode n ) {\r
+        String v = extractUniProtKbProteinSeqIdentifier( n );\r
+        if ( !ForesterUtil.isEmpty( v ) ) {\r
+            return new Accession( v, Accession.UNIPROT );\r
+        }\r
+        v = extractGenbankAccessor( n );\r
+        if ( !ForesterUtil.isEmpty( v ) ) {\r
+            return new Accession( v, Accession.NCBI );\r
+        }\r
+        v = extractRefSeqAccessor( n );\r
+        if ( !ForesterUtil.isEmpty( v ) ) {\r
+            return new Accession( v, Accession.REFSEQ );\r
+        }\r
+        v = extractGInumber( n );\r
+        if ( !ForesterUtil.isEmpty( v ) ) {\r
+            return new Accession( v, Accession.GI );\r
+        }\r
+        return null;\r
+    }\r
+\r
+    public final static Accession obtainFromSeqAccession( final PhylogenyNode node ) {\r
+        if ( node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null )\r
+                && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )\r
+                && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) ) {\r
+            final String source = node.getNodeData().getSequence().getAccession().getSource().toLowerCase();\r
+            final String value = node.getNodeData().getSequence().getAccession().getValue();\r
+            if ( ( source.startsWith( "uniprot" ) || source.equals( "swissprot" ) || source.equals( "trembl" ) || source\r
+                    .equals( "sp" ) ) ) {\r
+                return new Accession( value, Accession.UNIPROT );\r
+            }\r
+            else if ( source.equals( "embl" ) || source.equals( "ebi" ) ) {\r
+                return new Accession( value, Accession.EMBL );\r
+            }\r
+            else if ( source.equals( "ncbi" ) || source.equals( "genbank" ) ) {\r
+                return new Accession( value, Accession.NCBI );\r
+            }\r
+            else if ( source.equals( "refseq" ) ) {\r
+                return new Accession( value, Accession.REFSEQ );\r
+            }\r
+            else if ( source.equals( "gi" ) ) {\r
+                return new Accession( value, Accession.GI );\r
+            }\r
+        }\r
+        return null;\r
+    }\r
+\r
+    /**\r
+     * Returns null if no match.\r
+     * \r
+     */\r
+    public final static Accession parse( final String s ) {\r
+        if ( !ForesterUtil.isEmpty( s ) ) {\r
+            String v = extractUniProtKbProteinSeqIdentifier( s );\r
+            if ( !ForesterUtil.isEmpty( v ) ) {\r
+                return new Accession( v, Accession.UNIPROT );\r
+            }\r
+            v = parseGenbankAccessor( s );\r
+            if ( !ForesterUtil.isEmpty( v ) ) {\r
+                return new Accession( v, Accession.NCBI );\r
+            }\r
+            v = parseRefSeqAccessor( s );\r
+            if ( !ForesterUtil.isEmpty( v ) ) {\r
+                return new Accession( v, Accession.REFSEQ );\r
+            }\r
+            v = parseGInumber( s );\r
+            if ( !ForesterUtil.isEmpty( v ) ) {\r
+                return new Accession( v, Accession.GI );\r
+            }\r
+        }\r
+        return null;\r
+    }\r
+\r
+    /**\r
+     * Returns null if no match.\r
+     * \r
+     */\r
+    public static String parseGenbankAccessor( final String query ) {\r
+        Matcher m = GENBANK_NUCLEOTIDE_AC_PATTERN_1.matcher( query );\r
+        if ( m.lookingAt() ) {\r
+            return m.group( 1 );\r
+        }\r
+        else {\r
+            m = GENBANK_NUCLEOTIDE_AC_PATTERN_2.matcher( query );\r
+            if ( m.lookingAt() ) {\r
+                return m.group( 1 );\r
+            }\r
+            else {\r
+                m = GENBANK_PROTEIN_AC_PATTERN.matcher( query );\r
+                if ( m.lookingAt() ) {\r
+                    return m.group( 1 );\r
+                }\r
+                else {\r
+                    return null;\r
+                }\r
+            }\r
+        }\r
+    }\r
+\r
+    public static String parseGenbankProteinAccessor( final String query ) {\r
+        final Matcher m = GENBANK_PROTEIN_AC_PATTERN.matcher( query );\r
+        if ( m.lookingAt() ) {\r
+            return m.group( 1 );\r
+        }\r
+        else {\r
+            return null;\r
+        }\r
+    }\r
+\r
+    public static String parseGInumber( final String query ) {\r
+        final Matcher m = GI_PATTERN.matcher( query );\r
+        if ( m.find() ) {\r
+            return m.group( 1 );\r
+        }\r
+        return null;\r
+    }\r
+\r
+    /**\r
+     * Returns null if no match.\r
+     * \r
+     */\r
+    public final static String parseRefSeqAccessor( final String query ) {\r
+        final Matcher m = REFSEQ_PATTERN.matcher( query );\r
+        if ( m.lookingAt() ) {\r
+            return m.group( 1 );\r
+        }\r
+        return null;\r
+    }\r
+}\r
diff --git a/forester/java/src/org/forester/util/SequenceIdParser.java b/forester/java/src/org/forester/util/SequenceIdParser.java
deleted file mode 100644 (file)
index 8fcf6ee..0000000
+++ /dev/null
@@ -1,173 +0,0 @@
-// $Id:\r
-// FORESTER -- software libraries and applications\r
-// for evolutionary biology research and applications.\r
-//\r
-// Copyright (C) 2008-2009 Christian M. Zmasek\r
-// Copyright (C) 2008-2009 Burnham Institute for Medical Research\r
-// Copyright (C) 2000-2001 Washington University School of Medicine\r
-// and Howard Hughes Medical Institute\r
-// Copyright (C) 2003-2007 Ethalinda K.S. Cannon\r
-// All rights reserved\r
-//\r
-// This library is free software; you can redistribute it and/or\r
-// modify it under the terms of the GNU Lesser General Public\r
-// License as published by the Free Software Foundation; either\r
-// version 2.1 of the License, or (at your option) any later version.\r
-//\r
-// This library is distributed in the hope that it will be useful,\r
-// but WITHOUT ANY WARRANTY; without even the implied warranty of\r
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\r
-// Lesser General Public License for more details.\r
-//\r
-// You should have received a copy of the GNU Lesser General Public\r
-// License along with this library; if not, write to the Free Software\r
-// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA\r
-//\r
-// Contact: phylosoft @ gmail . com\r
-// WWW: https://sites.google.com/site/cmzmasek/home/software/forester\r
-\r
-package org.forester.util;\r
-\r
-import java.util.regex.Matcher;\r
-import java.util.regex.Pattern;\r
-\r
-import org.forester.phylogeny.data.Identifier;\r
-\r
-public final class SequenceIdParser {\r
-\r
-    // gb_ADF31344_1_segmented_worms_\r
-    // gb_AAA96518_1\r
-    // gb_EHB07727_1_rodents_\r
-    // dbj_BAF37827_1_turtles_\r
-    // emb_CAA73223_1_primates_\r
-    // lcl_91970_unknown_\r
-    // mites|ref_XP_002434188_1\r
-    // ref_XP_002434188_1_mites___ticks_\r
-    // ref_NP_001121530_1_frogs___toads_\r
-    //The format for GenBank Accession numbers are:\r
-    //Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals\r
-    //Protein:    3 letters + 5 numerals\r
-    //http://www.ncbi.nlm.nih.gov/Sequin/acc.html\r
-    private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_1 = Pattern\r
-                                                                         .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
-    private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_2 = Pattern\r
-                                                                         .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
-    private final static Pattern GENBANK_PROTEIN_AC_PATTERN      = Pattern\r
-                                                                         .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
-    // RefSeq accession numbers can be distinguished from GenBank accessions \r
-    // by their distinct prefix format of 2 characters followed by an\r
-    // underscore character ('_'). For example, a RefSeq protein accession is NP_015325. \r
-    private final static Pattern REFSEQ_PATTERN                  = Pattern\r
-                                                                         .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}_\\d{6,})(?:[^a-zA-Z0-9]|\\Z)" );\r
-    // See: http://web.expasy.org/docs/userman.html#ID_line\r
-    private final static Pattern TREMBL_PATTERN                  = Pattern\r
-                                                                         .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z][0-9][A-Z0-9]{3}[0-9])(?:[^a-zA-Z0-9]|\\Z)" );\r
-    private final static Pattern GI_PATTERN                      = Pattern\r
-                                                                         .compile( "(?:\\b|_)(?:GI|gi)[|_=:](\\d+)(?:\\b|_)" );\r
-\r
-    /**\r
-     * Returns null if no match.\r
-     * \r
-     */\r
-    public final static Identifier parse( final String s ) {\r
-        if ( !ForesterUtil.isEmpty( s ) ) {\r
-            String v = parseGenbankAccessor( s );\r
-            if ( !ForesterUtil.isEmpty( v ) ) {\r
-                return new Identifier( v, Identifier.NCBI );\r
-            }\r
-            v = parseRefSeqAccessor( s );\r
-            if ( !ForesterUtil.isEmpty( v ) ) {\r
-                return new Identifier( v, Identifier.REFSEQ );\r
-            }\r
-            v = parseTrEMBLAccessor( s );\r
-            if ( !ForesterUtil.isEmpty( v ) ) {\r
-                return new Identifier( v, Identifier.SP );\r
-            }\r
-        }\r
-        return null;\r
-    }\r
-\r
-    public final static boolean isProtein( final String query ) {\r
-        final String r1 = parseRefSeqAccessor( query );\r
-        if ( !ForesterUtil.isEmpty( r1 ) && ( r1.charAt( 1 ) == 'P' ) ) {\r
-            return true;\r
-        }\r
-        final String r2 = parseTrEMBLAccessor( query );\r
-        if ( !ForesterUtil.isEmpty( r2 ) ) {\r
-            return true;\r
-        }\r
-        return GENBANK_PROTEIN_AC_PATTERN.matcher( query ).lookingAt();\r
-    }\r
-\r
-    /**\r
-     * Returns null if no match.\r
-     * \r
-     */\r
-    public static String parseGenbankAccessor( final String query ) {\r
-        Matcher m = GENBANK_NUCLEOTIDE_AC_PATTERN_1.matcher( query );\r
-        if ( m.lookingAt() ) {\r
-            return m.group( 1 );\r
-        }\r
-        else {\r
-            m = GENBANK_NUCLEOTIDE_AC_PATTERN_2.matcher( query );\r
-            if ( m.lookingAt() ) {\r
-                return m.group( 1 );\r
-            }\r
-            else {\r
-                m = GENBANK_PROTEIN_AC_PATTERN.matcher( query );\r
-                if ( m.lookingAt() ) {\r
-                    return m.group( 1 );\r
-                }\r
-                else {\r
-                    return null;\r
-                }\r
-            }\r
-        }\r
-    }\r
-\r
-    public static String parseGenbankProteinAccessor( final String query ) {\r
-        final Matcher m = GENBANK_PROTEIN_AC_PATTERN.matcher( query );\r
-        if ( m.lookingAt() ) {\r
-            return m.group( 1 );\r
-        }\r
-        else {\r
-            return null;\r
-        }\r
-    }\r
-\r
-    /**\r
-     * Returns null if no match.\r
-     * \r
-     */\r
-    public final static String parseRefSeqAccessor( final String query ) {\r
-        final Matcher m = REFSEQ_PATTERN.matcher( query );\r
-        if ( m.lookingAt() ) {\r
-            return m.group( 1 );\r
-        }\r
-        return null;\r
-    }\r
-\r
-    /**\r
-     * Returns null if no match.\r
-     * \r
-     */\r
-    private final static String parseTrEMBLAccessor( final String query ) {\r
-        final Matcher m = TREMBL_PATTERN.matcher( query );\r
-        if ( m.lookingAt() ) {\r
-            return m.group( 1 );\r
-        }\r
-        return null;\r
-    }\r
-\r
-    private SequenceIdParser() {\r
-        // Hiding the constructor.\r
-    }\r
-\r
-    public static String parseGInumber( final String query ) {\r
-        final Matcher m = GI_PATTERN.matcher( query );\r
-        if ( m.find() ) {\r
-            return m.group( 1 );\r
-        }\r
-        return null;\r
-    }\r
-}\r
index f5f83e4..c40b37e 100644 (file)
@@ -48,7 +48,7 @@ import org.forester.phylogeny.data.Sequence;
 import org.forester.phylogeny.data.Taxonomy;
 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
 import org.forester.util.ForesterUtil;
-import org.forester.util.SequenceIdParser;
+import org.forester.util.SequenceAccessionTools;
 
 public final class SequenceDbWsTools {
 
@@ -137,13 +137,13 @@ public final class SequenceDbWsTools {
         return null;
     }
 
-    public static SequenceDatabaseEntry obtainEmblEntry( final Identifier id, final int max_lines_to_return )
+    public static SequenceDatabaseEntry obtainEmblEntry( final Accession id, final int max_lines_to_return )
             throws IOException {
         final List<String> lines = queryEmblDb( id, max_lines_to_return );
         return EbiDbEntry.createInstanceFromPlainText( lines );
     }
 
-    public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Identifier id, final int max_lines_to_return )
+    public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Accession id, final int max_lines_to_return )
             throws IOException {
         final List<String> lines = queryEmblDb( id, max_lines_to_return );
         return EbiDbEntry.createInstanceFromPlainTextForRefSeq( lines );
@@ -159,188 +159,176 @@ public final class SequenceDbWsTools {
             if ( ext_nodes_only && node.isInternal() ) {
                 continue;
             }
-            String query = null;
-            Identifier id = null;
-            Db db = Db.NONE;
-            if ( node.getNodeData().isHasSequence()
-                    && ( node.getNodeData().getSequence().getAccession() != null )
-                    && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
-                    && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
-                    && ( node.getNodeData().getSequence().getAccession().getValue().toLowerCase()
-                            .startsWith( "uniprot" )
-                            || node.getNodeData().getSequence().getAccession().getValue().toLowerCase()
-                                    .startsWith( "swissprot" )
-                            || node.getNodeData().getSequence().getAccession().getValue().toLowerCase()
-                                    .startsWith( "trembl" )
-                            || node.getNodeData().getSequence().getAccession().getValue().toLowerCase()
-                                    .startsWith( "sp" ) || node.getNodeData().getSequence().getAccession().getValue()
-                            .toLowerCase().startsWith( "uniprotkb" ) ) ) {
-                query = node.getNodeData().getSequence().getAccession().getValue();
-                db = Db.UNIPROT;
+            //            String query = null;
+            //            Accession id = null;
+            //            Accession acc = SequenceAccessionTools.obtain( node );
+            //            
+            //            
+            //            Db db = Db.NONE;
+            //            if ( node.getNodeData().isHasSequence()
+            //                    && ( node.getNodeData().getSequence().getAccession() != null )
+            //                    && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
+            //                    && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
+            //                    && ( node.getNodeData().getSequence().getAccession().getValue().toLowerCase()
+            //                            .startsWith( "uniprot" )
+            //                            || node.getNodeData().getSequence().getAccession().getValue()
+            //                                    .equalsIgnoreCase( "swissprot" )
+            //                            || node.getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "trembl" ) || node
+            //                            .getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "sp" ) ) ) {
+            //                query = node.getNodeData().getSequence().getAccession().getValue();
+            //                db = Db.UNIPROT;
+            //            }
+            //            else if ( node.getNodeData().isHasSequence()
+            //                    && ( node.getNodeData().getSequence().getAccession() != null )
+            //                    && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
+            //                    && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
+            //                    && ( node.getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "embl" ) || node
+            //                            .getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "ebi" ) ) ) {
+            //                query = node.getNodeData().getSequence().getAccession().getValue();
+            //                db = Db.EMBL;
+            //            }
+            //            else if ( node.getNodeData().isHasSequence()
+            //                    && ( node.getNodeData().getSequence().getAccession() != null )
+            //                    && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
+            //                    && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
+            //                    && ( node.getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "ncbi" ) || node
+            //                            .getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "genbank" ) ) ) {
+            //                query = node.getNodeData().getSequence().getAccession().getValue();
+            //                // db = Db.NCBI;
+            //            }
+            //            else if ( node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null )
+            //                    && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
+            //                    && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
+            //                    && node.getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "refseq" ) ) {
+            //                query = node.getNodeData().getSequence().getAccession().getValue();
+            //                db = Db.REFSEQ;
+            //            }
+            //            else {
+            Accession acc = SequenceAccessionTools.obtainFromSeqAccession( node );
+            //                if ( ( query = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( node ) ) != null ) {
+            //                    db = Db.UNIPROT;
+            //                }
+            //                else if ( node.getNodeData().isHasSequence() ) {
+            //                    if ( ( id = SequenceAccessionTools.parse( node.getName() ) ) != null ) {
+            //                        if ( id.getSource() == Accession.NCBI ) {
+            //                            //  db = Db.NCBI;
+            //                        }
+            //                        else if ( id.getSource() == Accession.REFSEQ ) {
+            //                            db = Db.REFSEQ;
+            //                        }
+            //                    }
+            //                    else if ( ( id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getName() ) ) != null ) {
+            //                        if ( id.getSource() == Accession.NCBI ) {
+            //                            // = Db.NCBI;
+            //                        }
+            //                        else if ( id.getSource() == Accession.REFSEQ ) {
+            //                            db = Db.REFSEQ;
+            //                        }
+            //                    }
+            //                    else if ( ( id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getGeneName() ) ) != null ) {
+            //                        if ( id.getSource() == Accession.NCBI ) {
+            //                            // db = Db.NCBI;
+            //                        }
+            //                        else if ( id.getSource() == Accession.REFSEQ ) {
+            //                            db = Db.REFSEQ;
+            //                        }
+            //                    }
+            //                    else if ( ( id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getSymbol() ) ) != null ) {
+            //                        if ( id.getSource() == Accession.NCBI ) {
+            //                            // db = Db.NCBI;
+            //                        }
+            //                        else if ( id.getSource() == Accession.REFSEQ ) {
+            //                            db = Db.REFSEQ;
+            //                        }
+            //                    }
+            //                }
+            // }
+            if ( ( acc == null )
+                    || ForesterUtil.isEmpty( acc.getSource() )
+                    || ForesterUtil.isEmpty( acc.getValue() )
+                    || ( ( acc.getSource() != Accession.UNIPROT ) && ( acc.getSource() != Accession.EMBL ) && ( acc
+                            .getSource() != Accession.REFSEQ ) ) ) {
+                acc = SequenceAccessionTools.parse( node );
             }
-            else if ( node.getNodeData().isHasSequence()
-                    && ( node.getNodeData().getSequence().getAccession() != null )
-                    && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
-                    && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
-                    && ( node.getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "embl" ) || node
-                            .getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "ebi" ) ) ) {
-                query = node.getNodeData().getSequence().getAccession().getValue();
-                db = Db.EMBL;
-            }
-            else if ( node.getNodeData().isHasSequence()
-                    && ( node.getNodeData().getSequence().getAccession() != null )
-                    && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
-                    && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
-                    && ( node.getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "ncbi" ) || node
-                            .getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "genbank" ) ) ) {
-                query = node.getNodeData().getSequence().getAccession().getValue();
-                // db = Db.NCBI;
-            }
-            else if ( node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null )
-                    && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
-                    && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
-                    && node.getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "refseq" ) ) {
-                query = node.getNodeData().getSequence().getAccession().getValue();
-                db = Db.REFSEQ;
-            }
-            else {
-                if ( ( query = ForesterUtil.extractUniProtKbProteinSeqIdentifier( node ) ) != null ) {
-                    db = Db.UNIPROT;
-                }
-                else if ( node.getNodeData().isHasSequence() ) {
-                    if ( ( id = SequenceIdParser.parse( node.getName() ) ) != null ) {
-                        if ( id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) {
-                            //  db = Db.NCBI;
-                        }
-                        else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) {
-                            db = Db.REFSEQ;
-                        }
-                    }
-                    else if ( ( id = SequenceIdParser.parse( node.getNodeData().getSequence().getName() ) ) != null ) {
-                        if ( id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) {
-                            // = Db.NCBI;
-                        }
-                        else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) {
-                            db = Db.REFSEQ;
-                        }
-                    }
-                    else if ( ( id = SequenceIdParser.parse( node.getNodeData().getSequence().getGeneName() ) ) != null ) {
-                        if ( id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) {
-                            // db = Db.NCBI;
-                        }
-                        else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) {
-                            db = Db.REFSEQ;
-                        }
-                    }
-                    else if ( ( id = SequenceIdParser.parse( node.getNodeData().getSequence().getSymbol() ) ) != null ) {
-                        if ( id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) {
-                            // db = Db.NCBI;
-                        }
-                        else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) {
-                            db = Db.REFSEQ;
-                        }
-                    }
-                }
-            }
-            if ( db == Db.NONE ) {
+            if ( ( acc == null )
+                    || ForesterUtil.isEmpty( acc.getSource() )
+                    || ForesterUtil.isEmpty( acc.getValue() )
+                    || ( ( acc.getSource() != Accession.UNIPROT ) && ( acc.getSource() != Accession.EMBL ) && ( acc
+                            .getSource() != Accession.REFSEQ ) ) ) {
                 not_found.add( node.toString() );
             }
-            SequenceDatabaseEntry db_entry = null;
-            if ( !ForesterUtil.isEmpty( query ) ) {
-                if ( db == Db.UNIPROT ) {
+            else {
+                SequenceDatabaseEntry db_entry = null;
+                final String query = acc.getValue();
+                if ( acc.getSource() == Accession.UNIPROT ) {
                     if ( DEBUG ) {
                         System.out.println( "uniprot: " + query );
                     }
                     db_entry = obtainUniProtEntry( query, lines_to_return );
                 }
-                else if ( db == Db.EMBL ) {
+                else if ( acc.getSource() == Accession.EMBL ) {
                     if ( DEBUG ) {
                         System.out.println( "embl: " + query );
                     }
-                    db_entry = obtainEmblEntry( new Identifier( query ), lines_to_return );
+                    db_entry = obtainEmblEntry( new Accession( query ), lines_to_return );
                 }
-                else if ( db == Db.REFSEQ ) {
+                else if ( acc.getSource() == Accession.REFSEQ ) {
                     if ( DEBUG ) {
                         System.out.println( "refseq: " + query );
                     }
-                    db_entry = obtainRefSeqEntryFromEmbl( new Identifier( query ), lines_to_return );
+                    db_entry = obtainRefSeqEntryFromEmbl( new Accession( query ), lines_to_return );
                 }
-                //   else if ( db == Db.NCBI ) {
-                //       if ( DEBUG ) {
-                //           System.out.println( "ncbi: " + query );
-                //       }
-                //       db_entry = obtainNcbiEntry( new Identifier( query ), lines_to_return );
-                //  }
-            }
-            else if ( ( db == Db.REFSEQ ) && ( id != null ) ) {
-                db_entry = obtainRefSeqEntryFromEmbl( id, lines_to_return );
-            }
-            //else if ( ( db == Db.NCBI ) && ( id != null ) ) {
-            //    db_entry = obtainNcbiEntry( id, lines_to_return );
-            //}
-            if ( ( db_entry != null ) && !db_entry.isEmpty() ) {
-                final Sequence seq = node.getNodeData().isHasSequence() ? node.getNodeData().getSequence()
-                        : new Sequence();
-                if ( !ForesterUtil.isEmpty( db_entry.getAccession() ) ) {
-                    String type = null;
-                    if ( db == Db.EMBL ) {
-                        type = "embl";
+                if ( ( db_entry != null ) && !db_entry.isEmpty() ) {
+                    final Sequence seq = node.getNodeData().isHasSequence() ? node.getNodeData().getSequence()
+                            : new Sequence();
+                    if ( !ForesterUtil.isEmpty( db_entry.getAccession() ) ) {
+                        seq.setAccession( new Accession( db_entry.getAccession(), acc.getSource() ) );
                     }
-                    else if ( db == Db.UNIPROT ) {
-                        type = "uniprot";
+                    if ( !ForesterUtil.isEmpty( db_entry.getSequenceName() ) ) {
+                        seq.setName( db_entry.getSequenceName() );
                     }
-                    //   else if ( db == Db.NCBI ) {
-                    //       type = "ncbi";
-                    //   }
-                    else if ( db == Db.REFSEQ ) {
-                        type = "refseq";
+                    if ( !ForesterUtil.isEmpty( db_entry.getGeneName() ) ) {
+                        seq.setGeneName( db_entry.getGeneName() );
                     }
-                    seq.setAccession( new Accession( db_entry.getAccession(), type ) );
-                }
-                if ( !ForesterUtil.isEmpty( db_entry.getSequenceName() ) ) {
-                    seq.setName( db_entry.getSequenceName() );
-                }
-                if ( !ForesterUtil.isEmpty( db_entry.getGeneName() ) ) {
-                    seq.setGeneName( db_entry.getGeneName() );
-                }
-                if ( !ForesterUtil.isEmpty( db_entry.getSequenceSymbol() ) ) {
-                    try {
-                        seq.setSymbol( db_entry.getSequenceSymbol() );
+                    if ( !ForesterUtil.isEmpty( db_entry.getSequenceSymbol() ) ) {
+                        try {
+                            seq.setSymbol( db_entry.getSequenceSymbol() );
+                        }
+                        catch ( final PhyloXmlDataFormatException e ) {
+                            // Eat this exception.
+                        }
                     }
-                    catch ( final PhyloXmlDataFormatException e ) {
-                        // Eat this exception.
+                    if ( ( db_entry.getGoTerms() != null ) && !db_entry.getGoTerms().isEmpty() ) {
+                        for( final GoTerm go : db_entry.getGoTerms() ) {
+                            final Annotation ann = new Annotation( go.getGoId().getId() );
+                            ann.setDesc( go.getName() );
+                            seq.addAnnotation( ann );
+                        }
                     }
-                }
-                if ( ( db_entry.getGoTerms() != null ) && !db_entry.getGoTerms().isEmpty() ) {
-                    for( final GoTerm go : db_entry.getGoTerms() ) {
-                        final Annotation ann = new Annotation( go.getGoId().getId() );
-                        ann.setDesc( go.getName() );
-                        seq.addAnnotation( ann );
+                    if ( ( db_entry.getCrossReferences() != null ) && !db_entry.getCrossReferences().isEmpty() ) {
+                        for( final Accession x : db_entry.getCrossReferences() ) {
+                            seq.addCrossReference( x );
+                        }
                     }
-                }
-                if ( ( db_entry.getCrossReferences() != null ) && !db_entry.getCrossReferences().isEmpty() ) {
-                    for( final Accession x : db_entry.getCrossReferences() ) {
-                        seq.addCrossReference( x );
+                    final Taxonomy tax = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy()
+                            : new Taxonomy();
+                    if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyScientificName() ) ) {
+                        tax.setScientificName( db_entry.getTaxonomyScientificName() );
+                    }
+                    if ( allow_to_set_taxonomic_data && !ForesterUtil.isEmpty( db_entry.getTaxonomyIdentifier() ) ) {
+                        tax.setIdentifier( new Identifier( db_entry.getTaxonomyIdentifier(), "uniprot" ) );
                     }
+                    node.getNodeData().setTaxonomy( tax );
+                    node.getNodeData().setSequence( seq );
                 }
-                final Taxonomy tax = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy()
-                        : new Taxonomy();
-                if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyScientificName() ) ) {
-                    tax.setScientificName( db_entry.getTaxonomyScientificName() );
+                else {
+                    not_found.add( node.getName() );
                 }
-                if ( allow_to_set_taxonomic_data && !ForesterUtil.isEmpty( db_entry.getTaxonomyIdentifier() ) ) {
-                    tax.setIdentifier( new Identifier( db_entry.getTaxonomyIdentifier(), "uniprot" ) );
+                try {
+                    Thread.sleep( 10 );// Sleep for 10 ms
+                }
+                catch ( final InterruptedException ie ) {
                 }
-                node.getNodeData().setTaxonomy( tax );
-                node.getNodeData().setSequence( seq );
-            }
-            else if ( db != Db.NONE ) {
-                not_found.add( node.getName() );
-            }
-            try {
-                Thread.sleep( 10 );// Sleep for 10 ms
-            }
-            catch ( final InterruptedException ie ) {
             }
         }
         return not_found;
@@ -388,14 +376,14 @@ public final class SequenceDbWsTools {
         return result;
     }
 
-    public static List<String> queryEmblDb( final Identifier id, final int max_lines_to_return ) throws IOException {
+    public static List<String> queryEmblDb( final Accession id, final int max_lines_to_return ) throws IOException {
         final StringBuilder url_sb = new StringBuilder();
         url_sb.append( BASE_EMBL_DB_URL );
-        if ( ForesterUtil.isEmpty( id.getProvider() ) || id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) {
+        if ( ForesterUtil.isEmpty( id.getSource() ) || ( id.getSource() == Accession.NCBI ) ) {
             url_sb.append( SequenceDbWsTools.EMBL_DBS_EMBL );
             url_sb.append( '/' );
         }
-        else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) {
+        else if ( id.getSource() == Accession.REFSEQ ) {
             if ( id.getValue().toUpperCase().indexOf( 'P' ) == 1 ) {
                 url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_P );
                 url_sb.append( '/' );
@@ -459,8 +447,4 @@ public final class SequenceDbWsTools {
         }
         return taxonomies;
     }
-
-    public enum Db {
-        UNIPROT, EMBL, NCBI, NONE, REFSEQ;
-    }
 }