inprogress
author cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Thu, 3 Oct 2013 01:22:16 +0000 (01:22 +0000)
committer cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Thu, 3 Oct 2013 01:22:16 +0000 (01:22 +0000)
forester/java/src/org/forester/archaeopteryx/TreePanel.java
forester/java/src/org/forester/archaeopteryx/TreePanelUtil.java
forester/java/src/org/forester/archaeopteryx/tools/Blast.java
forester/java/src/org/forester/phylogeny/PhylogenyNode.java
forester/java/src/org/forester/surfacing/PrintableSpeciesSpecificDcData.java
forester/java/src/org/forester/test/Test.java
forester/java/src/org/forester/util/SequenceAccessionTools.java
forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java
forester/java/src/org/forester/ws/seqdb/UniProtEntry.java

index ad59803..88c4d66 100644 (file)
@@ -2273,7 +2273,7 @@ public final class TreePanel extends JPanel implements ActionListener, MouseWhee
                     }
                 }
                 if ( type == '?' ) {
-                    if ( SequenceAccessionTools.isProtein( query ) ) {
+                    if ( SequenceAccessionTools.isProteinDbQuery( query ) ) {
                         type = 'p';
                     }
                     else {
@@ -2915,7 +2915,7 @@ public final class TreePanel extends JPanel implements ActionListener, MouseWhee
     }
 
     final private String isCanOpenSeqWeb( final PhylogenyNode node ) {
-        final Accession a = SequenceAccessionTools.parse( node );
+        final Accession a = SequenceAccessionTools.obtainAccessorFromDataFields( node );
         if ( a != null ) {
             return a.getValue();
         }
index 3955c88..61db054 100644 (file)
@@ -80,7 +80,7 @@ public class TreePanelUtil {
                                                    final Configuration conf,
                                                    final TreePanel tp ) {
         String uri_str = null;
-        final String upkb = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( node );
+        final String upkb = SequenceAccessionTools.obtainUniProtAccessorFromDataFields( node );
         if ( !ForesterUtil.isEmpty( upkb ) ) {
             try {
                 uri_str = ForesterUtil.UNIPROT_KB + URLEncoder.encode( upkb, ForesterConstants.UTF8 );
@@ -91,10 +91,10 @@ public class TreePanelUtil {
             }
         }
         if ( ForesterUtil.isEmpty( uri_str ) ) {
-            final String v = SequenceAccessionTools.extractGenbankAccessor( node );
+            final String v = SequenceAccessionTools.obtainGenbankAccessorFromDataFields( node );
             if ( !ForesterUtil.isEmpty( v ) ) {
                 try {
-                    if ( SequenceAccessionTools.isProtein( v ) ) {
+                    if ( SequenceAccessionTools.isProteinDbQuery( v ) ) {
                         uri_str = ForesterUtil.NCBI_PROTEIN + URLEncoder.encode( v, ForesterConstants.UTF8 );
                     }
                     else {
@@ -108,10 +108,10 @@ public class TreePanelUtil {
             }
         }
         if ( ForesterUtil.isEmpty( uri_str ) ) {
-            final String v = SequenceAccessionTools.extractRefSeqAccessor( node );
+            final String v = SequenceAccessionTools.obtainRefSeqAccessorFromDataFields( node );
             if ( !ForesterUtil.isEmpty( v ) ) {
                 try {
-                    if ( SequenceAccessionTools.isProtein( v ) ) {
+                    if ( SequenceAccessionTools.isProteinDbQuery( v ) ) {
                         uri_str = ForesterUtil.NCBI_PROTEIN + URLEncoder.encode( v, ForesterConstants.UTF8 );
                     }
                     else {
@@ -125,7 +125,7 @@ public class TreePanelUtil {
             }
         }
         if ( ForesterUtil.isEmpty( uri_str ) ) {
-            final String v = SequenceAccessionTools.extractGInumber( node );
+            final String v = SequenceAccessionTools.obtainGiNumberFromDataFields( node );
             if ( !ForesterUtil.isEmpty( v ) ) {
                 try {
                     uri_str = ForesterUtil.NCBI_GI + URLEncoder.encode( v, ForesterConstants.UTF8 );
index bf3cb49..72c23c1 100644 (file)
@@ -81,34 +81,34 @@ public final class Blast {
             }
             if ( ForesterUtil.isEmpty( query ) && ( node.getNodeData().getSequence().getAccession() != null )
                     && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) ) {
-                final Accession id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getAccession()
+                final Accession id = SequenceAccessionTools.parseAccessorFromString( node.getNodeData().getSequence().getAccession()
                         .getValue() );
                 if ( id != null ) {
                     query = id.getValue();
                 }
             }
             if ( ForesterUtil.isEmpty( query ) && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getName() ) ) {
-                final Accession id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getName() );
+                final Accession id = SequenceAccessionTools.parseAccessorFromString( node.getNodeData().getSequence().getName() );
                 if ( id != null ) {
                     query = id.getValue();
                 }
             }
             if ( ForesterUtil.isEmpty( query ) && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getSymbol() ) ) {
-                final Accession id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getSymbol() );
+                final Accession id = SequenceAccessionTools.parseAccessorFromString( node.getNodeData().getSequence().getSymbol() );
                 if ( id != null ) {
                     query = id.getValue();
                 }
             }
             if ( ForesterUtil.isEmpty( query )
                     && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getGeneName() ) ) {
-                final Accession id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getGeneName() );
+                final Accession id = SequenceAccessionTools.parseAccessorFromString( node.getNodeData().getSequence().getGeneName() );
                 if ( id != null ) {
                     query = id.getValue();
                 }
             }
         }
         if ( ForesterUtil.isEmpty( query ) && !ForesterUtil.isEmpty( node.getName() ) ) {
-            final Accession id = SequenceAccessionTools.parse( node.getName() );
+            final Accession id = SequenceAccessionTools.parseAccessorFromString( node.getName() );
             if ( id != null ) {
                 query = id.getValue();
             }
index 05876e7..01416eb 100644 (file)
@@ -83,6 +83,9 @@ public final class PhylogenyNode implements Comparable<PhylogenyNode> {
         _descendants = null;
     }
 
+    public boolean isEmpty() {
+    }
+
     /**
      * Adds PhylogenyNode n to the list of child nodes and sets the _parent of n
      * to this.
index 26cb433..6c6e6fd 100644 (file)
@@ -168,19 +168,19 @@ class PrintableSpeciesSpecificDcData implements SpeciesSpecificDcData {
 
     private static String obtainSeqLink( final String p ) {
         String link;
-        final String up_id = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( p );
+        final String up_id = SequenceAccessionTools.parseUniProtAccessorFromString( p );
         if ( !ForesterUtil.isEmpty( up_id ) ) {
             link = "<a class=\"pl\" href=\"" + ForesterUtil.UNIPROT_KB + up_id + "\" target=\"_up_window\">" + up_id
                     + "</a>";
         }
         else {
-            final String gb_id = SequenceAccessionTools.parseGenbankProteinAccessor( p );
+            final String gb_id = SequenceAccessionTools.parseGenbankProteinAccessorFromString( p );
             if ( !ForesterUtil.isEmpty( gb_id ) ) {
                 link = "<a class=\"pl\" href=\"" + ForesterUtil.NCBI_PROTEIN + gb_id + "\" target=\"_up_window\">"
                         + gb_id + "</a>";
             }
             else {
-                final String gi = SequenceAccessionTools.parseGInumber( p );
+                final String gi = SequenceAccessionTools.parseGInumberFromString( p );
                 if ( !ForesterUtil.isEmpty( gi ) ) {
                     link = "<a class=\"pl\" href=\"" + ForesterUtil.NCBI_GI + gi + "\" target=\"_up_window\">gi|" + gi
                             + "</a>";
index d397f77..61b3d3d 100644 (file)
@@ -3339,46 +3339,46 @@ public final class Test {
         //Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals
         //Protein:    3 letters + 5 numerals
         //http://www.ncbi.nlm.nih.gov/Sequin/acc.html
-        if ( !SequenceAccessionTools.parseGenbankAccessor( "AY423861" ).equals( "AY423861" ) ) {
+        if ( !SequenceAccessionTools.parseGenbankAccessorFromString( "AY423861" ).equals( "AY423861" ) ) {
             return false;
         }
-        if ( !SequenceAccessionTools.parseGenbankAccessor( ".AY423861.2" ).equals( "AY423861.2" ) ) {
+        if ( !SequenceAccessionTools.parseGenbankAccessorFromString( ".AY423861.2" ).equals( "AY423861.2" ) ) {
             return false;
         }
-        if ( !SequenceAccessionTools.parseGenbankAccessor( "345_.AY423861.24_345" ).equals( "AY423861.24" ) ) {
+        if ( !SequenceAccessionTools.parseGenbankAccessorFromString( "345_.AY423861.24_345" ).equals( "AY423861.24" ) ) {
             return false;
         }
-        if ( SequenceAccessionTools.parseGenbankAccessor( "AAY423861" ) != null ) {
+        if ( SequenceAccessionTools.parseGenbankAccessorFromString( "AAY423861" ) != null ) {
             return false;
         }
-        if ( SequenceAccessionTools.parseGenbankAccessor( "AY4238612" ) != null ) {
+        if ( SequenceAccessionTools.parseGenbankAccessorFromString( "AY4238612" ) != null ) {
             return false;
         }
-        if ( SequenceAccessionTools.parseGenbankAccessor( "AAY4238612" ) != null ) {
+        if ( SequenceAccessionTools.parseGenbankAccessorFromString( "AAY4238612" ) != null ) {
             return false;
         }
-        if ( SequenceAccessionTools.parseGenbankAccessor( "Y423861" ) != null ) {
+        if ( SequenceAccessionTools.parseGenbankAccessorFromString( "Y423861" ) != null ) {
             return false;
         }
-        if ( !SequenceAccessionTools.parseGenbankAccessor( "S12345" ).equals( "S12345" ) ) {
+        if ( !SequenceAccessionTools.parseGenbankAccessorFromString( "S12345" ).equals( "S12345" ) ) {
             return false;
         }
-        if ( !SequenceAccessionTools.parseGenbankAccessor( "|S12345|" ).equals( "S12345" ) ) {
+        if ( !SequenceAccessionTools.parseGenbankAccessorFromString( "|S12345|" ).equals( "S12345" ) ) {
             return false;
         }
-        if ( SequenceAccessionTools.parseGenbankAccessor( "|S123456" ) != null ) {
+        if ( SequenceAccessionTools.parseGenbankAccessorFromString( "|S123456" ) != null ) {
             return false;
         }
-        if ( SequenceAccessionTools.parseGenbankAccessor( "ABC123456" ) != null ) {
+        if ( SequenceAccessionTools.parseGenbankAccessorFromString( "ABC123456" ) != null ) {
             return false;
         }
-        if ( !SequenceAccessionTools.parseGenbankAccessor( "ABC12345" ).equals( "ABC12345" ) ) {
+        if ( !SequenceAccessionTools.parseGenbankAccessorFromString( "ABC12345" ).equals( "ABC12345" ) ) {
             return false;
         }
-        if ( !SequenceAccessionTools.parseGenbankAccessor( "&ABC12345&" ).equals( "ABC12345" ) ) {
+        if ( !SequenceAccessionTools.parseGenbankAccessorFromString( "&ABC12345&" ).equals( "ABC12345" ) ) {
             return false;
         }
-        if ( SequenceAccessionTools.parseGenbankAccessor( "ABCD12345" ) != null ) {
+        if ( SequenceAccessionTools.parseGenbankAccessorFromString( "ABCD12345" ) != null ) {
             return false;
         }
         return true;
@@ -3672,166 +3672,166 @@ public final class Test {
         try {
             PhylogenyNode n = new PhylogenyNode();
             n.setName( "tr|B3RJ64" );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( "tr.B3RJ64" );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( "tr=B3RJ64" );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( "tr-B3RJ64" );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( "tr/B3RJ64" );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( "tr\\B3RJ64" );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( "tr_B3RJ64" );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( " tr|B3RJ64 " );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( "-tr|B3RJ64-" );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( "-tr=B3RJ64-" );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( "_tr=B3RJ64_" );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( " tr_tr|B3RJ64_sp|123 " );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( "B3RJ64" );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( "sp|B3RJ64" );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( "sp|B3RJ64C" );
-            if ( SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+            if ( SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ) != null ) {
                 return false;
             }
             n.setName( "sp B3RJ64" );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n.setName( "sp|B3RJ6X" );
-            if ( SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+            if ( SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ) != null ) {
                 return false;
             }
             n.setName( "sp|B3RJ6" );
-            if ( SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+            if ( SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ) != null ) {
                 return false;
             }
             n.setName( "K1PYK7_CRAGI" );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "K1PYK7_CRAGI" ) ) {
                 return false;
             }
             n.setName( "K1PYK7_PEA" );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PEA" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "K1PYK7_PEA" ) ) {
                 return false;
             }
             n.setName( "K1PYK7_RAT" );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_RAT" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "K1PYK7_RAT" ) ) {
                 return false;
             }
             n.setName( "K1PYK7_PIG" );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PIG" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "K1PYK7_PIG" ) ) {
                 return false;
             }
             n.setName( "~K1PYK7_PIG~" );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PIG" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "K1PYK7_PIG" ) ) {
                 return false;
             }
             n.setName( "123456_ECOLI-K1PYK7_CRAGI-sp" );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "K1PYK7_CRAGI" ) ) {
                 return false;
             }
             n.setName( "K1PYKX_CRAGI" );
-            if ( SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+            if ( SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ) != null ) {
                 return false;
             }
             n.setName( "XXXXX_CRAGI" );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "XXXXX_CRAGI" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "XXXXX_CRAGI" ) ) {
                 return false;
             }
             n.setName( "tr|H3IB65|H3IB65_STRPU~2-2" );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "H3IB65" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "H3IB65" ) ) {
                 return false;
             }
             n.setName( "jgi|Lacbi2|181470|Lacbi1.estExt_GeneWisePlus_human.C_10729~2-3" );
-            if ( SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+            if ( SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ) != null ) {
                 return false;
             }
             n.setName( "sp|Q86U06|RBM23_HUMAN~2-2" );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "Q86U06" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "Q86U06" ) ) {
                 return false;
             }
             n = new PhylogenyNode();
             org.forester.phylogeny.data.Sequence seq = new org.forester.phylogeny.data.Sequence();
             seq.setSymbol( "K1PYK7_CRAGI" );
             n.getNodeData().addSequence( seq );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "K1PYK7_CRAGI" ) ) {
                 return false;
             }
             seq.setSymbol( "tr|B3RJ64" );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n = new PhylogenyNode();
             seq = new org.forester.phylogeny.data.Sequence();
             seq.setName( "K1PYK7_CRAGI" );
             n.getNodeData().addSequence( seq );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "K1PYK7_CRAGI" ) ) {
                 return false;
             }
             seq.setName( "tr|B3RJ64" );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             n = new PhylogenyNode();
             seq = new org.forester.phylogeny.data.Sequence();
             seq.setAccession( new Accession( "K1PYK8_CRAGI", "?" ) );
             n.getNodeData().addSequence( seq );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK8_CRAGI" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "K1PYK8_CRAGI" ) ) {
                 return false;
             }
             n = new PhylogenyNode();
             seq = new org.forester.phylogeny.data.Sequence();
             seq.setAccession( new Accession( "tr|B3RJ64", "?" ) );
             n.getNodeData().addSequence( seq );
-            if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+            if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) {
                 return false;
             }
             //
             n = new PhylogenyNode();
             n.setName( "ACP19736" );
-            if ( !SequenceAccessionTools.extractGenbankAccessor( n ).equals( "ACP19736" ) ) {
+            if ( !SequenceAccessionTools.obtainGenbankAccessorFromDataFields( n ).equals( "ACP19736" ) ) {
                 return false;
             }
             n = new PhylogenyNode();
             n.setName( "_ACP19736_" );
-            if ( !SequenceAccessionTools.extractGenbankAccessor( n ).equals( "ACP19736" ) ) {
+            if ( !SequenceAccessionTools.obtainGenbankAccessorFromDataFields( n ).equals( "ACP19736" ) ) {
                 return false;
             }
         }
@@ -9630,7 +9630,7 @@ public final class Test {
 
     private static boolean testSequenceIdParsing() {
         try {
-            Accession id = SequenceAccessionTools.parse( "gb_ADF31344_segmented_worms_" );
+            Accession id = SequenceAccessionTools.parseAccessorFromString( "gb_ADF31344_segmented_worms_" );
             if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
                     || !id.getValue().equals( "ADF31344" ) || !id.getSource().equals( "ncbi" ) ) {
                 if ( id != null ) {
@@ -9640,7 +9640,7 @@ public final class Test {
                 return false;
             }
             //
-            id = SequenceAccessionTools.parse( "segmented worms|gb_ADF31344" );
+            id = SequenceAccessionTools.parseAccessorFromString( "segmented worms|gb_ADF31344" );
             if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
                     || !id.getValue().equals( "ADF31344" ) || !id.getSource().equals( "ncbi" ) ) {
                 if ( id != null ) {
@@ -9650,7 +9650,7 @@ public final class Test {
                 return false;
             }
             //
-            id = SequenceAccessionTools.parse( "segmented worms gb_ADF31344 and more" );
+            id = SequenceAccessionTools.parseAccessorFromString( "segmented worms gb_ADF31344 and more" );
             if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
                     || !id.getValue().equals( "ADF31344" ) || !id.getSource().equals( "ncbi" ) ) {
                 if ( id != null ) {
@@ -9660,7 +9660,7 @@ public final class Test {
                 return false;
             }
             // 
-            id = SequenceAccessionTools.parse( "gb_AAA96518_1" );
+            id = SequenceAccessionTools.parseAccessorFromString( "gb_AAA96518_1" );
             if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
                     || !id.getValue().equals( "AAA96518" ) || !id.getSource().equals( "ncbi" ) ) {
                 if ( id != null ) {
@@ -9670,7 +9670,7 @@ public final class Test {
                 return false;
             }
             // 
-            id = SequenceAccessionTools.parse( "gb_EHB07727_1_rodents_" );
+            id = SequenceAccessionTools.parseAccessorFromString( "gb_EHB07727_1_rodents_" );
             if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
                     || !id.getValue().equals( "EHB07727" ) || !id.getSource().equals( "ncbi" ) ) {
                 if ( id != null ) {
@@ -9680,7 +9680,7 @@ public final class Test {
                 return false;
             }
             // 
-            id = SequenceAccessionTools.parse( "dbj_BAF37827_1_turtles_" );
+            id = SequenceAccessionTools.parseAccessorFromString( "dbj_BAF37827_1_turtles_" );
             if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
                     || !id.getValue().equals( "BAF37827" ) || !id.getSource().equals( "ncbi" ) ) {
                 if ( id != null ) {
@@ -9690,7 +9690,7 @@ public final class Test {
                 return false;
             }
             // 
-            id = SequenceAccessionTools.parse( "emb_CAA73223_1_primates_" );
+            id = SequenceAccessionTools.parseAccessorFromString( "emb_CAA73223_1_primates_" );
             if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
                     || !id.getValue().equals( "CAA73223" ) || !id.getSource().equals( "ncbi" ) ) {
                 if ( id != null ) {
@@ -9700,7 +9700,7 @@ public final class Test {
                 return false;
             }
             // 
-            id = SequenceAccessionTools.parse( "mites|ref_XP_002434188_1" );
+            id = SequenceAccessionTools.parseAccessorFromString( "mites|ref_XP_002434188_1" );
             if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
                     || !id.getValue().equals( "XP_002434188" ) || !id.getSource().equals( "refseq" ) ) {
                 if ( id != null ) {
@@ -9710,7 +9710,7 @@ public final class Test {
                 return false;
             }
             // 
-            id = SequenceAccessionTools.parse( "mites_ref_XP_002434188_1_bla_XP_12345" );
+            id = SequenceAccessionTools.parseAccessorFromString( "mites_ref_XP_002434188_1_bla_XP_12345" );
             if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
                     || !id.getValue().equals( "XP_002434188" ) || !id.getSource().equals( "refseq" ) ) {
                 if ( id != null ) {
@@ -9720,7 +9720,7 @@ public final class Test {
                 return false;
             }
             // 
-            id = SequenceAccessionTools.parse( "P4A123" );
+            id = SequenceAccessionTools.parseAccessorFromString( "P4A123" );
             if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
                     || !id.getValue().equals( "P4A123" ) || !id.getSource().equals( "uniprot" ) ) {
                 if ( id != null ) {
@@ -9740,7 +9740,7 @@ public final class Test {
             //                return false;
             //            }
             // 
-            id = SequenceAccessionTools.parse( "XP_12345" );
+            id = SequenceAccessionTools.parseAccessorFromString( "XP_12345" );
             if ( id != null ) {
                 System.out.println( "value   =" + id.getValue() );
                 System.out.println( "provider=" + id.getSource() );
index 4136049..c65fbc5 100644 (file)
@@ -37,12 +37,6 @@ import org.forester.phylogeny.data.Sequence;
 \r
 public final class SequenceAccessionTools {\r
 \r
-    public final static Pattern  UNIPROT_KB_PATTERN_0            = Pattern\r
-                                                                         .compile( "\\b([A-Z][0-9][A-Z0-9]{3}[0-9])\\b" );\r
-    public final static Pattern  UNIPROT_KB_PATTERN_1            = Pattern\r
-                                                                         .compile( "(?:\\b|_)(?:sp|tr)[\\.|\\-_=/\\\\]([A-Z][0-9][A-Z0-9]{3}[0-9])(?:\\b|_)" );\r
-    public final static Pattern  UNIPROT_KB_PATTERN_2            = Pattern\r
-                                                                         .compile( "(?:\\b|_)(?:[A-Z0-9]{2,5}|(?:[A-Z][0-9][A-Z0-9]{3}[0-9]))_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA)(?:\\b|_)" );\r
     // gb_ADF31344_1_segmented_worms_\r
     // gb_AAA96518_1\r
     // gb_EHB07727_1_rodents_\r
@@ -56,207 +50,194 @@ public final class SequenceAccessionTools {
     //Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals\r
     //Protein:    3 letters + 5 numerals\r
     //http://www.ncbi.nlm.nih.gov/Sequin/acc.html\r
-    private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_1 = Pattern\r
-                                                                         .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
-    private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_2 = Pattern\r
-                                                                         .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
-    private final static Pattern GENBANK_PROTEIN_AC_PATTERN      = Pattern\r
-                                                                         .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
-    private final static Pattern GI_PATTERN                      = Pattern\r
-                                                                         .compile( "(?:\\b|_)(?:GI|gi)[|_=:](\\d+)(?:\\b|_)" );\r
+    public final static Pattern  GENBANK_NUC_PATTERN_1 = Pattern\r
+                                                               .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
+    public final static Pattern  GENBANK_NUC_PATTERN_2 = Pattern\r
+                                                               .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
+    public final static Pattern  GENBANK_PROT_PATTERN  = Pattern\r
+                                                               .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
+    public final static Pattern  GI_PATTERN            = Pattern.compile( "(?:\\b|_)(?:GI|gi)[|_=:](\\d+)(?:\\b|_)" );\r
+    public final static Pattern  UNIPROT_KB_PATTERN_0  = Pattern.compile( "\\b([A-Z][0-9][A-Z0-9]{3}[0-9])\\b" );\r
+    public final static Pattern  UNIPROT_KB_PATTERN_1  = Pattern\r
+                                                               .compile( "(?:\\b|_)(?:sp|tr)[\\.|\\-_=/\\\\]([A-Z][0-9][A-Z0-9]{3}[0-9])(?:\\b|_)" );\r
+    public final static Pattern  UNIPROT_KB_PATTERN_2  = Pattern\r
+                                                               .compile( "(?:\\b|_)(?:[A-Z0-9]{2,5}|(?:[A-Z][0-9][A-Z0-9]{3}[0-9]))_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA)(?:\\b|_)" );\r
     // RefSeq accession numbers can be distinguished from GenBank accessions \r
     // by their distinct prefix format of 2 characters followed by an\r
     // underscore character ('_'). For example, a RefSeq protein accession is NP_015325. \r
-    private final static Pattern REFSEQ_PATTERN                  = Pattern\r
-                                                                         .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}_\\d{6,})(?:[^a-zA-Z0-9]|\\Z)" );\r
+    private final static Pattern REFSEQ_PATTERN        = Pattern\r
+                                                               .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}_\\d{6,})(?:[^a-zA-Z0-9]|\\Z)" );\r
 \r
     private SequenceAccessionTools() {\r
         // Hiding the constructor.\r
     }\r
 \r
-    public static String extractGenbankAccessor( final PhylogenyNode node ) {\r
-        String v = null;\r
-        if ( node.getNodeData().isHasSequence() ) {\r
-            final Sequence seq = node.getNodeData().getSequence();\r
-            if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) {\r
-                v = parseGenbankAccessor( seq.getSymbol() );\r
-            }\r
-            if ( !ForesterUtil.isEmpty( seq.getGeneName() ) ) {\r
-                v = parseGenbankAccessor( seq.getGeneName() );\r
-            }\r
-            if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( seq.getName() ) ) {\r
-                v = parseGenbankAccessor( seq.getName() );\r
-            }\r
-            if ( ForesterUtil.isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null )\r
-                    && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) {\r
-                v = parseGenbankAccessor( seq.getAccession().getValue() );\r
-            }\r
+    public final static boolean isProteinDbQuery( final String query ) {\r
+        final String r1 = parseRefSeqAccessorFromString( query );\r
+        if ( !ForesterUtil.isEmpty( r1 ) && ( r1.charAt( 1 ) == 'P' ) ) {\r
+            return true;\r
         }\r
-        if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( node.getName() ) ) {\r
-            v = parseGenbankAccessor( node.getName() );\r
+        final String r2 = parseUniProtAccessorFromString( query );\r
+        if ( !ForesterUtil.isEmpty( r2 ) ) {\r
+            return true;\r
         }\r
-        return v;\r
+        return GENBANK_PROT_PATTERN.matcher( query ).lookingAt();\r
     }\r
 \r
-    public static String extractGInumber( final PhylogenyNode node ) {\r
-        String v = null;\r
-        if ( node.getNodeData().isHasSequence() ) {\r
-            final Sequence seq = node.getNodeData().getSequence();\r
-            if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( seq.getName() ) ) {\r
-                v = parseGInumber( seq.getName() );\r
+    public final static Accession obtainAccessorFromDataFields( final PhylogenyNode n ) {\r
+        String a = obtainUniProtAccessorFromDataFields( n );\r
+        if ( !ForesterUtil.isEmpty( a ) ) {\r
+            return new Accession( a, Accession.UNIPROT );\r
+        }\r
+        a = obtainGenbankAccessorFromDataFields( n );\r
+        if ( !ForesterUtil.isEmpty( a ) ) {\r
+            return new Accession( a, Accession.NCBI );\r
+        }\r
+        a = obtainRefSeqAccessorFromDataFields( n );\r
+        if ( !ForesterUtil.isEmpty( a ) ) {\r
+            return new Accession( a, Accession.REFSEQ );\r
+        }\r
+        a = obtainGiNumberFromDataFields( n );\r
+        if ( !ForesterUtil.isEmpty( a ) ) {\r
+            return new Accession( a, Accession.GI );\r
+        }\r
+        return null;\r
+    }\r
+\r
+    public final static Accession obtainFromSeqAccession( final PhylogenyNode n ) {\r
+        if ( n.getNodeData().isHasSequence() && ( n.getNodeData().getSequence().getAccession() != null )\r
+                && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getAccession().getSource() )\r
+                && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getAccession().getValue() ) ) {\r
+            final String source = n.getNodeData().getSequence().getAccession().getSource().toLowerCase();\r
+            final String value = n.getNodeData().getSequence().getAccession().getValue();\r
+            if ( ( source.startsWith( "uniprot" ) || source.equals( "swissprot" ) || source.equals( "trembl" ) || source\r
+                    .equals( "sp" ) ) ) {\r
+                return new Accession( value, Accession.UNIPROT );\r
             }\r
-            if ( ForesterUtil.isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null )\r
-                    && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) {\r
-                v = parseGInumber( seq.getAccession().getValue() );\r
+            else if ( source.equals( "embl" ) || source.equals( "ebi" ) ) {\r
+                return new Accession( value, Accession.EMBL );\r
+            }\r
+            else if ( source.equals( "ncbi" ) || source.equals( "genbank" ) ) {\r
+                return new Accession( value, Accession.NCBI );\r
+            }\r
+            else if ( source.equals( "refseq" ) ) {\r
+                return new Accession( value, Accession.REFSEQ );\r
+            }\r
+            else if ( source.equals( "gi" ) ) {\r
+                return new Accession( value, Accession.GI );\r
             }\r
         }\r
-        if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( node.getName() ) ) {\r
-            v = parseGInumber( node.getName() );\r
-        }\r
-        return v;\r
+        return null;\r
     }\r
 \r
-    public static String extractRefSeqAccessor( final PhylogenyNode node ) {\r
-        String v = null;\r
-        if ( node.getNodeData().isHasSequence() ) {\r
-            final Sequence seq = node.getNodeData().getSequence();\r
+    public final static String obtainGenbankAccessorFromDataFields( final PhylogenyNode n ) {\r
+        String a = null;\r
+        if ( n.getNodeData().isHasSequence() ) {\r
+            final Sequence seq = n.getNodeData().getSequence();\r
             if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) {\r
-                v = parseRefSeqAccessor( seq.getSymbol() );\r
+                a = parseGenbankAccessorFromString( seq.getSymbol() );\r
             }\r
             if ( !ForesterUtil.isEmpty( seq.getGeneName() ) ) {\r
-                v = parseRefSeqAccessor( seq.getGeneName() );\r
+                a = parseGenbankAccessorFromString( seq.getGeneName() );\r
             }\r
-            if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( seq.getName() ) ) {\r
-                v = parseRefSeqAccessor( seq.getName() );\r
+            if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( seq.getName() ) ) {\r
+                a = parseGenbankAccessorFromString( seq.getName() );\r
             }\r
-            if ( ForesterUtil.isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null )\r
+            if ( ForesterUtil.isEmpty( a ) && ( n.getNodeData().getSequence().getAccession() != null )\r
                     && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) {\r
-                v = parseRefSeqAccessor( seq.getAccession().getValue() );\r
+                a = parseGenbankAccessorFromString( seq.getAccession().getValue() );\r
             }\r
         }\r
-        if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( node.getName() ) ) {\r
-            v = parseRefSeqAccessor( node.getName() );\r
+        if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( n.getName() ) ) {\r
+            a = parseGenbankAccessorFromString( n.getName() );\r
         }\r
-        return v;\r
+        return a;\r
     }\r
 \r
-    public static String extractUniProtKbProteinSeqIdentifier( final PhylogenyNode node ) {\r
+    public final static String obtainGiNumberFromDataFields( final PhylogenyNode n ) {\r
         String a = null;\r
-        if ( node.getNodeData().isHasSequence() ) {\r
-            final Sequence seq = node.getNodeData().getSequence();\r
-            if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) {\r
-                a = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( seq.getSymbol() );\r
-            }\r
+        if ( n.getNodeData().isHasSequence() ) {\r
+            final Sequence seq = n.getNodeData().getSequence();\r
             if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( seq.getName() ) ) {\r
-                a = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( seq.getName() );\r
+                a = parseGInumberFromString( seq.getName() );\r
             }\r
             if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( seq.getGeneName() ) ) {\r
-                a = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( seq.getGeneName() );\r
+                a = parseGInumberFromString( seq.getGeneName() );\r
             }\r
-            if ( ForesterUtil.isEmpty( a ) && ( node.getNodeData().getSequence().getAccession() != null )\r
+            if ( ForesterUtil.isEmpty( a ) && ( n.getNodeData().getSequence().getAccession() != null )\r
                     && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) {\r
-                a = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( seq.getAccession().getValue() );\r
+                a = parseGInumberFromString( seq.getAccession().getValue() );\r
             }\r
         }\r
-        if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( node.getName() ) ) {\r
-            a = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( node.getName() );\r
+        if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( n.getName() ) ) {\r
+            a = parseGInumberFromString( n.getName() );\r
         }\r
         return a;\r
     }\r
 \r
-    public static String extractUniProtKbProteinSeqIdentifier( final String str ) {\r
-        Matcher m = UNIPROT_KB_PATTERN_0.matcher( str );\r
-        if ( m.find() ) {\r
-            return m.group( 1 );\r
-        }\r
-        m = UNIPROT_KB_PATTERN_1.matcher( str );\r
-        if ( m.find() ) {\r
-            return m.group( 1 );\r
-        }\r
-        m = UNIPROT_KB_PATTERN_2.matcher( str );\r
-        if ( m.find() ) {\r
-            return m.group();\r
-        }\r
-        return null;\r
-    }\r
-\r
-    public final static boolean isProtein( final String query ) {\r
-        final String r1 = parseRefSeqAccessor( query );\r
-        if ( !ForesterUtil.isEmpty( r1 ) && ( r1.charAt( 1 ) == 'P' ) ) {\r
-            return true;\r
-        }\r
-        final String r2 = extractUniProtKbProteinSeqIdentifier( query );\r
-        if ( !ForesterUtil.isEmpty( r2 ) ) {\r
-            return true;\r
-        }\r
-        return GENBANK_PROTEIN_AC_PATTERN.matcher( query ).lookingAt();\r
-    }\r
-\r
-    public final static Accession parse( final PhylogenyNode n ) {\r
-        String v = extractUniProtKbProteinSeqIdentifier( n );\r
-        if ( !ForesterUtil.isEmpty( v ) ) {\r
-            return new Accession( v, Accession.UNIPROT );\r
-        }\r
-        v = extractGenbankAccessor( n );\r
-        if ( !ForesterUtil.isEmpty( v ) ) {\r
-            return new Accession( v, Accession.NCBI );\r
-        }\r
-        v = extractRefSeqAccessor( n );\r
-        if ( !ForesterUtil.isEmpty( v ) ) {\r
-            return new Accession( v, Accession.REFSEQ );\r
+    public final static String obtainRefSeqAccessorFromDataFields( final PhylogenyNode n ) {\r
+        String a = null;\r
+        if ( n.getNodeData().isHasSequence() ) {\r
+            final Sequence seq = n.getNodeData().getSequence();\r
+            if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) {\r
+                a = parseRefSeqAccessorFromString( seq.getSymbol() );\r
+            }\r
+            if ( !ForesterUtil.isEmpty( seq.getGeneName() ) ) {\r
+                a = parseRefSeqAccessorFromString( seq.getGeneName() );\r
+            }\r
+            if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( seq.getName() ) ) {\r
+                a = parseRefSeqAccessorFromString( seq.getName() );\r
+            }\r
+            if ( ForesterUtil.isEmpty( a ) && ( n.getNodeData().getSequence().getAccession() != null )\r
+                    && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) {\r
+                a = parseRefSeqAccessorFromString( seq.getAccession().getValue() );\r
+            }\r
         }\r
-        v = extractGInumber( n );\r
-        if ( !ForesterUtil.isEmpty( v ) ) {\r
-            return new Accession( v, Accession.GI );\r
+        if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( n.getName() ) ) {\r
+            a = parseRefSeqAccessorFromString( n.getName() );\r
         }\r
-        return null;\r
+        return a;\r
     }\r
 \r
-    public final static Accession obtainFromSeqAccession( final PhylogenyNode node ) {\r
-        if ( node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null )\r
-                && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )\r
-                && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) ) {\r
-            final String source = node.getNodeData().getSequence().getAccession().getSource().toLowerCase();\r
-            final String value = node.getNodeData().getSequence().getAccession().getValue();\r
-            if ( ( source.startsWith( "uniprot" ) || source.equals( "swissprot" ) || source.equals( "trembl" ) || source\r
-                    .equals( "sp" ) ) ) {\r
-                return new Accession( value, Accession.UNIPROT );\r
-            }\r
-            else if ( source.equals( "embl" ) || source.equals( "ebi" ) ) {\r
-                return new Accession( value, Accession.EMBL );\r
+    public final static String obtainUniProtAccessorFromDataFields( final PhylogenyNode n ) {\r
+        String a = null;\r
+        if ( n.getNodeData().isHasSequence() ) {\r
+            final Sequence seq = n.getNodeData().getSequence();\r
+            if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) {\r
+                a = SequenceAccessionTools.parseUniProtAccessorFromString( seq.getSymbol() );\r
             }\r
-            else if ( source.equals( "ncbi" ) || source.equals( "genbank" ) ) {\r
-                return new Accession( value, Accession.NCBI );\r
+            if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( seq.getName() ) ) {\r
+                a = SequenceAccessionTools.parseUniProtAccessorFromString( seq.getName() );\r
             }\r
-            else if ( source.equals( "refseq" ) ) {\r
-                return new Accession( value, Accession.REFSEQ );\r
+            if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( seq.getGeneName() ) ) {\r
+                a = SequenceAccessionTools.parseUniProtAccessorFromString( seq.getGeneName() );\r
             }\r
-            else if ( source.equals( "gi" ) ) {\r
-                return new Accession( value, Accession.GI );\r
+            if ( ForesterUtil.isEmpty( a ) && ( n.getNodeData().getSequence().getAccession() != null )\r
+                    && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) {\r
+                a = SequenceAccessionTools.parseUniProtAccessorFromString( seq.getAccession().getValue() );\r
             }\r
         }\r
-        return null;\r
+        if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( n.getName() ) ) {\r
+            a = SequenceAccessionTools.parseUniProtAccessorFromString( n.getName() );\r
+        }\r
+        return a;\r
     }\r
 \r
-    /**\r
-     * Returns null if no match.\r
-     * \r
-     */\r
-    public final static Accession parse( final String s ) {\r
+    public final static Accession parseAccessorFromString( final String s ) {\r
         if ( !ForesterUtil.isEmpty( s ) ) {\r
-            String v = extractUniProtKbProteinSeqIdentifier( s );\r
+            String v = parseUniProtAccessorFromString( s );\r
             if ( !ForesterUtil.isEmpty( v ) ) {\r
                 return new Accession( v, Accession.UNIPROT );\r
             }\r
-            v = parseGenbankAccessor( s );\r
+            v = parseGenbankAccessorFromString( s );\r
             if ( !ForesterUtil.isEmpty( v ) ) {\r
                 return new Accession( v, Accession.NCBI );\r
             }\r
-            v = parseRefSeqAccessor( s );\r
+            v = parseRefSeqAccessorFromString( s );\r
             if ( !ForesterUtil.isEmpty( v ) ) {\r
                 return new Accession( v, Accession.REFSEQ );\r
             }\r
-            v = parseGInumber( s );\r
+            v = parseGInumberFromString( s );\r
             if ( !ForesterUtil.isEmpty( v ) ) {\r
                 return new Accession( v, Accession.GI );\r
             }\r
@@ -264,22 +245,18 @@ public final class SequenceAccessionTools {
         return null;\r
     }\r
 \r
-    /**\r
-     * Returns null if no match.\r
-     * \r
-     */\r
-    public static String parseGenbankAccessor( final String query ) {\r
-        Matcher m = GENBANK_NUCLEOTIDE_AC_PATTERN_1.matcher( query );\r
+    public final static String parseGenbankAccessorFromString( final String s ) {\r
+        Matcher m = GENBANK_NUC_PATTERN_1.matcher( s );\r
         if ( m.lookingAt() ) {\r
             return m.group( 1 );\r
         }\r
         else {\r
-            m = GENBANK_NUCLEOTIDE_AC_PATTERN_2.matcher( query );\r
+            m = GENBANK_NUC_PATTERN_2.matcher( s );\r
             if ( m.lookingAt() ) {\r
                 return m.group( 1 );\r
             }\r
             else {\r
-                m = GENBANK_PROTEIN_AC_PATTERN.matcher( query );\r
+                m = GENBANK_PROT_PATTERN.matcher( s );\r
                 if ( m.lookingAt() ) {\r
                     return m.group( 1 );\r
                 }\r
@@ -290,8 +267,8 @@ public final class SequenceAccessionTools {
         }\r
     }\r
 \r
-    public static String parseGenbankProteinAccessor( final String query ) {\r
-        final Matcher m = GENBANK_PROTEIN_AC_PATTERN.matcher( query );\r
+    public final static String parseGenbankProteinAccessorFromString( final String s ) {\r
+        final Matcher m = GENBANK_PROT_PATTERN.matcher( s );\r
         if ( m.lookingAt() ) {\r
             return m.group( 1 );\r
         }\r
@@ -300,23 +277,35 @@ public final class SequenceAccessionTools {
         }\r
     }\r
 \r
-    public static String parseGInumber( final String query ) {\r
-        final Matcher m = GI_PATTERN.matcher( query );\r
+    public final static String parseGInumberFromString( final String s ) {\r
+        final Matcher m = GI_PATTERN.matcher( s );\r
         if ( m.find() ) {\r
             return m.group( 1 );\r
         }\r
         return null;\r
     }\r
 \r
-    /**\r
-     * Returns null if no match.\r
-     * \r
-     */\r
-    public final static String parseRefSeqAccessor( final String query ) {\r
-        final Matcher m = REFSEQ_PATTERN.matcher( query );\r
+    public final static String parseRefSeqAccessorFromString( final String s ) {\r
+        final Matcher m = REFSEQ_PATTERN.matcher( s );\r
         if ( m.lookingAt() ) {\r
             return m.group( 1 );\r
         }\r
         return null;\r
     }\r
+\r
+    public final static String parseUniProtAccessorFromString( final String s ) {\r
+        Matcher m = UNIPROT_KB_PATTERN_0.matcher( s );\r
+        if ( m.find() ) {\r
+            return m.group( 1 );\r
+        }\r
+        m = UNIPROT_KB_PATTERN_1.matcher( s );\r
+        if ( m.find() ) {\r
+            return m.group( 1 );\r
+        }\r
+        m = UNIPROT_KB_PATTERN_2.matcher( s );\r
+        if ( m.find() ) {\r
+            return m.group();\r
+        }\r
+        return null;\r
+    }\r
 }\r
index c40b37e..17c56d1 100644 (file)
@@ -26,6 +26,7 @@
 package org.forester.ws.seqdb;
 
 import java.io.BufferedReader;
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.UnsupportedEncodingException;
@@ -58,7 +59,7 @@ public final class SequenceDbWsTools {
     public final static String   EMBL_DBS_REFSEQ_P = "refseqp";
     public final static String   EMBL_DBS_REFSEQ_N = "refseqn";
     private final static String  URL_ENC           = "UTF-8";
-    private final static boolean DEBUG             = false;
+    private final static boolean DEBUG             = true;
 
     private static List<UniProtTaxonomy> getTaxonomiesFromCommonName( final String cn, final int max_taxonomies_return )
             throws IOException {
@@ -159,96 +160,13 @@ public final class SequenceDbWsTools {
             if ( ext_nodes_only && node.isInternal() ) {
                 continue;
             }
-            //            String query = null;
-            //            Accession id = null;
-            //            Accession acc = SequenceAccessionTools.obtain( node );
-            //            
-            //            
-            //            Db db = Db.NONE;
-            //            if ( node.getNodeData().isHasSequence()
-            //                    && ( node.getNodeData().getSequence().getAccession() != null )
-            //                    && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
-            //                    && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
-            //                    && ( node.getNodeData().getSequence().getAccession().getValue().toLowerCase()
-            //                            .startsWith( "uniprot" )
-            //                            || node.getNodeData().getSequence().getAccession().getValue()
-            //                                    .equalsIgnoreCase( "swissprot" )
-            //                            || node.getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "trembl" ) || node
-            //                            .getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "sp" ) ) ) {
-            //                query = node.getNodeData().getSequence().getAccession().getValue();
-            //                db = Db.UNIPROT;
-            //            }
-            //            else if ( node.getNodeData().isHasSequence()
-            //                    && ( node.getNodeData().getSequence().getAccession() != null )
-            //                    && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
-            //                    && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
-            //                    && ( node.getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "embl" ) || node
-            //                            .getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "ebi" ) ) ) {
-            //                query = node.getNodeData().getSequence().getAccession().getValue();
-            //                db = Db.EMBL;
-            //            }
-            //            else if ( node.getNodeData().isHasSequence()
-            //                    && ( node.getNodeData().getSequence().getAccession() != null )
-            //                    && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
-            //                    && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
-            //                    && ( node.getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "ncbi" ) || node
-            //                            .getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "genbank" ) ) ) {
-            //                query = node.getNodeData().getSequence().getAccession().getValue();
-            //                // db = Db.NCBI;
-            //            }
-            //            else if ( node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null )
-            //                    && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
-            //                    && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
-            //                    && node.getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "refseq" ) ) {
-            //                query = node.getNodeData().getSequence().getAccession().getValue();
-            //                db = Db.REFSEQ;
-            //            }
-            //            else {
             Accession acc = SequenceAccessionTools.obtainFromSeqAccession( node );
-            //                if ( ( query = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( node ) ) != null ) {
-            //                    db = Db.UNIPROT;
-            //                }
-            //                else if ( node.getNodeData().isHasSequence() ) {
-            //                    if ( ( id = SequenceAccessionTools.parse( node.getName() ) ) != null ) {
-            //                        if ( id.getSource() == Accession.NCBI ) {
-            //                            //  db = Db.NCBI;
-            //                        }
-            //                        else if ( id.getSource() == Accession.REFSEQ ) {
-            //                            db = Db.REFSEQ;
-            //                        }
-            //                    }
-            //                    else if ( ( id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getName() ) ) != null ) {
-            //                        if ( id.getSource() == Accession.NCBI ) {
-            //                            // = Db.NCBI;
-            //                        }
-            //                        else if ( id.getSource() == Accession.REFSEQ ) {
-            //                            db = Db.REFSEQ;
-            //                        }
-            //                    }
-            //                    else if ( ( id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getGeneName() ) ) != null ) {
-            //                        if ( id.getSource() == Accession.NCBI ) {
-            //                            // db = Db.NCBI;
-            //                        }
-            //                        else if ( id.getSource() == Accession.REFSEQ ) {
-            //                            db = Db.REFSEQ;
-            //                        }
-            //                    }
-            //                    else if ( ( id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getSymbol() ) ) != null ) {
-            //                        if ( id.getSource() == Accession.NCBI ) {
-            //                            // db = Db.NCBI;
-            //                        }
-            //                        else if ( id.getSource() == Accession.REFSEQ ) {
-            //                            db = Db.REFSEQ;
-            //                        }
-            //                    }
-            //                }
-            // }
             if ( ( acc == null )
                     || ForesterUtil.isEmpty( acc.getSource() )
                     || ForesterUtil.isEmpty( acc.getValue() )
                     || ( ( acc.getSource() != Accession.UNIPROT ) && ( acc.getSource() != Accession.EMBL ) && ( acc
                             .getSource() != Accession.REFSEQ ) ) ) {
-                acc = SequenceAccessionTools.parse( node );
+                acc = SequenceAccessionTools.obtainAccessorFromDataFields( node );
             }
             if ( ( acc == null )
                     || ForesterUtil.isEmpty( acc.getSource() )
@@ -264,19 +182,34 @@ public final class SequenceDbWsTools {
                     if ( DEBUG ) {
                         System.out.println( "uniprot: " + query );
                     }
-                    db_entry = obtainUniProtEntry( query, lines_to_return );
+                    try {
+                        db_entry = obtainUniProtEntry( query, lines_to_return );
+                    }
+                    catch ( FileNotFoundException e ) {
+                        // Eat this, and move to next.
+                    }
                 }
                 else if ( acc.getSource() == Accession.EMBL ) {
                     if ( DEBUG ) {
                         System.out.println( "embl: " + query );
                     }
-                    db_entry = obtainEmblEntry( new Accession( query ), lines_to_return );
+                    try {
+                        db_entry = obtainEmblEntry( new Accession( query ), lines_to_return );
+                    }
+                    catch ( FileNotFoundException e ) {
+                        // Eat this, and move to next.
+                    }
                 }
                 else if ( acc.getSource() == Accession.REFSEQ ) {
                     if ( DEBUG ) {
                         System.out.println( "refseq: " + query );
                     }
-                    db_entry = obtainRefSeqEntryFromEmbl( new Accession( query ), lines_to_return );
+                    try {
+                        db_entry = obtainRefSeqEntryFromEmbl( new Accession( query ), lines_to_return );
+                    }
+                    catch ( FileNotFoundException e ) {
+                        // Eat this, and move to next.
+                    }
                 }
                 if ( ( db_entry != null ) && !db_entry.isEmpty() ) {
                     final Sequence seq = node.getNodeData().isHasSequence() ? node.getNodeData().getSequence()
@@ -322,6 +255,7 @@ public final class SequenceDbWsTools {
                     node.getNodeData().setSequence( seq );
                 }
                 else {
+                    node.i
                     not_found.add( node.getName() );
                 }
                 try {
index 2eaa720..4a8d158 100644 (file)
@@ -132,7 +132,6 @@ public final class UniProtEntry implements SequenceDatabaseEntry {
         if ( _go_terms == null ) {
             _go_terms = new ArrayList<GoTerm>();
         }
-        System.out.println( "GOTERM ADDED: " + g );
         _go_terms.add( g );
     }
 
@@ -209,7 +208,6 @@ public final class UniProtEntry implements SequenceDatabaseEntry {
                         else if ( ns_str.equals( "C" ) ) {
                             gns = GoNameSpace.CELLULAR_COMPONENT_STR;
                         }
-                        System.out.println( "GO:" + id + " " + desc + " " + ns_str );
                         e.addGoTerm( new BasicGoTerm( id, desc, gns, false ) );
                     }
                 }