inprogress
authorcmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Wed, 6 Mar 2013 04:29:05 +0000 (04:29 +0000)
committercmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Wed, 6 Mar 2013 04:29:05 +0000 (04:29 +0000)
forester/java/src/org/forester/archaeopteryx/AptxUtil.java
forester/java/src/org/forester/archaeopteryx/TreePanel.java
forester/java/src/org/forester/test/Test.java
forester/java/src/org/forester/util/ForesterUtil.java
forester/java/src/org/forester/util/SequenceIdParser.java

index 18e80e7..823e18f 100644 (file)
@@ -36,10 +36,12 @@ import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.FileNotFoundException;
 import java.io.IOException;
+import java.io.UnsupportedEncodingException;
 import java.lang.reflect.InvocationTargetException;
 import java.lang.reflect.Method;
 import java.net.URI;
 import java.net.URL;
+import java.net.URLEncoder;
 import java.text.ParseException;
 import java.util.Arrays;
 import java.util.HashMap;
@@ -76,6 +78,7 @@ import org.forester.phylogeny.PhylogenyMethods;
 import org.forester.phylogeny.PhylogenyNode;
 import org.forester.phylogeny.data.Accession;
 import org.forester.phylogeny.data.BranchColor;
+import org.forester.phylogeny.data.Sequence;
 import org.forester.phylogeny.data.Taxonomy;
 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
 import org.forester.phylogeny.factories.PhylogenyFactory;
@@ -83,17 +86,13 @@ import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
 import org.forester.phylogeny.iterators.PreorderTreeIterator;
 import org.forester.util.AsciiHistogram;
 import org.forester.util.DescriptiveStatistics;
+import org.forester.util.ForesterConstants;
 import org.forester.util.ForesterUtil;
+import org.forester.util.SequenceIdParser;
 import org.forester.ws.seqdb.UniProtTaxonomy;
 
 public final class AptxUtil {
 
-    final static String           UNIPROT_KB                     = "http://www.uniprot.org/uniprot/";
-    final static Pattern          UNIPROT_KB_PATTERN_1             = Pattern
-            .compile( "\\b(sp|tr)\\W([A-Z0-9]{5,6})\\b" );
-
-    final static Pattern          UNIPROT_KB_PATTERN_2             = Pattern
-                                                                         .compile( "\\b[A-Z0-9]{5,6}_[A-Z9][A-Z]{2}[A-Z0-9]{2}|RAT|PIG|PEA\\b" );
     private final static Pattern  seq_identifier_pattern_1       = Pattern
                                                                          .compile( "^([A-Za-z]{2,5})[|=:]([0-9A-Za-z_\\.]{5,40})\\s*$" );
     private final static Pattern  seq_identifier_pattern_2       = Pattern
@@ -104,6 +103,81 @@ public final class AptxUtil {
         Arrays.sort( AVAILABLE_FONT_FAMILIES_SORTED );
     }
 
+    public final static String createUriForSeqWeb( final PhylogenyNode node, // Builds the sequence web-lookup URI for node by trying four strategies in order.
+                                                   final Configuration conf, // conf: queried for web links by lower-cased accession source
+                                                   final TreePanel tp ) { // tp: handed to showErrorMessage when URL-encoding fails
+        String uri_str = null;
+        if ( node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null ) // 1st choice: explicit accession whose source has a web link
+                && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
+                && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
+                && conf.isHasWebLink( node.getNodeData().getSequence().getAccession().getSource().toLowerCase() ) ) { // NOTE(review): conf is dereferenced without a null check once the accession tests above pass
+            final Sequence seq = node.getNodeData().getSequence();
+            final String source = seq.getAccession().getSource().toLowerCase();
+            String url;
+            if ( source.toLowerCase().equals( "ncbi" ) ) { // source is already lower-cased on the previous statement; this second toLowerCase() is redundant
+                url = Constants.NCBI_ALL_DATABASE_SEARCH; // "ncbi" uses this constant instead of a configured web link
+            }
+            else {
+                final WebLink weblink = conf.getWebLink( source );
+                url = weblink.getUrl().toString();
+            }
+            try {
+                uri_str = url + URLEncoder.encode( seq.getAccession().getValue(), ForesterConstants.UTF8 ); // accession value is URL-encoded and appended to the base URL
+            }
+            catch ( final UnsupportedEncodingException e ) { // error is reported and uri_str keeps its initial null
+                showErrorMessage( tp, e.toString() );
+                e.printStackTrace();
+            }
+        }
+        if ( ForesterUtil.isEmpty( uri_str ) ) { // 2nd choice: UniProtKB identifier extracted from the node
+            final String upkb = ForesterUtil.extractUniProtKbProteinSeqIdentifier( node );
+            if ( !ForesterUtil.isEmpty( upkb ) ) {
+                try {
+                    uri_str = ForesterUtil.UNIPROT_KB + URLEncoder.encode( upkb, ForesterConstants.UTF8 );
+                }
+                catch ( final UnsupportedEncodingException e ) {
+                    showErrorMessage( tp, e.toString() );
+                    e.printStackTrace();
+                }
+            }
+        }
+        if ( ForesterUtil.isEmpty( uri_str ) ) { // 3rd choice: GenBank accessor extracted from the node
+            final String v = ForesterUtil.extractGenbankAccessor( node );
+            if ( !ForesterUtil.isEmpty( v ) ) {
+                try {
+                    if ( SequenceIdParser.isProtein( v ) ) { // ids reported as protein go to NCBI_PROTEIN, all others to NCBI_NUCCORE
+                        uri_str = ForesterUtil.NCBI_PROTEIN + URLEncoder.encode( v, ForesterConstants.UTF8 );
+                    }
+                    else {
+                        uri_str = ForesterUtil.NCBI_NUCCORE + URLEncoder.encode( v, ForesterConstants.UTF8 );
+                    }
+                }
+                catch ( final UnsupportedEncodingException e ) {
+                    showErrorMessage( tp, e.toString() );
+                    e.printStackTrace();
+                }
+            }
+        }
+        if ( ForesterUtil.isEmpty( uri_str ) ) { // 4th choice: RefSeq accessor, resolved with the same protein/nucleotide split as above
+            final String v = ForesterUtil.extractRefSeqAccessorAccessor( node );
+            if ( !ForesterUtil.isEmpty( v ) ) {
+                try {
+                    if ( SequenceIdParser.isProtein( v ) ) {
+                        uri_str = ForesterUtil.NCBI_PROTEIN + URLEncoder.encode( v, ForesterConstants.UTF8 );
+                    }
+                    else {
+                        uri_str = ForesterUtil.NCBI_NUCCORE + URLEncoder.encode( v, ForesterConstants.UTF8 );
+                    }
+                }
+                catch ( final UnsupportedEncodingException e ) {
+                    showErrorMessage( tp, e.toString() );
+                    e.printStackTrace();
+                }
+            }
+        }
+        return uri_str; // still the initial null when no strategy produced a URI
+    }
+
     public static MaskFormatter createMaskFormatter( final String s ) {
         MaskFormatter formatter = null;
         try {
index b94080f..07a99bd 100644 (file)
@@ -77,7 +77,6 @@ import java.util.Hashtable;
 import java.util.List;
 import java.util.Set;
 import java.util.SortedSet;
-import java.util.regex.Matcher;
 
 import javax.swing.BorderFactory;
 import javax.swing.JApplet;
@@ -3142,7 +3141,14 @@ public final class TreePanel extends JPanel implements ActionListener, MouseWhee
             final String title = clickto_names.get( i );
             _node_popup_menu_items[ i ] = new JMenuItem( title );
             if ( title.equals( Configuration.clickto_options[ Configuration.open_seq_web ][ 0 ] ) ) {
-                _node_popup_menu_items[ i ].setEnabled( isCanOpenSeqWeb( node ) );
+                final String id = isCanOpenSeqWeb( node );
+                if ( !ForesterUtil.isEmpty( id ) ) {
+                    _node_popup_menu_items[ i ].setText( _node_popup_menu_items[ i ].getText() + " [" + id + "]" );
+                    _node_popup_menu_items[ i ].setEnabled( true );
+                }
+                else {
+                    _node_popup_menu_items[ i ].setEnabled( false );
+                }
             }
             else if ( title.equals( Configuration.clickto_options[ Configuration.open_tax_web ][ 0 ] ) ) {
                 _node_popup_menu_items[ i ].setEnabled( isCanOpenTaxWeb( node ) );
@@ -3233,137 +3239,31 @@ public final class TreePanel extends JPanel implements ActionListener, MouseWhee
         }
     }
 
-    final private boolean isCanOpenSeqWeb( final PhylogenyNode node ) {
+    final private String isCanOpenSeqWeb( final PhylogenyNode node ) { // Now returns a string instead of a boolean; callers test the result with ForesterUtil.isEmpty.
         if ( node.getNodeData().isHasSequence()
                 && ( node.getNodeData().getSequence().getAccession() != null )
                 && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
                 && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
                 && getConfiguration().isHasWebLink( node.getNodeData().getSequence().getAccession().getSource()
                         .toLowerCase() ) ) {
-            return true;
+            return node.getNodeData().getSequence().getAccession().getSource(); // NOTE(review): returns the accession source, not its value, while the fallbacks below return identifiers; confirm this is intended
         }
-        if ( !ForesterUtil.isEmpty( node.getName() )
-                && ( AptxUtil.UNIPROT_KB_PATTERN_1.matcher( node.getName() ).find() || AptxUtil.UNIPROT_KB_PATTERN_2
-                        .matcher( node.getName() ).find() ) ) {
-            return true;
+        String v = ForesterUtil.extractUniProtKbProteinSeqIdentifier( node ); // fallback chain: UniProtKB identifier, then GenBank accessor, then RefSeq accessor
+        if ( ForesterUtil.isEmpty( v ) ) {
+            v = ForesterUtil.extractGenbankAccessor( node );
         }
-        if ( node.getNodeData().isHasSequence() ) {
-            Sequence seq = node.getNodeData().getSequence();
-            if ( !ForesterUtil.isEmpty( seq.getName() )
-                    && ( AptxUtil.UNIPROT_KB_PATTERN_1.matcher( seq.getName() ).find() || AptxUtil.UNIPROT_KB_PATTERN_2
-                            .matcher( seq.getName() ).find() ) ) {
-                return true;
-            }
-            if ( !ForesterUtil.isEmpty( seq.getSymbol() )
-                    && ( AptxUtil.UNIPROT_KB_PATTERN_1.matcher( seq.getSymbol() ).find() || AptxUtil.UNIPROT_KB_PATTERN_2
-                            .matcher( seq.getSymbol() ).find() ) ) {
-                return true;
-            }
-            if ( ( node.getNodeData().getSequence().getAccession() != null )
-                    && !ForesterUtil.isEmpty( seq.getAccession().getValue() )
-                    && ( AptxUtil.UNIPROT_KB_PATTERN_1.matcher( seq.getAccession().getValue() ).find() || AptxUtil.UNIPROT_KB_PATTERN_2
-                            .matcher( seq.getAccession().getValue() ).find() ) ) {
-                return true;
-            }
+        if ( ForesterUtil.isEmpty( v ) ) {
+            v = ForesterUtil.extractRefSeqAccessorAccessor( node );
         }
-        return false;
+        return v; // result of the last extractor tried; callers treat an empty result as "cannot open"
     }
 
     final private void openSeqWeb( final PhylogenyNode node ) {
-        if ( !isCanOpenSeqWeb( node ) ) {
+        if ( ForesterUtil.isEmpty( isCanOpenSeqWeb( node ) ) ) {
             cannotOpenBrowserWarningMessage( "sequence" );
             return;
         }
-        String uri_str = null;
-        if ( node.getNodeData().isHasSequence()
-                && ( node.getNodeData().getSequence().getAccession() != null )
-                && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
-                && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
-                && getConfiguration().isHasWebLink( node.getNodeData().getSequence().getAccession().getSource()
-                        .toLowerCase() ) ) {
-            final Sequence seq = node.getNodeData().getSequence();
-            final String source = seq.getAccession().getSource().toLowerCase();
-            String url;
-            if ( source.toLowerCase().equals( "ncbi" ) ) {
-                url = Constants.NCBI_ALL_DATABASE_SEARCH;
-            }
-            else {
-                final WebLink weblink = getConfiguration().getWebLink( source );
-                url = weblink.getUrl().toString();
-            }
-            try {
-                uri_str = url + URLEncoder.encode( seq.getAccession().getValue(), ForesterConstants.UTF8 );
-            }
-            catch ( final UnsupportedEncodingException e ) {
-                AptxUtil.showErrorMessage( this, e.toString() );
-                e.printStackTrace();
-            }
-        }
-        else {
-            String upkb = null;
-            if ( node.getNodeData().isHasSequence() ) {
-                Sequence seq = node.getNodeData().getSequence();
-                Matcher m;
-                if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) {
-                    m = AptxUtil.UNIPROT_KB_PATTERN_1.matcher( seq.getSymbol() );
-                    if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) {
-                        if ( m.find() ) {
-                            upkb = m.group( 2 );
-                        }
-                        else {
-                            m = AptxUtil.UNIPROT_KB_PATTERN_2.matcher( seq.getSymbol() );
-                            if ( m.find() ) {
-                                upkb = m.group();
-                            }
-                        }
-                    }
-                }
-                if ( ForesterUtil.isEmpty( upkb ) && !ForesterUtil.isEmpty( seq.getName() ) ) {
-                    m = AptxUtil.UNIPROT_KB_PATTERN_1.matcher( seq.getName() );
-                    if ( m.find() ) {
-                        upkb = m.group( 2 );
-                    }
-                    else {
-                        m = AptxUtil.UNIPROT_KB_PATTERN_2.matcher( seq.getName() );
-                        if ( m.find() ) {
-                            upkb = m.group();
-                        }
-                    }
-                }
-                if ( ForesterUtil.isEmpty( upkb ) && ( node.getNodeData().getSequence().getAccession() != null )
-                        && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) {
-                    m = AptxUtil.UNIPROT_KB_PATTERN_1.matcher( seq.getAccession().getValue() );
-                    if ( m.find() ) {
-                        upkb = m.group( 2 );
-                    }
-                    else {
-                        m = AptxUtil.UNIPROT_KB_PATTERN_2.matcher( seq.getAccession().getValue() );
-                        if ( m.find() ) {
-                            upkb = m.group();
-                        }
-                    }
-                }
-            }
-            if ( ForesterUtil.isEmpty( upkb ) && !ForesterUtil.isEmpty( node.getName() ) ) {
-                final Matcher m1 = AptxUtil.UNIPROT_KB_PATTERN_1.matcher( node.getName() );
-                if ( m1.find() ) {
-                    upkb = m1.group( 2 );
-                }
-                else {
-                    final Matcher m2 = AptxUtil.UNIPROT_KB_PATTERN_2.matcher( node.getName() );
-                    if ( m2.find() ) {
-                        upkb = m2.group();
-                    }
-                }
-            }
-            try {
-                uri_str = AptxUtil.UNIPROT_KB + URLEncoder.encode( upkb, ForesterConstants.UTF8 );
-            }
-            catch ( final UnsupportedEncodingException e ) {
-                AptxUtil.showErrorMessage( this, e.toString() );
-                e.printStackTrace();
-            }
-        }
+        final String uri_str = AptxUtil.createUriForSeqWeb( node, getConfiguration(), this );
         if ( !ForesterUtil.isEmpty( uri_str ) ) {
             try {
                 AptxUtil.launchWebBrowser( new URI( uri_str ),
@@ -3417,7 +3317,7 @@ public final class TreePanel extends JPanel implements ActionListener, MouseWhee
         }
         else if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) {
             try {
-                uri_str = "http://www.eol.org/search?q="
+                uri_str = "http://www.uniprot.org/taxonomy/?query="
                         + URLEncoder.encode( tax.getScientificName(), ForesterConstants.UTF8 );
             }
             catch ( final UnsupportedEncodingException e ) {
@@ -3437,7 +3337,7 @@ public final class TreePanel extends JPanel implements ActionListener, MouseWhee
         }
         else if ( !ForesterUtil.isEmpty( tax.getCommonName() ) ) {
             try {
-                uri_str = "http://www.eol.org/search?q="
+                uri_str = "http://www.uniprot.org/taxonomy/?query="
                         + URLEncoder.encode( tax.getCommonName(), ForesterConstants.UTF8 );
             }
             catch ( final UnsupportedEncodingException e ) {
@@ -3447,11 +3347,10 @@ public final class TreePanel extends JPanel implements ActionListener, MouseWhee
         }
         if ( !ForesterUtil.isEmpty( uri_str ) ) {
             try {
-                JApplet applet = null;
-                if ( isApplet() ) {
-                    applet = obtainApplet();
-                }
-                AptxUtil.launchWebBrowser( new URI( uri_str ), isApplet(), applet, "_aptx_tax" );
+                AptxUtil.launchWebBrowser( new URI( uri_str ),
+                                           isApplet(),
+                                           isApplet() ? obtainApplet() : null,
+                                           "_aptx_tax" );
             }
             catch ( final IOException e ) {
                 AptxUtil.showErrorMessage( this, e.toString() );
index bd8b586..c13523c 100644 (file)
@@ -38,6 +38,7 @@ import java.util.Locale;
 import java.util.Set;
 
 import org.forester.application.support_transfer;
+import org.forester.archaeopteryx.AptxUtil;
 import org.forester.development.DevelopmentTools;
 import org.forester.evoinference.TestPhylogenyReconstruction;
 import org.forester.evoinference.matrix.character.CharacterStateMatrix;
@@ -67,6 +68,7 @@ import org.forester.phylogeny.PhylogenyBranch;
 import org.forester.phylogeny.PhylogenyMethods;
 import org.forester.phylogeny.PhylogenyNode;
 import org.forester.phylogeny.PhylogenyNode.NH_CONVERSION_SUPPORT_VALUE_STYLE;
+import org.forester.phylogeny.data.Accession;
 import org.forester.phylogeny.data.BinaryCharacters;
 import org.forester.phylogeny.data.BranchWidth;
 import org.forester.phylogeny.data.Confidence;
@@ -216,6 +218,24 @@ public final class Test {
             System.out.println( "failed." );
             failed++;
         }
+        System.out.print( "UniProtKB id extraction: " );
+        if ( Test.testExtractUniProtKbProteinSeqIdentifier() ) {
+            System.out.println( "OK." );
+            succeeded++;
+        }
+        else {
+            System.out.println( "failed." );
+            failed++;
+        }
+        System.out.print( "Uri for Aptx web sequence accession: " );
+        if ( Test.testCreateUriForSeqWeb() ) {
+            System.out.println( "OK." );
+            succeeded++;
+        }
+        else {
+            System.out.println( "failed." );
+            failed++;
+        }
         System.out.print( "Basic node construction and parsing of NHX (node level): " );
         if ( Test.testNHXNodeParsing() ) {
             System.out.println( "OK." );
@@ -824,6 +844,221 @@ public final class Test {
         }
     }
 
+    private static boolean testExtractUniProtKbProteinSeqIdentifier() { // Checks identifier extraction from node names and sequence fields; returns false on the first mismatch.
+        try {
+            PhylogenyNode n = new PhylogenyNode();
+            n.setName( "tr|B3RJ64" ); // "tr"/"sp" prefix plus a separator: only the accession part is expected back
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+                return false;
+            }
+            n.setName( "tr.B3RJ64" );
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+                return false;
+            }
+            n.setName( "tr=B3RJ64" );
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+                return false;
+            }
+            n.setName( "tr-B3RJ64" );
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+                return false;
+            }
+            n.setName( "tr/B3RJ64" );
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+                return false;
+            }
+            n.setName( "tr\\B3RJ64" );
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+                return false;
+            }
+            n.setName( "tr_B3RJ64" );
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+                return false;
+            }
+            n.setName( " tr|B3RJ64 " ); // surrounding whitespace or punctuation must not prevent the match
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+                return false;
+            }
+            n.setName( "-tr|B3RJ64-" );
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+                return false;
+            }
+            n.setName( "-tr=B3RJ64-" );
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+                return false;
+            }
+            n.setName( "_tr=B3RJ64_" );
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+                return false;
+            }
+            n.setName( " tr_tr|B3RJ64_sp|123 " );
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+                return false;
+            }
+            n.setName( "sp|B3RJ64" );
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+                return false;
+            }
+            n.setName( "ssp|B3RJ64" ); // negative cases start here: null is the expected result
+            if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+                return false;
+            }
+            n.setName( "sp|B3RJ64C" );
+            if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+                return false;
+            }
+            n.setName( "sp B3RJ64" );
+            if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+                return false;
+            }
+            n.setName( "sp|B3RJ6X" );
+            if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+                return false;
+            }
+            n.setName( "sp|B3RJ6" );
+            if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+                return false;
+            }
+            n.setName( "K1PYK7_CRAGI" ); // ids of the form NAME_CODE are expected back whole
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
+                return false;
+            }
+            n.setName( "K1PYK7_PEA" );
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PEA" ) ) {
+                return false;
+            }
+            n.setName( "K1PYK7_RAT" );
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_RAT" ) ) {
+                return false;
+            }
+            n.setName( "K1PYK7_PIG" );
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PIG" ) ) {
+                return false;
+            }
+            n.setName( "~K1PYK7_PIG~" );
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PIG" ) ) {
+                return false;
+            }
+            n.setName( "123456_ECOLI-K1PYK7_CRAGI-sp" );
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
+                return false;
+            }
+            n.setName( "K1PYKX_CRAGI" );
+            if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+                return false;
+            }
+            n.setName( "XXXXX_CRAGI" );
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "XXXXX_CRAGI" ) ) {
+                return false;
+            }
+            n.setName( "tr|H3IB65|H3IB65_STRPU~2-2" ); // when both forms are present the prefixed accession is expected
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "H3IB65" ) ) {
+                return false;
+            }
+            n.setName( "jgi|Lacbi2|181470|Lacbi1.estExt_GeneWisePlus_human.C_10729~2-3" );
+            if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+                return false;
+            }
+            n.setName( "sp|Q86U06|RBM23_HUMAN~2-2" );
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "Q86U06" ) ) {
+                return false;
+            }
+            n = new PhylogenyNode(); // same checks with the id stored in the sequence symbol instead of the node name
+            org.forester.phylogeny.data.Sequence seq = new org.forester.phylogeny.data.Sequence();
+            seq.setSymbol( "K1PYK7_CRAGI" );
+            n.getNodeData().addSequence( seq );
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
+                return false;
+            }
+            seq.setSymbol( "tr|B3RJ64" );
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+                return false;
+            }
+            n = new PhylogenyNode(); // id stored in the sequence name
+            seq = new org.forester.phylogeny.data.Sequence();
+            seq.setName( "K1PYK7_CRAGI" );
+            n.getNodeData().addSequence( seq );
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
+                return false;
+            }
+            seq.setName( "tr|B3RJ64" );
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+                return false;
+            }
+            n = new PhylogenyNode(); // id stored in the sequence accession
+            seq = new org.forester.phylogeny.data.Sequence();
+            seq.setAccession( new Accession( "K1PYK8_CRAGI", "?" ) );
+            n.getNodeData().addSequence( seq );
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK8_CRAGI" ) ) {
+                return false;
+            }
+            n = new PhylogenyNode();
+            seq = new org.forester.phylogeny.data.Sequence();
+            seq.setAccession( new Accession( "tr|B3RJ64", "?" ) );
+            n.getNodeData().addSequence( seq );
+            if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+                return false;
+            }
+            // GenBank accessor extraction from the node name
+            n = new PhylogenyNode();
+            n.setName( "ACP19736" );
+            if ( !ForesterUtil.extractGenbankAccessor( n ).equals( "ACP19736" ) ) {
+                return false;
+            }
+            n = new PhylogenyNode();
+            n.setName( "_ACP19736_" );
+            if ( !ForesterUtil.extractGenbankAccessor( n ).equals( "ACP19736" ) ) {
+                return false;
+            }
+        }
+        catch ( final Exception e ) { // any exception (including a null result where a match was expected) counts as a failure
+            e.printStackTrace( System.out );
+            return false;
+        }
+        return true;
+    }
+
+    private static boolean testCreateUriForSeqWeb() { // Checks the URIs built from node names alone; conf and tp are passed as null in every call.
+        try {
+            final PhylogenyNode n = new PhylogenyNode();
+            n.setName( "tr|B3RJ64" ); // prefixed accession -> UNIPROT_KB base URL
+            if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.UNIPROT_KB + "B3RJ64" ) ) {
+                System.out.println( AptxUtil.createUriForSeqWeb( n, null, null ) );
+                System.exit( -1 ); // NOTE(review): terminates the JVM on a mismatch, so the return below is not reached once exit succeeds
+                return false;
+            }
+            n.setName( "B0LM41_HUMAN" ); // NAME_CODE style id -> UNIPROT_KB base URL
+            if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.UNIPROT_KB + "B0LM41_HUMAN" ) ) {
+                System.out.println( AptxUtil.createUriForSeqWeb( n, null, null ) );
+                System.exit( -1 );
+                return false;
+            }
+            n.setName( "NP_001025424" ); // expected to resolve to the NCBI_PROTEIN base URL
+            if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_PROTEIN + "NP_001025424" ) ) {
+                System.out.println( AptxUtil.createUriForSeqWeb( n, null, null ) );
+                System.exit( -1 );
+                return false;
+            }
+            n.setName( "_NM_001030253-" ); // expected to resolve to the NCBI_NUCCORE base URL, without the surrounding punctuation
+            if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_NUCCORE + "NM_001030253" ) ) {
+                System.out.println( AptxUtil.createUriForSeqWeb( n, null, null ) );
+                System.exit( -1 );
+                return false;
+            }
+            n.setName( "NP_001025424" ); // NOTE(review): repeats the NP_001025424 check made two cases earlier
+            if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_PROTEIN + "NP_001025424" ) ) {
+                System.out.println( AptxUtil.createUriForSeqWeb( n, null, null ) );
+                System.exit( -1 );
+                return false;
+            }
+        }
+        catch ( final Exception e ) { // any exception counts as a failure
+            e.printStackTrace( System.out );
+            return false;
+        }
+        return true;
+    }
+
     private static boolean testExtractTaxonomyCodeFromNodeName() {
         try {
             if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "MOUSE", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED )
index 03c2e69..828d3df 100644 (file)
@@ -59,6 +59,7 @@ import java.util.SortedMap;
 import java.util.SortedSet;
 import java.util.TreeMap;
 import java.util.TreeSet;
+import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import org.forester.phylogeny.PhylogenyNode;
@@ -82,6 +83,13 @@ public final class ForesterUtil {
     public static final NumberFormat FORMATTER_6;
     public static final NumberFormat FORMATTER_06;
     public static final NumberFormat FORMATTER_3;
+    public static final String       NCBI_PROTEIN                     = "http://www.ncbi.nlm.nih.gov/protein/"; // base URL; callers append a URL-encoded identifier
+    public static final String       NCBI_NUCCORE                     = "http://www.ncbi.nlm.nih.gov/nuccore/"; // base URL; callers append a URL-encoded identifier
+    public final static String       UNIPROT_KB                       = "http://www.uniprot.org/uniprot/"; // base URL; callers append a URL-encoded identifier
+    public final static Pattern      UNIPROT_KB_PATTERN_1             = Pattern
+                                                                              .compile( "(?:\\b|_)(?:sp|tr)[\\.|\\-_=/\\\\]([A-Z][0-9][A-Z0-9]{3}[0-9])(?:\\b|_)" ); // "sp" or "tr", one separator (dot, pipe, hyphen, underscore, equals, slash or backslash), then group 1: letter, digit, three of A-Z/0-9, digit
+    public final static Pattern      UNIPROT_KB_PATTERN_2             = Pattern
+                                                                              .compile( "\\b(?:[A-Z0-9]{2,5}|(?:[A-Z][0-9][A-Z0-9]{3}[0-9]))_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA)\\b" ); // 2-5 characters of A-Z/0-9 or the 6-character form above, an underscore, then a 5-character code or one of RAT, PIG, PEA
     static {
         final DecimalFormatSymbols dfs = new DecimalFormatSymbols();
         dfs.setDecimalSeparator( '.' );
@@ -95,6 +103,106 @@ public final class ForesterUtil {
     private ForesterUtil() {
     }
 
+    public static String extractRefSeqAccessorAccessor( final PhylogenyNode node ) { // Searches the sequence symbol, name and accession value, then the node name, for a RefSeq accessor.
+        String v = null;
+        if ( node.getNodeData().isHasSequence() ) {
+            final Sequence seq = node.getNodeData().getSequence();
+            if ( !isEmpty( seq.getSymbol() ) ) { // sequence symbol is tried first
+                v = SequenceIdParser.parseRefSeqAccessor( seq.getSymbol() );
+            }
+            if ( isEmpty( v ) && !isEmpty( seq.getName() ) ) { // each later field is consulted only while v is still empty
+                v = SequenceIdParser.parseRefSeqAccessor( seq.getName() );
+            }
+            if ( isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null )
+                    && !isEmpty( seq.getAccession().getValue() ) ) {
+                v = SequenceIdParser.parseRefSeqAccessor( seq.getAccession().getValue() );
+            }
+        }
+        if ( isEmpty( v ) && !isEmpty( node.getName() ) ) { // node name is the last resort
+            v = SequenceIdParser.parseRefSeqAccessor( node.getName() );
+        }
+        return v; // result of the last parse attempt, or the initial null if no field was tried
+    }
+
+    public static String extractGenbankAccessor( final PhylogenyNode node ) { // Searches the sequence symbol, name and accession value, then the node name, for a GenBank accessor.
+        String v = null;
+        if ( node.getNodeData().isHasSequence() ) {
+            final Sequence seq = node.getNodeData().getSequence();
+            if ( !isEmpty( seq.getSymbol() ) ) { // sequence symbol is tried first
+                v = SequenceIdParser.parseGenbankAccessor( seq.getSymbol() );
+            }
+            if ( isEmpty( v ) && !isEmpty( seq.getName() ) ) { // each later field is consulted only while v is still empty
+                v = SequenceIdParser.parseGenbankAccessor( seq.getName() );
+            }
+            if ( isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null )
+                    && !isEmpty( seq.getAccession().getValue() ) ) {
+                v = SequenceIdParser.parseGenbankAccessor( seq.getAccession().getValue() );
+            }
+        }
+        if ( isEmpty( v ) && !isEmpty( node.getName() ) ) { // node name is the last resort
+            v = SequenceIdParser.parseGenbankAccessor( node.getName() );
+        }
+        return v; // result of the last parse attempt, or the initial null if no field was tried
+    }
+
+    public static String extractUniProtKbProteinSeqIdentifier( final PhylogenyNode node ) { // Searches the sequence symbol, name and accession value, then the node name, for a UniProtKB identifier.
+        String upkb = null;
+        if ( node.getNodeData().isHasSequence() ) {
+            final Sequence seq = node.getNodeData().getSequence();
+            Matcher m;
+            if ( !isEmpty( seq.getSymbol() ) ) { // sequence symbol is tried first
+                m = UNIPROT_KB_PATTERN_1.matcher( seq.getSymbol() );
+                if ( m.find() ) {
+                    upkb = m.group( 1 ); // pattern 1: keep only the captured accession, dropping the sp/tr prefix and separator
+                }
+                else {
+                    m = UNIPROT_KB_PATTERN_2.matcher( seq.getSymbol() );
+                    if ( m.find() ) {
+                        upkb = m.group(); // pattern 2: keep the whole match
+                    }
+                }
+            }
+            if ( isEmpty( upkb ) && !isEmpty( seq.getName() ) ) { // each later field is consulted only while upkb is still empty
+                m = UNIPROT_KB_PATTERN_1.matcher( seq.getName() );
+                if ( m.find() ) {
+                    upkb = m.group( 1 );
+                }
+                else {
+                    m = UNIPROT_KB_PATTERN_2.matcher( seq.getName() );
+                    if ( m.find() ) {
+                        upkb = m.group();
+                    }
+                }
+            }
+            if ( isEmpty( upkb ) && ( node.getNodeData().getSequence().getAccession() != null )
+                    && !isEmpty( seq.getAccession().getValue() ) ) {
+                m = UNIPROT_KB_PATTERN_1.matcher( seq.getAccession().getValue() );
+                if ( m.find() ) {
+                    upkb = m.group( 1 );
+                }
+                else {
+                    m = UNIPROT_KB_PATTERN_2.matcher( seq.getAccession().getValue() );
+                    if ( m.find() ) {
+                        upkb = m.group();
+                    }
+                }
+            }
+        }
+        if ( isEmpty( upkb ) && !isEmpty( node.getName() ) ) { // node name is the last resort
+            final Matcher m1 = UNIPROT_KB_PATTERN_1.matcher( node.getName() );
+            if ( m1.find() ) {
+                upkb = m1.group( 1 );
+            }
+            else {
+                final Matcher m2 = UNIPROT_KB_PATTERN_2.matcher( node.getName() );
+                if ( m2.find() ) {
+                    upkb = m2.group();
+                }
+            }
+        }
+        return upkb; // null when neither pattern matched any field
+    }
+
     final public static void appendSeparatorIfNotEmpty( final StringBuffer sb, final char separator ) {
         if ( sb.length() > 0 ) {
             sb.append( separator );
index c96d0f4..6d2dd37 100644 (file)
@@ -125,7 +125,7 @@ public final class SequenceIdParser {
      * Returns null if no match.\r
      * \r
      */\r
-    private final static String parseRefSeqAccessor( final String query ) {\r
+    public final static String parseRefSeqAccessor( final String query ) {\r
         final Matcher m = REFSEQ_PATTERN.matcher( query );\r
         if ( m.lookingAt() ) {\r
             return m.group( 1 );\r