inprogress
author cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Fri, 8 Mar 2013 00:04:11 +0000 (00:04 +0000)
committer cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Fri, 8 Mar 2013 00:04:11 +0000 (00:04 +0000)
forester/java/src/org/forester/archaeopteryx/AptxUtil.java
forester/java/src/org/forester/archaeopteryx/TreePanel.java
forester/java/src/org/forester/test/Test.java
forester/java/src/org/forester/util/ForesterUtil.java
forester/java/src/org/forester/util/SequenceIdParser.java
forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java

index 8ccd028..bc7a693 100644 (file)
@@ -150,6 +150,18 @@ public final class AptxUtil {
                 }
             }
         }
+        if ( ForesterUtil.isEmpty( uri_str ) ) {
+            final String v = ForesterUtil.extractGInumber( node );
+            if ( !ForesterUtil.isEmpty( v ) ) {
+                try {
+                    uri_str = ForesterUtil.NCBI_GI + URLEncoder.encode( v, ForesterConstants.UTF8 );
+                }
+                catch ( final UnsupportedEncodingException e ) {
+                    showErrorMessage( tp, e.toString() );
+                    e.printStackTrace();
+                }
+            }
+        }
         return uri_str;
     }
 
index b2ec09f..533093c 100644 (file)
@@ -3243,6 +3243,9 @@ public final class TreePanel extends JPanel implements ActionListener, MouseWhee
         if ( ForesterUtil.isEmpty( v ) ) {
             v = ForesterUtil.extractRefSeqAccessorAccessor( node );
         }
+        if ( ForesterUtil.isEmpty( v ) ) {
+            v = ForesterUtil.extractGInumber( node );
+        }
         return v;
     }
 
index 4a3e95b..93a623e 100644 (file)
@@ -1041,12 +1041,27 @@ public final class Test {
             if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_NUCCORE + "XM_002122186" ) ) {
                 return false;
             }
-            n.setName( "AAA34956" );
+            n.setName( "dgh_AAA34956_gdg" );
             if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_PROTEIN + "AAA34956" ) ) {
                 return false;
             }
-            n.setName( "Q06891.1" );
-            if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_PROTEIN + "Q06891" ) ) {
+            n.setName( "j40f4_Q06891.1_fndn2 fnr3" );
+            if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_PROTEIN + "Q06891.1" ) ) {
+                return false;
+            }
+            n.setName( "GI:394892" );
+            if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_GI + "394892" ) ) {
+                System.out.println( AptxUtil.createUriForSeqWeb( n, null, null ) );
+                return false;
+            }
+            n.setName( "gi_394892" );
+            if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_GI + "394892" ) ) {
+                System.out.println( AptxUtil.createUriForSeqWeb( n, null, null ) );
+                return false;
+            }
+            n.setName( "gi6335_gi_394892_56635_Gi_43" );
+            if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_GI + "394892" ) ) {
+                System.out.println( AptxUtil.createUriForSeqWeb( n, null, null ) );
                 return false;
             }
         }
@@ -9376,7 +9391,10 @@ public final class Test {
         if ( !SequenceIdParser.parseGenbankAccessor( "AY423861" ).equals( "AY423861" ) ) {
             return false;
         }
-        if ( !SequenceIdParser.parseGenbankAccessor( ".AY423861." ).equals( "AY423861" ) ) {
+        if ( !SequenceIdParser.parseGenbankAccessor( ".AY423861.2" ).equals( "AY423861.2" ) ) {
+            return false;
+        }
+        if ( !SequenceIdParser.parseGenbankAccessor( "345_.AY423861.24_345" ).equals( "AY423861.24" ) ) {
             return false;
         }
         if ( SequenceIdParser.parseGenbankAccessor( "AAY423861" ) != null ) {
@@ -9427,9 +9445,6 @@ public final class Test {
             if ( !entry.getSequenceName().equals( "Aspartate aminotransferase, mitochondrial" ) ) {
                 return false;
             }
-            if ( !entry.getSequenceSymbol().equals( "GOT2" ) ) {
-                return false;
-            }
             if ( !entry.getTaxonomyIdentifier().equals( "9986" ) ) {
                 return false;
             }
index 828d3df..a8c81b0 100644 (file)
@@ -90,6 +90,7 @@ public final class ForesterUtil {
                                                                               .compile( "(?:\\b|_)(?:sp|tr)[\\.|\\-_=/\\\\]([A-Z][0-9][A-Z0-9]{3}[0-9])(?:\\b|_)" );
     public final static Pattern      UNIPROT_KB_PATTERN_2             = Pattern
                                                                               .compile( "\\b(?:[A-Z0-9]{2,5}|(?:[A-Z][0-9][A-Z0-9]{3}[0-9]))_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA)\\b" );
+    public static final String       NCBI_GI                          = "http://www.ncbi.nlm.nih.gov/protein/gi:";
     static {
         final DecimalFormatSymbols dfs = new DecimalFormatSymbols();
         dfs.setDecimalSeparator( '.' );
@@ -145,6 +146,24 @@ public final class ForesterUtil {
         return v;
     }
 
+    public static String extractGInumber( final PhylogenyNode node ) {
+        String v = null;
+        if ( node.getNodeData().isHasSequence() ) {
+            final Sequence seq = node.getNodeData().getSequence();
+            if ( isEmpty( v ) && !isEmpty( seq.getName() ) ) {
+                v = SequenceIdParser.parseGInumber( seq.getName() );
+            }
+            if ( isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null )
+                    && !isEmpty( seq.getAccession().getValue() ) ) {
+                v = SequenceIdParser.parseGInumber( seq.getAccession().getValue() );
+            }
+        }
+        if ( isEmpty( v ) && !isEmpty( node.getName() ) ) {
+            v = SequenceIdParser.parseGInumber( node.getName() );
+        }
+        return v;
+    }
+
     public static String extractUniProtKbProteinSeqIdentifier( final PhylogenyNode node ) {
         String upkb = null;
         if ( node.getNodeData().isHasSequence() ) {
index 6d2dd37..d828a6a 100644 (file)
@@ -49,11 +49,11 @@ public final class SequenceIdParser {
     //Protein:    3 letters + 5 numerals\r
     //http://www.ncbi.nlm.nih.gov/Sequin/acc.html\r
     private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_1 = Pattern\r
-                                                                         .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d{5})(?:[^a-zA-Z0-9]|\\Z)" );\r
+                                                                         .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
     private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_2 = Pattern\r
-                                                                         .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6})(?:[^a-zA-Z0-9]|\\Z)" );\r
+                                                                         .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
     private final static Pattern GENBANK_PROTEIN_AC_PATTERN      = Pattern\r
-                                                                         .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5})(?:[^a-zA-Z0-9]|\\Z)" );\r
+                                                                         .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
     // RefSeq accession numbers can be distinguished from GenBank accessions \r
     // by their distinct prefix format of 2 characters followed by an\r
     // underscore character ('_'). For example, a RefSeq protein accession is NP_015325. \r
@@ -62,6 +62,8 @@ public final class SequenceIdParser {
     // See: http://web.expasy.org/docs/userman.html#ID_line\r
     private final static Pattern TREMBL_PATTERN                  = Pattern\r
                                                                          .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z][0-9][A-Z0-9]{3}[0-9])(?:[^a-zA-Z0-9]|\\Z)" );\r
+    private final static Pattern GI_PATTERN                      = Pattern\r
+                                                                         .compile( "(?:\\b|_)(?:GI|gi)[|_=:](\\d+)(?:\\b|_)" );\r
 \r
     /**\r
      * Returns null if no match.\r
@@ -148,4 +150,12 @@ public final class SequenceIdParser {
     private SequenceIdParser() {\r
         // Hiding the constructor.\r
     }\r
+\r
+    public static String parseGInumber( final String query ) {\r
+        final Matcher m = GI_PATTERN.matcher( query );\r
+        if ( m.find() ) {\r
+            return m.group( 1 );\r
+        }\r
+        return null;\r
+    }\r
 }\r
index e3b222b..7c656d5 100644 (file)
@@ -230,7 +230,7 @@ public final class SequenceDbWsTools {
                 db_entry = obtainRefSeqEntryFromEmbl( id, lines_to_return );
             }
             else if ( ( db == Db.NCBI ) && ( id != null ) ) {
-                db_entry = obtainEmblEntry( id, lines_to_return );
+                db_entry = obtainEmblEntry( id, lines_to_return ); //TODO ?
             }
             if ( ( db_entry != null ) && !db_entry.isEmpty() ) {
                 final Sequence seq = node.getNodeData().isHasSequence() ? node.getNodeData().getSequence()