reordered
author cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Wed, 16 Apr 2014 18:29:48 +0000 (18:29 +0000)
committer cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Wed, 16 Apr 2014 18:29:48 +0000 (18:29 +0000)
forester/java/src/org/forester/archaeopteryx/Constants.java
forester/java/src/org/forester/io/parsers/util/ParserUtils.java
forester/java/src/org/forester/test/Test.java
forester/java/src/org/forester/util/ForesterConstants.java

index 0ad0eb9..923c109 100644 (file)
@@ -43,7 +43,7 @@ public final class Constants {
     public final static boolean ALLOW_DDBJ_BLAST                                              = false;
     public final static String  PRG_NAME                                                      = "Archaeopteryx";
     final static String         VERSION                                                       = "0.988 SR";
-    final static String         PRG_DATE                                                      = "140415";
+    final static String         PRG_DATE                                                      = "140416";
     final static String         DEFAULT_CONFIGURATION_FILE_NAME                               = "_aptx_configuration_file";
     final static String[]       DEFAULT_FONT_CHOICES                                          = { "Arial", "Helvetica",
             "Verdana", "Tahoma", "Dialog", "Lucida Sans", "SansSerif", "Sans-serif", "Sans"  };
index d056ba3..9168c9d 100644 (file)
@@ -55,13 +55,17 @@ import org.forester.util.ForesterUtil;
 
 public final class ParserUtils {
 
-    final public static String   TAX_CODE                             = "(?:[A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA";
     final private static String  SN_BN                                = "[A-Z][a-z]{2,30}[_ ][a-z]{3,30}";
+    final public static String   TAX_CODE                             = "(?:[A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA";
+    final public static String   TAX_CODE_LO                          = "(?:[A-Z]{5})|RAT|PIG|PEA";
     final public static Pattern  TAXOMONY_CODE_PATTERN_A              = Pattern.compile( "(?:\\b|_)(" + TAX_CODE
-                                                                              + ")\\b" );
+                                                                              + ")(?:\\b|_)" );
+    final public static Pattern  TAXOMONY_CODE_PATTERN_A_LO           = Pattern.compile( "(?:\\b|_)(" + TAX_CODE_LO
+                                                                              + ")(?:\\b|_)" );
     final public static Pattern  TAXOMONY_CODE_PATTERN_BRACKETED      = Pattern.compile( "\\[(" + TAX_CODE + ")\\]" );
     final public static Pattern  TAXOMONY_CODE_PATTERN_PFR            = Pattern.compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_("
                                                                               + TAX_CODE + ")\\b" );
+    final public static Pattern  TAXOMONY_SN_PATTERN_GENUS            = Pattern.compile( "([A-Z][a-z]{2,30})" );
     final public static Pattern  TAXOMONY_SN_PATTERN_SN               = Pattern.compile( "(?:\\b|_)(" + SN_BN
                                                                               + ")(?:(\\s*$)|([_ ][a-z]*[A-Z0-9]))" );
     final public static Pattern  TAXOMONY_SN_PATTERN_SNS              = Pattern.compile( "(?:\\b|_)(" + SN_BN
@@ -69,6 +73,8 @@ public final class ParserUtils {
                                                                               + ")[_ ][a-z]*[A-Z0-9]" );
     final public static Pattern  TAXOMONY_SN_PATTERN_SNS2             = Pattern.compile( "[A-Z0-9][a-z]*[_ ](" + SN_BN
                                                                               + "[_ ][a-z]{3,30}" + ")\\s*$" );
+    final public static Pattern  TAXOMONY_SN_PATTERN_SP               = Pattern
+                                                                              .compile( "(?:\\b|_)([A-Z][a-z]{2,30}[_ ]sp\\.?)(?:\\b|_)?" );
     final public static Pattern  TAXOMONY_SN_PATTERN_STRAIN_1         = Pattern
                                                                               .compile( "(?:\\b|_)("
                                                                                       + SN_BN
@@ -81,9 +87,6 @@ public final class ParserUtils {
                                                                               .compile( "(?:\\b|_)("
                                                                                       + SN_BN
                                                                                       + "[_ ]str[a-z]{0,3}\\.?[_ ]\\S{1,60}[_ ]substr[a-z]{0,3}\\.?[_ ]\\S{1,60})(?:\\b|_)" );
-    final public static Pattern  TAXOMONY_SN_PATTERN_SP               = Pattern
-                                                                              .compile( "(?:\\b|_)([A-Z][a-z]{2,30}[_ ]sp\\.?)(?:\\b|_)?" );
-    final public static Pattern  TAXOMONY_SN_PATTERN_GENUS            = Pattern.compile( "([A-Z][a-z]{2,30})" );
     final private static Pattern TAXOMONY_CODE_PATTERN_PFS            = Pattern.compile( "(?:\\b|_)[A-Z0-9]{4,}_("
                                                                               + TAX_CODE + ")/\\d+-\\d+\\b" );
     final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_PFR      = Pattern
@@ -327,7 +330,23 @@ public final class ParserUtils {
             return id;
         }
         else {
-            final String code = extractTaxonomyCodeFromNodeName( node.getName(), taxonomy_extraction );
+            String code = null;
+            if ( taxonomy_extraction == TAXONOMY_EXTRACTION.AGGRESSIVE ) {
+                code = extractTaxonomyCodeFromNodeNameLettersOnly( node.getName() );
+                if ( ForesterUtil.isEmpty( code ) ) {
+                    final String sn = extractScientificNameFromNodeName( node.getName() );
+                    if ( !ForesterUtil.isEmpty( sn ) ) {
+                        if ( !node.getNodeData().isHasTaxonomy() ) {
+                            node.getNodeData().setTaxonomy( new Taxonomy() );
+                        }
+                        node.getNodeData().getTaxonomy().setScientificName( sn );
+                        return sn;
+                    }
+                }
+            }
+            if ( ForesterUtil.isEmpty( code ) ) {
+                code = extractTaxonomyCodeFromNodeName( node.getName(), taxonomy_extraction );
+            }
             if ( !ForesterUtil.isEmpty( code ) ) {
                 if ( !node.getNodeData().isHasTaxonomy() ) {
                     node.getNodeData().setTaxonomy( new Taxonomy() );
@@ -335,16 +354,6 @@ public final class ParserUtils {
                 node.getNodeData().getTaxonomy().setTaxonomyCode( code );
                 return code;
             }
-            else if ( taxonomy_extraction == TAXONOMY_EXTRACTION.AGGRESSIVE ) {
-                final String sn = extractScientificNameFromNodeName( node.getName() );
-                if ( !ForesterUtil.isEmpty( sn ) ) {
-                    if ( !node.getNodeData().isHasTaxonomy() ) {
-                        node.getNodeData().setTaxonomy( new Taxonomy() );
-                    }
-                    node.getNodeData().getTaxonomy().setScientificName( sn );
-                    return sn;
-                }
-            }
         }
         return null;
     }
@@ -361,12 +370,6 @@ public final class ParserUtils {
             if ( m.find() ) {
                 return m.group( 1 );
             }
-            //else if ( taxonomy_extraction == TAXONOMY_EXTRACTION.AGGRESSIVE ) {
-            //    m = TAXOMONY_UNIPROT_ID_PATTERN_A.matcher( name );
-            //    if ( m.find() ) {
-            //        return m.group( 1 );
-            //    }
-            //}
         }
         return null;
     }
@@ -417,4 +420,12 @@ public final class ParserUtils {
         }
         return parser;
     }
+
+    private final static String extractTaxonomyCodeFromNodeNameLettersOnly( final String name ) { // letters-only taxonomy code lookup in a node name; returns null when nothing matches
+        final Matcher m = TAXOMONY_CODE_PATTERN_A_LO.matcher( name ); // pattern wraps TAX_CODE_LO (five uppercase letters, or RAT/PIG/PEA) between a word boundary or '_' on each side
+        if ( m.find() ) { // find() accepts the first occurrence anywhere in name, not only a full-string match
+            return m.group( 1 ); // group 1 is the code itself, without the surrounding '_' delimiters
+        }
+        return null; // no letters-only code found; the visible caller tests the result with ForesterUtil.isEmpty
+    }
 }
index f57b0be..5ff446f 100644 (file)
@@ -12218,6 +12218,47 @@ public final class Test {
                 System.out.println( n21.toString() );
                 return false;
             }
+            final PhylogenyNode n22 = PhylogenyNode
+                    .createInstanceFromNhxString( "NEMVE_Nematostella_vectensis",
+                                                  NHXParser.TAXONOMY_EXTRACTION.AGGRESSIVE );
+            if ( !n22.getNodeData().getTaxonomy().getTaxonomyCode().equals( "NEMVE" ) ) {
+                System.out.println( n22.toString() );
+                return false;
+            }
+            final PhylogenyNode n23 = PhylogenyNode
+                    .createInstanceFromNhxString( "9EMVE_Nematostella_vectensis",
+                                                  NHXParser.TAXONOMY_EXTRACTION.AGGRESSIVE );
+            if ( !n23.getNodeData().getTaxonomy().getScientificName().equals( "Nematostella vectensis" ) ) {
+                System.out.println( n23.toString() );
+                return false;
+            }
+            final PhylogenyNode n24 = PhylogenyNode
+                    .createInstanceFromNhxString( "9EMVE_Nematostella", NHXParser.TAXONOMY_EXTRACTION.AGGRESSIVE );
+            if ( !n24.getNodeData().getTaxonomy().getTaxonomyCode().equals( "9EMVE" ) ) {
+                System.out.println( n24.toString() );
+                return false;
+            }
+            //
+            final PhylogenyNode n25 = PhylogenyNode
+                    .createInstanceFromNhxString( "Nematostella_vectensis_NEMVE",
+                                                  NHXParser.TAXONOMY_EXTRACTION.AGGRESSIVE );
+            if ( !n25.getNodeData().getTaxonomy().getTaxonomyCode().equals( "NEMVE" ) ) {
+                System.out.println( n25.toString() );
+                return false;
+            }
+            final PhylogenyNode n26 = PhylogenyNode
+                    .createInstanceFromNhxString( "Nematostella_vectensis_9EMVE",
+                                                  NHXParser.TAXONOMY_EXTRACTION.AGGRESSIVE );
+            if ( !n26.getNodeData().getTaxonomy().getScientificName().equals( "Nematostella vectensis" ) ) {
+                System.out.println( n26.toString() );
+                return false;
+            }
+            final PhylogenyNode n27 = PhylogenyNode
+                    .createInstanceFromNhxString( "Nematostella_9EMVE", NHXParser.TAXONOMY_EXTRACTION.AGGRESSIVE );
+            if ( !n27.getNodeData().getTaxonomy().getTaxonomyCode().equals( "9EMVE" ) ) {
+                System.out.println( n27.toString() );
+                return false;
+            }
         }
         catch ( final Exception e ) {
             e.printStackTrace( System.out );
index 3490f88..a98df7a 100644 (file)
@@ -28,7 +28,7 @@ package org.forester.util;
 public final class ForesterConstants {
 
     public final static String  FORESTER_VERSION            = "1.032";
-    public final static String  FORESTER_DATE               = "140415";
+    public final static String  FORESTER_DATE               = "140416";
     public final static String  PHYLO_XML_VERSION           = "1.10";
     public final static String  PHYLO_XML_LOCATION          = "http://www.phyloxml.org";
     public final static String  PHYLO_XML_XSD               = "phyloxml.xsd";