in progress
author cmzmasek <cmzmasek@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Sun, 13 Apr 2014 03:48:24 +0000 (03:48 +0000)
committer cmzmasek <cmzmasek@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Sun, 13 Apr 2014 03:48:24 +0000 (03:48 +0000)
forester/java/src/org/forester/io/parsers/util/ParserUtils.java
forester/java/src/org/forester/test/Test.java

index 77af4c8..ef37f98 100644 (file)
@@ -61,13 +61,16 @@ public final class ParserUtils {
     final public static Pattern  TAXOMONY_CODE_PATTERN_PFR       = Pattern.compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_("
                                                                          + TAX_CODE + ")\\b" );
     final public static Pattern  TAXOMONY_SN_PATTERN             = Pattern
-                                                                         .compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_([A-Z][a-z]+_[a-z]{2,30}(?:_[a-z][a-z0-9_]+)?)\\b" );
+                                                                         .compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_([A-Z][a-z]{2,30}_[a-z]{3,30}(?:_[a-z][a-z0-9_]+)?)\\b" );
     final public static Pattern  TAXOMONY_SN_PATTERN_SN          = Pattern
-                                                                         .compile( "\\b([A-Z][a-z]+[_ ][a-z]{2,30}(?:[_ ][a-z]{2,30})?)(?:\\b|_)" );
+                                                                         .compile( "\\b([A-Z][a-z]{2,30}[_ ][a-z]{3,30}(?:[_ ][a-z]{3,30})?)(?:\\b|_)?" );
     final public static Pattern  TAXOMONY_SN_PATTERN_STRAIN_1    = Pattern
-                                                                         .compile( "\\b([A-Z][a-z]+[_ ][a-z]{2,30}[_ ](?:str|subsp|var)[a-z]{0,5}\\.?[_ ]\\S{1,60})(?:\\b|_)" );
+                                                                         .compile( "\\b([A-Z][a-z]{2,30}[_ ][a-z]{3,30}[_ ](?:str|subsp|var)[a-z]{0,5}\\.?[_ ]\\S{1,60})(?:\\b|_)" );
     final public static Pattern  TAXOMONY_SN_PATTERN_STRAIN_2    = Pattern
-                                                                         .compile( "\\b([A-Z][a-z]+[_ ][a-z]{2,30}[_ ]\\((?:str|subsp|var)[a-z]{0,5}\\.?[_ ]\\S{1,60}\\))(?:\\b|_)?" );
+                                                                         .compile( "\\b([A-Z][a-z]{2,30}[_ ][a-z]{3,30}[_ ]\\((?:str|subsp|var)[a-z]{0,5}\\.?[_ ]\\S{1,60}\\))(?:\\b|_)?" );
+    final public static Pattern  TAXOMONY_SN_PATTERN_SP    = Pattern
+            .compile( "\\b([A-Z][a-z]{2,30}[_ ]sp\\.)(?:\\b|_)?" );
+
     final public static Pattern  TAXOMONY_SN_PATTERN_GENUS       = Pattern.compile( "([A-Z][a-z]{2,30})" );
     final private static Pattern TAXOMONY_CODE_PATTERN_PFS       = Pattern.compile( "(?:\\b|_)[A-Z0-9]{4,}_("
                                                                          + TAX_CODE + ")/\\d+-\\d+\\b" );
@@ -214,9 +217,16 @@ public final class ParserUtils {
             return m_str2.group( 1 ).replace( '_', ' ' );
         }
         final Matcher m_sn = TAXOMONY_SN_PATTERN_SN.matcher( name );
+       
         if ( m_sn.find() ) {
             return m_sn.group( 1 ).replace( '_', ' ' );
         }
+        
+        final Matcher m_sp = TAXOMONY_SN_PATTERN_SP.matcher( name );
+        
+        if ( m_sp.find() ) {
+            return m_sp.group( 1 ).replace( '_', ' ' );
+        }
         return null;
     }
 
index 6bf4e04..c89be53 100644 (file)
@@ -4206,6 +4206,9 @@ public final class Test {
             if ( !ParserUtils.extractScientificNameFromNodeName( "Mus_musculus_123" ).equals( "Mus musculus" ) ) {
                 return false;
             }
+            if ( !ParserUtils.extractScientificNameFromNodeName( "Pilostyles mexicana Mexico Breedlove 27233" ).equals( "Pilostyles mexicana" ) ) {
+                return false;
+            }
             if ( !ParserUtils.extractScientificNameFromNodeName( "Escherichia_coli_strain_K12/DH10B" )
                     .equals( "Escherichia coli strain K12/DH10B" ) ) {
                 return false;
@@ -4262,6 +4265,28 @@ public final class Test {
                     .equals( "Escherichia coli (str. K12)" ) ) {
                 return false;
             }
+            if ( !ParserUtils.extractScientificNameFromNodeName( "Macrocera sp." )
+                    .equals( "Macrocera sp." ) ) {
+                
+                 return false;
+            }
+            if ( !ParserUtils.extractScientificNameFromNodeName( "Macrocera sp. 123" )
+                    .equals( "Macrocera sp." ) ) {
+                
+                return false;
+            }
+            if ( !ParserUtils.extractScientificNameFromNodeName( "Macrocera sp. K12" )
+                    .equals( "Macrocera sp." ) ) {
+                
+                
+                return false;
+            }
+            if ( !ParserUtils.extractScientificNameFromNodeName( "something Macrocera sp. K12" )
+                    .equals( "Macrocera sp." ) ) {
+                
+                
+                return false;
+            }
         }
         catch ( final Exception e ) {
             e.printStackTrace( System.out );