inprogress
author cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Sat, 12 Apr 2014 02:31:11 +0000 (02:31 +0000)
committer cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Sat, 12 Apr 2014 02:31:11 +0000 (02:31 +0000)
forester/java/src/org/forester/analysis/TaxonomyDataManager.java
forester/java/src/org/forester/io/parsers/util/ParserUtils.java
forester/java/src/org/forester/test/Test.java

index 4e88a61..6301504 100644 (file)
@@ -352,7 +352,7 @@ public final class TaxonomyDataManager extends RunnableProcess {
         }
         if ( ut == null ) {
             String sn = "";
-            final Matcher m = ParserUtils.TAXOMONY_SN_PATTERN_I.matcher( simple_name );
+            final Matcher m = ParserUtils.TAXOMONY_SN_PATTERN_GENUS.matcher( simple_name );
             if ( m.matches() ) {
                 sn = m.group( 1 );
             }
index c10efb7..8adaa71 100644 (file)
@@ -61,10 +61,14 @@ public final class ParserUtils {
     final public static Pattern  TAXOMONY_CODE_PATTERN_PFR       = Pattern.compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_("
                                                                          + TAX_CODE + ")\\b" );
     final public static Pattern  TAXOMONY_SN_PATTERN             = Pattern
-                                                                         .compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_([A-Z][a-z]+_[a-z]{2,}(?:_[a-z][a-z0-9_]+)?)\\b" );
+                                                                         .compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_([A-Z][a-z]+_[a-z]{2,30}(?:_[a-z][a-z0-9_]+)?)\\b" );
     final public static Pattern  TAXOMONY_SN_PATTERN_SN          = Pattern
-                                                                         .compile( "\\b([A-Z][a-z]+[_ ][a-z]{2,}(?:[_ ][a-z]+)?)(?:\\b|_)" );
-    final public static Pattern  TAXOMONY_SN_PATTERN_I           = Pattern.compile( "([A-Z][a-z]{2,})" );
+                                                                         .compile( "\\b([A-Z][a-z]+[_ ][a-z]{2,30}(?:[_ ][a-z]{2,30})?)(?:\\b|_)" );
+    final public static Pattern  TAXOMONY_SN_PATTERN_STRAIN_1    = Pattern
+                                                                         .compile( "\\b([A-Z][a-z]+[_ ][a-z]{2,30}[_ ](?:str|subsp|var)[a-z]{0,5}\\.?[_ ]\\S{1,60})(?:\\b|_)" );
+    final public static Pattern  TAXOMONY_SN_PATTERN_STRAIN_2    = Pattern
+                                                                         .compile( "\\b([A-Z][a-z]+[_ ][a-z]{2,30}[_ ]\\((?:str|subsp|var)[a-z]{0,5}\\.?[_ ]\\S{1,60})(?:\\b|_)" );
+    final public static Pattern  TAXOMONY_SN_PATTERN_GENUS       = Pattern.compile( "([A-Z][a-z]{2,})" );
     final private static Pattern TAXOMONY_CODE_PATTERN_PFS       = Pattern.compile( "(?:\\b|_)[A-Z0-9]{4,}_("
                                                                          + TAX_CODE + ")/\\d+-\\d+\\b" );
     final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_PFR = Pattern
@@ -201,6 +205,14 @@ public final class ParserUtils {
         if ( m.find() ) {
             return m.group( 1 ).replace( '_', ' ' );
         }
+        final Matcher m_str1 = TAXOMONY_SN_PATTERN_STRAIN_1.matcher( name );
+        if ( m_str1.find() ) {
+            return m_str1.group( 1 ).replace( '_', ' ' );
+        }
+        final Matcher m_str2 = TAXOMONY_SN_PATTERN_STRAIN_2.matcher( name );
+        if ( m_str2.find() ) {
+            return m_str2.group( 1 ).replace( '_', ' ' );
+        }
         final Matcher m_sn = TAXOMONY_SN_PATTERN_SN.matcher( name );
         if ( m_sn.find() ) {
             return m_sn.group( 1 ).replace( '_', ' ' );
index c347106..0f565a4 100644 (file)
@@ -312,6 +312,7 @@ public final class Test {
             System.out.println( "failed." );
             failed++;
         }
+        System.exit( -1 );
         System.out.print( "Uri for Aptx web sequence accession: " );
         if ( Test.testCreateUriForSeqWeb() ) {
             System.out.println( "OK." );
@@ -4198,9 +4199,58 @@ public final class Test {
                     .equals( "Mus musculus musculus" ) ) {
                 return false;
             }
+            if ( !ParserUtils.extractScientificNameFromNodeName( "Mus_musculus_musculus_bcl2" )
+                    .equals( "Mus musculus musculus" ) ) {
+                return false;
+            }
             if ( !ParserUtils.extractScientificNameFromNodeName( "Mus_musculus_123" ).equals( "Mus musculus" ) ) {
                 return false;
             }
+            if ( !ParserUtils.extractScientificNameFromNodeName( "Escherichia_coli_strain_K12/DH10B" )
+                    .equals( "Escherichia coli strain K12/DH10B" ) ) {
+                return false;
+            }
+            if ( !ParserUtils.extractScientificNameFromNodeName( "Escherichia_coli_str_K12/DH10B" )
+                    .equals( "Escherichia coli str K12/DH10B" ) ) {
+                return false;
+            }
+            if ( !ParserUtils.extractScientificNameFromNodeName( "Escherichia coli str. K12/DH10B" )
+                    .equals( "Escherichia coli str. K12/DH10B" ) ) {
+                return false;
+            }
+            if ( !ParserUtils.extractScientificNameFromNodeName( "Arabidopsis_lyrata_subsp_lyrata" )
+                    .equals( "Arabidopsis lyrata subsp lyrata" ) ) {
+                return false;
+            }
+            if ( !ParserUtils.extractScientificNameFromNodeName( "Arabidopsis lyrata subsp. lyrata" )
+                    .equals( "Arabidopsis lyrata subsp. lyrata" ) ) {
+                return false;
+            }
+            if ( !ParserUtils.extractScientificNameFromNodeName( "Arabidopsis lyrata subsp. lyrata 395" )
+                    .equals( "Arabidopsis lyrata subsp. lyrata" ) ) {
+                return false;
+            }
+            if ( !ParserUtils.extractScientificNameFromNodeName( "Arabidopsis lyrata subsp. lyrata bcl2" )
+                    .equals( "Arabidopsis lyrata subsp. lyrata" ) ) {
+                return false;
+            }
+            if ( !ParserUtils.extractScientificNameFromNodeName( "Arabidopsis lyrata subsp lyrata bcl2" )
+                    .equals( "Arabidopsis lyrata subsp lyrata" ) ) {
+                return false;
+            }
+            if ( !ParserUtils.extractScientificNameFromNodeName( "Arabidopsis lyrata subspecies lyrata bcl2" )
+                    .equals( "Arabidopsis lyrata subspecies lyrata" ) ) {
+                return false;
+            }
+            if ( !ParserUtils.extractScientificNameFromNodeName( "Verbascum sinuatum var. adenosepalum bcl2" )
+                    .equals( "Verbascum sinuatum var. adenosepalum" ) ) {
+                return false;
+            }
+            if ( !ParserUtils.extractScientificNameFromNodeName( "Escherichia coli (strain K12) " )
+                    .equals( "Escherichia coli (strain K12)" ) ) {
+                System.out.println( ParserUtils.extractScientificNameFromNodeName( "Escherichia coli (strain K12)" ) );
+                return false;
+            }
         }
         catch ( final Exception e ) {
             e.printStackTrace( System.out );