inprogress
[jalview.git] / forester / java / src / org / forester / io / parsers / util / ParserUtils.java
index a465d4a..77af4c8 100644 (file)
@@ -61,12 +61,18 @@ public final class ParserUtils {
     final public static Pattern  TAXOMONY_CODE_PATTERN_PFR       = Pattern.compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_("
                                                                          + TAX_CODE + ")\\b" );
     final public static Pattern  TAXOMONY_SN_PATTERN             = Pattern
-                                                                         .compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_([A-Z][a-z]+_[a-z]{2,}(?:_[a-z][a-z0-9_]+)?)\\b" );
+                                                                         .compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_([A-Z][a-z]+_[a-z]{2,30}(?:_[a-z][a-z0-9_]+)?)\\b" );
+    final public static Pattern  TAXOMONY_SN_PATTERN_SN          = Pattern
+                                                                         .compile( "\\b([A-Z][a-z]+[_ ][a-z]{2,30}(?:[_ ][a-z]{2,30})?)(?:\\b|_)" );
+    final public static Pattern  TAXOMONY_SN_PATTERN_STRAIN_1    = Pattern
+                                                                         .compile( "\\b([A-Z][a-z]+[_ ][a-z]{2,30}[_ ](?:str|subsp|var)[a-z]{0,5}\\.?[_ ]\\S{1,60})(?:\\b|_)" );
+    final public static Pattern  TAXOMONY_SN_PATTERN_STRAIN_2    = Pattern
+                                                                         .compile( "\\b([A-Z][a-z]+[_ ][a-z]{2,30}[_ ]\\((?:str|subsp|var)[a-z]{0,5}\\.?[_ ]\\S{1,60}\\))(?:\\b|_)?" );
+    final public static Pattern  TAXOMONY_SN_PATTERN_GENUS       = Pattern.compile( "([A-Z][a-z]{2,30})" );
     final private static Pattern TAXOMONY_CODE_PATTERN_PFS       = Pattern.compile( "(?:\\b|_)[A-Z0-9]{4,}_("
                                                                          + TAX_CODE + ")/\\d+-\\d+\\b" );
-    final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_A   = Pattern.compile( "(?:\\b|_)(\\d{1,7})\\b" );
     final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_PFR = Pattern
-                                                                         .compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_(\\d{1,7})\\b" );
+                                                                         .compile( "(?:\\b|_)[A-Z0-9]{1,}_(\\d{1,7})\\b" );
     final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_PFS = Pattern
                                                                          .compile( "(?:\\b|_)[A-Z0-9]{4,}_(\\d{1,7})/\\d+-\\d+\\b" );
 
@@ -199,6 +205,18 @@ public final class ParserUtils {
         if ( m.find() ) {
             return m.group( 1 ).replace( '_', ' ' );
         }
+        final Matcher m_str1 = TAXOMONY_SN_PATTERN_STRAIN_1.matcher( name );
+        if ( m_str1.find() ) {
+            return m_str1.group( 1 ).replace( '_', ' ' );
+        }
+        final Matcher m_str2 = TAXOMONY_SN_PATTERN_STRAIN_2.matcher( name );
+        if ( m_str2.find() ) {
+            return m_str2.group( 1 ).replace( '_', ' ' );
+        }
+        final Matcher m_sn = TAXOMONY_SN_PATTERN_SN.matcher( name );
+        if ( m_sn.find() ) {
+            return m_sn.group( 1 ).replace( '_', ' ' );
+        }
         return null;
     }
 
@@ -273,12 +291,12 @@ public final class ParserUtils {
             if ( m.find() ) {
                 return m.group( 1 );
             }
-            else if ( taxonomy_extraction == TAXONOMY_EXTRACTION.AGGRESSIVE ) {
-                m = TAXOMONY_UNIPROT_ID_PATTERN_A.matcher( name );
-                if ( m.find() ) {
-                    return m.group( 1 );
-                }
-            }
+            //else if ( taxonomy_extraction == TAXONOMY_EXTRACTION.AGGRESSIVE ) {
+            //    m = TAXOMONY_UNIPROT_ID_PATTERN_A.matcher( name );
+            //    if ( m.find() ) {
+            //        return m.group( 1 );
+            //    }
+            //}
         }
         return null;
     }