From e9776288e623af2543d3a5e396c60ff1411d01c3 Mon Sep 17 00:00:00 2001 From: cmzmasek Date: Sun, 13 Apr 2014 03:48:24 +0000 Subject: [PATCH] in progress --- .../org/forester/io/parsers/util/ParserUtils.java | 18 ++++++++++---- forester/java/src/org/forester/test/Test.java | 25 ++++++++++++++++++++ 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java index 77af4c8..ef37f98 100644 --- a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java +++ b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java @@ -61,13 +61,16 @@ public final class ParserUtils { final public static Pattern TAXOMONY_CODE_PATTERN_PFR = Pattern.compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_(" + TAX_CODE + ")\\b" ); final public static Pattern TAXOMONY_SN_PATTERN = Pattern - .compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_([A-Z][a-z]+_[a-z]{2,30}(?:_[a-z][a-z0-9_]+)?)\\b" ); + .compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_([A-Z][a-z]{2,30}_[a-z]{3,30}(?:_[a-z][a-z0-9_]+)?)\\b" ); final public static Pattern TAXOMONY_SN_PATTERN_SN = Pattern - .compile( "\\b([A-Z][a-z]+[_ ][a-z]{2,30}(?:[_ ][a-z]{2,30})?)(?:\\b|_)" ); + .compile( "\\b([A-Z][a-z]{2,30}[_ ][a-z]{3,30}(?:[_ ][a-z]{3,30})?)(?:\\b|_)?" ); final public static Pattern TAXOMONY_SN_PATTERN_STRAIN_1 = Pattern - .compile( "\\b([A-Z][a-z]+[_ ][a-z]{2,30}[_ ](?:str|subsp|var)[a-z]{0,5}\\.?[_ ]\\S{1,60})(?:\\b|_)" ); + .compile( "\\b([A-Z][a-z]{2,30}[_ ][a-z]{3,30}[_ ](?:str|subsp|var)[a-z]{0,5}\\.?[_ ]\\S{1,60})(?:\\b|_)" ); final public static Pattern TAXOMONY_SN_PATTERN_STRAIN_2 = Pattern - .compile( "\\b([A-Z][a-z]+[_ ][a-z]{2,30}[_ ]\\((?:str|subsp|var)[a-z]{0,5}\\.?[_ ]\\S{1,60}\\))(?:\\b|_)?" ); + .compile( "\\b([A-Z][a-z]{2,30}[_ ][a-z]{3,30}[_ ]\\((?:str|subsp|var)[a-z]{0,5}\\.?[_ ]\\S{1,60}\\))(?:\\b|_)?" ); + final public static Pattern TAXOMONY_SN_PATTERN_SP = Pattern + .compile( "\\b([A-Z][a-z]{2,30}[_ ]sp\\.)(?:\\b|_)?" ); + final public static Pattern TAXOMONY_SN_PATTERN_GENUS = Pattern.compile( "([A-Z][a-z]{2,30})" ); final private static Pattern TAXOMONY_CODE_PATTERN_PFS = Pattern.compile( "(?:\\b|_)[A-Z0-9]{4,}_(" + TAX_CODE + ")/\\d+-\\d+\\b" ); @@ -214,9 +217,16 @@ public final class ParserUtils { return m_str2.group( 1 ).replace( '_', ' ' ); } final Matcher m_sn = TAXOMONY_SN_PATTERN_SN.matcher( name ); + if ( m_sn.find() ) { return m_sn.group( 1 ).replace( '_', ' ' ); } + + final Matcher m_sp = TAXOMONY_SN_PATTERN_SP.matcher( name ); + + if ( m_sp.find() ) { + return m_sp.group( 1 ).replace( '_', ' ' ); + } return null; } diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index 6bf4e04..c89be53 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -4206,6 +4206,9 @@ public final class Test { if ( !ParserUtils.extractScientificNameFromNodeName( "Mus_musculus_123" ).equals( "Mus musculus" ) ) { return false; } + if ( !ParserUtils.extractScientificNameFromNodeName( "Pilostyles mexicana Mexico Breedlove 27233" ).equals( "Pilostyles mexicana" ) ) { + return false; + } if ( !ParserUtils.extractScientificNameFromNodeName( "Escherichia_coli_strain_K12/DH10B" ) .equals( "Escherichia coli strain K12/DH10B" ) ) { return false; @@ -4262,6 +4265,28 @@ public final class Test { .equals( "Escherichia coli (str. K12)" ) ) { return false; } + if ( !ParserUtils.extractScientificNameFromNodeName( "Macrocera sp." ) + .equals( "Macrocera sp." ) ) { + + return false; + } + if ( !ParserUtils.extractScientificNameFromNodeName( "Macrocera sp. 123" ) + .equals( "Macrocera sp." ) ) { + + return false; + } + if ( !ParserUtils.extractScientificNameFromNodeName( "Macrocera sp. K12" ) + .equals( "Macrocera sp." ) ) { + + + return false; + } + if ( !ParserUtils.extractScientificNameFromNodeName( "something Macrocera sp. K12" ) + .equals( "Macrocera sp." ) ) { + + + return false; + } } catch ( final Exception e ) { e.printStackTrace( System.out ); -- 1.7.10.2