From: cmzmasek@gmail.com Date: Sat, 12 Apr 2014 02:31:11 +0000 (+0000) Subject: inprogress X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=07bb1be17a1e880d3971b660d726e6f70f829421;p=jalview.git inprogress --- diff --git a/forester/java/src/org/forester/analysis/TaxonomyDataManager.java b/forester/java/src/org/forester/analysis/TaxonomyDataManager.java index 4e88a61..6301504 100644 --- a/forester/java/src/org/forester/analysis/TaxonomyDataManager.java +++ b/forester/java/src/org/forester/analysis/TaxonomyDataManager.java @@ -352,7 +352,7 @@ public final class TaxonomyDataManager extends RunnableProcess { } if ( ut == null ) { String sn = ""; - final Matcher m = ParserUtils.TAXOMONY_SN_PATTERN_I.matcher( simple_name ); + final Matcher m = ParserUtils.TAXOMONY_SN_PATTERN_GENUS.matcher( simple_name ); if ( m.matches() ) { sn = m.group( 1 ); } diff --git a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java index c10efb7..8adaa71 100644 --- a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java +++ b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java @@ -61,10 +61,14 @@ public final class ParserUtils { final public static Pattern TAXOMONY_CODE_PATTERN_PFR = Pattern.compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_(" + TAX_CODE + ")\\b" ); final public static Pattern TAXOMONY_SN_PATTERN = Pattern - .compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_([A-Z][a-z]+_[a-z]{2,}(?:_[a-z][a-z0-9_]+)?)\\b" ); + .compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_([A-Z][a-z]+_[a-z]{2,30}(?:_[a-z][a-z0-9_]+)?)\\b" ); final public static Pattern TAXOMONY_SN_PATTERN_SN = Pattern - .compile( "\\b([A-Z][a-z]+[_ ][a-z]{2,}(?:[_ ][a-z]+)?)(?:\\b|_)" ); - final public static Pattern TAXOMONY_SN_PATTERN_I = Pattern.compile( "([A-Z][a-z]{2,})" ); + .compile( "\\b([A-Z][a-z]+[_ ][a-z]{2,30}(?:[_ ][a-z]{2,30})?)(?:\\b|_)" ); + final public static Pattern TAXOMONY_SN_PATTERN_STRAIN_1 = Pattern + .compile( "\\b([A-Z][a-z]+[_ ][a-z]{2,30}[_ ](?:str|subsp|var)[a-z]{0,5}\\.?[_ ]\\S{1,60})(?:\\b|_)" ); + final public static Pattern TAXOMONY_SN_PATTERN_STRAIN_2 = Pattern + .compile( "\\b([A-Z][a-z]+[_ ][a-z]{2,30}[_ ]\\((?:str|subsp|var)[a-z]{0,5}\\.?[_ ]\\S{1,60})(?:\\b|_)" ); + final public static Pattern TAXOMONY_SN_PATTERN_GENUS = Pattern.compile( "([A-Z][a-z]{2,})" ); final private static Pattern TAXOMONY_CODE_PATTERN_PFS = Pattern.compile( "(?:\\b|_)[A-Z0-9]{4,}_(" + TAX_CODE + ")/\\d+-\\d+\\b" ); final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_PFR = Pattern @@ -201,6 +205,14 @@ public final class ParserUtils { if ( m.find() ) { return m.group( 1 ).replace( '_', ' ' ); } + final Matcher m_str1 = TAXOMONY_SN_PATTERN_STRAIN_1.matcher( name ); + if ( m_str1.find() ) { + return m_str1.group( 1 ).replace( '_', ' ' ); + } + final Matcher m_str2 = TAXOMONY_SN_PATTERN_STRAIN_2.matcher( name ); + if ( m_str2.find() ) { + return m_str2.group( 1 ).replace( '_', ' ' ); + } final Matcher m_sn = TAXOMONY_SN_PATTERN_SN.matcher( name ); if ( m_sn.find() ) { return m_sn.group( 1 ).replace( '_', ' ' ); diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index c347106..0f565a4 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -312,6 +312,7 @@ public final class Test { System.out.println( "failed." ); failed++; } + System.exit( -1 ); System.out.print( "Uri for Aptx web sequence accession: " ); if ( Test.testCreateUriForSeqWeb() ) { System.out.println( "OK." ); @@ -4198,9 +4199,58 @@ public final class Test { .equals( "Mus musculus musculus" ) ) { return false; } + if ( !ParserUtils.extractScientificNameFromNodeName( "Mus_musculus_musculus_bcl2" ) + .equals( "Mus musculus musculus" ) ) { + return false; + } if ( !ParserUtils.extractScientificNameFromNodeName( "Mus_musculus_123" ).equals( "Mus musculus" ) ) { return false; } + if ( !ParserUtils.extractScientificNameFromNodeName( "Escherichia_coli_strain_K12/DH10B" ) + .equals( "Escherichia coli strain K12/DH10B" ) ) { + return false; + } + if ( !ParserUtils.extractScientificNameFromNodeName( "Escherichia_coli_str_K12/DH10B" ) + .equals( "Escherichia coli str K12/DH10B" ) ) { + return false; + } + if ( !ParserUtils.extractScientificNameFromNodeName( "Escherichia coli str. K12/DH10B" ) + .equals( "Escherichia coli str. K12/DH10B" ) ) { + return false; + } + if ( !ParserUtils.extractScientificNameFromNodeName( "Arabidopsis_lyrata_subsp_lyrata" ) + .equals( "Arabidopsis lyrata subsp lyrata" ) ) { + return false; + } + if ( !ParserUtils.extractScientificNameFromNodeName( "Arabidopsis lyrata subsp. lyrata" ) + .equals( "Arabidopsis lyrata subsp. lyrata" ) ) { + return false; + } + if ( !ParserUtils.extractScientificNameFromNodeName( "Arabidopsis lyrata subsp. lyrata 395" ) + .equals( "Arabidopsis lyrata subsp. lyrata" ) ) { + return false; + } + if ( !ParserUtils.extractScientificNameFromNodeName( "Arabidopsis lyrata subsp. lyrata bcl2" ) + .equals( "Arabidopsis lyrata subsp. lyrata" ) ) { + return false; + } + if ( !ParserUtils.extractScientificNameFromNodeName( "Arabidopsis lyrata subsp lyrata bcl2" ) + .equals( "Arabidopsis lyrata subsp lyrata" ) ) { + return false; + } + if ( !ParserUtils.extractScientificNameFromNodeName( "Arabidopsis lyrata subspecies lyrata bcl2" ) + .equals( "Arabidopsis lyrata subspecies lyrata" ) ) { + return false; + } + if ( !ParserUtils.extractScientificNameFromNodeName( "Verbascum sinuatum var. adenosepalum bcl2" ) + .equals( "Verbascum sinuatum var. adenosepalum" ) ) { + return false; + } + if ( !ParserUtils.extractScientificNameFromNodeName( "Escherichia coli (strain K12) " ) + .equals( "Escherichia coli (strain K12)" ) ) { + System.out.println( ParserUtils.extractScientificNameFromNodeName( "Escherichia coli (strain K12)" ) ); + return false; + } } catch ( final Exception e ) { e.printStackTrace( System.out );