X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fio%2Fparsers%2Futil%2FParserUtils.java;h=e86ed49fbaabf2b7b67a83bc7a6195b71a5954e0;hb=10297bd8b8a4b4ab198a17a42fc6ff24ae2ed49b;hp=76f2a183d6800bb6b33283dd26e6a1391283dd26;hpb=de830ea1bec9c9e224a53c92c9e5a886ee9642f5;p=jalview.git diff --git a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java index 76f2a18..e86ed49 100644 --- a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java +++ b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java @@ -55,26 +55,48 @@ import org.forester.util.ForesterUtil; public final class ParserUtils { - final public static String TAX_CODE = "(?:[A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA"; - final public static Pattern TAXOMONY_CODE_PATTERN_A = Pattern.compile( "(?:\\b|_)(" + TAX_CODE + ")\\b" ); - final public static Pattern TAXOMONY_CODE_PATTERN_BRACKETED = Pattern.compile( "\\[(" + TAX_CODE + ")\\]" ); - final public static Pattern TAXOMONY_CODE_PATTERN_PFR = Pattern.compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_(" - + TAX_CODE + ")\\b" ); - final public static Pattern TAXOMONY_SN_PATTERN = Pattern - .compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_([A-Z][a-z]+_[a-z]{2,}(?:_[a-z][a-z0-9_]+)?)\\b" ); - final public static Pattern TAXOMONY_SN_PATTERN_SN = Pattern - .compile( "\\b([A-Z][a-z]+_[a-z]{2,}(?:_[a-z][a-z0-9_]+)?)(?:\\b|_)" ); - final private static Pattern TAXOMONY_CODE_PATTERN_PFS = Pattern.compile( "(?:\\b|_)[A-Z0-9]{4,}_(" - + TAX_CODE + ")/\\d+-\\d+\\b" ); - // final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_A = Pattern.compile( "(?:\\b|(?:[A-Z]_))(\\d{1,7})\\b" ); - final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_PFR = Pattern - .compile( "(?:\\b|_)[A-Z0-9]{1,}_(\\d{1,7})\\b" ); - final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_PFS = Pattern - .compile( "(?:\\b|_)[A-Z0-9]{4,}_(\\d{1,7})/\\d+-\\d+\\b" ); + final private static String SN_BN = "[A-Z][a-z]{2,30}[_ ][a-z]{3,30}"; + final public static String TAX_CODE = "(?:[A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA"; + final public static String TAX_CODE_LO = "(?:[A-Z]{5})|RAT|PIG|PEA"; + final public static Pattern TAXOMONY_CODE_PATTERN_A = Pattern.compile( "(?:\\b|_)(" + TAX_CODE + + ")(?:\\b|_)" ); + final public static Pattern TAXOMONY_CODE_PATTERN_A_LO = Pattern.compile( "_(" + TAX_CODE_LO + + ")(?:\\b|_)" ); + final public static Pattern TAXOMONY_CODE_PATTERN_BRACKETED = Pattern.compile( "\\[(" + TAX_CODE + ")\\]" ); + final public static Pattern TAXOMONY_CODE_PATTERN_PFR = Pattern.compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_(" + + TAX_CODE + ")\\b" ); + final public static Pattern TAXOMONY_SN_PATTERN_GENUS = Pattern.compile( "([A-Z][a-z]{2,30})" ); + final public static Pattern TAXOMONY_SN_PATTERN_SN = Pattern.compile( "(?:\\b|_)(" + SN_BN + + ")(?:(\\s*$)|([_ ][a-z]*[A-Z0-9]))" ); + final public static Pattern TAXOMONY_SN_PATTERN_SNS = Pattern.compile( "(?:\\b|_)(" + SN_BN + + "[_ ][a-z]{3,30}" + + ")[_ ][a-z]*[A-Z0-9]" ); + final public static Pattern TAXOMONY_SN_PATTERN_SNS2 = Pattern.compile( "[A-Z0-9][a-z]*[_ ](" + SN_BN + + "[_ ][a-z]{3,30}" + ")\\s*$" ); + final public static Pattern TAXOMONY_SN_PATTERN_SP = Pattern + .compile( "(?:\\b|_)([A-Z][a-z]{2,30}[_ ]sp\\.?)(?:\\b|_)?" ); + final public static Pattern TAXOMONY_SN_PATTERN_STRAIN_1 = Pattern + .compile( "(?:\\b|_)(" + + SN_BN + + "[_ ](?:str|subsp|ssp|var)[a-z]{0,5}\\.?[_ ]\\S{1,60})(?:\\b|_)" ); + final public static Pattern TAXOMONY_SN_PATTERN_STRAIN_2 = Pattern + .compile( "(?:\\b|_)(" + + SN_BN + + "[_ ]\\((?:str|subsp|ssp|var)[a-z]{0,5}\\.?[_ ]\\S{1,60}\\))" ); + final public static Pattern TAXOMONY_SN_PATTERN_STRAIN_SUBSTRAIN = Pattern + .compile( "(?:\\b|_)(" + + SN_BN + + "[_ ]str[a-z]{0,3}\\.?[_ ]\\S{1,60}[_ ]substr[a-z]{0,3}\\.?[_ ]\\S{1,60})(?:\\b|_)" ); + final private static Pattern TAXOMONY_CODE_PATTERN_PFS = Pattern.compile( "(?:\\b|_)[A-Z0-9]{4,}_(" + + TAX_CODE + ")/\\d+-\\d+\\b" ); + final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_PFR = Pattern + .compile( "(?:\\b|_)[A-Z0-9]{1,}_(\\d{1,7})\\b" ); + final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_PFS = Pattern + .compile( "(?:\\b|_)[A-Z0-9]{4,}_(\\d{1,7})/\\d+-\\d+\\b" ); final public static PhylogenyParser createParserDependingFileContents( final File file, final boolean phyloxml_validate_against_xsd ) - throws FileNotFoundException, IOException { + throws FileNotFoundException, IOException { PhylogenyParser parser = null; final String first_line = ForesterUtil.getFirstLine( file ).trim().toLowerCase(); if ( first_line.startsWith( "<" ) ) { @@ -105,7 +127,7 @@ public final class ParserUtils { final public static PhylogenyParser createParserDependingOnFileType( final File file, final boolean phyloxml_validate_against_xsd ) - throws FileNotFoundException, IOException { + throws FileNotFoundException, IOException { PhylogenyParser parser = null; parser = ParserUtils.createParserDependingOnSuffix( file.getName(), phyloxml_validate_against_xsd ); if ( parser == null ) { @@ -124,7 +146,7 @@ public final class ParserUtils { final public static PhylogenyParser createParserDependingOnUrlContents( final URL url, final boolean phyloxml_validate_against_xsd ) - throws FileNotFoundException, IOException { + throws FileNotFoundException, IOException { final String lc_filename = url.getFile().toString().toLowerCase(); PhylogenyParser parser = createParserDependingOnSuffix( lc_filename, phyloxml_validate_against_xsd ); if ( parser == null ) { @@ -191,20 +213,83 @@ public final class ParserUtils { } else { throw new IllegalArgumentException( "attempt to parse object of type [" + source.getClass() - + "] (can only parse objects of type File/String, InputStream, StringBuffer, or StringBuilder)" ); + + "] (can only parse objects of type File/String, InputStream, StringBuffer, or StringBuilder)" ); } return reader; } public final static String extractScientificNameFromNodeName( final String name ) { - final Matcher m = TAXOMONY_SN_PATTERN.matcher( name ); - if ( m.find() ) { - return m.group( 1 ).replace( '_', ' ' ); + final Matcher m_ss = TAXOMONY_SN_PATTERN_STRAIN_SUBSTRAIN.matcher( name ); + if ( m_ss.find() ) { + String s = m_ss.group( 1 ).replace( '_', ' ' ); + if ( s.indexOf( " str " ) > 4 ) { + s = s.replaceFirst( " str ", " str. " ); + } + if ( s.indexOf( " substr " ) > 4 ) { + s = s.replaceFirst( " substr ", " substr. " ); + } + return s; + } + final Matcher m_str1 = TAXOMONY_SN_PATTERN_STRAIN_1.matcher( name ); + if ( m_str1.find() ) { + String s = m_str1.group( 1 ).replace( '_', ' ' ); + if ( s.indexOf( " str " ) > 4 ) { + s = s.replaceFirst( " str ", " str. " ); + } + else if ( s.indexOf( " subsp " ) > 4 ) { + s = s.replaceFirst( " subsp ", " subsp. " ); + } + else if ( s.indexOf( " ssp " ) > 4 ) { + s = s.replaceFirst( " ssp ", " subsp. " ); + } + else if ( s.indexOf( " ssp. " ) > 4 ) { + s = s.replaceFirst( " ssp. ", " subsp. " ); + } + else if ( s.indexOf( " var " ) > 4 ) { + s = s.replaceFirst( " var ", " var. " ); + } + return s; + } + final Matcher m_str2 = TAXOMONY_SN_PATTERN_STRAIN_2.matcher( name ); + if ( m_str2.find() ) { + String s = m_str2.group( 1 ).replace( '_', ' ' ); + if ( s.indexOf( " (str " ) > 4 ) { + s = s.replaceFirst( " \\(str ", " (str. " ); + } + else if ( s.indexOf( " (subsp " ) > 4 ) { + s = s.replaceFirst( " \\(subsp ", " (subsp. " ); + } + else if ( s.indexOf( " (ssp " ) > 4 ) { + s = s.replaceFirst( " \\(ssp ", " (subsp. " ); + } + else if ( s.indexOf( " (ssp. " ) > 4 ) { + s = s.replaceFirst( " \\(ssp. ", " (subsp. " ); + } + else if ( s.indexOf( " (var " ) > 4 ) { + s = s.replaceFirst( " \\(var ", " (var. " ); + } + return s; + } + final Matcher m_sns = TAXOMONY_SN_PATTERN_SNS.matcher( name ); + if ( m_sns.find() ) { + return m_sns.group( 1 ).replace( '_', ' ' ); + } + final Matcher m_sns2 = TAXOMONY_SN_PATTERN_SNS2.matcher( name ); + if ( m_sns2.find() ) { + return m_sns2.group( 1 ).replace( '_', ' ' ); } final Matcher m_sn = TAXOMONY_SN_PATTERN_SN.matcher( name ); if ( m_sn.find() ) { return m_sn.group( 1 ).replace( '_', ' ' ); } + final Matcher m_sp = TAXOMONY_SN_PATTERN_SP.matcher( name ); + if ( m_sp.find() ) { + String s = m_sp.group( 1 ).replace( '_', ' ' ); + if ( s.endsWith( " sp" ) ) { + s = s + "."; + } + return s; + } return null; } @@ -232,7 +317,7 @@ public final class ParserUtils { public final static String extractTaxonomyDataFromNodeName( final PhylogenyNode node, final NHXParser.TAXONOMY_EXTRACTION taxonomy_extraction ) - throws PhyloXmlDataFormatException { + throws PhyloXmlDataFormatException { if ( taxonomy_extraction == TAXONOMY_EXTRACTION.NO ) { throw new IllegalArgumentException(); } @@ -245,7 +330,23 @@ public final class ParserUtils { return id; } else { - final String code = extractTaxonomyCodeFromNodeName( node.getName(), taxonomy_extraction ); + String code = null; + if ( taxonomy_extraction == TAXONOMY_EXTRACTION.AGGRESSIVE ) { + code = extractTaxonomyCodeFromNodeNameLettersOnly( node.getName() ); + if ( ForesterUtil.isEmpty( code ) ) { + final String sn = extractScientificNameFromNodeName( node.getName() ); + if ( !ForesterUtil.isEmpty( sn ) ) { + if ( !node.getNodeData().isHasTaxonomy() ) { + node.getNodeData().setTaxonomy( new Taxonomy() ); + } + node.getNodeData().getTaxonomy().setScientificName( sn ); + return sn; + } + } + } + if ( ForesterUtil.isEmpty( code ) ) { + code = extractTaxonomyCodeFromNodeName( node.getName(), taxonomy_extraction ); + } if ( !ForesterUtil.isEmpty( code ) ) { if ( !node.getNodeData().isHasTaxonomy() ) { node.getNodeData().setTaxonomy( new Taxonomy() ); @@ -253,16 +354,6 @@ public final class ParserUtils { node.getNodeData().getTaxonomy().setTaxonomyCode( code ); return code; } - else if ( taxonomy_extraction == TAXONOMY_EXTRACTION.AGGRESSIVE ) { - final String sn = extractScientificNameFromNodeName( node.getName() ); - if ( !ForesterUtil.isEmpty( sn ) ) { - if ( !node.getNodeData().isHasTaxonomy() ) { - node.getNodeData().setTaxonomy( new Taxonomy() ); - } - node.getNodeData().getTaxonomy().setScientificName( sn ); - return sn; - } - } } return null; } @@ -279,12 +370,6 @@ public final class ParserUtils { if ( m.find() ) { return m.group( 1 ); } - //else if ( taxonomy_extraction == TAXONOMY_EXTRACTION.AGGRESSIVE ) { - // m = TAXOMONY_UNIPROT_ID_PATTERN_A.matcher( name ); - // if ( m.find() ) { - // return m.group( 1 ); - // } - //} } return null; } @@ -299,7 +384,7 @@ public final class ParserUtils { /** * Return null if it can not guess the parser to use based on name suffix. - * + * * @param filename * @return */ @@ -335,4 +420,12 @@ public final class ParserUtils { } return parser; } + + private final static String extractTaxonomyCodeFromNodeNameLettersOnly( final String name ) { + final Matcher m = TAXOMONY_CODE_PATTERN_A_LO.matcher( name ); + if ( m.find() ) { + return m.group( 1 ); + } + return null; + } }