X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fio%2Fparsers%2Futil%2FParserUtils.java;h=e86ed49fbaabf2b7b67a83bc7a6195b71a5954e0;hb=10297bd8b8a4b4ab198a17a42fc6ff24ae2ed49b;hp=4d74229ddb136e831acce4d641cfeeeec3e86205;hpb=9f15c8c3415681ea197bf9d629ee3f2c126fc5b7;p=jalview.git diff --git a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java index 4d74229..e86ed49 100644 --- a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java +++ b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java @@ -55,29 +55,52 @@ import org.forester.util.ForesterUtil; public final class ParserUtils { - final public static Pattern TAXOMONY_SN_PATTERN = Pattern - .compile( "[^_]{2,}_([A-Z][a-z]+_[a-z]{2,}(_[A-Za-z]\\w+|))\\b" ); - final public static Pattern TAXOMONY_CODE_PATTERN_1 = Pattern - .compile( "\\b[A-Z9][A-Z]{2}[A-Z0-9]{2}|RAT|PIG|PEA|CAP\\b" ); - final private static Pattern TAXOMONY_CODE_PATTERN_2 = Pattern - .compile( "([A-Z9][A-Z]{2}[A-Z0-9]{2}|RAT|PIG|PEA|CAP)[^0-9A-Za-z].*" ); - final private static Pattern TAXOMONY_CODE_PATTERN_3 = Pattern - .compile( "_([A-Z9][A-Z]{2}[A-Z0-9]{2}|RAT|PIG|PEA|CAP)_" ); - final private static Pattern TAXOMONY_CODE_PATTERN_PF = Pattern - .compile( "([A-Z9][A-Z]{2}[A-Z0-9]{2}|RAT|PIG|PEA|CAP)/\\d+-\\d+" ); - final public static Pattern TAXOMONY_CODE_PATTERN_4 = Pattern - .compile( "\\[(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP)\\]" ); - final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_1 = Pattern.compile( "\\b\\d{1,7}\\b" ); - final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_2 = Pattern.compile( "(\\d{1,7})[^0-9A-Za-z].*" ); - final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_PF = Pattern.compile( "(\\d{1,7})/\\d+-\\d+" ); + final private static String SN_BN = "[A-Z][a-z]{2,30}[_ ][a-z]{3,30}"; + final public static String TAX_CODE = "(?:[A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA"; + final public static String TAX_CODE_LO = "(?:[A-Z]{5})|RAT|PIG|PEA"; + final public static Pattern TAXOMONY_CODE_PATTERN_A = Pattern.compile( "(?:\\b|_)(" + TAX_CODE + + ")(?:\\b|_)" ); + final public static Pattern TAXOMONY_CODE_PATTERN_A_LO = Pattern.compile( "_(" + TAX_CODE_LO + + ")(?:\\b|_)" ); + final public static Pattern TAXOMONY_CODE_PATTERN_BRACKETED = Pattern.compile( "\\[(" + TAX_CODE + ")\\]" ); + final public static Pattern TAXOMONY_CODE_PATTERN_PFR = Pattern.compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_(" + + TAX_CODE + ")\\b" ); + final public static Pattern TAXOMONY_SN_PATTERN_GENUS = Pattern.compile( "([A-Z][a-z]{2,30})" ); + final public static Pattern TAXOMONY_SN_PATTERN_SN = Pattern.compile( "(?:\\b|_)(" + SN_BN + + ")(?:(\\s*$)|([_ ][a-z]*[A-Z0-9]))" ); + final public static Pattern TAXOMONY_SN_PATTERN_SNS = Pattern.compile( "(?:\\b|_)(" + SN_BN + + "[_ ][a-z]{3,30}" + + ")[_ ][a-z]*[A-Z0-9]" ); + final public static Pattern TAXOMONY_SN_PATTERN_SNS2 = Pattern.compile( "[A-Z0-9][a-z]*[_ ](" + SN_BN + + "[_ ][a-z]{3,30}" + ")\\s*$" ); + final public static Pattern TAXOMONY_SN_PATTERN_SP = Pattern + .compile( "(?:\\b|_)([A-Z][a-z]{2,30}[_ ]sp\\.?)(?:\\b|_)?" ); + final public static Pattern TAXOMONY_SN_PATTERN_STRAIN_1 = Pattern + .compile( "(?:\\b|_)(" + + SN_BN + + "[_ ](?:str|subsp|ssp|var)[a-z]{0,5}\\.?[_ ]\\S{1,60})(?:\\b|_)" ); + final public static Pattern TAXOMONY_SN_PATTERN_STRAIN_2 = Pattern + .compile( "(?:\\b|_)(" + + SN_BN + + "[_ ]\\((?:str|subsp|ssp|var)[a-z]{0,5}\\.?[_ ]\\S{1,60}\\))" ); + final public static Pattern TAXOMONY_SN_PATTERN_STRAIN_SUBSTRAIN = Pattern + .compile( "(?:\\b|_)(" + + SN_BN + + "[_ ]str[a-z]{0,3}\\.?[_ ]\\S{1,60}[_ ]substr[a-z]{0,3}\\.?[_ ]\\S{1,60})(?:\\b|_)" ); + final private static Pattern TAXOMONY_CODE_PATTERN_PFS = Pattern.compile( "(?:\\b|_)[A-Z0-9]{4,}_(" + + TAX_CODE + ")/\\d+-\\d+\\b" ); + final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_PFR = Pattern + .compile( "(?:\\b|_)[A-Z0-9]{1,}_(\\d{1,7})\\b" ); + final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_PFS = Pattern + .compile( "(?:\\b|_)[A-Z0-9]{4,}_(\\d{1,7})/\\d+-\\d+\\b" ); final public static PhylogenyParser createParserDependingFileContents( final File file, final boolean phyloxml_validate_against_xsd ) - throws FileNotFoundException, IOException { + throws FileNotFoundException, IOException { PhylogenyParser parser = null; final String first_line = ForesterUtil.getFirstLine( file ).trim().toLowerCase(); if ( first_line.startsWith( "<" ) ) { - parser = new PhyloXmlParser(); + parser = PhyloXmlParser.createPhyloXmlParser(); if ( phyloxml_validate_against_xsd ) { final ClassLoader cl = PhyloXmlParser.class.getClassLoader(); final URL xsd_url = cl.getResource( ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE ); @@ -104,72 +127,32 @@ public final class ParserUtils { final public static PhylogenyParser createParserDependingOnFileType( final File file, final boolean phyloxml_validate_against_xsd ) - throws FileNotFoundException, IOException { + throws FileNotFoundException, IOException { PhylogenyParser parser = null; parser = ParserUtils.createParserDependingOnSuffix( file.getName(), phyloxml_validate_against_xsd ); if ( parser == null ) { parser = createParserDependingFileContents( file, phyloxml_validate_against_xsd ); } - return parser; - } - - /** - * Return null if it can not guess the parser to use based on name suffix. - * - * @param filename - * @return - */ - final public static PhylogenyParser createParserDependingOnSuffix( final String filename, - final boolean phyloxml_validate_against_xsd ) { - PhylogenyParser parser = null; - final String filename_lc = filename.toLowerCase(); - if ( filename_lc.endsWith( ".tol" ) || filename_lc.endsWith( ".tolxml" ) || filename_lc.endsWith( ".tol.zip" ) ) { - parser = new TolParser(); - } - else if ( filename_lc.endsWith( ".xml" ) || filename_lc.endsWith( ".px" ) || filename_lc.endsWith( "phyloxml" ) - || filename_lc.endsWith( ".zip" ) ) { - parser = new PhyloXmlParser(); - if ( phyloxml_validate_against_xsd ) { - final ClassLoader cl = PhyloXmlParser.class.getClassLoader(); - final URL xsd_url = cl.getResource( ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE ); - if ( xsd_url != null ) { - ( ( PhyloXmlParser ) parser ).setValidateAgainstSchema( xsd_url.toString() ); - } - else { - if ( ForesterConstants.RELEASE ) { - throw new RuntimeException( "failed to get URL for phyloXML XSD from jar file from [" - + ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE + "]" ); - } - } + if ( ( parser != null ) && file.toString().toLowerCase().endsWith( ".zip" ) ) { + if ( parser instanceof PhyloXmlParser ) { + ( ( PhyloXmlParser ) parser ).setZippedInputstream( true ); + } + else if ( parser instanceof TolParser ) { + ( ( TolParser ) parser ).setZippedInputstream( true ); } - } - else if ( filename_lc.endsWith( ".nexus" ) || filename_lc.endsWith( ".nex" ) || filename_lc.endsWith( ".nx" ) ) { - parser = new NexusPhylogeniesParser(); - } - else if ( filename_lc.endsWith( ".nhx" ) || filename_lc.endsWith( ".nh" ) || filename_lc.endsWith( ".newick" ) - || filename_lc.endsWith( ".nwk" ) ) { - parser = new NHXParser(); } return parser; } final public static PhylogenyParser createParserDependingOnUrlContents( final URL url, final boolean phyloxml_validate_against_xsd ) - throws FileNotFoundException, IOException { + throws FileNotFoundException, IOException { final String lc_filename = url.getFile().toString().toLowerCase(); PhylogenyParser parser = createParserDependingOnSuffix( lc_filename, phyloxml_validate_against_xsd ); - if ( ( parser != null ) && lc_filename.endsWith( ".zip" ) ) { - if ( parser instanceof PhyloXmlParser ) { - ( ( PhyloXmlParser ) parser ).setZippedInputstream( true ); - } - else if ( parser instanceof TolParser ) { - ( ( TolParser ) parser ).setZippedInputstream( true ); - } - } if ( parser == null ) { final String first_line = ForesterUtil.getFirstLine( url ).trim().toLowerCase(); if ( first_line.startsWith( "<" ) ) { - parser = new PhyloXmlParser(); + parser = PhyloXmlParser.createPhyloXmlParser(); if ( phyloxml_validate_against_xsd ) { final ClassLoader cl = PhyloXmlParser.class.getClassLoader(); final URL xsd_url = cl.getResource( ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE ); @@ -190,6 +173,14 @@ public final class ParserUtils { parser = new NHXParser(); } } + if ( ( parser != null ) && lc_filename.endsWith( ".zip" ) ) { + if ( parser instanceof PhyloXmlParser ) { + ( ( PhyloXmlParser ) parser ).setZippedInputstream( true ); + } + else if ( parser instanceof TolParser ) { + ( ( TolParser ) parser ).setZippedInputstream( true ); + } + } return parser; } @@ -222,133 +213,162 @@ public final class ParserUtils { } else { throw new IllegalArgumentException( "attempt to parse object of type [" + source.getClass() - + "] (can only parse objects of type File/String, InputStream, StringBuffer, or StringBuilder)" ); + + "] (can only parse objects of type File/String, InputStream, StringBuffer, or StringBuilder)" ); } return reader; } - public final static String extractTaxonomyCodeFromNodeName( final String name, - final TAXONOMY_EXTRACTION taxonomy_extraction ) { - if ( ( name.indexOf( "_" ) > 0 ) - && ( ( taxonomy_extraction != TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ) || ( name.indexOf( "/" ) > 4 ) ) ) { - final String[] s = name.split( "[_\\s]" ); - if ( s.length > 1 ) { - final String str = s[ 1 ]; - if ( !ForesterUtil.isEmpty( str ) ) { - if ( taxonomy_extraction == TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ) { - final Matcher m = TAXOMONY_CODE_PATTERN_PF.matcher( str ); - if ( m.matches() ) { - return m.group( 1 ); - } - } - else { - final Matcher m1 = TAXOMONY_CODE_PATTERN_1.matcher( str ); - if ( m1.matches() ) { - return m1.group(); - } - final Matcher m2 = TAXOMONY_CODE_PATTERN_2.matcher( str ); - if ( m2.matches() ) { - return m2.group( 1 ); - } - } - } + public final static String extractScientificNameFromNodeName( final String name ) { + final Matcher m_ss = TAXOMONY_SN_PATTERN_STRAIN_SUBSTRAIN.matcher( name ); + if ( m_ss.find() ) { + String s = m_ss.group( 1 ).replace( '_', ' ' ); + if ( s.indexOf( " str " ) > 4 ) { + s = s.replaceFirst( " str ", " str. " ); + } + if ( s.indexOf( " substr " ) > 4 ) { + s = s.replaceFirst( " substr ", " substr. " ); + } + return s; + } + final Matcher m_str1 = TAXOMONY_SN_PATTERN_STRAIN_1.matcher( name ); + if ( m_str1.find() ) { + String s = m_str1.group( 1 ).replace( '_', ' ' ); + if ( s.indexOf( " str " ) > 4 ) { + s = s.replaceFirst( " str ", " str. " ); + } + else if ( s.indexOf( " subsp " ) > 4 ) { + s = s.replaceFirst( " subsp ", " subsp. " ); + } + else if ( s.indexOf( " ssp " ) > 4 ) { + s = s.replaceFirst( " ssp ", " subsp. " ); + } + else if ( s.indexOf( " ssp. " ) > 4 ) { + s = s.replaceFirst( " ssp. ", " subsp. " ); + } + else if ( s.indexOf( " var " ) > 4 ) { + s = s.replaceFirst( " var ", " var. " ); } + return s; } - if ( ( taxonomy_extraction == TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) - || ( taxonomy_extraction == TAXONOMY_EXTRACTION.AGRESSIVE ) ) { - final Matcher m1 = TAXOMONY_CODE_PATTERN_1.matcher( name ); - if ( m1.matches() ) { - return name; + final Matcher m_str2 = TAXOMONY_SN_PATTERN_STRAIN_2.matcher( name ); + if ( m_str2.find() ) { + String s = m_str2.group( 1 ).replace( '_', ' ' ); + if ( s.indexOf( " (str " ) > 4 ) { + s = s.replaceFirst( " \\(str ", " (str. " ); } - final Matcher m3 = TAXOMONY_CODE_PATTERN_3.matcher( name ); - if ( m3.matches() ) { - return m3.group( 1 ); + else if ( s.indexOf( " (subsp " ) > 4 ) { + s = s.replaceFirst( " \\(subsp ", " (subsp. " ); } + else if ( s.indexOf( " (ssp " ) > 4 ) { + s = s.replaceFirst( " \\(ssp ", " (subsp. " ); + } + else if ( s.indexOf( " (ssp. " ) > 4 ) { + s = s.replaceFirst( " \\(ssp. ", " (subsp. " ); + } + else if ( s.indexOf( " (var " ) > 4 ) { + s = s.replaceFirst( " \\(var ", " (var. " ); + } + return s; + } + final Matcher m_sns = TAXOMONY_SN_PATTERN_SNS.matcher( name ); + if ( m_sns.find() ) { + return m_sns.group( 1 ).replace( '_', ' ' ); + } + final Matcher m_sns2 = TAXOMONY_SN_PATTERN_SNS2.matcher( name ); + if ( m_sns2.find() ) { + return m_sns2.group( 1 ).replace( '_', ' ' ); + } + final Matcher m_sn = TAXOMONY_SN_PATTERN_SN.matcher( name ); + if ( m_sn.find() ) { + return m_sn.group( 1 ).replace( '_', ' ' ); + } + final Matcher m_sp = TAXOMONY_SN_PATTERN_SP.matcher( name ); + if ( m_sp.find() ) { + String s = m_sp.group( 1 ).replace( '_', ' ' ); + if ( s.endsWith( " sp" ) ) { + s = s + "."; + } + return s; } return null; } - public final static String extractScientificNameFromNodeName( final String name ) { - final Matcher m1 = TAXOMONY_SN_PATTERN.matcher( name ); - if ( m1.matches() ) { - return m1.group( 1 ).replace( '_', ' ' ); + public final static String extractTaxonomyCodeFromNodeName( final String name, + final TAXONOMY_EXTRACTION taxonomy_extraction ) { + Matcher m = TAXOMONY_CODE_PATTERN_PFS.matcher( name ); + if ( m.find() ) { + return m.group( 1 ); + } + else if ( ( taxonomy_extraction == TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) + || ( taxonomy_extraction == TAXONOMY_EXTRACTION.AGGRESSIVE ) ) { + m = TAXOMONY_CODE_PATTERN_PFR.matcher( name ); + if ( m.find() ) { + return m.group( 1 ); + } + else if ( taxonomy_extraction == TAXONOMY_EXTRACTION.AGGRESSIVE ) { + m = TAXOMONY_CODE_PATTERN_A.matcher( name ); + if ( m.find() ) { + return m.group( 1 ); + } + } } return null; } public final static String extractTaxonomyDataFromNodeName( final PhylogenyNode node, final NHXParser.TAXONOMY_EXTRACTION taxonomy_extraction ) - throws PhyloXmlDataFormatException { + throws PhyloXmlDataFormatException { + if ( taxonomy_extraction == TAXONOMY_EXTRACTION.NO ) { + throw new IllegalArgumentException(); + } final String id = extractUniprotTaxonomyIdFromNodeName( node.getName(), taxonomy_extraction ); if ( !ForesterUtil.isEmpty( id ) ) { if ( !node.getNodeData().isHasTaxonomy() ) { node.getNodeData().setTaxonomy( new Taxonomy() ); } - if ( ( node.getNodeData().getTaxonomy().getIdentifier() == null ) - || ForesterUtil.isEmpty( node.getNodeData().getTaxonomy().getIdentifier().getValue() ) ) { - node.getNodeData().getTaxonomy().setIdentifier( new Identifier( id, "uniprot" ) ); - return id; - } + node.getNodeData().getTaxonomy().setIdentifier( new Identifier( id, "uniprot" ) ); + return id; } else { - final String code = extractTaxonomyCodeFromNodeName( node.getName(), taxonomy_extraction ); - if ( !ForesterUtil.isEmpty( code ) ) { - if ( !node.getNodeData().isHasTaxonomy() ) { - node.getNodeData().setTaxonomy( new Taxonomy() ); - } - if ( ForesterUtil.isEmpty( node.getNodeData().getTaxonomy().getTaxonomyCode() ) ) { - node.getNodeData().getTaxonomy().setTaxonomyCode( code ); - return code; - } - } - else if ( ( taxonomy_extraction == TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) - || ( taxonomy_extraction == TAXONOMY_EXTRACTION.AGRESSIVE ) ) { - final String sn = extractScientificNameFromNodeName( node.getName() ); - if ( !ForesterUtil.isEmpty( sn ) ) { - if ( !node.getNodeData().isHasTaxonomy() ) { - node.getNodeData().setTaxonomy( new Taxonomy() ); - } - if ( ForesterUtil.isEmpty( node.getNodeData().getTaxonomy().getScientificName() ) ) { + String code = null; + if ( taxonomy_extraction == TAXONOMY_EXTRACTION.AGGRESSIVE ) { + code = extractTaxonomyCodeFromNodeNameLettersOnly( node.getName() ); + if ( ForesterUtil.isEmpty( code ) ) { + final String sn = extractScientificNameFromNodeName( node.getName() ); + if ( !ForesterUtil.isEmpty( sn ) ) { + if ( !node.getNodeData().isHasTaxonomy() ) { + node.getNodeData().setTaxonomy( new Taxonomy() ); + } node.getNodeData().getTaxonomy().setScientificName( sn ); return sn; } } } + if ( ForesterUtil.isEmpty( code ) ) { + code = extractTaxonomyCodeFromNodeName( node.getName(), taxonomy_extraction ); + } + if ( !ForesterUtil.isEmpty( code ) ) { + if ( !node.getNodeData().isHasTaxonomy() ) { + node.getNodeData().setTaxonomy( new Taxonomy() ); + } + node.getNodeData().getTaxonomy().setTaxonomyCode( code ); + return code; + } } return null; } public final static String extractUniprotTaxonomyIdFromNodeName( final String name, final TAXONOMY_EXTRACTION taxonomy_extraction ) { - if ( ( name.indexOf( "_" ) > 0 ) - && ( ( taxonomy_extraction != TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ) || ( name.indexOf( "/" ) > 4 ) ) ) { - final String[] s = name.split( "[_\\s]" ); - if ( s.length > 1 ) { - final String str = s[ 1 ]; - if ( !ForesterUtil.isEmpty( str ) ) { - if ( taxonomy_extraction == TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ) { - final Matcher m = TAXOMONY_UNIPROT_ID_PATTERN_PF.matcher( str ); - if ( m.matches() ) { - return m.group( 1 ); - } - } - else { - final Matcher m1 = TAXOMONY_UNIPROT_ID_PATTERN_1.matcher( str ); - if ( m1.matches() ) { - return m1.group(); - } - final Matcher m2 = TAXOMONY_UNIPROT_ID_PATTERN_2.matcher( str ); - if ( m2.matches() ) { - return m2.group( 1 ); - } - } - } - } + Matcher m = TAXOMONY_UNIPROT_ID_PATTERN_PFS.matcher( name ); + if ( m.find() ) { + return m.group( 1 ); } - if ( taxonomy_extraction == TAXONOMY_EXTRACTION.AGRESSIVE ) { - final Matcher m1 = TAXOMONY_UNIPROT_ID_PATTERN_1.matcher( name ); - if ( m1.matches() ) { - return name; + else if ( ( taxonomy_extraction == TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) + || ( taxonomy_extraction == TAXONOMY_EXTRACTION.AGGRESSIVE ) ) { + m = TAXOMONY_UNIPROT_ID_PATTERN_PFR.matcher( name ); + if ( m.find() ) { + return m.group( 1 ); } } return null; @@ -361,4 +381,51 @@ public final class ParserUtils { public final static Phylogeny[] readPhylogenies( final String file_name ) throws FileNotFoundException, IOException { return readPhylogenies( new File( file_name ) ); } + + /** + * Return null if it can not guess the parser to use based on name suffix. + * + * @param filename + * @return + */ + final private static PhylogenyParser createParserDependingOnSuffix( final String filename, + final boolean phyloxml_validate_against_xsd ) { + PhylogenyParser parser = null; + final String filename_lc = filename.toLowerCase(); + if ( filename_lc.endsWith( ".tol" ) || filename_lc.endsWith( ".tolxml" ) || filename_lc.endsWith( ".tol.zip" ) ) { + parser = new TolParser(); + } + else if ( filename_lc.endsWith( ".xml" ) || filename_lc.endsWith( "phyloxml" ) || filename_lc.endsWith( ".zip" ) ) { + parser = PhyloXmlParser.createPhyloXmlParser(); + if ( phyloxml_validate_against_xsd ) { + final ClassLoader cl = PhyloXmlParser.class.getClassLoader(); + final URL xsd_url = cl.getResource( ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE ); + if ( xsd_url != null ) { + ( ( PhyloXmlParser ) parser ).setValidateAgainstSchema( xsd_url.toString() ); + } + else { + if ( ForesterConstants.RELEASE ) { + throw new RuntimeException( "failed to get URL for phyloXML XSD from jar file from [" + + ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE + "]" ); + } + } + } + } + else if ( filename_lc.endsWith( ".nexus" ) || filename_lc.endsWith( ".nex" ) || filename_lc.endsWith( ".nx" ) ) { + parser = new NexusPhylogeniesParser(); + } + else if ( filename_lc.endsWith( ".nhx" ) || filename_lc.endsWith( ".nh" ) || filename_lc.endsWith( ".newick" ) + || filename_lc.endsWith( ".nwk" ) ) { + parser = new NHXParser(); + } + return parser; + } + + private final static String extractTaxonomyCodeFromNodeNameLettersOnly( final String name ) { + final Matcher m = TAXOMONY_CODE_PATTERN_A_LO.matcher( name ); + if ( m.find() ) { + return m.group( 1 ); + } + return null; + } }