From: cmzmasek@gmail.com Date: Tue, 9 Apr 2013 22:54:51 +0000 (+0000) Subject: taxonomy extraction changed X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=6062dfb954cafb6af22e01af89222888d9d5ba66;p=jalview.git taxonomy extraction changed --- diff --git a/forester/java/src/org/forester/application/rio.java b/forester/java/src/org/forester/application/rio.java index 4f90ca0..1ee5628 100644 --- a/forester/java/src/org/forester/application/rio.java +++ b/forester/java/src/org/forester/application/rio.java @@ -319,13 +319,13 @@ public class rio { final NHXParser nhx = ( NHXParser ) p; nhx.setReplaceUnderscores( false ); nhx.setIgnoreQuotes( true ); - nhx.setTaxonomyExtraction( TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); + nhx.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE ); } else if ( p instanceof NexusPhylogeniesParser ) { final NexusPhylogeniesParser nex = ( NexusPhylogeniesParser ) p; nex.setReplaceUnderscores( false ); nex.setIgnoreQuotes( true ); - nex.setTaxonomyExtraction( TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); + nex.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE ); } else { throw new RuntimeException( "unknown parser type: " + p ); diff --git a/forester/java/src/org/forester/application/support_transfer.java b/forester/java/src/org/forester/application/support_transfer.java index 927da4f..a96b834 100644 --- a/forester/java/src/org/forester/application/support_transfer.java +++ b/forester/java/src/org/forester/application/support_transfer.java @@ -95,7 +95,7 @@ public final class support_transfer { final PhylogenyParser pp_bl = ParserUtils.createParserDependingOnFileType( infile_bl, true ); final PhylogenyParser pp_s = ParserUtils.createParserDependingOnFileType( infile_support_vals, true ); if ( pp_bl instanceof NHXParser ) { - ( ( NHXParser ) pp_bl ).setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); + ( ( NHXParser ) pp_bl ).setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.NO ); } phylogeny_w_bl = factory.create( infile_bl, pp_bl )[ index_of_tree_w_bl ]; phylogeny_w_support_vals = factory.create( infile_support_vals, pp_s )[ 0 ]; diff --git a/forester/java/src/org/forester/archaeopteryx/Configuration.java b/forester/java/src/org/forester/archaeopteryx/Configuration.java index a1c16c2..09ba61f 100644 --- a/forester/java/src/org/forester/archaeopteryx/Configuration.java +++ b/forester/java/src/org/forester/archaeopteryx/Configuration.java @@ -1307,15 +1307,21 @@ public final class Configuration { if ( s.equalsIgnoreCase( "no" ) ) { setTaxonomyExtraction( TAXONOMY_EXTRACTION.NO ); } - else if ( s.equalsIgnoreCase( "yes" ) ) { + else if ( s.equalsIgnoreCase( "pfam_relaxed" ) ) { setTaxonomyExtraction( TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); } - else if ( s.equalsIgnoreCase( "pfam" ) ) { + else if ( s.equalsIgnoreCase( "pfam_strict" ) ) { setTaxonomyExtraction( TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ); } + else if ( s.equalsIgnoreCase( "aggressive" ) ) { + setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE ); + } else { - ForesterUtil.printWarningMessage( Constants.PRG_NAME, - "unknown value for \"taxonomy_extraction_in_nh_parsing\": " + s ); + ForesterUtil + .printWarningMessage( Constants.PRG_NAME, + "unknown value for \"taxonomy_extraction_in_nh_parsing\": " + + s + + " (must be either: no, pfam_relaxed, pfam_strict, or aggressive)" ); } if ( ( getTaxonomyExtraction() != TAXONOMY_EXTRACTION.NO ) && isReplaceUnderscoresInNhParsing() ) { ForesterUtil diff --git a/forester/java/src/org/forester/archaeopteryx/Constants.java b/forester/java/src/org/forester/archaeopteryx/Constants.java index 6beacfa..8df9375 100644 --- a/forester/java/src/org/forester/archaeopteryx/Constants.java +++ b/forester/java/src/org/forester/archaeopteryx/Constants.java @@ -42,8 +42,8 @@ public final class Constants { public final static boolean __SYNTH_LF = false; // TODO remove me public final static boolean ALLOW_DDBJ_BLAST = false; public final static String PRG_NAME = "Archaeopteryx"; - final static String VERSION = "0.9812 A1ST"; - final static String PRG_DATE = "130403"; + final static String VERSION = "0.9812+ A1ST"; + final static String PRG_DATE = "130409"; final static String DEFAULT_CONFIGURATION_FILE_NAME = "_aptx_configuration_file"; final static String[] DEFAULT_FONT_CHOICES = { "Arial", "Helvetica", "Verdana", "Tahoma", "Dialog", "Lucida Sans", "SansSerif", "Sans-serif", "Sans" }; diff --git a/forester/java/src/org/forester/archaeopteryx/MainFrame.java b/forester/java/src/org/forester/archaeopteryx/MainFrame.java index c353213..bc565ac 100644 --- a/forester/java/src/org/forester/archaeopteryx/MainFrame.java +++ b/forester/java/src/org/forester/archaeopteryx/MainFrame.java @@ -201,8 +201,9 @@ public abstract class MainFrame extends JFrame implements ActionListener { // _ parsing JCheckBoxMenuItem _internal_number_are_confidence_for_nh_parsing_cbmi; JRadioButtonMenuItem _extract_taxonomy_no_rbmi; - JRadioButtonMenuItem _extract_taxonomy_yes_rbmi; - JRadioButtonMenuItem _extract_taxonomy_pfam_rbmi; + JRadioButtonMenuItem _extract_taxonomy_agressive_rbmi; + JRadioButtonMenuItem _extract_taxonomy_pfam_strict_rbmi; + JRadioButtonMenuItem _extract_taxonomy_pfam_relaxed_rbmi; JCheckBoxMenuItem _replace_underscores_cbmi; JCheckBoxMenuItem _use_brackets_for_conf_in_nh_export_cbmi; JCheckBoxMenuItem _use_internal_names_for_conf_in_nh_export_cbmi; @@ -1317,14 +1318,17 @@ public abstract class MainFrame extends JFrame implements ActionListener { && _print_black_and_white_cbmi.isSelected() ); options.setInternalNumberAreConfidenceForNhParsing( ( _internal_number_are_confidence_for_nh_parsing_cbmi != null ) && _internal_number_are_confidence_for_nh_parsing_cbmi.isSelected() ); - if ( ( _extract_taxonomy_yes_rbmi != null ) && _extract_taxonomy_yes_rbmi.isSelected() ) { - options.setTaxonomyExtractio( TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); + if ( ( _extract_taxonomy_pfam_strict_rbmi != null ) && _extract_taxonomy_pfam_strict_rbmi.isSelected() ) { + options.setTaxonomyExtraction( TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ); } - else if ( ( _extract_taxonomy_pfam_rbmi != null ) && _extract_taxonomy_pfam_rbmi.isSelected() ) { - options.setTaxonomyExtractio( TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ); + else if ( ( _extract_taxonomy_pfam_relaxed_rbmi != null ) && _extract_taxonomy_pfam_relaxed_rbmi.isSelected() ) { + options.setTaxonomyExtraction( TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); + } + else if ( ( _extract_taxonomy_agressive_rbmi != null ) && _extract_taxonomy_agressive_rbmi.isSelected() ) { + options.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE ); } else if ( ( _extract_taxonomy_no_rbmi != null ) && _extract_taxonomy_no_rbmi.isSelected() ) { - options.setTaxonomyExtractio( TAXONOMY_EXTRACTION.NO ); + options.setTaxonomyExtraction( TAXONOMY_EXTRACTION.NO ); } options.setReplaceUnderscoresInNhParsing( ( _replace_underscores_cbmi != null ) && _replace_underscores_cbmi.isSelected() ); diff --git a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java index 977bcbb..1060a32 100644 --- a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java +++ b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java @@ -541,7 +541,8 @@ public final class MainFrameApplication extends MainFrame { } collapseBelowThreshold(); } - else if ( ( o == _extract_taxonomy_pfam_rbmi ) || ( o == _extract_taxonomy_yes_rbmi ) ) { + else if ( ( o == _extract_taxonomy_pfam_strict_rbmi ) || ( o == _extract_taxonomy_pfam_relaxed_rbmi ) + || ( o == _extract_taxonomy_agressive_rbmi ) ) { if ( _replace_underscores_cbmi != null ) { _replace_underscores_cbmi.setSelected( false ); } @@ -941,17 +942,22 @@ public final class MainFrameApplication extends MainFrame { // _options_jmenu.add( _extract_taxonomy_no_rbmi = new JRadioButtonMenuItem( "No Taxonomy Extraction" ) ); _options_jmenu - .add( _extract_taxonomy_pfam_rbmi = new JRadioButtonMenuItem( "Extract Taxonomy Codes from Pfam-style Node Names" ) ); - _extract_taxonomy_pfam_rbmi - .setToolTipText( "To extract 5-letter taxonomy codes from node names in the form of \"BCL2_MOUSE/134-298\"" ); + .add( _extract_taxonomy_pfam_strict_rbmi = new JRadioButtonMenuItem( "Extract Taxonomy Codes/Ids from Pfam-style Node Names" ) ); _options_jmenu - .add( _extract_taxonomy_yes_rbmi = new JRadioButtonMenuItem( "Extract Taxonomy Codes from Node Names" ) ); - _extract_taxonomy_yes_rbmi - .setToolTipText( "To extract 5-letter taxonomy codes from node names in the form of \"BCL2_MOUSE\" or \"BCL2_MOUSE B-cell lymphoma 2...\"" ); + .add( _extract_taxonomy_pfam_relaxed_rbmi = new JRadioButtonMenuItem( "Extract Taxonomy Codes/Ids from Pfam-style like Node Names" ) ); + _options_jmenu + .add( _extract_taxonomy_agressive_rbmi = new JRadioButtonMenuItem( "Extract Taxonomy Codes/Ids/Scientific Names from Node Names" ) ); + _extract_taxonomy_pfam_strict_rbmi + .setToolTipText( "To extract taxonomy codes/ids from node names in the form of e.g. \"BCL2_MOUSE/123-304\" or \"BCL2_10090/123-304\"" ); + _extract_taxonomy_pfam_relaxed_rbmi + .setToolTipText( "To extract taxonomy codes/ids from node names in the form of e.g. \"bax_MOUSE\" or \"bax_10090\"" ); + _extract_taxonomy_agressive_rbmi + .setToolTipText( "To extract taxonomy codes/ids or scientific names from node names in the form of e.g. \"MOUSE\" or \"10090\" or \"xyz_Nematostella_vectensis\"" ); _radio_group_2 = new ButtonGroup(); _radio_group_2.add( _extract_taxonomy_no_rbmi ); - _radio_group_2.add( _extract_taxonomy_pfam_rbmi ); - _radio_group_2.add( _extract_taxonomy_yes_rbmi ); + _radio_group_2.add( _extract_taxonomy_pfam_strict_rbmi ); + _radio_group_2.add( _extract_taxonomy_pfam_relaxed_rbmi ); + _radio_group_2.add( _extract_taxonomy_agressive_rbmi ); // _options_jmenu.add( customizeMenuItemAsLabel( new JMenuItem( "Newick/Nexus Output:" ), getConfiguration() ) ); _options_jmenu @@ -997,10 +1003,12 @@ public final class MainFrameApplication extends MainFrame { .isInternalNumberAreConfidenceForNhParsing() ); customizeRadioButtonMenuItem( _extract_taxonomy_no_rbmi, getOptions().getTaxonomyExtraction() == TAXONOMY_EXTRACTION.NO ); - customizeRadioButtonMenuItem( _extract_taxonomy_yes_rbmi, - getOptions().getTaxonomyExtraction() == TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); - customizeRadioButtonMenuItem( _extract_taxonomy_pfam_rbmi, + customizeRadioButtonMenuItem( _extract_taxonomy_pfam_strict_rbmi, getOptions().getTaxonomyExtraction() == TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ); + customizeRadioButtonMenuItem( _extract_taxonomy_pfam_relaxed_rbmi, + getOptions().getTaxonomyExtraction() == TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); + customizeRadioButtonMenuItem( _extract_taxonomy_agressive_rbmi, + getOptions().getTaxonomyExtraction() == TAXONOMY_EXTRACTION.AGGRESSIVE ); customizeCheckBoxMenuItem( _replace_underscores_cbmi, getOptions().isReplaceUnderscoresInNhParsing() ); customizeCheckBoxMenuItem( _search_whole_words_only_cbmi, getOptions().isMatchWholeTermsOnly() ); customizeCheckBoxMenuItem( _inverse_search_result_cbmi, getOptions().isInverseSearchResult() ); @@ -1822,8 +1830,8 @@ public final class MainFrameApplication extends MainFrame { final PhylogenyNode n = it.next(); final String name = n.getName().trim(); if ( !ForesterUtil.isEmpty( name ) ) { - final String nt = ParserUtils - .extractTaxonomyDataFromNodeName( n, TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); + final String nt = ParserUtils.extractTaxonomyDataFromNodeName( n, + TAXONOMY_EXTRACTION.AGGRESSIVE ); if ( !ForesterUtil.isEmpty( nt ) ) { if ( counter < 15 ) { sb.append( name + ": " + nt + "\n" ); diff --git a/forester/java/src/org/forester/archaeopteryx/Options.java b/forester/java/src/org/forester/archaeopteryx/Options.java index f8aa376..5b855fb 100644 --- a/forester/java/src/org/forester/archaeopteryx/Options.java +++ b/forester/java/src/org/forester/archaeopteryx/Options.java @@ -414,7 +414,7 @@ final public class Options { _taxonomy_colorize_node_shapes = taxonomy_colorize_node_shapes; } - final void setTaxonomyExtractio( final TAXONOMY_EXTRACTION taxonomy_extraction ) { + final void setTaxonomyExtraction( final TAXONOMY_EXTRACTION taxonomy_extraction ) { _taxonomy_extraction = taxonomy_extraction; } @@ -496,7 +496,7 @@ final public class Options { instance.setNumberOfDigitsAfterCommaForConfidenceValues( configuration .getNumberOfDigitsAfterCommaForConfidenceValues() ); } - instance.setTaxonomyExtractio( configuration.getTaxonomyExtraction() ); + instance.setTaxonomyExtraction( configuration.getTaxonomyExtraction() ); instance.setReplaceUnderscoresInNhParsing( configuration.isReplaceUnderscoresInNhParsing() ); instance.setInternalNumberAreConfidenceForNhParsing( configuration .isInternalNumberAreConfidenceForNhParsing() ); diff --git a/forester/java/src/org/forester/archaeopteryx/UrlTreeReader.java b/forester/java/src/org/forester/archaeopteryx/UrlTreeReader.java index 7d88a31..1fb4a59 100644 --- a/forester/java/src/org/forester/archaeopteryx/UrlTreeReader.java +++ b/forester/java/src/org/forester/archaeopteryx/UrlTreeReader.java @@ -59,6 +59,11 @@ public class UrlTreeReader implements Runnable { _webservice_client_index = webservice_client_index; } + @Override + public void run() { + readPhylogeniesFromWebservice(); + } + synchronized void readPhylogeniesFromWebservice() { final long start_time = new Date().getTime(); URL url = null; @@ -109,8 +114,7 @@ public class UrlTreeReader implements Runnable { break; case NH_EXTRACT_TAXONOMY: parser = new NHXParser(); - ( ( NHXParser ) parser ) - .setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ); + ( ( NHXParser ) parser ).setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.AGGRESSIVE ); ( ( NHXParser ) parser ).setReplaceUnderscores( false ); ( ( NHXParser ) parser ).setGuessRootedness( true ); break; @@ -254,9 +258,4 @@ public class UrlTreeReader implements Runnable { _main_frame.activateSaveAllIfNeeded(); System.gc(); } - - @Override - public void run() { - readPhylogeniesFromWebservice(); - } } diff --git a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java index 92275ad..26e35b3 100644 --- a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java +++ b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java @@ -55,18 +55,22 @@ import org.forester.util.ForesterUtil; public final class ParserUtils { - final public static String TAX_CODE = "(?:[A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP"; - final public static Pattern TAXOMONY_SN_PATTERN = Pattern - .compile( "[A-Z0-9]{2,}_([A-Z][a-z]+_[a-z]{2,}(?:_[a-z][a-z0-9_]+)?)\\b" ); - final public static Pattern TAXOMONY_CODE_PATTERN_R1 = Pattern.compile( "[A-Z0-9]+_(" + TAX_CODE + ")\\b" ); - final public static Pattern TAXOMONY_CODE_PATTERN_R2 = Pattern.compile( "(?:\\b|_)(" + TAX_CODE + ")\\b" ); - final private static Pattern TAXOMONY_CODE_PATTERN_PF = Pattern.compile( "[A-Z0-9]{2,}_(" + TAX_CODE - + ")/\\d+-\\d+" ); - final public static Pattern TAXOMONY_CODE_PATTERN_4 = Pattern.compile( "\\[(" + TAX_CODE + ")\\]" ); - final public static Pattern TAXOMONY_CODE_PATTERN_6 = Pattern.compile( "\\[([A-Z9][A-Z]{2}[A-Z0-9]{3})\\]" ); - final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_1 = Pattern.compile( "\\b\\d{1,7}\\b" ); - final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_2 = Pattern.compile( "(\\d{1,7})[^0-9A-Za-z].*" ); - final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_PF = Pattern.compile( "(\\d{1,7})/\\d+-\\d+" ); + final public static String TAX_CODE = "(?:[A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA|CAP"; + final public static Pattern TAXOMONY_SN_PATTERN = Pattern + .compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_([A-Z][a-z]+_[a-z]{2,}(?:_[a-z][a-z0-9_]+)?)\\b" ); + final private static Pattern TAXOMONY_CODE_PATTERN_PFS = Pattern.compile( "(?:\\b|_)[A-Z0-9]{4,}_(" + + TAX_CODE + ")/\\d+-\\d+\\b" ); + final public static Pattern TAXOMONY_CODE_PATTERN_PFR = Pattern.compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_(" + + TAX_CODE + ")\\b" ); + final public static Pattern TAXOMONY_CODE_PATTERN_A = Pattern.compile( "(?:\\b|_)(" + TAX_CODE + ")\\b" ); + final public static Pattern TAXOMONY_CODE_PATTERN_4 = Pattern.compile( "\\[(" + TAX_CODE + ")\\]" ); + final public static Pattern TAXOMONY_CODE_PATTERN_6 = Pattern + .compile( "\\[([A-Z9][A-Z]{2}[A-Z0-9]{3})\\]" ); + final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_A = Pattern.compile( "(?:\\b|_)(\\d{1,7})\\b" ); + final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_PFS = Pattern + .compile( "(?:\\b|_)[A-Z0-9]{4,}_(\\d{1,7})/\\d+-\\d+\\b" ); + final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_PFR = Pattern + .compile( "(?:\\b|_)[a-zA-Z0-9]{3,}_(\\d{1,7})\\b" ); final public static PhylogenyParser createParserDependingFileContents( final File file, final boolean phyloxml_validate_against_xsd ) @@ -224,36 +228,36 @@ public final class ParserUtils { return reader; } + public final static String extractScientificNameFromNodeName( final String name ) { + final Matcher m = TAXOMONY_SN_PATTERN.matcher( name ); + if ( m.find() ) { + return m.group( 1 ).replace( '_', ' ' ); + } + return null; + } + public final static String extractTaxonomyCodeFromNodeName( final String name, final TAXONOMY_EXTRACTION taxonomy_extraction ) { - if ( taxonomy_extraction == TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ) { - final Matcher m = TAXOMONY_CODE_PATTERN_PF.matcher( name ); - if ( m.find() ) { - return m.group( 1 ); - } + Matcher m = TAXOMONY_CODE_PATTERN_PFS.matcher( name ); + if ( m.find() ) { + return m.group( 1 ); } else if ( ( taxonomy_extraction == TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) || ( taxonomy_extraction == TAXONOMY_EXTRACTION.AGGRESSIVE ) ) { - final Matcher m1 = TAXOMONY_CODE_PATTERN_R1.matcher( name ); - if ( m1.find() ) { - return m1.group( 1 ); + m = TAXOMONY_CODE_PATTERN_PFR.matcher( name ); + if ( m.find() ) { + return m.group( 1 ); } - final Matcher m2 = TAXOMONY_CODE_PATTERN_R2.matcher( name ); - if ( m2.find() ) { - return m2.group( 1 ); + else if ( taxonomy_extraction == TAXONOMY_EXTRACTION.AGGRESSIVE ) { + m = TAXOMONY_CODE_PATTERN_A.matcher( name ); + if ( m.find() ) { + return m.group( 1 ); + } } } return null; } - public final static String extractScientificNameFromNodeName( final String name ) { - final Matcher m1 = TAXOMONY_SN_PATTERN.matcher( name ); - if ( m1.find() ) { - return m1.group( 1 ).replace( '_', ' ' ); - } - return null; - } - public final static String extractTaxonomyDataFromNodeName( final PhylogenyNode node, final NHXParser.TAXONOMY_EXTRACTION taxonomy_extraction ) throws PhyloXmlDataFormatException { @@ -277,7 +281,7 @@ public final class ParserUtils { node.getNodeData().getTaxonomy().setTaxonomyCode( code ); return code; } - else if ( ( taxonomy_extraction == TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED || taxonomy_extraction == TAXONOMY_EXTRACTION.AGGRESSIVE ) ) { + else if ( taxonomy_extraction == TAXONOMY_EXTRACTION.AGGRESSIVE ) { final String sn = extractScientificNameFromNodeName( node.getName() ); if ( !ForesterUtil.isEmpty( sn ) ) { if ( !node.getNodeData().isHasTaxonomy() ) { @@ -293,36 +297,21 @@ public final class ParserUtils { public final static String extractUniprotTaxonomyIdFromNodeName( final String name, final TAXONOMY_EXTRACTION taxonomy_extraction ) { - if ( ( name.indexOf( "_" ) > 0 ) - && ( ( ( taxonomy_extraction == TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) || ( taxonomy_extraction == TAXONOMY_EXTRACTION.AGGRESSIVE ) ) || ( ( ( name - .indexOf( "/" ) > 4 ) && ( taxonomy_extraction == TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ) ) ) ) ) { - final String[] s = name.split( "[_\\s]" ); - if ( s.length > 1 ) { - final String str = s[ 1 ]; - if ( !ForesterUtil.isEmpty( str ) ) { - if ( taxonomy_extraction == TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ) { - final Matcher m = TAXOMONY_UNIPROT_ID_PATTERN_PF.matcher( str ); - if ( m.matches() ) { - return m.group( 1 ); - } - } - else { - final Matcher m1 = TAXOMONY_UNIPROT_ID_PATTERN_1.matcher( str ); - if ( m1.matches() ) { - return m1.group(); - } - final Matcher m2 = TAXOMONY_UNIPROT_ID_PATTERN_2.matcher( str ); - if ( m2.matches() ) { - return m2.group( 1 ); - } - } - } - } + Matcher m = TAXOMONY_UNIPROT_ID_PATTERN_PFS.matcher( name ); + if ( m.find() ) { + return m.group( 1 ); } - if ( taxonomy_extraction == TAXONOMY_EXTRACTION.AGGRESSIVE ) { - final Matcher m1 = TAXOMONY_UNIPROT_ID_PATTERN_1.matcher( name ); - if ( m1.matches() ) { - return name; + else if ( ( taxonomy_extraction == TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) + || ( taxonomy_extraction == TAXONOMY_EXTRACTION.AGGRESSIVE ) ) { + m = TAXOMONY_UNIPROT_ID_PATTERN_PFR.matcher( name ); + if ( m.find() ) { + return m.group( 1 ); + } + else if ( taxonomy_extraction == TAXONOMY_EXTRACTION.AGGRESSIVE ) { + m = TAXOMONY_UNIPROT_ID_PATTERN_A.matcher( name ); + if ( m.find() ) { + return m.group( 1 ); + } } } return null; diff --git a/forester/java/src/org/forester/rio/TestRIO.java b/forester/java/src/org/forester/rio/TestRIO.java index 99f3d48..2fdfc09 100644 --- a/forester/java/src/org/forester/rio/TestRIO.java +++ b/forester/java/src/org/forester/rio/TestRIO.java @@ -48,7 +48,7 @@ public final class TestRIO { final NHXParser nhx = new NHXParser(); nhx.setReplaceUnderscores( false ); nhx.setIgnoreQuotes( true ); - nhx.setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); + nhx.setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.AGGRESSIVE ); // final String gene_trees_00_str = "(MOUSE,RAT);(MOUSE,RAT);(MOUSE,RAT);(RAT,MOUSE);"; final Phylogeny[] gene_trees_00 = factory.create( gene_trees_00_str, nhx ); @@ -737,7 +737,7 @@ public final class TestRIO { final NHXParser nhx = new NHXParser(); nhx.setReplaceUnderscores( false ); nhx.setIgnoreQuotes( true ); - nhx.setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); + nhx.setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.AGGRESSIVE ); final String gene_trees_1_str = "(((((MOUSE,RAT),HUMAN),CAEEL),YEAST),ARATH);" + "((((MOUSE,RAT),HUMAN),(ARATH,YEAST)),CAEEL);" + "((MOUSE,RAT),(((ARATH,YEAST),CAEEL),HUMAN));" + "(((((MOUSE,HUMAN),RAT),CAEEL),YEAST),ARATH);" + "((((HUMAN,MOUSE),RAT),(ARATH,YEAST)),CAEEL);"; diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index cbd304d..598ad3a 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -237,7 +237,6 @@ public final class Test { System.out.println( "failed." ); failed++; } - System.exit( 0 ); System.out.print( "UniProtKB id extraction: " ); if ( Test.testExtractUniProtKbProteinSeqIdentifier() ) { System.out.println( "OK." ); @@ -1105,10 +1104,10 @@ public final class Test { .equals( "Mus musculus musculus" ) ) { return false; } - if ( !ParserUtils.extractScientificNameFromNodeName( " -XS_Mus_musculus-12" ).equals( "Mus musculus" ) ) { + if ( !ParserUtils.extractScientificNameFromNodeName( " -XS12_Mus_musculus-12" ).equals( "Mus musculus" ) ) { return false; } - if ( !ParserUtils.extractScientificNameFromNodeName( " -XS_Mus_musculus-12 affrre e" ) + if ( !ParserUtils.extractScientificNameFromNodeName( " -1234_Mus_musculus-12 affrre e" ) .equals( "Mus musculus" ) ) { return false; } @@ -1122,15 +1121,14 @@ public final class Test { private static boolean testExtractTaxonomyCodeFromNodeName() { try { - if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "MOUSE", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) - .equals( "MOUSE" ) ) { + if ( ParserUtils.extractTaxonomyCodeFromNodeName( "MOUSE", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) != null ) { return false; } if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "SOYBN", TAXONOMY_EXTRACTION.AGGRESSIVE ) .equals( "SOYBN" ) ) { return false; } - if ( !ParserUtils.extractTaxonomyCodeFromNodeName( " ARATH ", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) + if ( !ParserUtils.extractTaxonomyCodeFromNodeName( " ARATH ", TAXONOMY_EXTRACTION.AGGRESSIVE ) .equals( "ARATH" ) ) { return false; } @@ -1138,8 +1136,7 @@ public final class Test { .equals( "ARATH" ) ) { return false; } - if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "RAT", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) - .equals( "RAT" ) ) { + if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "RAT", TAXONOMY_EXTRACTION.AGGRESSIVE ).equals( "RAT" ) ) { return false; } if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "RAT", TAXONOMY_EXTRACTION.AGGRESSIVE ).equals( "RAT" ) ) { @@ -1148,71 +1145,50 @@ public final class Test { if ( ParserUtils.extractTaxonomyCodeFromNodeName( "RAT1", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) != null ) { return false; } - if ( !ParserUtils.extractTaxonomyCodeFromNodeName( " _MOUSE", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) - .equals( "MOUSE" ) ) { - return false; - } if ( !ParserUtils.extractTaxonomyCodeFromNodeName( " _SOYBN", TAXONOMY_EXTRACTION.AGGRESSIVE ) .equals( "SOYBN" ) ) { return false; } - if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "SOYBN", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) - .equals( "SOYBN" ) ) { - return false; - } - if ( !ParserUtils.extractTaxonomyCodeFromNodeName( " SOYBN", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) - .equals( "SOYBN" ) ) { - return false; - } - if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "_SOYBN", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) - .equals( "SOYBN" ) ) { - return false; - } - if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "qwerty SOYBN", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) - .equals( "SOYBN" ) ) { - return false; - } - if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "qwerty_SOYBN", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) + if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "SOYBN", TAXONOMY_EXTRACTION.AGGRESSIVE ) .equals( "SOYBN" ) ) { return false; } - if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "SOYBN ", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) + if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "qwerty SOYBN", TAXONOMY_EXTRACTION.AGGRESSIVE ) .equals( "SOYBN" ) ) { return false; } - if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "SOYBN", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) + if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "qwerty_SOYBN", TAXONOMY_EXTRACTION.AGGRESSIVE ) .equals( "SOYBN" ) ) { return false; } - if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "SOYBN qwerty", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) + if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "ABCD_SOYBN ", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) .equals( "SOYBN" ) ) { return false; } - if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "SOYBN qwerty", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) + if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "SOYBN", TAXONOMY_EXTRACTION.AGGRESSIVE ) .equals( "SOYBN" ) ) { return false; } - if ( !ParserUtils.extractTaxonomyCodeFromNodeName( ",SOYBN,", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) + if ( !ParserUtils.extractTaxonomyCodeFromNodeName( ",SOYBN,", TAXONOMY_EXTRACTION.AGGRESSIVE ) .equals( "SOYBN" ) ) { return false; } - if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "xxx,SOYBN,xxx", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) + if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "xxx,SOYBN,xxx", TAXONOMY_EXTRACTION.AGGRESSIVE ) .equals( "SOYBN" ) ) { return false; } - if ( ParserUtils.extractTaxonomyCodeFromNodeName( "xxxSOYBNxxx", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) != null ) { + if ( ParserUtils.extractTaxonomyCodeFromNodeName( "xxxSOYBNxxx", TAXONOMY_EXTRACTION.AGGRESSIVE ) != null ) { return false; } - if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "-SOYBN~", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) + if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "-SOYBN~", TAXONOMY_EXTRACTION.AGGRESSIVE ) .equals( "SOYBN" ) ) { return false; } - if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "N8_ECOLI/1-2:0.01", + if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "NNN8_ECOLI/1-2:0.01", TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ).equals( "ECOLI" ) ) { return false; } - if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "blag_9YX45-blag", - TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) + if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "blag_9YX45-blag", TAXONOMY_EXTRACTION.AGGRESSIVE ) .equals( "9YX45" ) ) { return false; } @@ -1276,19 +1252,7 @@ public final class Test { .equals( "MOUSE" ) ) { return false; } - if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "_MOUSE ", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) - .equals( "MOUSE" ) ) { - return false; - } - if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "_MOUSE^", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) - .equals( "MOUSE" ) ) { - return false; - } - if ( ParserUtils.extractTaxonomyCodeFromNodeName( "_MOUSE*", TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ) != null ) { - return false; - } - if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "x_MOUSE=x", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) - .equals( "MOUSE" ) ) { + if ( ParserUtils.extractTaxonomyCodeFromNodeName( "_MOUSE ", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) != null ) { return false; } } @@ -6408,7 +6372,7 @@ public final class Test { return false; } final PhylogenyNode n9 = PhylogenyNode - .createInstanceFromNhxString( "blag_12345_blag", NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); + .createInstanceFromNhxString( "blag_12345/blag", NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); if ( !n9.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "12345" ) ) { System.out.println( n9.toString() ); return false; @@ -6432,39 +6396,24 @@ public final class Test { return false; } final PhylogenyNode n11 = PhylogenyNode - .createInstanceFromNhxString( "BLAG_Mus_musculus", NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); + .createInstanceFromNhxString( "BLAG_Mus_musculus", NHXParser.TAXONOMY_EXTRACTION.AGGRESSIVE ); if ( !n11.getNodeData().getTaxonomy().getScientificName().equals( "Mus musculus" ) ) { System.out.println( n11.toString() ); return false; } final PhylogenyNode n12 = PhylogenyNode .createInstanceFromNhxString( "BLAG_Mus_musculus_musculus", - NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); + NHXParser.TAXONOMY_EXTRACTION.AGGRESSIVE ); if ( !n12.getNodeData().getTaxonomy().getScientificName().equals( "Mus musculus musculus" ) ) { System.out.println( n12.toString() ); return false; } final PhylogenyNode n13 = PhylogenyNode - .createInstanceFromNhxString( "BLAG_Mus_musculus1", - NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); + .createInstanceFromNhxString( "BLAG_Mus_musculus1", NHXParser.TAXONOMY_EXTRACTION.AGGRESSIVE ); if ( n13.getNodeData().isHasTaxonomy() ) { System.out.println( n13.toString() ); return false; } - final PhylogenyNode n14 = PhylogenyNode - .createInstanceFromNhxString( "BLAG_Mus_musculus_11", - NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); - if ( n14.getNodeData().isHasTaxonomy() ) { - System.out.println( n14.toString() ); - return false; - } - final PhylogenyNode n16 = PhylogenyNode - .createInstanceFromNhxString( "BLAG_Mus_musculus_/11", - NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); - if ( n16.getNodeData().isHasTaxonomy() ) { - System.out.println( n16.toString() ); - return false; - } } catch ( final Exception e ) { e.printStackTrace( System.out ); @@ -6521,16 +6470,18 @@ public final class Test { return false; } final PhylogenyNode n8 = PhylogenyNode - .createInstanceFromNhxString( "N8_ECOLI/1-2:0.01", NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ); - if ( !n8.getName().equals( "N8_ECOLI/1-2" ) ) { + .createInstanceFromNhxString( "ABCD_ECOLI/1-2:0.01", + NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ); + if ( !n8.getName().equals( "ABCD_ECOLI/1-2" ) ) { return false; } if ( !PhylogenyMethods.getSpecies( n8 ).equals( "ECOLI" ) ) { return false; } final PhylogenyNode n9 = PhylogenyNode - .createInstanceFromNhxString( "N9_ECOLI/1-12:0.01", NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ); - if ( !n9.getName().equals( "N9_ECOLI/1-12" ) ) { + .createInstanceFromNhxString( "ABCD_ECOLI/1-12:0.01", + NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ); + if ( !n9.getName().equals( "ABCD_ECOLI/1-12" ) ) { return false; } if ( !PhylogenyMethods.getSpecies( n9 ).equals( "ECOLI" ) ) { @@ -6542,8 +6493,8 @@ public final class Test { return false; } final PhylogenyNode n20 = PhylogenyNode - .createInstanceFromNhxString( "N20_ECOLI/1-2", NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ); - if ( !n20.getName().equals( "N20_ECOLI/1-2" ) ) { + .createInstanceFromNhxString( "ABCD_ECOLI/1-2", NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ); + if ( !n20.getName().equals( "ABCD_ECOLI/1-2" ) ) { return false; } if ( !PhylogenyMethods.getSpecies( n20 ).equals( "ECOLI" ) ) { @@ -6614,29 +6565,29 @@ public final class Test { return false; } final PhylogenyNode a = PhylogenyNode - .createInstanceFromNhxString( "N10_ECOLI/1-2", NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ); - if ( !a.getName().equals( "N10_ECOLI/1-2" ) ) { + .createInstanceFromNhxString( "ABCD_ECOLI/1-2", NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ); + if ( !a.getName().equals( "ABCD_ECOLI/1-2" ) ) { return false; } if ( !PhylogenyMethods.getSpecies( a ).equals( "ECOLI" ) ) { return false; } final PhylogenyNode c1 = PhylogenyNode - .createInstanceFromNhxString( "n10_BOVIN_1/1000-2000", + .createInstanceFromNhxString( "n10_BOVIN/1000-2000", NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); - if ( !c1.getName().equals( "n10_BOVIN_1/1000-2000" ) ) { + if ( !c1.getName().equals( "n10_BOVIN/1000-2000" ) ) { return false; } if ( !PhylogenyMethods.getSpecies( c1 ).equals( "BOVIN" ) ) { return false; } final PhylogenyNode c2 = PhylogenyNode - .createInstanceFromNhxString( "n10_Bovin_1/1000-2000", + .createInstanceFromNhxString( "N10_Bovin_1/1000-2000", NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ); - if ( !c2.getName().equals( "n10_Bovin_1/1000-2000" ) ) { + if ( !c2.getName().equals( "N10_Bovin_1/1000-2000" ) ) { return false; } - if ( !PhylogenyMethods.getSpecies( c2 ).equals( "" ) ) { + if ( PhylogenyMethods.getSpecies( c2 ).length() > 0 ) { return false; } final PhylogenyNode e3 = PhylogenyNode @@ -6660,9 +6611,9 @@ public final class Test { return false; } final PhylogenyNode n12 = PhylogenyNode - .createInstanceFromNhxString( "n111111-ECOLI---/jdj:0.4", + .createInstanceFromNhxString( "N111111-ECOLI---/jdj:0.4", NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ); - if ( !n12.getName().equals( "n111111-ECOLI---/jdj" ) ) { + if ( !n12.getName().equals( "N111111-ECOLI---/jdj" ) ) { return false; } if ( n12.getDistanceToParent() != 0.4 ) { @@ -6672,8 +6623,8 @@ public final class Test { return false; } final PhylogenyNode o = PhylogenyNode - .createInstanceFromNhxString( "n10_MOUSE_", NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); - if ( !o.getName().equals( "n10_MOUSE_" ) ) { + .createInstanceFromNhxString( "ABCD_MOUSE", NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); + if ( !o.getName().equals( "ABCD_MOUSE" ) ) { return false; } if ( !PhylogenyMethods.getSpecies( o ).equals( "MOUSE" ) ) { @@ -6724,8 +6675,8 @@ public final class Test { return false; } final PhylogenyNode n14 = PhylogenyNode - .createInstanceFromNhxString( "BLA_9QX45/1-2", NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ); - if ( !n14.getName().equals( "BLA_9QX45/1-2" ) ) { + .createInstanceFromNhxString( "BLA1_9QX45/1-2", NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ); + if ( !n14.getName().equals( "BLA1_9QX45/1-2" ) ) { return false; } if ( !PhylogenyMethods.getSpecies( n14 ).equals( "9QX45" ) ) { diff --git a/forester/java/src/org/forester/tools/PhylogenyDecorator.java b/forester/java/src/org/forester/tools/PhylogenyDecorator.java index 085b9d3..8da1094 100644 --- a/forester/java/src/org/forester/tools/PhylogenyDecorator.java +++ b/forester/java/src/org/forester/tools/PhylogenyDecorator.java @@ -491,7 +491,7 @@ public final class PhylogenyDecorator { ForesterUtil.ensurePresenceOfTaxonomy( node ); try { if ( tc.length() == 6 ) { - String t = tc.substring( 0, 5 ); + final String t = tc.substring( 0, 5 ); System.out.println( "WARNING: taxonomy code " + tc + " -> " + t ); tc = t; } diff --git a/forester/java/src/org/forester/util/ForesterConstants.java b/forester/java/src/org/forester/util/ForesterConstants.java index 823fd96..63ffac3 100644 --- a/forester/java/src/org/forester/util/ForesterConstants.java +++ b/forester/java/src/org/forester/util/ForesterConstants.java @@ -27,8 +27,8 @@ package org.forester.util; public final class ForesterConstants { - public final static String FORESTER_VERSION = "1.027"; - public final static String FORESTER_DATE = "130401"; + public final static String FORESTER_VERSION = "1.027+"; + public final static String FORESTER_DATE = "130409"; public final static String PHYLO_XML_VERSION = "1.10"; public final static String PHYLO_XML_LOCATION = "http://www.phyloxml.org"; public final static String PHYLO_XML_XSD = "phyloxml.xsd";