From: cmzmasek@gmail.com Date: Wed, 5 Dec 2012 03:51:58 +0000 (+0000) Subject: work on taxonomy extraction for applets as well... X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=5a4b2fee20522338ade76d942cc997840b1d8246;p=jalview.git work on taxonomy extraction for applets as well... --- diff --git a/forester/java/src/org/forester/archaeopteryx/AptxUtil.java b/forester/java/src/org/forester/archaeopteryx/AptxUtil.java index f7fbee6..e60bc2c 100644 --- a/forester/java/src/org/forester/archaeopteryx/AptxUtil.java +++ b/forester/java/src/org/forester/archaeopteryx/AptxUtil.java @@ -64,6 +64,9 @@ import javax.swing.text.MaskFormatter; import org.forester.analysis.TaxonomyDataManager; import org.forester.io.parsers.PhylogenyParser; +import org.forester.io.parsers.nexus.NexusPhylogeniesParser; +import org.forester.io.parsers.nhx.NHXParser; +import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION; import org.forester.io.parsers.phyloxml.PhyloXmlUtil; import org.forester.io.parsers.tol.TolParser; import org.forester.io.parsers.util.ParserUtils; @@ -901,17 +904,41 @@ public final class AptxUtil { System.out.println( "[" + applet_name + "] > " + message ); } - final static Phylogeny[] readPhylogeniesFromUrl( final URL url, final boolean phyloxml_validate_against_xsd ) + final static Phylogeny[] readPhylogeniesFromUrl( final URL url, + final boolean phyloxml_validate_against_xsd, + final boolean replace_underscores, + final boolean internal_numbers_are_confidences, + final TAXONOMY_EXTRACTION taxonomy_extraction ) throws FileNotFoundException, IOException { final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); - PhylogenyParser parser = null; + final PhylogenyParser parser; + boolean nhx_or_nexus = false; if ( url.getHost().toLowerCase().indexOf( "tolweb" ) >= 0 ) { parser = new TolParser(); } else { parser = ParserUtils.createParserDependingOnUrlContents( url, phyloxml_validate_against_xsd ); + if ( parser instanceof NHXParser ) { + nhx_or_nexus = true; + final NHXParser nhx = ( NHXParser ) parser; + nhx.setReplaceUnderscores( replace_underscores ); + nhx.setIgnoreQuotes( false ); + nhx.setTaxonomyExtraction( taxonomy_extraction ); + } + else if ( parser instanceof NexusPhylogeniesParser ) { + nhx_or_nexus = true; + final NexusPhylogeniesParser nex = ( NexusPhylogeniesParser ) parser; + nex.setReplaceUnderscores( replace_underscores ); + nex.setIgnoreQuotes( false ); + } + } + final Phylogeny[] phys = factory.create( url.openStream(), parser ); + if ( nhx_or_nexus && internal_numbers_are_confidences ) { + for( final Phylogeny phy : phys ) { + PhylogenyMethods.transferInternalNodeNamesToConfidence( phy ); + } } - return factory.create( url.openStream(), parser ); + return phys; } final static void removeBranchColors( final Phylogeny phy ) { diff --git a/forester/java/src/org/forester/archaeopteryx/Archaeopteryx.java b/forester/java/src/org/forester/archaeopteryx/Archaeopteryx.java index 6a28ef7..91a8fd9 100644 --- a/forester/java/src/org/forester/archaeopteryx/Archaeopteryx.java +++ b/forester/java/src/org/forester/archaeopteryx/Archaeopteryx.java @@ -94,11 +94,7 @@ public final class Archaeopteryx { final NHXParser nhx = ( NHXParser ) p; nhx.setReplaceUnderscores( conf.isReplaceUnderscoresInNhParsing() ); nhx.setIgnoreQuotes( false ); - NHXParser.TAXONOMY_EXTRACTION te = NHXParser.TAXONOMY_EXTRACTION.NO; - if ( conf.isExtractPfamTaxonomyCodesInNhParsing() ) { - te = NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY; - } - nhx.setTaxonomyExtraction( te ); + nhx.setTaxonomyExtraction( conf.getTaxonomyExtraction() ); } else if ( p instanceof NexusPhylogeniesParser ) { nhx_or_nexus = true; diff --git a/forester/java/src/org/forester/archaeopteryx/ArchaeopteryxA.java b/forester/java/src/org/forester/archaeopteryx/ArchaeopteryxA.java index b4e3801..dad1425 100644 --- a/forester/java/src/org/forester/archaeopteryx/ArchaeopteryxA.java +++ b/forester/java/src/org/forester/archaeopteryx/ArchaeopteryxA.java @@ -111,8 +111,9 @@ public class ArchaeopteryxA extends JApplet { _mainframe_applet = new MainFrameApplet( this, configuration ); URL url = null; url = new URL( getUrlString() ); - final Phylogeny[] phys = AptxUtil.readPhylogeniesFromUrl( url, - configuration.isValidatePhyloXmlAgainstSchema() ); + final Phylogeny[] phys = AptxUtil.readPhylogeniesFromUrl( url, configuration + .isValidatePhyloXmlAgainstSchema(), configuration.isReplaceUnderscoresInNhParsing(), configuration + .isInternalNumberAreConfidenceForNhParsing(), configuration.getTaxonomyExtraction() ); AptxUtil.addPhylogeniesToTabs( phys, new File( url.getFile() ).getName(), getUrlString(), diff --git a/forester/java/src/org/forester/archaeopteryx/ArchaeopteryxE.java b/forester/java/src/org/forester/archaeopteryx/ArchaeopteryxE.java index 8755c67..896bea9 100644 --- a/forester/java/src/org/forester/archaeopteryx/ArchaeopteryxE.java +++ b/forester/java/src/org/forester/archaeopteryx/ArchaeopteryxE.java @@ -787,7 +787,11 @@ public class ArchaeopteryxE extends JApplet implements ActionListener { // Load the tree from URL if ( phys_url != null ) { try { - phys = AptxUtil.readPhylogeniesFromUrl( phys_url, getConfiguration().isValidatePhyloXmlAgainstSchema() ); + phys = AptxUtil.readPhylogeniesFromUrl( phys_url, + getConfiguration().isValidatePhyloXmlAgainstSchema(), + getConfiguration().isReplaceUnderscoresInNhParsing(), + getConfiguration().isInternalNumberAreConfidenceForNhParsing(), + getConfiguration().getTaxonomyExtraction() ); } catch ( final Exception e ) { ForesterUtil.printErrorMessage( NAME, e.toString() ); diff --git a/forester/java/src/org/forester/archaeopteryx/Configuration.java b/forester/java/src/org/forester/archaeopteryx/Configuration.java index ad38502..f506905 100644 --- a/forester/java/src/org/forester/archaeopteryx/Configuration.java +++ b/forester/java/src/org/forester/archaeopteryx/Configuration.java @@ -47,6 +47,7 @@ import org.forester.archaeopteryx.Options.CLADOGRAM_TYPE; import org.forester.archaeopteryx.Options.NODE_LABEL_DIRECTION; import org.forester.archaeopteryx.Options.OVERVIEW_PLACEMENT_TYPE; import org.forester.archaeopteryx.Options.PHYLOGENY_GRAPHICS_TYPE; +import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION; import org.forester.phylogeny.data.NodeData.NODE_DATA; import org.forester.phylogeny.data.NodeVisualization; import org.forester.phylogeny.data.NodeVisualization.NodeFill; @@ -86,7 +87,7 @@ public final class Configuration { private short _number_of_digits_after_comma_for_branch_length_values = Constants.NUMBER_OF_DIGITS_AFTER_COMMA_FOR_BRANCH_LENGTH_VALUES_DEFAULT; private boolean _editable = true; private boolean _nh_parsing_replace_underscores = false; - private boolean _nh_parsing_extract_pfam_taxonomy_codes = false; + private TAXONOMY_EXTRACTION _taxonomy_extraction = TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY; private boolean _internal_number_are_confidence_for_nh_parsing = false; private boolean _display_sequence_relations = false; private boolean _validate_against_phyloxml_xsd_schema = Constants.VALIDATE_AGAINST_PHYLOXML_XSD_SCJEMA_DEFAULT; @@ -629,8 +630,8 @@ public final class Configuration { return _editable; } - boolean isExtractPfamTaxonomyCodesInNhParsing() { - return _nh_parsing_extract_pfam_taxonomy_codes; + final TAXONOMY_EXTRACTION getTaxonomyExtraction() { + return _taxonomy_extraction; } boolean isHasWebLink( final String source ) { @@ -924,8 +925,8 @@ public final class Configuration { _editable = editable; } - public void setExtractPfamTaxonomyCodesInNhParsing( final boolean nh_parsing_extract_pfam_taxonomy_codes ) { - _nh_parsing_extract_pfam_taxonomy_codes = nh_parsing_extract_pfam_taxonomy_codes; + final void setTaxonomyExtraction( final TAXONOMY_EXTRACTION taxonomy_extraction ) { + _taxonomy_extraction = taxonomy_extraction; } private void setGraphicsExportX( final int graphics_export_x ) { @@ -1204,7 +1205,7 @@ public final class Configuration { } else if ( key.equals( "replace_underscores_in_nh_parsing" ) ) { final boolean r = parseBoolean( ( String ) st.nextElement() ); - if ( r && isExtractPfamTaxonomyCodesInNhParsing() ) { + if ( r && ( getTaxonomyExtraction() != TAXONOMY_EXTRACTION.NO ) ) { ForesterUtil .printWarningMessage( Constants.PRG_NAME, "attempt to extract taxonomies and replace underscores at the same time" ); @@ -1213,16 +1214,26 @@ public final class Configuration { setReplaceUnderscoresInNhParsing( r ); } } - else if ( key.equals( "extract_taxonomy_codes_in_nh_parsing" ) ) { - final boolean e = parseBoolean( ( String ) st.nextElement() ); - if ( e && isReplaceUnderscoresInNhParsing() ) { + else if ( key.equals( "taxonomy_extraction_in_nh_parsing" ) ) { + final String s = ( String ) st.nextElement(); + if ( s.equalsIgnoreCase( "no" ) ) { + setTaxonomyExtraction( TAXONOMY_EXTRACTION.NO ); + } + else if ( s.equalsIgnoreCase( "yes" ) ) { + setTaxonomyExtraction( TAXONOMY_EXTRACTION.YES ); + } + else if ( s.equalsIgnoreCase( "pfam_only" ) ) { + setTaxonomyExtraction( TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + } + else { + ForesterUtil.printWarningMessage( Constants.PRG_NAME, + "unknown value for \"taxonomy_extraction_in_nh_parsing\": " + s ); + } + if ( ( getTaxonomyExtraction() != TAXONOMY_EXTRACTION.NO ) && isReplaceUnderscoresInNhParsing() ) { ForesterUtil .printWarningMessage( Constants.PRG_NAME, "attempt to extract taxonomies and replace underscores at the same time" ); } - else { - setExtractPfamTaxonomyCodesInNhParsing( e ); - } } else if ( key.equals( "internal_labels_are_confidence_values" ) ) { setInternalNumberAreConfidenceForNhParsing( parseBoolean( ( String ) st.nextElement() ) ); diff --git a/forester/java/src/org/forester/archaeopteryx/MainFrame.java b/forester/java/src/org/forester/archaeopteryx/MainFrame.java index 89184ef..e9c0405 100644 --- a/forester/java/src/org/forester/archaeopteryx/MainFrame.java +++ b/forester/java/src/org/forester/archaeopteryx/MainFrame.java @@ -58,6 +58,7 @@ import org.forester.archaeopteryx.Options.PHYLOGENY_GRAPHICS_TYPE; import org.forester.archaeopteryx.tools.InferenceManager; import org.forester.archaeopteryx.tools.ProcessPool; import org.forester.archaeopteryx.tools.ProcessRunning; +import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.PhylogenyNode.NH_CONVERSION_SUPPORT_VALUE_STYLE; @@ -185,7 +186,9 @@ public abstract class MainFrame extends JFrame implements ActionListener { JMenuItem _choose_pdf_width_mi; // _ parsing JCheckBoxMenuItem _internal_number_are_confidence_for_nh_parsing_cbmi; - JCheckBoxMenuItem _extract_pfam_style_tax_codes_cbmi; + JRadioButtonMenuItem _extract_taxonomy_no_rbmi; + JRadioButtonMenuItem _extract_taxonomy_yes_rbmi; + JRadioButtonMenuItem _extract_taxonomy_pfam_rbmi; JCheckBoxMenuItem _replace_underscores_cbmi; JCheckBoxMenuItem _use_brackets_for_conf_in_nh_export_cbmi; JCheckBoxMenuItem _use_internal_names_for_conf_in_nh_export_cbmi; @@ -1096,8 +1099,15 @@ public abstract class MainFrame extends JFrame implements ActionListener { && _print_black_and_white_cbmi.isSelected() ); options.setInternalNumberAreConfidenceForNhParsing( ( _internal_number_are_confidence_for_nh_parsing_cbmi != null ) && _internal_number_are_confidence_for_nh_parsing_cbmi.isSelected() ); - options.setExtractPfamTaxonomyCodesInNhParsing( ( _extract_pfam_style_tax_codes_cbmi != null ) - && _extract_pfam_style_tax_codes_cbmi.isSelected() ); + if ( ( _extract_taxonomy_yes_rbmi != null ) && _extract_taxonomy_yes_rbmi.isSelected() ) { + options.setTaxonomyExtractio( TAXONOMY_EXTRACTION.YES ); + } + else if ( ( _extract_taxonomy_pfam_rbmi != null ) && _extract_taxonomy_pfam_rbmi.isSelected() ) { + options.setTaxonomyExtractio( TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + } + else if ( ( _extract_taxonomy_no_rbmi != null ) && _extract_taxonomy_no_rbmi.isSelected() ) { + options.setTaxonomyExtractio( TAXONOMY_EXTRACTION.NO ); + } options.setReplaceUnderscoresInNhParsing( ( _replace_underscores_cbmi != null ) && _replace_underscores_cbmi.isSelected() ); options.setMatchWholeTermsOnly( ( _search_whole_words_only_cbmi != null ) diff --git a/forester/java/src/org/forester/archaeopteryx/MainFrameApplet.java b/forester/java/src/org/forester/archaeopteryx/MainFrameApplet.java index 8df235b..fc8a103 100644 --- a/forester/java/src/org/forester/archaeopteryx/MainFrameApplet.java +++ b/forester/java/src/org/forester/archaeopteryx/MainFrameApplet.java @@ -84,7 +84,11 @@ public final class MainFrameApplet extends MainFrame { // Load the tree from URL if ( url != null ) { try { - phys = AptxUtil.readPhylogeniesFromUrl( url, getConfiguration().isValidatePhyloXmlAgainstSchema() ); + phys = AptxUtil.readPhylogeniesFromUrl( url, + configuration.isValidatePhyloXmlAgainstSchema(), + configuration.isReplaceUnderscoresInNhParsing(), + configuration.isInternalNumberAreConfidenceForNhParsing(), + configuration.getTaxonomyExtraction() ); } catch ( final Exception e ) { ForesterUtil.printErrorMessage( ArchaeopteryxA.NAME, e.toString() ); diff --git a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java index 709c621..a238164 100644 --- a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java +++ b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java @@ -79,6 +79,7 @@ import org.forester.io.parsers.GeneralMsaParser; import org.forester.io.parsers.PhylogenyParser; import org.forester.io.parsers.nexus.NexusPhylogeniesParser; import org.forester.io.parsers.nhx.NHXParser; +import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION; import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException; import org.forester.io.parsers.phyloxml.PhyloXmlParser; import org.forester.io.parsers.phyloxml.PhyloXmlUtil; @@ -218,6 +219,7 @@ public final class MainFrameApplication extends MainFrame { private Phylogeny _species_tree; private File _current_dir; private ButtonGroup _radio_group_1; + private ButtonGroup _radio_group_2; // Others: double _min_not_collapse = Constants.MIN_NOT_COLLAPSE_DEFAULT; // Phylogeny Inference menu @@ -616,8 +618,8 @@ public final class MainFrameApplication extends MainFrame { updateOptions( getOptions() ); } else if ( o == _replace_underscores_cbmi ) { - if ( ( _extract_pfam_style_tax_codes_cbmi != null ) && _replace_underscores_cbmi.isSelected() ) { - _extract_pfam_style_tax_codes_cbmi.setSelected( false ); + if ( ( _extract_taxonomy_no_rbmi != null ) && !_extract_taxonomy_no_rbmi.isSelected() ) { + _extract_taxonomy_no_rbmi.setSelected( true ); } updateOptions( getOptions() ); } @@ -627,8 +629,8 @@ public final class MainFrameApplication extends MainFrame { } collapseBelowThreshold(); } - else if ( o == _extract_pfam_style_tax_codes_cbmi ) { - if ( ( _replace_underscores_cbmi != null ) && _extract_pfam_style_tax_codes_cbmi.isSelected() ) { + else if ( ( o == _extract_taxonomy_pfam_rbmi ) || ( o == _extract_taxonomy_yes_rbmi ) ) { + if ( _replace_underscores_cbmi != null ) { _replace_underscores_cbmi.setSelected( false ); } updateOptions( getOptions() ); @@ -843,8 +845,21 @@ public final class MainFrameApplication extends MainFrame { _options_jmenu .add( _internal_number_are_confidence_for_nh_parsing_cbmi = new JCheckBoxMenuItem( "Internal Node Names are Confidence Values" ) ); _options_jmenu.add( _replace_underscores_cbmi = new JCheckBoxMenuItem( "Replace Underscores with Spaces" ) ); + // + _options_jmenu.add( _extract_taxonomy_no_rbmi = new JRadioButtonMenuItem( "No Taxonomy Extraction" ) ); + _options_jmenu + .add( _extract_taxonomy_pfam_rbmi = new JRadioButtonMenuItem( "Extract Taxonomy Codes from Pfam-style Node Names" ) ); + _extract_taxonomy_pfam_rbmi + .setToolTipText( "To extract 5-letter taxonomy codes from node names in the form of \"BCL2_MOUSE/134-298\"" ); _options_jmenu - .add( _extract_pfam_style_tax_codes_cbmi = new JCheckBoxMenuItem( "Extract Taxonomy Codes from Pfam-style Labels" ) ); + .add( _extract_taxonomy_yes_rbmi = new JRadioButtonMenuItem( "Extract Taxonomy Codes from Node Names" ) ); + _extract_taxonomy_yes_rbmi + .setToolTipText( "To extract 5-letter taxonomy codes from node names in the form of \"BCL2_MOUSE\" or \"BCL2_MOUSE B-cell lymphoma 2...\"" ); + _radio_group_2 = new ButtonGroup(); + _radio_group_2.add( _extract_taxonomy_no_rbmi ); + _radio_group_2.add( _extract_taxonomy_pfam_rbmi ); + _radio_group_2.add( _extract_taxonomy_yes_rbmi ); + // _options_jmenu.add( customizeMenuItemAsLabel( new JMenuItem( "Newick/Nexus Output:" ), getConfiguration() ) ); _options_jmenu .add( _use_brackets_for_conf_in_nh_export_cbmi = new JCheckBoxMenuItem( USE_BRACKETS_FOR_CONF_IN_NH_LABEL ) ); @@ -884,8 +899,12 @@ public final class MainFrameApplication extends MainFrame { customizeCheckBoxMenuItem( _print_black_and_white_cbmi, getOptions().isPrintBlackAndWhite() ); customizeCheckBoxMenuItem( _internal_number_are_confidence_for_nh_parsing_cbmi, getOptions() .isInternalNumberAreConfidenceForNhParsing() ); - customizeCheckBoxMenuItem( _extract_pfam_style_tax_codes_cbmi, getOptions() - .isExtractPfamTaxonomyCodesInNhParsing() ); + customizeRadioButtonMenuItem( _extract_taxonomy_no_rbmi, + getOptions().getTaxonomyExtraction() == TAXONOMY_EXTRACTION.NO ); + customizeRadioButtonMenuItem( _extract_taxonomy_yes_rbmi, + getOptions().getTaxonomyExtraction() == TAXONOMY_EXTRACTION.YES ); + customizeRadioButtonMenuItem( _extract_taxonomy_pfam_rbmi, + getOptions().getTaxonomyExtraction() == TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); customizeCheckBoxMenuItem( _replace_underscores_cbmi, getOptions().isReplaceUnderscoresInNhParsing() ); customizeCheckBoxMenuItem( _search_whole_words_only_cbmi, getOptions().isMatchWholeTermsOnly() ); customizeCheckBoxMenuItem( _inverse_search_result_cbmi, getOptions().isInverseSearchResult() ); @@ -2253,11 +2272,7 @@ public final class MainFrameApplication extends MainFrame { private void setSpecialOptionsForNhxParser( final NHXParser nhx ) { nhx.setReplaceUnderscores( getOptions().isReplaceUnderscoresInNhParsing() ); - NHXParser.TAXONOMY_EXTRACTION te = NHXParser.TAXONOMY_EXTRACTION.NO; - if ( getOptions().isExtractPfamTaxonomyCodesInNhParsing() ) { - te = NHXParser.TAXONOMY_EXTRACTION.YES; - } - nhx.setTaxonomyExtraction( te ); + nhx.setTaxonomyExtraction( getOptions().getTaxonomyExtraction() ); } private void writeAllToFile() { diff --git a/forester/java/src/org/forester/archaeopteryx/Options.java b/forester/java/src/org/forester/archaeopteryx/Options.java index abbc7c7..cb067ab 100644 --- a/forester/java/src/org/forester/archaeopteryx/Options.java +++ b/forester/java/src/org/forester/archaeopteryx/Options.java @@ -27,6 +27,7 @@ package org.forester.archaeopteryx; import java.awt.Font; +import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION; import org.forester.phylogeny.PhylogenyNode.NH_CONVERSION_SUPPORT_VALUE_STYLE; import org.forester.phylogeny.data.NodeData; import org.forester.phylogeny.data.NodeData.NODE_DATA; @@ -67,7 +68,7 @@ final public class Options { private short _number_of_digits_after_comma_for_confidence_values; private short _number_of_digits_after_comma_for_branch_length_values; private boolean _nh_parsing_replace_underscores; - private boolean _nh_parsing_extract_pfam_taxonomy_codes; + private TAXONOMY_EXTRACTION _taxonomy_extraction; private boolean _editable; private boolean _background_color_gradient; private boolean _show_domain_labels; @@ -185,7 +186,7 @@ final public class Options { _number_of_digits_after_comma_for_branch_length_values = Constants.NUMBER_OF_DIGITS_AFTER_COMMA_FOR_BRANCH_LENGTH_VALUES_DEFAULT; _number_of_digits_after_comma_for_confidence_values = Constants.NUMBER_OF_DIGITS_AFTER_COMMA_FOR_CONFIDENCE_VALUES_DEFAULT; _nh_parsing_replace_underscores = false; - _nh_parsing_extract_pfam_taxonomy_codes = false; + _taxonomy_extraction = TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY; _cladogram_type = Constants.CLADOGRAM_TYPE_DEFAULT; _show_domain_labels = true; setAbbreviateScientificTaxonNames( false ); @@ -239,8 +240,8 @@ final public class Options { return _editable; } - final boolean isExtractPfamTaxonomyCodesInNhParsing() { - return _nh_parsing_extract_pfam_taxonomy_codes; + final TAXONOMY_EXTRACTION getTaxonomyExtraction() { + return _taxonomy_extraction; } final boolean isGraphicsExportUsingActualSize() { @@ -343,8 +344,8 @@ final public class Options { _editable = editable; } - final void setExtractPfamTaxonomyCodesInNhParsing( final boolean nh_parsing_extract_pfam_taxonomy_codes ) { - _nh_parsing_extract_pfam_taxonomy_codes = nh_parsing_extract_pfam_taxonomy_codes; + final void setTaxonomyExtractio( final TAXONOMY_EXTRACTION taxonomy_extraction ) { + _taxonomy_extraction = taxonomy_extraction; } final void setGraphicsExportUsingActualSize( final boolean graphics_export_using_actual_size ) { @@ -481,7 +482,7 @@ final public class Options { instance.setNumberOfDigitsAfterCommaForConfidenceValues( configuration .getNumberOfDigitsAfterCommaForConfidenceValues() ); } - instance.setExtractPfamTaxonomyCodesInNhParsing( configuration.isExtractPfamTaxonomyCodesInNhParsing() ); + instance.setTaxonomyExtractio( configuration.getTaxonomyExtraction() ); instance.setReplaceUnderscoresInNhParsing( configuration.isReplaceUnderscoresInNhParsing() ); instance.setInternalNumberAreConfidenceForNhParsing( configuration .isInternalNumberAreConfidenceForNhParsing() ); diff --git a/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java b/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java index 99a78bf..92935cd 100644 --- a/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java +++ b/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java @@ -61,7 +61,7 @@ import org.forester.util.ForesterUtil; public final class NHXParser implements PhylogenyParser { - public static final TAXONOMY_EXTRACTION TAXONOMY_EXTRACTION_DEFAULT = NHXParser.TAXONOMY_EXTRACTION.NO; + public static final TAXONOMY_EXTRACTION TAXONOMY_EXTRACTION_DEFAULT = TAXONOMY_EXTRACTION.NO; final static private boolean GUESS_ROOTEDNESS_DEFAULT = true; final static private boolean GUESS_IF_SUPPORT_VALUES = true; final static private boolean IGNORE_QUOTES_DEFAULT = false; @@ -82,7 +82,7 @@ public final class NHXParser implements PhylogenyParser { private int _clade_level; private List _phylogenies; private Phylogeny _current_phylogeny; - private NHXParser.TAXONOMY_EXTRACTION _taxonomy_extraction; + private TAXONOMY_EXTRACTION _taxonomy_extraction; private boolean _replace_underscores; public final static Pattern UC_LETTERS_NUMBERS_PATTERN = Pattern.compile( "^[A-Z0-9]+$" ); public final static Pattern NUMBERS_ONLY_PATTERN = Pattern.compile( "^[0-9\\.]+$" ); @@ -94,105 +94,6 @@ public final class NHXParser implements PhylogenyParser { init(); } - /** - * Decreases the clade level by one. - * - * @throws PhylogenyParserException - * if level goes below zero. - */ - private void decreaseCladeLevel() throws PhylogenyParserException { - if ( getCladeLevel() < 0 ) { - throw new PhylogenyParserException( "error in NH (Newick)/NHX formatted data: most likely cause: number of close parens is larger than number of open parens" ); - } - --_clade_level; - } - - /** - * Finishes the current Phylogeny and adds it to the list of Phylogenies - * created. - * - * @throws PhylogenyParserException - * @throws NHXFormatException - * @throws PhyloXmlDataFormatException - */ - private void finishPhylogeny() throws PhylogenyParserException, NHXFormatException, PhyloXmlDataFormatException { - setCladeLevel( 0 ); - if ( getCurrentPhylogeny() != null ) { - parseNHX( getCurrentAnotation().toString(), - getCurrentPhylogeny().getRoot(), - getTaxonomyExtraction(), - isReplaceUnderscores() ); - if ( NHXParser.GUESS_IF_SUPPORT_VALUES ) { - if ( NHXParser.isBranchLengthsLikeBootstrapValues( getCurrentPhylogeny() ) ) { - NHXParser.moveBranchLengthsToConfidenceValues( getCurrentPhylogeny() ); - } - } - if ( isGuessRootedness() ) { - final PhylogenyNode root = getCurrentPhylogeny().getRoot(); - if ( ( root.getDistanceToParent() >= 0.0 ) || !ForesterUtil.isEmpty( root.getName() ) - || !ForesterUtil.isEmpty( PhylogenyMethods.getSpecies( root ) ) || root.isHasAssignedEvent() ) { - getCurrentPhylogeny().setRooted( true ); - } - } - getPhylogenies().add( getCurrentPhylogeny() ); - } - } - - private void finishSingleNodePhylogeny() throws PhylogenyParserException, NHXFormatException, - PhyloXmlDataFormatException { - setCladeLevel( 0 ); - final PhylogenyNode new_node = new PhylogenyNode(); - parseNHX( getCurrentAnotation().toString(), new_node, getTaxonomyExtraction(), isReplaceUnderscores() ); - setCurrentPhylogeny( new Phylogeny() ); - getCurrentPhylogeny().setRoot( new_node ); - getPhylogenies().add( getCurrentPhylogeny() ); - } - - private int getCladeLevel() { - return _clade_level; - } - - private StringBuilder getCurrentAnotation() { - return _current_anotation; - } - - private PhylogenyNode getCurrentNode() { - return _current_node; - } - - private Phylogeny getCurrentPhylogeny() { - return _current_phylogeny; - } - - private byte getInputType() { - return _input_type; - } - - private Object getNhxSource() { - return _nhx_source; - } - - private List getPhylogenies() { - return _phylogenies; - } - - /** - * Returns the Phylogenies created as Array. - * - * @return the Phylogenies created as Array - */ - private Phylogeny[] getPhylogeniesAsArray() { - final Phylogeny[] p = new Phylogeny[ getPhylogenies().size() ]; - for( int i = 0; i < getPhylogenies().size(); ++i ) { - p[ i ] = getPhylogenies().get( i ); - } - return p; - } - - private int getSourceLength() { - return _source_length; - } - public NHXParser.TAXONOMY_EXTRACTION getTaxonomyExtraction() { return _taxonomy_extraction; } @@ -202,44 +103,6 @@ public final class NHXParser implements PhylogenyParser { } /** - * Increases the clade level by one. - */ - private void increaseCladeLevel() { - ++_clade_level; - } - - private void init() { - setTaxonomyExtraction( TAXONOMY_EXTRACTION_DEFAULT ); - setReplaceUnderscores( REPLACE_UNDERSCORES_DEFAULT ); - setGuessRootedness( GUESS_ROOTEDNESS_DEFAULT ); - setIgnoreQuotes( IGNORE_QUOTES_DEFAULT ); - setHasNext( false ); - } - - private boolean isGuessRootedness() { - return _guess_rootedness; - } - - private boolean isIgnoreQuotes() { - return _ignore_quotes; - } - - private boolean isReplaceUnderscores() { - return _replace_underscores; - } - - private boolean isSawClosingParen() { - return _saw_closing_paren; - } - - /** - * Replaces the current annotation with a new StringBuffer. - */ - private void newCurrentAnotation() { - setCurrentAnotation( new StringBuilder() ); - } - - /** * Parses the source set with setSource( final Object nhx_source ). Returns * the Phylogenies found in the source as Phylogeny[]. * Everything between [ and ] is considered comment and ignored, @@ -398,6 +261,226 @@ public final class NHXParser implements PhylogenyParser { return getPhylogeniesAsArray(); } // parse() + public void setGuessRootedness( final boolean guess_rootedness ) { + _guess_rootedness = guess_rootedness; + } + + public void setIgnoreQuotes( final boolean ignore_quotes ) { + _ignore_quotes = ignore_quotes; + } + + public void setReplaceUnderscores( final boolean replace_underscores ) { + _replace_underscores = replace_underscores; + } + + /** + * This sets the source to be parsed. The source can be: String, + * StringBuffer, char[], File, or InputStream. The source can contain more + * than one phylogenies in either New Hamphshire (NH) or New Hamphshire + * Extended (NHX) format. There is no need to separate phylogenies with any + * special character. White space is always ignored, as are semicolons + * inbetween phylogenies. Example of a source describing two phylogenies + * (source is a String, in this example): "(A,(B,(C,(D,E)de)cde)bcde)abcde + * ((((A,B)ab,C)abc,D)abcd,E)abcde". Everything between a '[' followed by any + * character other than '&' and ']' is considered a comment and ignored + * (example: "[this is a comment]"). NHX tags are surrounded by '[&&NHX' and + * ']' (example: "[&&NHX:S=Varanus_storri]"). A sequence like "[& some + * info]" is ignored, too (at the PhylogenyNode level, though). + * Exception: numbers only between [ and ] (e.g. [90]) are interpreted as support values. + * + * @see #parse() + * @see org.forester.io.parsers.PhylogenyParser#setSource(java.lang.Object) + * @param nhx_source + * the source to be parsed (String, StringBuffer, char[], File, + * or InputStream) + * @throws IOException + * @throws PhylogenyParserException + */ + @Override + public void setSource( final Object nhx_source ) throws PhylogenyParserException, IOException { + if ( nhx_source == null ) { + throw new PhylogenyParserException( getClass() + ": attempt to parse null object." ); + } + else if ( nhx_source instanceof String ) { + setInputType( NHXParser.STRING ); + setSourceLength( ( ( String ) nhx_source ).length() ); + setNhxSource( nhx_source ); + } + else if ( nhx_source instanceof StringBuffer ) { + setInputType( NHXParser.STRING_BUFFER ); + setSourceLength( ( ( StringBuffer ) nhx_source ).length() ); + setNhxSource( nhx_source ); + } + else if ( nhx_source instanceof char[] ) { + setInputType( NHXParser.CHAR_ARRAY ); + setSourceLength( ( ( char[] ) nhx_source ).length ); + setNhxSource( nhx_source ); + } + else if ( nhx_source instanceof File ) { + setInputType( NHXParser.BUFFERED_READER ); + setSourceLength( 0 ); + final File f = ( File ) nhx_source; + final String error = ForesterUtil.isReadableFile( f ); + if ( !ForesterUtil.isEmpty( error ) ) { + throw new PhylogenyParserException( error ); + } + setNhxSource( new BufferedReader( new FileReader( f ) ) ); + } + else if ( nhx_source instanceof InputStream ) { + setInputType( NHXParser.BUFFERED_READER ); + setSourceLength( 0 ); + final InputStreamReader isr = new InputStreamReader( ( InputStream ) nhx_source ); + setNhxSource( new BufferedReader( isr ) ); + } + else { + throw new IllegalArgumentException( getClass() + " can only parse objects of type String," + + " StringBuffer, char[], File," + " or InputStream " + " [attempt to parse object of " + + nhx_source.getClass() + "]." ); + } + setHasNext( true ); + } + + public void setTaxonomyExtraction( final NHXParser.TAXONOMY_EXTRACTION taxonomy_extraction ) { + _taxonomy_extraction = taxonomy_extraction; + } + + /** + * Decreases the clade level by one. + * + * @throws PhylogenyParserException + * if level goes below zero. + */ + private void decreaseCladeLevel() throws PhylogenyParserException { + if ( getCladeLevel() < 0 ) { + throw new PhylogenyParserException( "error in NH (Newick)/NHX formatted data: most likely cause: number of close parens is larger than number of open parens" ); + } + --_clade_level; + } + + /** + * Finishes the current Phylogeny and adds it to the list of Phylogenies + * created. + * + * @throws PhylogenyParserException + * @throws NHXFormatException + * @throws PhyloXmlDataFormatException + */ + private void finishPhylogeny() throws PhylogenyParserException, NHXFormatException, PhyloXmlDataFormatException { + setCladeLevel( 0 ); + if ( getCurrentPhylogeny() != null ) { + parseNHX( getCurrentAnotation().toString(), + getCurrentPhylogeny().getRoot(), + getTaxonomyExtraction(), + isReplaceUnderscores() ); + if ( NHXParser.GUESS_IF_SUPPORT_VALUES ) { + if ( NHXParser.isBranchLengthsLikeBootstrapValues( getCurrentPhylogeny() ) ) { + NHXParser.moveBranchLengthsToConfidenceValues( getCurrentPhylogeny() ); + } + } + if ( isGuessRootedness() ) { + final PhylogenyNode root = getCurrentPhylogeny().getRoot(); + if ( ( root.getDistanceToParent() >= 0.0 ) || !ForesterUtil.isEmpty( root.getName() ) + || !ForesterUtil.isEmpty( PhylogenyMethods.getSpecies( root ) ) || root.isHasAssignedEvent() ) { + getCurrentPhylogeny().setRooted( true ); + } + } + getPhylogenies().add( getCurrentPhylogeny() ); + } + } + + private void finishSingleNodePhylogeny() throws PhylogenyParserException, NHXFormatException, + PhyloXmlDataFormatException { + setCladeLevel( 0 ); + final PhylogenyNode new_node = new PhylogenyNode(); + parseNHX( getCurrentAnotation().toString(), new_node, getTaxonomyExtraction(), isReplaceUnderscores() ); + setCurrentPhylogeny( new Phylogeny() ); + getCurrentPhylogeny().setRoot( new_node ); + getPhylogenies().add( getCurrentPhylogeny() ); + } + + private int getCladeLevel() { + return _clade_level; + } + + private StringBuilder getCurrentAnotation() { + return _current_anotation; + } + + private PhylogenyNode getCurrentNode() { + return _current_node; + } + + private Phylogeny getCurrentPhylogeny() { + return _current_phylogeny; + } + + private byte getInputType() { + return _input_type; + } + + private Object getNhxSource() { + return _nhx_source; + } + + private List getPhylogenies() { + return _phylogenies; + } + + /** + * Returns the Phylogenies created as Array. + * + * @return the Phylogenies created as Array + */ + private Phylogeny[] getPhylogeniesAsArray() { + final Phylogeny[] p = new Phylogeny[ getPhylogenies().size() ]; + for( int i = 0; i < getPhylogenies().size(); ++i ) { + p[ i ] = getPhylogenies().get( i ); + } + return p; + } + + private int getSourceLength() { + return _source_length; + } + + /** + * Increases the clade level by one. + */ + private void increaseCladeLevel() { + ++_clade_level; + } + + private void init() { + setTaxonomyExtraction( TAXONOMY_EXTRACTION_DEFAULT ); + setReplaceUnderscores( REPLACE_UNDERSCORES_DEFAULT ); + setGuessRootedness( GUESS_ROOTEDNESS_DEFAULT ); + setIgnoreQuotes( IGNORE_QUOTES_DEFAULT ); + setHasNext( false ); + } + + private boolean isGuessRootedness() { + return _guess_rootedness; + } + + private boolean isIgnoreQuotes() { + return _ignore_quotes; + } + + private boolean isReplaceUnderscores() { + return _replace_underscores; + } + + private boolean isSawClosingParen() { + return _saw_closing_paren; + } + + /** + * Replaces the current annotation with a new StringBuffer. + */ + private void newCurrentAnotation() { + setCurrentAnotation( new StringBuilder() ); + } + /** * Called if a closing paren is encountered. * @@ -497,18 +580,10 @@ public final class NHXParser implements PhylogenyParser { _current_phylogeny = current_phylogeny; } - public void setGuessRootedness( final boolean guess_rootedness ) { - _guess_rootedness = guess_rootedness; - } - private void setHasNext( final boolean has_next ) { _has_next = has_next; } - public void setIgnoreQuotes( final boolean ignore_quotes ) { - _ignore_quotes = ignore_quotes; - } - private void setInputType( final byte input_type ) { _input_type = input_type; } @@ -521,129 +596,20 @@ public final class NHXParser implements PhylogenyParser { _phylogenies = phylogenies; } - public void setReplaceUnderscores( final boolean replace_underscores ) { - _replace_underscores = replace_underscores; - } - private void setSawClosingParen( final boolean saw_closing_paren ) { _saw_closing_paren = saw_closing_paren; } - /** - * This sets the source to be parsed. The source can be: String, - * StringBuffer, char[], File, or InputStream. The source can contain more - * than one phylogenies in either New Hamphshire (NH) or New Hamphshire - * Extended (NHX) format. There is no need to separate phylogenies with any - * special character. White space is always ignored, as are semicolons - * inbetween phylogenies. Example of a source describing two phylogenies - * (source is a String, in this example): "(A,(B,(C,(D,E)de)cde)bcde)abcde - * ((((A,B)ab,C)abc,D)abcd,E)abcde". Everything between a '[' followed by any - * character other than '&' and ']' is considered a comment and ignored - * (example: "[this is a comment]"). NHX tags are surrounded by '[&&NHX' and - * ']' (example: "[&&NHX:S=Varanus_storri]"). A sequence like "[& some - * info]" is ignored, too (at the PhylogenyNode level, though). - * Exception: numbers only between [ and ] (e.g. [90]) are interpreted as support values. - * - * @see #parse() - * @see org.forester.io.parsers.PhylogenyParser#setSource(java.lang.Object) - * @param nhx_source - * the source to be parsed (String, StringBuffer, char[], File, - * or InputStream) - * @throws IOException - * @throws PhylogenyParserException - */ - @Override - public void setSource( final Object nhx_source ) throws PhylogenyParserException, IOException { - if ( nhx_source == null ) { - throw new PhylogenyParserException( getClass() + ": attempt to parse null object." ); - } - else if ( nhx_source instanceof String ) { - setInputType( NHXParser.STRING ); - setSourceLength( ( ( String ) nhx_source ).length() ); - setNhxSource( nhx_source ); - } - else if ( nhx_source instanceof StringBuffer ) { - setInputType( NHXParser.STRING_BUFFER ); - setSourceLength( ( ( StringBuffer ) nhx_source ).length() ); - setNhxSource( nhx_source ); - } - else if ( nhx_source instanceof char[] ) { - setInputType( NHXParser.CHAR_ARRAY ); - setSourceLength( ( ( char[] ) nhx_source ).length ); - setNhxSource( nhx_source ); - } - else if ( nhx_source instanceof File ) { - setInputType( NHXParser.BUFFERED_READER ); - setSourceLength( 0 ); - final File f = ( File ) nhx_source; - final String error = ForesterUtil.isReadableFile( f ); - if ( !ForesterUtil.isEmpty( error ) ) { - throw new PhylogenyParserException( error ); - } - setNhxSource( new BufferedReader( new FileReader( f ) ) ); - } - else if ( nhx_source instanceof InputStream ) { - setInputType( NHXParser.BUFFERED_READER ); - setSourceLength( 0 ); - final InputStreamReader isr = new InputStreamReader( ( InputStream ) nhx_source ); - setNhxSource( new BufferedReader( isr ) ); - } - else { - throw new IllegalArgumentException( getClass() + " can only parse objects of type String," - + " StringBuffer, char[], File," + " or InputStream " + " [attempt to parse object of " - + nhx_source.getClass() + "]." ); - } - setHasNext( true ); - } - private void setSourceLength( final int source_length ) { _source_length = source_length; } - public void setTaxonomyExtraction( final NHXParser.TAXONOMY_EXTRACTION taxonomy_extraction ) { - _taxonomy_extraction = taxonomy_extraction; - } - - private static double doubleValue( final String str ) throws NHXFormatException { - try { - return Double.valueOf( str ).doubleValue(); - } - catch ( final NumberFormatException ex ) { - throw new NHXFormatException( "error in NH/NHX formatted data: failed to parse number from " + "\"" + str - + "\"" ); - } - } - - private static boolean isBranchLengthsLikeBootstrapValues( final Phylogeny p ) { - final PhylogenyNodeIterator it = p.iteratorExternalForward(); - final double d0 = it.next().getDistanceToParent(); - if ( ( d0 < 10 ) || !it.hasNext() ) { - return false; - } - while ( it.hasNext() ) { - final double d = it.next().getDistanceToParent(); - if ( ( d != d0 ) || ( d < 10 ) ) { - return false; - } - } - return true; - } - - private static void moveBranchLengthsToConfidenceValues( final Phylogeny p ) { - final PhylogenyNodeIterator it = p.iteratorPostorder(); - while ( it.hasNext() ) { - final PhylogenyNode n = it.next(); - PhylogenyMethods.setBootstrapConfidence( n, n.getDistanceToParent() ); - n.setDistanceToParent( PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT ); - } - } - public static void parseNHX( String s, final PhylogenyNode node_to_annotate, - final NHXParser.TAXONOMY_EXTRACTION taxonomy_extraction, + final TAXONOMY_EXTRACTION taxonomy_extraction, final boolean replace_underscores ) throws NHXFormatException, PhyloXmlDataFormatException { - if ( ( taxonomy_extraction != NHXParser.TAXONOMY_EXTRACTION.NO ) && replace_underscores ) { + if ( ( taxonomy_extraction != TAXONOMY_EXTRACTION.NO ) && replace_underscores ) { throw new IllegalArgumentException( "cannot extract taxonomies and replace under scores at the same time" ); } if ( ( s != null ) && ( s.length() > 0 ) ) { @@ -682,8 +648,7 @@ public final class NHXParser implements PhylogenyParser { if ( t.countTokens() > 0 ) { if ( !s.startsWith( ":" ) ) { node_to_annotate.setName( t.nextToken() ); - if ( !replace_underscores - && ( !is_nhx && ( taxonomy_extraction != NHXParser.TAXONOMY_EXTRACTION.NO ) ) ) { + if ( !replace_underscores && ( !is_nhx && ( taxonomy_extraction != TAXONOMY_EXTRACTION.NO ) ) ) { final String tax = ParserUtils.extractTaxonomyCodeFromNodeName( node_to_annotate.getName(), taxonomy_extraction ); if ( !ForesterUtil.isEmpty( tax ) ) { @@ -789,6 +754,40 @@ public final class NHXParser implements PhylogenyParser { } } + private static double doubleValue( final String str ) throws NHXFormatException { + try { + return Double.valueOf( str ).doubleValue(); + } + catch ( final NumberFormatException ex ) { + throw new NHXFormatException( "error in NH/NHX formatted data: failed to parse number from " + "\"" + str + + "\"" ); + } + } + + private static boolean isBranchLengthsLikeBootstrapValues( final Phylogeny p ) { + final PhylogenyNodeIterator it = p.iteratorExternalForward(); + final double d0 = it.next().getDistanceToParent(); + if ( ( d0 < 10 ) || !it.hasNext() ) { + return false; + } + while ( it.hasNext() ) { + final double d = it.next().getDistanceToParent(); + if ( ( d != d0 ) || ( d < 10 ) ) { + return false; + } + } + return true; + } + + private static void moveBranchLengthsToConfidenceValues( final Phylogeny p ) { + final PhylogenyNodeIterator it = p.iteratorPostorder(); + while ( it.hasNext() ) { + final PhylogenyNode n = it.next(); + PhylogenyMethods.setBootstrapConfidence( n, n.getDistanceToParent() ); + n.setDistanceToParent( PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT ); + } + } + private static void processMrBayes3Data( final String s, final PhylogenyNode node_to_annotate ) throws NHXFormatException { double sd = -1; diff --git a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java index cd1627c..35ec60c 100644 --- a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java +++ b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java @@ -41,6 +41,7 @@ import java.util.regex.Pattern; import org.forester.io.parsers.PhylogenyParser; import org.forester.io.parsers.nexus.NexusPhylogeniesParser; import org.forester.io.parsers.nhx.NHXParser; +import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION; import org.forester.io.parsers.phyloxml.PhyloXmlParser; import org.forester.io.parsers.tol.TolParser; import org.forester.phylogeny.Phylogeny; @@ -50,8 +51,9 @@ import org.forester.util.ForesterUtil; public final class ParserUtils { - final private static Pattern TAXOMONY_CODE_PATTERN_1 = Pattern.compile( "[A-Z0-9]{5}" ); - final private static Pattern TAXOMONY_CODE_PATTERN_2 = Pattern.compile( "([A-Z0-9]{5})[^A-Z].*" ); + final private static Pattern TAXOMONY_CODE_PATTERN_1 = Pattern.compile( "[A-Z0-9]{5}|RAT|PIG|PEA" ); + final private static Pattern TAXOMONY_CODE_PATTERN_2 = Pattern.compile( "([A-Z0-9]{5}|RAT|PIG|PEA)[^A-Za-z].*" ); + final private static Pattern TAXOMONY_CODE_PATTERN_PF = Pattern.compile( "([A-Z0-9]{5}|RAT|PIG|PEA)/\\d+-\\d+" ); final public static PhylogenyParser createParserDependingFileContents( final File file, final boolean phyloxml_validate_against_xsd ) @@ -209,57 +211,37 @@ public final class ParserUtils { return reader; } - /** - * Extracts a code if and only if: - * one and only one _, - * shorter than 25, - * no |, - * no ., - * if / present it has to be after the _, - * if PFAM_STYLE_ONLY: / must be present, - * tax code can only contain uppercase letters and numbers, - * and must contain at least one uppercase letter. - * Return null if no code extractable. - * - * @param name - * @return - */ - public static String extractTaxonomyCodeFromNodeName( final String name, - final NHXParser.TAXONOMY_EXTRACTION taxonomy_extraction ) { + public final static String extractTaxonomyCodeFromNodeName( final String name, + final TAXONOMY_EXTRACTION taxonomy_extraction ) { if ( ( name.indexOf( "_" ) > 0 ) - && ( name.length() < 31 ) - // && ( name.lastIndexOf( "_" ) == name.indexOf( "_" ) ) - && ( name.indexOf( "|" ) < 0 ) - && ( name.indexOf( "." ) < 0 ) - && ( ( taxonomy_extraction != NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ) || ( name.indexOf( "/" ) >= 0 ) ) - && ( ( ( name.indexOf( "/" ) ) < 0 ) || ( name.indexOf( "/" ) > name.indexOf( "_" ) ) ) ) { - final String[] s = name.split( "[_/]" ); + && ( ( taxonomy_extraction != TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ) || ( name.indexOf( "/" ) > 4 ) ) ) { + final String[] s = name.split( "[_\\s]" ); if ( s.length > 1 ) { final String str = s[ 1 ]; - // if ( str.length() < 6 ) { - if ( ( str.length() < 5 ) - && ( str.startsWith( "RAT" ) || str.startsWith( "PIG" ) || str.startsWith( "CAP" ) ) ) { - return str.substring( 0, 3 ); - } - final Matcher m1 = TAXOMONY_CODE_PATTERN_1.matcher( str ); - if ( m1.matches() ) { - return m1.group(); - } - final Matcher m2 = TAXOMONY_CODE_PATTERN_2.matcher( str ); - if ( m2.matches() ) { - return m2.group( 1 ); + if ( !ForesterUtil.isEmpty( str ) ) { + if ( taxonomy_extraction == TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ) { + final Matcher m = TAXOMONY_CODE_PATTERN_PF.matcher( str ); + if ( m.matches() ) { + return m.group( 1 ); + } + } + else { + final Matcher m1 = TAXOMONY_CODE_PATTERN_1.matcher( str ); + if ( m1.matches() ) { + return m1.group(); + } + final Matcher m2 = TAXOMONY_CODE_PATTERN_2.matcher( str ); + if ( m2.matches() ) { + return m2.group( 1 ); + } + } } - // return null; - // final Matcher uc_letters_and_numbers = NHXParser.UC_LETTERS_NUMBERS_PATTERN.matcher( str ); - // if ( !uc_letters_and_numbers.matches() ) { - // return null; - // } - // final Matcher numbers_only = NHXParser.NUMBERS_ONLY_PATTERN.matcher( str ); - // if ( numbers_only.matches() ) { - // return null; - // } - // return str; - // } + } + } + else if ( taxonomy_extraction == TAXONOMY_EXTRACTION.YES ) { + final Matcher m1 = TAXOMONY_CODE_PATTERN_1.matcher( name ); + if ( m1.matches() ) { + return name; } } return null; diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index 66f883e..c2797fb 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -52,8 +52,10 @@ import org.forester.io.parsers.nexus.NexusBinaryStatesMatrixParser; import org.forester.io.parsers.nexus.NexusCharactersParser; import org.forester.io.parsers.nexus.NexusPhylogeniesParser; import org.forester.io.parsers.nhx.NHXParser; +import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION; import org.forester.io.parsers.phyloxml.PhyloXmlParser; import org.forester.io.parsers.tol.TolParser; +import org.forester.io.parsers.util.ParserUtils; import org.forester.io.writers.PhylogenyWriter; import org.forester.msa.BasicMsa; import org.forester.msa.Mafft; @@ -180,7 +182,6 @@ public final class Test { } else { System.out.println( "failed." ); - System.exit( -1 ); //TODO FIXME remove me!! ~ failed++; } System.out.print( "Hmmscan output parser: " ); @@ -201,6 +202,15 @@ public final class Test { System.out.println( "failed." ); failed++; } + System.out.print( "Taxonomy extraction: " ); + if ( Test.testExtractTaxonomyCodeFromNodeName() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } System.out.print( "Basic node construction and parsing of NHX (node level): " ); if ( Test.testNHXNodeParsing() ) { System.out.println( "OK." ); @@ -755,15 +765,6 @@ public final class Test { System.out.println( "failed." ); failed++; } - // System.out.print( "WABI TxSearch: " ); - // if ( Test.testWabiTxSearch() ) { - // System.out.println( "OK." ); - // succeeded++; - // } - // else { - // System.out - // .println( "failed [will not count towards failed tests since it might be due to absence internet connection]" ); - // } System.out.println(); final Runtime rt = java.lang.Runtime.getRuntime(); final long free_memory = rt.freeMemory() / 1000000; @@ -780,73 +781,76 @@ public final class Test { else { System.out.println( "Not OK." ); } - // System.out.println(); - // Development.setTime( true ); - //try { - // final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); - // final String clc = System.getProperty( "user.dir" ) + ForesterUtil.getFileSeparator() - // + "examples" + ForesterUtil.getFileSeparator() + "CLC.nhx"; - // final String multi = Test.PATH_TO_EXAMPLE_FILES + - // "multifurcations_ex_1.nhx"; - // final String domains = Test.PATH_TO_EXAMPLE_FILES + "domains1.nhx"; - // final Phylogeny t1 = factory.create( new File( domains ), new - // NHXParser() )[ 0 ]; - // final Phylogeny t2 = factory.create( new File( clc ), new NHXParser() )[ 0 ]; - // } - // catch ( final Exception e ) { - // e.printStackTrace(); - // } - // t1.getRoot().preorderPrint(); - // final PhylogenyFactory factory = ParserBasedPhylogenyFactory - // .getInstance(); - // try { - // - // Helper.readNHtree( new File( PATH_TO_EXAMPLE_FILES - // + "\\AtNBSpos.nhx" ) ); - // factory.create( - // new File( PATH_TO_EXAMPLE_FILES + "\\AtNBSpos.nhx" ), - // new NHXParser() ); - // Helper.readNHtree( new File( PATH_TO_EXAMPLE_FILES - // + "\\AtNBSpos.nhx" ) ); - // factory.create( - // new File( PATH_TO_EXAMPLE_FILES + "\\AtNBSpos.nhx" ), - // new NHXParser() ); - // - // - // Helper.readNHtree( new File( PATH_TO_EXAMPLE_FILES - // + "\\big_tree.nhx" ) ); - // Helper.readNHtree( new File( PATH_TO_EXAMPLE_FILES - // + "\\big_tree.nhx" ) ); - // factory.create( - // new File( PATH_TO_EXAMPLE_FILES + "\\big_tree.nhx" ), - // new NHXParser() ); - // factory.create( - // new File( PATH_TO_EXAMPLE_FILES + "\\big_tree.nhx" ), - // new NHXParser() ); - // - // Helper.readNHtree( new File( PATH_TO_EXAMPLE_FILES - // + "\\big_tree.nhx" ) ); - // Helper.readNHtree( new File( PATH_TO_EXAMPLE_FILES - // + "\\big_tree.nhx" ) ); - // - // factory.create( - // new File( PATH_TO_EXAMPLE_FILES + "\\big_tree.nhx" ), - // new NHXParser() ); - // factory.create( - // new File( PATH_TO_EXAMPLE_FILES + "\\big_tree.nhx" ), - // new NHXParser() ); - // - // Helper.readNHtree( new File( PATH_TO_EXAMPLE_FILES - // + "\\AtNBSpos.nhx" ) ); - // factory.create( - // new File( PATH_TO_EXAMPLE_FILES + "\\AtNBSpos.nhx" ), - // new NHXParser() ); - // - // } - // catch ( IOException e ) { - // // TODO Auto-generated catch block - // e.printStackTrace(); - // } + } + + private static boolean testExtractTaxonomyCodeFromNodeName() { + try { + if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "MOUSE", TAXONOMY_EXTRACTION.YES ).equals( "MOUSE" ) ) { + return false; + } + if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "RAT", TAXONOMY_EXTRACTION.YES ).equals( "RAT" ) ) { + return false; + } + if ( ParserUtils.extractTaxonomyCodeFromNodeName( "RAT1", TAXONOMY_EXTRACTION.YES ) != null ) { + return false; + } + if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "BCL2_MOUSE function = 23445", TAXONOMY_EXTRACTION.YES ) + .equals( "MOUSE" ) ) { + return false; + } + if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "BCL2_MOUSE_function = 23445", TAXONOMY_EXTRACTION.YES ) + .equals( "MOUSE" ) ) { + return false; + } + if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "BCL2_MOUSE|function = 23445", TAXONOMY_EXTRACTION.YES ) + .equals( "MOUSE" ) ) { + return false; + } + if ( ParserUtils.extractTaxonomyCodeFromNodeName( "BCL2_MOUSEfunction = 23445", TAXONOMY_EXTRACTION.YES ) != null ) { + return false; + } + if ( ParserUtils.extractTaxonomyCodeFromNodeName( "BCL2_MOUSEFunction = 23445", TAXONOMY_EXTRACTION.YES ) != null ) { + return false; + } + if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "BCL2_RAT function = 23445", TAXONOMY_EXTRACTION.YES ) + .equals( "RAT" ) ) { + return false; + } + if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "BCL2_RAT_function = 23445", TAXONOMY_EXTRACTION.YES ) + .equals( "RAT" ) ) { + return false; + } + if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "BCL2_RAT|function = 23445", TAXONOMY_EXTRACTION.YES ) + .equals( "RAT" ) ) { + return false; + } + if ( ParserUtils.extractTaxonomyCodeFromNodeName( "BCL2_RATfunction = 23445", TAXONOMY_EXTRACTION.YES ) != null ) { + return false; + } + if ( ParserUtils.extractTaxonomyCodeFromNodeName( "BCL2_RATFunction = 23445", TAXONOMY_EXTRACTION.YES ) != null ) { + return false; + } + if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "BCL2_RAT/1-3", TAXONOMY_EXTRACTION.YES ).equals( "RAT" ) ) { + return false; + } + if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "BCL2_PIG/1-3", TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ) + .equals( "PIG" ) ) { + return false; + } + if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "BCL2_MOUSE/1-3", TAXONOMY_EXTRACTION.YES ) + .equals( "MOUSE" ) ) { + return false; + } + if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "BCL2_MOUSE/1-3", TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ) + .equals( "MOUSE" ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; } private static boolean testBasicNodeMethods() { @@ -4822,7 +4826,7 @@ public final class Test { if ( !n8.getName().equals( "n8_ECOLI/12" ) ) { return false; } - if ( !PhylogenyMethods.getSpecies( n8 ).equals( "ECOLI" ) ) { + if ( PhylogenyMethods.getSpecies( n8 ).equals( "ECOLI" ) ) { return false; } final PhylogenyNode n9 = PhylogenyNode @@ -4830,7 +4834,7 @@ public final class Test { if ( !n9.getName().equals( "n9_ECOLI/12=12" ) ) { return false; } - if ( !PhylogenyMethods.getSpecies( n9 ).equals( "ECOLI" ) ) { + if ( PhylogenyMethods.getSpecies( n9 ).equals( "ECOLI" ) ) { return false; } final PhylogenyNode n10 = PhylogenyNode @@ -4923,7 +4927,7 @@ public final class Test { if ( !b.getName().equals( "n10_ECOLI1/1-2" ) ) { return false; } - if ( !PhylogenyMethods.getSpecies( b ).equals( "ECOLI" ) ) { + if ( PhylogenyMethods.getSpecies( b ).equals( "ECOLI" ) ) { return false; } final PhylogenyNode c = PhylogenyNode @@ -4932,7 +4936,7 @@ public final class Test { if ( !c.getName().equals( "n10_RATAF12/1000-2000" ) ) { return false; } - if ( !PhylogenyMethods.getSpecies( c ).equals( "RATAF" ) ) { + if ( PhylogenyMethods.getSpecies( c ).equals( "RATAF" ) ) { return false; } final PhylogenyNode c1 = PhylogenyNode @@ -4941,7 +4945,7 @@ public final class Test { if ( !c1.getName().equals( "n10_BOVIN_1/1000-2000" ) ) { return false; } - if ( !PhylogenyMethods.getSpecies( c1 ).equals( "BOVIN" ) ) { + if ( PhylogenyMethods.getSpecies( c1 ).equals( "BOVIN" ) ) { return false; } final PhylogenyNode c2 = PhylogenyNode @@ -4958,7 +4962,7 @@ public final class Test { if ( !d.getName().equals( "n10_RAT1/1-2" ) ) { return false; } - if ( !PhylogenyMethods.getSpecies( d ).equals( "RAT" ) ) { + if ( PhylogenyMethods.getSpecies( d ).equals( "RAT" ) ) { return false; } final PhylogenyNode e = PhylogenyNode @@ -4994,7 +4998,7 @@ public final class Test { if ( n11.getDistanceToParent() != 0.4 ) { return false; } - if ( !PhylogenyMethods.getSpecies( n11 ).equals( "ECOLI" ) ) { + if ( PhylogenyMethods.getSpecies( n11 ).equals( "ECOLI" ) ) { return false; } final PhylogenyNode n12 = PhylogenyNode @@ -5014,7 +5018,7 @@ public final class Test { if ( !m.getName().equals( "n10_MOUSEa" ) ) { return false; } - if ( !PhylogenyMethods.getSpecies( m ).equals( "MOUSE" ) ) { + if ( PhylogenyMethods.getSpecies( m ).equals( "MOUSE" ) ) { return false; } final PhylogenyNode o = PhylogenyNode.createInstanceFromNhxString( "n10_MOUSE_",