From: cmzmasek Date: Wed, 26 Dec 2012 02:41:00 +0000 (+0000) Subject: in progress X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=71cc3617df51ad241c30e2eb1407c56d4df64035;p=jalview.git in progress --- diff --git a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java index db00321..95b23cb 100644 --- a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java +++ b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java @@ -507,7 +507,7 @@ public final class MainFrameApplication extends MainFrame { moveNodeNamesToSeqNames(); } else if ( o == _extract_tax_code_from_node_names_jmi ) { - extractTaxCodeFromNodeNames(); + extractTaxDataFromNodeNames(); } else if ( o == _gsdi_item ) { if ( isSubtreeDisplayed() ) { @@ -1137,10 +1137,10 @@ public final class MainFrameApplication extends MainFrame { customizeJMenuItem( _move_node_names_to_seq_names_jmi ); _move_node_names_to_seq_names_jmi.setToolTipText( "To interpret node names as sequence (protein, gene) names" ); _tools_menu - .add( _extract_tax_code_from_node_names_jmi = new JMenuItem( "Extract Taxonomic Codes from Node Names" ) ); + .add( _extract_tax_code_from_node_names_jmi = new JMenuItem( "Extract Taxonomic Codes or Ids from Node Names" ) ); customizeJMenuItem( _extract_tax_code_from_node_names_jmi ); _extract_tax_code_from_node_names_jmi - .setToolTipText( "To extract taxonomic codes (mnemonics) from nodes names in the form of 'xyz_ECOLI'" ); + .setToolTipText( "To extract taxonomic codes (mnemonics) from nodes names in the form of 'xyz_ECOLI', or Uniprot identifiers from nodes names in the form of 'xyz_1234567'" ); _tools_menu.addSeparator(); _tools_menu .add( _obtain_detailed_taxonomic_information_jmi = new JMenuItem( OBTAIN_DETAILED_TAXONOMIC_INFORMATION ) ); @@ -1871,24 +1871,58 @@ public final class MainFrameApplication extends MainFrame { } } - private void extractTaxCodeFromNodeNames() throws PhyloXmlDataFormatException { + private void extractTaxDataFromNodeNames() throws PhyloXmlDataFormatException { + final StringBuilder sb = new StringBuilder(); + final StringBuilder sb_failed = new StringBuilder(); + int counter = 0; + int counter_failed = 0; if ( getCurrentTreePanel() != null ) { final Phylogeny phy = getCurrentTreePanel().getPhylogeny(); if ( ( phy != null ) && !phy.isEmpty() ) { - final PhylogenyNodeIterator it = phy.iteratorPostorder(); + final PhylogenyNodeIterator it = phy.iteratorExternalForward(); while ( it.hasNext() ) { final PhylogenyNode n = it.next(); final String name = n.getName().trim(); if ( !ForesterUtil.isEmpty( name ) ) { + final String nt = ParserUtils.extractTaxonomyDataFromNodeName( n, TAXONOMY_EXTRACTION.YES ); + if ( !ForesterUtil.isEmpty( nt ) ) { + if ( counter < 15 ) { + sb.append( name + ": " + nt + "\n" ); + } + else if ( counter == 15 ) { + sb.append( "...\n" ); + } + counter++; + } + else { + if ( counter_failed < 15 ) { + sb_failed.append( name + "\n" ); + } + else if ( counter_failed == 15 ) { + sb_failed.append( "...\n" ); + } + counter_failed++; + } + } + } + if ( counter > 0 ) { + String failed = ""; + if ( counter_failed > 0 ) { + failed = "\nDid not extract taxonomic data for " + counter_failed + " (named) external nodes:\n" + sb_failed; - ParserUtils.extractTaxonomyDataFromNodeName( n, TAXONOMY_EXTRACTION.YES ); - - // final String code = ParserUtils - // .extractTaxonomyCodeFromNodeName( name, NHXParser.TAXONOMY_EXTRACTION.YES ); - // if ( !ForesterUtil.isEmpty( code ) ) { - // PhylogenyMethods.setTaxonomyCode( n, code ); - // } } + JOptionPane.showMessageDialog( this, + "Successfully extracted taxonomic data from " + counter + + " external nodes:\n" + sb.toString() + failed, + "Taxonomic Data Extraction Successfully Completed", + JOptionPane.INFORMATION_MESSAGE ); + } + else { + JOptionPane + .showMessageDialog( this, + "Could not extract any taxonomic data, maybe node names are empty\nor not in the form \"XYZ_CAEEL\", \"XYZ_CAEEL/12-394\", or \"XYZ_1234567\"?", + "No Taxonomic Data Extracted", + JOptionPane.WARNING_MESSAGE ); } } } diff --git a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java index db628cf..655ec7b 100644 --- a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java +++ b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java @@ -55,18 +55,15 @@ import org.forester.util.ForesterUtil; public final class ParserUtils { - final public static Pattern TAXOMONY_CODE_PATTERN_1 = Pattern.compile( "[A-Z0-9]{5}|RAT|PIG|PEA|CAP" ); - final private static Pattern TAXOMONY_CODE_PATTERN_2 = Pattern - .compile( "([A-Z0-9]{5}|RAT|PIG|PEA|CAP)[^A-Za-z].*" ); - final private static Pattern TAXOMONY_CODE_PATTERN_PF = Pattern.compile( "([A-Z0-9]{5}|RAT|PIG|PEA|CAP)/\\d+-\\d+" ); - - - final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_1 = Pattern.compile( "\\d{1,7}" ); - final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_2 = Pattern - .compile( "(\\d{1,7})[^A-Za-z].*" ); + final public static Pattern TAXOMONY_CODE_PATTERN_1 = Pattern.compile( "\\b[A-Z0-9]{5}|RAT|PIG|PEA|CAP\\b" ); + final private static Pattern TAXOMONY_CODE_PATTERN_2 = Pattern + .compile( "([A-Z0-9]{5}|RAT|PIG|PEA|CAP)[^0-9A-Za-z].*" ); + final private static Pattern TAXOMONY_CODE_PATTERN_PF = Pattern + .compile( "([A-Z0-9]{5}|RAT|PIG|PEA|CAP)/\\d+-\\d+" ); + final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_1 = Pattern.compile( "\\b\\d{1,7}\\b" ); + final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_2 = Pattern.compile( "(\\d{1,7})[^0-9A-Za-z].*" ); final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_PF = Pattern.compile( "(\\d{1,7})/\\d+-\\d+" ); - final public static PhylogenyParser createParserDependingFileContents( final File file, final boolean phyloxml_validate_against_xsd ) throws FileNotFoundException, IOException { @@ -258,9 +255,9 @@ public final class ParserUtils { } return null; } - + public final static String extractUniprotTaxonomyIdFromNodeName( final String name, - final TAXONOMY_EXTRACTION taxonomy_extraction ) { + final TAXONOMY_EXTRACTION taxonomy_extraction ) { if ( ( name.indexOf( "_" ) > 0 ) && ( ( taxonomy_extraction != TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ) || ( name.indexOf( "/" ) > 4 ) ) ) { final String[] s = name.split( "[_\\s]" ); @@ -303,15 +300,18 @@ public final class ParserUtils { return readPhylogenies( new File( file_name ) ); } - public final static void extractTaxonomyDataFromNodeName( final PhylogenyNode node, - final NHXParser.TAXONOMY_EXTRACTION taxonomy_extraction ) + public final static String extractTaxonomyDataFromNodeName( final PhylogenyNode node, + final NHXParser.TAXONOMY_EXTRACTION taxonomy_extraction ) throws PhyloXmlDataFormatException { final String id = extractUniprotTaxonomyIdFromNodeName( node.getName(), taxonomy_extraction ); if ( !ForesterUtil.isEmpty( id ) ) { if ( !node.getNodeData().isHasTaxonomy() ) { node.getNodeData().setTaxonomy( new Taxonomy() ); } - node.getNodeData().getTaxonomy().setIdentifier( new Identifier( id, "uniprot" ) ); + if ( node.getNodeData().getTaxonomy().getIdentifier() == null || ForesterUtil.isEmpty( node.getNodeData().getTaxonomy().getIdentifier().getValue() ) ) { + node.getNodeData().getTaxonomy().setIdentifier( new Identifier( id, "uniprot" ) ); + return id; + } } else { final String code = extractTaxonomyCodeFromNodeName( node.getName(), taxonomy_extraction ); @@ -319,8 +319,12 @@ public final class ParserUtils { if ( !node.getNodeData().isHasTaxonomy() ) { node.getNodeData().setTaxonomy( new Taxonomy() ); } - node.getNodeData().getTaxonomy().setTaxonomyCode( code ); + if ( ForesterUtil.isEmpty( node.getNodeData().getTaxonomy().getTaxonomyCode() ) ) { + node.getNodeData().getTaxonomy().setTaxonomyCode( code ); + return code; + } } } + return null; } } diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index de9ab0c..4aa0fc2 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -198,7 +198,7 @@ public final class Test { System.out.println( "failed." ); failed++; } - System.out.print( "Taxonomy extraction: " ); + System.out.print( "Taxonomy code extraction: " ); if ( Test.testExtractTaxonomyCodeFromNodeName() ) { System.out.println( "OK." ); succeeded++; @@ -207,6 +207,15 @@ public final class Test { System.out.println( "failed." ); failed++; } + System.out.print( "Taxonomy extraction (general): " ); + if ( Test.testTaxonomyExtraction() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } System.out.print( "Basic node construction and parsing of NHX (node level): " ); if ( Test.testNHXNodeParsing() ) { System.out.println( "OK." ); @@ -225,6 +234,7 @@ public final class Test { System.out.println( "failed." ); failed++; } + System.out.print( "Conversion to NHX (node level): " ); if ( Test.testNHXconversion() ) { System.out.println( "OK." ); @@ -3967,29 +3977,24 @@ public final class Test { return true; } - private static boolean testNodeRemoval() { try { final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); final Phylogeny t0 = factory.create( "((a)b)", new NHXParser() )[ 0 ]; PhylogenyMethods.removeNode( t0.getNode( "b" ), t0 ); - if ( !t0.toNewHampshire().equals( "(a);" ) ) { return false; } final Phylogeny t1 = factory.create( "((a:2)b:4)", new NHXParser() )[ 0 ]; PhylogenyMethods.removeNode( t1.getNode( "b" ), t1 ); - if ( !t1.toNewHampshire().equals( "(a:6.0);" ) ) { return false; } final Phylogeny t2 = factory.create( "((a,b),c)", new NHXParser() )[ 0 ]; PhylogenyMethods.removeNode( t2.getNode( "b" ), t2 ); - if ( !t2.toNewHampshire().equals( "((a),c);" ) ) { return false; } - } catch ( final Exception e ) { e.printStackTrace( System.out ); @@ -3997,7 +4002,7 @@ public final class Test { } return true; } - + private static boolean testMidpointrooting() { try { final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); @@ -4882,6 +4887,81 @@ public final class Test { return true; } + private static boolean testTaxonomyExtraction() { + try { + final PhylogenyNode n0 = PhylogenyNode.createInstanceFromNhxString( "sd_12345678", + NHXParser.TAXONOMY_EXTRACTION.YES ); + if ( n0.getNodeData().isHasTaxonomy() ) { + return false; + } + final PhylogenyNode n1 = PhylogenyNode.createInstanceFromNhxString( "sd_12345x", + NHXParser.TAXONOMY_EXTRACTION.YES ); + if ( n1.getNodeData().isHasTaxonomy() ) { + System.out.println( n1.toString() ); + return false; + } + final PhylogenyNode n2 = PhylogenyNode.createInstanceFromNhxString( "12345", + NHXParser.TAXONOMY_EXTRACTION.YES ); + if ( !n2.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "12345" ) ) { + System.out.println( n2.toString() ); + return false; + } + final PhylogenyNode n3 = PhylogenyNode.createInstanceFromNhxString( "blag_12345", + NHXParser.TAXONOMY_EXTRACTION.YES ); + if ( !n3.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "12345" ) ) { + System.out.println( n3.toString() ); + return false; + } + final PhylogenyNode n4 = PhylogenyNode.createInstanceFromNhxString( "blag-12345", + NHXParser.TAXONOMY_EXTRACTION.YES ); + if ( n4.getNodeData().isHasTaxonomy() ) { + System.out.println( n4.toString() ); + return false; + } + final PhylogenyNode n5 = PhylogenyNode.createInstanceFromNhxString( "12345-blag", + NHXParser.TAXONOMY_EXTRACTION.YES ); + if ( n5.getNodeData().isHasTaxonomy() ) { + System.out.println( n5.toString() ); + return false; + } + final PhylogenyNode n6 = PhylogenyNode.createInstanceFromNhxString( "blag-12345-blag", + NHXParser.TAXONOMY_EXTRACTION.YES ); + if ( n6.getNodeData().isHasTaxonomy() ) { + System.out.println( n6.toString() ); + return false; + } + final PhylogenyNode n7 = PhylogenyNode.createInstanceFromNhxString( "blag-12345_blag", + NHXParser.TAXONOMY_EXTRACTION.YES ); + if ( n7.getNodeData().isHasTaxonomy() ) { + System.out.println( n7.toString() ); + return false; + } + final PhylogenyNode n8 = PhylogenyNode.createInstanceFromNhxString( "blag_12345-blag", + NHXParser.TAXONOMY_EXTRACTION.YES ); + if ( !n8.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "12345" ) ) { + System.out.println( n8.toString() ); + return false; + } + final PhylogenyNode n9 = PhylogenyNode.createInstanceFromNhxString( "blag_12345_blag", + NHXParser.TAXONOMY_EXTRACTION.YES ); + if ( !n9.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "12345" ) ) { + System.out.println( n9.toString() ); + return false; + } + final PhylogenyNode n10 = PhylogenyNode.createInstanceFromNhxString( "blag_12X45-blag", + NHXParser.TAXONOMY_EXTRACTION.YES ); + if ( !n10.getNodeData().getTaxonomy().getTaxonomyCode().equals( "12X45" ) ) { + System.out.println( n10.toString() ); + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + private static boolean testNHXNodeParsing() { try { final PhylogenyNode n1 = new PhylogenyNode(); @@ -5092,7 +5172,7 @@ public final class Test { if ( !e2.getName().equals( "n10_RAT1" ) ) { return false; } - if ( !PhylogenyMethods.getSpecies( e2 ).equals( "RAT" ) ) { + if ( PhylogenyMethods.getSpecies( e2 ).equals( "RAT" ) ) { return false; } final PhylogenyNode e3 = PhylogenyNode.createInstanceFromNhxString( "n10_RAT~", @@ -5229,10 +5309,10 @@ public final class Test { if ( PhylogenyMethods.getSpecies( n13 ).equals( "12345" ) ) { return false; } - if ( !n13.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "12345" ) ) { + if ( !n13.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "12345" ) ) { return false; } - if ( !n13.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" ) ) { + if ( !n13.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" ) ) { return false; } final PhylogenyNode n14 = PhylogenyNode @@ -5285,39 +5365,32 @@ public final class Test { if ( !isEqual( n18.getBranchData().getConfidence( 0 ).getValue(), 91 ) ) { return false; } - - - // - final PhylogenyNode n19 = PhylogenyNode - .createInstanceFromNhxString( "blah_1-roejojoej", NHXParser.TAXONOMY_EXTRACTION.YES ); - - - if ( !n19.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "1" ) ) { + final PhylogenyNode n19 = PhylogenyNode.createInstanceFromNhxString( "blah_1-roejojoej", + NHXParser.TAXONOMY_EXTRACTION.YES ); + if ( !n19.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "1" ) ) { return false; } - if ( !n19.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" ) ) { + if ( !n19.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" ) ) { return false; } - final PhylogenyNode n30 = PhylogenyNode - .createInstanceFromNhxString( "blah_1234567-roejojoej", NHXParser.TAXONOMY_EXTRACTION.YES ); - - - if ( !n30.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "1234567" ) ) { + final PhylogenyNode n30 = PhylogenyNode.createInstanceFromNhxString( "blah_1234567-roejojoej", + NHXParser.TAXONOMY_EXTRACTION.YES ); + if ( !n30.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "1234567" ) ) { return false; } - if ( !n30.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" ) ) { + if ( !n30.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" ) ) { return false; } - final PhylogenyNode n31 = PhylogenyNode - .createInstanceFromNhxString( "blah_12345678-roejojoej", NHXParser.TAXONOMY_EXTRACTION.YES ); - - - if ( n31.getNodeData().isHasTaxonomy() ) { + final PhylogenyNode n31 = PhylogenyNode.createInstanceFromNhxString( "blah_12345678-roejojoej", + NHXParser.TAXONOMY_EXTRACTION.YES ); + if ( n31.getNodeData().isHasTaxonomy() ) { + return false; + } + final PhylogenyNode n32 = PhylogenyNode.createInstanceFromNhxString( "sd_12345678", + NHXParser.TAXONOMY_EXTRACTION.YES ); + if ( n32.getNodeData().isHasTaxonomy() ) { return false; } - // if ( !n31.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" ) ) { - // return false; - // } } catch ( final Exception e ) { e.printStackTrace( System.out );