moveNodeNamesToSeqNames();
}
else if ( o == _extract_tax_code_from_node_names_jmi ) {
- extractTaxCodeFromNodeNames();
+ extractTaxDataFromNodeNames();
}
else if ( o == _gsdi_item ) {
if ( isSubtreeDisplayed() ) {
customizeJMenuItem( _move_node_names_to_seq_names_jmi );
_move_node_names_to_seq_names_jmi.setToolTipText( "To interpret node names as sequence (protein, gene) names" );
_tools_menu
- .add( _extract_tax_code_from_node_names_jmi = new JMenuItem( "Extract Taxonomic Codes from Node Names" ) );
+ .add( _extract_tax_code_from_node_names_jmi = new JMenuItem( "Extract Taxonomic Codes or Ids from Node Names" ) );
customizeJMenuItem( _extract_tax_code_from_node_names_jmi );
_extract_tax_code_from_node_names_jmi
- .setToolTipText( "To extract taxonomic codes (mnemonics) from nodes names in the form of 'xyz_ECOLI'" );
+ .setToolTipText( "To extract taxonomic codes (mnemonics) from nodes names in the form of 'xyz_ECOLI', or Uniprot identifiers from nodes names in the form of 'xyz_1234567'" );
_tools_menu.addSeparator();
_tools_menu
.add( _obtain_detailed_taxonomic_information_jmi = new JMenuItem( OBTAIN_DETAILED_TAXONOMIC_INFORMATION ) );
}
}
- private void extractTaxCodeFromNodeNames() throws PhyloXmlDataFormatException {
+ private void extractTaxDataFromNodeNames() throws PhyloXmlDataFormatException {
+ final StringBuilder sb = new StringBuilder();
+ final StringBuilder sb_failed = new StringBuilder();
+ int counter = 0;
+ int counter_failed = 0;
if ( getCurrentTreePanel() != null ) {
final Phylogeny phy = getCurrentTreePanel().getPhylogeny();
if ( ( phy != null ) && !phy.isEmpty() ) {
- final PhylogenyNodeIterator it = phy.iteratorPostorder();
+ final PhylogenyNodeIterator it = phy.iteratorExternalForward();
while ( it.hasNext() ) {
final PhylogenyNode n = it.next();
final String name = n.getName().trim();
if ( !ForesterUtil.isEmpty( name ) ) {
+ final String nt = ParserUtils.extractTaxonomyDataFromNodeName( n, TAXONOMY_EXTRACTION.YES );
+ if ( !ForesterUtil.isEmpty( nt ) ) {
+ if ( counter < 15 ) {
+ sb.append( name + ": " + nt + "\n" );
+ }
+ else if ( counter == 15 ) {
+ sb.append( "...\n" );
+ }
+ counter++;
+ }
+ else {
+ if ( counter_failed < 15 ) {
+ sb_failed.append( name + "\n" );
+ }
+ else if ( counter_failed == 15 ) {
+ sb_failed.append( "...\n" );
+ }
+ counter_failed++;
+ }
+ }
+ }
+ if ( counter > 0 ) {
+ String failed = "";
+ if ( counter_failed > 0 ) {
+ failed = "\nDid not extract taxonomic data for " + counter_failed + " (named) external nodes:\n" + sb_failed;
- ParserUtils.extractTaxonomyDataFromNodeName( n, TAXONOMY_EXTRACTION.YES );
-
- // final String code = ParserUtils
- // .extractTaxonomyCodeFromNodeName( name, NHXParser.TAXONOMY_EXTRACTION.YES );
- // if ( !ForesterUtil.isEmpty( code ) ) {
- // PhylogenyMethods.setTaxonomyCode( n, code );
- // }
}
+ JOptionPane.showMessageDialog( this,
+ "Successfully extracted taxonomic data from " + counter
+ + " external nodes:\n" + sb.toString() + failed,
+ "Taxonomic Data Extraction Successfully Completed",
+ JOptionPane.INFORMATION_MESSAGE );
+ }
+ else {
+ JOptionPane
+ .showMessageDialog( this,
+ "Could not extract any taxonomic data, maybe node names are empty\nor not in the form \"XYZ_CAEEL\", \"XYZ_CAEEL/12-394\", or \"XYZ_1234567\"?",
+ "No Taxonomic Data Extracted",
+ JOptionPane.WARNING_MESSAGE );
}
}
}
public final class ParserUtils {
- final public static Pattern TAXOMONY_CODE_PATTERN_1 = Pattern.compile( "[A-Z0-9]{5}|RAT|PIG|PEA|CAP" );
- final private static Pattern TAXOMONY_CODE_PATTERN_2 = Pattern
- .compile( "([A-Z0-9]{5}|RAT|PIG|PEA|CAP)[^A-Za-z].*" );
- final private static Pattern TAXOMONY_CODE_PATTERN_PF = Pattern.compile( "([A-Z0-9]{5}|RAT|PIG|PEA|CAP)/\\d+-\\d+" );
-
-
- final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_1 = Pattern.compile( "\\d{1,7}" );
- final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_2 = Pattern
- .compile( "(\\d{1,7})[^A-Za-z].*" );
+ // A complete taxonomy code token: five upper-case letters/digits, or one of the
+ // three-letter codes RAT, PIG, PEA, CAP. The alternatives are wrapped in a
+ // non-capturing group so that both word boundaries apply to every alternative;
+ // without the group, '|' binds looser than \b and the boundaries would only
+ // guard the first and the last alternative.
+ final public static Pattern TAXOMONY_CODE_PATTERN_1 = Pattern.compile( "\\b(?:[A-Z0-9]{5}|RAT|PIG|PEA|CAP)\\b" );
+ final private static Pattern TAXOMONY_CODE_PATTERN_2 = Pattern
+ .compile( "([A-Z0-9]{5}|RAT|PIG|PEA|CAP)[^0-9A-Za-z].*" );
+ final private static Pattern TAXOMONY_CODE_PATTERN_PF = Pattern
+ .compile( "([A-Z0-9]{5}|RAT|PIG|PEA|CAP)/\\d+-\\d+" );
+ final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_1 = Pattern.compile( "\\b\\d{1,7}\\b" );
+ final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_2 = Pattern.compile( "(\\d{1,7})[^0-9A-Za-z].*" );
final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_PF = Pattern.compile( "(\\d{1,7})/\\d+-\\d+" );
-
final public static PhylogenyParser createParserDependingFileContents( final File file,
final boolean phyloxml_validate_against_xsd )
throws FileNotFoundException, IOException {
}
return null;
}
-
+
public final static String extractUniprotTaxonomyIdFromNodeName( final String name,
- final TAXONOMY_EXTRACTION taxonomy_extraction ) {
+ final TAXONOMY_EXTRACTION taxonomy_extraction ) {
if ( ( name.indexOf( "_" ) > 0 )
&& ( ( taxonomy_extraction != TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ) || ( name.indexOf( "/" ) > 4 ) ) ) {
final String[] s = name.split( "[_\\s]" );
return readPhylogenies( new File( file_name ) );
}
- public final static void extractTaxonomyDataFromNodeName( final PhylogenyNode node,
- final NHXParser.TAXONOMY_EXTRACTION taxonomy_extraction )
+ public final static String extractTaxonomyDataFromNodeName( final PhylogenyNode node,
+ final NHXParser.TAXONOMY_EXTRACTION taxonomy_extraction )
throws PhyloXmlDataFormatException {
final String id = extractUniprotTaxonomyIdFromNodeName( node.getName(), taxonomy_extraction );
if ( !ForesterUtil.isEmpty( id ) ) {
if ( !node.getNodeData().isHasTaxonomy() ) {
node.getNodeData().setTaxonomy( new Taxonomy() );
}
- node.getNodeData().getTaxonomy().setIdentifier( new Identifier( id, "uniprot" ) );
+ if ( node.getNodeData().getTaxonomy().getIdentifier() == null || ForesterUtil.isEmpty( node.getNodeData().getTaxonomy().getIdentifier().getValue() ) ) {
+ node.getNodeData().getTaxonomy().setIdentifier( new Identifier( id, "uniprot" ) );
+ return id;
+ }
}
else {
final String code = extractTaxonomyCodeFromNodeName( node.getName(), taxonomy_extraction );
if ( !node.getNodeData().isHasTaxonomy() ) {
node.getNodeData().setTaxonomy( new Taxonomy() );
}
- node.getNodeData().getTaxonomy().setTaxonomyCode( code );
+ if ( ForesterUtil.isEmpty( node.getNodeData().getTaxonomy().getTaxonomyCode() ) ) {
+ node.getNodeData().getTaxonomy().setTaxonomyCode( code );
+ return code;
+ }
}
}
+ return null;
}
}
System.out.println( "failed." );
failed++;
}
- System.out.print( "Taxonomy extraction: " );
+ System.out.print( "Taxonomy code extraction: " );
if ( Test.testExtractTaxonomyCodeFromNodeName() ) {
System.out.println( "OK." );
succeeded++;
System.out.println( "failed." );
failed++;
}
+ System.out.print( "Taxonomy extraction (general): " );
+ if ( Test.testTaxonomyExtraction() ) {
+ System.out.println( "OK." );
+ succeeded++;
+ }
+ else {
+ System.out.println( "failed." );
+ failed++;
+ }
System.out.print( "Basic node construction and parsing of NHX (node level): " );
if ( Test.testNHXNodeParsing() ) {
System.out.println( "OK." );
System.out.println( "failed." );
failed++;
}
+
System.out.print( "Conversion to NHX (node level): " );
if ( Test.testNHXconversion() ) {
System.out.println( "OK." );
return true;
}
-
private static boolean testNodeRemoval() {
try {
final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
final Phylogeny t0 = factory.create( "((a)b)", new NHXParser() )[ 0 ];
PhylogenyMethods.removeNode( t0.getNode( "b" ), t0 );
-
if ( !t0.toNewHampshire().equals( "(a);" ) ) {
return false;
}
final Phylogeny t1 = factory.create( "((a:2)b:4)", new NHXParser() )[ 0 ];
PhylogenyMethods.removeNode( t1.getNode( "b" ), t1 );
-
if ( !t1.toNewHampshire().equals( "(a:6.0);" ) ) {
return false;
}
final Phylogeny t2 = factory.create( "((a,b),c)", new NHXParser() )[ 0 ];
PhylogenyMethods.removeNode( t2.getNode( "b" ), t2 );
-
if ( !t2.toNewHampshire().equals( "((a),c);" ) ) {
return false;
}
-
}
catch ( final Exception e ) {
e.printStackTrace( System.out );
}
return true;
}
-
+
private static boolean testMidpointrooting() {
try {
final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
return true;
}
+ /**
+  * Checks taxonomy extraction from plain node names parsed with
+  * NHXParser.TAXONOMY_EXTRACTION.YES.
+  * <p>
+  * Every case names a node string and what the resulting node is expected to
+  * carry: a taxonomy identifier value, a taxonomy code, or no taxonomy at all.
+  * Cases are evaluated in order and the first mismatch ends the test; the
+  * offending node is printed to standard output so the failing input can be
+  * identified.
+  *
+  * @return true if all cases behave as expected; false on the first mismatch
+  *         or if any exception is thrown while creating or inspecting a node
+  */
+ private static boolean testTaxonomyExtraction() {
+     // Each row: { node name, expected identifier value, expected taxonomy code }.
+     // A row with both expectations null means the node must not have taxonomy.
+     final String[][] cases = { { "sd_12345678", null, null }, // eight digits
+             { "sd_12345x", null, null }, // digits followed by a lower-case letter
+             { "12345", "12345", null }, //
+             { "blag_12345", "12345", null }, //
+             { "blag-12345", null, null }, // '-' instead of '_' before the digits
+             { "12345-blag", null, null }, //
+             { "blag-12345-blag", null, null }, //
+             { "blag-12345_blag", null, null }, //
+             { "blag_12345-blag", "12345", null }, //
+             { "blag_12345_blag", "12345", null }, //
+             { "blag_12X45-blag", null, "12X45" } // not all digits: expected as a code
+     };
+     try {
+         for( final String[] c : cases ) {
+             final PhylogenyNode n = PhylogenyNode.createInstanceFromNhxString( c[ 0 ],
+                                                                                 NHXParser.TAXONOMY_EXTRACTION.YES );
+             final boolean ok;
+             if ( c[ 1 ] != null ) {
+                 ok = c[ 1 ].equals( n.getNodeData().getTaxonomy().getIdentifier().getValue() );
+             }
+             else if ( c[ 2 ] != null ) {
+                 ok = c[ 2 ].equals( n.getNodeData().getTaxonomy().getTaxonomyCode() );
+             }
+             else {
+                 ok = !n.getNodeData().isHasTaxonomy();
+             }
+             if ( !ok ) {
+                 System.out.println( n.toString() );
+                 return false;
+             }
+         }
+     }
+     catch ( final Exception e ) {
+         // A missing taxonomy or identifier on a node that should have one
+         // surfaces here as an exception; it counts as a test failure.
+         e.printStackTrace( System.out );
+         return false;
+     }
+     return true;
+ }
+
private static boolean testNHXNodeParsing() {
try {
final PhylogenyNode n1 = new PhylogenyNode();
if ( !e2.getName().equals( "n10_RAT1" ) ) {
return false;
}
- if ( !PhylogenyMethods.getSpecies( e2 ).equals( "RAT" ) ) {
+ if ( PhylogenyMethods.getSpecies( e2 ).equals( "RAT" ) ) {
return false;
}
final PhylogenyNode e3 = PhylogenyNode.createInstanceFromNhxString( "n10_RAT~",
if ( PhylogenyMethods.getSpecies( n13 ).equals( "12345" ) ) {
return false;
}
- if ( !n13.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "12345" ) ) {
+ if ( !n13.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "12345" ) ) {
return false;
}
- if ( !n13.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" ) ) {
+ if ( !n13.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" ) ) {
return false;
}
final PhylogenyNode n14 = PhylogenyNode
if ( !isEqual( n18.getBranchData().getConfidence( 0 ).getValue(), 91 ) ) {
return false;
}
-
-
- //
- final PhylogenyNode n19 = PhylogenyNode
- .createInstanceFromNhxString( "blah_1-roejojoej", NHXParser.TAXONOMY_EXTRACTION.YES );
-
-
- if ( !n19.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "1" ) ) {
+ final PhylogenyNode n19 = PhylogenyNode.createInstanceFromNhxString( "blah_1-roejojoej",
+ NHXParser.TAXONOMY_EXTRACTION.YES );
+ if ( !n19.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "1" ) ) {
return false;
}
- if ( !n19.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" ) ) {
+ if ( !n19.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" ) ) {
return false;
}
- final PhylogenyNode n30 = PhylogenyNode
- .createInstanceFromNhxString( "blah_1234567-roejojoej", NHXParser.TAXONOMY_EXTRACTION.YES );
-
-
- if ( !n30.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "1234567" ) ) {
+ final PhylogenyNode n30 = PhylogenyNode.createInstanceFromNhxString( "blah_1234567-roejojoej",
+ NHXParser.TAXONOMY_EXTRACTION.YES );
+ if ( !n30.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "1234567" ) ) {
return false;
}
- if ( !n30.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" ) ) {
+ if ( !n30.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" ) ) {
return false;
}
- final PhylogenyNode n31 = PhylogenyNode
- .createInstanceFromNhxString( "blah_12345678-roejojoej", NHXParser.TAXONOMY_EXTRACTION.YES );
-
-
- if ( n31.getNodeData().isHasTaxonomy() ) {
+ final PhylogenyNode n31 = PhylogenyNode.createInstanceFromNhxString( "blah_12345678-roejojoej",
+ NHXParser.TAXONOMY_EXTRACTION.YES );
+ if ( n31.getNodeData().isHasTaxonomy() ) {
+ return false;
+ }
+ final PhylogenyNode n32 = PhylogenyNode.createInstanceFromNhxString( "sd_12345678",
+ NHXParser.TAXONOMY_EXTRACTION.YES );
+ if ( n32.getNodeData().isHasTaxonomy() ) {
return false;
}
- // if ( !n31.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" ) ) {
- // return false;
- // }
}
catch ( final Exception e ) {
e.printStackTrace( System.out );