import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.PhylogenyMethods;
import org.forester.phylogeny.PhylogenyMethods.DESCENDANT_SORT_PRIORITY;
-import org.forester.phylogeny.PhylogenyNode;
import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
import org.forester.phylogeny.factories.PhylogenyFactory;
-import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
import org.forester.util.CommandLineArguments;
import org.forester.util.ForesterUtil;
public class phyloxml_converter {
- final static private String HELP_OPTION_1 = "help";
- final static private String HELP_OPTION_2 = "h";
- final static private String FIELD_OPTION = "f";
- final static private String FIELD_CLADE_NAME = "nn";
- final static private String FIELD_TAXONOMY_CODE = "tc";
- final static private String FIELD_TAXONOMY_SCI_NAME = "sn";
- final static private String FIELD_TAXONOMY_COMM_NAME = "cn";
- final static private String FIELD_SEQUENCE_GENE_NAME = "gn";
- final static private String FIELD_SEQUENCE_SYMBOL = "sy";
- final static private String FIELD_UNIPROT_TAXONOMY_ID_SPLIT_1 = "i1";
- final static private String FIELD_UNIPROT_TAXONOMY_ID_SPLIT_2 = "i2";
- final static private String FIELD_DUMMY = "dummy";
- final static private String INTERNAL_NAMES_ARE_BOOT_SUPPPORT = "i";
- final static private String MIDPOINT_REROOT = "m";
- final static private String EXTRACT_TAXONOMY = "xt";
- final static private String EXTRACT_TAXONOMY_PF = "xp";
- final static private String ORDER_SUBTREES = "o";
- final static private String NO_TREE_LEVEL_INDENDATION = "ni";
- final static private String REPLACE_UNDER_SCORES = "ru";
- final static private String IGNORE_QUOTES = "iqs";
- final static private String PRG_NAME = "phyloxml_converter";
- final static private String PRG_VERSION = "1.302";
- final static private String PRG_DATE = "140516";
- final static private String E_MAIL = "phyloxml@gmail.com";
- final static private String WWW = "sites.google.com/site/cmzmasek/home/software/forester";
- final static private boolean SPECIAL = false;
+ final static private String HELP_OPTION_1 = "help";
+ final static private String HELP_OPTION_2 = "h";
+ final static private String FIELD_OPTION = "f";
+ final static private String FIELD_CLADE_NAME = "nn";
+ final static private String FIELD_TAXONOMY_CODE = "tc";
+ final static private String FIELD_TAXONOMY_SCI_NAME = "sn";
+ final static private String FIELD_TAXONOMY_COMM_NAME = "cn";
+ final static private String FIELD_SEQUENCE_GENE_NAME = "gn";
+ final static private String FIELD_SEQUENCE_SYMBOL = "sy";
+ final static private String FIELD_UNIPROT_TAXONOMY_ID_SPLIT_1 = "i1";
+ final static private String FIELD_UNIPROT_TAXONOMY_ID_SPLIT_2 = "i2";
+ final static private String FIELD_DUMMY = "dummy";
+ final static private String INTERNAL_NAMES_ARE_BOOT_SUPPPORT = "i";
+ final static private String MIDPOINT_REROOT = "m";
+ final static private String EXTRACT_TAXONOMY = "xt";
+ final static private String EXTRACT_TAXONOMY_PF = "xp";
+ final static private String ORDER_SUBTREES = "o";
+ final static private String NO_TREE_LEVEL_INDENDATION = "ni";
+ final static private String REPLACE_UNDER_SCORES = "ru";
+ final static private String IGNORE_QUOTES = "iqs";
+ final static private String CONFIDENCE_TYPE = "c";
+ final static private String PRG_NAME = "phyloxml_converter";
+ final static private String PRG_VERSION = "1.303";
+ final static private String PRG_DATE = "170510";
+ final static private String E_MAIL = "phyloxml@gmail.com";
+ final static private String WWW = "sites.google.com/site/cmzmasek/home/software/forester/phyloxml-converter";
public static void main( final String args[] ) throws PhyloXmlDataFormatException {
ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE, E_MAIL, WWW );
allowed_options.add( EXTRACT_TAXONOMY );
allowed_options.add( EXTRACT_TAXONOMY_PF );
allowed_options.add( IGNORE_QUOTES );
+ allowed_options.add( CONFIDENCE_TYPE );
if ( cla.getNumberOfNames() != 2 ) {
System.out.println();
System.out.println( "[" + PRG_NAME + "] incorrect number of arguments" );
else if ( field_option_value.equals( FIELD_DUMMY ) ) {
}
else {
- ForesterUtil.fatalError( PRG_NAME, "unknown value for -\"" + FIELD_OPTION + "\" option: \""
- + field_option_value + "\"" );
+ ForesterUtil
+ .fatalError( PRG_NAME,
+ "unknown value for -\"" + FIELD_OPTION + "\" option: \"" + field_option_value + "\"" );
}
boolean ignore_quotes = false;
if ( cla.isOptionSet( IGNORE_QUOTES ) ) {
if ( cla.isOptionSet( INTERNAL_NAMES_ARE_BOOT_SUPPPORT ) ) {
int_values_are_boots = true;
}
+ String conf_type = "unknown";
+ if ( cla.isOptionSet( CONFIDENCE_TYPE ) ) {
+ final String str = cla.getOptionValueAsCleanString( CONFIDENCE_TYPE );
+ if ( !ForesterUtil.isEmpty( str ) ) {
+ conf_type = str;
+ }
+ }
boolean midpoint_reroot = false;
if ( cla.isOptionSet( MIDPOINT_REROOT ) ) {
midpoint_reroot = true;
&& ( field != PhylogenyMethods.PhylogenyNodeField.TAXONOMY_SCIENTIFIC_NAME ) ) {
if ( extr_taxonomy_pf_only ) {
( ( NHXParser ) parser )
- .setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT );
+ .setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT );
replace_underscores = false;
}
else if ( extr_taxonomy ) {
( ( NHXParser ) parser )
- .setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED );
+ .setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED );
replace_underscores = false;
}
}
catch ( final IOException e ) {
ForesterUtil.fatalError( PRG_NAME, "failed to read phylogeny from [" + infile + "]: " + e.getMessage() );
}
- if ( SPECIAL ) {
- for( final Phylogeny phy : phys ) {
- performSpecialProcessing( phy );
- }
- }
if ( int_values_are_boots ) {
for( final Phylogeny phy : phys ) {
- PhylogenyMethods.transferInternalNamesToBootstrapSupport( phy );
+ PhylogenyMethods.transferInternalNamesToConfidenceValues( phy, conf_type );
}
}
if ( field != null ) {
System.out.println();
}
- private static void performSpecialProcessing( final Phylogeny phy ) {
- // Can place some kind of custom processing here.
- // final List<PhylogenyNode> remove_us = new ArrayList<PhylogenyNode>();
- // int counter = 0;
- // for( final PhylogenyNodeIterator it = phy.iteratorPostorder(); it.hasNext(); ) {
- // final PhylogenyNode node = it.next();
- // final String name = node.getNodeName().toLowerCase();
- // if ( name.startsWith( "environmental_samples" ) || name.startsWith( "unclassified" )
- // || name.startsWith( "bacteria" ) || name.startsWith( "other" )
- // || name.startsWith( "viroids" ) || name.startsWith( "viruses" ) ) {
- // remove_us.add( node );
- // System.out.println( counter++ );
- // }
- // }
- // phy.hashIDs();
- // for( final PhylogenyNode node : remove_us ) {
- // if ( phy.getNode( node.getNodeId() ) != null ) {
- // phy.deleteSubtree( node );
- // System.out.println( "deleted: " + node );
- // }
- // }
- // phy.hashIDs();
- //
- // for( final PhylogenyNodeIterator it = phy.iteratorPostorder(); it.hasNext(); ) {
- // final PhylogenyNode node = it.next();
- // node.getNodeData().setTaxonomy( null );
- // }
- // phy.reRoot( phy.getFirstExternalNode() );
- // PhylogenyMethods.midpointRoot( phy );
- // phy.orderAppearance( true );
- for( final PhylogenyNodeIterator it = phy.iteratorPostorder(); it.hasNext(); ) {
- final PhylogenyNode node = it.next();
- final String name = node.getName();
- if ( !ForesterUtil.isEmpty( name ) ) {
- // final Taxonomy taxo = new Taxonomy();
- // if ( node.isExternal() ) {
- // taxo.setTaxonomyCode( name );
- // node.getNodeData().setTaxonomy( taxo );
- // }
- // else if ( name.indexOf( '_' ) == -1 || name.length() > 6 ) {
- // taxo.setScientificName( name );
- // node.getNodeData().setTaxonomy( taxo );
- // }
- // node.setName( "" );
- // if ( name.indexOf( "BF" ) >= 0 ) {
- // taxo.setTaxonomyCode( "BACFR" );
- // }
- // else if ( name.indexOf( "BT" ) >= 0 ) {
- // taxo.setTaxonomyCode( "BACTN" );
- // }
- // else if ( name.indexOf( "MXAN" ) >= 0 ) {
- // taxo.setTaxonomyCode( "MYXXD" );
- // }
- // else if ( name.indexOf( "STIAU" ) >= 0 ) {
- // taxo.setTaxonomyCode( "STIAU" );
- // }
- // else if ( name.indexOf( "BOVA" ) >= 0 ) {
- // taxo.setTaxonomyCode( "BACOV" );
- // }
- // else if ( name.indexOf( "BUNI" ) >= 0 ) {
- // taxo.setTaxonomyCode( "BACUN" );
- // }
- // else if ( name.indexOf( "Pgin" ) >= 0 ) {
- // taxo.setTaxonomyCode( "PORGI" );
- // }
- // else if ( name.equals( "3CGH" ) || name.equals( "3CK7" ) ) {
- // taxo.setTaxonomyCode( "BACTN" );
- // }
- // node.getNodeData().setTaxonomy( taxo );
- }
- }
- }
-
+ /**
+  * Prints the command-line usage summary to standard output: the invocation
+  * synopsis, followed by the supported field options and the general options.
+  */
private static void printHelp() {
System.out.println( "Usage:" );
System.out.println();
- System.out
- .println( PRG_NAME
- + " -"
- + FIELD_OPTION
- + "=<field option> [options] <infile in New Hamphshire, NHX, Nexus, ToL XML, or phyloXML format> <outfile>" );
+ System.out.println( PRG_NAME + " -" + FIELD_OPTION
+ + "=<field option> [options] <infile in New Hampshire, NHX, Nexus, ToL XML, or phyloXML format> <outfile>" );
System.out.println();
System.out.println( " field options: " );
System.out.println();
System.out.println( " " + FIELD_SEQUENCE_SYMBOL + ": transfer name to sequence symbol" );
System.out.println( " " + FIELD_DUMMY + ": to convert NHX formatted trees to phyloXML" );
System.out.println( " " + FIELD_UNIPROT_TAXONOMY_ID_SPLIT_1
- + ": transfer/split name to taxonomy uniprot identifier" );
+ + ": transfer/split name to taxonomy uniprot identifier" );
System.out.println( " (split at underscore if \"id_name\" pattern, e.g. \"817_SusD\")" );
System.out.println( " " + FIELD_UNIPROT_TAXONOMY_ID_SPLIT_2
- + ": transfer/split name to taxonomy uniprot identifier" );
+ + ": transfer/split name to taxonomy uniprot identifier" );
System.out.println( " (split at underscore if \"name_id\" pattern, e.g. \"SusD_817\")" );
System.out.println();
System.out.println( " options: " );
System.out.println( " -" + INTERNAL_NAMES_ARE_BOOT_SUPPPORT
- + " : internal names in NH or NHX tree are bootstrap support values" );
- System.out.println( " -" + REPLACE_UNDER_SCORES + " : replace all underscores with spaces" );
- System.out.println( " -" + MIDPOINT_REROOT + " : midpoint reroot" );
- System.out.println( " -" + ORDER_SUBTREES + " : order subtrees" );
- System.out
- .println( " -"
- + EXTRACT_TAXONOMY
- + " : extract taxonomy to taxonomy code from \"seqname_TAXON\"-style names (cannot be used with the following field options: "
+ + " : internal names in NH or NHX tree are confidence values" );
+ System.out.println( " -" + CONFIDENCE_TYPE + "=<conf>"
+ + ": confidence type (e.g. \"bootstrap\", default is \"unknown\")" );
+ System.out.println( " -" + REPLACE_UNDER_SCORES + " : replace all underscores with spaces" );
+ System.out.println( " -" + MIDPOINT_REROOT + " : midpoint reroot" );
+ System.out.println( " -" + ORDER_SUBTREES + " : order subtrees" );
+ System.out.println( " -" + EXTRACT_TAXONOMY
+ + " : extract taxonomy to taxonomy code from \"seqname_TAXON\"-style names (cannot be used with the following field options: "
+ FIELD_TAXONOMY_CODE + ", " + FIELD_TAXONOMY_COMM_NAME + ", " + FIELD_TAXONOMY_SCI_NAME + ")" );
- System.out
- .println( " -"
- + EXTRACT_TAXONOMY_PF
- + " : extract taxonomy to taxonomy code from Pfam (\"seqname_TAXON/x-y\") style names only (cannot be used with the following field options: "
+ System.out.println( " -" + EXTRACT_TAXONOMY_PF
+ + " : extract taxonomy to taxonomy code from Pfam (\"seqname_TAXON/x-y\") style names only (cannot be used with the following field options: "
+ FIELD_TAXONOMY_CODE + ", " + FIELD_TAXONOMY_COMM_NAME + ", " + FIELD_TAXONOMY_SCI_NAME + ")" );
- System.out.println( " -" + NO_TREE_LEVEL_INDENDATION + " : no tree level indendation in phyloXML output" );
- System.out.println( " -" + IGNORE_QUOTES + ": ignore quotes and whitespace (e.g. \"a b\" becomes ab)" );
+ System.out.println( " -" + NO_TREE_LEVEL_INDENDATION + " : no tree level indentation in phyloXML output" );
+ System.out.println( " -" + IGNORE_QUOTES + " : ignore quotes and whitespace (e.g. \"a b\" becomes ab)" );
System.out.println();
}
}