From 3929e5396afcdc3c79e7e4421c9fd444bb2e7a1b Mon Sep 17 00:00:00 2001 From: cmzmasek Date: Wed, 10 May 2017 11:37:08 -0700 Subject: [PATCH] in progress --- .../forester/application/phyloxml_converter.java | 188 ++++++-------------- .../org/forester/phylogeny/PhylogenyMethods.java | 4 +- .../java/src/org/forester/util/ForesterUtil.java | 17 +- 3 files changed, 68 insertions(+), 141 deletions(-) diff --git a/forester/java/src/org/forester/application/phyloxml_converter.java b/forester/java/src/org/forester/application/phyloxml_converter.java index 0dac775..b827750 100644 --- a/forester/java/src/org/forester/application/phyloxml_converter.java +++ b/forester/java/src/org/forester/application/phyloxml_converter.java @@ -40,41 +40,39 @@ import org.forester.io.writers.PhylogenyWriter; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyMethods; import org.forester.phylogeny.PhylogenyMethods.DESCENDANT_SORT_PRIORITY; -import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; import org.forester.phylogeny.factories.PhylogenyFactory; -import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.util.CommandLineArguments; import org.forester.util.ForesterUtil; public class phyloxml_converter { - final static private String HELP_OPTION_1 = "help"; - final static private String HELP_OPTION_2 = "h"; - final static private String FIELD_OPTION = "f"; - final static private String FIELD_CLADE_NAME = "nn"; - final static private String FIELD_TAXONOMY_CODE = "tc"; - final static private String FIELD_TAXONOMY_SCI_NAME = "sn"; - final static private String FIELD_TAXONOMY_COMM_NAME = "cn"; - final static private String FIELD_SEQUENCE_GENE_NAME = "gn"; - final static private String FIELD_SEQUENCE_SYMBOL = "sy"; - final static private String FIELD_UNIPROT_TAXONOMY_ID_SPLIT_1 = "i1"; - final static private String FIELD_UNIPROT_TAXONOMY_ID_SPLIT_2 = "i2"; - final static private String FIELD_DUMMY = "dummy"; - final static private String INTERNAL_NAMES_ARE_BOOT_SUPPPORT = "i"; - final static private String MIDPOINT_REROOT = "m"; - final static private String EXTRACT_TAXONOMY = "xt"; - final static private String EXTRACT_TAXONOMY_PF = "xp"; - final static private String ORDER_SUBTREES = "o"; - final static private String NO_TREE_LEVEL_INDENDATION = "ni"; - final static private String REPLACE_UNDER_SCORES = "ru"; - final static private String IGNORE_QUOTES = "iqs"; - final static private String PRG_NAME = "phyloxml_converter"; - final static private String PRG_VERSION = "1.302"; - final static private String PRG_DATE = "140516"; - final static private String E_MAIL = "phyloxml@gmail.com"; - final static private String WWW = "sites.google.com/site/cmzmasek/home/software/forester"; - final static private boolean SPECIAL = false; + final static private String HELP_OPTION_1 = "help"; + final static private String HELP_OPTION_2 = "h"; + final static private String FIELD_OPTION = "f"; + final static private String FIELD_CLADE_NAME = "nn"; + final static private String FIELD_TAXONOMY_CODE = "tc"; + final static private String FIELD_TAXONOMY_SCI_NAME = "sn"; + final static private String FIELD_TAXONOMY_COMM_NAME = "cn"; + final static private String FIELD_SEQUENCE_GENE_NAME = "gn"; + final static private String FIELD_SEQUENCE_SYMBOL = "sy"; + final static private String FIELD_UNIPROT_TAXONOMY_ID_SPLIT_1 = "i1"; + final static private String FIELD_UNIPROT_TAXONOMY_ID_SPLIT_2 = "i2"; + final static private String FIELD_DUMMY = "dummy"; + final static private String INTERNAL_NAMES_ARE_BOOT_SUPPPORT = "i"; + final static private String MIDPOINT_REROOT = "m"; + final static private String EXTRACT_TAXONOMY = "xt"; + final static private String EXTRACT_TAXONOMY_PF = "xp"; + final static private String ORDER_SUBTREES = "o"; + final static private String NO_TREE_LEVEL_INDENDATION = "ni"; + final static private String REPLACE_UNDER_SCORES = "ru"; + final static private String IGNORE_QUOTES = "iqs"; + final static private String CONFIDENCE_TYPE = "c"; + final static private String PRG_NAME = "phyloxml_converter"; + final static private String PRG_VERSION = "1.303"; + final static private String PRG_DATE = "170510"; + final static private String E_MAIL = "phyloxml@gmail.com"; + final static private String WWW = "sites.google.com/site/cmzmasek/home/software/forester/phyloxml-converter"; public static void main( final String args[] ) throws PhyloXmlDataFormatException { ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE, E_MAIL, WWW ); @@ -106,6 +104,7 @@ public class phyloxml_converter { allowed_options.add( EXTRACT_TAXONOMY ); allowed_options.add( EXTRACT_TAXONOMY_PF ); allowed_options.add( IGNORE_QUOTES ); + allowed_options.add( CONFIDENCE_TYPE ); if ( cla.getNumberOfNames() != 2 ) { System.out.println(); System.out.println( "[" + PRG_NAME + "] incorrect number of arguments" ); @@ -157,8 +156,9 @@ public class phyloxml_converter { else if ( field_option_value.equals( FIELD_DUMMY ) ) { } else { - ForesterUtil.fatalError( PRG_NAME, "unknown value for -\"" + FIELD_OPTION + "\" option: \"" - + field_option_value + "\"" ); + ForesterUtil + .fatalError( PRG_NAME, + "unknown value for -\"" + FIELD_OPTION + "\" option: \"" + field_option_value + "\"" ); } boolean ignore_quotes = false; if ( cla.isOptionSet( IGNORE_QUOTES ) ) { @@ -168,6 +168,13 @@ public class phyloxml_converter { if ( cla.isOptionSet( INTERNAL_NAMES_ARE_BOOT_SUPPPORT ) ) { int_values_are_boots = true; } + String conf_type = "unknown"; + if ( cla.isOptionSet( CONFIDENCE_TYPE ) ) { + final String str = cla.getOptionValueAsCleanString( CONFIDENCE_TYPE ); + if ( !ForesterUtil.isEmpty( str ) ) { + conf_type = str; + } + } boolean midpoint_reroot = false; if ( cla.isOptionSet( MIDPOINT_REROOT ) ) { midpoint_reroot = true; @@ -210,12 +217,12 @@ public class phyloxml_converter { && ( field != PhylogenyMethods.PhylogenyNodeField.TAXONOMY_SCIENTIFIC_NAME ) ) { if ( extr_taxonomy_pf_only ) { ( ( NHXParser ) parser ) - .setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ); + .setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ); replace_underscores = false; } else if ( extr_taxonomy ) { ( ( NHXParser ) parser ) - .setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); + .setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); replace_underscores = false; } } @@ -234,14 +241,9 @@ public class phyloxml_converter { catch ( final IOException e ) { ForesterUtil.fatalError( PRG_NAME, "failed to read phylogeny from [" + infile + "]: " + e.getMessage() ); } - if ( SPECIAL ) { - for( final Phylogeny phy : phys ) { - performSpecialProcessing( phy ); - } - } if ( int_values_are_boots ) { for( final Phylogeny phy : phys ) { - PhylogenyMethods.transferInternalNamesToBootstrapSupport( phy ); + PhylogenyMethods.transferInternalNamesToConfidenceValues( phy, conf_type ); } } if ( field != null ) { @@ -283,87 +285,11 @@ public class phyloxml_converter { System.out.println(); } - private static void performSpecialProcessing( final Phylogeny phy ) { - // Can place some kind of custom processing here. - // final List remove_us = new ArrayList(); - // int counter = 0; - // for( final PhylogenyNodeIterator it = phy.iteratorPostorder(); it.hasNext(); ) { - // final PhylogenyNode node = it.next(); - // final String name = node.getNodeName().toLowerCase(); - // if ( name.startsWith( "environmental_samples" ) || name.startsWith( "unclassified" ) - // || name.startsWith( "bacteria" ) || name.startsWith( "other" ) - // || name.startsWith( "viroids" ) || name.startsWith( "viruses" ) ) { - // remove_us.add( node ); - // System.out.println( counter++ ); - // } - // } - // phy.hashIDs(); - // for( final PhylogenyNode node : remove_us ) { - // if ( phy.getNode( node.getNodeId() ) != null ) { - // phy.deleteSubtree( node ); - // System.out.println( "deleted: " + node ); - // } - // } - // phy.hashIDs(); - // - // for( final PhylogenyNodeIterator it = phy.iteratorPostorder(); it.hasNext(); ) { - // final PhylogenyNode node = it.next(); - // node.getNodeData().setTaxonomy( null ); - // } - // phy.reRoot( phy.getFirstExternalNode() ); - // PhylogenyMethods.midpointRoot( phy ); - // phy.orderAppearance( true ); - for( final PhylogenyNodeIterator it = phy.iteratorPostorder(); it.hasNext(); ) { - final PhylogenyNode node = it.next(); - final String name = node.getName(); - if ( !ForesterUtil.isEmpty( name ) ) { - // final Taxonomy taxo = new Taxonomy(); - // if ( node.isExternal() ) { - // taxo.setTaxonomyCode( name ); - // node.getNodeData().setTaxonomy( taxo ); - // } - // else if ( name.indexOf( '_' ) == -1 || name.length() > 6 ) { - // taxo.setScientificName( name ); - // node.getNodeData().setTaxonomy( taxo ); - // } - // node.setName( "" ); - // if ( name.indexOf( "BF" ) >= 0 ) { - // taxo.setTaxonomyCode( "BACFR" ); - // } - // else if ( name.indexOf( "BT" ) >= 0 ) { - // taxo.setTaxonomyCode( "BACTN" ); - // } - // else if ( name.indexOf( "MXAN" ) >= 0 ) { - // taxo.setTaxonomyCode( "MYXXD" ); - // } - // else if ( name.indexOf( "STIAU" ) >= 0 ) { - // taxo.setTaxonomyCode( "STIAU" ); - // } - // else if ( name.indexOf( "BOVA" ) >= 0 ) { - // taxo.setTaxonomyCode( "BACOV" ); - // } - // else if ( name.indexOf( "BUNI" ) >= 0 ) { - // taxo.setTaxonomyCode( "BACUN" ); - // } - // else if ( name.indexOf( "Pgin" ) >= 0 ) { - // taxo.setTaxonomyCode( "PORGI" ); - // } - // else if ( name.equals( "3CGH" ) || name.equals( "3CK7" ) ) { - // taxo.setTaxonomyCode( "BACTN" ); - // } - // node.getNodeData().setTaxonomy( taxo ); - } - } - } - private static void printHelp() { System.out.println( "Usage:" ); System.out.println(); - System.out - .println( PRG_NAME - + " -" - + FIELD_OPTION - + "= [options] " ); + System.out.println( PRG_NAME + " -" + FIELD_OPTION + + "= [options] " ); System.out.println(); System.out.println( " field options: " ); System.out.println(); @@ -375,30 +301,28 @@ public class phyloxml_converter { System.out.println( " " + FIELD_SEQUENCE_SYMBOL + ": transfer name to sequence symbol" ); System.out.println( " " + FIELD_DUMMY + ": to convert NHX formatted trees to phyloXML" ); System.out.println( " " + FIELD_UNIPROT_TAXONOMY_ID_SPLIT_1 - + ": transfer/split name to taxonomy uniprot identifier" ); + + ": transfer/split name to taxonomy uniprot identifier" ); System.out.println( " (split at underscore if \"id_name\" pattern, e.g. \"817_SusD\")" ); System.out.println( " " + FIELD_UNIPROT_TAXONOMY_ID_SPLIT_2 - + ": transfer/split name to taxonomy uniprot identifier" ); + + ": transfer/split name to taxonomy uniprot identifier" ); System.out.println( " (split at underscore if \"name_id\" pattern, e.g. \"SusD_817\")" ); System.out.println(); System.out.println( " options: " ); System.out.println( " -" + INTERNAL_NAMES_ARE_BOOT_SUPPPORT - + " : internal names in NH or NHX tree are bootstrap support values" ); - System.out.println( " -" + REPLACE_UNDER_SCORES + " : replace all underscores with spaces" ); - System.out.println( " -" + MIDPOINT_REROOT + " : midpoint reroot" ); - System.out.println( " -" + ORDER_SUBTREES + " : order subtrees" ); - System.out - .println( " -" - + EXTRACT_TAXONOMY - + " : extract taxonomy to taxonomy code from \"seqname_TAXON\"-style names (cannot be used with the following field options: " + + " : internal names in NH or NHX tree are confidence values" ); + System.out.println( " -" + CONFIDENCE_TYPE + "=" + + ": confidence type (e.g. \"bootstrap\", default is \"unknown\")" ); + System.out.println( " -" + REPLACE_UNDER_SCORES + " : replace all underscores with spaces" ); + System.out.println( " -" + MIDPOINT_REROOT + " : midpoint reroot" ); + System.out.println( " -" + ORDER_SUBTREES + " : order subtrees" ); + System.out.println( " -" + EXTRACT_TAXONOMY + + " : extract taxonomy to taxonomy code from \"seqname_TAXON\"-style names (cannot be used with the following field options: " + FIELD_TAXONOMY_CODE + ", " + FIELD_TAXONOMY_COMM_NAME + ", " + FIELD_TAXONOMY_SCI_NAME + ")" ); - System.out - .println( " -" - + EXTRACT_TAXONOMY_PF - + " : extract taxonomy to taxonomy code from Pfam (\"seqname_TAXON/x-y\") style names only (cannot be used with the following field options: " + System.out.println( " -" + EXTRACT_TAXONOMY_PF + + " : extract taxonomy to taxonomy code from Pfam (\"seqname_TAXON/x-y\") style names only (cannot be used with the following field options: " + FIELD_TAXONOMY_CODE + ", " + FIELD_TAXONOMY_COMM_NAME + ", " + FIELD_TAXONOMY_SCI_NAME + ")" ); - System.out.println( " -" + NO_TREE_LEVEL_INDENDATION + " : no tree level indendation in phyloXML output" ); - System.out.println( " -" + IGNORE_QUOTES + ": ignore quotes and whitespace (e.g. \"a b\" becomes ab)" ); + System.out.println( " -" + NO_TREE_LEVEL_INDENDATION + " : no tree level indendation in phyloXML output" ); + System.out.println( " -" + IGNORE_QUOTES + " : ignore quotes and whitespace (e.g. \"a b\" becomes ab)" ); System.out.println(); } } diff --git a/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java b/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java index ee17de9..150177b 100644 --- a/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java +++ b/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java @@ -1543,7 +1543,7 @@ public class PhylogenyMethods { return nodes_to_delete; } - final static public void transferInternalNamesToBootstrapSupport( final Phylogeny phy ) { + final static public void transferInternalNamesToConfidenceValues( final Phylogeny phy, final String confidence_type ) { final PhylogenyNodeIterator it = phy.iteratorPostorder(); while ( it.hasNext() ) { final PhylogenyNode n = it.next(); @@ -1557,7 +1557,7 @@ public class PhylogenyMethods { + e.getLocalizedMessage() ); } if ( value >= 0.0 ) { - n.getBranchData().addConfidence( new Confidence( value, "bootstrap" ) ); + n.getBranchData().addConfidence( new Confidence( value, confidence_type ) ); n.setName( "" ); } } diff --git a/forester/java/src/org/forester/util/ForesterUtil.java b/forester/java/src/org/forester/util/ForesterUtil.java index 8f0744e..2374654 100644 --- a/forester/java/src/org/forester/util/ForesterUtil.java +++ b/forester/java/src/org/forester/util/ForesterUtil.java @@ -1507,9 +1507,6 @@ public final class ForesterUtil { return trees; } - private ForesterUtil() { - } - public final static File getMatchingFile( final File dir, final String prefix, final String suffix ) throws IOException { if ( !dir.exists() ) { @@ -1518,23 +1515,26 @@ public final class ForesterUtil { if ( !dir.isDirectory() ) { throw new IOException( "[" + dir + "] is not a directory" ); } - final File mapping_files[] = dir.listFiles( new FilenameFilter() { + if ( dir.listFiles().length == 0 ) { + throw new IOException( "[" + dir + "] is empty" ); + } + final File files[] = dir.listFiles( new FilenameFilter() { @Override public boolean accept( final File dir, final String name ) { return ( name.endsWith( suffix ) ); } } ); - if ( mapping_files.length == 1 ) { + if ( files.length == 0 ) { throw new IOException( "no files ending with \"" + suffix + "\" found in [" + dir + "]" ); } - String my_prefix = removeFileExtension( prefix ); + String my_prefix = prefix; boolean done = false; boolean more_than_one = false; File the_one = null; do { int matches = 0; - for( File file : mapping_files ) { + for( File file : files ) { if ( file.getName().startsWith( my_prefix ) ) { matches++; if ( matches > 1 ) { @@ -1571,4 +1571,7 @@ public final class ForesterUtil { } return the_one; } + + private ForesterUtil() { + } } -- 1.7.10.2