From: cmzmasek Date: Sat, 8 Apr 2017 00:47:00 +0000 (-0700) Subject: in progress... X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=afd778bc2dbaecab4baff1d0b485e1e44256b328;p=jalview.git in progress... --- diff --git a/forester/java/src/org/forester/application/rio.java b/forester/java/src/org/forester/application/rio.java index d6f29e6..ab63791 100644 --- a/forester/java/src/org/forester/application/rio.java +++ b/forester/java/src/org/forester/application/rio.java @@ -28,9 +28,11 @@ package org.forester.application; import java.io.File; +import java.io.FilenameFilter; import java.io.IOException; import java.math.RoundingMode; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import org.forester.datastructures.IntMatrix; @@ -57,7 +59,7 @@ public class rio { final static private String PRG_NAME = "rio"; final static private String PRG_VERSION = "4.000 beta 11"; - final static private String PRG_DATE = "170417"; + final static private String PRG_DATE = "170406"; final static private String E_MAIL = "phyloxml@gmail.com"; final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"; final static private String HELP_OPTION_1 = "help"; @@ -110,8 +112,8 @@ public class rio { } final File gene_trees_file = cla.getFile( 0 ); final File species_tree_file = cla.getFile( 1 ); - final File orthology_outtable = cla.getFile( 2 ); - final File logfile; + File orthology_outtable = cla.getFile( 2 ); + File logfile; if ( cla.getNumberOfNames() > 3 ) { logfile = cla.getFile( 3 ); if ( logfile.exists() ) { @@ -303,6 +305,131 @@ public class rio { else { algorithm = ALGORITHM.GSDIR; } + ////////////////////////// + ////////////////////////// + final boolean use_gene_trees_dir = true; + if ( use_gene_trees_dir ) { + final String LOGFILE_SUFFIX = "_RIO_log.tsv"; + final String STRIPPED_SPECIES_TREE_SUFFIX = "_RIO_sst.xml"; + final String ORTHO_OUTTABLE_SUFFIX = "_RIO_o_table.tsv"; + final String OUT_GENE_TREE_SUFFIX = "_RIO_gene_tree.xml"; + final String gene_trees_suffix = ".mlt"; + final File indir = new File( "in" ); + final File outdir = new File( "out" ); + if ( !indir.exists() ) { + ForesterUtil.fatalError( PRG_NAME, "in-directory [" + indir + "] does not exist" ); + } + if ( !indir.isDirectory() ) { + ForesterUtil.fatalError( PRG_NAME, "in-directory [" + indir + "] is not a directory" ); + } + if ( outdir.exists() ) { + if ( !outdir.isDirectory() ) { + ForesterUtil.fatalError( PRG_NAME, + "out-directory [" + outdir + "] already exists but is not a directory" ); + } + } + else { + final boolean success = outdir.mkdirs(); + if ( !success ) { + ForesterUtil.fatalError( PRG_NAME, "could not create out-directory [" + outdir + "]" ); + } + } + final String species_tree_file_name = species_tree_file.getName(); + final File gene_trees_files[] = indir.listFiles( new FilenameFilter() { + + @Override + public boolean accept( final File dir, final String name ) { + return ( ( name.endsWith( gene_trees_suffix ) ) && !( name.equals( species_tree_file_name ) ) ); + } + } ); + if ( gene_trees_files.length < 1 ) { + ForesterUtil.fatalError( PRG_NAME, + "in-directory [" + indir + + "] does not contain any gene tree files with suffix " + + gene_trees_suffix ); + } + Arrays.sort( gene_trees_files ); + System.out.print( "NAME" ); + System.out.print( '\t' ); + System.out.print( "EXT NODES" ); + System.out.print( '\t' ); + System.out.print( "MEAN DUP" ); + System.out.print( '\t' ); + System.out.print( "MEAN DUP SD" ); + System.out.print( '\t' ); + System.out.print( "MEDIAN DUP" ); + System.out.print( '\t' ); + System.out.print( "MIN DUP" ); + System.out.print( '\t' ); + System.out.print( "MAX DUP" ); + System.out.print( '\t' ); + System.out.print( "REMOVED EXT NODES" ); + System.out.print( '\t' ); + System.out.print( "N" ); + System.out.println(); + for( final File gf : gene_trees_files ) { + String outname = gf.getName(); + if ( outname.indexOf( "." ) > 0 ) { + outname = outname.substring( 0, outname.lastIndexOf( "." ) ); + } + try { + x( gf, + species_tree_file, + new File( outdir.getCanonicalFile() + "/" + outname + ORTHO_OUTTABLE_SUFFIX ), + new File( outdir.getCanonicalFile() + "/" + outname + LOGFILE_SUFFIX ), + outgroup, + rerooting, + gt_first, + gt_last, + new File( outdir.getCanonicalFile() + "/" + outname + STRIPPED_SPECIES_TREE_SUFFIX ), + new File( outdir.getCanonicalFile() + "/" + outname + OUT_GENE_TREE_SUFFIX ), + transfer_taxonomy, + algorithm, + true ); + } + catch ( IOException e ) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + else { + x( gene_trees_file, + species_tree_file, + orthology_outtable, + logfile, + outgroup, + rerooting, + gt_first, + gt_last, + return_species_tree, + return_gene_tree, + transfer_taxonomy, + algorithm, + false ); + } + //////////////////// + /////////////////// + if ( !use_gene_trees_dir ) { + time = System.currentTimeMillis() - time; + System.out.println( "Time :\t" + time + "ms" ); + } + System.exit( 0 ); + } + + private static final void x( final File gene_trees_file, + final File species_tree_file, + final File orthology_outtable, + final File logfile, + final String outgroup, + final REROOTING rerooting, + final int gt_first, + final int gt_last, + final File return_species_tree, + final File return_gene_tree, + final boolean transfer_taxonomy, + final ALGORITHM algorithm, + final boolean use_gene_trees_dir ) { try { final RIO rio; boolean iterating = false; @@ -346,11 +473,13 @@ public class rio { gt_first, gt_last, logfile != null, - true, + !use_gene_trees_dir, transfer_taxonomy ); } - if ( algorithm == ALGORITHM.GSDIR ) { - System.out.println( "Taxonomy linking based on :\t" + rio.getGSDIRtaxCompBase() ); + if ( !use_gene_trees_dir ) { + if ( algorithm == ALGORITHM.GSDIR ) { + System.out.println( "Taxonomy linking based on :\t" + rio.getGSDIRtaxCompBase() ); + } } final IntMatrix m; if ( iterating ) { @@ -360,7 +489,7 @@ public class rio { m = RIO.calculateOrthologTable( rio.getAnalyzedGeneTrees(), true ); } final BasicDescriptiveStatistics stats = rio.getDuplicationsStatistics(); - writeTable( orthology_outtable, stats.getN(), m ); + writeTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir ); if ( ( algorithm != ALGORITHM.SDIR ) && ( logfile != null ) ) { writeLogFile( logfile, rio, @@ -370,17 +499,20 @@ public class rio { PRG_NAME, PRG_VERSION, PRG_DATE, - ForesterUtil.getForesterLibraryInformation() ); + ForesterUtil.getForesterLibraryInformation(), + !use_gene_trees_dir ); } if ( return_species_tree != null ) { - writeTree( rio.getSpeciesTree(), return_species_tree, "Wrote (stripped) species tree to :\t" ); + writeTree( rio.getSpeciesTree(), + return_species_tree, + use_gene_trees_dir ? null : "Wrote (stripped) species tree to :\t" ); } if ( return_gene_tree != null ) { writeTree( rio.getMinDuplicationsGeneTree(), return_gene_tree, - "Wrote one min duplication gene tree :\t" ); + use_gene_trees_dir ? null : "Wrote one min duplication gene tree :\t" ); } - final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.#" ); + final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.##" ); final int min = ( int ) stats.getMin(); final int max = ( int ) stats.getMax(); final int median = ( int ) stats.median(); @@ -401,25 +533,57 @@ public class rio { final double min_count_percentage = ( 100.0 * min_count ) / stats.getN(); final double max_count_percentage = ( 100.0 * max_count ) / stats.getN(); final double median_count_percentage = ( 100.0 * median_count ) / stats.getN(); - System.out.println( "Gene tree internal nodes :\t" + rio.getIntNodesOfAnalyzedGeneTrees() ); - System.out.println( "Gene tree external nodes :\t" + rio.getExtNodesOfAnalyzedGeneTrees() ); - System.out.println( "Mean number of duplications :\t" + df.format( stats.arithmeticMean() ) + "\t" - + df.format( ( 100.0 * stats.arithmeticMean() ) / rio.getIntNodesOfAnalyzedGeneTrees() ) - + "%\t(sd: " + df.format( stats.sampleStandardDeviation() ) + ")" ); - if ( stats.getN() > 3 ) { - System.out.println( "Median number of duplications :\t" + df.format( median ) + "\t" - + df.format( ( 100.0 * median ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" ); - } - System.out.println( "Minimum duplications :\t" + min + "\t" - + df.format( ( 100.0 * min ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" ); - System.out.println( "Maximum duplications :\t" + ( int ) max + "\t" - + df.format( ( 100.0 * max ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" ); - System.out.println( "Gene trees with median duplications :\t" + median_count + "\t" - + df.format( median_count_percentage ) + "%" ); - System.out.println( "Gene trees with minimum duplications:\t" + min_count + "\t" - + df.format( min_count_percentage ) + "%" ); - System.out.println( "Gene trees with maximum duplications:\t" + max_count + "\t" - + df.format( max_count_percentage ) + "%" ); + if ( use_gene_trees_dir ) { + String name = gene_trees_file.getName(); + if ( name.indexOf( "." ) > 0 ) { + name = name.substring( 0, name.lastIndexOf( "." ) ); + } + System.out.print( name ); + System.out.print( '\t' ); + System.out.print( rio.getExtNodesOfAnalyzedGeneTrees() ); + System.out.print( '\t' ); + System.out.print( df.format( stats.arithmeticMean() ) ); + System.out.print( '\t' ); + System.out.print( df.format( stats.sampleStandardDeviation() ) ); + System.out.print( '\t' ); + if ( stats.getN() > 3 ) { + System.out.print( df.format( median ) ); + } + else { + System.out.print( "" ); + } + System.out.print( '\t' ); + System.out.print( min ); + System.out.print( '\t' ); + System.out.print( max ); + System.out.print( '\t' ); + System.out.print( rio.getRemovedGeneTreeNodes().size() ); + System.out.print( '\t' ); + System.out.print( stats.getN() ); + System.out.println(); + } + else { + System.out.println( "Gene tree internal nodes :\t" + rio.getIntNodesOfAnalyzedGeneTrees() ); + System.out.println( "Gene tree external nodes :\t" + rio.getExtNodesOfAnalyzedGeneTrees() ); + System.out.println( "Mean number of duplications :\t" + df.format( stats.arithmeticMean() ) + + "\t" + df.format( ( 100.0 * stats.arithmeticMean() ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + + "%\t(sd: " + df.format( stats.sampleStandardDeviation() ) + ")" ); + if ( stats.getN() > 3 ) { + System.out.println( "Median number of duplications :\t" + df.format( median ) + "\t" + + df.format( ( 100.0 * median ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" ); + } + System.out.println( "Minimum duplications :\t" + min + "\t" + + df.format( ( 100.0 * min ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" ); + System.out.println( "Maximum duplications :\t" + ( int ) max + "\t" + + df.format( ( 100.0 * max ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" ); + System.out.println( "Gene trees with median duplications :\t" + median_count + "\t" + + df.format( median_count_percentage ) + "%" ); + System.out.println( "Gene trees with minimum duplications:\t" + min_count + "\t" + + df.format( min_count_percentage ) + "%" ); + System.out.println( "Gene trees with maximum duplications:\t" + max_count + "\t" + + df.format( max_count_percentage ) + "%" ); + System.out.println( "Removed ext gene tree nodes:\t" + rio.getRemovedGeneTreeNodes().size() ); + } } catch ( final RIOException e ) { ForesterUtil.fatalError( e.getLocalizedMessage() ); @@ -439,9 +603,6 @@ public class rio { catch ( final Error e ) { ForesterUtil.unexpectedFatalError( e ); } - time = System.currentTimeMillis() - time; - System.out.println( "Time :\t" + time + "ms" ); - System.exit( 0 ); } private final static void printHelp() { @@ -494,24 +655,30 @@ public class rio { final String prg_name, final String prg_v, final String prg_date, - final String f ) + final String f, + final boolean verbose ) throws IOException { final EasyWriter out = ForesterUtil.createEasyWriter( logfile ); - out.println( prg_name ); - out.println( "version : " + prg_v ); - out.println( "date : " + prg_date ); - out.println( "based on: " + f ); - out.println( "----------------------------------" ); - out.println( "Gene trees : " + gene_trees_file.getCanonicalPath() ); - out.println( "Species tree : " + species_tree_file.getCanonicalPath() ); - out.println( "All vs all orthology table : " + outtable.getCanonicalPath() ); + out.println( "# " + prg_name ); + out.println( "# version : " + prg_v ); + out.println( "# date : " + prg_date ); + out.println( "# based on: " + f ); + out.println( "# ----------------------------------" ); + out.println( "Gene trees :\t" + gene_trees_file.getCanonicalPath() ); + out.println( "Species tree :\t" + species_tree_file.getCanonicalPath() ); + out.println( "All vs all orthology table :\t" + outtable.getCanonicalPath() ); out.flush(); out.println( rio.getLog().toString() ); out.close(); - System.out.println( "Wrote log to :\t" + logfile.getCanonicalPath() ); + if ( verbose ) { + System.out.println( "Wrote log to :\t" + logfile.getCanonicalPath() ); + } } - private static void writeTable( final File table_outfile, final int gene_trees_analyzed, final IntMatrix m ) + private static void writeTable( final File table_outfile, + final int gene_trees_analyzed, + final IntMatrix m, + final boolean verbose ) throws IOException { final EasyWriter w = ForesterUtil.createEasyWriter( table_outfile ); final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" ); @@ -539,12 +706,16 @@ public class rio { w.println(); } w.close(); - System.out.println( "Wrote table to :\t" + table_outfile.getCanonicalPath() ); + if ( verbose ) { + System.out.println( "Wrote table to :\t" + table_outfile.getCanonicalPath() ); + } } private static void writeTree( final Phylogeny p, final File f, final String comment ) throws IOException { final PhylogenyWriter writer = new PhylogenyWriter(); writer.toPhyloXML( f, p, 0 ); - System.out.println( comment + f.getCanonicalPath() ); + if ( comment != null ) { + System.out.println( comment + f.getCanonicalPath() ); + } } } diff --git a/forester/java/src/org/forester/rio/RIO.java b/forester/java/src/org/forester/rio/RIO.java index cca036a..0902034 100644 --- a/forester/java/src/org/forester/rio/RIO.java +++ b/forester/java/src/org/forester/rio/RIO.java @@ -30,7 +30,6 @@ package org.forester.rio; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; -import java.text.DecimalFormat; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; @@ -369,7 +368,6 @@ public final class RIO { } private final void logRemovedGeneTreeNodes() { - log( "Species stripped from gene trees:" ); final SortedSet rn = new TreeSet(); for( final PhylogenyNode n : getRemovedGeneTreeNodes() ) { final Taxonomy t = n.getNodeData().getTaxonomy(); @@ -388,10 +386,12 @@ public final class RIO { } } } + final StringBuilder sb = new StringBuilder(); for( final String s : rn ) { - log( s ); + sb.append( '\t' ); + sb.append( s ); } - log( "" ); + log( "Species stripped from gene trees :" + sb); } private final Phylogeny performOrthologInference( final Phylogeny gene_tree, @@ -476,6 +476,7 @@ public final class RIO { } if ( ( i == 0 ) || ( dups < _duplications_stats.getMin() ) ) { _min_dub_gene_tree = assigned_tree; + _min_dub_gene_tree.setRerootable( false ); } _duplications_stats.addValue( dups ); return assigned_tree; @@ -488,34 +489,62 @@ public final class RIO { } private final void postLog( final Phylogeny species_tree, final int first, final int last ) { - log( "" ); + final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.##" ); + final int min = ( int ) getDuplicationsStatistics().getMin(); + final int max = ( int ) getDuplicationsStatistics().getMax(); + final int median = ( int ) getDuplicationsStatistics().median(); + int min_count = 0; + int max_count = 0; + int median_count = 0; + for( double d : getDuplicationsStatistics().getData() ) { + if ( ( ( int ) d ) == min ) { + ++min_count; + } + if ( ( ( int ) d ) == max ) { + ++max_count; + } + if ( ( ( int ) d ) == median ) { + ++median_count; + } + } + final double min_count_percentage = ( 100.0 * min_count ) / getDuplicationsStatistics().getN(); + final double max_count_percentage = ( 100.0 * max_count ) / getDuplicationsStatistics().getN(); + final double median_count_percentage = ( 100.0 * median_count ) / getDuplicationsStatistics().getN(); + + if ( ( getRemovedGeneTreeNodes() != null ) && ( getRemovedGeneTreeNodes().size() > 0 ) ) { logRemovedGeneTreeNodes(); } - log( "Species tree external nodes (after stripping) : " + species_tree.getNumberOfExternalNodes() ); - log( "Species tree polytomies (after stripping) : " - + PhylogenyMethods.countNumberOfPolytomies( species_tree ) ); - log( "Taxonomy linking based on : " + getGSDIRtaxCompBase() ); - final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.#" ); + + log( "Gene trees analyzed :\t" + getDuplicationsStatistics().getN() ); if ( ( first >= 0 ) && ( last >= 0 ) ) { - log( "Gene trees analyzed range : " + first + "-" + last ); - } - log( "Gene trees analyzed : " + _duplications_stats.getN() ); - log( "Mean number of duplications : " + df.format( _duplications_stats.arithmeticMean() ) - + " (sd: " + df.format( _duplications_stats.sampleStandardDeviation() ) + ")" + " (" - + df.format( ( 100.0 * _duplications_stats.arithmeticMean() ) / getIntNodesOfAnalyzedGeneTrees() ) - + "%)" ); - if ( _duplications_stats.getN() > 3 ) { - log( "Median number of duplications : " + df.format( _duplications_stats.median() ) - + " (" + df.format( ( 100.0 * _duplications_stats.median() ) / getIntNodesOfAnalyzedGeneTrees() ) - + "%)" ); - } - log( "Minimum duplications : " + ( int ) _duplications_stats.getMin() + " (" - + df.format( ( 100.0 * _duplications_stats.getMin() ) / getIntNodesOfAnalyzedGeneTrees() ) + "%)" ); - log( "Maximum duplications : " + ( int ) _duplications_stats.getMax() + " (" - + df.format( ( 100.0 * _duplications_stats.getMax() ) / getIntNodesOfAnalyzedGeneTrees() ) + "%)" ); - log( "Gene tree internal nodes : " + getIntNodesOfAnalyzedGeneTrees() ); - log( "Gene tree external nodes : " + getExtNodesOfAnalyzedGeneTrees() ); + log( "Gene trees analyzed range :\t" + first + "-" + last ); + } + log( "Gene tree internal nodes :\t" + getIntNodesOfAnalyzedGeneTrees() ); + log( "Gene tree external nodes :\t" + getExtNodesOfAnalyzedGeneTrees() ); + log( "Removed ext gene tree nodes :\t" + getRemovedGeneTreeNodes().size() ); + log( "Spec tree ext nodes (after strip) :\t" + species_tree.getNumberOfExternalNodes() ); + log( "Spec tree polytomies (after strip) :\t" + + PhylogenyMethods.countNumberOfPolytomies( species_tree ) ); + log( "Taxonomy linking based on :\t" + getGSDIRtaxCompBase() ); + log( "Mean number of duplications :\t" + df.format( getDuplicationsStatistics().arithmeticMean() ) + + "\t" + df.format( ( 100.0 * getDuplicationsStatistics().arithmeticMean() ) / getIntNodesOfAnalyzedGeneTrees() ) + + "%\t(sd: " + df.format( getDuplicationsStatistics().sampleStandardDeviation() ) + ")" ); + if ( getDuplicationsStatistics().getN() > 3 ) { + log( "Median number of duplications :\t" + df.format( median ) + "\t" + + df.format( ( 100.0 * median ) / getIntNodesOfAnalyzedGeneTrees() ) + "%" ); + } + log( "Minimum duplications :\t" + min + "\t" + + df.format( ( 100.0 * min ) / getIntNodesOfAnalyzedGeneTrees() ) + "%" ); + log( "Maximum duplications :\t" + ( int ) max + "\t" + + df.format( ( 100.0 * max ) / getIntNodesOfAnalyzedGeneTrees() ) + "%" ); + log( "Gene trees with median duplications :\t" + median_count + "\t" + + df.format( median_count_percentage ) + "%" ); + log( "Gene trees with minimum duplications:\t" + min_count + "\t" + + df.format( min_count_percentage ) + "%" ); + log( "Gene trees with maximum duplications:\t" + max_count + "\t" + + df.format( max_count_percentage ) + "%" ); + } private final void preLog( final int gene_trees, @@ -523,11 +552,12 @@ public final class RIO { final ALGORITHM algorithm, final String outgroup ) { if ( gene_trees > 0 ) { - log( "Number of gene trees (total) : " + gene_trees ); + log( "Number of gene trees (total) :\t" + gene_trees ); } - log( "Algorithm : " + algorithm ); - log( "Species tree external nodes (prior to stripping): " + species_tree.getNumberOfExternalNodes() ); - log( "Species tree polytomies (prior to stripping) : " + + log( "Algorithm :\t" + algorithm ); + log( "Spec tree ext nodes (prior strip) :\t" + species_tree.getNumberOfExternalNodes() ); + log( "Spec tree polytomies (prior strip) :\t" + PhylogenyMethods.countNumberOfPolytomies( species_tree ) ); String rs = ""; switch ( _rerooting ) { @@ -548,7 +578,8 @@ public final class RIO { break; } } - log( "Re-rooting : " + rs ); + log( "Re-rooting :\t" + rs ); + } public final static IntMatrix calculateOrthologTable( final Phylogeny[] analyzed_gene_trees, final boolean sort )