From e127c72b4d6cf6ba9bc5a5e60721d7bf7a0d2aeb Mon Sep 17 00:00:00 2001 From: cmzmasek Date: Mon, 17 Apr 2017 13:41:44 -0700 Subject: [PATCH] in progress... --- .../java/src/org/forester/application/rio.java | 437 +---------------- forester/java/src/org/forester/rio/RIOUtil.java | 490 ++++++++++++++++++++ 2 files changed, 496 insertions(+), 431 deletions(-) create mode 100644 forester/java/src/org/forester/rio/RIOUtil.java diff --git a/forester/java/src/org/forester/application/rio.java b/forester/java/src/org/forester/application/rio.java index 29dec92..fac270c 100644 --- a/forester/java/src/org/forester/application/rio.java +++ b/forester/java/src/org/forester/application/rio.java @@ -26,39 +26,23 @@ package org.forester.application; import java.io.File; import java.io.FilenameFilter; import java.io.IOException; -import java.math.RoundingMode; import java.util.ArrayList; import java.util.Arrays; -import java.util.Iterator; import java.util.List; -import java.util.SortedSet; -import java.util.TreeSet; -import org.forester.datastructures.IntMatrix; -import org.forester.io.parsers.IteratingPhylogenyParser; -import org.forester.io.parsers.PhylogenyParser; -import org.forester.io.parsers.nexus.NexusPhylogeniesParser; -import org.forester.io.parsers.nhx.NHXParser; -import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION; -import org.forester.io.parsers.phyloxml.PhyloXmlParser; -import org.forester.io.parsers.util.ParserUtils; -import org.forester.io.writers.PhylogenyWriter; -import org.forester.phylogeny.Phylogeny; import org.forester.rio.RIO; import org.forester.rio.RIO.REROOTING; -import org.forester.rio.RIOException; -import org.forester.sdi.SDIException; +import org.forester.rio.RIOUtil; import org.forester.sdi.SDIutil.ALGORITHM; -import org.forester.util.BasicDescriptiveStatistics; import org.forester.util.CommandLineArguments; import org.forester.util.EasyWriter; import org.forester.util.ForesterUtil; public class rio { - final static private String PRG_NAME = "rio"; - final static private String PRG_VERSION = "5.000"; - final static private String PRG_DATE = "170411"; + public final static String PRG_NAME = "rio"; + public final static String PRG_VERSION = "5.000"; + public final static String PRG_DATE = "170411"; final static private String E_MAIL = "phyloxml@gmail.com"; final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"; final static private String HELP_OPTION_1 = "help"; @@ -507,7 +491,7 @@ public class rio { outname = outname.substring( 0, outname.lastIndexOf( "." ) ); } try { - executeAnalysis( gf, + RIOUtil.executeAnalysis( gf, species_tree_file, new File( outdir.getCanonicalFile() + "/" + outname + ORTHO_OUTTABLE_SUFFIX ), new File( outdir.getCanonicalFile() + "/" + outname + ORTHOLOG_GROUPS_SUFFIX ), @@ -541,7 +525,7 @@ public class rio { if ( outname.indexOf( "." ) > 0 ) { outname = outname.substring( 0, outname.lastIndexOf( "." ) ); } - executeAnalysis( gene_trees_file, + RIOUtil.executeAnalysis( gene_trees_file, species_tree_file, orthology_outtable, new File( outname + ORTHOLOG_GROUPS_SUFFIX ), @@ -576,238 +560,6 @@ public class rio { System.exit( 0 ); } - private static final void executeAnalysis( final File gene_trees_file, - final File species_tree_file, - final File orthology_outtable, - final File orthology_groups_outfile, - final File logfile, - final String outgroup, - final REROOTING rerooting, - final int gt_first, - final int gt_last, - final File return_species_tree, - final File return_min_dup_gene_tree, - final File return_median_dup_gene_tree, - final boolean transfer_taxonomy, - final ALGORITHM algorithm, - final boolean use_gene_trees_dir, - final EasyWriter log, - final double ortholog_group_cutoff ) { - try { - final RIO rio; - boolean iterating = false; - final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true ); - if ( p instanceof PhyloXmlParser ) { - rio = RIO.executeAnalysis( gene_trees_file, - species_tree_file, - algorithm, - rerooting, - outgroup, - gt_first, - gt_last, - logfile != null, - true, - transfer_taxonomy ); - } - else { - iterating = true; - if ( p instanceof NHXParser ) { - final NHXParser nhx = ( NHXParser ) p; - nhx.setReplaceUnderscores( false ); - nhx.setIgnoreQuotes( true ); - nhx.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE ); - } - else if ( p instanceof NexusPhylogeniesParser ) { - final NexusPhylogeniesParser nex = ( NexusPhylogeniesParser ) p; - nex.setReplaceUnderscores( false ); - nex.setIgnoreQuotes( true ); - nex.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE ); - } - else { - throw new RuntimeException( "unknown parser type: " + p ); - } - final IteratingPhylogenyParser ip = ( IteratingPhylogenyParser ) p; - ip.setSource( gene_trees_file ); - rio = RIO.executeAnalysis( ip, - species_tree_file, - algorithm, - rerooting, - outgroup, - gt_first, - gt_last, - logfile != null, - !use_gene_trees_dir, - transfer_taxonomy ); - } - if ( !use_gene_trees_dir ) { - if ( algorithm == ALGORITHM.GSDIR ) { - System.out.println( "Taxonomy linking based on :\t" + rio.getGSDIRtaxCompBase() ); - } - } - final IntMatrix m; - if ( iterating ) { - m = rio.getOrthologTable(); - } - else { - m = RIO.calculateOrthologTable( rio.getAnalyzedGeneTrees(), true ); - } - final BasicDescriptiveStatistics stats = rio.getDuplicationsStatistics(); - writeTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir ); - final int ortholog_groups = writeOrtologGroups( orthology_groups_outfile, - ortholog_group_cutoff, - stats.getN(), - m, - !use_gene_trees_dir, - false ); - final int ortholog_groups_005 = writeOrtologGroups( null, 0.05, stats.getN(), m, false, true ); - final int ortholog_groups_025 = writeOrtologGroups( null, 0.25, stats.getN(), m, false, true ); - final int ortholog_groups_05 = writeOrtologGroups( null, 0.5, stats.getN(), m, false, true ); - final int ortholog_groups_075 = writeOrtologGroups( null, 0.75, stats.getN(), m, false, true ); - final int ortholog_groups_095 = writeOrtologGroups( null, 0.95, stats.getN(), m, false, true ); - if ( ( algorithm != ALGORITHM.SDIR ) && ( logfile != null ) ) { - writeLogFile( logfile, - rio, - species_tree_file, - gene_trees_file, - orthology_outtable, - PRG_NAME, - PRG_VERSION, - PRG_DATE, - ForesterUtil.getForesterLibraryInformation(), - !use_gene_trees_dir ); - } - if ( return_species_tree != null ) { - writeTree( rio.getSpeciesTree(), - return_species_tree, - use_gene_trees_dir ? null : "Wrote (stripped) species tree to :\t" ); - } - if ( return_min_dup_gene_tree != null && rio.getMinDuplicationsGeneTree() != null ) { - final int min = ( int ) rio.getDuplicationsStatistics().getMin(); - writeTree( rio.getMinDuplicationsGeneTree(), - new File( return_min_dup_gene_tree.toString() + min + ".xml" ), - use_gene_trees_dir ? null : "Wrote one min duplication gene tree :\t" ); - } - if ( return_median_dup_gene_tree != null && rio.getDuplicationsToTreeMap() != null ) { - final int med = ( int ) rio.getDuplicationsStatistics().median(); - writeTree( rio.getDuplicationsToTreeMap().get( med ), - new File( return_median_dup_gene_tree.toString() + med + ".xml" ), - use_gene_trees_dir ? null : "Wrote one med duplication gene tree :\t" ); - } - final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.##" ); - final int min = ( int ) stats.getMin(); - final int max = ( int ) stats.getMax(); - final int median = ( int ) stats.median(); - int min_count = 0; - int max_count = 0; - int median_count = 0; - for( double d : stats.getData() ) { - if ( ( ( int ) d ) == min ) { - ++min_count; - } - if ( ( ( int ) d ) == max ) { - ++max_count; - } - if ( ( ( int ) d ) == median ) { - ++median_count; - } - } - final double min_count_percentage = ( 100.0 * min_count ) / stats.getN(); - final double max_count_percentage = ( 100.0 * max_count ) / stats.getN(); - final double median_count_percentage = ( 100.0 * median_count ) / stats.getN(); - if ( use_gene_trees_dir ) { - String name = gene_trees_file.getName(); - if ( name.indexOf( "." ) > 0 ) { - name = name.substring( 0, name.lastIndexOf( "." ) ); - } - log.print( name ); - log.print( "\t" ); - log.print( Integer.toString( rio.getExtNodesOfAnalyzedGeneTrees() ) ); - log.print( "\t" ); - log.print( Integer.toString( ortholog_groups ) ); - // - log.print( "\t" ); - log.print( Integer.toString( ortholog_groups_005 ) ); - log.print( "\t" ); - log.print( Integer.toString( ortholog_groups_025 ) ); - log.print( "\t" ); - log.print( Integer.toString( ortholog_groups_05 ) ); - log.print( "\t" ); - log.print( Integer.toString( ortholog_groups_075 ) ); - log.print( "\t" ); - log.print( Integer.toString( ortholog_groups_095 ) ); - // - log.print( "\t" ); - if ( stats.getN() > 3 ) { - log.print( df.format( median ) ); - } - else { - log.print( "" ); - } - log.print( "\t" ); - log.print( df.format( stats.arithmeticMean() ) ); - log.print( "\t" ); - if ( stats.getN() > 3 ) { - log.print( df.format( stats.sampleStandardDeviation() ) ); - } - else { - log.print( "" ); - } - log.print( "\t" ); - log.print( Integer.toString( min ) ); - log.print( "\t" ); - log.print( Integer.toString( max ) ); - log.print( "\t" ); - log.print( Integer.toString( rio.getRemovedGeneTreeNodes().size() ) ); - log.print( "\t" ); - log.print( Integer.toString( stats.getN() ) ); - log.println(); - } - else { - System.out.println( "Gene tree internal nodes :\t" + rio.getIntNodesOfAnalyzedGeneTrees() ); - System.out.println( "Gene tree external nodes :\t" + rio.getExtNodesOfAnalyzedGeneTrees() ); - System.out.println( "Mean number of duplications :\t" + df.format( stats.arithmeticMean() ) - + "\t" + df.format( ( 100.0 * stats.arithmeticMean() ) / rio.getIntNodesOfAnalyzedGeneTrees() ) - + "%\t(sd: " + df.format( stats.sampleStandardDeviation() ) + ")" ); - if ( stats.getN() > 3 ) { - System.out.println( "Median number of duplications :\t" + df.format( median ) + "\t" - + df.format( ( 100.0 * median ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" ); - } - System.out.println( "Minimum duplications :\t" + min + "\t" - + df.format( ( 100.0 * min ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" ); - System.out.println( "Maximum duplications :\t" + ( int ) max + "\t" - + df.format( ( 100.0 * max ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" ); - System.out.println( "Gene trees with median duplications :\t" + median_count + "\t" - + df.format( median_count_percentage ) + "%" ); - System.out.println( "Gene trees with minimum duplications:\t" + min_count + "\t" - + df.format( min_count_percentage ) + "%" ); - System.out.println( "Gene trees with maximum duplications:\t" + max_count + "\t" - + df.format( max_count_percentage ) + "%" ); - if ( algorithm == ALGORITHM.GSDIR ) { - System.out.println( "Removed ext gene tree nodes :\t" - + rio.getRemovedGeneTreeNodes().size() ); - } - } - } - catch ( final RIOException e ) { - ForesterUtil.fatalError( e.getLocalizedMessage() ); - } - catch ( final SDIException e ) { - ForesterUtil.fatalError( e.getLocalizedMessage() ); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( e.getLocalizedMessage() ); - } - catch ( final OutOfMemoryError e ) { - ForesterUtil.outOfMemoryError( e ); - } - catch ( final Exception e ) { - ForesterUtil.unexpectedFatalError( e ); - } - catch ( final Error e ) { - ForesterUtil.unexpectedFatalError( e ); - } - } - private final static void printHelp() { System.out.println( "Usage" ); System.out.println(); @@ -850,181 +602,4 @@ public class rio { System.out.println(); System.exit( -1 ); } - - private static void writeLogFile( final File logfile, - final RIO rio, - final File species_tree_file, - final File gene_trees_file, - final File outtable, - final String prg_name, - final String prg_v, - final String prg_date, - final String f, - final boolean verbose ) - throws IOException { - final EasyWriter out = ForesterUtil.createEasyWriter( logfile ); - out.println( "# " + prg_name ); - out.println( "# version : " + prg_v ); - out.println( "# date : " + prg_date ); - out.println( "# based on: " + f ); - out.println( "# ----------------------------------" ); - out.println( "Gene trees :\t" + gene_trees_file.getCanonicalPath() ); - out.println( "Species tree :\t" + species_tree_file.getCanonicalPath() ); - out.println( "All vs all orthology table :\t" + outtable.getCanonicalPath() ); - out.flush(); - out.println( rio.getLog().toString() ); - out.close(); - if ( verbose ) { - System.out.println( "Wrote log to :\t" + logfile.getCanonicalPath() ); - } - } - - private static final void writeTable( final File table_outfile, - final int gene_trees_analyzed, - final IntMatrix m, - final boolean verbose ) - throws IOException { - final EasyWriter w = ForesterUtil.createEasyWriter( table_outfile ); - final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" ); - df.setDecimalSeparatorAlwaysShown( false ); - df.setRoundingMode( RoundingMode.HALF_UP ); - for( int i = 0; i < m.size(); ++i ) { - w.print( "\t" ); - w.print( m.getLabel( i ) ); - } - w.println(); - for( int x = 0; x < m.size(); ++x ) { - w.print( m.getLabel( x ) ); - for( int y = 0; y < m.size(); ++y ) { - w.print( "\t" ); - if ( x == y ) { - if ( m.get( x, y ) != gene_trees_analyzed ) { - ForesterUtil.unexpectedFatalError( "diagonal value is off" ); - } - w.print( "-" ); - } - else { - w.print( df.format( ( ( double ) m.get( x, y ) ) / gene_trees_analyzed ) ); - } - } - w.println(); - } - w.close(); - if ( verbose ) { - System.out.println( "Wrote table to :\t" + table_outfile.getCanonicalPath() ); - } - } - - private static final int writeOrtologGroups( final File outfile, - final double cutoff, - final int gene_trees_analyzed, - final IntMatrix m, - final boolean verbose, - final boolean calc_conly ) - throws IOException { - List> groups = new ArrayList>(); - BasicDescriptiveStatistics stats = new BasicDescriptiveStatistics(); - int below_075 = 0; - int below_05 = 0; - int below_025 = 0; - for( int x = 1; x < m.size(); ++x ) { - final String a = m.getLabel( x ); - for( int y = 0; y < x; ++y ) { - final String b = m.getLabel( y ); - final double s = ( ( double ) m.get( x, y ) ) / gene_trees_analyzed; - stats.addValue( s ); - if ( s < 0.75 ) { - below_075++; - if ( s < 0.5 ) { - below_05++; - if ( s < 0.25 ) { - below_025++; - } - } - } - if ( s >= cutoff ) { - boolean found = false; - for( final SortedSet group : groups ) { - if ( group.contains( a ) ) { - group.add( b ); - found = true; - } - if ( group.contains( b ) ) { - group.add( a ); - found = true; - } - } - if ( !found ) { - final SortedSet new_group = new TreeSet(); - new_group.add( a ); - new_group.add( b ); - groups.add( new_group ); - } - } - } - } - //Deal with singlets: - for( int x = 0; x < m.size(); ++x ) { - final String a = m.getLabel( x ); - boolean found = false; - for( final SortedSet group : groups ) { - if ( group.contains( a ) ) { - found = true; - break; - } - } - if ( !found ) { - final SortedSet new_group = new TreeSet(); - new_group.add( a ); - groups.add( new_group ); - } - } - if ( calc_conly ) { - return groups.size(); - } - final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" ); - df.setDecimalSeparatorAlwaysShown( false ); - df.setRoundingMode( RoundingMode.HALF_UP ); - final EasyWriter w = ForesterUtil.createEasyWriter( outfile ); - int counter = 1; - for( final SortedSet group : groups ) { - w.print( Integer.toString( counter++ ) ); - for( final String s : group ) { - w.print( "\t" ); - w.print( s ); - } - w.println(); - } - w.println(); - w.println( "# Cutoff\t" + df.format( cutoff ) ); - w.println(); - w.println( "# Orthology support statistics:" ); - if ( stats.getN() > 3 ) { - w.println( "# Median\t" + df.format( stats.median() ) ); - } - w.println( "# Mean\t" + df.format( stats.arithmeticMean() ) ); - if ( stats.getN() > 3 ) { - w.println( "# SD\t" + df.format( stats.sampleStandardDeviation() ) ); - } - w.println( "# Min\t" + df.format( stats.getMin() ) ); - w.println( "# Max\t" + df.format( stats.getMax() ) ); - w.println( "# Total\t" + df.format( stats.getN() ) ); - w.println( "# Below 0.75\t" + below_075 + "\t" + df.format( ( 100.0 * below_075 / stats.getN() ) ) + "%" ); - w.println( "# Below 0.5\t" + below_05 + "\t" + df.format( ( 100.0 * below_05 / stats.getN() ) ) + "%" ); - w.println( "# Below 0.25\t" + below_025 + "\t" + df.format( ( 100.0 * below_025 / stats.getN() ) ) + "%" ); - w.close(); - if ( verbose ) { - System.out.println( "Number of ortholog groups :\t" + groups.size() ); - System.out.println( "Wrote orthologs groups table to :\t" + outfile.getCanonicalPath() ); - } - return groups.size(); - } - - private static void writeTree( final Phylogeny p, final File f, final String comment ) throws IOException { - final PhylogenyWriter writer = new PhylogenyWriter(); - writer.toPhyloXML( f, p, 0 ); - if ( comment != null ) { - System.out.println( comment + f.getCanonicalPath() ); - } - } } diff --git a/forester/java/src/org/forester/rio/RIOUtil.java b/forester/java/src/org/forester/rio/RIOUtil.java new file mode 100644 index 0000000..8d01afa --- /dev/null +++ b/forester/java/src/org/forester/rio/RIOUtil.java @@ -0,0 +1,490 @@ + +package org.forester.rio; + +import java.io.File; +import java.io.FilenameFilter; +import java.io.IOException; +import java.math.RoundingMode; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.SortedSet; +import java.util.TreeSet; + +import org.forester.datastructures.IntMatrix; +import org.forester.io.parsers.IteratingPhylogenyParser; +import org.forester.io.parsers.PhylogenyParser; +import org.forester.io.parsers.nexus.NexusPhylogeniesParser; +import org.forester.io.parsers.nhx.NHXParser; +import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION; +import org.forester.io.parsers.phyloxml.PhyloXmlParser; +import org.forester.io.parsers.util.ParserUtils; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.phylogeny.Phylogeny; +import org.forester.rio.RIO.REROOTING; +import org.forester.sdi.SDIException; +import org.forester.sdi.SDIutil.ALGORITHM; +import org.forester.util.BasicDescriptiveStatistics; +import org.forester.util.BasicTable; +import org.forester.util.BasicTableParser; +import org.forester.util.EasyWriter; +import org.forester.util.ForesterUtil; + +public final class RIOUtil { + + public static final void executeAnalysis( final File gene_trees_file, + final File species_tree_file, + final File orthology_outtable, + final File orthology_groups_outfile, + final File logfile, + final String outgroup, + final REROOTING rerooting, + final int gt_first, + final int gt_last, + final File return_species_tree, + final File return_min_dup_gene_tree, + final File return_median_dup_gene_tree, + final boolean transfer_taxonomy, + final ALGORITHM algorithm, + final boolean use_gene_trees_dir, + final EasyWriter log, + final double ortholog_group_cutoff ) { + try { + final RIO rio; + boolean iterating = false; + final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true ); + if ( p instanceof PhyloXmlParser ) { + rio = RIO.executeAnalysis( gene_trees_file, + species_tree_file, + algorithm, + rerooting, + outgroup, + gt_first, + gt_last, + logfile != null, + true, + transfer_taxonomy ); + } + else { + iterating = true; + if ( p instanceof NHXParser ) { + final NHXParser nhx = ( NHXParser ) p; + nhx.setReplaceUnderscores( false ); + nhx.setIgnoreQuotes( true ); + nhx.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE ); + } + else if ( p instanceof NexusPhylogeniesParser ) { + final NexusPhylogeniesParser nex = ( NexusPhylogeniesParser ) p; + nex.setReplaceUnderscores( false ); + nex.setIgnoreQuotes( true ); + nex.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE ); + } + else { + throw new RuntimeException( "unknown parser type: " + p ); + } + final IteratingPhylogenyParser ip = ( IteratingPhylogenyParser ) p; + ip.setSource( gene_trees_file ); + rio = RIO.executeAnalysis( ip, + species_tree_file, + algorithm, + rerooting, + outgroup, + gt_first, + gt_last, + logfile != null, + !use_gene_trees_dir, + transfer_taxonomy ); + } + if ( !use_gene_trees_dir ) { + if ( algorithm == ALGORITHM.GSDIR ) { + System.out.println( "Taxonomy linking based on :\t" + rio.getGSDIRtaxCompBase() ); + } + } + /// + //// + final IntMatrix m; + if ( iterating ) { + m = rio.getOrthologTable(); + } + else { + m = RIO.calculateOrthologTable( rio.getAnalyzedGeneTrees(), true ); + } + final BasicDescriptiveStatistics stats = rio.getDuplicationsStatistics(); + writeTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir ); + final int ortholog_groups = writeOrtologGroups( orthology_groups_outfile, + ortholog_group_cutoff, + stats.getN(), + m, + !use_gene_trees_dir, + false ); + final int ortholog_groups_005 = writeOrtologGroups( null, 0.05, stats.getN(), m, false, true ); + final int ortholog_groups_025 = writeOrtologGroups( null, 0.25, stats.getN(), m, false, true ); + final int ortholog_groups_05 = writeOrtologGroups( null, 0.5, stats.getN(), m, false, true ); + final int ortholog_groups_075 = writeOrtologGroups( null, 0.75, stats.getN(), m, false, true ); + final int ortholog_groups_095 = writeOrtologGroups( null, 0.95, stats.getN(), m, false, true ); + if ( ( algorithm != ALGORITHM.SDIR ) && ( logfile != null ) ) { + writeLogFile( logfile, + rio, + species_tree_file, + gene_trees_file, + orthology_outtable, + org.forester.application.rio.PRG_NAME, + org.forester.application.rio.PRG_VERSION, + org.forester.application.rio.PRG_DATE, + ForesterUtil.getForesterLibraryInformation(), + !use_gene_trees_dir ); + } + if ( return_species_tree != null ) { + writeTree( rio.getSpeciesTree(), + return_species_tree, + use_gene_trees_dir ? null : "Wrote (stripped) species tree to :\t" ); + } + if ( return_min_dup_gene_tree != null && rio.getMinDuplicationsGeneTree() != null ) { + final int min = ( int ) rio.getDuplicationsStatistics().getMin(); + writeTree( rio.getMinDuplicationsGeneTree(), + new File( return_min_dup_gene_tree.toString() + min + ".xml" ), + use_gene_trees_dir ? null : "Wrote one min duplication gene tree :\t" ); + } + if ( return_median_dup_gene_tree != null && rio.getDuplicationsToTreeMap() != null ) { + final int med = ( int ) rio.getDuplicationsStatistics().median(); + writeTree( rio.getDuplicationsToTreeMap().get( med ), + new File( return_median_dup_gene_tree.toString() + med + ".xml" ), + use_gene_trees_dir ? null : "Wrote one med duplication gene tree :\t" ); + } + final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.##" ); + final int min = ( int ) stats.getMin(); + final int max = ( int ) stats.getMax(); + final int median = ( int ) stats.median(); + int min_count = 0; + int max_count = 0; + int median_count = 0; + for( double d : stats.getData() ) { + if ( ( ( int ) d ) == min ) { + ++min_count; + } + if ( ( ( int ) d ) == max ) { + ++max_count; + } + if ( ( ( int ) d ) == median ) { + ++median_count; + } + } + final double min_count_percentage = ( 100.0 * min_count ) / stats.getN(); + final double max_count_percentage = ( 100.0 * max_count ) / stats.getN(); + final double median_count_percentage = ( 100.0 * median_count ) / stats.getN(); + if ( use_gene_trees_dir ) { + String name = gene_trees_file.getName(); + if ( name.indexOf( "." ) > 0 ) { + name = name.substring( 0, name.lastIndexOf( "." ) ); + } + log.print( name ); + log.print( "\t" ); + log.print( Integer.toString( rio.getExtNodesOfAnalyzedGeneTrees() ) ); + log.print( "\t" ); + log.print( Integer.toString( ortholog_groups ) ); + // + log.print( "\t" ); + log.print( Integer.toString( ortholog_groups_005 ) ); + log.print( "\t" ); + log.print( Integer.toString( ortholog_groups_025 ) ); + log.print( "\t" ); + log.print( Integer.toString( ortholog_groups_05 ) ); + log.print( "\t" ); + log.print( Integer.toString( ortholog_groups_075 ) ); + log.print( "\t" ); + log.print( Integer.toString( ortholog_groups_095 ) ); + // + log.print( "\t" ); + if ( stats.getN() > 3 ) { + log.print( df.format( median ) ); + } + else { + log.print( "" ); + } + log.print( "\t" ); + log.print( df.format( stats.arithmeticMean() ) ); + log.print( "\t" ); + if ( stats.getN() > 3 ) { + log.print( df.format( stats.sampleStandardDeviation() ) ); + } + else { + log.print( "" ); + } + log.print( "\t" ); + log.print( Integer.toString( min ) ); + log.print( "\t" ); + log.print( Integer.toString( max ) ); + log.print( "\t" ); + log.print( Integer.toString( rio.getRemovedGeneTreeNodes().size() ) ); + log.print( "\t" ); + log.print( Integer.toString( stats.getN() ) ); + log.println(); + } + else { + System.out.println( "Gene tree internal nodes :\t" + rio.getIntNodesOfAnalyzedGeneTrees() ); + System.out.println( "Gene tree external nodes :\t" + rio.getExtNodesOfAnalyzedGeneTrees() ); + System.out.println( "Mean number of duplications :\t" + df.format( stats.arithmeticMean() ) + + "\t" + df.format( ( 100.0 * stats.arithmeticMean() ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + + "%\t(sd: " + df.format( stats.sampleStandardDeviation() ) + ")" ); + if ( stats.getN() > 3 ) { + System.out.println( "Median number of duplications :\t" + df.format( median ) + "\t" + + df.format( ( 100.0 * median ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" ); + } + System.out.println( "Minimum duplications :\t" + min + "\t" + + df.format( ( 100.0 * min ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" ); + System.out.println( "Maximum duplications :\t" + ( int ) max + "\t" + + df.format( ( 100.0 * max ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" ); + System.out.println( "Gene trees with median duplications :\t" + median_count + "\t" + + df.format( median_count_percentage ) + "%" ); + System.out.println( "Gene trees with minimum duplications:\t" + min_count + "\t" + + df.format( min_count_percentage ) + "%" ); + System.out.println( "Gene trees with maximum duplications:\t" + max_count + "\t" + + df.format( max_count_percentage ) + "%" ); + if ( algorithm == ALGORITHM.GSDIR ) { + System.out.println( "Removed ext gene tree nodes :\t" + + rio.getRemovedGeneTreeNodes().size() ); + } + } + } + catch ( final RIOException e ) { + ForesterUtil.fatalError( e.getLocalizedMessage() ); + } + catch ( final SDIException e ) { + ForesterUtil.fatalError( e.getLocalizedMessage() ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( e.getLocalizedMessage() ); + } + catch ( final OutOfMemoryError e ) { + ForesterUtil.outOfMemoryError( e ); + } + catch ( final Exception e ) { + ForesterUtil.unexpectedFatalError( e ); + } + catch ( final Error e ) { + ForesterUtil.unexpectedFatalError( e ); + } + } + + private static final void writeTable( final File table_outfile, + final int gene_trees_analyzed, + final IntMatrix m, + final boolean verbose ) + throws IOException { + final EasyWriter w = ForesterUtil.createEasyWriter( table_outfile ); + final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" ); + df.setDecimalSeparatorAlwaysShown( false ); + df.setRoundingMode( RoundingMode.HALF_UP ); + for( int i = 0; i < m.size(); ++i ) { + w.print( "\t" ); + w.print( m.getLabel( i ) ); + } + w.println(); + for( int x = 0; x < m.size(); ++x ) { + w.print( m.getLabel( x ) ); + for( int y = 0; y < m.size(); ++y ) { + w.print( "\t" ); + if ( x == y ) { + if ( m.get( x, y ) != gene_trees_analyzed ) { + ForesterUtil.unexpectedFatalError( "diagonal value is off" ); + } + w.print( "-" ); + } + else { + w.print( df.format( ( ( double ) m.get( x, y ) ) / gene_trees_analyzed ) ); + } + } + w.println(); + } + w.close(); + if ( verbose ) { + System.out.println( "Wrote table to :\t" + table_outfile.getCanonicalPath() ); + } + } + + private static final int writeOrtologGroups( final File outfile, + final double cutoff, + final int gene_trees_analyzed, + final IntMatrix m, + final boolean verbose, + final boolean calc_conly ) + throws IOException { + List> groups = new ArrayList>(); + BasicDescriptiveStatistics stats = new BasicDescriptiveStatistics(); + int below_075 = 0; + int below_05 = 0; + int below_025 = 0; + for( int x = 1; x < m.size(); ++x ) { + final String a = m.getLabel( x ); + for( int y = 0; y < x; ++y ) { + final String b = m.getLabel( y ); + final double s = ( ( double ) m.get( x, y ) ) / gene_trees_analyzed; + stats.addValue( s ); + if ( s < 0.75 ) { + below_075++; + if ( s < 0.5 ) { + below_05++; + if ( s < 0.25 ) { + below_025++; + } + } + } + if ( s >= cutoff ) { + boolean found = false; + for( final SortedSet group : groups ) { + if ( group.contains( a ) ) { + group.add( b ); + found = true; + } + if ( group.contains( b ) ) { + group.add( a ); + found = true; + } + } + if ( !found ) { + final SortedSet new_group = new TreeSet(); + new_group.add( a ); + new_group.add( b ); + groups.add( new_group ); + } + } + } + } + //Deal with singlets: + for( int x = 0; x < m.size(); ++x ) { + final String a = m.getLabel( x ); + boolean found = false; + for( final SortedSet group : groups ) { + if ( group.contains( a ) ) { + found = true; + break; + } + } + if ( !found ) { + final SortedSet new_group = new TreeSet(); + new_group.add( a ); + groups.add( new_group ); + } + } + if ( calc_conly ) { + return groups.size(); + } + final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" ); + df.setDecimalSeparatorAlwaysShown( false ); + df.setRoundingMode( RoundingMode.HALF_UP ); + final EasyWriter w = ForesterUtil.createEasyWriter( outfile ); + int counter = 1; + for( final SortedSet group : groups ) { + w.print( Integer.toString( counter++ ) ); + for( final String s : group ) { + w.print( "\t" ); + w.print( s ); + } + w.println(); + } + w.println(); + w.println( "# Cutoff\t" + df.format( cutoff ) ); + w.println(); + w.println( "# Orthology support statistics:" ); + if ( stats.getN() > 3 ) { + w.println( "# Median\t" + df.format( stats.median() ) ); + } + w.println( "# Mean\t" + df.format( stats.arithmeticMean() ) ); + if ( stats.getN() > 3 ) { + w.println( "# SD\t" + df.format( stats.sampleStandardDeviation() ) ); + } + w.println( "# Min\t" + df.format( stats.getMin() ) ); + w.println( "# Max\t" + df.format( stats.getMax() ) ); + w.println( "# Total\t" + df.format( stats.getN() ) ); + w.println( "# Below 0.75\t" + below_075 + "\t" + df.format( ( 100.0 * below_075 / stats.getN() ) ) + "%" ); + w.println( "# Below 0.5\t" + below_05 + "\t" + df.format( ( 100.0 * below_05 / stats.getN() ) ) + "%" ); + w.println( "# Below 0.25\t" + below_025 + "\t" + df.format( ( 100.0 * below_025 / stats.getN() ) ) + "%" ); + w.close(); + if ( verbose ) { + System.out.println( "Number of ortholog groups :\t" + groups.size() ); + System.out.println( "Wrote orthologs groups table to :\t" + outfile.getCanonicalPath() ); + } + return groups.size(); + } + + private static void writeTree( final Phylogeny p, final File f, final String comment ) throws IOException { + final PhylogenyWriter writer = new PhylogenyWriter(); + writer.toPhyloXML( f, p, 0 ); + if ( comment != null ) { + System.out.println( comment + f.getCanonicalPath() ); + } + } + + private static void writeLogFile( final File logfile, + final RIO rio, + final File species_tree_file, + final File gene_trees_file, + final File outtable, + final String prg_name, + final String prg_v, + final String prg_date, + final String f, + final boolean verbose ) + throws IOException { + final EasyWriter out = ForesterUtil.createEasyWriter( logfile ); + out.println( "# " + prg_name ); + out.println( "# version : " + prg_v ); + out.println( "# date : " + prg_date ); + out.println( "# based on: " + f ); + out.println( "# ----------------------------------" ); + out.println( "Gene trees :\t" + gene_trees_file.getCanonicalPath() ); + out.println( "Species tree :\t" + species_tree_file.getCanonicalPath() ); + out.println( "All vs all orthology table :\t" + outtable.getCanonicalPath() ); + out.flush(); + out.println( rio.getLog().toString() ); + out.close(); + if ( verbose ) { + System.out.println( "Wrote log to :\t" + logfile.getCanonicalPath() ); + } + } + + private final static Map obtainMapping( final File dir, final String prefix, final String suffix ) + throws IOException { + if ( !dir.exists() ) { + throw new IOException( "[" + dir + "] does not exist" ); + } + if ( !dir.isDirectory() ) { + throw new IOException( "[" + dir + "] is not a directory" ); + } + final File mapping_files[] = dir.listFiles( new FilenameFilter() { + + @Override + public boolean accept( final File dir, final String name ) { + return ( name.endsWith( suffix ) ); + } + } ); + String my_suffix = suffix; + boolean done = false; + do { + int matches = 0; + for( File file : mapping_files ) { + if ( file.getName().equals( my_suffix ) ) { + matches++; + } + } + if ( matches == 1) { + done = true; + } + else { + my_suffix = my_suffix.substring( 0, my_suffix.length() - 1); + } + } while (!done ); + + + if ( mapping_files.length == 0 ) { + throw new IOException( "file with prefix \"" + prefix + "\" and suffix \"" + suffix + "\" not found in [" + + dir + "] " ); + } + if ( mapping_files.length > 1 ) { + throw new IOException( "file with prefix \"" + prefix + "\" and suffix \"" + suffix + "\" not unique in [" + + dir + "] " ); + } + final BasicTable t = BasicTableParser.parse( mapping_files[ 0 ], '\t' ); + return t.getColumnsAsMap( 0, 1 ); + } +} -- 1.7.10.2