From: cmzmasek Date: Mon, 17 Apr 2017 15:53:55 +0000 (-0700) Subject: in progress... X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=62a1ce08fda061852cc3bd559e4b76ac47ac3647;p=jalview.git in progress... --- diff --git a/forester/java/src/org/forester/application/rio.java b/forester/java/src/org/forester/application/rio.java index 4ef4ec4..29dec92 100644 --- a/forester/java/src/org/forester/application/rio.java +++ b/forester/java/src/org/forester/application/rio.java @@ -2,10 +2,7 @@ // FORESTER -- software libraries and applications // for evolutionary biology research and applications. // -// Copyright (C) 2008-2009 Christian M. Zmasek -// Copyright (C) 2008-2009 Burnham Institute for Medical Research -// Copyright (C) 2000-2001 Washington University School of Medicine -// and Howard Hughes Medical Institute +// Copyright (C) 2017 Christian M. Zmasek // All rights reserved // // This library is free software; you can redistribute it and/or @@ -32,7 +29,10 @@ import java.io.IOException; import java.math.RoundingMode; import java.util.ArrayList; import java.util.Arrays; +import java.util.Iterator; import java.util.List; +import java.util.SortedSet; +import java.util.TreeSet; import org.forester.datastructures.IntMatrix; import org.forester.io.parsers.IteratingPhylogenyParser; @@ -56,26 +56,27 @@ import org.forester.util.ForesterUtil; public class rio { - final static private String PRG_NAME = "rio"; - final static private String PRG_VERSION = "4.000 beta 11"; - final static private String PRG_DATE = "170410"; - final static private String E_MAIL = "phyloxml@gmail.com"; - final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"; - final static private String HELP_OPTION_1 = "help"; - final static private String LOGFILE_SUFFIX = "_RIO_log.tsv"; - final static private String STRIPPED_SPECIES_TREE_SUFFIX = "_RIO_sst.xml"; - final static private String ORTHO_OUTTABLE_SUFFIX = "_RIO_o_table.tsv"; - final static private String OUT_GENE_TREE_SUFFIX = "_RIO_gene_tree.xml"; - final static private String HELP_OPTION_2 = "h"; - final static private String GT_FIRST = "f"; - final static private String GT_LAST = "l"; - final static private String REROOTING_OPT = "r"; - final static private String OUTGROUP = "o"; - final static private String RETURN_SPECIES_TREE = "s"; - final static private String RETURN_BEST_GENE_TREE = "g"; - final static private String USE_SDIR = "b"; - final static private String TRANSFER_TAXONOMY_OPTION = "t"; - final static private String GENE_TREES_SUFFIX_OPTION = "u"; + final static private String PRG_NAME = "rio"; + final static private String PRG_VERSION = "5.000"; + final static private String PRG_DATE = "170411"; + final static private String E_MAIL = "phyloxml@gmail.com"; + final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"; + final static private String HELP_OPTION_1 = "help"; + final static private String LOGFILE_SUFFIX = "_RIO_log.tsv"; + final static private String STRIPPED_SPECIES_TREE_SUFFIX = "_RIO_sst.xml"; + final static private String ORTHO_OUTTABLE_SUFFIX = "_RIO_orthologies.tsv"; + final static private String OUT_MIN_DUP_GENE_TREE_SUFFIX = "_RIO_gene_tree_min_dup_"; + final static private String OUT_MED_DUP_GENE_TREE_SUFFIX = "_RIO_gene_tree_med_dup_"; + final static private String ORTHOLOG_GROUPS_SUFFIX = "_RIO_ortholog_groups.tsv"; + final static private String HELP_OPTION_2 = "h"; + final static private String GT_FIRST = "f"; + final static private String GT_LAST = "l"; + final static private String REROOTING_OPT = "r"; + final static private String OUTGROUP = "o"; + final static private String USE_SDIR = "s"; + final static private String GENE_TREES_SUFFIX_OPTION = "g"; + final static private String ORTHOLOG_GROUPS_CUTOFF_OPTION = "c"; + final static private double ORTHOLOG_GROUPS_CUTOFF_DEFAULT = 0.5; public static void main( final String[] args ) { ForesterUtil.printProgramInformation( PRG_NAME, @@ -107,10 +108,8 @@ public class rio { allowed_options.add( REROOTING_OPT ); allowed_options.add( OUTGROUP ); allowed_options.add( USE_SDIR ); - allowed_options.add( RETURN_SPECIES_TREE ); - allowed_options.add( RETURN_BEST_GENE_TREE ); - allowed_options.add( TRANSFER_TAXONOMY_OPTION ); allowed_options.add( GENE_TREES_SUFFIX_OPTION ); + allowed_options.add( ORTHOLOG_GROUPS_CUTOFF_OPTION ); final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); if ( dissallowed_options.length() > 0 ) { ForesterUtil.fatalError( "unknown option(s): " + dissallowed_options ); @@ -167,7 +166,7 @@ public class rio { ForesterUtil.fatalError( "no value allowed for -" + USE_SDIR ); } sdir = true; - if ( logfile != null ) { + if ( !use_dir && logfile != null ) { ForesterUtil.fatalError( "no logfile output for SDIR algorithm" ); } } @@ -219,12 +218,12 @@ public class rio { int gt_first = RIO.DEFAULT_RANGE; int gt_last = RIO.DEFAULT_RANGE; if ( cla.isOptionSet( GT_FIRST ) ) { - if ( !cla.isOptionHasAValue( GT_FIRST ) ) { - ForesterUtil.fatalError( "no value for -" + GT_FIRST ); - } if ( sdir ) { ForesterUtil.fatalError( "no gene tree range option for SDIR algorithm" ); } + if ( !cla.isOptionHasAValue( GT_FIRST ) ) { + ForesterUtil.fatalError( "no value for -" + GT_FIRST ); + } try { gt_first = cla.getOptionValueAsInt( GT_FIRST ); } @@ -236,12 +235,12 @@ public class rio { } } if ( cla.isOptionSet( GT_LAST ) ) { - if ( !cla.isOptionHasAValue( GT_LAST ) ) { - ForesterUtil.fatalError( "no value for -" + GT_LAST ); - } if ( sdir ) { ForesterUtil.fatalError( "no gene tree range option for SDIR algorithm" ); } + if ( !cla.isOptionHasAValue( GT_LAST ) ) { + ForesterUtil.fatalError( "no value for -" + GT_LAST ); + } try { gt_last = cla.getOptionValueAsInt( GT_LAST ); } @@ -256,50 +255,30 @@ public class rio { ForesterUtil.fatalError( "attempt to set range (0-based) of gene to analyze to: from " + gt_first + " to " + gt_last ); } - File return_species_tree = null; - if ( !sdir && cla.isOptionSet( RETURN_SPECIES_TREE ) ) { - if ( use_dir ) { - ForesterUtil.fatalError( "no return species tree option when operating on gene trees directory" ); - } - if ( !cla.isOptionHasAValue( RETURN_SPECIES_TREE ) ) { - ForesterUtil.fatalError( "no value for -" + RETURN_SPECIES_TREE ); - } - final String s = cla.getOptionValueAsCleanString( RETURN_SPECIES_TREE ); - return_species_tree = new File( s ); - if ( return_species_tree.exists() ) { - ForesterUtil.fatalError( "\"" + return_species_tree + "\" already exists" ); + double ortholog_group_cutoff = ORTHOLOG_GROUPS_CUTOFF_DEFAULT; + if ( cla.isOptionSet( ORTHOLOG_GROUPS_CUTOFF_OPTION ) ) { + if ( sdir ) { + ForesterUtil.fatalError( "ortholog groups cutoff for SDIR algorithm" ); } - } - File return_gene_tree = null; - if ( !sdir && cla.isOptionSet( RETURN_BEST_GENE_TREE ) ) { - if ( use_dir ) { - ForesterUtil.fatalError( "no best gene tree return option when operating on gene trees directory" ); + if ( !cla.isOptionHasAValue( ORTHOLOG_GROUPS_CUTOFF_OPTION ) ) { + ForesterUtil.fatalError( "no value for -" + ORTHOLOG_GROUPS_CUTOFF_OPTION ); } - if ( !cla.isOptionHasAValue( RETURN_BEST_GENE_TREE ) ) { - ForesterUtil.fatalError( "no value for -" + RETURN_BEST_GENE_TREE ); + try { + ortholog_group_cutoff = cla.getOptionValueAsDouble( ORTHOLOG_GROUPS_CUTOFF_OPTION ); } - final String s = cla.getOptionValueAsCleanString( RETURN_BEST_GENE_TREE ); - return_gene_tree = new File( s ); - if ( return_gene_tree.exists() ) { - ForesterUtil.fatalError( "\"" + return_gene_tree + "\" already exists" ); + catch ( final IOException e ) { + ForesterUtil.fatalError( "could not parse double for -" + ORTHOLOG_GROUPS_CUTOFF_OPTION + " option" ); } - } - boolean transfer_taxonomy = false; - if ( !sdir && cla.isOptionSet( TRANSFER_TAXONOMY_OPTION ) ) { - if ( use_dir ) { - ForesterUtil.fatalError( "no transferring taxonomy option when operating on gene trees directory" ); + if ( ortholog_group_cutoff < 0 ) { + ForesterUtil.fatalError( "attempt to set ortholog groups cutoff to: " + ortholog_group_cutoff ); } - if ( return_gene_tree == null ) { - ForesterUtil.fatalError( "no point in transferring taxonomy data without returning best gene tree" ); + if ( ortholog_group_cutoff > 1 ) { + ForesterUtil.fatalError( "attempt to set ortholog groups cutoff to: " + ortholog_group_cutoff ); } - transfer_taxonomy = true; } if ( !use_dir ) { ForesterUtil.fatalErrorIfFileNotReadable( gene_trees_file ); } - else { - transfer_taxonomy = true; - } final String gene_trees_suffix; if ( cla.isOptionSet( GENE_TREES_SUFFIX_OPTION ) ) { if ( !use_dir ) { @@ -340,6 +319,7 @@ public class rio { if ( logfile != null ) { System.out.println( "Logfile :\t" + logfile ); } + System.out.println( "Ortholog groups cutoff :\t" + ortholog_group_cutoff ); if ( gt_first != RIO.DEFAULT_RANGE ) { System.out.println( "First gene tree to analyze :\t" + gt_first ); } @@ -372,13 +352,6 @@ public class rio { else { System.out.println( "Non binary species tree :\tdisallowed" ); } - if ( return_species_tree != null ) { - System.out.println( "Write used species tree to :\t" + return_species_tree ); - } - if ( return_gene_tree != null ) { - System.out.println( "Write best gene tree to :\t" + return_gene_tree ); - System.out.println( "Transfer taxonomic data :\t" + transfer_taxonomy ); - } time = System.currentTimeMillis(); final ALGORITHM algorithm; if ( sdir ) { @@ -459,6 +432,10 @@ public class rio { log.print( "\t" ); log.print( logfile.getCanonicalPath() ); log.println(); + log.print( "# Ortholog groups cutoff" ); + log.print( "\t" ); + log.print( Double.toString( ortholog_group_cutoff ) ); + log.println(); if ( gt_first != RIO.DEFAULT_RANGE ) { log.print( "# First gene tree to analyze" ); log.print( "\t" ); @@ -489,12 +466,24 @@ public class rio { log.print( "\t" ); log.print( "EXT NODES" ); log.print( "\t" ); - log.print( "MEAN DUP" ); + log.print( ortholog_group_cutoff + " O GROUPS" ); log.print( "\t" ); - log.print( "MEAN DUP SD" ); + log.print( "0.05 O GROUPS" ); + log.print( "\t" ); + log.print( "0.25 O GROUPS" ); + log.print( "\t" ); + log.print( "0.5 O GROUPS" ); + log.print( "\t" ); + log.print( "0.75 O GROUPS" ); + log.print( "\t" ); + log.print( "0.95 O GROUPS" ); log.print( "\t" ); log.print( "MEDIAN DUP" ); log.print( "\t" ); + log.print( "MEAN DUP" ); + log.print( "\t" ); + log.print( "MEAN DUP SD" ); + log.print( "\t" ); log.print( "MIN DUP" ); log.print( "\t" ); log.print( "MAX DUP" ); @@ -521,6 +510,7 @@ public class rio { executeAnalysis( gf, species_tree_file, new File( outdir.getCanonicalFile() + "/" + outname + ORTHO_OUTTABLE_SUFFIX ), + new File( outdir.getCanonicalFile() + "/" + outname + ORTHOLOG_GROUPS_SUFFIX ), new File( outdir.getCanonicalFile() + "/" + outname + LOGFILE_SUFFIX ), outgroup, rerooting, @@ -528,11 +518,15 @@ public class rio { gt_last, new File( outdir.getCanonicalFile() + "/" + outname + STRIPPED_SPECIES_TREE_SUFFIX ), - new File( outdir.getCanonicalFile() + "/" + outname + OUT_GENE_TREE_SUFFIX ), - transfer_taxonomy, + new File( outdir.getCanonicalFile() + "/" + outname + + OUT_MIN_DUP_GENE_TREE_SUFFIX ), + new File( outdir.getCanonicalFile() + "/" + outname + + OUT_MED_DUP_GENE_TREE_SUFFIX ), + true, algorithm, true, - log ); + log, + ortholog_group_cutoff ); } catch ( IOException e ) { ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() ); @@ -543,20 +537,27 @@ public class rio { System.out.println(); } else { + String outname = orthology_outtable.toString(); + if ( outname.indexOf( "." ) > 0 ) { + outname = outname.substring( 0, outname.lastIndexOf( "." ) ); + } executeAnalysis( gene_trees_file, species_tree_file, orthology_outtable, + new File( outname + ORTHOLOG_GROUPS_SUFFIX ), logfile, outgroup, rerooting, gt_first, gt_last, - return_species_tree, - return_gene_tree, - transfer_taxonomy, + new File( outname + STRIPPED_SPECIES_TREE_SUFFIX ), + new File( outname + OUT_MIN_DUP_GENE_TREE_SUFFIX ), + new File( outname + OUT_MED_DUP_GENE_TREE_SUFFIX ), + algorithm == ALGORITHM.GSDIR, algorithm, false, - null ); + null, + ortholog_group_cutoff ); } if ( !use_dir ) { time = System.currentTimeMillis() - time; @@ -578,17 +579,20 @@ public class rio { private static final void executeAnalysis( final File gene_trees_file, final File species_tree_file, final File orthology_outtable, + final File orthology_groups_outfile, final File logfile, final String outgroup, final REROOTING rerooting, final int gt_first, final int gt_last, final File return_species_tree, - final File return_gene_tree, + final File return_min_dup_gene_tree, + final File return_median_dup_gene_tree, final boolean transfer_taxonomy, final ALGORITHM algorithm, final boolean use_gene_trees_dir, - final EasyWriter log ) { + final EasyWriter log, + final double ortholog_group_cutoff ) { try { final RIO rio; boolean iterating = false; @@ -649,6 +653,17 @@ public class rio { } final BasicDescriptiveStatistics stats = rio.getDuplicationsStatistics(); writeTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir ); + final int ortholog_groups = writeOrtologGroups( orthology_groups_outfile, + ortholog_group_cutoff, + stats.getN(), + m, + !use_gene_trees_dir, + false ); + final int ortholog_groups_005 = writeOrtologGroups( null, 0.05, stats.getN(), m, false, true ); + final int ortholog_groups_025 = writeOrtologGroups( null, 0.25, stats.getN(), m, false, true ); + final int ortholog_groups_05 = writeOrtologGroups( null, 0.5, stats.getN(), m, false, true ); + final int ortholog_groups_075 = writeOrtologGroups( null, 0.75, stats.getN(), m, false, true ); + final int ortholog_groups_095 = writeOrtologGroups( null, 0.95, stats.getN(), m, false, true ); if ( ( algorithm != ALGORITHM.SDIR ) && ( logfile != null ) ) { writeLogFile( logfile, rio, @@ -666,11 +681,18 @@ public class rio { return_species_tree, use_gene_trees_dir ? null : "Wrote (stripped) species tree to :\t" ); } - if ( return_gene_tree != null ) { + if ( return_min_dup_gene_tree != null && rio.getMinDuplicationsGeneTree() != null ) { + final int min = ( int ) rio.getDuplicationsStatistics().getMin(); writeTree( rio.getMinDuplicationsGeneTree(), - return_gene_tree, + new File( return_min_dup_gene_tree.toString() + min + ".xml" ), use_gene_trees_dir ? null : "Wrote one min duplication gene tree :\t" ); } + if ( return_median_dup_gene_tree != null && rio.getDuplicationsToTreeMap() != null ) { + final int med = ( int ) rio.getDuplicationsStatistics().median(); + writeTree( rio.getDuplicationsToTreeMap().get( med ), + new File( return_median_dup_gene_tree.toString() + med + ".xml" ), + use_gene_trees_dir ? null : "Wrote one med duplication gene tree :\t" ); + } final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.##" ); final int min = ( int ) stats.getMin(); final int max = ( int ) stats.getMax(); @@ -701,9 +723,19 @@ public class rio { log.print( "\t" ); log.print( Integer.toString( rio.getExtNodesOfAnalyzedGeneTrees() ) ); log.print( "\t" ); - log.print( df.format( stats.arithmeticMean() ) ); + log.print( Integer.toString( ortholog_groups ) ); + // + log.print( "\t" ); + log.print( Integer.toString( ortholog_groups_005 ) ); log.print( "\t" ); - log.print( df.format( stats.sampleStandardDeviation() ) ); + log.print( Integer.toString( ortholog_groups_025 ) ); + log.print( "\t" ); + log.print( Integer.toString( ortholog_groups_05 ) ); + log.print( "\t" ); + log.print( Integer.toString( ortholog_groups_075 ) ); + log.print( "\t" ); + log.print( Integer.toString( ortholog_groups_095 ) ); + // log.print( "\t" ); if ( stats.getN() > 3 ) { log.print( df.format( median ) ); @@ -712,6 +744,15 @@ public class rio { log.print( "" ); } log.print( "\t" ); + log.print( df.format( stats.arithmeticMean() ) ); + log.print( "\t" ); + if ( stats.getN() > 3 ) { + log.print( df.format( stats.sampleStandardDeviation() ) ); + } + else { + log.print( "" ); + } + log.print( "\t" ); log.print( Integer.toString( min ) ); log.print( "\t" ); log.print( Integer.toString( max ) ); @@ -741,7 +782,10 @@ public class rio { + df.format( min_count_percentage ) + "%" ); System.out.println( "Gene trees with maximum duplications:\t" + max_count + "\t" + df.format( max_count_percentage ) + "%" ); - System.out.println( "Removed ext gene tree nodes:\t" + rio.getRemovedGeneTreeNodes().size() ); + if ( algorithm == ALGORITHM.GSDIR ) { + System.out.println( "Removed ext gene tree nodes :\t" + + rio.getRemovedGeneTreeNodes().size() ); + } } } catch ( final RIOException e ) { @@ -776,21 +820,18 @@ public class rio { System.out.println( " Options" ); System.out.println( " -" + GT_FIRST + "= : first gene tree to analyze (0-based index)" ); System.out.println( " -" + GT_LAST + "= : last gene tree to analyze (0-based index)" ); + System.out.println( " -" + ORTHOLOG_GROUPS_CUTOFF_OPTION + + "= : cutoff value for ortholog groups (default: " + ORTHOLOG_GROUPS_CUTOFF_DEFAULT + ")" ); System.out.println( " -" + REROOTING_OPT + "=: re-rooting method for gene trees, possible values or 'none', 'midpoint'," ); System.out.println( " or 'outgroup' (default: by minizming duplications)" ); System.out.println( " -" + OUTGROUP + "= : for rooting by outgroup, name of outgroup (external gene tree node)" ); - System.out - .println( " -" + RETURN_SPECIES_TREE + "= : to write the (stripped) species tree to file" ); - System.out.println( " -" + RETURN_BEST_GENE_TREE - + "= : to write (one) minimal duplication gene tree to file" ); - System.out.println( " -" + TRANSFER_TAXONOMY_OPTION - + " : to transfer taxonomic data from species tree to returned minimal duplication gene tree\n" - + " (if -" + RETURN_BEST_GENE_TREE + " option is used)" ); System.out.println( " -" + USE_SDIR + " : to use SDIR instead of GSDIR (faster, but non-binary species trees are" ); System.out.println( " disallowed, as are most options)" ); + System.out.println( " -" + GENE_TREES_SUFFIX_OPTION + + "= : suffix for gene trees when operating on gene tree directories (default: .mlt)" ); System.out.println(); System.out.println( " Formats" ); System.out @@ -802,9 +843,10 @@ public class rio { System.out.println( " in the species tree." ); System.out.println(); System.out.println( " Examples" ); + System.out.println( " rio -s gene_trees.nh species.xml outtable.tsv" ); System.out.println( " rio gene_trees.nh species.xml outtable.tsv log.txt" ); - System.out - .println( " rio -t -f=10 -l=100 -r=none -g=out_gene_tree.xml -s=stripped_species.xml gene_trees.xml species.xml outtable.tsv log.txt" ); + System.out.println( " rio -c=0.9 -f=10 -l=100 -r=none gene_trees.xml species.xml outtable.tsv log.txt" ); + System.out.println( " rio -g=.xml gene_trees_dir species.xml out_dir log.tsv" ); System.out.println(); System.exit( -1 ); } @@ -837,10 +879,10 @@ public class rio { } } - private static void writeTable( final File table_outfile, - final int gene_trees_analyzed, - final IntMatrix m, - final boolean verbose ) + private static final void writeTable( final File table_outfile, + final int gene_trees_analyzed, + final IntMatrix m, + final boolean verbose ) throws IOException { final EasyWriter w = ForesterUtil.createEasyWriter( table_outfile ); final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" ); @@ -873,6 +915,111 @@ public class rio { } } + private static final int writeOrtologGroups( final File outfile, + final double cutoff, + final int gene_trees_analyzed, + final IntMatrix m, + final boolean verbose, + final boolean calc_conly ) + throws IOException { + List> groups = new ArrayList>(); + BasicDescriptiveStatistics stats = new BasicDescriptiveStatistics(); + int below_075 = 0; + int below_05 = 0; + int below_025 = 0; + for( int x = 1; x < m.size(); ++x ) { + final String a = m.getLabel( x ); + for( int y = 0; y < x; ++y ) { + final String b = m.getLabel( y ); + final double s = ( ( double ) m.get( x, y ) ) / gene_trees_analyzed; + stats.addValue( s ); + if ( s < 0.75 ) { + below_075++; + if ( s < 0.5 ) { + below_05++; + if ( s < 0.25 ) { + below_025++; + } + } + } + if ( s >= cutoff ) { + boolean found = false; + for( final SortedSet group : groups ) { + if ( group.contains( a ) ) { + group.add( b ); + found = true; + } + if ( group.contains( b ) ) { + group.add( a ); + found = true; + } + } + if ( !found ) { + final SortedSet new_group = new TreeSet(); + new_group.add( a ); + new_group.add( b ); + groups.add( new_group ); + } + } + } + } + //Deal with singlets: + for( int x = 0; x < m.size(); ++x ) { + final String a = m.getLabel( x ); + boolean found = false; + for( final SortedSet group : groups ) { + if ( group.contains( a ) ) { + found = true; + break; + } + } + if ( !found ) { + final SortedSet new_group = new TreeSet(); + new_group.add( a ); + groups.add( new_group ); + } + } + if ( calc_conly ) { + return groups.size(); + } + final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" ); + df.setDecimalSeparatorAlwaysShown( false ); + df.setRoundingMode( RoundingMode.HALF_UP ); + final EasyWriter w = ForesterUtil.createEasyWriter( outfile ); + int counter = 1; + for( final SortedSet group : groups ) { + w.print( Integer.toString( counter++ ) ); + for( final String s : group ) { + w.print( "\t" ); + w.print( s ); + } + w.println(); + } + w.println(); + w.println( "# Cutoff\t" + df.format( cutoff ) ); + w.println(); + w.println( "# Orthology support statistics:" ); + if ( stats.getN() > 3 ) { + w.println( "# Median\t" + df.format( stats.median() ) ); + } + w.println( "# Mean\t" + df.format( stats.arithmeticMean() ) ); + if ( stats.getN() > 3 ) { + w.println( "# SD\t" + df.format( stats.sampleStandardDeviation() ) ); + } + w.println( "# Min\t" + df.format( stats.getMin() ) ); + w.println( "# Max\t" + df.format( stats.getMax() ) ); + w.println( "# Total\t" + df.format( stats.getN() ) ); + w.println( "# Below 0.75\t" + below_075 + "\t" + df.format( ( 100.0 * below_075 / stats.getN() ) ) + "%" ); + w.println( "# Below 0.5\t" + below_05 + "\t" + df.format( ( 100.0 * below_05 / stats.getN() ) ) + "%" ); + w.println( "# Below 0.25\t" + below_025 + "\t" + df.format( ( 100.0 * below_025 / stats.getN() ) ) + "%" ); + w.close(); + if ( verbose ) { + System.out.println( "Number of ortholog groups :\t" + groups.size() ); + System.out.println( "Wrote orthologs groups table to :\t" + outfile.getCanonicalPath() ); + } + return groups.size(); + } + private static void writeTree( final Phylogeny p, final File f, final String comment ) throws IOException { final PhylogenyWriter writer = new PhylogenyWriter(); writer.toPhyloXML( f, p, 0 ); diff --git a/forester/java/src/org/forester/archaeopteryx/AptxConstants.java b/forester/java/src/org/forester/archaeopteryx/AptxConstants.java index d5db636..cf20424 100644 --- a/forester/java/src/org/forester/archaeopteryx/AptxConstants.java +++ b/forester/java/src/org/forester/archaeopteryx/AptxConstants.java @@ -106,6 +106,6 @@ public final class AptxConstants { public final static Color DOMAIN_LABEL_COLOR_FOR_PDF = new Color( 150, 150, 150 ); - final static short DEFAULT_NODE_SHAPE_SIZE_DEFAULT = 4; + final static short DEFAULT_NODE_SHAPE_SIZE_DEFAULT = 7; static final int MAX_LENGTH_FOR_COLLAPSED_NAME = 8; } diff --git a/forester/java/src/org/forester/rio/RIO.java b/forester/java/src/org/forester/rio/RIO.java index 83bc203..f205640 100644 --- a/forester/java/src/org/forester/rio/RIO.java +++ b/forester/java/src/org/forester/rio/RIO.java @@ -35,6 +35,7 @@ import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; @@ -79,6 +80,7 @@ public final class RIO { private final REROOTING _rerooting; private final Phylogeny _species_tree; private Phylogeny _min_dub_gene_tree; + private Map _dup_to_tree_map; private RIO( final IteratingPhylogenyParser p, final Phylogeny species_tree, @@ -480,18 +482,47 @@ public final class RIO { } dups = gsdi.getDuplicationsSum(); } + assigned_tree.setRerootable( false ); + double new_dist = -1; if ( ( i == 0 ) || ( dups < _duplications_stats.getMin() ) ) { _min_dub_gene_tree = assigned_tree; - _min_dub_gene_tree.setRerootable( false ); + } + else if ( dups == _duplications_stats.getMin() ) { + new_dist = PhylogenyMethods.calculateMaxDistanceToRoot( assigned_tree ); + if ( new_dist < PhylogenyMethods.calculateMaxDistanceToRoot( _min_dub_gene_tree ) ) { + _min_dub_gene_tree = assigned_tree; + } + } + if ( _dup_to_tree_map == null ) { + _dup_to_tree_map = new HashMap(); + } + if ( !_dup_to_tree_map.containsKey( dups ) ) { + _dup_to_tree_map.put( dups, assigned_tree ); + } + else { + if ( new_dist == -1 ) { + new_dist = PhylogenyMethods.calculateMaxDistanceToRoot( assigned_tree ); + } + if ( new_dist < PhylogenyMethods.calculateMaxDistanceToRoot( _dup_to_tree_map.get( dups ) ) ) { + _dup_to_tree_map.put( dups, assigned_tree ); + } } _duplications_stats.addValue( dups ); return assigned_tree; } + final public Map getDuplicationsToTreeMap() { + return _dup_to_tree_map; + } + private final Phylogeny performOrthologInferenceBySDI( final Phylogeny gene_tree, final Phylogeny species_tree ) throws SDIException { final SDIR sdir = new SDIR(); - return sdir.infer( gene_tree, species_tree, false, true, true, true, 1 )[ 0 ]; + final Phylogeny r = sdir.infer( gene_tree, species_tree, false, true, true, true, 1 )[ 0 ]; + r.setRerootable( false ); + final int dups = sdir.getMinimalDuplications(); + _duplications_stats.addValue( dups ); + return r; } private final void postLog( final Phylogeny species_tree, final int first, final int last ) { diff --git a/forester/java/src/org/forester/util/EasyWriter.java b/forester/java/src/org/forester/util/EasyWriter.java index f07edab..0cbe697 100644 --- a/forester/java/src/org/forester/util/EasyWriter.java +++ b/forester/java/src/org/forester/util/EasyWriter.java @@ -16,6 +16,7 @@ public final class EasyWriter extends BufferedWriter { write( s ); write( LINE_SEPARATOR ); } + public void println() throws IOException { write( LINE_SEPARATOR ); @@ -24,4 +25,5 @@ public final class EasyWriter extends BufferedWriter { public void print( final String s ) throws IOException { write( s ); } + }