2 package org.forester.rio;
5 import java.io.FilenameFilter;
6 import java.io.IOException;
7 import java.math.RoundingMode;
8 import java.util.ArrayList;
11 import java.util.SortedSet;
12 import java.util.TreeSet;
14 import org.forester.datastructures.IntMatrix;
15 import org.forester.io.parsers.IteratingPhylogenyParser;
16 import org.forester.io.parsers.PhylogenyParser;
17 import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
18 import org.forester.io.parsers.nhx.NHXParser;
19 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
20 import org.forester.io.parsers.phyloxml.PhyloXmlParser;
21 import org.forester.io.parsers.util.ParserUtils;
22 import org.forester.io.writers.PhylogenyWriter;
23 import org.forester.phylogeny.Phylogeny;
24 import org.forester.rio.RIO.REROOTING;
25 import org.forester.sdi.SDIException;
26 import org.forester.sdi.SDIutil.ALGORITHM;
27 import org.forester.util.BasicDescriptiveStatistics;
28 import org.forester.util.BasicTable;
29 import org.forester.util.BasicTableParser;
30 import org.forester.util.EasyWriter;
31 import org.forester.util.ForesterUtil;
33 public final class RIOUtil {
35 public static final void executeAnalysis( final File gene_trees_file,
36 final File species_tree_file,
37 final File orthology_outtable,
38 final File orthology_groups_outfile,
40 final String outgroup,
41 final REROOTING rerooting,
44 final File return_species_tree,
45 final File return_min_dup_gene_tree,
46 final File return_median_dup_gene_tree,
47 final boolean transfer_taxonomy,
48 final ALGORITHM algorithm,
49 final boolean use_gene_trees_dir,
51 final double ortholog_group_cutoff ) {
54 boolean iterating = false;
55 final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
56 if ( p instanceof PhyloXmlParser ) {
57 rio = RIO.executeAnalysis( gene_trees_file,
70 if ( p instanceof NHXParser ) {
71 final NHXParser nhx = ( NHXParser ) p;
72 nhx.setReplaceUnderscores( false );
73 nhx.setIgnoreQuotes( true );
74 nhx.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
76 else if ( p instanceof NexusPhylogeniesParser ) {
77 final NexusPhylogeniesParser nex = ( NexusPhylogeniesParser ) p;
78 nex.setReplaceUnderscores( false );
79 nex.setIgnoreQuotes( true );
80 nex.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
83 throw new RuntimeException( "unknown parser type: " + p );
85 final IteratingPhylogenyParser ip = ( IteratingPhylogenyParser ) p;
86 ip.setSource( gene_trees_file );
87 rio = RIO.executeAnalysis( ip,
98 if ( !use_gene_trees_dir ) {
99 if ( algorithm == ALGORITHM.GSDIR ) {
100 System.out.println( "Taxonomy linking based on :\t" + rio.getGSDIRtaxCompBase() );
107 m = rio.getOrthologTable();
110 m = RIO.calculateOrthologTable( rio.getAnalyzedGeneTrees(), true );
112 final BasicDescriptiveStatistics stats = rio.getDuplicationsStatistics();
113 writeTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir );
114 final int ortholog_groups = writeOrtologGroups( orthology_groups_outfile,
115 ortholog_group_cutoff,
120 final int ortholog_groups_005 = writeOrtologGroups( null, 0.05, stats.getN(), m, false, true );
121 final int ortholog_groups_025 = writeOrtologGroups( null, 0.25, stats.getN(), m, false, true );
122 final int ortholog_groups_05 = writeOrtologGroups( null, 0.5, stats.getN(), m, false, true );
123 final int ortholog_groups_075 = writeOrtologGroups( null, 0.75, stats.getN(), m, false, true );
124 final int ortholog_groups_095 = writeOrtologGroups( null, 0.95, stats.getN(), m, false, true );
125 if ( ( algorithm != ALGORITHM.SDIR ) && ( logfile != null ) ) {
126 writeLogFile( logfile,
131 org.forester.application.rio.PRG_NAME,
132 org.forester.application.rio.PRG_VERSION,
133 org.forester.application.rio.PRG_DATE,
134 ForesterUtil.getForesterLibraryInformation(),
135 !use_gene_trees_dir );
137 if ( return_species_tree != null ) {
138 writeTree( rio.getSpeciesTree(),
140 use_gene_trees_dir ? null : "Wrote (stripped) species tree to :\t" );
142 if ( return_min_dup_gene_tree != null && rio.getMinDuplicationsGeneTree() != null ) {
143 final int min = ( int ) rio.getDuplicationsStatistics().getMin();
144 writeTree( rio.getMinDuplicationsGeneTree(),
145 new File( return_min_dup_gene_tree.toString() + min + ".xml" ),
146 use_gene_trees_dir ? null : "Wrote one min duplication gene tree :\t" );
148 if ( return_median_dup_gene_tree != null && rio.getDuplicationsToTreeMap() != null ) {
149 final int med = ( int ) rio.getDuplicationsStatistics().median();
150 writeTree( rio.getDuplicationsToTreeMap().get( med ),
151 new File( return_median_dup_gene_tree.toString() + med + ".xml" ),
152 use_gene_trees_dir ? null : "Wrote one med duplication gene tree :\t" );
154 final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.##" );
155 final int min = ( int ) stats.getMin();
156 final int max = ( int ) stats.getMax();
157 final int median = ( int ) stats.median();
160 int median_count = 0;
161 for( double d : stats.getData() ) {
162 if ( ( ( int ) d ) == min ) {
165 if ( ( ( int ) d ) == max ) {
168 if ( ( ( int ) d ) == median ) {
172 final double min_count_percentage = ( 100.0 * min_count ) / stats.getN();
173 final double max_count_percentage = ( 100.0 * max_count ) / stats.getN();
174 final double median_count_percentage = ( 100.0 * median_count ) / stats.getN();
175 if ( use_gene_trees_dir ) {
176 String name = gene_trees_file.getName();
177 if ( name.indexOf( "." ) > 0 ) {
178 name = name.substring( 0, name.lastIndexOf( "." ) );
182 log.print( Integer.toString( rio.getExtNodesOfAnalyzedGeneTrees() ) );
184 log.print( Integer.toString( ortholog_groups ) );
187 log.print( Integer.toString( ortholog_groups_005 ) );
189 log.print( Integer.toString( ortholog_groups_025 ) );
191 log.print( Integer.toString( ortholog_groups_05 ) );
193 log.print( Integer.toString( ortholog_groups_075 ) );
195 log.print( Integer.toString( ortholog_groups_095 ) );
198 if ( stats.getN() > 3 ) {
199 log.print( df.format( median ) );
205 log.print( df.format( stats.arithmeticMean() ) );
207 if ( stats.getN() > 3 ) {
208 log.print( df.format( stats.sampleStandardDeviation() ) );
214 log.print( Integer.toString( min ) );
216 log.print( Integer.toString( max ) );
218 log.print( Integer.toString( rio.getRemovedGeneTreeNodes().size() ) );
220 log.print( Integer.toString( stats.getN() ) );
224 System.out.println( "Gene tree internal nodes :\t" + rio.getIntNodesOfAnalyzedGeneTrees() );
225 System.out.println( "Gene tree external nodes :\t" + rio.getExtNodesOfAnalyzedGeneTrees() );
226 System.out.println( "Mean number of duplications :\t" + df.format( stats.arithmeticMean() )
227 + "\t" + df.format( ( 100.0 * stats.arithmeticMean() ) / rio.getIntNodesOfAnalyzedGeneTrees() )
228 + "%\t(sd: " + df.format( stats.sampleStandardDeviation() ) + ")" );
229 if ( stats.getN() > 3 ) {
230 System.out.println( "Median number of duplications :\t" + df.format( median ) + "\t"
231 + df.format( ( 100.0 * median ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
233 System.out.println( "Minimum duplications :\t" + min + "\t"
234 + df.format( ( 100.0 * min ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
235 System.out.println( "Maximum duplications :\t" + ( int ) max + "\t"
236 + df.format( ( 100.0 * max ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
237 System.out.println( "Gene trees with median duplications :\t" + median_count + "\t"
238 + df.format( median_count_percentage ) + "%" );
239 System.out.println( "Gene trees with minimum duplications:\t" + min_count + "\t"
240 + df.format( min_count_percentage ) + "%" );
241 System.out.println( "Gene trees with maximum duplications:\t" + max_count + "\t"
242 + df.format( max_count_percentage ) + "%" );
243 if ( algorithm == ALGORITHM.GSDIR ) {
244 System.out.println( "Removed ext gene tree nodes :\t"
245 + rio.getRemovedGeneTreeNodes().size() );
249 catch ( final RIOException e ) {
250 ForesterUtil.fatalError( e.getLocalizedMessage() );
252 catch ( final SDIException e ) {
253 ForesterUtil.fatalError( e.getLocalizedMessage() );
255 catch ( final IOException e ) {
256 ForesterUtil.fatalError( e.getLocalizedMessage() );
258 catch ( final OutOfMemoryError e ) {
259 ForesterUtil.outOfMemoryError( e );
261 catch ( final Exception e ) {
262 ForesterUtil.unexpectedFatalError( e );
264 catch ( final Error e ) {
265 ForesterUtil.unexpectedFatalError( e );
269 private static final void writeTable( final File table_outfile,
270 final int gene_trees_analyzed,
272 final boolean verbose )
274 final EasyWriter w = ForesterUtil.createEasyWriter( table_outfile );
275 final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" );
276 df.setDecimalSeparatorAlwaysShown( false );
277 df.setRoundingMode( RoundingMode.HALF_UP );
278 for( int i = 0; i < m.size(); ++i ) {
280 w.print( m.getLabel( i ) );
283 for( int x = 0; x < m.size(); ++x ) {
284 w.print( m.getLabel( x ) );
285 for( int y = 0; y < m.size(); ++y ) {
288 if ( m.get( x, y ) != gene_trees_analyzed ) {
289 ForesterUtil.unexpectedFatalError( "diagonal value is off" );
294 w.print( df.format( ( ( double ) m.get( x, y ) ) / gene_trees_analyzed ) );
301 System.out.println( "Wrote table to :\t" + table_outfile.getCanonicalPath() );
305 private static final int writeOrtologGroups( final File outfile,
307 final int gene_trees_analyzed,
309 final boolean verbose,
310 final boolean calc_conly )
312 List<SortedSet<String>> groups = new ArrayList<SortedSet<String>>();
313 BasicDescriptiveStatistics stats = new BasicDescriptiveStatistics();
317 for( int x = 1; x < m.size(); ++x ) {
318 final String a = m.getLabel( x );
319 for( int y = 0; y < x; ++y ) {
320 final String b = m.getLabel( y );
321 final double s = ( ( double ) m.get( x, y ) ) / gene_trees_analyzed;
333 boolean found = false;
334 for( final SortedSet<String> group : groups ) {
335 if ( group.contains( a ) ) {
339 if ( group.contains( b ) ) {
345 final SortedSet<String> new_group = new TreeSet<String>();
348 groups.add( new_group );
353 //Deal with singlets:
354 for( int x = 0; x < m.size(); ++x ) {
355 final String a = m.getLabel( x );
356 boolean found = false;
357 for( final SortedSet<String> group : groups ) {
358 if ( group.contains( a ) ) {
364 final SortedSet<String> new_group = new TreeSet<String>();
366 groups.add( new_group );
370 return groups.size();
372 final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" );
373 df.setDecimalSeparatorAlwaysShown( false );
374 df.setRoundingMode( RoundingMode.HALF_UP );
375 final EasyWriter w = ForesterUtil.createEasyWriter( outfile );
377 for( final SortedSet<String> group : groups ) {
378 w.print( Integer.toString( counter++ ) );
379 for( final String s : group ) {
386 w.println( "# Cutoff\t" + df.format( cutoff ) );
388 w.println( "# Orthology support statistics:" );
389 if ( stats.getN() > 3 ) {
390 w.println( "# Median\t" + df.format( stats.median() ) );
392 w.println( "# Mean\t" + df.format( stats.arithmeticMean() ) );
393 if ( stats.getN() > 3 ) {
394 w.println( "# SD\t" + df.format( stats.sampleStandardDeviation() ) );
396 w.println( "# Min\t" + df.format( stats.getMin() ) );
397 w.println( "# Max\t" + df.format( stats.getMax() ) );
398 w.println( "# Total\t" + df.format( stats.getN() ) );
399 w.println( "# Below 0.75\t" + below_075 + "\t" + df.format( ( 100.0 * below_075 / stats.getN() ) ) + "%" );
400 w.println( "# Below 0.5\t" + below_05 + "\t" + df.format( ( 100.0 * below_05 / stats.getN() ) ) + "%" );
401 w.println( "# Below 0.25\t" + below_025 + "\t" + df.format( ( 100.0 * below_025 / stats.getN() ) ) + "%" );
404 System.out.println( "Number of ortholog groups :\t" + groups.size() );
405 System.out.println( "Wrote orthologs groups table to :\t" + outfile.getCanonicalPath() );
407 return groups.size();
410 private static void writeTree( final Phylogeny p, final File f, final String comment ) throws IOException {
411 final PhylogenyWriter writer = new PhylogenyWriter();
412 writer.toPhyloXML( f, p, 0 );
413 if ( comment != null ) {
414 System.out.println( comment + f.getCanonicalPath() );
418 private static void writeLogFile( final File logfile,
420 final File species_tree_file,
421 final File gene_trees_file,
423 final String prg_name,
425 final String prg_date,
427 final boolean verbose )
429 final EasyWriter out = ForesterUtil.createEasyWriter( logfile );
430 out.println( "# " + prg_name );
431 out.println( "# version : " + prg_v );
432 out.println( "# date : " + prg_date );
433 out.println( "# based on: " + f );
434 out.println( "# ----------------------------------" );
435 out.println( "Gene trees :\t" + gene_trees_file.getCanonicalPath() );
436 out.println( "Species tree :\t" + species_tree_file.getCanonicalPath() );
437 out.println( "All vs all orthology table :\t" + outtable.getCanonicalPath() );
439 out.println( rio.getLog().toString() );
442 System.out.println( "Wrote log to :\t" + logfile.getCanonicalPath() );
446 private final static Map<String, String> obtainMapping( final File dir, final String prefix, final String suffix )
448 if ( !dir.exists() ) {
449 throw new IOException( "[" + dir + "] does not exist" );
451 if ( !dir.isDirectory() ) {
452 throw new IOException( "[" + dir + "] is not a directory" );
454 final File mapping_files[] = dir.listFiles( new FilenameFilter() {
457 public boolean accept( final File dir, final String name ) {
458 return ( name.endsWith( suffix ) );
461 String my_suffix = suffix;
462 boolean done = false;
465 for( File file : mapping_files ) {
466 if ( file.getName().equals( my_suffix ) ) {
474 my_suffix = my_suffix.substring( 0, my_suffix.length() - 1);
479 if ( mapping_files.length == 0 ) {
480 throw new IOException( "file with prefix \"" + prefix + "\" and suffix \"" + suffix + "\" not found in ["
483 if ( mapping_files.length > 1 ) {
484 throw new IOException( "file with prefix \"" + prefix + "\" and suffix \"" + suffix + "\" not unique in ["
487 final BasicTable<String> t = BasicTableParser.parse( mapping_files[ 0 ], '\t' );
488 return t.getColumnsAsMap( 0, 1 );