2 package org.forester.rio;
5 import java.io.FileNotFoundException;
6 import java.io.IOException;
7 import java.math.RoundingMode;
8 import java.util.ArrayList;
9 import java.util.Iterator;
10 import java.util.List;
12 import java.util.Map.Entry;
13 import java.util.SortedMap;
14 import java.util.SortedSet;
15 import java.util.TreeSet;
17 import org.forester.datastructures.IntMatrix;
18 import org.forester.io.parsers.IteratingPhylogenyParser;
19 import org.forester.io.parsers.PhylogenyParser;
20 import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
21 import org.forester.io.parsers.nhx.NHXParser;
22 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
23 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
24 import org.forester.io.parsers.phyloxml.PhyloXmlParser;
25 import org.forester.io.parsers.util.ParserUtils;
26 import org.forester.io.writers.PhylogenyWriter;
27 import org.forester.phylogeny.Phylogeny;
28 import org.forester.phylogeny.PhylogenyMethods;
29 import org.forester.phylogeny.PhylogenyNode;
30 import org.forester.phylogeny.PhylogenyMethods.DESCENDANT_SORT_PRIORITY;
31 import org.forester.phylogeny.data.Sequence;
32 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
33 import org.forester.phylogeny.factories.PhylogenyFactory;
34 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
35 import org.forester.rio.RIO.REROOTING;
36 import org.forester.sdi.GSDIR;
37 import org.forester.sdi.SDIException;
38 import org.forester.sdi.SDIutil;
39 import org.forester.sdi.SDIutil.ALGORITHM;
40 import org.forester.util.BasicDescriptiveStatistics;
41 import org.forester.util.BasicTable;
42 import org.forester.util.BasicTableParser;
43 import org.forester.util.EasyWriter;
44 import org.forester.util.ForesterUtil;
46 public final class RIOUtil {
48 public final static String STRIPPED_SPECIES_TREE_SUFFIX = "_RIO_stripped_species_tree.xml";
49 public final static String ORTHO_OUTTABLE_SUFFIX = "_RIO_orthologies.tsv";
50 public final static String ORTHO_OUTTABLE_WITH_MAP_SUFFIX = "_RIO_orthologies_ext_map.tsv";
51 public final static String OUT_MIN_DUP_GENE_TREE_SUFFIX = "_RIO_gene_tree_min_dup_";
52 public final static String OUT_MED_DUP_GENE_TREE_SUFFIX = "_RIO_gene_tree_med_dup_";
53 public final static String BEST_TREE_SUFFIX = "_RIO_consensus_gene_tree_dup_";
54 public final static String ORTHOLOG_GROUPS_SUFFIX = "_RIO_ortholog_groups.tsv";
55 public final static String LOGFILE_SUFFIX = "_RIO_log.tsv";
57 public static final void executeAnalysis( final File gene_trees_file,
58 final File species_tree_file,
59 final File orthology_outtable,
60 final File orthology_outtable_with_mappings,
61 final File orthology_groups_outfile,
63 final String outgroup,
64 final REROOTING rerooting,
67 final File return_species_tree,
68 final File return_min_dup_gene_tree,
69 final File return_median_dup_gene_tree,
70 final boolean transfer_taxonomy,
71 final ALGORITHM algorithm,
72 final boolean use_gene_trees_dir,
74 final double ortholog_group_cutoff,
75 final boolean perform_id_mapping,
76 final File id_mapping_dir,
77 final String id_mapping_suffix,
78 final boolean perform_gsdir_on_best_tree,
80 final File best_trees_indir,
81 final String best_trees_suffix ) {
83 final SortedMap<String, String> id_map;
84 if ( perform_id_mapping ) {
85 id_map = obtainMapping( id_mapping_dir, gene_trees_file.getName(), id_mapping_suffix );
91 boolean iterating = false;
92 final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
93 if ( p instanceof PhyloXmlParser ) {
94 rio = RIO.executeAnalysis( gene_trees_file,
107 if ( p instanceof NHXParser ) {
108 final NHXParser nhx = ( NHXParser ) p;
109 nhx.setReplaceUnderscores( false );
110 nhx.setIgnoreQuotes( true );
111 nhx.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
113 else if ( p instanceof NexusPhylogeniesParser ) {
114 final NexusPhylogeniesParser nex = ( NexusPhylogeniesParser ) p;
115 nex.setReplaceUnderscores( false );
116 nex.setIgnoreQuotes( true );
117 nex.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
120 throw new RuntimeException( "unknown parser type: " + p );
122 final IteratingPhylogenyParser ip = ( IteratingPhylogenyParser ) p;
123 ip.setSource( gene_trees_file );
124 rio = RIO.executeAnalysis( ip,
135 if ( !use_gene_trees_dir ) {
136 if ( algorithm == ALGORITHM.GSDIR ) {
137 System.out.println( "Taxonomy linking based on :\t" + rio.getGSDIRtaxCompBase() );
142 m = rio.getOrthologTable();
145 m = RIO.calculateOrthologTable( rio.getAnalyzedGeneTrees(), true );
147 final GSDIR gsdir_for_best_tree;
148 if ( perform_gsdir_on_best_tree ) {
149 gsdir_for_best_tree = analyzeConsensusTree( gene_trees_file,
157 gsdir_for_best_tree = null;
159 final BasicDescriptiveStatistics stats = rio.getDuplicationsStatistics();
160 if ( perform_id_mapping ) {
161 writeOrthologyTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir, id_map, true );
162 writeOrthologyTable( orthology_outtable_with_mappings,
170 writeOrthologyTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir, null, false );
172 final int ortholog_groups = writeOrtologGroups( orthology_groups_outfile,
173 ortholog_group_cutoff,
179 final int ortholog_groups_005 = writeOrtologGroups( null, 0.05, stats.getN(), m, false, true, null );
180 final int ortholog_groups_025 = writeOrtologGroups( null, 0.25, stats.getN(), m, false, true, null );
181 final int ortholog_groups_05 = writeOrtologGroups( null, 0.5, stats.getN(), m, false, true, null );
182 final int ortholog_groups_075 = writeOrtologGroups( null, 0.75, stats.getN(), m, false, true, null );
183 final int ortholog_groups_095 = writeOrtologGroups( null, 0.95, stats.getN(), m, false, true, null );
184 if ( ( algorithm != ALGORITHM.SDIR ) && ( logfile != null ) ) {
185 writeLogFile( logfile,
190 org.forester.application.rio.PRG_NAME,
191 org.forester.application.rio.PRG_VERSION,
192 org.forester.application.rio.PRG_DATE,
193 ForesterUtil.getForesterLibraryInformation(),
194 !use_gene_trees_dir );
196 if ( return_species_tree != null ) {
197 writeTree( rio.getSpeciesTree(),
199 use_gene_trees_dir ? null : "Wrote (stripped) species tree to :\t",
202 if ( return_min_dup_gene_tree != null && rio.getMinDuplicationsGeneTree() != null ) {
203 final int min = ( int ) rio.getDuplicationsStatistics().getMin();
204 writeTree( rio.getMinDuplicationsGeneTree(),
205 new File( return_min_dup_gene_tree.toString() + min + ".xml" ),
206 use_gene_trees_dir ? null : "Wrote one min duplication gene tree :\t",
209 if ( return_median_dup_gene_tree != null && rio.getDuplicationsToTreeMap() != null ) {
210 final int med = ( int ) rio.getDuplicationsStatistics().median();
211 writeTree( rio.getDuplicationsToTreeMap().get( med ),
212 new File( return_median_dup_gene_tree.toString() + med + ".xml" ),
213 use_gene_trees_dir ? null : "Wrote one med duplication gene tree :\t",
216 final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.##" );
217 final int min = ( int ) stats.getMin();
218 final int max = ( int ) stats.getMax();
219 final int median = ( int ) stats.median();
222 int median_count = 0;
223 for( double d : stats.getData() ) {
224 if ( ( ( int ) d ) == min ) {
227 if ( ( ( int ) d ) == max ) {
230 if ( ( ( int ) d ) == median ) {
234 final double min_count_percentage = ( 100.0 * min_count ) / stats.getN();
235 final double max_count_percentage = ( 100.0 * max_count ) / stats.getN();
236 final double median_count_percentage = ( 100.0 * median_count ) / stats.getN();
237 if ( use_gene_trees_dir ) {
238 String name = gene_trees_file.getName();
239 if ( name.indexOf( "." ) > 0 ) {
240 name = name.substring( 0, name.lastIndexOf( "." ) );
244 log.print( Integer.toString( rio.getExtNodesOfAnalyzedGeneTrees() ) );
246 log.print( Integer.toString( ortholog_groups ) );
249 log.print( Integer.toString( ortholog_groups_005 ) );
251 log.print( Integer.toString( ortholog_groups_025 ) );
253 log.print( Integer.toString( ortholog_groups_05 ) );
255 log.print( Integer.toString( ortholog_groups_075 ) );
257 log.print( Integer.toString( ortholog_groups_095 ) );
261 log.print( Integer.toString( gsdir_for_best_tree.getMinDuplicationsSum() ) );
263 log.print( df.format( median - gsdir_for_best_tree.getMinDuplicationsSum() ) );
267 if ( stats.getN() > 3 ) {
268 log.print( df.format( median ) );
274 log.print( df.format( stats.arithmeticMean() ) );
276 if ( stats.getN() > 3 ) {
277 log.print( df.format( stats.sampleStandardDeviation() ) );
283 log.print( Integer.toString( min ) );
285 log.print( Integer.toString( max ) );
287 log.print( Integer.toString( rio.getRemovedGeneTreeNodes().size() ) );
289 log.print( Integer.toString( stats.getN() ) );
293 System.out.println( "Gene tree internal nodes :\t" + rio.getIntNodesOfAnalyzedGeneTrees() );
294 System.out.println( "Gene tree external nodes :\t" + rio.getExtNodesOfAnalyzedGeneTrees() );
295 System.out.println( "Mean number of duplications :\t" + df.format( stats.arithmeticMean() )
296 + "\t" + df.format( ( 100.0 * stats.arithmeticMean() ) / rio.getIntNodesOfAnalyzedGeneTrees() )
297 + "%\t(sd: " + df.format( stats.sampleStandardDeviation() ) + ")" );
298 if ( stats.getN() > 3 ) {
299 System.out.println( "Median number of duplications :\t" + df.format( median ) + "\t"
300 + df.format( ( 100.0 * median ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
302 System.out.println( "Minimum duplications :\t" + min + "\t"
303 + df.format( ( 100.0 * min ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
304 System.out.println( "Maximum duplications :\t" + ( int ) max + "\t"
305 + df.format( ( 100.0 * max ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
306 System.out.println( "Gene trees with median duplications :\t" + median_count + "\t"
307 + df.format( median_count_percentage ) + "%" );
308 System.out.println( "Gene trees with minimum duplications:\t" + min_count + "\t"
309 + df.format( min_count_percentage ) + "%" );
310 System.out.println( "Gene trees with maximum duplications:\t" + max_count + "\t"
311 + df.format( max_count_percentage ) + "%" );
312 if ( algorithm == ALGORITHM.GSDIR ) {
313 System.out.println( "Removed ext gene tree nodes :\t"
314 + rio.getRemovedGeneTreeNodes().size() );
318 catch ( final RIOException e ) {
319 ForesterUtil.fatalError( e.getLocalizedMessage() );
321 catch ( final SDIException e ) {
322 ForesterUtil.fatalError( e.getLocalizedMessage() );
324 catch ( final IOException e ) {
325 ForesterUtil.fatalError( e.getLocalizedMessage() );
327 catch ( final OutOfMemoryError e ) {
328 ForesterUtil.outOfMemoryError( e );
330 catch ( final Exception e ) {
331 ForesterUtil.unexpectedFatalError( e );
333 catch ( final Error e ) {
334 ForesterUtil.unexpectedFatalError( e );
338 private final static GSDIR analyzeConsensusTree( final File gene_trees_file,
339 final File species_tree_file,
341 final File best_trees_indir,
342 final SortedMap<String, String> id_map,
343 final String best_trees_suffix )
344 throws IOException, FileNotFoundException, PhyloXmlDataFormatException, SDIException {
345 final File the_one = ForesterUtil.getMatchingFile( best_trees_indir,
346 gene_trees_file.getName(),
348 final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
349 final Phylogeny best_tree = factory.create( the_one, PhyloXmlParser.createPhyloXmlParserXsdValidating() )[ 0 ];
350 final Phylogeny species_tree = SDIutil
351 .parseSpeciesTree( best_tree, species_tree_file, false, true, TAXONOMY_EXTRACTION.NO );
352 PhylogenyMethods.deleteInternalNodesWithOnlyOneDescendent( species_tree );
353 best_tree.setRooted( true );
354 species_tree.setRooted( true );
355 if ( !best_tree.isCompletelyBinaryAllow3ChildrenAtRoot() ) {
356 throw new IOException( "gene tree matching to ["
357 + ForesterUtil.removeFileExtension( gene_trees_file.getName() ) + "] is not completely binary" );
359 final PhylogenyNodeIterator it = best_tree.iteratorExternalForward();
360 while ( it.hasNext() ) {
361 final PhylogenyNode n = it.next();
362 final String name = n.getName().trim();
363 if ( !ForesterUtil.isEmpty( name ) ) {
365 ParserUtils.extractTaxonomyDataFromNodeName( n, TAXONOMY_EXTRACTION.AGGRESSIVE );
367 catch ( final PhyloXmlDataFormatException e ) {
372 final GSDIR gsdir_for_best_tree = new GSDIR( best_tree, species_tree, true, true, true );
373 final Phylogeny result_gene_tree = gsdir_for_best_tree.getMinDuplicationsSumGeneTree();
374 result_gene_tree.setRerootable( false );
375 PhylogenyMethods.orderAppearance( result_gene_tree.getRoot(), true, true, DESCENDANT_SORT_PRIORITY.NODE_NAME );
376 final String outname = ForesterUtil.removeFileExtension( the_one.getName() );
377 final File outfile = new File( outdir.getCanonicalFile() + "/" + outname + RIOUtil.BEST_TREE_SUFFIX
378 + gsdir_for_best_tree.getMinDuplicationsSum() + ".xml" );
379 writeTree( result_gene_tree, outfile, null, id_map );
380 return gsdir_for_best_tree;
383 private static final void writeOrthologyTable( final File table_outfile,
384 final int gene_trees_analyzed,
386 final boolean verbose,
387 final SortedMap<String, String> id_map,
388 final boolean replace_ids )
390 final EasyWriter w = ForesterUtil.createEasyWriter( table_outfile );
391 final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" );
392 df.setDecimalSeparatorAlwaysShown( false );
393 df.setRoundingMode( RoundingMode.HALF_UP );
394 for( int i = 0; i < m.size(); ++i ) {
397 if ( !id_map.containsKey( m.getLabel( i ) ) ) {
398 throw new IOException( "no id mapping for \"" + m.getLabel( i ) + "\" (attempting to write ["
399 + table_outfile + "])" );
401 w.print( id_map.get( m.getLabel( i ) ) );
404 w.print( m.getLabel( i ) );
408 for( int x = 0; x < m.size(); ++x ) {
410 w.print( id_map.get( m.getLabel( x ) ) );
413 w.print( m.getLabel( x ) );
415 for( int y = 0; y < m.size(); ++y ) {
418 if ( m.get( x, y ) != gene_trees_analyzed ) {
419 ForesterUtil.unexpectedFatalError( "diagonal value is off" );
424 w.print( df.format( ( ( double ) m.get( x, y ) ) / gene_trees_analyzed ) );
429 if ( !replace_ids && id_map != null && id_map.size() > 0 ) {
432 final Iterator<?> it = id_map.entrySet().iterator();
433 while (it.hasNext()) {
434 Map.Entry<String, String> pair = ( Entry<String, String> ) it.next();
435 w.println( pair.getKey() + "\t" + pair.getValue() );
439 id_map.forEach( ( k, v ) -> {
441 w.println( k + "\t" + v );
443 catch ( final IOException e ) {
450 System.out.println( "Wrote table to :\t" + table_outfile.getCanonicalPath() );
454 private static final int writeOrtologGroups( final File outfile,
456 final int gene_trees_analyzed,
458 final boolean verbose,
459 final boolean calc_conly,
460 final SortedMap<String, String> id_map )
462 List<SortedSet<String>> groups = new ArrayList<SortedSet<String>>();
463 BasicDescriptiveStatistics stats = new BasicDescriptiveStatistics();
467 for( int x = 1; x < m.size(); ++x ) {
468 final String a = m.getLabel( x );
469 for( int y = 0; y < x; ++y ) {
470 final String b = m.getLabel( y );
471 final double s = ( ( double ) m.get( x, y ) ) / gene_trees_analyzed;
483 boolean found = false;
484 for( final SortedSet<String> group : groups ) {
485 if ( group.contains( a ) ) {
489 if ( group.contains( b ) ) {
495 final SortedSet<String> new_group = new TreeSet<String>();
498 groups.add( new_group );
503 //Deal with singlets:
504 for( int x = 0; x < m.size(); ++x ) {
505 final String a = m.getLabel( x );
506 boolean found = false;
507 for( final SortedSet<String> group : groups ) {
508 if ( group.contains( a ) ) {
514 final SortedSet<String> new_group = new TreeSet<String>();
516 groups.add( new_group );
520 return groups.size();
522 final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" );
523 df.setDecimalSeparatorAlwaysShown( false );
524 df.setRoundingMode( RoundingMode.HALF_UP );
525 final EasyWriter w = ForesterUtil.createEasyWriter( outfile );
527 for( final SortedSet<String> group : groups ) {
528 w.print( Integer.toString( counter++ ) );
529 for( final String s : group ) {
531 if ( id_map != null && id_map.size() > 0 ) {
532 if ( !id_map.containsKey( s ) ) {
533 throw new IOException( "no id mapping for \"" + s + "\" (attempting to write [" + outfile
536 w.print( id_map.get( s ) );
545 w.println( "# Cutoff\t" + df.format( cutoff ) );
547 w.println( "# Orthology support statistics:" );
548 if ( stats.getN() > 3 ) {
549 w.println( "# Median\t" + df.format( stats.median() ) );
551 w.println( "# Mean\t" + df.format( stats.arithmeticMean() ) );
552 if ( stats.getN() > 3 ) {
553 w.println( "# SD\t" + df.format( stats.sampleStandardDeviation() ) );
555 w.println( "# Min\t" + df.format( stats.getMin() ) );
556 w.println( "# Max\t" + df.format( stats.getMax() ) );
557 w.println( "# Total\t" + df.format( stats.getN() ) );
558 w.println( "# Below 0.75\t" + below_075 + "\t" + df.format( ( 100.0 * below_075 / stats.getN() ) ) + "%" );
559 w.println( "# Below 0.5\t" + below_05 + "\t" + df.format( ( 100.0 * below_05 / stats.getN() ) ) + "%" );
560 w.println( "# Below 0.25\t" + below_025 + "\t" + df.format( ( 100.0 * below_025 / stats.getN() ) ) + "%" );
563 System.out.println( "Number of ortholog groups :\t" + groups.size() );
564 System.out.println( "Wrote orthologs groups table to :\t" + outfile.getCanonicalPath() );
566 return groups.size();
569 private static void writeTree( final Phylogeny p,
571 final String comment,
572 final SortedMap<String, String> id_map )
574 if ( id_map != null && id_map.size() > 0 ) {
575 final PhylogenyNodeIterator it = p.iteratorExternalForward();
576 while ( it.hasNext() ) {
577 final PhylogenyNode n = it.next();
578 if ( !id_map.containsKey( n.getName() ) ) {
579 throw new IOException( "no id mapping for \"" + n.getName() + "\" (attempting to write [" + f
582 final Sequence seq = new Sequence();
583 seq.setName( id_map.get( n.getName() ) );
584 n.getNodeData().addSequence( seq );
587 final PhylogenyWriter writer = new PhylogenyWriter();
588 writer.toPhyloXML( f, p, 0 );
589 if ( comment != null ) {
590 System.out.println( comment + f.getCanonicalPath() );
594 private static void writeLogFile( final File logfile,
596 final File species_tree_file,
597 final File gene_trees_file,
599 final String prg_name,
601 final String prg_date,
603 final boolean verbose )
605 final EasyWriter out = ForesterUtil.createEasyWriter( logfile );
606 out.println( "# " + prg_name );
607 out.println( "# version : " + prg_v );
608 out.println( "# date : " + prg_date );
609 out.println( "# based on: " + f );
610 out.println( "# ----------------------------------" );
611 out.println( "Gene trees :\t" + gene_trees_file.getCanonicalPath() );
612 out.println( "Species tree :\t" + species_tree_file.getCanonicalPath() );
613 out.println( "All vs all orthology table :\t" + outtable.getCanonicalPath() );
615 out.println( rio.getLog().toString() );
618 System.out.println( "Wrote log to :\t" + logfile.getCanonicalPath() );
622 private final static SortedMap<String, String> obtainMapping( final File dir,
624 final String suffix )
626 final File the_one = ForesterUtil.getMatchingFile( dir, prefix, suffix );
627 final BasicTable<String> t = BasicTableParser.parse( the_one, '\t' );
628 return t.getColumnsAsMap( 0, 1 );