2 package org.forester.rio;
5 import java.io.FileNotFoundException;
6 import java.io.IOException;
7 import java.math.RoundingMode;
8 import java.util.ArrayList;
10 import java.util.SortedMap;
11 import java.util.SortedSet;
12 import java.util.TreeSet;
14 import org.forester.datastructures.IntMatrix;
15 import org.forester.io.parsers.IteratingPhylogenyParser;
16 import org.forester.io.parsers.PhylogenyParser;
17 import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
18 import org.forester.io.parsers.nhx.NHXParser;
19 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
20 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
21 import org.forester.io.parsers.phyloxml.PhyloXmlParser;
22 import org.forester.io.parsers.util.ParserUtils;
23 import org.forester.io.writers.PhylogenyWriter;
24 import org.forester.phylogeny.Phylogeny;
25 import org.forester.phylogeny.PhylogenyMethods;
26 import org.forester.phylogeny.PhylogenyNode;
27 import org.forester.phylogeny.PhylogenyMethods.DESCENDANT_SORT_PRIORITY;
28 import org.forester.phylogeny.data.Sequence;
29 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
30 import org.forester.phylogeny.factories.PhylogenyFactory;
31 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
32 import org.forester.rio.RIO.REROOTING;
33 import org.forester.sdi.GSDIR;
34 import org.forester.sdi.SDIException;
35 import org.forester.sdi.SDIutil;
36 import org.forester.sdi.SDIutil.ALGORITHM;
37 import org.forester.util.BasicDescriptiveStatistics;
38 import org.forester.util.BasicTable;
39 import org.forester.util.BasicTableParser;
40 import org.forester.util.EasyWriter;
41 import org.forester.util.ForesterUtil;
43 public final class RIOUtil {
45 public final static String STRIPPED_SPECIES_TREE_SUFFIX = "_RIO_stripped_species_tree.xml";
46 public final static String ORTHO_OUTTABLE_SUFFIX = "_RIO_orthologies.tsv";
47 public final static String ORTHO_OUTTABLE_WITH_MAP_SUFFIX = "_RIO_orthologies_ext_map.tsv";
48 public final static String OUT_MIN_DUP_GENE_TREE_SUFFIX = "_RIO_gene_tree_min_dup_";
49 public final static String OUT_MED_DUP_GENE_TREE_SUFFIX = "_RIO_gene_tree_med_dup_";
50 public final static String BEST_TREE_SUFFIX = "_RIO_consensus_gene_tree_dup_";
51 public final static String ORTHOLOG_GROUPS_SUFFIX = "_RIO_ortholog_groups.tsv";
52 public final static String LOGFILE_SUFFIX = "_RIO_log.tsv";
54 public static final void executeAnalysis( final File gene_trees_file,
55 final File species_tree_file,
56 final File orthology_outtable,
57 final File orthology_outtable_with_mappings,
58 final File orthology_groups_outfile,
60 final String outgroup,
61 final REROOTING rerooting,
64 final File return_species_tree,
65 final File return_min_dup_gene_tree,
66 final File return_median_dup_gene_tree,
67 final boolean transfer_taxonomy,
68 final ALGORITHM algorithm,
69 final boolean use_gene_trees_dir,
71 final double ortholog_group_cutoff,
72 final boolean perform_id_mapping,
73 final File id_mapping_dir,
74 final String id_mapping_suffix,
75 final boolean perform_gsdir_on_best_tree,
77 final File best_trees_indir,
78 final String best_trees_suffix ) {
80 final SortedMap<String, String> id_map;
81 if ( perform_id_mapping ) {
82 id_map = obtainMapping( id_mapping_dir, gene_trees_file.getName(), id_mapping_suffix );
88 boolean iterating = false;
89 final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
90 if ( p instanceof PhyloXmlParser ) {
91 rio = RIO.executeAnalysis( gene_trees_file,
104 if ( p instanceof NHXParser ) {
105 final NHXParser nhx = ( NHXParser ) p;
106 nhx.setReplaceUnderscores( false );
107 nhx.setIgnoreQuotes( true );
108 nhx.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
110 else if ( p instanceof NexusPhylogeniesParser ) {
111 final NexusPhylogeniesParser nex = ( NexusPhylogeniesParser ) p;
112 nex.setReplaceUnderscores( false );
113 nex.setIgnoreQuotes( true );
114 nex.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
117 throw new RuntimeException( "unknown parser type: " + p );
119 final IteratingPhylogenyParser ip = ( IteratingPhylogenyParser ) p;
120 ip.setSource( gene_trees_file );
121 rio = RIO.executeAnalysis( ip,
132 if ( !use_gene_trees_dir ) {
133 if ( algorithm == ALGORITHM.GSDIR ) {
134 System.out.println( "Taxonomy linking based on :\t" + rio.getGSDIRtaxCompBase() );
139 m = rio.getOrthologTable();
142 m = RIO.calculateOrthologTable( rio.getAnalyzedGeneTrees(), true );
144 final GSDIR gsdir_for_best_tree;
145 if ( perform_gsdir_on_best_tree ) {
146 gsdir_for_best_tree = analyzeConsensusTree( gene_trees_file,
154 gsdir_for_best_tree = null;
156 final BasicDescriptiveStatistics stats = rio.getDuplicationsStatistics();
157 if ( perform_id_mapping ) {
158 writeOrthologyTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir, id_map, true );
159 writeOrthologyTable( orthology_outtable_with_mappings,
167 writeOrthologyTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir, null, false );
169 final int ortholog_groups = writeOrtologGroups( orthology_groups_outfile,
170 ortholog_group_cutoff,
176 final int ortholog_groups_005 = writeOrtologGroups( null, 0.05, stats.getN(), m, false, true, null );
177 final int ortholog_groups_025 = writeOrtologGroups( null, 0.25, stats.getN(), m, false, true, null );
178 final int ortholog_groups_05 = writeOrtologGroups( null, 0.5, stats.getN(), m, false, true, null );
179 final int ortholog_groups_075 = writeOrtologGroups( null, 0.75, stats.getN(), m, false, true, null );
180 final int ortholog_groups_095 = writeOrtologGroups( null, 0.95, stats.getN(), m, false, true, null );
181 if ( ( algorithm != ALGORITHM.SDIR ) && ( logfile != null ) ) {
182 writeLogFile( logfile,
187 org.forester.application.rio.PRG_NAME,
188 org.forester.application.rio.PRG_VERSION,
189 org.forester.application.rio.PRG_DATE,
190 ForesterUtil.getForesterLibraryInformation(),
191 !use_gene_trees_dir );
193 if ( return_species_tree != null ) {
194 writeTree( rio.getSpeciesTree(),
196 use_gene_trees_dir ? null : "Wrote (stripped) species tree to :\t",
199 if ( return_min_dup_gene_tree != null && rio.getMinDuplicationsGeneTree() != null ) {
200 final int min = ( int ) rio.getDuplicationsStatistics().getMin();
201 writeTree( rio.getMinDuplicationsGeneTree(),
202 new File( return_min_dup_gene_tree.toString() + min + ".xml" ),
203 use_gene_trees_dir ? null : "Wrote one min duplication gene tree :\t",
206 if ( return_median_dup_gene_tree != null && rio.getDuplicationsToTreeMap() != null ) {
207 final int med = ( int ) rio.getDuplicationsStatistics().median();
208 writeTree( rio.getDuplicationsToTreeMap().get( med ),
209 new File( return_median_dup_gene_tree.toString() + med + ".xml" ),
210 use_gene_trees_dir ? null : "Wrote one med duplication gene tree :\t",
213 final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.##" );
214 final int min = ( int ) stats.getMin();
215 final int max = ( int ) stats.getMax();
216 final int median = ( int ) stats.median();
219 int median_count = 0;
220 for( double d : stats.getData() ) {
221 if ( ( ( int ) d ) == min ) {
224 if ( ( ( int ) d ) == max ) {
227 if ( ( ( int ) d ) == median ) {
231 final double min_count_percentage = ( 100.0 * min_count ) / stats.getN();
232 final double max_count_percentage = ( 100.0 * max_count ) / stats.getN();
233 final double median_count_percentage = ( 100.0 * median_count ) / stats.getN();
234 if ( use_gene_trees_dir ) {
235 String name = gene_trees_file.getName();
236 if ( name.indexOf( "." ) > 0 ) {
237 name = name.substring( 0, name.lastIndexOf( "." ) );
241 log.print( Integer.toString( rio.getExtNodesOfAnalyzedGeneTrees() ) );
243 log.print( Integer.toString( ortholog_groups ) );
246 log.print( Integer.toString( ortholog_groups_005 ) );
248 log.print( Integer.toString( ortholog_groups_025 ) );
250 log.print( Integer.toString( ortholog_groups_05 ) );
252 log.print( Integer.toString( ortholog_groups_075 ) );
254 log.print( Integer.toString( ortholog_groups_095 ) );
258 log.print( Integer.toString( gsdir_for_best_tree.getMinDuplicationsSum() ) );
260 log.print( df.format( median - gsdir_for_best_tree.getMinDuplicationsSum() ) );
264 if ( stats.getN() > 3 ) {
265 log.print( df.format( median ) );
271 log.print( df.format( stats.arithmeticMean() ) );
273 if ( stats.getN() > 3 ) {
274 log.print( df.format( stats.sampleStandardDeviation() ) );
280 log.print( Integer.toString( min ) );
282 log.print( Integer.toString( max ) );
284 log.print( Integer.toString( rio.getRemovedGeneTreeNodes().size() ) );
286 log.print( Integer.toString( stats.getN() ) );
290 System.out.println( "Gene tree internal nodes :\t" + rio.getIntNodesOfAnalyzedGeneTrees() );
291 System.out.println( "Gene tree external nodes :\t" + rio.getExtNodesOfAnalyzedGeneTrees() );
292 System.out.println( "Mean number of duplications :\t" + df.format( stats.arithmeticMean() )
293 + "\t" + df.format( ( 100.0 * stats.arithmeticMean() ) / rio.getIntNodesOfAnalyzedGeneTrees() )
294 + "%\t(sd: " + df.format( stats.sampleStandardDeviation() ) + ")" );
295 if ( stats.getN() > 3 ) {
296 System.out.println( "Median number of duplications :\t" + df.format( median ) + "\t"
297 + df.format( ( 100.0 * median ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
299 System.out.println( "Minimum duplications :\t" + min + "\t"
300 + df.format( ( 100.0 * min ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
301 System.out.println( "Maximum duplications :\t" + ( int ) max + "\t"
302 + df.format( ( 100.0 * max ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
303 System.out.println( "Gene trees with median duplications :\t" + median_count + "\t"
304 + df.format( median_count_percentage ) + "%" );
305 System.out.println( "Gene trees with minimum duplications:\t" + min_count + "\t"
306 + df.format( min_count_percentage ) + "%" );
307 System.out.println( "Gene trees with maximum duplications:\t" + max_count + "\t"
308 + df.format( max_count_percentage ) + "%" );
309 if ( algorithm == ALGORITHM.GSDIR ) {
310 System.out.println( "Removed ext gene tree nodes :\t"
311 + rio.getRemovedGeneTreeNodes().size() );
315 catch ( final RIOException e ) {
316 ForesterUtil.fatalError( e.getLocalizedMessage() );
318 catch ( final SDIException e ) {
319 ForesterUtil.fatalError( e.getLocalizedMessage() );
321 catch ( final IOException e ) {
322 ForesterUtil.fatalError( e.getLocalizedMessage() );
324 catch ( final OutOfMemoryError e ) {
325 ForesterUtil.outOfMemoryError( e );
327 catch ( final Exception e ) {
328 ForesterUtil.unexpectedFatalError( e );
330 catch ( final Error e ) {
331 ForesterUtil.unexpectedFatalError( e );
335 private final static GSDIR analyzeConsensusTree( final File gene_trees_file,
336 final File species_tree_file,
338 final File best_trees_indir,
339 final SortedMap<String, String> id_map,
340 final String best_trees_suffix )
341 throws IOException, FileNotFoundException, PhyloXmlDataFormatException, SDIException {
342 final File the_one = ForesterUtil.getMatchingFile( best_trees_indir,
343 gene_trees_file.getName(),
345 final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
346 final Phylogeny best_tree = factory.create( the_one, PhyloXmlParser.createPhyloXmlParserXsdValidating() )[ 0 ];
347 final Phylogeny species_tree = SDIutil
348 .parseSpeciesTree( best_tree, species_tree_file, false, true, TAXONOMY_EXTRACTION.NO );
349 PhylogenyMethods.deleteInternalNodesWithOnlyOneDescendent( species_tree );
350 best_tree.setRooted( true );
351 species_tree.setRooted( true );
352 if ( !best_tree.isCompletelyBinaryAllow3ChildrenAtRoot() ) {
353 throw new IOException( "gene tree matching to ["
354 + ForesterUtil.removeFileExtension( gene_trees_file.getName() ) + "] is not completely binary" );
356 final PhylogenyNodeIterator it = best_tree.iteratorExternalForward();
357 while ( it.hasNext() ) {
358 final PhylogenyNode n = it.next();
359 final String name = n.getName().trim();
360 if ( !ForesterUtil.isEmpty( name ) ) {
362 ParserUtils.extractTaxonomyDataFromNodeName( n, TAXONOMY_EXTRACTION.AGGRESSIVE );
364 catch ( final PhyloXmlDataFormatException e ) {
369 final GSDIR gsdir_for_best_tree = new GSDIR( best_tree, species_tree, true, true, true );
370 final Phylogeny result_gene_tree = gsdir_for_best_tree.getMinDuplicationsSumGeneTree();
371 result_gene_tree.setRerootable( false );
372 PhylogenyMethods.orderAppearance( result_gene_tree.getRoot(), true, true, DESCENDANT_SORT_PRIORITY.NODE_NAME );
373 final String outname = ForesterUtil.removeFileExtension( the_one.getName() );
374 final File outfile = new File( outdir.getCanonicalFile() + "/" + outname + RIOUtil.BEST_TREE_SUFFIX
375 + gsdir_for_best_tree.getMinDuplicationsSum() + ".xml" );
376 writeTree( result_gene_tree, outfile, null, id_map );
377 return gsdir_for_best_tree;
380 private static final void writeOrthologyTable( final File table_outfile,
381 final int gene_trees_analyzed,
383 final boolean verbose,
384 final SortedMap<String, String> id_map,
385 final boolean replace_ids )
387 final EasyWriter w = ForesterUtil.createEasyWriter( table_outfile );
388 final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" );
389 df.setDecimalSeparatorAlwaysShown( false );
390 df.setRoundingMode( RoundingMode.HALF_UP );
391 for( int i = 0; i < m.size(); ++i ) {
394 if ( !id_map.containsKey( m.getLabel( i ) ) ) {
395 throw new IOException( "no id mapping for \"" + m.getLabel( i ) + "\" (attempting to write ["
396 + table_outfile + "])" );
398 w.print( id_map.get( m.getLabel( i ) ) );
401 w.print( m.getLabel( i ) );
405 for( int x = 0; x < m.size(); ++x ) {
407 w.print( id_map.get( m.getLabel( x ) ) );
410 w.print( m.getLabel( x ) );
412 for( int y = 0; y < m.size(); ++y ) {
415 if ( m.get( x, y ) != gene_trees_analyzed ) {
416 ForesterUtil.unexpectedFatalError( "diagonal value is off" );
421 w.print( df.format( ( ( double ) m.get( x, y ) ) / gene_trees_analyzed ) );
426 if ( !replace_ids && id_map != null && id_map.size() > 0 ) {
428 id_map.forEach( ( k, v ) -> {
430 w.println( k + "\t" + v );
432 catch ( final IOException e ) {
439 System.out.println( "Wrote table to :\t" + table_outfile.getCanonicalPath() );
443 private static final int writeOrtologGroups( final File outfile,
445 final int gene_trees_analyzed,
447 final boolean verbose,
448 final boolean calc_conly,
449 final SortedMap<String, String> id_map )
451 List<SortedSet<String>> groups = new ArrayList<SortedSet<String>>();
452 BasicDescriptiveStatistics stats = new BasicDescriptiveStatistics();
456 for( int x = 1; x < m.size(); ++x ) {
457 final String a = m.getLabel( x );
458 for( int y = 0; y < x; ++y ) {
459 final String b = m.getLabel( y );
460 final double s = ( ( double ) m.get( x, y ) ) / gene_trees_analyzed;
472 boolean found = false;
473 for( final SortedSet<String> group : groups ) {
474 if ( group.contains( a ) ) {
478 if ( group.contains( b ) ) {
484 final SortedSet<String> new_group = new TreeSet<String>();
487 groups.add( new_group );
492 //Deal with singlets:
493 for( int x = 0; x < m.size(); ++x ) {
494 final String a = m.getLabel( x );
495 boolean found = false;
496 for( final SortedSet<String> group : groups ) {
497 if ( group.contains( a ) ) {
503 final SortedSet<String> new_group = new TreeSet<String>();
505 groups.add( new_group );
509 return groups.size();
511 final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" );
512 df.setDecimalSeparatorAlwaysShown( false );
513 df.setRoundingMode( RoundingMode.HALF_UP );
514 final EasyWriter w = ForesterUtil.createEasyWriter( outfile );
516 for( final SortedSet<String> group : groups ) {
517 w.print( Integer.toString( counter++ ) );
518 for( final String s : group ) {
520 if ( id_map != null && id_map.size() > 0 ) {
521 if ( !id_map.containsKey( s ) ) {
522 throw new IOException( "no id mapping for \"" + s + "\" (attempting to write [" + outfile
525 w.print( id_map.get( s ) );
534 w.println( "# Cutoff\t" + df.format( cutoff ) );
536 w.println( "# Orthology support statistics:" );
537 if ( stats.getN() > 3 ) {
538 w.println( "# Median\t" + df.format( stats.median() ) );
540 w.println( "# Mean\t" + df.format( stats.arithmeticMean() ) );
541 if ( stats.getN() > 3 ) {
542 w.println( "# SD\t" + df.format( stats.sampleStandardDeviation() ) );
544 w.println( "# Min\t" + df.format( stats.getMin() ) );
545 w.println( "# Max\t" + df.format( stats.getMax() ) );
546 w.println( "# Total\t" + df.format( stats.getN() ) );
547 w.println( "# Below 0.75\t" + below_075 + "\t" + df.format( ( 100.0 * below_075 / stats.getN() ) ) + "%" );
548 w.println( "# Below 0.5\t" + below_05 + "\t" + df.format( ( 100.0 * below_05 / stats.getN() ) ) + "%" );
549 w.println( "# Below 0.25\t" + below_025 + "\t" + df.format( ( 100.0 * below_025 / stats.getN() ) ) + "%" );
552 System.out.println( "Number of ortholog groups :\t" + groups.size() );
553 System.out.println( "Wrote orthologs groups table to :\t" + outfile.getCanonicalPath() );
555 return groups.size();
558 private static void writeTree( final Phylogeny p,
560 final String comment,
561 final SortedMap<String, String> id_map )
563 if ( id_map != null && id_map.size() > 0 ) {
564 final PhylogenyNodeIterator it = p.iteratorExternalForward();
565 while ( it.hasNext() ) {
566 final PhylogenyNode n = it.next();
567 if ( !id_map.containsKey( n.getName() ) ) {
568 throw new IOException( "no id mapping for \"" + n.getName() + "\" (attempting to write [" + f
571 final Sequence seq = new Sequence();
572 seq.setName( id_map.get( n.getName() ) );
573 n.getNodeData().addSequence( seq );
576 final PhylogenyWriter writer = new PhylogenyWriter();
577 writer.toPhyloXML( f, p, 0 );
578 if ( comment != null ) {
579 System.out.println( comment + f.getCanonicalPath() );
583 private static void writeLogFile( final File logfile,
585 final File species_tree_file,
586 final File gene_trees_file,
588 final String prg_name,
590 final String prg_date,
592 final boolean verbose )
594 final EasyWriter out = ForesterUtil.createEasyWriter( logfile );
595 out.println( "# " + prg_name );
596 out.println( "# version : " + prg_v );
597 out.println( "# date : " + prg_date );
598 out.println( "# based on: " + f );
599 out.println( "# ----------------------------------" );
600 out.println( "Gene trees :\t" + gene_trees_file.getCanonicalPath() );
601 out.println( "Species tree :\t" + species_tree_file.getCanonicalPath() );
602 out.println( "All vs all orthology table :\t" + outtable.getCanonicalPath() );
604 out.println( rio.getLog().toString() );
607 System.out.println( "Wrote log to :\t" + logfile.getCanonicalPath() );
611 private final static SortedMap<String, String> obtainMapping( final File dir,
613 final String suffix )
615 final File the_one = ForesterUtil.getMatchingFile( dir, prefix, suffix );
616 final BasicTable<String> t = BasicTableParser.parse( the_one, '\t' );
617 return t.getColumnsAsMap( 0, 1 );