2 package org.forester.rio;
5 import java.io.IOException;
6 import java.math.RoundingMode;
7 import java.util.ArrayList;
10 import java.util.SortedMap;
11 import java.util.SortedSet;
12 import java.util.TreeSet;
14 import javax.swing.JOptionPane;
16 import org.forester.archaeopteryx.AptxUtil;
17 import org.forester.datastructures.IntMatrix;
18 import org.forester.io.parsers.IteratingPhylogenyParser;
19 import org.forester.io.parsers.PhylogenyParser;
20 import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
21 import org.forester.io.parsers.nhx.NHXParser;
22 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
23 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
24 import org.forester.io.parsers.phyloxml.PhyloXmlParser;
25 import org.forester.io.parsers.util.ParserUtils;
26 import org.forester.io.writers.PhylogenyWriter;
27 import org.forester.phylogeny.Phylogeny;
28 import org.forester.phylogeny.PhylogenyMethods;
29 import org.forester.phylogeny.PhylogenyNode;
30 import org.forester.phylogeny.PhylogenyMethods.DESCENDANT_SORT_PRIORITY;
31 import org.forester.phylogeny.data.Sequence;
32 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
33 import org.forester.phylogeny.factories.PhylogenyFactory;
34 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
35 import org.forester.rio.RIO.REROOTING;
36 import org.forester.sdi.GSDIR;
37 import org.forester.sdi.SDIException;
38 import org.forester.sdi.SDIutil;
39 import org.forester.sdi.SDIutil.ALGORITHM;
40 import org.forester.util.BasicDescriptiveStatistics;
41 import org.forester.util.BasicTable;
42 import org.forester.util.BasicTableParser;
43 import org.forester.util.EasyWriter;
44 import org.forester.util.ForesterUtil;
46 public final class RIOUtil {
48 public static final void executeAnalysis( final File gene_trees_file,
49 final File species_tree_file,
50 final File orthology_outtable,
51 final File orthology_outtable_with_mappings,
52 final File orthology_groups_outfile,
54 final String outgroup,
55 final REROOTING rerooting,
58 final File return_species_tree,
59 final File return_min_dup_gene_tree,
60 final File return_median_dup_gene_tree,
61 final boolean transfer_taxonomy,
62 final ALGORITHM algorithm,
63 final boolean use_gene_trees_dir,
65 final double ortholog_group_cutoff,
66 final boolean perform_id_mapping,
67 final File id_mapping_dir,
68 final String id_mapping_suffix ) {
70 final SortedMap<String, String> id_map;
71 if ( perform_id_mapping ) {
72 id_map = obtainMapping( id_mapping_dir, gene_trees_file.getName(), id_mapping_suffix );
78 boolean iterating = false;
79 final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
80 if ( p instanceof PhyloXmlParser ) {
81 rio = RIO.executeAnalysis( gene_trees_file,
94 if ( p instanceof NHXParser ) {
95 final NHXParser nhx = ( NHXParser ) p;
96 nhx.setReplaceUnderscores( false );
97 nhx.setIgnoreQuotes( true );
98 nhx.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
100 else if ( p instanceof NexusPhylogeniesParser ) {
101 final NexusPhylogeniesParser nex = ( NexusPhylogeniesParser ) p;
102 nex.setReplaceUnderscores( false );
103 nex.setIgnoreQuotes( true );
104 nex.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
107 throw new RuntimeException( "unknown parser type: " + p );
109 final IteratingPhylogenyParser ip = ( IteratingPhylogenyParser ) p;
110 ip.setSource( gene_trees_file );
111 rio = RIO.executeAnalysis( ip,
122 if ( !use_gene_trees_dir ) {
123 if ( algorithm == ALGORITHM.GSDIR ) {
124 System.out.println( "Taxonomy linking based on :\t" + rio.getGSDIRtaxCompBase() );
129 m = rio.getOrthologTable();
132 m = RIO.calculateOrthologTable( rio.getAnalyzedGeneTrees(), true );
134 ////////////////////////////////////////////
135 ////////////////////////////////////////////
137 final boolean perform_gsdir_on_best_tree = true;
138 final File best_trees_dir = new File( "best_trees" );
139 final String best_trees_suffix = ".xml";
140 final GSDIR gsdir_for_best_tree;
141 if ( perform_gsdir_on_best_tree ) {
142 final Phylogeny best_tree = obtainTree( best_trees_dir, gene_trees_file.getName(), best_trees_suffix );
143 final Phylogeny species_tree = SDIutil
144 .parseSpeciesTree( best_tree, species_tree_file, false, true, TAXONOMY_EXTRACTION.NO );
145 PhylogenyMethods.deleteInternalNodesWithOnlyOneDescendent( species_tree );
146 best_tree.setRooted( true );
147 species_tree.setRooted( true );
148 if ( !best_tree.isCompletelyBinaryAllow3ChildrenAtRoot() ) {
149 throw new IOException( "gene tree matching to ["
150 + ForesterUtil.removeFileExtension( gene_trees_file.getName() )
151 + "] is not completely binary" );
153 final PhylogenyNodeIterator it = best_tree.iteratorExternalForward();
154 while ( it.hasNext() ) {
155 final PhylogenyNode n = it.next();
156 final String name = n.getName().trim();
157 if ( !ForesterUtil.isEmpty( name ) ) {
159 ParserUtils.extractTaxonomyDataFromNodeName( n, TAXONOMY_EXTRACTION.AGGRESSIVE );
161 catch ( final PhyloXmlDataFormatException e ) {
166 gsdir_for_best_tree = new GSDIR( best_tree, species_tree, true, true, true );
167 final Phylogeny result_gene_tree = gsdir_for_best_tree.getMinDuplicationsSumGeneTree();
168 System.out.println( gsdir_for_best_tree.getMinDuplicationsSum() );
169 result_gene_tree.setRerootable( false );
170 PhylogenyMethods.orderAppearance( result_gene_tree.getRoot(),
173 DESCENDANT_SORT_PRIORITY.NODE_NAME );
174 writeTree( result_gene_tree, new File( gene_trees_file.getName() + "____.xml" ), null, id_map );
177 gsdir_for_best_tree = null;
179 ////////////////////////////////////////////
180 ////////////////////////////////////////////
181 final BasicDescriptiveStatistics stats = rio.getDuplicationsStatistics();
182 if ( perform_id_mapping ) {
183 writeOrthologyTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir, id_map, true );
184 writeOrthologyTable( orthology_outtable_with_mappings,
192 writeOrthologyTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir, null, false );
194 final int ortholog_groups = writeOrtologGroups( orthology_groups_outfile,
195 ortholog_group_cutoff,
201 final int ortholog_groups_005 = writeOrtologGroups( null, 0.05, stats.getN(), m, false, true, null );
202 final int ortholog_groups_025 = writeOrtologGroups( null, 0.25, stats.getN(), m, false, true, null );
203 final int ortholog_groups_05 = writeOrtologGroups( null, 0.5, stats.getN(), m, false, true, null );
204 final int ortholog_groups_075 = writeOrtologGroups( null, 0.75, stats.getN(), m, false, true, null );
205 final int ortholog_groups_095 = writeOrtologGroups( null, 0.95, stats.getN(), m, false, true, null );
206 if ( ( algorithm != ALGORITHM.SDIR ) && ( logfile != null ) ) {
207 writeLogFile( logfile,
212 org.forester.application.rio.PRG_NAME,
213 org.forester.application.rio.PRG_VERSION,
214 org.forester.application.rio.PRG_DATE,
215 ForesterUtil.getForesterLibraryInformation(),
216 !use_gene_trees_dir );
218 if ( return_species_tree != null ) {
219 writeTree( rio.getSpeciesTree(),
221 use_gene_trees_dir ? null : "Wrote (stripped) species tree to :\t",
224 if ( return_min_dup_gene_tree != null && rio.getMinDuplicationsGeneTree() != null ) {
225 final int min = ( int ) rio.getDuplicationsStatistics().getMin();
226 writeTree( rio.getMinDuplicationsGeneTree(),
227 new File( return_min_dup_gene_tree.toString() + min + ".xml" ),
228 use_gene_trees_dir ? null : "Wrote one min duplication gene tree :\t",
231 if ( return_median_dup_gene_tree != null && rio.getDuplicationsToTreeMap() != null ) {
232 final int med = ( int ) rio.getDuplicationsStatistics().median();
233 writeTree( rio.getDuplicationsToTreeMap().get( med ),
234 new File( return_median_dup_gene_tree.toString() + med + ".xml" ),
235 use_gene_trees_dir ? null : "Wrote one med duplication gene tree :\t",
238 final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.##" );
239 final int min = ( int ) stats.getMin();
240 final int max = ( int ) stats.getMax();
241 final int median = ( int ) stats.median();
244 int median_count = 0;
245 for( double d : stats.getData() ) {
246 if ( ( ( int ) d ) == min ) {
249 if ( ( ( int ) d ) == max ) {
252 if ( ( ( int ) d ) == median ) {
256 final double min_count_percentage = ( 100.0 * min_count ) / stats.getN();
257 final double max_count_percentage = ( 100.0 * max_count ) / stats.getN();
258 final double median_count_percentage = ( 100.0 * median_count ) / stats.getN();
259 if ( use_gene_trees_dir ) {
260 String name = gene_trees_file.getName();
261 if ( name.indexOf( "." ) > 0 ) {
262 name = name.substring( 0, name.lastIndexOf( "." ) );
266 log.print( Integer.toString( rio.getExtNodesOfAnalyzedGeneTrees() ) );
268 log.print( Integer.toString( ortholog_groups ) );
271 log.print( Integer.toString( ortholog_groups_005 ) );
273 log.print( Integer.toString( ortholog_groups_025 ) );
275 log.print( Integer.toString( ortholog_groups_05 ) );
277 log.print( Integer.toString( ortholog_groups_075 ) );
279 log.print( Integer.toString( ortholog_groups_095 ) );
282 if ( stats.getN() > 3 ) {
283 log.print( df.format( median ) );
289 log.print( df.format( stats.arithmeticMean() ) );
291 if ( stats.getN() > 3 ) {
292 log.print( df.format( stats.sampleStandardDeviation() ) );
298 log.print( Integer.toString( min ) );
300 log.print( Integer.toString( max ) );
302 log.print( Integer.toString( rio.getRemovedGeneTreeNodes().size() ) );
304 log.print( Integer.toString( stats.getN() ) );
308 System.out.println( "Gene tree internal nodes :\t" + rio.getIntNodesOfAnalyzedGeneTrees() );
309 System.out.println( "Gene tree external nodes :\t" + rio.getExtNodesOfAnalyzedGeneTrees() );
310 System.out.println( "Mean number of duplications :\t" + df.format( stats.arithmeticMean() )
311 + "\t" + df.format( ( 100.0 * stats.arithmeticMean() ) / rio.getIntNodesOfAnalyzedGeneTrees() )
312 + "%\t(sd: " + df.format( stats.sampleStandardDeviation() ) + ")" );
313 if ( stats.getN() > 3 ) {
314 System.out.println( "Median number of duplications :\t" + df.format( median ) + "\t"
315 + df.format( ( 100.0 * median ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
317 System.out.println( "Minimum duplications :\t" + min + "\t"
318 + df.format( ( 100.0 * min ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
319 System.out.println( "Maximum duplications :\t" + ( int ) max + "\t"
320 + df.format( ( 100.0 * max ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
321 System.out.println( "Gene trees with median duplications :\t" + median_count + "\t"
322 + df.format( median_count_percentage ) + "%" );
323 System.out.println( "Gene trees with minimum duplications:\t" + min_count + "\t"
324 + df.format( min_count_percentage ) + "%" );
325 System.out.println( "Gene trees with maximum duplications:\t" + max_count + "\t"
326 + df.format( max_count_percentage ) + "%" );
327 if ( algorithm == ALGORITHM.GSDIR ) {
328 System.out.println( "Removed ext gene tree nodes :\t"
329 + rio.getRemovedGeneTreeNodes().size() );
333 catch ( final RIOException e ) {
334 ForesterUtil.fatalError( e.getLocalizedMessage() );
336 catch ( final SDIException e ) {
337 ForesterUtil.fatalError( e.getLocalizedMessage() );
339 catch ( final IOException e ) {
340 ForesterUtil.fatalError( e.getLocalizedMessage() );
342 catch ( final OutOfMemoryError e ) {
343 ForesterUtil.outOfMemoryError( e );
345 catch ( final Exception e ) {
346 ForesterUtil.unexpectedFatalError( e );
348 catch ( final Error e ) {
349 ForesterUtil.unexpectedFatalError( e );
353 private static final void writeOrthologyTable( final File table_outfile,
354 final int gene_trees_analyzed,
356 final boolean verbose,
357 final SortedMap<String, String> id_map,
358 final boolean replace_ids )
360 final EasyWriter w = ForesterUtil.createEasyWriter( table_outfile );
361 final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" );
362 df.setDecimalSeparatorAlwaysShown( false );
363 df.setRoundingMode( RoundingMode.HALF_UP );
364 for( int i = 0; i < m.size(); ++i ) {
367 if ( !id_map.containsKey( m.getLabel( i ) ) ) {
368 throw new IOException( "no id mapping for \"" + m.getLabel( i ) + "\" (attempting to write ["
369 + table_outfile + "])" );
371 w.print( id_map.get( m.getLabel( i ) ) );
374 w.print( m.getLabel( i ) );
378 for( int x = 0; x < m.size(); ++x ) {
380 w.print( id_map.get( m.getLabel( x ) ) );
383 w.print( m.getLabel( x ) );
385 for( int y = 0; y < m.size(); ++y ) {
388 if ( m.get( x, y ) != gene_trees_analyzed ) {
389 ForesterUtil.unexpectedFatalError( "diagonal value is off" );
394 w.print( df.format( ( ( double ) m.get( x, y ) ) / gene_trees_analyzed ) );
399 if ( !replace_ids && id_map != null && id_map.size() > 0 ) {
401 id_map.forEach( ( k, v ) -> {
403 w.println( k + "\t" + v );
405 catch ( final IOException e ) {
412 System.out.println( "Wrote table to :\t" + table_outfile.getCanonicalPath() );
416 private static final int writeOrtologGroups( final File outfile,
418 final int gene_trees_analyzed,
420 final boolean verbose,
421 final boolean calc_conly,
422 final SortedMap<String, String> id_map )
424 List<SortedSet<String>> groups = new ArrayList<SortedSet<String>>();
425 BasicDescriptiveStatistics stats = new BasicDescriptiveStatistics();
429 for( int x = 1; x < m.size(); ++x ) {
430 final String a = m.getLabel( x );
431 for( int y = 0; y < x; ++y ) {
432 final String b = m.getLabel( y );
433 final double s = ( ( double ) m.get( x, y ) ) / gene_trees_analyzed;
445 boolean found = false;
446 for( final SortedSet<String> group : groups ) {
447 if ( group.contains( a ) ) {
451 if ( group.contains( b ) ) {
457 final SortedSet<String> new_group = new TreeSet<String>();
460 groups.add( new_group );
465 //Deal with singlets:
466 for( int x = 0; x < m.size(); ++x ) {
467 final String a = m.getLabel( x );
468 boolean found = false;
469 for( final SortedSet<String> group : groups ) {
470 if ( group.contains( a ) ) {
476 final SortedSet<String> new_group = new TreeSet<String>();
478 groups.add( new_group );
482 return groups.size();
484 final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" );
485 df.setDecimalSeparatorAlwaysShown( false );
486 df.setRoundingMode( RoundingMode.HALF_UP );
487 final EasyWriter w = ForesterUtil.createEasyWriter( outfile );
489 for( final SortedSet<String> group : groups ) {
490 w.print( Integer.toString( counter++ ) );
491 for( final String s : group ) {
493 if ( id_map != null && id_map.size() > 0 ) {
494 if ( !id_map.containsKey( s ) ) {
495 throw new IOException( "no id mapping for \"" + s + "\" (attempting to write [" + outfile
498 w.print( id_map.get( s ) );
507 w.println( "# Cutoff\t" + df.format( cutoff ) );
509 w.println( "# Orthology support statistics:" );
510 if ( stats.getN() > 3 ) {
511 w.println( "# Median\t" + df.format( stats.median() ) );
513 w.println( "# Mean\t" + df.format( stats.arithmeticMean() ) );
514 if ( stats.getN() > 3 ) {
515 w.println( "# SD\t" + df.format( stats.sampleStandardDeviation() ) );
517 w.println( "# Min\t" + df.format( stats.getMin() ) );
518 w.println( "# Max\t" + df.format( stats.getMax() ) );
519 w.println( "# Total\t" + df.format( stats.getN() ) );
520 w.println( "# Below 0.75\t" + below_075 + "\t" + df.format( ( 100.0 * below_075 / stats.getN() ) ) + "%" );
521 w.println( "# Below 0.5\t" + below_05 + "\t" + df.format( ( 100.0 * below_05 / stats.getN() ) ) + "%" );
522 w.println( "# Below 0.25\t" + below_025 + "\t" + df.format( ( 100.0 * below_025 / stats.getN() ) ) + "%" );
525 System.out.println( "Number of ortholog groups :\t" + groups.size() );
526 System.out.println( "Wrote orthologs groups table to :\t" + outfile.getCanonicalPath() );
528 return groups.size();
531 private static void writeTree( final Phylogeny p,
533 final String comment,
534 final SortedMap<String, String> id_map )
536 if ( id_map != null && id_map.size() > 0 ) {
537 final PhylogenyNodeIterator it = p.iteratorExternalForward();
538 while ( it.hasNext() ) {
539 final PhylogenyNode n = it.next();
540 if ( !id_map.containsKey( n.getName() ) ) {
541 throw new IOException( "no id mapping for \"" + n.getName() + "\" (attempting to write [" + f
544 final Sequence seq = new Sequence();
545 seq.setName( id_map.get( n.getName() ) );
546 n.getNodeData().addSequence( seq );
549 final PhylogenyWriter writer = new PhylogenyWriter();
550 writer.toPhyloXML( f, p, 0 );
551 if ( comment != null ) {
552 System.out.println( comment + f.getCanonicalPath() );
556 private static void writeLogFile( final File logfile,
558 final File species_tree_file,
559 final File gene_trees_file,
561 final String prg_name,
563 final String prg_date,
565 final boolean verbose )
567 final EasyWriter out = ForesterUtil.createEasyWriter( logfile );
568 out.println( "# " + prg_name );
569 out.println( "# version : " + prg_v );
570 out.println( "# date : " + prg_date );
571 out.println( "# based on: " + f );
572 out.println( "# ----------------------------------" );
573 out.println( "Gene trees :\t" + gene_trees_file.getCanonicalPath() );
574 out.println( "Species tree :\t" + species_tree_file.getCanonicalPath() );
575 out.println( "All vs all orthology table :\t" + outtable.getCanonicalPath() );
577 out.println( rio.getLog().toString() );
580 System.out.println( "Wrote log to :\t" + logfile.getCanonicalPath() );
584 private final static SortedMap<String, String> obtainMapping( final File dir,
586 final String suffix )
588 final File the_one = ForesterUtil.getMatchingFile( dir, prefix, suffix );
589 final BasicTable<String> t = BasicTableParser.parse( the_one, '\t' );
590 return t.getColumnsAsMap( 0, 1 );
593 private final static Phylogeny obtainTree( final File dir, final String prefix, final String suffix )
595 final File the_one = ForesterUtil.getMatchingFile( dir, prefix, suffix );
596 final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
597 return factory.create( the_one, PhyloXmlParser.createPhyloXmlParserXsdValidating() )[ 0 ];