8d01afa6ac1bd1483ede87b112f6cde5bac1c1af
[jalview.git] / forester / java / src / org / forester / rio / RIOUtil.java
1
2 package org.forester.rio;
3
4 import java.io.File;
5 import java.io.FilenameFilter;
6 import java.io.IOException;
7 import java.math.RoundingMode;
8 import java.util.ArrayList;
9 import java.util.List;
10 import java.util.Map;
11 import java.util.SortedSet;
12 import java.util.TreeSet;
13
14 import org.forester.datastructures.IntMatrix;
15 import org.forester.io.parsers.IteratingPhylogenyParser;
16 import org.forester.io.parsers.PhylogenyParser;
17 import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
18 import org.forester.io.parsers.nhx.NHXParser;
19 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
20 import org.forester.io.parsers.phyloxml.PhyloXmlParser;
21 import org.forester.io.parsers.util.ParserUtils;
22 import org.forester.io.writers.PhylogenyWriter;
23 import org.forester.phylogeny.Phylogeny;
24 import org.forester.rio.RIO.REROOTING;
25 import org.forester.sdi.SDIException;
26 import org.forester.sdi.SDIutil.ALGORITHM;
27 import org.forester.util.BasicDescriptiveStatistics;
28 import org.forester.util.BasicTable;
29 import org.forester.util.BasicTableParser;
30 import org.forester.util.EasyWriter;
31 import org.forester.util.ForesterUtil;
32
33 public final class RIOUtil {
34
35     public static final void executeAnalysis( final File gene_trees_file,
36                                               final File species_tree_file,
37                                               final File orthology_outtable,
38                                               final File orthology_groups_outfile,
39                                               final File logfile,
40                                               final String outgroup,
41                                               final REROOTING rerooting,
42                                               final int gt_first,
43                                               final int gt_last,
44                                               final File return_species_tree,
45                                               final File return_min_dup_gene_tree,
46                                               final File return_median_dup_gene_tree,
47                                               final boolean transfer_taxonomy,
48                                               final ALGORITHM algorithm,
49                                               final boolean use_gene_trees_dir,
50                                               final EasyWriter log,
51                                               final double ortholog_group_cutoff ) {
52         try {
53             final RIO rio;
54             boolean iterating = false;
55             final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
56             if ( p instanceof PhyloXmlParser ) {
57                 rio = RIO.executeAnalysis( gene_trees_file,
58                                            species_tree_file,
59                                            algorithm,
60                                            rerooting,
61                                            outgroup,
62                                            gt_first,
63                                            gt_last,
64                                            logfile != null,
65                                            true,
66                                            transfer_taxonomy );
67             }
68             else {
69                 iterating = true;
70                 if ( p instanceof NHXParser ) {
71                     final NHXParser nhx = ( NHXParser ) p;
72                     nhx.setReplaceUnderscores( false );
73                     nhx.setIgnoreQuotes( true );
74                     nhx.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
75                 }
76                 else if ( p instanceof NexusPhylogeniesParser ) {
77                     final NexusPhylogeniesParser nex = ( NexusPhylogeniesParser ) p;
78                     nex.setReplaceUnderscores( false );
79                     nex.setIgnoreQuotes( true );
80                     nex.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
81                 }
82                 else {
83                     throw new RuntimeException( "unknown parser type: " + p );
84                 }
85                 final IteratingPhylogenyParser ip = ( IteratingPhylogenyParser ) p;
86                 ip.setSource( gene_trees_file );
87                 rio = RIO.executeAnalysis( ip,
88                                            species_tree_file,
89                                            algorithm,
90                                            rerooting,
91                                            outgroup,
92                                            gt_first,
93                                            gt_last,
94                                            logfile != null,
95                                            !use_gene_trees_dir,
96                                            transfer_taxonomy );
97             }
98             if ( !use_gene_trees_dir ) {
99                 if ( algorithm == ALGORITHM.GSDIR ) {
100                     System.out.println( "Taxonomy linking based on           :\t" + rio.getGSDIRtaxCompBase() );
101                 }
102             }
103             ///
104             ////
105             final IntMatrix m;
106             if ( iterating ) {
107                 m = rio.getOrthologTable();
108             }
109             else {
110                 m = RIO.calculateOrthologTable( rio.getAnalyzedGeneTrees(), true );
111             }
112             final BasicDescriptiveStatistics stats = rio.getDuplicationsStatistics();
113             writeTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir );
114             final int ortholog_groups = writeOrtologGroups( orthology_groups_outfile,
115                                                             ortholog_group_cutoff,
116                                                             stats.getN(),
117                                                             m,
118                                                             !use_gene_trees_dir,
119                                                             false );
120             final int ortholog_groups_005 = writeOrtologGroups( null, 0.05, stats.getN(), m, false, true );
121             final int ortholog_groups_025 = writeOrtologGroups( null, 0.25, stats.getN(), m, false, true );
122             final int ortholog_groups_05 = writeOrtologGroups( null, 0.5, stats.getN(), m, false, true );
123             final int ortholog_groups_075 = writeOrtologGroups( null, 0.75, stats.getN(), m, false, true );
124             final int ortholog_groups_095 = writeOrtologGroups( null, 0.95, stats.getN(), m, false, true );
125             if ( ( algorithm != ALGORITHM.SDIR ) && ( logfile != null ) ) {
126                 writeLogFile( logfile,
127                               rio,
128                               species_tree_file,
129                               gene_trees_file,
130                               orthology_outtable,
131                               org.forester.application.rio.PRG_NAME,
132                               org.forester.application.rio.PRG_VERSION,
133                               org.forester.application.rio.PRG_DATE,
134                               ForesterUtil.getForesterLibraryInformation(),
135                               !use_gene_trees_dir );
136             }
137             if ( return_species_tree != null ) {
138                 writeTree( rio.getSpeciesTree(),
139                            return_species_tree,
140                            use_gene_trees_dir ? null : "Wrote (stripped) species tree to    :\t" );
141             }
142             if ( return_min_dup_gene_tree != null && rio.getMinDuplicationsGeneTree() != null ) {
143                 final int min = ( int ) rio.getDuplicationsStatistics().getMin();
144                 writeTree( rio.getMinDuplicationsGeneTree(),
145                            new File( return_min_dup_gene_tree.toString() + min + ".xml" ),
146                            use_gene_trees_dir ? null : "Wrote one min duplication gene tree :\t" );
147             }
148             if ( return_median_dup_gene_tree != null && rio.getDuplicationsToTreeMap() != null ) {
149                 final int med = ( int ) rio.getDuplicationsStatistics().median();
150                 writeTree( rio.getDuplicationsToTreeMap().get( med ),
151                            new File( return_median_dup_gene_tree.toString() + med + ".xml" ),
152                            use_gene_trees_dir ? null : "Wrote one med duplication gene tree :\t" );
153             }
154             final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.##" );
155             final int min = ( int ) stats.getMin();
156             final int max = ( int ) stats.getMax();
157             final int median = ( int ) stats.median();
158             int min_count = 0;
159             int max_count = 0;
160             int median_count = 0;
161             for( double d : stats.getData() ) {
162                 if ( ( ( int ) d ) == min ) {
163                     ++min_count;
164                 }
165                 if ( ( ( int ) d ) == max ) {
166                     ++max_count;
167                 }
168                 if ( ( ( int ) d ) == median ) {
169                     ++median_count;
170                 }
171             }
172             final double min_count_percentage = ( 100.0 * min_count ) / stats.getN();
173             final double max_count_percentage = ( 100.0 * max_count ) / stats.getN();
174             final double median_count_percentage = ( 100.0 * median_count ) / stats.getN();
175             if ( use_gene_trees_dir ) {
176                 String name = gene_trees_file.getName();
177                 if ( name.indexOf( "." ) > 0 ) {
178                     name = name.substring( 0, name.lastIndexOf( "." ) );
179                 }
180                 log.print( name );
181                 log.print( "\t" );
182                 log.print( Integer.toString( rio.getExtNodesOfAnalyzedGeneTrees() ) );
183                 log.print( "\t" );
184                 log.print( Integer.toString( ortholog_groups ) );
185                 //
186                 log.print( "\t" );
187                 log.print( Integer.toString( ortholog_groups_005 ) );
188                 log.print( "\t" );
189                 log.print( Integer.toString( ortholog_groups_025 ) );
190                 log.print( "\t" );
191                 log.print( Integer.toString( ortholog_groups_05 ) );
192                 log.print( "\t" );
193                 log.print( Integer.toString( ortholog_groups_075 ) );
194                 log.print( "\t" );
195                 log.print( Integer.toString( ortholog_groups_095 ) );
196                 //
197                 log.print( "\t" );
198                 if ( stats.getN() > 3 ) {
199                     log.print( df.format( median ) );
200                 }
201                 else {
202                     log.print( "" );
203                 }
204                 log.print( "\t" );
205                 log.print( df.format( stats.arithmeticMean() ) );
206                 log.print( "\t" );
207                 if ( stats.getN() > 3 ) {
208                     log.print( df.format( stats.sampleStandardDeviation() ) );
209                 }
210                 else {
211                     log.print( "" );
212                 }
213                 log.print( "\t" );
214                 log.print( Integer.toString( min ) );
215                 log.print( "\t" );
216                 log.print( Integer.toString( max ) );
217                 log.print( "\t" );
218                 log.print( Integer.toString( rio.getRemovedGeneTreeNodes().size() ) );
219                 log.print( "\t" );
220                 log.print( Integer.toString( stats.getN() ) );
221                 log.println();
222             }
223             else {
224                 System.out.println( "Gene tree internal nodes            :\t" + rio.getIntNodesOfAnalyzedGeneTrees() );
225                 System.out.println( "Gene tree external nodes            :\t" + rio.getExtNodesOfAnalyzedGeneTrees() );
226                 System.out.println( "Mean number of duplications         :\t" + df.format( stats.arithmeticMean() )
227                         + "\t" + df.format( ( 100.0 * stats.arithmeticMean() ) / rio.getIntNodesOfAnalyzedGeneTrees() )
228                         + "%\t(sd: " + df.format( stats.sampleStandardDeviation() ) + ")" );
229                 if ( stats.getN() > 3 ) {
230                     System.out.println( "Median number of duplications       :\t" + df.format( median ) + "\t"
231                             + df.format( ( 100.0 * median ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
232                 }
233                 System.out.println( "Minimum duplications                :\t" + min + "\t"
234                         + df.format( ( 100.0 * min ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
235                 System.out.println( "Maximum duplications                :\t" + ( int ) max + "\t"
236                         + df.format( ( 100.0 * max ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
237                 System.out.println( "Gene trees with median duplications :\t" + median_count + "\t"
238                         + df.format( median_count_percentage ) + "%" );
239                 System.out.println( "Gene trees with minimum duplications:\t" + min_count + "\t"
240                         + df.format( min_count_percentage ) + "%" );
241                 System.out.println( "Gene trees with maximum duplications:\t" + max_count + "\t"
242                         + df.format( max_count_percentage ) + "%" );
243                 if ( algorithm == ALGORITHM.GSDIR ) {
244                     System.out.println( "Removed ext gene tree nodes         :\t"
245                             + rio.getRemovedGeneTreeNodes().size() );
246                 }
247             }
248         }
249         catch ( final RIOException e ) {
250             ForesterUtil.fatalError( e.getLocalizedMessage() );
251         }
252         catch ( final SDIException e ) {
253             ForesterUtil.fatalError( e.getLocalizedMessage() );
254         }
255         catch ( final IOException e ) {
256             ForesterUtil.fatalError( e.getLocalizedMessage() );
257         }
258         catch ( final OutOfMemoryError e ) {
259             ForesterUtil.outOfMemoryError( e );
260         }
261         catch ( final Exception e ) {
262             ForesterUtil.unexpectedFatalError( e );
263         }
264         catch ( final Error e ) {
265             ForesterUtil.unexpectedFatalError( e );
266         }
267     }
268
269     private static final void writeTable( final File table_outfile,
270                                           final int gene_trees_analyzed,
271                                           final IntMatrix m,
272                                           final boolean verbose )
273             throws IOException {
274         final EasyWriter w = ForesterUtil.createEasyWriter( table_outfile );
275         final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" );
276         df.setDecimalSeparatorAlwaysShown( false );
277         df.setRoundingMode( RoundingMode.HALF_UP );
278         for( int i = 0; i < m.size(); ++i ) {
279             w.print( "\t" );
280             w.print( m.getLabel( i ) );
281         }
282         w.println();
283         for( int x = 0; x < m.size(); ++x ) {
284             w.print( m.getLabel( x ) );
285             for( int y = 0; y < m.size(); ++y ) {
286                 w.print( "\t" );
287                 if ( x == y ) {
288                     if ( m.get( x, y ) != gene_trees_analyzed ) {
289                         ForesterUtil.unexpectedFatalError( "diagonal value is off" );
290                     }
291                     w.print( "-" );
292                 }
293                 else {
294                     w.print( df.format( ( ( double ) m.get( x, y ) ) / gene_trees_analyzed ) );
295                 }
296             }
297             w.println();
298         }
299         w.close();
300         if ( verbose ) {
301             System.out.println( "Wrote table to                      :\t" + table_outfile.getCanonicalPath() );
302         }
303     }
304
305     private static final int writeOrtologGroups( final File outfile,
306                                                  final double cutoff,
307                                                  final int gene_trees_analyzed,
308                                                  final IntMatrix m,
309                                                  final boolean verbose,
310                                                  final boolean calc_conly )
311             throws IOException {
312         List<SortedSet<String>> groups = new ArrayList<SortedSet<String>>();
313         BasicDescriptiveStatistics stats = new BasicDescriptiveStatistics();
314         int below_075 = 0;
315         int below_05 = 0;
316         int below_025 = 0;
317         for( int x = 1; x < m.size(); ++x ) {
318             final String a = m.getLabel( x );
319             for( int y = 0; y < x; ++y ) {
320                 final String b = m.getLabel( y );
321                 final double s = ( ( double ) m.get( x, y ) ) / gene_trees_analyzed;
322                 stats.addValue( s );
323                 if ( s < 0.75 ) {
324                     below_075++;
325                     if ( s < 0.5 ) {
326                         below_05++;
327                         if ( s < 0.25 ) {
328                             below_025++;
329                         }
330                     }
331                 }
332                 if ( s >= cutoff ) {
333                     boolean found = false;
334                     for( final SortedSet<String> group : groups ) {
335                         if ( group.contains( a ) ) {
336                             group.add( b );
337                             found = true;
338                         }
339                         if ( group.contains( b ) ) {
340                             group.add( a );
341                             found = true;
342                         }
343                     }
344                     if ( !found ) {
345                         final SortedSet<String> new_group = new TreeSet<String>();
346                         new_group.add( a );
347                         new_group.add( b );
348                         groups.add( new_group );
349                     }
350                 }
351             }
352         }
353         //Deal with singlets:
354         for( int x = 0; x < m.size(); ++x ) {
355             final String a = m.getLabel( x );
356             boolean found = false;
357             for( final SortedSet<String> group : groups ) {
358                 if ( group.contains( a ) ) {
359                     found = true;
360                     break;
361                 }
362             }
363             if ( !found ) {
364                 final SortedSet<String> new_group = new TreeSet<String>();
365                 new_group.add( a );
366                 groups.add( new_group );
367             }
368         }
369         if ( calc_conly ) {
370             return groups.size();
371         }
372         final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" );
373         df.setDecimalSeparatorAlwaysShown( false );
374         df.setRoundingMode( RoundingMode.HALF_UP );
375         final EasyWriter w = ForesterUtil.createEasyWriter( outfile );
376         int counter = 1;
377         for( final SortedSet<String> group : groups ) {
378             w.print( Integer.toString( counter++ ) );
379             for( final String s : group ) {
380                 w.print( "\t" );
381                 w.print( s );
382             }
383             w.println();
384         }
385         w.println();
386         w.println( "# Cutoff\t" + df.format( cutoff ) );
387         w.println();
388         w.println( "# Orthology support statistics:" );
389         if ( stats.getN() > 3 ) {
390             w.println( "# Median\t" + df.format( stats.median() ) );
391         }
392         w.println( "# Mean\t" + df.format( stats.arithmeticMean() ) );
393         if ( stats.getN() > 3 ) {
394             w.println( "# SD\t" + df.format( stats.sampleStandardDeviation() ) );
395         }
396         w.println( "# Min\t" + df.format( stats.getMin() ) );
397         w.println( "# Max\t" + df.format( stats.getMax() ) );
398         w.println( "# Total\t" + df.format( stats.getN() ) );
399         w.println( "# Below 0.75\t" + below_075 + "\t" + df.format( ( 100.0 * below_075 / stats.getN() ) ) + "%" );
400         w.println( "# Below 0.5\t" + below_05 + "\t" + df.format( ( 100.0 * below_05 / stats.getN() ) ) + "%" );
401         w.println( "# Below 0.25\t" + below_025 + "\t" + df.format( ( 100.0 * below_025 / stats.getN() ) ) + "%" );
402         w.close();
403         if ( verbose ) {
404             System.out.println( "Number of ortholog groups           :\t" + groups.size() );
405             System.out.println( "Wrote orthologs groups table to     :\t" + outfile.getCanonicalPath() );
406         }
407         return groups.size();
408     }
409
410     private static void writeTree( final Phylogeny p, final File f, final String comment ) throws IOException {
411         final PhylogenyWriter writer = new PhylogenyWriter();
412         writer.toPhyloXML( f, p, 0 );
413         if ( comment != null ) {
414             System.out.println( comment + f.getCanonicalPath() );
415         }
416     }
417
418     private static void writeLogFile( final File logfile,
419                                       final RIO rio,
420                                       final File species_tree_file,
421                                       final File gene_trees_file,
422                                       final File outtable,
423                                       final String prg_name,
424                                       final String prg_v,
425                                       final String prg_date,
426                                       final String f,
427                                       final boolean verbose )
428             throws IOException {
429         final EasyWriter out = ForesterUtil.createEasyWriter( logfile );
430         out.println( "# " + prg_name );
431         out.println( "# version : " + prg_v );
432         out.println( "# date    : " + prg_date );
433         out.println( "# based on: " + f );
434         out.println( "# ----------------------------------" );
435         out.println( "Gene trees                          :\t" + gene_trees_file.getCanonicalPath() );
436         out.println( "Species tree                        :\t" + species_tree_file.getCanonicalPath() );
437         out.println( "All vs all orthology table          :\t" + outtable.getCanonicalPath() );
438         out.flush();
439         out.println( rio.getLog().toString() );
440         out.close();
441         if ( verbose ) {
442             System.out.println( "Wrote log to                        :\t" + logfile.getCanonicalPath() );
443         }
444     }
445
446     private final static Map<String, String> obtainMapping( final File dir, final String prefix, final String suffix )
447             throws IOException {
448         if ( !dir.exists() ) {
449             throw new IOException( "[" + dir + "] does not exist" );
450         }
451         if ( !dir.isDirectory() ) {
452             throw new IOException( "[" + dir + "] is not a directory" );
453         }
454         final File mapping_files[] = dir.listFiles( new FilenameFilter() {
455
456             @Override
457             public boolean accept( final File dir, final String name ) {
458                 return ( name.endsWith( suffix ) );
459             }
460         } );
461         String my_suffix = suffix;
462         boolean done = false;
463         do {
464             int matches = 0;
465             for( File file : mapping_files ) {
466                 if ( file.getName().equals( my_suffix ) ) {
467                     matches++;
468                 }
469             }
470             if ( matches == 1) {
471                 done = true;
472             }
473             else {
474                 my_suffix = my_suffix.substring( 0, my_suffix.length() - 1);
475             }
476         } while (!done );
477         
478         
479         if ( mapping_files.length == 0 ) {
480             throw new IOException( "file with prefix \"" + prefix + "\" and suffix \"" + suffix + "\" not found in ["
481                     + dir + "] " );
482         }
483         if ( mapping_files.length > 1 ) {
484             throw new IOException( "file with prefix \"" + prefix + "\" and suffix \"" + suffix + "\" not unique in ["
485                     + dir + "] " );
486         }
487         final BasicTable<String> t = BasicTableParser.parse( mapping_files[ 0 ], '\t' );
488         return t.getColumnsAsMap( 0, 1 );
489     }
490 }