forester/java/src/org/forester/rio/RIOUtil.java

   1
   2 package org.forester.rio;
   3
   4 import java.io.File;
   5 import java.io.FilenameFilter;
   6 import java.io.IOException;
   7 import java.math.RoundingMode;
   8 import java.util.ArrayList;
   9 import java.util.List;
  10 import java.util.Map;
  11 import java.util.SortedSet;
  12 import java.util.TreeSet;
  13
  14 import org.forester.datastructures.IntMatrix;
  15 import org.forester.io.parsers.IteratingPhylogenyParser;
  16 import org.forester.io.parsers.PhylogenyParser;
  17 import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
  18 import org.forester.io.parsers.nhx.NHXParser;
  19 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
  20 import org.forester.io.parsers.phyloxml.PhyloXmlParser;
  21 import org.forester.io.parsers.util.ParserUtils;
  22 import org.forester.io.writers.PhylogenyWriter;
  23 import org.forester.phylogeny.Phylogeny;
  24 import org.forester.rio.RIO.REROOTING;
  25 import org.forester.sdi.SDIException;
  26 import org.forester.sdi.SDIutil.ALGORITHM;
  27 import org.forester.util.BasicDescriptiveStatistics;
  28 import org.forester.util.BasicTable;
  29 import org.forester.util.BasicTableParser;
  30 import org.forester.util.EasyWriter;
  31 import org.forester.util.ForesterUtil;
  32
  33 public final class RIOUtil {
  34
  35     public static final void executeAnalysis( final File gene_trees_file,
  36                                               final File species_tree_file,
  37                                               final File orthology_outtable,
  38                                               final File orthology_groups_outfile,
  39                                               final File logfile,
  40                                               final String outgroup,
  41                                               final REROOTING rerooting,
  42                                               final int gt_first,
  43                                               final int gt_last,
  44                                               final File return_species_tree,
  45                                               final File return_min_dup_gene_tree,
  46                                               final File return_median_dup_gene_tree,
  47                                               final boolean transfer_taxonomy,
  48                                               final ALGORITHM algorithm,
  49                                               final boolean use_gene_trees_dir,
  50                                               final EasyWriter log,
  51                                               final double ortholog_group_cutoff ) {
  52         try {
  53             final RIO rio;
  54             boolean iterating = false;
  55             final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
  56             if ( p instanceof PhyloXmlParser ) {
  57                 rio = RIO.executeAnalysis( gene_trees_file,
  58                                            species_tree_file,
  59                                            algorithm,
  60                                            rerooting,
  61                                            outgroup,
  62                                            gt_first,
  63                                            gt_last,
  64                                            logfile != null,
  65                                            true,
  66                                            transfer_taxonomy );
  67             }
  68             else {
  69                 iterating = true;
  70                 if ( p instanceof NHXParser ) {
  71                     final NHXParser nhx = ( NHXParser ) p;
  72                     nhx.setReplaceUnderscores( false );
  73                     nhx.setIgnoreQuotes( true );
  74                     nhx.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
  75                 }
  76                 else if ( p instanceof NexusPhylogeniesParser ) {
  77                     final NexusPhylogeniesParser nex = ( NexusPhylogeniesParser ) p;
  78                     nex.setReplaceUnderscores( false );
  79                     nex.setIgnoreQuotes( true );
  80                     nex.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
  81                 }
  82                 else {
  83                     throw new RuntimeException( "unknown parser type: " + p );
  84                 }
  85                 final IteratingPhylogenyParser ip = ( IteratingPhylogenyParser ) p;
  86                 ip.setSource( gene_trees_file );
  87                 rio = RIO.executeAnalysis( ip,
  88                                            species_tree_file,
  89                                            algorithm,
  90                                            rerooting,
  91                                            outgroup,
  92                                            gt_first,
  93                                            gt_last,
  94                                            logfile != null,
  95                                            !use_gene_trees_dir,
  96                                            transfer_taxonomy );
  97             }
  98             if ( !use_gene_trees_dir ) {
  99                 if ( algorithm == ALGORITHM.GSDIR ) {
 100                     System.out.println( "Taxonomy linking based on           :\t" + rio.getGSDIRtaxCompBase() );
 101                 }
 102             }
 103             ///
 104             ////
 105             final IntMatrix m;
 106             if ( iterating ) {
 107                 m = rio.getOrthologTable();
 108             }
 109             else {
 110                 m = RIO.calculateOrthologTable( rio.getAnalyzedGeneTrees(), true );
 111             }
 112             final BasicDescriptiveStatistics stats = rio.getDuplicationsStatistics();
 113             writeTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir );
 114             final int ortholog_groups = writeOrtologGroups( orthology_groups_outfile,
 115                                                             ortholog_group_cutoff,
 116                                                             stats.getN(),
 117                                                             m,
 118                                                             !use_gene_trees_dir,
 119                                                             false );
 120             final int ortholog_groups_005 = writeOrtologGroups( null, 0.05, stats.getN(), m, false, true );
 121             final int ortholog_groups_025 = writeOrtologGroups( null, 0.25, stats.getN(), m, false, true );
 122             final int ortholog_groups_05 = writeOrtologGroups( null, 0.5, stats.getN(), m, false, true );
 123             final int ortholog_groups_075 = writeOrtologGroups( null, 0.75, stats.getN(), m, false, true );
 124             final int ortholog_groups_095 = writeOrtologGroups( null, 0.95, stats.getN(), m, false, true );
 125             if ( ( algorithm != ALGORITHM.SDIR ) && ( logfile != null ) ) {
 126                 writeLogFile( logfile,
 127                               rio,
 128                               species_tree_file,
 129                               gene_trees_file,
 130                               orthology_outtable,
 131                               org.forester.application.rio.PRG_NAME,
 132                               org.forester.application.rio.PRG_VERSION,
 133                               org.forester.application.rio.PRG_DATE,
 134                               ForesterUtil.getForesterLibraryInformation(),
 135                               !use_gene_trees_dir );
 136             }
 137             if ( return_species_tree != null ) {
 138                 writeTree( rio.getSpeciesTree(),
 139                            return_species_tree,
 140                            use_gene_trees_dir ? null : "Wrote (stripped) species tree to    :\t" );
 141             }
 142             if ( return_min_dup_gene_tree != null && rio.getMinDuplicationsGeneTree() != null ) {
 143                 final int min = ( int ) rio.getDuplicationsStatistics().getMin();
 144                 writeTree( rio.getMinDuplicationsGeneTree(),
 145                            new File( return_min_dup_gene_tree.toString() + min + ".xml" ),
 146                            use_gene_trees_dir ? null : "Wrote one min duplication gene tree :\t" );
 147             }
 148             if ( return_median_dup_gene_tree != null && rio.getDuplicationsToTreeMap() != null ) {
 149                 final int med = ( int ) rio.getDuplicationsStatistics().median();
 150                 writeTree( rio.getDuplicationsToTreeMap().get( med ),
 151                            new File( return_median_dup_gene_tree.toString() + med + ".xml" ),
 152                            use_gene_trees_dir ? null : "Wrote one med duplication gene tree :\t" );
 153             }
 154             final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.##" );
 155             final int min = ( int ) stats.getMin();
 156             final int max = ( int ) stats.getMax();
 157             final int median = ( int ) stats.median();
 158             int min_count = 0;
 159             int max_count = 0;
 160             int median_count = 0;
 161             for( double d : stats.getData() ) {
 162                 if ( ( ( int ) d ) == min ) {
 163                     ++min_count;
 164                 }
 165                 if ( ( ( int ) d ) == max ) {
 166                     ++max_count;
 167                 }
 168                 if ( ( ( int ) d ) == median ) {
 169                     ++median_count;
 170                 }
 171             }
 172             final double min_count_percentage = ( 100.0 * min_count ) / stats.getN();
 173             final double max_count_percentage = ( 100.0 * max_count ) / stats.getN();
 174             final double median_count_percentage = ( 100.0 * median_count ) / stats.getN();
 175             if ( use_gene_trees_dir ) {
 176                 String name = gene_trees_file.getName();
 177                 if ( name.indexOf( "." ) > 0 ) {
 178                     name = name.substring( 0, name.lastIndexOf( "." ) );
 179                 }
 180                 log.print( name );
 181                 log.print( "\t" );
 182                 log.print( Integer.toString( rio.getExtNodesOfAnalyzedGeneTrees() ) );
 183                 log.print( "\t" );
 184                 log.print( Integer.toString( ortholog_groups ) );
 185                 //
 186                 log.print( "\t" );
 187                 log.print( Integer.toString( ortholog_groups_005 ) );
 188                 log.print( "\t" );
 189                 log.print( Integer.toString( ortholog_groups_025 ) );
 190                 log.print( "\t" );
 191                 log.print( Integer.toString( ortholog_groups_05 ) );
 192                 log.print( "\t" );
 193                 log.print( Integer.toString( ortholog_groups_075 ) );
 194                 log.print( "\t" );
 195                 log.print( Integer.toString( ortholog_groups_095 ) );
 196                 //
 197                 log.print( "\t" );
 198                 if ( stats.getN() > 3 ) {
 199                     log.print( df.format( median ) );
 200                 }
 201                 else {
 202                     log.print( "" );
 203                 }
 204                 log.print( "\t" );
 205                 log.print( df.format( stats.arithmeticMean() ) );
 206                 log.print( "\t" );
 207                 if ( stats.getN() > 3 ) {
 208                     log.print( df.format( stats.sampleStandardDeviation() ) );
 209                 }
 210                 else {
 211                     log.print( "" );
 212                 }
 213                 log.print( "\t" );
 214                 log.print( Integer.toString( min ) );
 215                 log.print( "\t" );
 216                 log.print( Integer.toString( max ) );
 217                 log.print( "\t" );
 218                 log.print( Integer.toString( rio.getRemovedGeneTreeNodes().size() ) );
 219                 log.print( "\t" );
 220                 log.print( Integer.toString( stats.getN() ) );
 221                 log.println();
 222             }
 223             else {
 224                 System.out.println( "Gene tree internal nodes            :\t" + rio.getIntNodesOfAnalyzedGeneTrees() );
 225                 System.out.println( "Gene tree external nodes            :\t" + rio.getExtNodesOfAnalyzedGeneTrees() );
 226                 System.out.println( "Mean number of duplications         :\t" + df.format( stats.arithmeticMean() )
 227                         + "\t" + df.format( ( 100.0 * stats.arithmeticMean() ) / rio.getIntNodesOfAnalyzedGeneTrees() )
 228                         + "%\t(sd: " + df.format( stats.sampleStandardDeviation() ) + ")" );
 229                 if ( stats.getN() > 3 ) {
 230                     System.out.println( "Median number of duplications       :\t" + df.format( median ) + "\t"
 231                             + df.format( ( 100.0 * median ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
 232                 }
 233                 System.out.println( "Minimum duplications                :\t" + min + "\t"
 234                         + df.format( ( 100.0 * min ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
 235                 System.out.println( "Maximum duplications                :\t" + ( int ) max + "\t"
 236                         + df.format( ( 100.0 * max ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
 237                 System.out.println( "Gene trees with median duplications :\t" + median_count + "\t"
 238                         + df.format( median_count_percentage ) + "%" );
 239                 System.out.println( "Gene trees with minimum duplications:\t" + min_count + "\t"
 240                         + df.format( min_count_percentage ) + "%" );
 241                 System.out.println( "Gene trees with maximum duplications:\t" + max_count + "\t"
 242                         + df.format( max_count_percentage ) + "%" );
 243                 if ( algorithm == ALGORITHM.GSDIR ) {
 244                     System.out.println( "Removed ext gene tree nodes         :\t"
 245                             + rio.getRemovedGeneTreeNodes().size() );
 246                 }
 247             }
 248         }
 249         catch ( final RIOException e ) {
 250             ForesterUtil.fatalError( e.getLocalizedMessage() );
 251         }
 252         catch ( final SDIException e ) {
 253             ForesterUtil.fatalError( e.getLocalizedMessage() );
 254         }
 255         catch ( final IOException e ) {
 256             ForesterUtil.fatalError( e.getLocalizedMessage() );
 257         }
 258         catch ( final OutOfMemoryError e ) {
 259             ForesterUtil.outOfMemoryError( e );
 260         }
 261         catch ( final Exception e ) {
 262             ForesterUtil.unexpectedFatalError( e );
 263         }
 264         catch ( final Error e ) {
 265             ForesterUtil.unexpectedFatalError( e );
 266         }
 267     }
 268
 269     private static final void writeTable( final File table_outfile,
 270                                           final int gene_trees_analyzed,
 271                                           final IntMatrix m,
 272                                           final boolean verbose )
 273             throws IOException {
 274         final EasyWriter w = ForesterUtil.createEasyWriter( table_outfile );
 275         final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" );
 276         df.setDecimalSeparatorAlwaysShown( false );
 277         df.setRoundingMode( RoundingMode.HALF_UP );
 278         for( int i = 0; i < m.size(); ++i ) {
 279             w.print( "\t" );
 280             w.print( m.getLabel( i ) );
 281         }
 282         w.println();
 283         for( int x = 0; x < m.size(); ++x ) {
 284             w.print( m.getLabel( x ) );
 285             for( int y = 0; y < m.size(); ++y ) {
 286                 w.print( "\t" );
 287                 if ( x == y ) {
 288                     if ( m.get( x, y ) != gene_trees_analyzed ) {
 289                         ForesterUtil.unexpectedFatalError( "diagonal value is off" );
 290                     }
 291                     w.print( "-" );
 292                 }
 293                 else {
 294                     w.print( df.format( ( ( double ) m.get( x, y ) ) / gene_trees_analyzed ) );
 295                 }
 296             }
 297             w.println();
 298         }
 299         w.close();
 300         if ( verbose ) {
 301             System.out.println( "Wrote table to                      :\t" + table_outfile.getCanonicalPath() );
 302         }
 303     }
 304
 305     private static final int writeOrtologGroups( final File outfile,
 306                                                  final double cutoff,
 307                                                  final int gene_trees_analyzed,
 308                                                  final IntMatrix m,
 309                                                  final boolean verbose,
 310                                                  final boolean calc_conly )
 311             throws IOException {
 312         List<SortedSet<String>> groups = new ArrayList<SortedSet<String>>();
 313         BasicDescriptiveStatistics stats = new BasicDescriptiveStatistics();
 314         int below_075 = 0;
 315         int below_05 = 0;
 316         int below_025 = 0;
 317         for( int x = 1; x < m.size(); ++x ) {
 318             final String a = m.getLabel( x );
 319             for( int y = 0; y < x; ++y ) {
 320                 final String b = m.getLabel( y );
 321                 final double s = ( ( double ) m.get( x, y ) ) / gene_trees_analyzed;
 322                 stats.addValue( s );
 323                 if ( s < 0.75 ) {
 324                     below_075++;
 325                     if ( s < 0.5 ) {
 326                         below_05++;
 327                         if ( s < 0.25 ) {
 328                             below_025++;
 329                         }
 330                     }
 331                 }
 332                 if ( s >= cutoff ) {
 333                     boolean found = false;
 334                     for( final SortedSet<String> group : groups ) {
 335                         if ( group.contains( a ) ) {
 336                             group.add( b );
 337                             found = true;
 338                         }
 339                         if ( group.contains( b ) ) {
 340                             group.add( a );
 341                             found = true;
 342                         }
 343                     }
 344                     if ( !found ) {
 345                         final SortedSet<String> new_group = new TreeSet<String>();
 346                         new_group.add( a );
 347                         new_group.add( b );
 348                         groups.add( new_group );
 349                     }
 350                 }
 351             }
 352         }
 353         //Deal with singlets:
 354         for( int x = 0; x < m.size(); ++x ) {
 355             final String a = m.getLabel( x );
 356             boolean found = false;
 357             for( final SortedSet<String> group : groups ) {
 358                 if ( group.contains( a ) ) {
 359                     found = true;
 360                     break;
 361                 }
 362             }
 363             if ( !found ) {
 364                 final SortedSet<String> new_group = new TreeSet<String>();
 365                 new_group.add( a );
 366                 groups.add( new_group );
 367             }
 368         }
 369         if ( calc_conly ) {
 370             return groups.size();
 371         }
 372         final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" );
 373         df.setDecimalSeparatorAlwaysShown( false );
 374         df.setRoundingMode( RoundingMode.HALF_UP );
 375         final EasyWriter w = ForesterUtil.createEasyWriter( outfile );
 376         int counter = 1;
 377         for( final SortedSet<String> group : groups ) {
 378             w.print( Integer.toString( counter++ ) );
 379             for( final String s : group ) {
 380                 w.print( "\t" );
 381                 w.print( s );
 382             }
 383             w.println();
 384         }
 385         w.println();
 386         w.println( "# Cutoff\t" + df.format( cutoff ) );
 387         w.println();
 388         w.println( "# Orthology support statistics:" );
 389         if ( stats.getN() > 3 ) {
 390             w.println( "# Median\t" + df.format( stats.median() ) );
 391         }
 392         w.println( "# Mean\t" + df.format( stats.arithmeticMean() ) );
 393         if ( stats.getN() > 3 ) {
 394             w.println( "# SD\t" + df.format( stats.sampleStandardDeviation() ) );
 395         }
 396         w.println( "# Min\t" + df.format( stats.getMin() ) );
 397         w.println( "# Max\t" + df.format( stats.getMax() ) );
 398         w.println( "# Total\t" + df.format( stats.getN() ) );
 399         w.println( "# Below 0.75\t" + below_075 + "\t" + df.format( ( 100.0 * below_075 / stats.getN() ) ) + "%" );
 400         w.println( "# Below 0.5\t" + below_05 + "\t" + df.format( ( 100.0 * below_05 / stats.getN() ) ) + "%" );
 401         w.println( "# Below 0.25\t" + below_025 + "\t" + df.format( ( 100.0 * below_025 / stats.getN() ) ) + "%" );
 402         w.close();
 403         if ( verbose ) {
 404             System.out.println( "Number of ortholog groups           :\t" + groups.size() );
 405             System.out.println( "Wrote orthologs groups table to     :\t" + outfile.getCanonicalPath() );
 406         }
 407         return groups.size();
 408     }
 409
 410     private static void writeTree( final Phylogeny p, final File f, final String comment ) throws IOException {
 411         final PhylogenyWriter writer = new PhylogenyWriter();
 412         writer.toPhyloXML( f, p, 0 );
 413         if ( comment != null ) {
 414             System.out.println( comment + f.getCanonicalPath() );
 415         }
 416     }
 417
 418     private static void writeLogFile( final File logfile,
 419                                       final RIO rio,
 420                                       final File species_tree_file,
 421                                       final File gene_trees_file,
 422                                       final File outtable,
 423                                       final String prg_name,
 424                                       final String prg_v,
 425                                       final String prg_date,
 426                                       final String f,
 427                                       final boolean verbose )
 428             throws IOException {
 429         final EasyWriter out = ForesterUtil.createEasyWriter( logfile );
 430         out.println( "# " + prg_name );
 431         out.println( "# version : " + prg_v );
 432         out.println( "# date    : " + prg_date );
 433         out.println( "# based on: " + f );
 434         out.println( "# ----------------------------------" );
 435         out.println( "Gene trees                          :\t" + gene_trees_file.getCanonicalPath() );
 436         out.println( "Species tree                        :\t" + species_tree_file.getCanonicalPath() );
 437         out.println( "All vs all orthology table          :\t" + outtable.getCanonicalPath() );
 438         out.flush();
 439         out.println( rio.getLog().toString() );
 440         out.close();
 441         if ( verbose ) {
 442             System.out.println( "Wrote log to                        :\t" + logfile.getCanonicalPath() );
 443         }
 444     }
 445
 446     private final static Map<String, String> obtainMapping( final File dir, final String prefix, final String suffix )
 447             throws IOException {
 448         if ( !dir.exists() ) {
 449             throw new IOException( "[" + dir + "] does not exist" );
 450         }
 451         if ( !dir.isDirectory() ) {
 452             throw new IOException( "[" + dir + "] is not a directory" );
 453         }
 454         final File mapping_files[] = dir.listFiles( new FilenameFilter() {
 455
 456             @Override
 457             public boolean accept( final File dir, final String name ) {
 458                 return ( name.endsWith( suffix ) );
 459             }
 460         } );
 461         String my_suffix = suffix;
 462         boolean done = false;
 463         do {
 464             int matches = 0;
 465             for( File file : mapping_files ) {
 466                 if ( file.getName().equals( my_suffix ) ) {
 467                     matches++;
 468                 }
 469             }
 470             if ( matches == 1) {
 471                 done = true;
 472             }
 473             else {
 474                 my_suffix = my_suffix.substring( 0, my_suffix.length() - 1);
 475             }
 476         } while (!done );
 477
 478
 479         if ( mapping_files.length == 0 ) {
 480             throw new IOException( "file with prefix \"" + prefix + "\" and suffix \"" + suffix + "\" not found in ["
 481                     + dir + "] " );
 482         }
 483         if ( mapping_files.length > 1 ) {
 484             throw new IOException( "file with prefix \"" + prefix + "\" and suffix \"" + suffix + "\" not unique in ["
 485                     + dir + "] " );
 486         }
 487         final BasicTable<String> t = BasicTableParser.parse( mapping_files[ 0 ], '\t' );
 488         return t.getColumnsAsMap( 0, 1 );
 489     }
 490 }