in progress...

author cmzmasek <chris.zma@outlook.com>

Mon, 17 Apr 2017 20:41:44 +0000 (13:41 -0700)

committer cmzmasek <chris.zma@outlook.com>

Mon, 17 Apr 2017 20:41:44 +0000 (13:41 -0700)
author cmzmasek <chris.zma@outlook.com>
Mon, 17 Apr 2017 20:41:44 +0000 (13:41 -0700)
committer cmzmasek <chris.zma@outlook.com>
Mon, 17 Apr 2017 20:41:44 +0000 (13:41 -0700)
diff --git a/forester/java/src/org/forester/application/rio.java b/forester/java/src/org/forester/application/rio.java

index 29dec92..fac270c 100644 (file)
--- a/forester/java/src/org/forester/application/rio.java
+++ b/forester/java/src/org/forester/application/rio.java
@@ -26,39 +26,23 @@ package org.forester.application;
  import java.io.File;
  import java.io.FilenameFilter;
  import java.io.IOException;
-import java.math.RoundingMode;
  import java.util.ArrayList;
  import java.util.Arrays;
-import java.util.Iterator;
  import java.util.List;
-import java.util.SortedSet;
-import java.util.TreeSet;
  
-import org.forester.datastructures.IntMatrix;
-import org.forester.io.parsers.IteratingPhylogenyParser;
-import org.forester.io.parsers.PhylogenyParser;
-import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
-import org.forester.io.parsers.nhx.NHXParser;
-import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
-import org.forester.io.parsers.phyloxml.PhyloXmlParser;
-import org.forester.io.parsers.util.ParserUtils;
-import org.forester.io.writers.PhylogenyWriter;
-import org.forester.phylogeny.Phylogeny;
  import org.forester.rio.RIO;
  import org.forester.rio.RIO.REROOTING;
-import org.forester.rio.RIOException;
-import org.forester.sdi.SDIException;
+import org.forester.rio.RIOUtil;
  import org.forester.sdi.SDIutil.ALGORITHM;
-import org.forester.util.BasicDescriptiveStatistics;
  import org.forester.util.CommandLineArguments;
  import org.forester.util.EasyWriter;
  import org.forester.util.ForesterUtil;
  
  public class rio {
  
-    final static private String PRG_NAME                       = "rio";
-    final static private String PRG_VERSION                    = "5.000";
-    final static private String PRG_DATE                       = "170411";
+    public final static String PRG_NAME                       = "rio";
+    public final static String PRG_VERSION                    = "5.000";
+    public final static String PRG_DATE                       = "170411";
      final static private String E_MAIL                         = "phyloxml@gmail.com";
      final static private String WWW                            = "https://sites.google.com/site/cmzmasek/home/software/forester";
      final static private String HELP_OPTION_1                  = "help";
@@ -507,7 +491,7 @@ public class rio {
                      outname = outname.substring( 0, outname.lastIndexOf( "." ) );
                  }
                  try {
-                    executeAnalysis( gf,
+                    RIOUtil.executeAnalysis( gf,
                                       species_tree_file,
                                       new File( outdir.getCanonicalFile() + "/" + outname + ORTHO_OUTTABLE_SUFFIX ),
                                       new File( outdir.getCanonicalFile() + "/" + outname + ORTHOLOG_GROUPS_SUFFIX ),
@@ -541,7 +525,7 @@ public class rio {
              if ( outname.indexOf( "." ) > 0 ) {
                  outname = outname.substring( 0, outname.lastIndexOf( "." ) );
              }
-            executeAnalysis( gene_trees_file,
+            RIOUtil.executeAnalysis( gene_trees_file,
                               species_tree_file,
                               orthology_outtable,
                               new File( outname + ORTHOLOG_GROUPS_SUFFIX ),
@@ -576,238 +560,6 @@ public class rio {
          System.exit( 0 );
      }
  
-    private static final void executeAnalysis( final File gene_trees_file,
-                                               final File species_tree_file,
-                                               final File orthology_outtable,
-                                               final File orthology_groups_outfile,
-                                               final File logfile,
-                                               final String outgroup,
-                                               final REROOTING rerooting,
-                                               final int gt_first,
-                                               final int gt_last,
-                                               final File return_species_tree,
-                                               final File return_min_dup_gene_tree,
-                                               final File return_median_dup_gene_tree,
-                                               final boolean transfer_taxonomy,
-                                               final ALGORITHM algorithm,
-                                               final boolean use_gene_trees_dir,
-                                               final EasyWriter log,
-                                               final double ortholog_group_cutoff ) {
-        try {
-            final RIO rio;
-            boolean iterating = false;
-            final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
-            if ( p instanceof PhyloXmlParser ) {
-                rio = RIO.executeAnalysis( gene_trees_file,
-                                           species_tree_file,
-                                           algorithm,
-                                           rerooting,
-                                           outgroup,
-                                           gt_first,
-                                           gt_last,
-                                           logfile != null,
-                                           true,
-                                           transfer_taxonomy );
-            }
-            else {
-                iterating = true;
-                if ( p instanceof NHXParser ) {
-                    final NHXParser nhx = ( NHXParser ) p;
-                    nhx.setReplaceUnderscores( false );
-                    nhx.setIgnoreQuotes( true );
-                    nhx.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
-                }
-                else if ( p instanceof NexusPhylogeniesParser ) {
-                    final NexusPhylogeniesParser nex = ( NexusPhylogeniesParser ) p;
-                    nex.setReplaceUnderscores( false );
-                    nex.setIgnoreQuotes( true );
-                    nex.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
-                }
-                else {
-                    throw new RuntimeException( "unknown parser type: " + p );
-                }
-                final IteratingPhylogenyParser ip = ( IteratingPhylogenyParser ) p;
-                ip.setSource( gene_trees_file );
-                rio = RIO.executeAnalysis( ip,
-                                           species_tree_file,
-                                           algorithm,
-                                           rerooting,
-                                           outgroup,
-                                           gt_first,
-                                           gt_last,
-                                           logfile != null,
-                                           !use_gene_trees_dir,
-                                           transfer_taxonomy );
-            }
-            if ( !use_gene_trees_dir ) {
-                if ( algorithm == ALGORITHM.GSDIR ) {
-                    System.out.println( "Taxonomy linking based on           :\t" + rio.getGSDIRtaxCompBase() );
-                }
-            }
-            final IntMatrix m;
-            if ( iterating ) {
-                m = rio.getOrthologTable();
-            }
-            else {
-                m = RIO.calculateOrthologTable( rio.getAnalyzedGeneTrees(), true );
-            }
-            final BasicDescriptiveStatistics stats = rio.getDuplicationsStatistics();
-            writeTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir );
-            final int ortholog_groups = writeOrtologGroups( orthology_groups_outfile,
-                                                            ortholog_group_cutoff,
-                                                            stats.getN(),
-                                                            m,
-                                                            !use_gene_trees_dir,
-                                                            false );
-            final int ortholog_groups_005 = writeOrtologGroups( null, 0.05, stats.getN(), m, false, true );
-            final int ortholog_groups_025 = writeOrtologGroups( null, 0.25, stats.getN(), m, false, true );
-            final int ortholog_groups_05 = writeOrtologGroups( null, 0.5, stats.getN(), m, false, true );
-            final int ortholog_groups_075 = writeOrtologGroups( null, 0.75, stats.getN(), m, false, true );
-            final int ortholog_groups_095 = writeOrtologGroups( null, 0.95, stats.getN(), m, false, true );
-            if ( ( algorithm != ALGORITHM.SDIR ) && ( logfile != null ) ) {
-                writeLogFile( logfile,
-                              rio,
-                              species_tree_file,
-                              gene_trees_file,
-                              orthology_outtable,
-                              PRG_NAME,
-                              PRG_VERSION,
-                              PRG_DATE,
-                              ForesterUtil.getForesterLibraryInformation(),
-                              !use_gene_trees_dir );
-            }
-            if ( return_species_tree != null ) {
-                writeTree( rio.getSpeciesTree(),
-                           return_species_tree,
-                           use_gene_trees_dir ? null : "Wrote (stripped) species tree to    :\t" );
-            }
-            if ( return_min_dup_gene_tree != null && rio.getMinDuplicationsGeneTree() != null ) {
-                final int min = ( int ) rio.getDuplicationsStatistics().getMin();
-                writeTree( rio.getMinDuplicationsGeneTree(),
-                           new File( return_min_dup_gene_tree.toString() + min + ".xml" ),
-                           use_gene_trees_dir ? null : "Wrote one min duplication gene tree :\t" );
-            }
-            if ( return_median_dup_gene_tree != null && rio.getDuplicationsToTreeMap() != null ) {
-                final int med = ( int ) rio.getDuplicationsStatistics().median();
-                writeTree( rio.getDuplicationsToTreeMap().get( med ),
-                           new File( return_median_dup_gene_tree.toString() + med + ".xml" ),
-                           use_gene_trees_dir ? null : "Wrote one med duplication gene tree :\t" );
-            }
-            final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.##" );
-            final int min = ( int ) stats.getMin();
-            final int max = ( int ) stats.getMax();
-            final int median = ( int ) stats.median();
-            int min_count = 0;
-            int max_count = 0;
-            int median_count = 0;
-            for( double d : stats.getData() ) {
-                if ( ( ( int ) d ) == min ) {
-                    ++min_count;
-                }
-                if ( ( ( int ) d ) == max ) {
-                    ++max_count;
-                }
-                if ( ( ( int ) d ) == median ) {
-                    ++median_count;
-                }
-            }
-            final double min_count_percentage = ( 100.0 * min_count ) / stats.getN();
-            final double max_count_percentage = ( 100.0 * max_count ) / stats.getN();
-            final double median_count_percentage = ( 100.0 * median_count ) / stats.getN();
-            if ( use_gene_trees_dir ) {
-                String name = gene_trees_file.getName();
-                if ( name.indexOf( "." ) > 0 ) {
-                    name = name.substring( 0, name.lastIndexOf( "." ) );
-                }
-                log.print( name );
-                log.print( "\t" );
-                log.print( Integer.toString( rio.getExtNodesOfAnalyzedGeneTrees() ) );
-                log.print( "\t" );
-                log.print( Integer.toString( ortholog_groups ) );
-                //
-                log.print( "\t" );
-                log.print( Integer.toString( ortholog_groups_005 ) );
-                log.print( "\t" );
-                log.print( Integer.toString( ortholog_groups_025 ) );
-                log.print( "\t" );
-                log.print( Integer.toString( ortholog_groups_05 ) );
-                log.print( "\t" );
-                log.print( Integer.toString( ortholog_groups_075 ) );
-                log.print( "\t" );
-                log.print( Integer.toString( ortholog_groups_095 ) );
-                //
-                log.print( "\t" );
-                if ( stats.getN() > 3 ) {
-                    log.print( df.format( median ) );
-                }
-                else {
-                    log.print( "" );
-                }
-                log.print( "\t" );
-                log.print( df.format( stats.arithmeticMean() ) );
-                log.print( "\t" );
-                if ( stats.getN() > 3 ) {
-                    log.print( df.format( stats.sampleStandardDeviation() ) );
-                }
-                else {
-                    log.print( "" );
-                }
-                log.print( "\t" );
-                log.print( Integer.toString( min ) );
-                log.print( "\t" );
-                log.print( Integer.toString( max ) );
-                log.print( "\t" );
-                log.print( Integer.toString( rio.getRemovedGeneTreeNodes().size() ) );
-                log.print( "\t" );
-                log.print( Integer.toString( stats.getN() ) );
-                log.println();
-            }
-            else {
-                System.out.println( "Gene tree internal nodes            :\t" + rio.getIntNodesOfAnalyzedGeneTrees() );
-                System.out.println( "Gene tree external nodes            :\t" + rio.getExtNodesOfAnalyzedGeneTrees() );
-                System.out.println( "Mean number of duplications         :\t" + df.format( stats.arithmeticMean() )
-                        + "\t" + df.format( ( 100.0 * stats.arithmeticMean() ) / rio.getIntNodesOfAnalyzedGeneTrees() )
-                        + "%\t(sd: " + df.format( stats.sampleStandardDeviation() ) + ")" );
-                if ( stats.getN() > 3 ) {
-                    System.out.println( "Median number of duplications       :\t" + df.format( median ) + "\t"
-                            + df.format( ( 100.0 * median ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
-                }
-                System.out.println( "Minimum duplications                :\t" + min + "\t"
-                        + df.format( ( 100.0 * min ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
-                System.out.println( "Maximum duplications                :\t" + ( int ) max + "\t"
-                        + df.format( ( 100.0 * max ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
-                System.out.println( "Gene trees with median duplications :\t" + median_count + "\t"
-                        + df.format( median_count_percentage ) + "%" );
-                System.out.println( "Gene trees with minimum duplications:\t" + min_count + "\t"
-                        + df.format( min_count_percentage ) + "%" );
-                System.out.println( "Gene trees with maximum duplications:\t" + max_count + "\t"
-                        + df.format( max_count_percentage ) + "%" );
-                if ( algorithm == ALGORITHM.GSDIR ) {
-                    System.out.println( "Removed ext gene tree nodes         :\t"
-                            + rio.getRemovedGeneTreeNodes().size() );
-                }
-            }
-        }
-        catch ( final RIOException e ) {
-            ForesterUtil.fatalError( e.getLocalizedMessage() );
-        }
-        catch ( final SDIException e ) {
-            ForesterUtil.fatalError( e.getLocalizedMessage() );
-        }
-        catch ( final IOException e ) {
-            ForesterUtil.fatalError( e.getLocalizedMessage() );
-        }
-        catch ( final OutOfMemoryError e ) {
-            ForesterUtil.outOfMemoryError( e );
-        }
-        catch ( final Exception e ) {
-            ForesterUtil.unexpectedFatalError( e );
-        }
-        catch ( final Error e ) {
-            ForesterUtil.unexpectedFatalError( e );
-        }
-    }
-
      private final static void printHelp() {
          System.out.println( "Usage" );
          System.out.println();
@@ -850,181 +602,4 @@ public class rio {
          System.out.println();
          System.exit( -1 );
      }
-
-    private static void writeLogFile( final File logfile,
-                                      final RIO rio,
-                                      final File species_tree_file,
-                                      final File gene_trees_file,
-                                      final File outtable,
-                                      final String prg_name,
-                                      final String prg_v,
-                                      final String prg_date,
-                                      final String f,
-                                      final boolean verbose )
-            throws IOException {
-        final EasyWriter out = ForesterUtil.createEasyWriter( logfile );
-        out.println( "# " + prg_name );
-        out.println( "# version : " + prg_v );
-        out.println( "# date    : " + prg_date );
-        out.println( "# based on: " + f );
-        out.println( "# ----------------------------------" );
-        out.println( "Gene trees                          :\t" + gene_trees_file.getCanonicalPath() );
-        out.println( "Species tree                        :\t" + species_tree_file.getCanonicalPath() );
-        out.println( "All vs all orthology table          :\t" + outtable.getCanonicalPath() );
-        out.flush();
-        out.println( rio.getLog().toString() );
-        out.close();
-        if ( verbose ) {
-            System.out.println( "Wrote log to                        :\t" + logfile.getCanonicalPath() );
-        }
-    }
-
-    private static final void writeTable( final File table_outfile,
-                                          final int gene_trees_analyzed,
-                                          final IntMatrix m,
-                                          final boolean verbose )
-            throws IOException {
-        final EasyWriter w = ForesterUtil.createEasyWriter( table_outfile );
-        final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" );
-        df.setDecimalSeparatorAlwaysShown( false );
-        df.setRoundingMode( RoundingMode.HALF_UP );
-        for( int i = 0; i < m.size(); ++i ) {
-            w.print( "\t" );
-            w.print( m.getLabel( i ) );
-        }
-        w.println();
-        for( int x = 0; x < m.size(); ++x ) {
-            w.print( m.getLabel( x ) );
-            for( int y = 0; y < m.size(); ++y ) {
-                w.print( "\t" );
-                if ( x == y ) {
-                    if ( m.get( x, y ) != gene_trees_analyzed ) {
-                        ForesterUtil.unexpectedFatalError( "diagonal value is off" );
-                    }
-                    w.print( "-" );
-                }
-                else {
-                    w.print( df.format( ( ( double ) m.get( x, y ) ) / gene_trees_analyzed ) );
-                }
-            }
-            w.println();
-        }
-        w.close();
-        if ( verbose ) {
-            System.out.println( "Wrote table to                      :\t" + table_outfile.getCanonicalPath() );
-        }
-    }
-
-    private static final int writeOrtologGroups( final File outfile,
-                                                 final double cutoff,
-                                                 final int gene_trees_analyzed,
-                                                 final IntMatrix m,
-                                                 final boolean verbose,
-                                                 final boolean calc_conly )
-            throws IOException {
-        List<SortedSet<String>> groups = new ArrayList<SortedSet<String>>();
-        BasicDescriptiveStatistics stats = new BasicDescriptiveStatistics();
-        int below_075 = 0;
-        int below_05 = 0;
-        int below_025 = 0;
-        for( int x = 1; x < m.size(); ++x ) {
-            final String a = m.getLabel( x );
-            for( int y = 0; y < x; ++y ) {
-                final String b = m.getLabel( y );
-                final double s = ( ( double ) m.get( x, y ) ) / gene_trees_analyzed;
-                stats.addValue( s );
-                if ( s < 0.75 ) {
-                    below_075++;
-                    if ( s < 0.5 ) {
-                        below_05++;
-                        if ( s < 0.25 ) {
-                            below_025++;
-                        }
-                    }
-                }
-                if ( s >= cutoff ) {
-                    boolean found = false;
-                    for( final SortedSet<String> group : groups ) {
-                        if ( group.contains( a ) ) {
-                            group.add( b );
-                            found = true;
-                        }
-                        if ( group.contains( b ) ) {
-                            group.add( a );
-                            found = true;
-                        }
-                    }
-                    if ( !found ) {
-                        final SortedSet<String> new_group = new TreeSet<String>();
-                        new_group.add( a );
-                        new_group.add( b );
-                        groups.add( new_group );
-                    }
-                }
-            }
-        }
-        //Deal with singlets:
-        for( int x = 0; x < m.size(); ++x ) {
-            final String a = m.getLabel( x );
-            boolean found = false;
-            for( final SortedSet<String> group : groups ) {
-                if ( group.contains( a ) ) {
-                    found = true;
-                    break;
-                }
-            }
-            if ( !found ) {
-                final SortedSet<String> new_group = new TreeSet<String>();
-                new_group.add( a );
-                groups.add( new_group );
-            }
-        }
-        if ( calc_conly ) {
-            return groups.size();
-        }
-        final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" );
-        df.setDecimalSeparatorAlwaysShown( false );
-        df.setRoundingMode( RoundingMode.HALF_UP );
-        final EasyWriter w = ForesterUtil.createEasyWriter( outfile );
-        int counter = 1;
-        for( final SortedSet<String> group : groups ) {
-            w.print( Integer.toString( counter++ ) );
-            for( final String s : group ) {
-                w.print( "\t" );
-                w.print( s );
-            }
-            w.println();
-        }
-        w.println();
-        w.println( "# Cutoff\t" + df.format( cutoff ) );
-        w.println();
-        w.println( "# Orthology support statistics:" );
-        if ( stats.getN() > 3 ) {
-            w.println( "# Median\t" + df.format( stats.median() ) );
-        }
-        w.println( "# Mean\t" + df.format( stats.arithmeticMean() ) );
-        if ( stats.getN() > 3 ) {
-            w.println( "# SD\t" + df.format( stats.sampleStandardDeviation() ) );
-        }
-        w.println( "# Min\t" + df.format( stats.getMin() ) );
-        w.println( "# Max\t" + df.format( stats.getMax() ) );
-        w.println( "# Total\t" + df.format( stats.getN() ) );
-        w.println( "# Below 0.75\t" + below_075 + "\t" + df.format( ( 100.0 * below_075 / stats.getN() ) ) + "%" );
-        w.println( "# Below 0.5\t" + below_05 + "\t" + df.format( ( 100.0 * below_05 / stats.getN() ) ) + "%" );
-        w.println( "# Below 0.25\t" + below_025 + "\t" + df.format( ( 100.0 * below_025 / stats.getN() ) ) + "%" );
-        w.close();
-        if ( verbose ) {
-            System.out.println( "Number of ortholog groups           :\t" + groups.size() );
-            System.out.println( "Wrote orthologs groups table to     :\t" + outfile.getCanonicalPath() );
-        }
-        return groups.size();
-    }
-
-    private static void writeTree( final Phylogeny p, final File f, final String comment ) throws IOException {
-        final PhylogenyWriter writer = new PhylogenyWriter();
-        writer.toPhyloXML( f, p, 0 );
-        if ( comment != null ) {
-            System.out.println( comment + f.getCanonicalPath() );
-        }
-    }
  }
diff --git a/forester/java/src/org/forester/rio/RIOUtil.java b/forester/java/src/org/forester/rio/RIOUtil.java

new file mode 100644 (file)

index 0000000..8d01afa
--- /dev/null
+++ b/forester/java/src/org/forester/rio/RIOUtil.java
@@ -0,0 +1,490 @@
+
+package org.forester.rio;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.math.RoundingMode;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.SortedSet;
+import java.util.TreeSet;
+
+import org.forester.datastructures.IntMatrix;
+import org.forester.io.parsers.IteratingPhylogenyParser;
+import org.forester.io.parsers.PhylogenyParser;
+import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
+import org.forester.io.parsers.nhx.NHXParser;
+import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
+import org.forester.io.parsers.phyloxml.PhyloXmlParser;
+import org.forester.io.parsers.util.ParserUtils;
+import org.forester.io.writers.PhylogenyWriter;
+import org.forester.phylogeny.Phylogeny;
+import org.forester.rio.RIO.REROOTING;
+import org.forester.sdi.SDIException;
+import org.forester.sdi.SDIutil.ALGORITHM;
+import org.forester.util.BasicDescriptiveStatistics;
+import org.forester.util.BasicTable;
+import org.forester.util.BasicTableParser;
+import org.forester.util.EasyWriter;
+import org.forester.util.ForesterUtil;
+
+public final class RIOUtil {
+
+    public static final void executeAnalysis( final File gene_trees_file,
+                                              final File species_tree_file,
+                                              final File orthology_outtable,
+                                              final File orthology_groups_outfile,
+                                              final File logfile,
+                                              final String outgroup,
+                                              final REROOTING rerooting,
+                                              final int gt_first,
+                                              final int gt_last,
+                                              final File return_species_tree,
+                                              final File return_min_dup_gene_tree,
+                                              final File return_median_dup_gene_tree,
+                                              final boolean transfer_taxonomy,
+                                              final ALGORITHM algorithm,
+                                              final boolean use_gene_trees_dir,
+                                              final EasyWriter log,
+                                              final double ortholog_group_cutoff ) {
+        try {
+            final RIO rio;
+            boolean iterating = false;
+            final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
+            if ( p instanceof PhyloXmlParser ) {
+                rio = RIO.executeAnalysis( gene_trees_file,
+                                           species_tree_file,
+                                           algorithm,
+                                           rerooting,
+                                           outgroup,
+                                           gt_first,
+                                           gt_last,
+                                           logfile != null,
+                                           true,
+                                           transfer_taxonomy );
+            }
+            else {
+                iterating = true;
+                if ( p instanceof NHXParser ) {
+                    final NHXParser nhx = ( NHXParser ) p;
+                    nhx.setReplaceUnderscores( false );
+                    nhx.setIgnoreQuotes( true );
+                    nhx.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
+                }
+                else if ( p instanceof NexusPhylogeniesParser ) {
+                    final NexusPhylogeniesParser nex = ( NexusPhylogeniesParser ) p;
+                    nex.setReplaceUnderscores( false );
+                    nex.setIgnoreQuotes( true );
+                    nex.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
+                }
+                else {
+                    throw new RuntimeException( "unknown parser type: " + p );
+                }
+                final IteratingPhylogenyParser ip = ( IteratingPhylogenyParser ) p;
+                ip.setSource( gene_trees_file );
+                rio = RIO.executeAnalysis( ip,
+                                           species_tree_file,
+                                           algorithm,
+                                           rerooting,
+                                           outgroup,
+                                           gt_first,
+                                           gt_last,
+                                           logfile != null,
+                                           !use_gene_trees_dir,
+                                           transfer_taxonomy );
+            }
+            if ( !use_gene_trees_dir ) {
+                if ( algorithm == ALGORITHM.GSDIR ) {
+                    System.out.println( "Taxonomy linking based on           :\t" + rio.getGSDIRtaxCompBase() );
+                }
+            }
+            ///
+            ////
+            final IntMatrix m;
+            if ( iterating ) {
+                m = rio.getOrthologTable();
+            }
+            else {
+                m = RIO.calculateOrthologTable( rio.getAnalyzedGeneTrees(), true );
+            }
+            final BasicDescriptiveStatistics stats = rio.getDuplicationsStatistics();
+            writeTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir );
+            final int ortholog_groups = writeOrtologGroups( orthology_groups_outfile,
+                                                            ortholog_group_cutoff,
+                                                            stats.getN(),
+                                                            m,
+                                                            !use_gene_trees_dir,
+                                                            false );
+            final int ortholog_groups_005 = writeOrtologGroups( null, 0.05, stats.getN(), m, false, true );
+            final int ortholog_groups_025 = writeOrtologGroups( null, 0.25, stats.getN(), m, false, true );
+            final int ortholog_groups_05 = writeOrtologGroups( null, 0.5, stats.getN(), m, false, true );
+            final int ortholog_groups_075 = writeOrtologGroups( null, 0.75, stats.getN(), m, false, true );
+            final int ortholog_groups_095 = writeOrtologGroups( null, 0.95, stats.getN(), m, false, true );
+            if ( ( algorithm != ALGORITHM.SDIR ) && ( logfile != null ) ) {
+                writeLogFile( logfile,
+                              rio,
+                              species_tree_file,
+                              gene_trees_file,
+                              orthology_outtable,
+                              org.forester.application.rio.PRG_NAME,
+                              org.forester.application.rio.PRG_VERSION,
+                              org.forester.application.rio.PRG_DATE,
+                              ForesterUtil.getForesterLibraryInformation(),
+                              !use_gene_trees_dir );
+            }
+            if ( return_species_tree != null ) {
+                writeTree( rio.getSpeciesTree(),
+                           return_species_tree,
+                           use_gene_trees_dir ? null : "Wrote (stripped) species tree to    :\t" );
+            }
+            if ( return_min_dup_gene_tree != null && rio.getMinDuplicationsGeneTree() != null ) {
+                final int min = ( int ) rio.getDuplicationsStatistics().getMin();
+                writeTree( rio.getMinDuplicationsGeneTree(),
+                           new File( return_min_dup_gene_tree.toString() + min + ".xml" ),
+                           use_gene_trees_dir ? null : "Wrote one min duplication gene tree :\t" );
+            }
+            if ( return_median_dup_gene_tree != null && rio.getDuplicationsToTreeMap() != null ) {
+                final int med = ( int ) rio.getDuplicationsStatistics().median();
+                writeTree( rio.getDuplicationsToTreeMap().get( med ),
+                           new File( return_median_dup_gene_tree.toString() + med + ".xml" ),
+                           use_gene_trees_dir ? null : "Wrote one med duplication gene tree :\t" );
+            }
+            final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.##" );
+            final int min = ( int ) stats.getMin();
+            final int max = ( int ) stats.getMax();
+            final int median = ( int ) stats.median();
+            int min_count = 0;
+            int max_count = 0;
+            int median_count = 0;
+            for( double d : stats.getData() ) {
+                if ( ( ( int ) d ) == min ) {
+                    ++min_count;
+                }
+                if ( ( ( int ) d ) == max ) {
+                    ++max_count;
+                }
+                if ( ( ( int ) d ) == median ) {
+                    ++median_count;
+                }
+            }
+            final double min_count_percentage = ( 100.0 * min_count ) / stats.getN();
+            final double max_count_percentage = ( 100.0 * max_count ) / stats.getN();
+            final double median_count_percentage = ( 100.0 * median_count ) / stats.getN();
+            if ( use_gene_trees_dir ) {
+                String name = gene_trees_file.getName();
+                if ( name.indexOf( "." ) > 0 ) {
+                    name = name.substring( 0, name.lastIndexOf( "." ) );
+                }
+                log.print( name );
+                log.print( "\t" );
+                log.print( Integer.toString( rio.getExtNodesOfAnalyzedGeneTrees() ) );
+                log.print( "\t" );
+                log.print( Integer.toString( ortholog_groups ) );
+                //
+                log.print( "\t" );
+                log.print( Integer.toString( ortholog_groups_005 ) );
+                log.print( "\t" );
+                log.print( Integer.toString( ortholog_groups_025 ) );
+                log.print( "\t" );
+                log.print( Integer.toString( ortholog_groups_05 ) );
+                log.print( "\t" );
+                log.print( Integer.toString( ortholog_groups_075 ) );
+                log.print( "\t" );
+                log.print( Integer.toString( ortholog_groups_095 ) );
+                //
+                log.print( "\t" );
+                if ( stats.getN() > 3 ) {
+                    log.print( df.format( median ) );
+                }
+                else {
+                    log.print( "" );
+                }
+                log.print( "\t" );
+                log.print( df.format( stats.arithmeticMean() ) );
+                log.print( "\t" );
+                if ( stats.getN() > 3 ) {
+                    log.print( df.format( stats.sampleStandardDeviation() ) );
+                }
+                else {
+                    log.print( "" );
+                }
+                log.print( "\t" );
+                log.print( Integer.toString( min ) );
+                log.print( "\t" );
+                log.print( Integer.toString( max ) );
+                log.print( "\t" );
+                log.print( Integer.toString( rio.getRemovedGeneTreeNodes().size() ) );
+                log.print( "\t" );
+                log.print( Integer.toString( stats.getN() ) );
+                log.println();
+            }
+            else {
+                System.out.println( "Gene tree internal nodes            :\t" + rio.getIntNodesOfAnalyzedGeneTrees() );
+                System.out.println( "Gene tree external nodes            :\t" + rio.getExtNodesOfAnalyzedGeneTrees() );
+                System.out.println( "Mean number of duplications         :\t" + df.format( stats.arithmeticMean() )
+                        + "\t" + df.format( ( 100.0 * stats.arithmeticMean() ) / rio.getIntNodesOfAnalyzedGeneTrees() )
+                        + "%\t(sd: " + df.format( stats.sampleStandardDeviation() ) + ")" );
+                if ( stats.getN() > 3 ) {
+                    System.out.println( "Median number of duplications       :\t" + df.format( median ) + "\t"
+                            + df.format( ( 100.0 * median ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
+                }
+                System.out.println( "Minimum duplications                :\t" + min + "\t"
+                        + df.format( ( 100.0 * min ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
+                System.out.println( "Maximum duplications                :\t" + ( int ) max + "\t"
+                        + df.format( ( 100.0 * max ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
+                System.out.println( "Gene trees with median duplications :\t" + median_count + "\t"
+                        + df.format( median_count_percentage ) + "%" );
+                System.out.println( "Gene trees with minimum duplications:\t" + min_count + "\t"
+                        + df.format( min_count_percentage ) + "%" );
+                System.out.println( "Gene trees with maximum duplications:\t" + max_count + "\t"
+                        + df.format( max_count_percentage ) + "%" );
+                if ( algorithm == ALGORITHM.GSDIR ) {
+                    System.out.println( "Removed ext gene tree nodes         :\t"
+                            + rio.getRemovedGeneTreeNodes().size() );
+                }
+            }
+        }
+        catch ( final RIOException e ) {
+            ForesterUtil.fatalError( e.getLocalizedMessage() );
+        }
+        catch ( final SDIException e ) {
+            ForesterUtil.fatalError( e.getLocalizedMessage() );
+        }
+        catch ( final IOException e ) {
+            ForesterUtil.fatalError( e.getLocalizedMessage() );
+        }
+        catch ( final OutOfMemoryError e ) {
+            ForesterUtil.outOfMemoryError( e );
+        }
+        catch ( final Exception e ) {
+            ForesterUtil.unexpectedFatalError( e );
+        }
+        catch ( final Error e ) {
+            ForesterUtil.unexpectedFatalError( e );
+        }
+    }
+
+    private static final void writeTable( final File table_outfile,
+                                          final int gene_trees_analyzed,
+                                          final IntMatrix m,
+                                          final boolean verbose )
+            throws IOException {
+        final EasyWriter w = ForesterUtil.createEasyWriter( table_outfile );
+        final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" );
+        df.setDecimalSeparatorAlwaysShown( false );
+        df.setRoundingMode( RoundingMode.HALF_UP );
+        for( int i = 0; i < m.size(); ++i ) {
+            w.print( "\t" );
+            w.print( m.getLabel( i ) );
+        }
+        w.println();
+        for( int x = 0; x < m.size(); ++x ) {
+            w.print( m.getLabel( x ) );
+            for( int y = 0; y < m.size(); ++y ) {
+                w.print( "\t" );
+                if ( x == y ) {
+                    if ( m.get( x, y ) != gene_trees_analyzed ) {
+                        ForesterUtil.unexpectedFatalError( "diagonal value is off" );
+                    }
+                    w.print( "-" );
+                }
+                else {
+                    w.print( df.format( ( ( double ) m.get( x, y ) ) / gene_trees_analyzed ) );
+                }
+            }
+            w.println();
+        }
+        w.close();
+        if ( verbose ) {
+            System.out.println( "Wrote table to                      :\t" + table_outfile.getCanonicalPath() );
+        }
+    }
+
+    private static final int writeOrtologGroups( final File outfile,
+                                                 final double cutoff,
+                                                 final int gene_trees_analyzed,
+                                                 final IntMatrix m,
+                                                 final boolean verbose,
+                                                 final boolean calc_conly )
+            throws IOException {
+        List<SortedSet<String>> groups = new ArrayList<SortedSet<String>>();
+        BasicDescriptiveStatistics stats = new BasicDescriptiveStatistics();
+        int below_075 = 0;
+        int below_05 = 0;
+        int below_025 = 0;
+        for( int x = 1; x < m.size(); ++x ) {
+            final String a = m.getLabel( x );
+            for( int y = 0; y < x; ++y ) {
+                final String b = m.getLabel( y );
+                final double s = ( ( double ) m.get( x, y ) ) / gene_trees_analyzed;
+                stats.addValue( s );
+                if ( s < 0.75 ) {
+                    below_075++;
+                    if ( s < 0.5 ) {
+                        below_05++;
+                        if ( s < 0.25 ) {
+                            below_025++;
+                        }
+                    }
+                }
+                if ( s >= cutoff ) {
+                    boolean found = false;
+                    for( final SortedSet<String> group : groups ) {
+                        if ( group.contains( a ) ) {
+                            group.add( b );
+                            found = true;
+                        }
+                        if ( group.contains( b ) ) {
+                            group.add( a );
+                            found = true;
+                        }
+                    }
+                    if ( !found ) {
+                        final SortedSet<String> new_group = new TreeSet<String>();
+                        new_group.add( a );
+                        new_group.add( b );
+                        groups.add( new_group );
+                    }
+                }
+            }
+        }
+        //Deal with singlets:
+        for( int x = 0; x < m.size(); ++x ) {
+            final String a = m.getLabel( x );
+            boolean found = false;
+            for( final SortedSet<String> group : groups ) {
+                if ( group.contains( a ) ) {
+                    found = true;
+                    break;
+                }
+            }
+            if ( !found ) {
+                final SortedSet<String> new_group = new TreeSet<String>();
+                new_group.add( a );
+                groups.add( new_group );
+            }
+        }
+        if ( calc_conly ) {
+            return groups.size();
+        }
+        final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" );
+        df.setDecimalSeparatorAlwaysShown( false );
+        df.setRoundingMode( RoundingMode.HALF_UP );
+        final EasyWriter w = ForesterUtil.createEasyWriter( outfile );
+        int counter = 1;
+        for( final SortedSet<String> group : groups ) {
+            w.print( Integer.toString( counter++ ) );
+            for( final String s : group ) {
+                w.print( "\t" );
+                w.print( s );
+            }
+            w.println();
+        }
+        w.println();
+        w.println( "# Cutoff\t" + df.format( cutoff ) );
+        w.println();
+        w.println( "# Orthology support statistics:" );
+        if ( stats.getN() > 3 ) {
+            w.println( "# Median\t" + df.format( stats.median() ) );
+        }
+        w.println( "# Mean\t" + df.format( stats.arithmeticMean() ) );
+        if ( stats.getN() > 3 ) {
+            w.println( "# SD\t" + df.format( stats.sampleStandardDeviation() ) );
+        }
+        w.println( "# Min\t" + df.format( stats.getMin() ) );
+        w.println( "# Max\t" + df.format( stats.getMax() ) );
+        w.println( "# Total\t" + df.format( stats.getN() ) );
+        w.println( "# Below 0.75\t" + below_075 + "\t" + df.format( ( 100.0 * below_075 / stats.getN() ) ) + "%" );
+        w.println( "# Below 0.5\t" + below_05 + "\t" + df.format( ( 100.0 * below_05 / stats.getN() ) ) + "%" );
+        w.println( "# Below 0.25\t" + below_025 + "\t" + df.format( ( 100.0 * below_025 / stats.getN() ) ) + "%" );
+        w.close();
+        if ( verbose ) {
+            System.out.println( "Number of ortholog groups           :\t" + groups.size() );
+            System.out.println( "Wrote orthologs groups table to     :\t" + outfile.getCanonicalPath() );
+        }
+        return groups.size();
+    }
+
+    private static void writeTree( final Phylogeny p, final File f, final String comment ) throws IOException {
+        final PhylogenyWriter writer = new PhylogenyWriter();
+        writer.toPhyloXML( f, p, 0 );
+        if ( comment != null ) {
+            System.out.println( comment + f.getCanonicalPath() );
+        }
+    }
+
+    private static void writeLogFile( final File logfile,
+                                      final RIO rio,
+                                      final File species_tree_file,
+                                      final File gene_trees_file,
+                                      final File outtable,
+                                      final String prg_name,
+                                      final String prg_v,
+                                      final String prg_date,
+                                      final String f,
+                                      final boolean verbose )
+            throws IOException {
+        final EasyWriter out = ForesterUtil.createEasyWriter( logfile );
+        out.println( "# " + prg_name );
+        out.println( "# version : " + prg_v );
+        out.println( "# date    : " + prg_date );
+        out.println( "# based on: " + f );
+        out.println( "# ----------------------------------" );
+        out.println( "Gene trees                          :\t" + gene_trees_file.getCanonicalPath() );
+        out.println( "Species tree                        :\t" + species_tree_file.getCanonicalPath() );
+        out.println( "All vs all orthology table          :\t" + outtable.getCanonicalPath() );
+        out.flush();
+        out.println( rio.getLog().toString() );
+        out.close();
+        if ( verbose ) {
+            System.out.println( "Wrote log to                        :\t" + logfile.getCanonicalPath() );
+        }
+    }
+
+    private final static Map<String, String> obtainMapping( final File dir, final String prefix, final String suffix )
+            throws IOException {
+        if ( !dir.exists() ) {
+            throw new IOException( "[" + dir + "] does not exist" );
+        }
+        if ( !dir.isDirectory() ) {
+            throw new IOException( "[" + dir + "] is not a directory" );
+        }
+        final File mapping_files[] = dir.listFiles( new FilenameFilter() {
+
+            @Override
+            public boolean accept( final File dir, final String name ) {
+                return ( name.endsWith( suffix ) );
+            }
+        } );
+        String my_suffix = suffix;
+        boolean done = false;
+        do {
+            int matches = 0;
+            for( File file : mapping_files ) {
+                if ( file.getName().equals( my_suffix ) ) {
+                    matches++;
+                }
+            }
+            if ( matches == 1) {
+                done = true;
+            }
+            else {
+                my_suffix = my_suffix.substring( 0, my_suffix.length() - 1);
+            }
+        } while (!done );
+        
+        
+        if ( mapping_files.length == 0 ) {
+            throw new IOException( "file with prefix \"" + prefix + "\" and suffix \"" + suffix + "\" not found in ["
+                    + dir + "] " );
+        }
+        if ( mapping_files.length > 1 ) {
+            throw new IOException( "file with prefix \"" + prefix + "\" and suffix \"" + suffix + "\" not unique in ["
+                    + dir + "] " );
+        }
+        final BasicTable<String> t = BasicTableParser.parse( mapping_files[ 0 ], '\t' );
+        return t.getColumnsAsMap( 0, 1 );
+    }
+}
author	cmzmasek <chris.zma@outlook.com>
	Mon, 17 Apr 2017 20:41:44 +0000 (13:41 -0700)
committer	cmzmasek <chris.zma@outlook.com>
	Mon, 17 Apr 2017 20:41:44 +0000 (13:41 -0700)
forester/java/src/org/forester/application/rio.java		patch \| blob \| history
forester/java/src/org/forester/rio/RIOUtil.java	[new file with mode: 0644]	patch \| blob