JAL-2844 work started on moving the graphical partitioning code to Aptx

[jalview.git] / forester / java / src / org / forester / rio / RIOUtil.java
diff --git a/forester/java/src/org/forester/rio/RIOUtil.java b/forester/java/src/org/forester/rio/RIOUtil.java

index 8266047..638a757 100644 (file)
--- a/forester/java/src/org/forester/rio/RIOUtil.java
+++ b/forester/java/src/org/forester/rio/RIOUtil.java
@@ -2,12 +2,14 @@
  package org.forester.rio;
  
  import java.io.File;
-import java.io.FilenameFilter;
+import java.io.FileNotFoundException;
  import java.io.IOException;
  import java.math.RoundingMode;
  import java.util.ArrayList;
+import java.util.Iterator;
  import java.util.List;
  import java.util.Map;
+import java.util.Map.Entry;
  import java.util.SortedMap;
  import java.util.SortedSet;
  import java.util.TreeSet;
@@ -18,15 +20,22 @@ import org.forester.io.parsers.PhylogenyParser;
  import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
  import org.forester.io.parsers.nhx.NHXParser;
  import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
+import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
  import org.forester.io.parsers.phyloxml.PhyloXmlParser;
  import org.forester.io.parsers.util.ParserUtils;
  import org.forester.io.writers.PhylogenyWriter;
  import org.forester.phylogeny.Phylogeny;
+import org.forester.phylogeny.PhylogenyMethods;
  import org.forester.phylogeny.PhylogenyNode;
+import org.forester.phylogeny.PhylogenyMethods.DESCENDANT_SORT_PRIORITY;
  import org.forester.phylogeny.data.Sequence;
+import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
+import org.forester.phylogeny.factories.PhylogenyFactory;
  import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
  import org.forester.rio.RIO.REROOTING;
+import org.forester.sdi.GSDIR;
  import org.forester.sdi.SDIException;
+import org.forester.sdi.SDIutil;
  import org.forester.sdi.SDIutil.ALGORITHM;
  import org.forester.util.BasicDescriptiveStatistics;
  import org.forester.util.BasicTable;
@@ -36,6 +45,15 @@ import org.forester.util.ForesterUtil;
  
  public final class RIOUtil {
  
+    public final static String STRIPPED_SPECIES_TREE_SUFFIX   = "_RIO_stripped_species_tree.xml";
+    public final static String ORTHO_OUTTABLE_SUFFIX          = "_RIO_orthologies.tsv";
+    public final static String ORTHO_OUTTABLE_WITH_MAP_SUFFIX = "_RIO_orthologies_ext_map.tsv";
+    public final static String OUT_MIN_DUP_GENE_TREE_SUFFIX   = "_RIO_gene_tree_min_dup_";
+    public final static String OUT_MED_DUP_GENE_TREE_SUFFIX   = "_RIO_gene_tree_med_dup_";
+    public final static String BEST_TREE_SUFFIX               = "_RIO_consensus_gene_tree_dup_";
+    public final static String ORTHOLOG_GROUPS_SUFFIX         = "_RIO_ortholog_groups.tsv";
+    public final static String LOGFILE_SUFFIX                 = "_RIO_log.tsv";
+
      public static final void executeAnalysis( final File gene_trees_file,
                                                final File species_tree_file,
                                                final File orthology_outtable,
@@ -56,7 +74,11 @@ public final class RIOUtil {
                                                final double ortholog_group_cutoff,
                                                final boolean perform_id_mapping,
                                                final File id_mapping_dir,
-                                              final String id_mapping_suffix ) {
+                                              final String id_mapping_suffix,
+                                              final boolean perform_gsdir_on_best_tree,
+                                              final File outdir,
+                                              final File best_trees_indir,
+                                              final String best_trees_suffix ) {
          try {
              final SortedMap<String, String> id_map;
              if ( perform_id_mapping ) {
@@ -122,6 +144,18 @@ public final class RIOUtil {
              else {
                  m = RIO.calculateOrthologTable( rio.getAnalyzedGeneTrees(), true );
              }
+            final GSDIR gsdir_for_best_tree;
+            if ( perform_gsdir_on_best_tree ) {
+                gsdir_for_best_tree = analyzeConsensusTree( gene_trees_file,
+                                                            species_tree_file,
+                                                            outdir,
+                                                            best_trees_indir,
+                                                            id_map,
+                                                            best_trees_suffix );
+            }
+            else {
+                gsdir_for_best_tree = null;
+            }
              final BasicDescriptiveStatistics stats = rio.getDuplicationsStatistics();
              if ( perform_id_mapping ) {
                  writeOrthologyTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir, id_map, true );
@@ -222,6 +256,13 @@ public final class RIOUtil {
                  log.print( "\t" );
                  log.print( Integer.toString( ortholog_groups_095 ) );
                  //
+                if ( true ) {
+                    log.print( "\t" );
+                    log.print( Integer.toString( gsdir_for_best_tree.getMinDuplicationsSum() ) );
+                    log.print( "\t" );
+                    log.print( df.format( median - gsdir_for_best_tree.getMinDuplicationsSum() ) );
+                }
+                //
                  log.print( "\t" );
                  if ( stats.getN() > 3 ) {
                      log.print( df.format( median ) );
@@ -294,6 +335,51 @@ public final class RIOUtil {
          }
      }
  
+    private final static GSDIR analyzeConsensusTree( final File gene_trees_file,
+                                                     final File species_tree_file,
+                                                     final File outdir, // directory the resulting gene tree file is created in
+                                                     final File best_trees_indir,
+                                                     final SortedMap<String, String> id_map, // only handed on to writeTree
+                                                     final String best_trees_suffix )
+            throws IOException, FileNotFoundException, PhyloXmlDataFormatException, SDIException { // Runs GSDIR on the best ("consensus") tree for gene_trees_file, writes the min-duplications gene tree into outdir, returns the GSDIR.
+        final File the_one = ForesterUtil.getMatchingFile( best_trees_indir, // NOTE(review): presumably the one file in best_trees_indir matching name and suffix -- confirm in ForesterUtil
+                                                           gene_trees_file.getName(),
+                                                           best_trees_suffix );
+        final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
+        final Phylogeny best_tree = factory.create( the_one, PhyloXmlParser.createPhyloXmlParserXsdValidating() )[ 0 ]; // only the first phylogeny in the file is used
+        final Phylogeny species_tree = SDIutil
+                .parseSpeciesTree( best_tree, species_tree_file, false, true, TAXONOMY_EXTRACTION.NO );
+        PhylogenyMethods.deleteInternalNodesWithOnlyOneDescendent( species_tree );
+        best_tree.setRooted( true );
+        species_tree.setRooted( true );
+        if ( !best_tree.isCompletelyBinaryAllow3ChildrenAtRoot() ) { // non-binary gene trees are rejected before GSDIR is run
+            throw new IOException( "gene tree matching to ["
+                    + ForesterUtil.removeFileExtension( gene_trees_file.getName() ) + "] is not completely binary" );
+        }
+        final PhylogenyNodeIterator it = best_tree.iteratorExternalForward(); // visits external nodes only
+        while ( it.hasNext() ) {
+            final PhylogenyNode n = it.next();
+            final String name = n.getName().trim();
+            if ( !ForesterUtil.isEmpty( name ) ) {
+                try {
+                    ParserUtils.extractTaxonomyDataFromNodeName( n, TAXONOMY_EXTRACTION.AGGRESSIVE );
+                }
+                catch ( final PhyloXmlDataFormatException e ) {
+                    // Ignore: extraction is best-effort here; a node whose name cannot be parsed does not abort the analysis.
+                }
+            }
+        }
+        final GSDIR gsdir_for_best_tree = new GSDIR( best_tree, species_tree, true, true, true );
+        final Phylogeny result_gene_tree = gsdir_for_best_tree.getMinDuplicationsSumGeneTree();
+        result_gene_tree.setRerootable( false );
+        PhylogenyMethods.orderAppearance( result_gene_tree.getRoot(), true, true, DESCENDANT_SORT_PRIORITY.NODE_NAME );
+        final String outname = ForesterUtil.removeFileExtension( the_one.getName() );
+        final File outfile = new File( outdir.getCanonicalFile() + "/" + outname + RIOUtil.BEST_TREE_SUFFIX
+                + gsdir_for_best_tree.getMinDuplicationsSum() + ".xml" ); // file name embeds the minimal duplications sum
+        writeTree( result_gene_tree, outfile, null, id_map );
+        return gsdir_for_best_tree;
+    }
+
      private static final void writeOrthologyTable( final File table_outfile,
                                                     final int gene_trees_analyzed,
                                                     final IntMatrix m,
@@ -342,6 +428,14 @@ public final class RIOUtil {
          }
          if ( !replace_ids && id_map != null && id_map.size() > 0 ) {
              w.println();
+            
+            final Iterator<?> it = id_map.entrySet().iterator();
+            while (it.hasNext()) {
+                Map.Entry<String, String> pair = ( Entry<String, String> ) it.next();
+                w.println( pair.getKey()  + "\t" + pair.getValue() );
+            } //TODO
+            
+            /*
              id_map.forEach( ( k, v ) -> {
                  try {
                      w.println( k + "\t" + v );
@@ -349,7 +443,7 @@ public final class RIOUtil {
                  catch ( final IOException e ) {
                      //ignore
                  }
-            } );
+            } );*/
          }
          w.close();
          if ( verbose ) {
@@ -529,63 +623,7 @@ public final class RIOUtil {
                                                                    final String prefix,
                                                                    final String suffix )
              throws IOException {
-        if ( !dir.exists() ) {
-            throw new IOException( "[" + dir + "] does not exist" );
-        }
-        if ( !dir.isDirectory() ) {
-            throw new IOException( "[" + dir + "] is not a directory" );
-        }
-        final File mapping_files[] = dir.listFiles( new FilenameFilter() {
-
-            @Override
-            public boolean accept( final File dir, final String name ) {
-                return ( name.endsWith( suffix ) );
-            }
-        } );
-        if ( mapping_files.length == 1 ) {
-            throw new IOException( "no files ending with \"" + suffix + "\" found in [" + dir + "]" );
-        }
-        String my_prefix = ForesterUtil.removeFileExtension( prefix );
-        boolean done = false;
-        boolean more_than_one = false;
-        File the_one = null;
-        do {
-            int matches = 0;
-            for( File file : mapping_files ) {
-                if ( file.getName().startsWith( my_prefix ) ) {
-                    matches++;
-                    if ( matches > 1 ) {
-                        the_one = null;
-                        break;
-                    }
-                    the_one = file;
-                }
-            }
-            if ( matches > 1 ) {
-                more_than_one = true;
-                done = true;
-            }
-            if ( matches == 1 ) {
-                done = true;
-            }
-            else {
-                if ( my_prefix.length() <= 1 ) {
-                    throw new IOException( "no file matching \"" + ForesterUtil.removeFileExtension( prefix )
-                            + "\" and ending with \"" + suffix + "\" found in [" + dir + "]" );
-                }
-                my_prefix = my_prefix.substring( 0, my_prefix.length() - 1 );
-            }
-        } while ( !done );
-        if ( more_than_one ) {
-            throw new IOException( "multiple files matching \"" + ForesterUtil.removeFileExtension( prefix )
-                    + "\" and ending with \"" + suffix + "\" found in [" + dir + "]" );
-        }
-        else if ( the_one != null ) {
-        }
-        else {
-            throw new IOException( "no file matching \"" + ForesterUtil.removeFileExtension( prefix )
-                    + "\" and ending with \"" + suffix + "\" found in [" + dir + "]" );
-        }
+        final File the_one = ForesterUtil.getMatchingFile( dir, prefix, suffix );
          final BasicTable<String> t = BasicTableParser.parse( the_one, '\t' );
          return t.getColumnsAsMap( 0, 1 );
      }