JAL-2844 work started on moving the graphical partitioning code to Aptx
[jalview.git] / forester / java / src / org / forester / rio / RIOUtil.java
index 8d01afa..638a757 100644 (file)
@@ -2,12 +2,15 @@
 package org.forester.rio;
 
 import java.io.File;
-import java.io.FilenameFilter;
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.math.RoundingMode;
 import java.util.ArrayList;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Map.Entry;
+import java.util.SortedMap;
 import java.util.SortedSet;
 import java.util.TreeSet;
 
@@ -17,12 +20,22 @@ import org.forester.io.parsers.PhylogenyParser;
 import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
 import org.forester.io.parsers.nhx.NHXParser;
 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
+import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
 import org.forester.io.parsers.phyloxml.PhyloXmlParser;
 import org.forester.io.parsers.util.ParserUtils;
 import org.forester.io.writers.PhylogenyWriter;
 import org.forester.phylogeny.Phylogeny;
+import org.forester.phylogeny.PhylogenyMethods;
+import org.forester.phylogeny.PhylogenyNode;
+import org.forester.phylogeny.PhylogenyMethods.DESCENDANT_SORT_PRIORITY;
+import org.forester.phylogeny.data.Sequence;
+import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
+import org.forester.phylogeny.factories.PhylogenyFactory;
+import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
 import org.forester.rio.RIO.REROOTING;
+import org.forester.sdi.GSDIR;
 import org.forester.sdi.SDIException;
+import org.forester.sdi.SDIutil;
 import org.forester.sdi.SDIutil.ALGORITHM;
 import org.forester.util.BasicDescriptiveStatistics;
 import org.forester.util.BasicTable;
@@ -32,9 +45,19 @@ import org.forester.util.ForesterUtil;
 
 public final class RIOUtil {
 
+    public final static String STRIPPED_SPECIES_TREE_SUFFIX   = "_RIO_stripped_species_tree.xml";
+    public final static String ORTHO_OUTTABLE_SUFFIX          = "_RIO_orthologies.tsv";
+    public final static String ORTHO_OUTTABLE_WITH_MAP_SUFFIX = "_RIO_orthologies_ext_map.tsv";
+    public final static String OUT_MIN_DUP_GENE_TREE_SUFFIX   = "_RIO_gene_tree_min_dup_";
+    public final static String OUT_MED_DUP_GENE_TREE_SUFFIX   = "_RIO_gene_tree_med_dup_";
+    public final static String BEST_TREE_SUFFIX               = "_RIO_consensus_gene_tree_dup_";
+    public final static String ORTHOLOG_GROUPS_SUFFIX         = "_RIO_ortholog_groups.tsv";
+    public final static String LOGFILE_SUFFIX                 = "_RIO_log.tsv";
+
     public static final void executeAnalysis( final File gene_trees_file,
                                               final File species_tree_file,
                                               final File orthology_outtable,
+                                              final File orthology_outtable_with_mappings,
                                               final File orthology_groups_outfile,
                                               final File logfile,
                                               final String outgroup,
@@ -48,8 +71,22 @@ public final class RIOUtil {
                                               final ALGORITHM algorithm,
                                               final boolean use_gene_trees_dir,
                                               final EasyWriter log,
-                                              final double ortholog_group_cutoff ) {
+                                              final double ortholog_group_cutoff,
+                                              final boolean perform_id_mapping,
+                                              final File id_mapping_dir,
+                                              final String id_mapping_suffix,
+                                              final boolean perform_gsdir_on_best_tree,
+                                              final File outdir,
+                                              final File best_trees_indir,
+                                              final String best_trees_suffix ) {
         try {
+            final SortedMap<String, String> id_map;
+            if ( perform_id_mapping ) {
+                id_map = obtainMapping( id_mapping_dir, gene_trees_file.getName(), id_mapping_suffix );
+            }
+            else {
+                id_map = null;
+            }
             final RIO rio;
             boolean iterating = false;
             final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
@@ -100,8 +137,6 @@ public final class RIOUtil {
                     System.out.println( "Taxonomy linking based on           :\t" + rio.getGSDIRtaxCompBase() );
                 }
             }
-            ///
-            ////
             final IntMatrix m;
             if ( iterating ) {
                 m = rio.getOrthologTable();
@@ -109,19 +144,43 @@ public final class RIOUtil {
             else {
                 m = RIO.calculateOrthologTable( rio.getAnalyzedGeneTrees(), true );
             }
+            final GSDIR gsdir_for_best_tree;
+            if ( perform_gsdir_on_best_tree ) {
+                gsdir_for_best_tree = analyzeConsensusTree( gene_trees_file,
+                                                            species_tree_file,
+                                                            outdir,
+                                                            best_trees_indir,
+                                                            id_map,
+                                                            best_trees_suffix );
+            }
+            else {
+                gsdir_for_best_tree = null;
+            }
             final BasicDescriptiveStatistics stats = rio.getDuplicationsStatistics();
-            writeTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir );
+            if ( perform_id_mapping ) {
+                writeOrthologyTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir, id_map, true );
+                writeOrthologyTable( orthology_outtable_with_mappings,
+                                     stats.getN(),
+                                     m,
+                                     !use_gene_trees_dir,
+                                     id_map,
+                                     false );
+            }
+            else {
+                writeOrthologyTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir, null, false );
+            }
             final int ortholog_groups = writeOrtologGroups( orthology_groups_outfile,
                                                             ortholog_group_cutoff,
                                                             stats.getN(),
                                                             m,
                                                             !use_gene_trees_dir,
-                                                            false );
-            final int ortholog_groups_005 = writeOrtologGroups( null, 0.05, stats.getN(), m, false, true );
-            final int ortholog_groups_025 = writeOrtologGroups( null, 0.25, stats.getN(), m, false, true );
-            final int ortholog_groups_05 = writeOrtologGroups( null, 0.5, stats.getN(), m, false, true );
-            final int ortholog_groups_075 = writeOrtologGroups( null, 0.75, stats.getN(), m, false, true );
-            final int ortholog_groups_095 = writeOrtologGroups( null, 0.95, stats.getN(), m, false, true );
+                                                            false,
+                                                            id_map );
+            final int ortholog_groups_005 = writeOrtologGroups( null, 0.05, stats.getN(), m, false, true, null );
+            final int ortholog_groups_025 = writeOrtologGroups( null, 0.25, stats.getN(), m, false, true, null );
+            final int ortholog_groups_05 = writeOrtologGroups( null, 0.5, stats.getN(), m, false, true, null );
+            final int ortholog_groups_075 = writeOrtologGroups( null, 0.75, stats.getN(), m, false, true, null );
+            final int ortholog_groups_095 = writeOrtologGroups( null, 0.95, stats.getN(), m, false, true, null );
             if ( ( algorithm != ALGORITHM.SDIR ) && ( logfile != null ) ) {
                 writeLogFile( logfile,
                               rio,
@@ -137,19 +196,22 @@ public final class RIOUtil {
             if ( return_species_tree != null ) {
                 writeTree( rio.getSpeciesTree(),
                            return_species_tree,
-                           use_gene_trees_dir ? null : "Wrote (stripped) species tree to    :\t" );
+                           use_gene_trees_dir ? null : "Wrote (stripped) species tree to    :\t",
+                           null );
             }
             if ( return_min_dup_gene_tree != null && rio.getMinDuplicationsGeneTree() != null ) {
                 final int min = ( int ) rio.getDuplicationsStatistics().getMin();
                 writeTree( rio.getMinDuplicationsGeneTree(),
                            new File( return_min_dup_gene_tree.toString() + min + ".xml" ),
-                           use_gene_trees_dir ? null : "Wrote one min duplication gene tree :\t" );
+                           use_gene_trees_dir ? null : "Wrote one min duplication gene tree :\t",
+                           id_map );
             }
             if ( return_median_dup_gene_tree != null && rio.getDuplicationsToTreeMap() != null ) {
                 final int med = ( int ) rio.getDuplicationsStatistics().median();
                 writeTree( rio.getDuplicationsToTreeMap().get( med ),
                            new File( return_median_dup_gene_tree.toString() + med + ".xml" ),
-                           use_gene_trees_dir ? null : "Wrote one med duplication gene tree :\t" );
+                           use_gene_trees_dir ? null : "Wrote one med duplication gene tree :\t",
+                           id_map );
             }
             final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.##" );
             final int min = ( int ) stats.getMin();
@@ -194,6 +256,13 @@ public final class RIOUtil {
                 log.print( "\t" );
                 log.print( Integer.toString( ortholog_groups_095 ) );
                 //
+                // gsdir_for_best_tree is null unless perform_gsdir_on_best_tree was requested.
+                // Both tab separators are always written so that the log row keeps the same
+                // number of columns; the two cells are simply left empty when no GSDIR
+                // result for the best tree is available.
+                log.print( "\t" );
+                if ( gsdir_for_best_tree != null ) {
+                    log.print( Integer.toString( gsdir_for_best_tree.getMinDuplicationsSum() ) );
+                }
+                log.print( "\t" );
+                if ( gsdir_for_best_tree != null ) {
+                    log.print( df.format( median - gsdir_for_best_tree.getMinDuplicationsSum() ) );
+                }
+                //
                 log.print( "\t" );
                 if ( stats.getN() > 3 ) {
                     log.print( df.format( median ) );
@@ -266,10 +335,57 @@ public final class RIOUtil {
         }
     }
 
-    private static final void writeTable( final File table_outfile,
-                                          final int gene_trees_analyzed,
-                                          final IntMatrix m,
-                                          final boolean verbose )
+    /**
+     * Runs a GSDIR analysis on the "best" (consensus) gene tree that belongs to
+     * the given gene trees file and writes the resulting gene tree to outdir.
+     * <p>
+     * The best tree file is selected by ForesterUtil.getMatchingFile() from
+     * best_trees_indir, using the name of gene_trees_file and best_trees_suffix.
+     * Its first phylogeny is read with an XSD-validating phyloXML parser. The
+     * species tree is parsed against that gene tree, both trees are flagged as
+     * rooted, and taxonomy data is extracted from the names of the external
+     * nodes before GSDIR is run. The output file is named
+     * {@code <best tree file name without extension>} + BEST_TREE_SUFFIX +
+     * {@code <min duplications sum>} + ".xml".
+     *
+     * @param gene_trees_file   gene trees file; only its name is used here
+     * @param species_tree_file species tree file handed to SDIutil.parseSpeciesTree()
+     * @param outdir            directory the resulting gene tree is written to
+     * @param best_trees_indir  directory searched for the best tree file
+     * @param id_map            id mapping passed on to writeTree(); may be null
+     * @param best_trees_suffix suffix used to select the best tree file
+     * @return the GSDIR instance created for the best tree
+     * @throws IOException if the best tree file contains no tree, if the tree is
+     *                     not completely binary (three children allowed at the
+     *                     root), or on read/write failure
+     */
+    private final static GSDIR analyzeConsensusTree( final File gene_trees_file,
+                                                     final File species_tree_file,
+                                                     final File outdir,
+                                                     final File best_trees_indir,
+                                                     final SortedMap<String, String> id_map,
+                                                     final String best_trees_suffix )
+            throws IOException, FileNotFoundException, PhyloXmlDataFormatException, SDIException {
+        final File the_one = ForesterUtil.getMatchingFile( best_trees_indir,
+                                                           gene_trees_file.getName(),
+                                                           best_trees_suffix );
+        final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
+        final Phylogeny[] best_trees = factory.create( the_one,
+                                                       PhyloXmlParser.createPhyloXmlParserXsdValidating() );
+        // Report an empty tree file as an IOException instead of letting the
+        // [ 0 ] access below fail with an unchecked ArrayIndexOutOfBoundsException.
+        if ( ( best_trees == null ) || ( best_trees.length < 1 ) ) {
+            throw new IOException( "no tree found in [" + the_one + "]" );
+        }
+        final Phylogeny best_tree = best_trees[ 0 ];
+        final Phylogeny species_tree = SDIutil
+                .parseSpeciesTree( best_tree, species_tree_file, false, true, TAXONOMY_EXTRACTION.NO );
+        PhylogenyMethods.deleteInternalNodesWithOnlyOneDescendent( species_tree );
+        best_tree.setRooted( true );
+        species_tree.setRooted( true );
+        if ( !best_tree.isCompletelyBinaryAllow3ChildrenAtRoot() ) {
+            throw new IOException( "gene tree matching to ["
+                    + ForesterUtil.removeFileExtension( gene_trees_file.getName() ) + "] is not completely binary" );
+        }
+        final PhylogenyNodeIterator it = best_tree.iteratorExternalForward();
+        while ( it.hasNext() ) {
+            final PhylogenyNode n = it.next();
+            final String name = n.getName().trim();
+            if ( !ForesterUtil.isEmpty( name ) ) {
+                try {
+                    ParserUtils.extractTaxonomyDataFromNodeName( n, TAXONOMY_EXTRACTION.AGGRESSIVE );
+                }
+                catch ( final PhyloXmlDataFormatException ignored ) {
+                    // Deliberately best-effort: a node whose name yields no usable
+                    // taxonomy data is left as it is.
+                }
+            }
+        }
+        final GSDIR gsdir_for_best_tree = new GSDIR( best_tree, species_tree, true, true, true );
+        final Phylogeny result_gene_tree = gsdir_for_best_tree.getMinDuplicationsSumGeneTree();
+        result_gene_tree.setRerootable( false );
+        PhylogenyMethods.orderAppearance( result_gene_tree.getRoot(), true, true, DESCENDANT_SORT_PRIORITY.NODE_NAME );
+        final String outname = ForesterUtil.removeFileExtension( the_one.getName() );
+        final File outfile = new File( outdir.getCanonicalFile() + "/" + outname + RIOUtil.BEST_TREE_SUFFIX
+                + gsdir_for_best_tree.getMinDuplicationsSum() + ".xml" );
+        // No console comment is requested for this tree (third argument is null).
+        writeTree( result_gene_tree, outfile, null, id_map );
+        return gsdir_for_best_tree;
+    }
+
+    private static final void writeOrthologyTable( final File table_outfile,
+                                                   final int gene_trees_analyzed,
+                                                   final IntMatrix m,
+                                                   final boolean verbose,
+                                                   final SortedMap<String, String> id_map,
+                                                   final boolean replace_ids )
             throws IOException {
         final EasyWriter w = ForesterUtil.createEasyWriter( table_outfile );
         final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" );
@@ -277,11 +393,25 @@ public final class RIOUtil {
         df.setRoundingMode( RoundingMode.HALF_UP );
         for( int i = 0; i < m.size(); ++i ) {
             w.print( "\t" );
-            w.print( m.getLabel( i ) );
+            if ( replace_ids ) {
+                if ( !id_map.containsKey( m.getLabel( i ) ) ) {
+                    throw new IOException( "no id mapping for \"" + m.getLabel( i ) + "\" (attempting to write ["
+                            + table_outfile + "])" );
+                }
+                w.print( id_map.get( m.getLabel( i ) ) );
+            }
+            else {
+                w.print( m.getLabel( i ) );
+            }
         }
         w.println();
         for( int x = 0; x < m.size(); ++x ) {
-            w.print( m.getLabel( x ) );
+            if ( replace_ids ) {
+                w.print( id_map.get( m.getLabel( x ) ) );
+            }
+            else {
+                w.print( m.getLabel( x ) );
+            }
             for( int y = 0; y < m.size(); ++y ) {
                 w.print( "\t" );
                 if ( x == y ) {
@@ -296,6 +426,25 @@ public final class RIOUtil {
             }
             w.println();
         }
+        if ( !replace_ids && id_map != null && id_map.size() > 0 ) {
+            // Append the id mapping below the table, separated by one empty line:
+            // one "key<TAB>value" line per entry, in the map's iteration order.
+            w.println();
+            for( final Entry<String, String> pair : id_map.entrySet() ) {
+                w.println( pair.getKey() + "\t" + pair.getValue() );
+            }
+        }
         w.close();
         if ( verbose ) {
             System.out.println( "Wrote table to                      :\t" + table_outfile.getCanonicalPath() );
@@ -307,7 +456,8 @@ public final class RIOUtil {
                                                  final int gene_trees_analyzed,
                                                  final IntMatrix m,
                                                  final boolean verbose,
-                                                 final boolean calc_conly )
+                                                 final boolean calc_conly,
+                                                 final SortedMap<String, String> id_map )
             throws IOException {
         List<SortedSet<String>> groups = new ArrayList<SortedSet<String>>();
         BasicDescriptiveStatistics stats = new BasicDescriptiveStatistics();
@@ -378,7 +528,16 @@ public final class RIOUtil {
             w.print( Integer.toString( counter++ ) );
             for( final String s : group ) {
                 w.print( "\t" );
-                w.print( s );
+                if ( id_map != null && id_map.size() > 0 ) {
+                    if ( !id_map.containsKey( s ) ) {
+                        throw new IOException( "no id mapping for \"" + s + "\" (attempting to write [" + outfile
+                                + "])" );
+                    }
+                    w.print( id_map.get( s ) );
+                }
+                else {
+                    w.print( s );
+                }
             }
             w.println();
         }
@@ -407,7 +566,24 @@ public final class RIOUtil {
         return groups.size();
     }
 
-    private static void writeTree( final Phylogeny p, final File f, final String comment ) throws IOException {
+    private static void writeTree( final Phylogeny p,
+                                   final File f,
+                                   final String comment,
+                                   final SortedMap<String, String> id_map )
+            throws IOException {
+        if ( id_map != null && id_map.size() > 0 ) {
+            final PhylogenyNodeIterator it = p.iteratorExternalForward();
+            while ( it.hasNext() ) {
+                final PhylogenyNode n = it.next();
+                if ( !id_map.containsKey( n.getName() ) ) {
+                    throw new IOException( "no id mapping for \"" + n.getName() + "\" (attempting to write [" + f
+                            + "])" );
+                }
+                final Sequence seq = new Sequence();
+                seq.setName( id_map.get( n.getName() ) );
+                n.getNodeData().addSequence( seq );
+            }
+        }
         final PhylogenyWriter writer = new PhylogenyWriter();
         writer.toPhyloXML( f, p, 0 );
         if ( comment != null ) {
@@ -443,48 +619,12 @@ public final class RIOUtil {
         }
     }
 
-    private final static Map<String, String> obtainMapping( final File dir, final String prefix, final String suffix )
+    /**
+     * Reads an id mapping from a tab-separated file.
+     * <p>
+     * The file is selected by ForesterUtil.getMatchingFile( dir, prefix, suffix );
+     * it is then parsed as a tab-delimited table, and the result maps the values
+     * of column 0 to the values of column 1.
+     * NOTE(review): how a missing or ambiguous match is reported is decided inside
+     * ForesterUtil.getMatchingFile(), which is not visible here -- presumably an
+     * IOException, as in the removed inline code; confirm.
+     *
+     * @param dir    directory passed to ForesterUtil.getMatchingFile()
+     * @param prefix file name prefix passed to ForesterUtil.getMatchingFile()
+     * @param suffix file name suffix passed to ForesterUtil.getMatchingFile()
+     * @return map built from column 0 (keys) and column 1 (values) of the table
+     * @throws IOException declared by this method; raised by the file lookup or the table parser
+     */
+    private final static SortedMap<String, String> obtainMapping( final File dir,
+                                                                  final String prefix,
+                                                                  final String suffix )
             throws IOException {
-        if ( !dir.exists() ) {
-            throw new IOException( "[" + dir + "] does not exist" );
-        }
-        if ( !dir.isDirectory() ) {
-            throw new IOException( "[" + dir + "] is not a directory" );
-        }
-        final File mapping_files[] = dir.listFiles( new FilenameFilter() {
-
-            @Override
-            public boolean accept( final File dir, final String name ) {
-                return ( name.endsWith( suffix ) );
-            }
-        } );
-        String my_suffix = suffix;
-        boolean done = false;
-        do {
-            int matches = 0;
-            for( File file : mapping_files ) {
-                if ( file.getName().equals( my_suffix ) ) {
-                    matches++;
-                }
-            }
-            if ( matches == 1) {
-                done = true;
-            }
-            else {
-                my_suffix = my_suffix.substring( 0, my_suffix.length() - 1);
-            }
-        } while (!done );
-        
-        
-        if ( mapping_files.length == 0 ) {
-            throw new IOException( "file with prefix \"" + prefix + "\" and suffix \"" + suffix + "\" not found in ["
-                    + dir + "] " );
-        }
-        if ( mapping_files.length > 1 ) {
-            throw new IOException( "file with prefix \"" + prefix + "\" and suffix \"" + suffix + "\" not unique in ["
-                    + dir + "] " );
-        }
-        final BasicTable<String> t = BasicTableParser.parse( mapping_files[ 0 ], '\t' );
+        final File the_one = ForesterUtil.getMatchingFile( dir, prefix, suffix );
+        final BasicTable<String> t = BasicTableParser.parse( the_one, '\t' );
         return t.getColumnsAsMap( 0, 1 );
     }
 }