package org.forester.rio;
import java.io.File;
-import java.io.FilenameFilter;
+import java.io.FileNotFoundException;
import java.io.IOException;
import java.math.RoundingMode;
import java.util.ArrayList;
+import java.util.Iterator;
import java.util.List;
import java.util.Map;
+import java.util.Map.Entry;
+import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeSet;
import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
import org.forester.io.parsers.nhx.NHXParser;
import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
+import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
import org.forester.io.parsers.phyloxml.PhyloXmlParser;
import org.forester.io.parsers.util.ParserUtils;
import org.forester.io.writers.PhylogenyWriter;
import org.forester.phylogeny.Phylogeny;
+import org.forester.phylogeny.PhylogenyMethods;
+import org.forester.phylogeny.PhylogenyNode;
+import org.forester.phylogeny.PhylogenyMethods.DESCENDANT_SORT_PRIORITY;
+import org.forester.phylogeny.data.Sequence;
+import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
+import org.forester.phylogeny.factories.PhylogenyFactory;
+import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
import org.forester.rio.RIO.REROOTING;
+import org.forester.sdi.GSDIR;
import org.forester.sdi.SDIException;
+import org.forester.sdi.SDIutil;
import org.forester.sdi.SDIutil.ALGORITHM;
import org.forester.util.BasicDescriptiveStatistics;
import org.forester.util.BasicTable;
// NOTE(review): this file contains unresolved unified-diff markers ("+"/"-"
// line prefixes) and elided hunks from a corrupted merge; the patch must be
// applied cleanly before this file can compile.
public final class RIOUtil {
    // File-name suffixes appended to the input name for each RIO output artifact:
+   public final static String STRIPPED_SPECIES_TREE_SUFFIX = "_RIO_stripped_species_tree.xml";
+   public final static String ORTHO_OUTTABLE_SUFFIX = "_RIO_orthologies.tsv";
+   public final static String ORTHO_OUTTABLE_WITH_MAP_SUFFIX = "_RIO_orthologies_ext_map.tsv";
    // The three tree suffixes below are completed with a duplication count + ".xml" at write time.
+   public final static String OUT_MIN_DUP_GENE_TREE_SUFFIX = "_RIO_gene_tree_min_dup_";
+   public final static String OUT_MED_DUP_GENE_TREE_SUFFIX = "_RIO_gene_tree_med_dup_";
+   public final static String BEST_TREE_SUFFIX = "_RIO_consensus_gene_tree_dup_";
+   public final static String ORTHOLOG_GROUPS_SUFFIX = "_RIO_ortholog_groups.tsv";
+   public final static String LOGFILE_SUFFIX = "_RIO_log.tsv";
+
public static final void executeAnalysis( final File gene_trees_file,
final File species_tree_file,
final File orthology_outtable,
+ final File orthology_outtable_with_mappings,
final File orthology_groups_outfile,
final File logfile,
final String outgroup,
final ALGORITHM algorithm,
final boolean use_gene_trees_dir,
final EasyWriter log,
- final double ortholog_group_cutoff ) {
+ final double ortholog_group_cutoff,
+ final boolean perform_id_mapping,
+ final File id_mapping_dir,
+ final String id_mapping_suffix,
+ final boolean perform_gsdir_on_best_tree,
+ final File outdir,
+ final File best_trees_indir,
+ final String best_trees_suffix ) {
try {
+ final SortedMap<String, String> id_map;
+ if ( perform_id_mapping ) {
+ id_map = obtainMapping( id_mapping_dir, gene_trees_file.getName(), id_mapping_suffix );
+ }
+ else {
+ id_map = null;
+ }
final RIO rio;
boolean iterating = false;
final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
System.out.println( "Taxonomy linking based on :\t" + rio.getGSDIRtaxCompBase() );
}
}
- ///
- ////
final IntMatrix m;
if ( iterating ) {
m = rio.getOrthologTable();
else {
m = RIO.calculateOrthologTable( rio.getAnalyzedGeneTrees(), true );
}
+ final GSDIR gsdir_for_best_tree;
+ if ( perform_gsdir_on_best_tree ) {
+ gsdir_for_best_tree = analyzeConsensusTree( gene_trees_file,
+ species_tree_file,
+ outdir,
+ best_trees_indir,
+ id_map,
+ best_trees_suffix );
+ }
+ else {
+ gsdir_for_best_tree = null;
+ }
final BasicDescriptiveStatistics stats = rio.getDuplicationsStatistics();
- writeTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir );
+ if ( perform_id_mapping ) {
+ writeOrthologyTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir, id_map, true );
+ writeOrthologyTable( orthology_outtable_with_mappings,
+ stats.getN(),
+ m,
+ !use_gene_trees_dir,
+ id_map,
+ false );
+ }
+ else {
+ writeOrthologyTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir, null, false );
+ }
final int ortholog_groups = writeOrtologGroups( orthology_groups_outfile,
ortholog_group_cutoff,
stats.getN(),
m,
!use_gene_trees_dir,
- false );
- final int ortholog_groups_005 = writeOrtologGroups( null, 0.05, stats.getN(), m, false, true );
- final int ortholog_groups_025 = writeOrtologGroups( null, 0.25, stats.getN(), m, false, true );
- final int ortholog_groups_05 = writeOrtologGroups( null, 0.5, stats.getN(), m, false, true );
- final int ortholog_groups_075 = writeOrtologGroups( null, 0.75, stats.getN(), m, false, true );
- final int ortholog_groups_095 = writeOrtologGroups( null, 0.95, stats.getN(), m, false, true );
+ false,
+ id_map );
+ final int ortholog_groups_005 = writeOrtologGroups( null, 0.05, stats.getN(), m, false, true, null );
+ final int ortholog_groups_025 = writeOrtologGroups( null, 0.25, stats.getN(), m, false, true, null );
+ final int ortholog_groups_05 = writeOrtologGroups( null, 0.5, stats.getN(), m, false, true, null );
+ final int ortholog_groups_075 = writeOrtologGroups( null, 0.75, stats.getN(), m, false, true, null );
+ final int ortholog_groups_095 = writeOrtologGroups( null, 0.95, stats.getN(), m, false, true, null );
if ( ( algorithm != ALGORITHM.SDIR ) && ( logfile != null ) ) {
writeLogFile( logfile,
rio,
if ( return_species_tree != null ) {
writeTree( rio.getSpeciesTree(),
return_species_tree,
- use_gene_trees_dir ? null : "Wrote (stripped) species tree to :\t" );
+ use_gene_trees_dir ? null : "Wrote (stripped) species tree to :\t",
+ null );
}
if ( return_min_dup_gene_tree != null && rio.getMinDuplicationsGeneTree() != null ) {
final int min = ( int ) rio.getDuplicationsStatistics().getMin();
writeTree( rio.getMinDuplicationsGeneTree(),
new File( return_min_dup_gene_tree.toString() + min + ".xml" ),
- use_gene_trees_dir ? null : "Wrote one min duplication gene tree :\t" );
+ use_gene_trees_dir ? null : "Wrote one min duplication gene tree :\t",
+ id_map );
}
if ( return_median_dup_gene_tree != null && rio.getDuplicationsToTreeMap() != null ) {
final int med = ( int ) rio.getDuplicationsStatistics().median();
writeTree( rio.getDuplicationsToTreeMap().get( med ),
new File( return_median_dup_gene_tree.toString() + med + ".xml" ),
- use_gene_trees_dir ? null : "Wrote one med duplication gene tree :\t" );
+ use_gene_trees_dir ? null : "Wrote one med duplication gene tree :\t",
+ id_map );
}
final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.##" );
final int min = ( int ) stats.getMin();
log.print( "\t" );
log.print( Integer.toString( ortholog_groups_095 ) );
//
+ if ( true ) {
+ log.print( "\t" );
+ log.print( Integer.toString( gsdir_for_best_tree.getMinDuplicationsSum() ) );
+ log.print( "\t" );
+ log.print( df.format( median - gsdir_for_best_tree.getMinDuplicationsSum() ) );
+ }
+ //
log.print( "\t" );
if ( stats.getN() > 3 ) {
log.print( df.format( median ) );
}
}
- private static final void writeTable( final File table_outfile,
- final int gene_trees_analyzed,
- final IntMatrix m,
- final boolean verbose )
+ private final static GSDIR analyzeConsensusTree( final File gene_trees_file,
+ final File species_tree_file,
+ final File outdir,
+ final File best_trees_indir,
+ final SortedMap<String, String> id_map,
+ final String best_trees_suffix )
+ throws IOException, FileNotFoundException, PhyloXmlDataFormatException, SDIException {
+ final File the_one = ForesterUtil.getMatchingFile( best_trees_indir,
+ gene_trees_file.getName(),
+ best_trees_suffix );
+ final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
+ final Phylogeny best_tree = factory.create( the_one, PhyloXmlParser.createPhyloXmlParserXsdValidating() )[ 0 ];
+ final Phylogeny species_tree = SDIutil
+ .parseSpeciesTree( best_tree, species_tree_file, false, true, TAXONOMY_EXTRACTION.NO );
+ PhylogenyMethods.deleteInternalNodesWithOnlyOneDescendent( species_tree );
+ best_tree.setRooted( true );
+ species_tree.setRooted( true );
+ if ( !best_tree.isCompletelyBinaryAllow3ChildrenAtRoot() ) {
+ throw new IOException( "gene tree matching to ["
+ + ForesterUtil.removeFileExtension( gene_trees_file.getName() ) + "] is not completely binary" );
+ }
+ final PhylogenyNodeIterator it = best_tree.iteratorExternalForward();
+ while ( it.hasNext() ) {
+ final PhylogenyNode n = it.next();
+ final String name = n.getName().trim();
+ if ( !ForesterUtil.isEmpty( name ) ) {
+ try {
+ ParserUtils.extractTaxonomyDataFromNodeName( n, TAXONOMY_EXTRACTION.AGGRESSIVE );
+ }
+ catch ( final PhyloXmlDataFormatException e ) {
+ // Ignore.
+ }
+ }
+ }
+ final GSDIR gsdir_for_best_tree = new GSDIR( best_tree, species_tree, true, true, true );
+ final Phylogeny result_gene_tree = gsdir_for_best_tree.getMinDuplicationsSumGeneTree();
+ result_gene_tree.setRerootable( false );
+ PhylogenyMethods.orderAppearance( result_gene_tree.getRoot(), true, true, DESCENDANT_SORT_PRIORITY.NODE_NAME );
+ final String outname = ForesterUtil.removeFileExtension( the_one.getName() );
+ final File outfile = new File( outdir.getCanonicalFile() + "/" + outname + RIOUtil.BEST_TREE_SUFFIX
+ + gsdir_for_best_tree.getMinDuplicationsSum() + ".xml" );
+ writeTree( result_gene_tree, outfile, null, id_map );
+ return gsdir_for_best_tree;
+ }
+
    /**
     * Writes the orthology count matrix {@code m} to {@code table_outfile} as
     * a tab-separated table: a header row of labels, then one row per entry.
     * When {@code replace_ids} is set, labels are replaced via {@code id_map};
     * otherwise, if a non-empty {@code id_map} is supplied, the raw mapping is
     * appended below the table as key/value lines.
     *
     * NOTE(review): partial view — the cell-output hunk inside the x/y loop
     * (diagonal vs. ratio) and the end of the method (x-loop close, method
     * close) are elided here.
     * NOTE(review): the row-label branch below uses id_map.get() WITHOUT the
     * containsKey() check done for the header row, so an unmapped label would
     * print "null" instead of failing — confirm intent.
     * NOTE(review): "w" is not closed on the exception paths (no try/finally)
     * — possible writer leak.
     */
+   private static final void writeOrthologyTable( final File table_outfile,
+                                                  final int gene_trees_analyzed,
+                                                  final IntMatrix m,
+                                                  final boolean verbose,
+                                                  final SortedMap<String, String> id_map,
+                                                  final boolean replace_ids )
            throws IOException {
        final EasyWriter w = ForesterUtil.createEasyWriter( table_outfile );
        final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" );
        df.setRoundingMode( RoundingMode.HALF_UP );
        // Header row: one (possibly id-mapped) label per matrix column.
        for( int i = 0; i < m.size(); ++i ) {
            w.print( "\t" );
-           w.print( m.getLabel( i ) );
+           if ( replace_ids ) {
+               if ( !id_map.containsKey( m.getLabel( i ) ) ) {
+                   throw new IOException( "no id mapping for \"" + m.getLabel( i ) + "\" (attempting to write ["
+                           + table_outfile + "])" );
+               }
+               w.print( id_map.get( m.getLabel( i ) ) );
+           }
+           else {
+               w.print( m.getLabel( i ) );
+           }
        }
        w.println();
        // Matrix body: label column followed by one cell per column.
        for( int x = 0; x < m.size(); ++x ) {
-           w.print( m.getLabel( x ) );
+           if ( replace_ids ) {
+               w.print( id_map.get( m.getLabel( x ) ) );
+           }
+           else {
+               w.print( m.getLabel( x ) );
+           }
            for( int y = 0; y < m.size(); ++y ) {
                w.print( "\t" );
                // (elided hunk: diagonal vs. count/ratio cell output)
                if ( x == y ) {
                }
                w.println();
            }
        // Optionally append the raw (id -> mapped id) pairs below the table.
+       if ( !replace_ids && id_map != null && id_map.size() > 0 ) {
+           w.println();
+
            // NOTE(review): raw Iterator with an unchecked cast; a typed
            // for-each over id_map.entrySet() would avoid the cast.
+           final Iterator<?> it = id_map.entrySet().iterator();
+           while (it.hasNext()) {
+               Map.Entry<String, String> pair = ( Entry<String, String> ) it.next();
+               w.println( pair.getKey() + "\t" + pair.getValue() );
+           } //TODO
+
+           /*
+           id_map.forEach( ( k, v ) -> {
+               try {
+                   w.println( k + "\t" + v );
+               }
+               catch ( final IOException e ) {
+                   //ignore
+               }
+           } );*/
+       }
        w.close();
        if ( verbose ) {
            System.out.println( "Wrote table to :\t" + table_outfile.getCanonicalPath() );
    // NOTE(review): partial view of writeOrtologGroups() — the method header
    // (return type, name, first parameter), the definitions of "w", "outfile",
    // "counter" and "group", and the group-building/cutoff logic are elided
    // from this hunk.
            final int gene_trees_analyzed,
            final IntMatrix m,
            final boolean verbose,
-           final boolean calc_conly )
+           final boolean calc_conly,
+           final SortedMap<String, String> id_map )
            throws IOException {
        List<SortedSet<String>> groups = new ArrayList<SortedSet<String>>();
        BasicDescriptiveStatistics stats = new BasicDescriptiveStatistics();
        // (elided hunk: group extraction and writer setup)
        w.print( Integer.toString( counter++ ) );
        // One tab-separated member per ortholog group, optionally id-mapped;
        // a missing mapping is a hard error here (unlike the table's label column).
        for( final String s : group ) {
            w.print( "\t" );
-           w.print( s );
+           if ( id_map != null && id_map.size() > 0 ) {
+               if ( !id_map.containsKey( s ) ) {
+                   throw new IOException( "no id mapping for \"" + s + "\" (attempting to write [" + outfile
+                           + "])" );
+               }
+               w.print( id_map.get( s ) );
+           }
+           else {
+               w.print( s );
+           }
        }
        w.println();
        }
        // Result is the number of groups found at the cutoff.
        return groups.size();
    }
    /**
     * Writes phylogeny {@code p} to file {@code f} in phyloXML format. If a
     * non-empty {@code id_map} is supplied, every external node must have a
     * mapping; the mapped id is attached to each node as a Sequence name
     * before writing.
     *
     * NOTE(review): the body of the final "if ( comment != null )" is elided
     * in this hunk — presumably a console message using {@code comment}.
     *
     * @throws IOException if an external node name has no entry in id_map,
     *                     or on write failure
     */
-   private static void writeTree( final Phylogeny p, final File f, final String comment ) throws IOException {
+   private static void writeTree( final Phylogeny p,
+                                  final File f,
+                                  final String comment,
+                                  final SortedMap<String, String> id_map )
+           throws IOException {
+       if ( id_map != null && id_map.size() > 0 ) {
+           final PhylogenyNodeIterator it = p.iteratorExternalForward();
+           while ( it.hasNext() ) {
+               final PhylogenyNode n = it.next();
+               if ( !id_map.containsKey( n.getName() ) ) {
+                   throw new IOException( "no id mapping for \"" + n.getName() + "\" (attempting to write [" + f
+                           + "])" );
+               }
                // Attach the mapped id as an additional Sequence name; the
                // original node name is kept unchanged.
+               final Sequence seq = new Sequence();
+               seq.setName( id_map.get( n.getName() ) );
+               n.getNodeData().addSequence( seq );
+           }
+       }
        final PhylogenyWriter writer = new PhylogenyWriter();
        writer.toPhyloXML( f, p, 0 );
        if ( comment != null ) {
        }
    }
- private final static Map<String, String> obtainMapping( final File dir, final String prefix, final String suffix )
+ private final static SortedMap<String, String> obtainMapping( final File dir,
+ final String prefix,
+ final String suffix )
throws IOException {
- if ( !dir.exists() ) {
- throw new IOException( "[" + dir + "] does not exist" );
- }
- if ( !dir.isDirectory() ) {
- throw new IOException( "[" + dir + "] is not a directory" );
- }
- final File mapping_files[] = dir.listFiles( new FilenameFilter() {
-
- @Override
- public boolean accept( final File dir, final String name ) {
- return ( name.endsWith( suffix ) );
- }
- } );
- String my_suffix = suffix;
- boolean done = false;
- do {
- int matches = 0;
- for( File file : mapping_files ) {
- if ( file.getName().equals( my_suffix ) ) {
- matches++;
- }
- }
- if ( matches == 1) {
- done = true;
- }
- else {
- my_suffix = my_suffix.substring( 0, my_suffix.length() - 1);
- }
- } while (!done );
-
-
- if ( mapping_files.length == 0 ) {
- throw new IOException( "file with prefix \"" + prefix + "\" and suffix \"" + suffix + "\" not found in ["
- + dir + "] " );
- }
- if ( mapping_files.length > 1 ) {
- throw new IOException( "file with prefix \"" + prefix + "\" and suffix \"" + suffix + "\" not unique in ["
- + dir + "] " );
- }
- final BasicTable<String> t = BasicTableParser.parse( mapping_files[ 0 ], '\t' );
+ final File the_one = ForesterUtil.getMatchingFile( dir, prefix, suffix );
+ final BasicTable<String> t = BasicTableParser.parse( the_one, '\t' );
return t.getColumnsAsMap( 0, 1 );
}
}