X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Frio%2FRIOUtil.java;h=89b753cfeed1014ca4c375fc9113f18a91a548d9;hb=c0439ed8b088887ffea2faf11bc7897333287cb3;hp=826604719128ee54ee10920c55d18fa430ccccc7;hpb=4f9fe473a3a762e323874166d607e14df6ef5dc0;p=jalview.git diff --git a/forester/java/src/org/forester/rio/RIOUtil.java b/forester/java/src/org/forester/rio/RIOUtil.java index 8266047..89b753c 100644 --- a/forester/java/src/org/forester/rio/RIOUtil.java +++ b/forester/java/src/org/forester/rio/RIOUtil.java @@ -2,12 +2,11 @@ package org.forester.rio; import java.io.File; -import java.io.FilenameFilter; +import java.io.FileNotFoundException; import java.io.IOException; import java.math.RoundingMode; import java.util.ArrayList; import java.util.List; -import java.util.Map; import java.util.SortedMap; import java.util.SortedSet; import java.util.TreeSet; @@ -18,15 +17,22 @@ import org.forester.io.parsers.PhylogenyParser; import org.forester.io.parsers.nexus.NexusPhylogeniesParser; import org.forester.io.parsers.nhx.NHXParser; import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION; +import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException; import org.forester.io.parsers.phyloxml.PhyloXmlParser; import org.forester.io.parsers.util.ParserUtils; import org.forester.io.writers.PhylogenyWriter; import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.PhylogenyMethods.DESCENDANT_SORT_PRIORITY; import org.forester.phylogeny.data.Sequence; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.rio.RIO.REROOTING; +import org.forester.sdi.GSDIR; import org.forester.sdi.SDIException; +import org.forester.sdi.SDIutil; import org.forester.sdi.SDIutil.ALGORITHM; import org.forester.util.BasicDescriptiveStatistics; import org.forester.util.BasicTable; @@ -36,6 +42,15 @@ import org.forester.util.ForesterUtil; public final class RIOUtil { + public final static String STRIPPED_SPECIES_TREE_SUFFIX = "_RIO_stripped_species_tree.xml"; + public final static String ORTHO_OUTTABLE_SUFFIX = "_RIO_orthologies.tsv"; + public final static String ORTHO_OUTTABLE_WITH_MAP_SUFFIX = "_RIO_orthologies_ext_map.tsv"; + public final static String OUT_MIN_DUP_GENE_TREE_SUFFIX = "_RIO_gene_tree_min_dup_"; + public final static String OUT_MED_DUP_GENE_TREE_SUFFIX = "_RIO_gene_tree_med_dup_"; + public final static String BEST_TREE_SUFFIX = "_RIO_consensus_gene_tree_dup_"; + public final static String ORTHOLOG_GROUPS_SUFFIX = "_RIO_ortholog_groups.tsv"; + public final static String LOGFILE_SUFFIX = "_RIO_log.tsv"; + public static final void executeAnalysis( final File gene_trees_file, final File species_tree_file, final File orthology_outtable, @@ -56,7 +71,11 @@ public final class RIOUtil { final double ortholog_group_cutoff, final boolean perform_id_mapping, final File id_mapping_dir, - final String id_mapping_suffix ) { + final String id_mapping_suffix, + final boolean perform_gsdir_on_best_tree, + final File outdir, + final File best_trees_indir, + final String best_trees_suffix ) { try { final SortedMap id_map; if ( perform_id_mapping ) { @@ -122,6 +141,18 @@ public final class RIOUtil { else { m = RIO.calculateOrthologTable( rio.getAnalyzedGeneTrees(), true ); } + final GSDIR gsdir_for_best_tree; + if ( perform_gsdir_on_best_tree ) { + gsdir_for_best_tree = analyzeConsensusTree( gene_trees_file, + species_tree_file, + outdir, + best_trees_indir, + id_map, + best_trees_suffix ); + } + else { + gsdir_for_best_tree = null; + } final BasicDescriptiveStatistics stats = rio.getDuplicationsStatistics(); if ( perform_id_mapping ) { writeOrthologyTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir, id_map, true ); @@ -222,6 +253,13 @@ public final class RIOUtil { log.print( "\t" ); log.print( Integer.toString( ortholog_groups_095 ) ); // + if ( true ) { + log.print( "\t" ); + log.print( Integer.toString( gsdir_for_best_tree.getMinDuplicationsSum() ) ); + log.print( "\t" ); + log.print( df.format( median - gsdir_for_best_tree.getMinDuplicationsSum() ) ); + } + // log.print( "\t" ); if ( stats.getN() > 3 ) { log.print( df.format( median ) ); @@ -294,6 +332,51 @@ public final class RIOUtil { } } + private final static GSDIR analyzeConsensusTree( final File gene_trees_file, + final File species_tree_file, + final File outdir, + final File best_trees_indir, + final SortedMap id_map, + final String best_trees_suffix ) + throws IOException, FileNotFoundException, PhyloXmlDataFormatException, SDIException { + final File the_one = ForesterUtil.getMatchingFile( best_trees_indir, + gene_trees_file.getName(), + best_trees_suffix ); + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny best_tree = factory.create( the_one, PhyloXmlParser.createPhyloXmlParserXsdValidating() )[ 0 ]; + final Phylogeny species_tree = SDIutil + .parseSpeciesTree( best_tree, species_tree_file, false, true, TAXONOMY_EXTRACTION.NO ); + PhylogenyMethods.deleteInternalNodesWithOnlyOneDescendent( species_tree ); + best_tree.setRooted( true ); + species_tree.setRooted( true ); + if ( !best_tree.isCompletelyBinaryAllow3ChildrenAtRoot() ) { + throw new IOException( "gene tree matching to [" + + ForesterUtil.removeFileExtension( gene_trees_file.getName() ) + "] is not completely binary" ); + } + final PhylogenyNodeIterator it = best_tree.iteratorExternalForward(); + while ( it.hasNext() ) { + final PhylogenyNode n = it.next(); + final String name = n.getName().trim(); + if ( !ForesterUtil.isEmpty( name ) ) { + try { + ParserUtils.extractTaxonomyDataFromNodeName( n, TAXONOMY_EXTRACTION.AGGRESSIVE ); + } + catch ( final PhyloXmlDataFormatException e ) { + // Ignore. + } + } + } + final GSDIR gsdir_for_best_tree = new GSDIR( best_tree, species_tree, true, true, true ); + final Phylogeny result_gene_tree = gsdir_for_best_tree.getMinDuplicationsSumGeneTree(); + result_gene_tree.setRerootable( false ); + PhylogenyMethods.orderAppearance( result_gene_tree.getRoot(), true, true, DESCENDANT_SORT_PRIORITY.NODE_NAME ); + final String outname = ForesterUtil.removeFileExtension( the_one.getName() ); + final File outfile = new File( outdir.getCanonicalFile() + "/" + outname + RIOUtil.BEST_TREE_SUFFIX + + gsdir_for_best_tree.getMinDuplicationsSum() + ".xml" ); + writeTree( result_gene_tree, outfile, null, id_map ); + return gsdir_for_best_tree; + } + private static final void writeOrthologyTable( final File table_outfile, final int gene_trees_analyzed, final IntMatrix m, @@ -529,63 +612,7 @@ public final class RIOUtil { final String prefix, final String suffix ) throws IOException { - if ( !dir.exists() ) { - throw new IOException( "[" + dir + "] does not exist" ); - } - if ( !dir.isDirectory() ) { - throw new IOException( "[" + dir + "] is not a directory" ); - } - final File mapping_files[] = dir.listFiles( new FilenameFilter() { - - @Override - public boolean accept( final File dir, final String name ) { - return ( name.endsWith( suffix ) ); - } - } ); - if ( mapping_files.length == 1 ) { - throw new IOException( "no files ending with \"" + suffix + "\" found in [" + dir + "]" ); - } - String my_prefix = ForesterUtil.removeFileExtension( prefix ); - boolean done = false; - boolean more_than_one = false; - File the_one = null; - do { - int matches = 0; - for( File file : mapping_files ) { - if ( file.getName().startsWith( my_prefix ) ) { - matches++; - if ( matches > 1 ) { - the_one = null; - break; - } - the_one = file; - } - } - if ( matches > 1 ) { - more_than_one = true; - done = true; - } - if ( matches == 1 ) { - done = true; - } - else { - if ( my_prefix.length() <= 1 ) { - throw new IOException( "no file matching \"" + ForesterUtil.removeFileExtension( prefix ) - + "\" and ending with \"" + suffix + "\" found in [" + dir + "]" ); - } - my_prefix = my_prefix.substring( 0, my_prefix.length() - 1 ); - } - } while ( !done ); - if ( more_than_one ) { - throw new IOException( "multiple files matching \"" + ForesterUtil.removeFileExtension( prefix ) - + "\" and ending with \"" + suffix + "\" found in [" + dir + "]" ); - } - else if ( the_one != null ) { - } - else { - throw new IOException( "no file matching \"" + ForesterUtil.removeFileExtension( prefix ) - + "\" and ending with \"" + suffix + "\" found in [" + dir + "]" ); - } + final File the_one = ForesterUtil.getMatchingFile( dir, prefix, suffix ); final BasicTable t = BasicTableParser.parse( the_one, '\t' ); return t.getColumnsAsMap( 0, 1 ); }