From c73326f29a91a733a71bfe36192dbd30e4f49af8 Mon Sep 17 00:00:00 2001
From: cmzmasek
Date: Mon, 24 Apr 2017 18:01:53 -0700
Subject: [PATCH] in progress...

---
Review notes (text in this section sits between the "---" separator and the
diffstat, i.e. outside the commit message):

 * RIOUtil.java, log row: the consensus ("best") tree columns are now written
   only when gsdir_for_best_tree is non-null. The submitted guard was
   "if ( true )", which dereferenced gsdir_for_best_tree unconditionally even
   though this same patch assigns it null whenever perform_gsdir_on_best_tree
   is false. It also wrote the two columns in rows for which rio.java prints
   no "BEST TREE DUP" / "MEDIAN DUP - BEST TREE DUP" headers.
 * rio.java, help text: "consenus" corrected to "consensus" in the two new
   option descriptions.
 * Both edits replace lines one-for-one, so hunk line counts and the diffstat
   below are unchanged.
 * NOTE(review): leading whitespace and blank context lines were reconstructed
   from a whitespace-collapsed copy of this patch; angle-bracketed text
   (generic type arguments, help placeholders) appears to have been stripped
   from that copy and is not restored here -- confirm against the tree, or
   apply with whitespace-tolerant options.

 .../java/src/org/forester/application/rio.java     |  115 ++++++++++++++----
 forester/java/src/org/forester/rio/RIOUtil.java    |  126 ++++++++++++--------
 2 files changed, 164 insertions(+), 77 deletions(-)

diff --git a/forester/java/src/org/forester/application/rio.java b/forester/java/src/org/forester/application/rio.java
index aa50639..d915234 100644
--- a/forester/java/src/org/forester/application/rio.java
+++ b/forester/java/src/org/forester/application/rio.java
@@ -47,13 +47,6 @@ public class rio {
     final static private String E_MAIL = "phyloxml@gmail.com";
     final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester";
     final static private String HELP_OPTION_1 = "help";
-    final static private String LOGFILE_SUFFIX = "_RIO_log.tsv";
-    final static private String STRIPPED_SPECIES_TREE_SUFFIX = "_RIO_sst.xml";
-    final static private String ORTHO_OUTTABLE_SUFFIX = "_RIO_orthologies.tsv";
-    final static private String ORTHO_OUTTABLE_WITH_MAP_SUFFIX = "_RIO_orthologies_ext_map.tsv";
-    final static private String OUT_MIN_DUP_GENE_TREE_SUFFIX = "_RIO_gene_tree_min_dup_";
-    final static private String OUT_MED_DUP_GENE_TREE_SUFFIX = "_RIO_gene_tree_med_dup_";
-    final static private String ORTHOLOG_GROUPS_SUFFIX = "_RIO_ortholog_groups.tsv";
     final static private String HELP_OPTION_2 = "h";
     final static private String GT_FIRST = "f";
     final static private String GT_LAST = "l";
@@ -63,7 +56,10 @@ public class rio {
     final static private String GENE_TREES_SUFFIX_OPTION = "g";
     final static private String MAPPINGS_DIR_OPTION = "m";
     final static private String MAPPINGS_SUFFIX_OPTION = "ms";
+    final static private String CONSENSUS_TREES_DIR_OPTION = "co";
+    final static private String CONSENSUS_TREES_SUFFIX_OPTION = "cos";
     final static private String MAPPINGS_SUFFIX_DEFAULT = ".nim";
+    final static private String CONSENSUS_TREE_SUFFIX_DEFAULT = ".xml";
     final static private String ORTHOLOG_GROUPS_CUTOFF_OPTION = "c";
     final static private String GENE_TREES_SUFFIX_DEFAULT = ".mlt";
     final static private double ORTHOLOG_GROUPS_CUTOFF_DEFAULT = 0.5;
@@ -102,6 +98,8 @@ public class rio {
         allowed_options.add( ORTHOLOG_GROUPS_CUTOFF_OPTION );
         allowed_options.add( MAPPINGS_DIR_OPTION );
         allowed_options.add( MAPPINGS_SUFFIX_OPTION );
+        allowed_options.add( CONSENSUS_TREES_DIR_OPTION );
+        allowed_options.add( CONSENSUS_TREES_SUFFIX_OPTION );
         final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
         if ( dissallowed_options.length() > 0 ) {
             ForesterUtil.fatalError( "unknown option(s): " + dissallowed_options );
@@ -325,6 +323,50 @@ public class rio {
         else {
             id_mapping_suffix = MAPPINGS_SUFFIX_DEFAULT;
         }
+        boolean perform_gsdir_on_best_tree;
+        final File best_trees_indir;
+        if ( cla.isOptionSet( CONSENSUS_TREES_DIR_OPTION ) ) {
+            best_trees_indir = new File( cla.getOptionValue( CONSENSUS_TREES_DIR_OPTION ) );
+            perform_gsdir_on_best_tree = true;
+            if ( !use_dir ) {
+                ForesterUtil
+                        .fatalError( "no consensus (\"best\") gene tree GSDIR analysis when operating on individual gene trees" );
+            }
+            if ( !best_trees_indir.exists() ) {
+                ForesterUtil.fatalError( "consensus (\"best\") gene tree directory \"" + best_trees_indir
+                        + "\" does not exist" );
+            }
+            if ( !best_trees_indir.isDirectory() ) {
+                ForesterUtil.fatalError( "consensus (\"best\") gene tree directory \"" + best_trees_indir
+                        + "\" is not a directory" );
+            }
+            if ( best_trees_indir.listFiles().length < 1 ) {
+                ForesterUtil
+                        .fatalError( "consensus (\"best\") gene tree directory \"" + best_trees_indir + "\" is empty" );
+            }
+        }
+        else {
+            best_trees_indir = null;
+            perform_gsdir_on_best_tree = false;
+        }
+        final String best_trees_suffix;
+        if ( cla.isOptionSet( CONSENSUS_TREES_SUFFIX_OPTION ) ) {
+            if ( !use_dir ) {
+                ForesterUtil
+                        .fatalError( "no consensus (\"best\") gene tree suffix option when operating on individual gene trees" );
+            }
+            if ( !perform_gsdir_on_best_tree ) {
+                ForesterUtil.fatalError( "no consensus (\"best\") gene tree directory given" );
+            }
+            if ( !cla.isOptionHasAValue( CONSENSUS_TREES_SUFFIX_OPTION ) ) {
+                ForesterUtil.fatalError( "no value for -" + CONSENSUS_TREES_SUFFIX_OPTION );
+            }
+            best_trees_suffix = cla.getOptionValueAsCleanString( CONSENSUS_TREES_SUFFIX_OPTION );
+        }
+        else {
+            best_trees_suffix = CONSENSUS_TREE_SUFFIX_DEFAULT;
+        }
+        ////////////////////////////////
         ForesterUtil.fatalErrorIfFileNotReadable( species_tree_file );
         if ( !use_dir && orthology_outtable.exists() ) {
             ForesterUtil.fatalError( "\"" + orthology_outtable + "\" already exists" );
@@ -352,6 +394,15 @@ public class rio {
             }
             System.out.println( "Id mappings suffix :\t" + id_mapping_suffix );
         }
+        if ( perform_gsdir_on_best_tree ) {
+            try {
+                System.out.println( "Consensus (\"best\") gene tree dir :\t" + best_trees_indir.getCanonicalPath() );
+            }
+            catch ( IOException e ) {
+                ForesterUtil.fatalError( e.getLocalizedMessage() );
+            }
+            System.out.println( "Consensus (\"best\") gene tree suffix :\t" + best_trees_suffix );
+        }
         if ( use_dir ) {
             System.out.println( "Out-dir :\t" + outdir );
         }
@@ -520,9 +571,11 @@ public class rio {
                 log.print( "\t" );
                 log.print( "0.95 O GROUPS" );
                 log.print( "\t" );
-                if ( true ) { //TODO
+                if ( perform_gsdir_on_best_tree ) {
                     log.print( "BEST TREE DUP" );
                     log.print( "\t" );
+                    log.print( "MEDIAN DUP - BEST TREE DUP" );
+                    log.print( "\t" );
                 }
                 log.print( "MEDIAN DUP" );
                 log.print( "\t" );
@@ -556,22 +609,23 @@ public class rio {
                     RIOUtil.executeAnalysis( gf,
                                              species_tree_file,
                                              new File( outdir.getCanonicalFile() + "/" + outname
-                                                     + ORTHO_OUTTABLE_SUFFIX ),
+                                                     + RIOUtil.ORTHO_OUTTABLE_SUFFIX ),
+                                             new File( outdir.getCanonicalFile() + "/" + outname
+                                                     + RIOUtil.ORTHO_OUTTABLE_WITH_MAP_SUFFIX ),
                                              new File( outdir.getCanonicalFile() + "/" + outname
-                                                     + ORTHO_OUTTABLE_WITH_MAP_SUFFIX ),
+                                                     + RIOUtil.ORTHOLOG_GROUPS_SUFFIX ),
                                              new File( outdir.getCanonicalFile() + "/" + outname
-                                                     + ORTHOLOG_GROUPS_SUFFIX ),
-                                             new File( outdir.getCanonicalFile() + "/" + outname + LOGFILE_SUFFIX ),
+                                                     + RIOUtil.LOGFILE_SUFFIX ),
                                              outgroup,
                                              rerooting,
                                              gt_first,
                                              gt_last,
                                              new File( outdir.getCanonicalFile() + "/" + outname
-                                                     + STRIPPED_SPECIES_TREE_SUFFIX ),
+                                                     + RIOUtil.STRIPPED_SPECIES_TREE_SUFFIX ),
                                              new File( outdir.getCanonicalFile() + "/" + outname
-                                                     + OUT_MIN_DUP_GENE_TREE_SUFFIX ),
+                                                     + RIOUtil.OUT_MIN_DUP_GENE_TREE_SUFFIX ),
                                              new File( outdir.getCanonicalFile() + "/" + outname
-                                                     + OUT_MED_DUP_GENE_TREE_SUFFIX ),
+                                                     + RIOUtil.OUT_MED_DUP_GENE_TREE_SUFFIX ),
                                              true,
                                              algorithm,
                                              true,
@@ -579,7 +633,11 @@ public class rio {
                                              ortholog_group_cutoff,
                                              perform_id_mapping,
                                              id_mapping_dir,
-                                             id_mapping_suffix );
+                                             id_mapping_suffix,
+                                             perform_gsdir_on_best_tree,
+                                             outdir,
+                                             best_trees_indir,
+                                             best_trees_suffix );
                 }
                 catch ( IOException e ) {
                     ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() );
@@ -590,23 +648,20 @@ public class rio {
             System.out.println();
         }
         else {
-            String outname = orthology_outtable.toString();
-            if ( outname.indexOf( "." ) > 0 ) {
-                outname = outname.substring( 0, outname.lastIndexOf( "." ) );
-            }
+            String outname = ForesterUtil.removeFileExtension( orthology_outtable.toString() );
             RIOUtil.executeAnalysis( gene_trees_file,
                                      species_tree_file,
                                      orthology_outtable,
                                      null,
-                                     new File( outname + ORTHOLOG_GROUPS_SUFFIX ),
+                                     new File( outname + RIOUtil.ORTHOLOG_GROUPS_SUFFIX ),
                                      logfile,
                                      outgroup,
                                      rerooting,
                                      gt_first,
                                      gt_last,
-                                     new File( outname + STRIPPED_SPECIES_TREE_SUFFIX ),
-                                     new File( outname + OUT_MIN_DUP_GENE_TREE_SUFFIX ),
-                                     new File( outname + OUT_MED_DUP_GENE_TREE_SUFFIX ),
+                                     new File( outname + RIOUtil.STRIPPED_SPECIES_TREE_SUFFIX ),
+                                     new File( outname + RIOUtil.OUT_MIN_DUP_GENE_TREE_SUFFIX ),
+                                     new File( outname + RIOUtil.OUT_MED_DUP_GENE_TREE_SUFFIX ),
                                      algorithm == ALGORITHM.GSDIR,
                                      algorithm,
                                      false,
@@ -614,6 +669,10 @@ public class rio {
                                      ortholog_group_cutoff,
                                      false,
                                      null,
+                                     null,
+                                     false,
+                                     null,
+                                     null,
                                      null );
         }
         if ( !use_dir ) {
@@ -661,6 +720,12 @@ public class rio {
         System.out.println( " -" + MAPPINGS_DIR_OPTION + "= : directory for id mapping files" );
         System.out.println( " -" + MAPPINGS_SUFFIX_OPTION + "= : suffix for id mapping files (default: " + MAPPINGS_SUFFIX_DEFAULT
                 + ")" );
+        System.out.println( " -" + CONSENSUS_TREES_DIR_OPTION
+                + "= : directory with consensus (\"best\") gene trees to be analyzed with GSDIR" );
+        System.out.println( " -" + CONSENSUS_TREES_SUFFIX_OPTION
+                + "= : suffix for consensus (\"best\") gene trees (default: " + CONSENSUS_TREE_SUFFIX_DEFAULT
+                + ")" );
+        ///
         System.out.println();
         System.out.println( " Formats" );
         System.out
@@ -678,6 +743,8 @@ public class rio {
         System.out.println( " rio -g=.xml gene_trees_dir species.xml out_dir log.tsv" );
         System.out.println( " rio -g=.mlt -m=id_maps_dir -ms=.nim -c=0.8 gene_trees_dir species.xml out_dir log.tsv" );
         System.out.println( " rio -m=id_maps_dir -c=0.8 gene_trees_dir species.xml out_dir log.tsv" );
+        System.out
+                .println( " rio -m=id_maps_dir -co=consensus_dir -cos=.xml -c=0.8 gene_trees_dir species.xml out_dir log.tsv" );
         System.out.println();
         System.exit( -1 );
     }
diff --git a/forester/java/src/org/forester/rio/RIOUtil.java b/forester/java/src/org/forester/rio/RIOUtil.java
index 03d2796..89b753c 100644
--- a/forester/java/src/org/forester/rio/RIOUtil.java
+++ b/forester/java/src/org/forester/rio/RIOUtil.java
@@ -2,18 +2,15 @@ package org.forester.rio;
 
 
 import java.io.File;
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.math.RoundingMode;
 import java.util.ArrayList;
 import java.util.List;
-import java.util.Map;
 import java.util.SortedMap;
 import java.util.SortedSet;
 import java.util.TreeSet;
 
-import javax.swing.JOptionPane;
-
-import org.forester.archaeopteryx.AptxUtil;
 import org.forester.datastructures.IntMatrix;
 import org.forester.io.parsers.IteratingPhylogenyParser;
 import org.forester.io.parsers.PhylogenyParser;
@@ -45,6 +42,15 @@ import org.forester.util.ForesterUtil;
 
 public final class RIOUtil {
 
+    public final static String STRIPPED_SPECIES_TREE_SUFFIX = "_RIO_stripped_species_tree.xml";
+    public final static String ORTHO_OUTTABLE_SUFFIX = "_RIO_orthologies.tsv";
+    public final static String ORTHO_OUTTABLE_WITH_MAP_SUFFIX = "_RIO_orthologies_ext_map.tsv";
+    public final static String OUT_MIN_DUP_GENE_TREE_SUFFIX = "_RIO_gene_tree_min_dup_";
+    public final static String OUT_MED_DUP_GENE_TREE_SUFFIX = "_RIO_gene_tree_med_dup_";
+    public final static String BEST_TREE_SUFFIX = "_RIO_consensus_gene_tree_dup_";
+    public final static String ORTHOLOG_GROUPS_SUFFIX = "_RIO_ortholog_groups.tsv";
+    public final static String LOGFILE_SUFFIX = "_RIO_log.tsv";
+
     public static final void executeAnalysis( final File gene_trees_file,
                                               final File species_tree_file,
                                               final File orthology_outtable,
@@ -65,7 +71,11 @@ public final class RIOUtil {
                                               final double ortholog_group_cutoff,
                                               final boolean perform_id_mapping,
                                               final File id_mapping_dir,
-                                              final String id_mapping_suffix ) {
+                                              final String id_mapping_suffix,
+                                              final boolean perform_gsdir_on_best_tree,
+                                              final File outdir,
+                                              final File best_trees_indir,
+                                              final String best_trees_suffix ) {
         try {
             final SortedMap id_map;
             if ( perform_id_mapping ) {
@@ -131,53 +141,18 @@ public final class RIOUtil {
             else {
                 m = RIO.calculateOrthologTable( rio.getAnalyzedGeneTrees(), true );
             }
-            ////////////////////////////////////////////
-            ////////////////////////////////////////////
-            //TODO
-            final boolean perform_gsdir_on_best_tree = true;
-            final File best_trees_dir = new File( "best_trees" );
-            final String best_trees_suffix = ".xml";
             final GSDIR gsdir_for_best_tree;
             if ( perform_gsdir_on_best_tree ) {
-                final Phylogeny best_tree = obtainTree( best_trees_dir, gene_trees_file.getName(), best_trees_suffix );
-                final Phylogeny species_tree = SDIutil
-                        .parseSpeciesTree( best_tree, species_tree_file, false, true, TAXONOMY_EXTRACTION.NO );
-                PhylogenyMethods.deleteInternalNodesWithOnlyOneDescendent( species_tree );
-                best_tree.setRooted( true );
-                species_tree.setRooted( true );
-                if ( !best_tree.isCompletelyBinaryAllow3ChildrenAtRoot() ) {
-                    throw new IOException( "gene tree matching to ["
-                            + ForesterUtil.removeFileExtension( gene_trees_file.getName() )
-                            + "] is not completely binary" );
-                }
-                final PhylogenyNodeIterator it = best_tree.iteratorExternalForward();
-                while ( it.hasNext() ) {
-                    final PhylogenyNode n = it.next();
-                    final String name = n.getName().trim();
-                    if ( !ForesterUtil.isEmpty( name ) ) {
-                        try {
-                            ParserUtils.extractTaxonomyDataFromNodeName( n, TAXONOMY_EXTRACTION.AGGRESSIVE );
-                        }
-                        catch ( final PhyloXmlDataFormatException e ) {
-                            // Ignore.
-                        }
-                    }
-                }
-                gsdir_for_best_tree = new GSDIR( best_tree, species_tree, true, true, true );
-                final Phylogeny result_gene_tree = gsdir_for_best_tree.getMinDuplicationsSumGeneTree();
-                System.out.println( gsdir_for_best_tree.getMinDuplicationsSum() );
-                result_gene_tree.setRerootable( false );
-                PhylogenyMethods.orderAppearance( result_gene_tree.getRoot(),
-                                                  true,
-                                                  true,
-                                                  DESCENDANT_SORT_PRIORITY.NODE_NAME );
-                writeTree( result_gene_tree, new File( gene_trees_file.getName() + "____.xml" ), null, id_map );
+                gsdir_for_best_tree = analyzeConsensusTree( gene_trees_file,
+                                                            species_tree_file,
+                                                            outdir,
+                                                            best_trees_indir,
+                                                            id_map,
+                                                            best_trees_suffix );
             }
             else {
                 gsdir_for_best_tree = null;
             }
-            ////////////////////////////////////////////
-            ////////////////////////////////////////////
             final BasicDescriptiveStatistics stats = rio.getDuplicationsStatistics();
             if ( perform_id_mapping ) {
                 writeOrthologyTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir, id_map, true );
@@ -278,6 +253,13 @@ public final class RIOUtil {
                 log.print( "\t" );
                 log.print( Integer.toString( ortholog_groups_095 ) );
                 //
+                if ( gsdir_for_best_tree != null ) {
+                    log.print( "\t" );
+                    log.print( Integer.toString( gsdir_for_best_tree.getMinDuplicationsSum() ) );
+                    log.print( "\t" );
+                    log.print( df.format( median - gsdir_for_best_tree.getMinDuplicationsSum() ) );
+                }
+                //
                 log.print( "\t" );
                 if ( stats.getN() > 3 ) {
                     log.print( df.format( median ) );
@@ -350,6 +332,51 @@ public final class RIOUtil {
         }
     }
 
+    private final static GSDIR analyzeConsensusTree( final File gene_trees_file,
+                                                     final File species_tree_file,
+                                                     final File outdir,
+                                                     final File best_trees_indir,
+                                                     final SortedMap id_map,
+                                                     final String best_trees_suffix )
+            throws IOException, FileNotFoundException, PhyloXmlDataFormatException, SDIException {
+        final File the_one = ForesterUtil.getMatchingFile( best_trees_indir,
+                                                           gene_trees_file.getName(),
+                                                           best_trees_suffix );
+        final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
+        final Phylogeny best_tree = factory.create( the_one, PhyloXmlParser.createPhyloXmlParserXsdValidating() )[ 0 ];
+        final Phylogeny species_tree = SDIutil
+                .parseSpeciesTree( best_tree, species_tree_file, false, true, TAXONOMY_EXTRACTION.NO );
+        PhylogenyMethods.deleteInternalNodesWithOnlyOneDescendent( species_tree );
+        best_tree.setRooted( true );
+        species_tree.setRooted( true );
+        if ( !best_tree.isCompletelyBinaryAllow3ChildrenAtRoot() ) {
+            throw new IOException( "gene tree matching to ["
+                    + ForesterUtil.removeFileExtension( gene_trees_file.getName() ) + "] is not completely binary" );
+        }
+        final PhylogenyNodeIterator it = best_tree.iteratorExternalForward();
+        while ( it.hasNext() ) {
+            final PhylogenyNode n = it.next();
+            final String name = n.getName().trim();
+            if ( !ForesterUtil.isEmpty( name ) ) {
+                try {
+                    ParserUtils.extractTaxonomyDataFromNodeName( n, TAXONOMY_EXTRACTION.AGGRESSIVE );
+                }
+                catch ( final PhyloXmlDataFormatException e ) {
+                    // Ignore.
+                }
+            }
+        }
+        final GSDIR gsdir_for_best_tree = new GSDIR( best_tree, species_tree, true, true, true );
+        final Phylogeny result_gene_tree = gsdir_for_best_tree.getMinDuplicationsSumGeneTree();
+        result_gene_tree.setRerootable( false );
+        PhylogenyMethods.orderAppearance( result_gene_tree.getRoot(), true, true, DESCENDANT_SORT_PRIORITY.NODE_NAME );
+        final String outname = ForesterUtil.removeFileExtension( the_one.getName() );
+        final File outfile = new File( outdir.getCanonicalFile() + "/" + outname + RIOUtil.BEST_TREE_SUFFIX
+                + gsdir_for_best_tree.getMinDuplicationsSum() + ".xml" );
+        writeTree( result_gene_tree, outfile, null, id_map );
+        return gsdir_for_best_tree;
+    }
+
     private static final void writeOrthologyTable( final File table_outfile,
                                                    final int gene_trees_analyzed,
                                                    final IntMatrix m,
@@ -589,11 +616,4 @@ public final class RIOUtil {
         final BasicTable t = BasicTableParser.parse( the_one, '\t' );
         return t.getColumnsAsMap( 0, 1 );
     }
-
-    private final static Phylogeny obtainTree( final File dir, final String prefix, final String suffix )
-            throws IOException {
-        final File the_one = ForesterUtil.getMatchingFile( dir, prefix, suffix );
-        final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
-        return factory.create( the_one, PhyloXmlParser.createPhyloXmlParserXsdValidating() )[ 0 ];
-    }
 }
-- 
1.7.10.2