From 2b1c49af616eb01a99a69daf8f703f494b1a5a7e Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Wed, 12 Dec 2012 04:45:47 +0000 Subject: [PATCH] "rio" work --- .../java/src/org/forester/application/rio.java | 61 ++++++++++++-- .../org/forester/io/parsers/util/ParserUtils.java | 7 +- forester/java/src/org/forester/rio/RIO.java | 87 ++++++++++++++++---- forester/java/src/org/forester/sdi/GSDIR.java | 2 +- 4 files changed, 127 insertions(+), 30 deletions(-) diff --git a/forester/java/src/org/forester/application/rio.java b/forester/java/src/org/forester/application/rio.java index 397329c..5bf670b 100644 --- a/forester/java/src/org/forester/application/rio.java +++ b/forester/java/src/org/forester/application/rio.java @@ -31,10 +31,14 @@ import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.SortedSet; +import java.util.TreeSet; import org.forester.datastructures.IntMatrix; import org.forester.io.parsers.phyloxml.PhyloXmlParser; import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.Taxonomy; import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; import org.forester.phylogeny.factories.PhylogenyFactory; import org.forester.rio.RIO; @@ -48,8 +52,8 @@ import org.forester.util.ForesterUtil; public class rio { final static private String PRG_NAME = "rio"; - final static private String PRG_VERSION = "3.00 beta 4"; - final static private String PRG_DATE = "2012.12.10"; + final static private String PRG_VERSION = "4.000 beta 1"; + final static private String PRG_DATE = "2012.12.11"; final static private String E_MAIL = "czmasek@burnham.org"; final static private String WWW = "www.phylosoft.org/forester/"; final static private String HELP_OPTION_1 = "help"; @@ -74,7 +78,7 @@ public class rio { if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) || ( args.length == 0 ) ) { printHelp(); } - if ( ( args.length < 2 ) || ( args.length > 10 ) ) { + if ( ( args.length < 3 ) || ( args.length > 5 ) ) { System.out.println(); System.out.println( "[" + PRG_NAME + "] incorrect number of arguments" ); System.out.println(); @@ -104,18 +108,23 @@ public class rio { boolean sdir = false; if ( cla.isOptionSet( USE_SDIR ) ) { sdir = true; + if ( logfile != null ) { + ForesterUtil.fatalError( PRG_NAME, "logfile output only for GSDIR algorithm" ); + } } long time = 0; System.out.println( "Gene trees : " + gene_trees_file ); System.out.println( "Species tree : " + species_tree_file ); System.out.println( "All vs all orthology table: " + othology_outtable ); if ( !sdir ) { + if ( logfile != null ) { + System.out.println( "Logfile : " + logfile ); + } System.out.println( "Non binary species tree : allowed (GSDIR algorithm)" ); } else { System.out.println( "Non binary species tree : disallowed (SDIR algorithm)" ); } - System.out.println(); time = System.currentTimeMillis(); Phylogeny species_tree = null; try { @@ -137,8 +146,14 @@ public class rio { algorithm = ALGORITHM.GSDIR; } try { - final RIO rio = new RIO( gene_trees_file, species_tree, algorithm ); + final RIO rio = new RIO( gene_trees_file, species_tree, algorithm, logfile != null ); + if ( algorithm == ALGORITHM.GSDIR ) { + System.out.println( "Taxonomy linking based on : " + rio.getGSDIRtaxCompBase() ); + } tableOutput( othology_outtable, rio ); + if ( ( algorithm == ALGORITHM.GSDIR ) && ( logfile != null ) ) { + writeLogFile( logfile, rio ); + } } catch ( final RIOException e ) { ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() ); @@ -152,15 +167,43 @@ public class rio { catch ( final Exception e ) { ForesterUtil.unexpectedFatalError( PRG_NAME, e ); } - if ( othology_outtable != null ) { - ForesterUtil.programMessage( PRG_NAME, "wrote results to \"" + othology_outtable + "\"" ); - } time = System.currentTimeMillis() - time; ForesterUtil.programMessage( PRG_NAME, "time: " + time + "ms" ); ForesterUtil.programMessage( PRG_NAME, "OK" ); System.exit( 0 ); } + private static void writeLogFile( final File logfile, final RIO rio ) throws IOException { + final EasyWriter out = ForesterUtil.createEasyWriter( logfile ); + out.println( "Species stripped from gene trees:" ); + final SortedSet rn = new TreeSet(); + for( final PhylogenyNode n : rio.getRemovedGeneTreeNodes() ) { + final Taxonomy t = n.getNodeData().getTaxonomy(); + switch ( rio.getGSDIRtaxCompBase() ) { + case CODE: { + rn.add( t.getTaxonomyCode() ); + break; + } + case ID: { + rn.add( t.getIdentifier().toString() ); + break; + } + case SCIENTIFIC_NAME: { + rn.add( t.getScientificName() ); + break; + } + } + } + for( final String s : rn ) { + out.println( s ); + } + out.println(); + out.println( "Some information about duplication numbers in gene trees:" ); + out.println( rio.getLog().toString() ); + out.close(); + ForesterUtil.programMessage( PRG_NAME, "wrote log to \"" + logfile + "\"" ); + } + private static void tableOutput( final File table_outfile, final RIO rio ) throws IOException, RIOException { final IntMatrix m = RIO.calculateOrthologTable( rio.getAnalyzedGeneTrees(), true ); writeTable( table_outfile, rio, m ); @@ -204,7 +247,7 @@ public class rio { System.out.println(); System.out.println( " Options" ); System.out.println( " -" + USE_SDIR - + " : to use SDIR instead of GSDIR (faster, but non-binary species trees are disallowed)" ); + + " : to use SDIR instead of GSDIR (faster, but non-binary species trees are disallowed)" ); System.out.println(); System.out.println( " Formats" ); System.out.println( " The species tree is expected to be in phyloXML format." ); diff --git a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java index 35ec60c..4baf7ce 100644 --- a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java +++ b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java @@ -51,9 +51,10 @@ import org.forester.util.ForesterUtil; public final class ParserUtils { - final private static Pattern TAXOMONY_CODE_PATTERN_1 = Pattern.compile( "[A-Z0-9]{5}|RAT|PIG|PEA" ); - final private static Pattern TAXOMONY_CODE_PATTERN_2 = Pattern.compile( "([A-Z0-9]{5}|RAT|PIG|PEA)[^A-Za-z].*" ); - final private static Pattern TAXOMONY_CODE_PATTERN_PF = Pattern.compile( "([A-Z0-9]{5}|RAT|PIG|PEA)/\\d+-\\d+" ); + final private static Pattern TAXOMONY_CODE_PATTERN_1 = Pattern.compile( "[A-Z0-9]{5}|RAT|PIG|PEA|CAP" ); + final private static Pattern TAXOMONY_CODE_PATTERN_2 = Pattern + .compile( "([A-Z0-9]{5}|RAT|PIG|PEA|CAP)[^A-Za-z].*" ); + final private static Pattern TAXOMONY_CODE_PATTERN_PF = Pattern.compile( "([A-Z0-9]{5}|RAT|PIG|PEA|CAP)/\\d+-\\d+" ); final public static PhylogenyParser createParserDependingFileContents( final File file, final boolean phyloxml_validate_against_xsd ) diff --git a/forester/java/src/org/forester/rio/RIO.java b/forester/java/src/org/forester/rio/RIO.java index 2bcd5e9..fd8ae3f 100644 --- a/forester/java/src/org/forester/rio/RIO.java +++ b/forester/java/src/org/forester/rio/RIO.java @@ -51,6 +51,7 @@ import org.forester.sdi.SDI.ALGORITHM; import org.forester.sdi.SDI.TaxonomyComparisonBase; import org.forester.sdi.SDIException; import org.forester.sdi.SDIR; +import org.forester.util.BasicDescriptiveStatistics; import org.forester.util.ForesterUtil; public final class RIO { @@ -62,13 +63,27 @@ public final class RIO { private int _samples; private int _ext_nodes; private TaxonomyComparisonBase _gsdir_tax_comp_base; + private StringBuilder _log; + private boolean _produce_log; - public RIO( final File gene_trees_file, final Phylogeny species_tree, final ALGORITHM algorithm ) - throws IOException, SDIException, RIOException { - init(); + public RIO( final File gene_trees_file, + final Phylogeny species_tree, + final ALGORITHM algorithm, + final boolean produce_log ) throws IOException, SDIException, RIOException { + init( produce_log ); inferOrthologs( gene_trees_file, species_tree, algorithm ); } + private final void init( final boolean produce_log ) { + _produce_log = produce_log; + _samples = -1; + _ext_nodes = -1; + _log = null; + _gsdir_tax_comp_base = null; + _analyzed_gene_trees = null; + _removed_gene_tree_nodes = null; + } + public final Phylogeny[] getAnalyzedGeneTrees() { return _analyzed_gene_trees; } @@ -106,15 +121,32 @@ public final class RIO { nhx.setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.YES ); } final Phylogeny[] gene_trees = factory.create( gene_trees_file, p ); - // Removes from species_tree all species not found in gene_tree. - final List _removed_species_tree_ext_nodes = PhylogenyMethods - .taxonomyBasedDeletionOfExternalNodes( gene_trees[ 0 ], species_tree ); - if ( species_tree.isEmpty() ) { - throw new RIOException( "failed to establish species based mapping between gene and species trees" ); + if ( algorithm == ALGORITHM.SDIR ) { + // Removes from species_tree all species not found in gene_tree. + PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gene_trees[ 0 ], species_tree ); + if ( species_tree.isEmpty() ) { + throw new RIOException( "failed to establish species based mapping between gene and species trees" ); + } + } + if ( _produce_log ) { + _log = new StringBuilder(); } _analyzed_gene_trees = new Phylogeny[ gene_trees.length ]; int i = 0; int gene_tree_ext_nodes = 0; + if ( _produce_log ) { + _log.append( "#" ); + _log.append( "\t" ); + _log.append( "with minimal number of duplications" ); + _log.append( "/" ); + _log.append( "root placements" ); + _log.append( "\t[" ); + _log.append( "min" ); + _log.append( "-" ); + _log.append( "max" ); + _log.append( "]" ); + _log.append( ForesterUtil.LINE_SEPARATOR ); + } for( final Phylogeny gt : gene_trees ) { if ( algorithm == ALGORITHM.SDIR ) { // Removes from gene_tree all species not found in species_tree. @@ -137,11 +169,6 @@ public final class RIO { setNumberOfSamples( gene_trees.length ); } - private final void init() { - _samples = 1; - _ext_nodes = 0; - } - private final Phylogeny performOrthologInference( final Phylogeny gene_tree, final Phylogeny species_tree, final ALGORITHM algorithm, @@ -160,8 +187,29 @@ public final class RIO { break; } case GSDIR: { + // System.out.println( "gene/species tree size before: " + gene_tree.getNumberOfExternalNodes() + "/" + // + species_tree.getNumberOfExternalNodes() ); final GSDIR gsdir = new GSDIR( gene_tree, species_tree, true, i == 0 ); + // System.out.println( "gene/species tree size before: " + gene_tree.getNumberOfExternalNodes() + "/" + // + species_tree.getNumberOfExternalNodes() ); assigned_tree = gsdir.getMinDuplicationsSumGeneTrees().get( 0 ); + if ( i == 0 ) { + _removed_gene_tree_nodes = gsdir.getStrippedExternalGeneTreeNodes(); + } + if ( _produce_log ) { + final BasicDescriptiveStatistics stats = gsdir.getDuplicationsSumStats(); + _log.append( i ); + _log.append( "\t" ); + _log.append( gsdir.getMinDuplicationsSumGeneTrees().size() ); + _log.append( "/" ); + _log.append( stats.getN() ); + _log.append( "\t[" ); + _log.append( ( int ) stats.getMin() ); + _log.append( "-" ); + _log.append( ( int ) stats.getMax() ); + _log.append( "]" ); + _log.append( ForesterUtil.LINE_SEPARATOR ); + } _gsdir_tax_comp_base = gsdir.getTaxCompBase(); break; } @@ -177,10 +225,7 @@ public final class RIO { _ext_nodes = i; } - private final void setNumberOfSamples( int i ) { - if ( i < 1 ) { - i = 1; - } + private final void setNumberOfSamples( final int i ) { _samples = i; } @@ -240,4 +285,12 @@ public final class RIO { } return m; } + + public final TaxonomyComparisonBase getGSDIRtaxCompBase() { + return _gsdir_tax_comp_base; + } + + public final StringBuilder getLog() { + return _log; + } } diff --git a/forester/java/src/org/forester/sdi/GSDIR.java b/forester/java/src/org/forester/sdi/GSDIR.java index 035a443..70cc009 100644 --- a/forester/java/src/org/forester/sdi/GSDIR.java +++ b/forester/java/src/org/forester/sdi/GSDIR.java @@ -79,7 +79,7 @@ public class GSDIR extends GSDI { } _duplications_sum_stats.addValue( _duplications_sum ); } - System.out.println( _duplications_sum_stats.getSummaryAsString() ); + //System.out.println( _duplications_sum_stats.getSummaryAsString() ); } public int getMinDuplicationsSum() { -- 1.7.10.2