import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import java.util.SortedSet;
+import java.util.TreeSet;
import org.forester.datastructures.IntMatrix;
import org.forester.io.parsers.phyloxml.PhyloXmlParser;
import org.forester.phylogeny.Phylogeny;
+import org.forester.phylogeny.PhylogenyNode;
+import org.forester.phylogeny.data.Taxonomy;
import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
import org.forester.phylogeny.factories.PhylogenyFactory;
import org.forester.rio.RIO;
public class rio {
final static private String PRG_NAME = "rio";
- final static private String PRG_VERSION = "3.00 beta 4";
- final static private String PRG_DATE = "2012.12.10";
+ final static private String PRG_VERSION = "4.000 beta 1";
+ final static private String PRG_DATE = "2012.12.11";
final static private String E_MAIL = "czmasek@burnham.org";
final static private String WWW = "www.phylosoft.org/forester/";
final static private String HELP_OPTION_1 = "help";
if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) || ( args.length == 0 ) ) {
printHelp();
}
- if ( ( args.length < 2 ) || ( args.length > 10 ) ) {
+ if ( ( args.length < 3 ) || ( args.length > 5 ) ) {
System.out.println();
System.out.println( "[" + PRG_NAME + "] incorrect number of arguments" );
System.out.println();
boolean sdir = false;
if ( cla.isOptionSet( USE_SDIR ) ) {
sdir = true;
+ if ( logfile != null ) {
+ ForesterUtil.fatalError( PRG_NAME, "logfile output only for GSDIR algorithm" );
+ }
}
long time = 0;
System.out.println( "Gene trees : " + gene_trees_file );
System.out.println( "Species tree : " + species_tree_file );
System.out.println( "All vs all orthology table: " + othology_outtable );
if ( !sdir ) {
+ if ( logfile != null ) {
+ System.out.println( "Logfile : " + logfile );
+ }
System.out.println( "Non binary species tree : allowed (GSDIR algorithm)" );
}
else {
System.out.println( "Non binary species tree : disallowed (SDIR algorithm)" );
}
- System.out.println();
time = System.currentTimeMillis();
Phylogeny species_tree = null;
try {
algorithm = ALGORITHM.GSDIR;
}
try {
- final RIO rio = new RIO( gene_trees_file, species_tree, algorithm );
+ final RIO rio = new RIO( gene_trees_file, species_tree, algorithm, logfile != null );
+ if ( algorithm == ALGORITHM.GSDIR ) {
+ System.out.println( "Taxonomy linking based on : " + rio.getGSDIRtaxCompBase() );
+ }
tableOutput( othology_outtable, rio );
+ if ( ( algorithm == ALGORITHM.GSDIR ) && ( logfile != null ) ) {
+ writeLogFile( logfile, rio );
+ }
}
catch ( final RIOException e ) {
ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() );
catch ( final Exception e ) {
ForesterUtil.unexpectedFatalError( PRG_NAME, e );
}
- if ( othology_outtable != null ) {
- ForesterUtil.programMessage( PRG_NAME, "wrote results to \"" + othology_outtable + "\"" );
- }
time = System.currentTimeMillis() - time;
ForesterUtil.programMessage( PRG_NAME, "time: " + time + "ms" );
ForesterUtil.programMessage( PRG_NAME, "OK" );
System.exit( 0 );
}
+ private static void writeLogFile( final File logfile, final RIO rio ) throws IOException {
+ final EasyWriter out = ForesterUtil.createEasyWriter( logfile );
+ out.println( "Species stripped from gene trees:" );
+ final SortedSet<String> rn = new TreeSet<String>();
+ for( final PhylogenyNode n : rio.getRemovedGeneTreeNodes() ) {
+ final Taxonomy t = n.getNodeData().getTaxonomy();
+ switch ( rio.getGSDIRtaxCompBase() ) {
+ case CODE: {
+ rn.add( t.getTaxonomyCode() );
+ break;
+ }
+ case ID: {
+ rn.add( t.getIdentifier().toString() );
+ break;
+ }
+ case SCIENTIFIC_NAME: {
+ rn.add( t.getScientificName() );
+ break;
+ }
+ }
+ }
+ for( final String s : rn ) {
+ out.println( s );
+ }
+ out.println();
+ out.println( "Some information about duplication numbers in gene trees:" );
+ out.println( rio.getLog().toString() );
+ out.close();
+ ForesterUtil.programMessage( PRG_NAME, "wrote log to \"" + logfile + "\"" );
+ }
+
private static void tableOutput( final File table_outfile, final RIO rio ) throws IOException, RIOException {
final IntMatrix m = RIO.calculateOrthologTable( rio.getAnalyzedGeneTrees(), true );
writeTable( table_outfile, rio, m );
System.out.println();
System.out.println( " Options" );
System.out.println( " -" + USE_SDIR
- + " : to use SDIR instead of GSDIR (faster, but non-binary species trees are disallowed)" );
+ + " : to use SDIR instead of GSDIR (faster, but non-binary species trees are disallowed)" );
System.out.println();
System.out.println( " Formats" );
System.out.println( " The species tree is expected to be in phyloXML format." );
public final class ParserUtils {
- final private static Pattern TAXOMONY_CODE_PATTERN_1 = Pattern.compile( "[A-Z0-9]{5}|RAT|PIG|PEA" );
- final private static Pattern TAXOMONY_CODE_PATTERN_2 = Pattern.compile( "([A-Z0-9]{5}|RAT|PIG|PEA)[^A-Za-z].*" );
- final private static Pattern TAXOMONY_CODE_PATTERN_PF = Pattern.compile( "([A-Z0-9]{5}|RAT|PIG|PEA)/\\d+-\\d+" );
+ final private static Pattern TAXOMONY_CODE_PATTERN_1 = Pattern.compile( "[A-Z0-9]{5}|RAT|PIG|PEA|CAP" );
+ final private static Pattern TAXOMONY_CODE_PATTERN_2 = Pattern
+ .compile( "([A-Z0-9]{5}|RAT|PIG|PEA|CAP)[^A-Za-z].*" );
+ final private static Pattern TAXOMONY_CODE_PATTERN_PF = Pattern.compile( "([A-Z0-9]{5}|RAT|PIG|PEA|CAP)/\\d+-\\d+" );
final public static PhylogenyParser createParserDependingFileContents( final File file,
final boolean phyloxml_validate_against_xsd )
import org.forester.sdi.SDI.TaxonomyComparisonBase;
import org.forester.sdi.SDIException;
import org.forester.sdi.SDIR;
+import org.forester.util.BasicDescriptiveStatistics;
import org.forester.util.ForesterUtil;
public final class RIO {
private int _samples;
private int _ext_nodes;
private TaxonomyComparisonBase _gsdir_tax_comp_base;
+ private StringBuilder _log;
+ private boolean _produce_log;
- public RIO( final File gene_trees_file, final Phylogeny species_tree, final ALGORITHM algorithm )
- throws IOException, SDIException, RIOException {
- init();
+ public RIO( final File gene_trees_file,
+ final Phylogeny species_tree,
+ final ALGORITHM algorithm,
+ final boolean produce_log ) throws IOException, SDIException, RIOException {
+ init( produce_log );
inferOrthologs( gene_trees_file, species_tree, algorithm );
}
+ private final void init( final boolean produce_log ) {
+ _produce_log = produce_log;
+ _samples = -1;
+ _ext_nodes = -1;
+ _log = null;
+ _gsdir_tax_comp_base = null;
+ _analyzed_gene_trees = null;
+ _removed_gene_tree_nodes = null;
+ }
+
public final Phylogeny[] getAnalyzedGeneTrees() {
return _analyzed_gene_trees;
}
nhx.setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.YES );
}
final Phylogeny[] gene_trees = factory.create( gene_trees_file, p );
- // Removes from species_tree all species not found in gene_tree.
- final List<PhylogenyNode> _removed_species_tree_ext_nodes = PhylogenyMethods
- .taxonomyBasedDeletionOfExternalNodes( gene_trees[ 0 ], species_tree );
- if ( species_tree.isEmpty() ) {
- throw new RIOException( "failed to establish species based mapping between gene and species trees" );
+ if ( algorithm == ALGORITHM.SDIR ) {
+ // Removes from species_tree all species not found in gene_tree.
+ PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gene_trees[ 0 ], species_tree );
+ if ( species_tree.isEmpty() ) {
+ throw new RIOException( "failed to establish species based mapping between gene and species trees" );
+ }
+ }
+ if ( _produce_log ) {
+ _log = new StringBuilder();
}
_analyzed_gene_trees = new Phylogeny[ gene_trees.length ];
int i = 0;
int gene_tree_ext_nodes = 0;
+ if ( _produce_log ) {
+ _log.append( "#" );
+ _log.append( "\t" );
+ _log.append( "with minimal number of duplications" );
+ _log.append( "/" );
+ _log.append( "root placements" );
+ _log.append( "\t[" );
+ _log.append( "min" );
+ _log.append( "-" );
+ _log.append( "max" );
+ _log.append( "]" );
+ _log.append( ForesterUtil.LINE_SEPARATOR );
+ }
for( final Phylogeny gt : gene_trees ) {
if ( algorithm == ALGORITHM.SDIR ) {
// Removes from gene_tree all species not found in species_tree.
setNumberOfSamples( gene_trees.length );
}
- private final void init() {
- _samples = 1;
- _ext_nodes = 0;
- }
-
private final Phylogeny performOrthologInference( final Phylogeny gene_tree,
final Phylogeny species_tree,
final ALGORITHM algorithm,
break;
}
case GSDIR: {
+ // System.out.println( "gene/species tree size before: " + gene_tree.getNumberOfExternalNodes() + "/"
+ // + species_tree.getNumberOfExternalNodes() );
final GSDIR gsdir = new GSDIR( gene_tree, species_tree, true, i == 0 );
+ // System.out.println( "gene/species tree size before: " + gene_tree.getNumberOfExternalNodes() + "/"
+ // + species_tree.getNumberOfExternalNodes() );
assigned_tree = gsdir.getMinDuplicationsSumGeneTrees().get( 0 );
+ if ( i == 0 ) {
+ _removed_gene_tree_nodes = gsdir.getStrippedExternalGeneTreeNodes();
+ }
+ if ( _produce_log ) {
+ final BasicDescriptiveStatistics stats = gsdir.getDuplicationsSumStats();
+ _log.append( i );
+ _log.append( "\t" );
+ _log.append( gsdir.getMinDuplicationsSumGeneTrees().size() );
+ _log.append( "/" );
+ _log.append( stats.getN() );
+ _log.append( "\t[" );
+ _log.append( ( int ) stats.getMin() );
+ _log.append( "-" );
+ _log.append( ( int ) stats.getMax() );
+ _log.append( "]" );
+ _log.append( ForesterUtil.LINE_SEPARATOR );
+ }
_gsdir_tax_comp_base = gsdir.getTaxCompBase();
break;
}
_ext_nodes = i;
}
- private final void setNumberOfSamples( int i ) {
- if ( i < 1 ) {
- i = 1;
- }
+ private final void setNumberOfSamples( final int i ) {
_samples = i;
}
}
return m;
}
+
+ public final TaxonomyComparisonBase getGSDIRtaxCompBase() {
+ return _gsdir_tax_comp_base;
+ }
+
+ public final StringBuilder getLog() {
+ return _log;
+ }
}
}\r
_duplications_sum_stats.addValue( _duplications_sum );\r
}\r
- System.out.println( _duplications_sum_stats.getSummaryAsString() );\r
+ //System.out.println( _duplications_sum_stats.getSummaryAsString() );\r
}\r
\r
public int getMinDuplicationsSum() {\r