import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.SortedMap;
final static private String HELP_OPTION_1 = "help";
final static private String HELP_OPTION_2 = "h";
final static private String SUFFIX_FOR_SPECIES_TREE_USED = "_species_tree_used.xml";
+ final static private String OUTTREE_SUFFIX = "_gsdir.xml"; // appended to every per-file output tree name in directory mode, whichever algorithm is selected
+ final static private String LOGFILE_NAME = "00_gsdi_log.tsv"; // name of the single log file created inside the out-directory
final static private String LOGFILE_SUFFIX = "_gsdi_log.txt";
final static private String REMAPPED_SUFFIX = "_gsdi_remapped.txt";
final static private String PRG_NAME = "gsdi";
- final static private String PRG_VERSION = "1.001";
- final static private String PRG_DATE = "170327";
+ final static private String PRG_VERSION = "1.100"; // written to the directory-mode log preamble as "# Version"
+ final static private String PRG_DATE = "170403"; // written to the directory-mode log preamble as "# Date"
final static private String PRG_DESC = "general speciation duplication inference";
final static private String E_MAIL = "phyloxml@gmail.com";
final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester";
gsdi.print_help();
System.exit( 0 );
}
- else if ( ( args.length < 2 ) || ( cla.getNumberOfNames() != 2 && cla.getNumberOfNames() != 3 ) ) {
+ else if ( ( args.length < 2 ) || ( cla.getNumberOfNames() != 3 ) ) {
System.out.println();
System.out.println( "Wrong number of arguments." );
System.out.println();
}
}
- private static void execute( final CommandLineArguments cla ) throws IOException {
+ private final static void execute( final CommandLineArguments cla ) throws IOException {
ALGORITHM base_algorithm = ALGORITHM.GSDI;
boolean most_parsimonous_duplication_model = false;
boolean allow_stripping_of_gene_tree = false;
File species_tree_file = null;
File out_file = null;
File log_file = null;
+ File out_dir = null;
try {
gene_tree_file = cla.getFile( 0 );
species_tree_file = cla.getFile( 1 );
- if ( cla.getNumberOfNames() == 3 ) {
+ if ( use_gene_tree_dir ) {
+ out_dir = cla.getFile( 2 );
+ if ( out_dir.exists() ) {
+ if ( !out_dir.isDirectory() ) {
+ ForesterUtil
+ .fatalError( gsdi.PRG_NAME,
+ "out-directory [" + out_dir + "] already exists but is not a directory" );
+ }
+ }
+ else {
+ final boolean success = out_dir.mkdirs();
+ if ( !success ) {
+ ForesterUtil.fatalError( gsdi.PRG_NAME, "could not create out-directory [" + out_dir + "]" );
+ }
+ }
+ }
+ else {
out_file = cla.getFile( 2 );
log_file = new File( ForesterUtil.removeSuffix( out_file.toString() ) + LOGFILE_SUFFIX );
}
ForesterUtil.fatalError( PRG_NAME, "error in command line: " + e.getMessage() );
}
if ( use_gene_tree_dir ) {
- final File dir = new File( gene_tree_file.toString() );
- final File gene_tree_files[] = dir.listFiles( new FilenameFilter() {
+ final File indir = new File( gene_tree_file.toString() );
+ if ( !indir.exists() ) {
+ ForesterUtil.fatalError( gsdi.PRG_NAME, "in-directory [" + indir + "] does not exist" );
+ }
+ if ( !indir.isDirectory() ) {
+ ForesterUtil.fatalError( gsdi.PRG_NAME, "in-directory [" + indir + "] is not a directory" );
+ }
+ final String species_tree_file_name = species_tree_file.getName();
+ final File gene_tree_files[] = indir.listFiles( new FilenameFilter() {
@Override
public boolean accept( final File dir, final String name ) {
- return name.endsWith( gene_tree_suffix );
+ return ( ( name.endsWith( gene_tree_suffix ) ) && !( name.equals( species_tree_file_name ) ) );
}
} );
- for( final File file : gene_tree_files ) {
- System.out.println( file );
+ if ( gene_tree_files.length < 1 ) {
+ ForesterUtil.fatalError( gsdi.PRG_NAME,
+ "in-directory [" + indir
+ + "] does not contain any gene tree files with suffix "
+ + gene_tree_suffix );
}
- execute( base_algorithm,
- most_parsimonous_duplication_model,
- allow_stripping_of_gene_tree,
- transfer_taxonomy,
- gene_tree_files,
- species_tree_file );
+ executeDir( base_algorithm,
+ most_parsimonous_duplication_model,
+ allow_stripping_of_gene_tree,
+ transfer_taxonomy,
+ gene_tree_files,
+ species_tree_file,
+ out_dir );
}
else {
execute( base_algorithm,
}
}
- private static void execute( final ALGORITHM base_algorithm,
- final boolean most_parsimonous_duplication_model,
- final boolean allow_stripping_of_gene_tree,
- final boolean transfer_taxonomy,
- final File gene_tree_files[],
- final File species_tree_file )
+ private final static void executeDir( final ALGORITHM base_algorithm, // directory mode: analyzes each gene tree file in turn and keeps one shared tab-separated log in outdir
+ final boolean most_parsimonous_duplication_model, // also decides whether the log header gets a "Spec. or Dup." column
+ final boolean allow_stripping_of_gene_tree, // also decides whether the log header gets a stripped-nodes column
+ final boolean transfer_taxonomy, // passed through unchanged to executeOneTreeInDir
+ final File gene_tree_files[], // sorted in place below before processing
+ final File species_tree_file, // its canonical path is recorded in the log preamble
+ final File outdir ) // receives the log file and every per-file output tree
throws IOException {
+ final File log_file = new File( outdir, LOGFILE_NAME ); // one log for the whole run
+ if ( ForesterUtil.isWritableFile( log_file ) != null ) { // presumably returns an error message, or null when the file can be written -- confirm in ForesterUtil
+ ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isWritableFile( log_file ) );
+ }
+ EasyWriter log_writer = null;
+ try {
+ log_writer = ForesterUtil.createEasyWriter( log_file );
+ }
+ catch ( final IOException e ) {
+ ForesterUtil.fatalError( gsdi.PRG_NAME, "Failed to create [" + log_file + "]: " + e.getMessage() );
+ }
+ log_writer.println( "# " + PRG_NAME ); // "#"-prefixed preamble: program, versions, inputs and settings
+ log_writer.println( "# Version\t" + PRG_VERSION );
+ log_writer.println( "# Date\t" + PRG_DATE );
+ log_writer.println( "# Forester version\t" + ForesterConstants.FORESTER_VERSION );
+ log_writer.println( "# Species tree\t" + species_tree_file.getCanonicalPath() );
+ if ( base_algorithm == ALGORITHM.GSDI ) { // only GSDI and GSDIR get an "# Algorithm" line; any other value writes none
+ log_writer.println( "# Algorithm\tGSDI" );
+ }
+ else if ( base_algorithm == ALGORITHM.GSDIR ) {
+ log_writer.println( "# Algorithm\tGSDIR" );
+ }
+ log_writer.println( "# Use most parsimonous duplication model\t" + most_parsimonous_duplication_model );
+ log_writer.println( "# Allow stripping of gene tree nodes\t" + allow_stripping_of_gene_tree );
+ log_writer.println( "# Start time\t" + new SimpleDateFormat( "yyyyMMdd HH:mm:ss" ).format( new Date() ) );
+ log_writer.println();
+ log_writer.print( "Gene-tree file\t" ); // column header row, tab-separated; executeOneTreeInDir appends one row per analyzed gene tree
+ log_writer.print( "Gene-tree name/#\t" );
+ log_writer.print( "Ext. nodes\t" );
+ log_writer.print( "Speciations\t" );
+ log_writer.print( "Duplications\t" );
+ if ( !most_parsimonous_duplication_model ) { // NOTE(review): header column is emitted for every algorithm, but executeOneTreeInDir writes this value only for GSDI -- confirm GSDIR cannot run with this flag false, otherwise columns shift
+ log_writer.print( "Spec. or Dup.\t" );
+ }
+ if ( allow_stripping_of_gene_tree ) {
+ log_writer.print( "Stripped gene-tree ext. nodes\t" );
+ }
+ log_writer.print( "Taxonomy mapping" );
+ log_writer.println();
+ int counter = 0; // running total of gene trees analyzed across all files
+ Arrays.sort( gene_tree_files ); // process files in File's natural (pathname) order
for( final File gene_tree_file : gene_tree_files ) {
- if (gene_tree_file.getName().equals( species_tree_file.getName() )) {
- continue;
+ String outname = gene_tree_file.getName(); // output name = input name with its last extension replaced by OUTTREE_SUFFIX
+ if ( outname.indexOf( "." ) > 0 ) { // names with no dot, or whose first dot is at index 0, keep their full name
+ outname = outname.substring( 0, outname.lastIndexOf( "." ) );
}
- execute( base_algorithm,
- most_parsimonous_duplication_model,
- allow_stripping_of_gene_tree,
- transfer_taxonomy,
- gene_tree_file,
- species_tree_file,
- new File( gene_tree_file + "gsdi" ),
- new File( gene_tree_file + "gsdi_log" ) );
+ outname = outname + OUTTREE_SUFFIX;
+ counter += executeOneTreeInDir( base_algorithm, // returns the number of gene trees analyzed in this file
+ most_parsimonous_duplication_model,
+ allow_stripping_of_gene_tree,
+ transfer_taxonomy,
+ gene_tree_file,
+ species_tree_file,
+ new File( outdir, outname ),
+ log_writer );
+ log_writer.flush(); // flush after every file so the log on disk keeps up with progress
+ System.out.print( "\r" + counter ); // carriage return rewrites the same console line with the running count
+ }
+ System.out.print( "\r" );
+ log_writer.close();
+ System.out.println( "Analyzed " + counter + " gene trees" );
+ System.out.println();
+ System.out.println( "Wrote log to: " + log_file.getCanonicalPath() );
+ System.out.println();
+ }
+
+ private final static int executeOneTreeInDir( final ALGORITHM base_algorithm, // analyzes every usable gene tree found in one file; returns how many were analyzed
+ final boolean most_parsimonous_duplication_model,
+ final boolean allow_stripping_of_gene_tree,
+ final boolean transfer_taxonomy,
+ final File gene_tree_file,
+ final File species_tree_file,
+ final File out_file, // written only when at least one gene tree was analyzed
+ final EasyWriter log_writer ) // shared log; one row is appended per analyzed gene tree
+ throws IOException {
+ if ( ForesterUtil.isReadableFile( gene_tree_file ) != null ) { // presumably isReadableFile/isWritableFile return an error message, or null when OK -- confirm in ForesterUtil
+ ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isReadableFile( gene_tree_file ) );
+ }
+ if ( ForesterUtil.isReadableFile( species_tree_file ) != null ) {
+ ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isReadableFile( species_tree_file ) );
+ }
+ if ( ForesterUtil.isWritableFile( out_file ) != null ) {
+ ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isWritableFile( out_file ) );
+ }
+ Phylogeny gene_trees[] = null;
+ try {
+ final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
+ gene_trees = factory.create( gene_tree_file, PhyloXmlParser.createPhyloXmlParserXsdValidating() ); // one file may yield several trees
}
+ catch ( final IOException e ) {
+ fatalError( "error",
+ "failed to read gene tree from [" + gene_tree_file + "]: " + e.getMessage(),
+ log_writer );
+ }
+ int counter = 0; // gene trees analyzed so far in this file; doubles as the tree index printed in the log
+ final List<Phylogeny> out_trees = new ArrayList<Phylogeny>();
+ for( final Phylogeny gene_tree : gene_trees ) { // NOTE(review): gene_trees is still null here if fatalError() above ever returns -- presumably it terminates the JVM; confirm
+ if ( !gene_tree.isEmpty() && gene_tree.getNumberOfExternalNodes() > 1 ) { // empty trees and trees with fewer than two external nodes are skipped: no log row, not counted
+ Phylogeny species_tree = null;
+ try {
+ species_tree = SDIutil.parseSpeciesTree( gene_tree, // the species tree file is parsed again for every gene tree
+ species_tree_file,
+ REPLACE_UNDERSCORES_IN_NH_SPECIES_TREE,
+ true,
+ TAXONOMY_EXTRACTION.NO );
+ }
+ catch ( final PhyloXmlDataFormatException e ) {
+ fatalError( "user error",
+ "failed to transfer general node name, in [" + species_tree_file + "]: "
+ + e.getMessage(),
+ log_writer );
+ }
+ catch ( final SDIException e ) {
+ fatalError( "user error", e.getMessage(), log_writer );
+ }
+ catch ( final IOException e ) {
+ fatalError( "error",
+ "Failed to read species tree from [" + species_tree_file + "]: " + e.getMessage(),
+ log_writer );
+ }
+ gene_tree.setRooted( true ); // both trees are marked rooted before the analysis
+ species_tree.setRooted( true );
+ if ( !gene_tree.isCompletelyBinary() ) {
+ fatalError( "user error",
+ "gene tree [" + gene_tree_file + "] is not completely binary",
+ log_writer );
+ }
+ if ( base_algorithm == ALGORITHM.SDI ) { // SDI additionally requires a completely binary species tree
+ if ( !species_tree.isCompletelyBinary() ) {
+ fatalError( "user error",
+ "species tree is not completely binary, use GSDI or GSDIR instead",
+ log_writer );
+ }
+ }
+ log_writer.print( gene_tree_file.getName() ); // start of this tree's log row: file name, then tree name/index
+ log_writer.print( "\t" );
+ log_writer.print( ( ForesterUtil.isEmpty( gene_tree.getName() ) ? "" : gene_tree.getName() ) );
+ if ( gene_trees.length > 1 ) { // several trees in one file: append the index, ":"-separated when the tree also has a name
+ log_writer.print( ( ForesterUtil.isEmpty( gene_tree.getName() ) ? Integer.toString( counter )
+ : ( ":" + Integer.toString( counter ) ) ) );
+ }
+ log_writer.print( "\t" );
+ GSDII gsdii = null;
+ try {
+ if ( base_algorithm == ALGORITHM.GSDI ) { // NOTE(review): only GSDI and GSDIR construct gsdii; for any other algorithm (e.g. SDI, checked above) it stays null and is dereferenced below -- confirm SDI cannot reach this method
+ gsdii = new GSDI( gene_tree,
+ species_tree,
+ most_parsimonous_duplication_model,
+ allow_stripping_of_gene_tree,
+ true,
+ transfer_taxonomy );
+ }
+ else if ( base_algorithm == ALGORITHM.GSDIR ) {
+ gsdii = new GSDIR( gene_tree,
+ species_tree,
+ allow_stripping_of_gene_tree,
+ true,
+ transfer_taxonomy );
+ }
+ }
+ catch ( final SDIException e ) {
+ fatalError( "user error", e.getLocalizedMessage(), log_writer );
+ }
+ catch ( final OutOfMemoryError e ) { // handled separately: it is an Error, so the catch of Exception below would not see it
+ ForesterUtil.outOfMemoryError( e );
+ }
+ catch ( final Exception e ) {
+ e.printStackTrace();
+ fatalError( "unexpected error", e.toString(), log_writer );
+ }
+ if ( base_algorithm == ALGORITHM.GSDIR ) { // GSDIR outputs the tree from getMinDuplicationsSumGeneTree(); otherwise the analyzed gene_tree itself is output
+ final Phylogeny gt = ( ( GSDIR ) gsdii ).getMinDuplicationsSumGeneTree();
+ gt.setRerootable( false );
+ out_trees.add( gt );
+ }
+ else {
+ gene_tree.setRerootable( false );
+ out_trees.add( gene_tree );
+ }
+ log_writer.print( gene_tree.getNumberOfExternalNodes() + "\t" ); // remaining row columns: ext. nodes, speciations, duplications, optional columns, taxonomy mapping
+ log_writer.print( gsdii.getSpeciationsSum() + "\t" );
+ if ( ( base_algorithm == ALGORITHM.GSDIR ) ) { // GSDIR row: duplications only, never a "Spec. or Dup." value
+ final GSDIR gsdir = ( GSDIR ) gsdii;
+ log_writer.print( gsdir.getMinDuplicationsSum() + "\t" );
+ }
+ else if ( ( base_algorithm == ALGORITHM.GSDI ) ) {
+ final GSDI gsdi = ( GSDI ) gsdii;
+ log_writer.print( gsdi.getDuplicationsSum() + "\t" );
+ if ( !most_parsimonous_duplication_model ) {
+ log_writer.print( gsdi.getSpeciationOrDuplicationEventsSum() + "\t" );
+ }
+ }
+ if ( allow_stripping_of_gene_tree ) {
+ log_writer.print( gsdii.getStrippedExternalGeneTreeNodes().size() + "\t" );
+ }
+ log_writer.print( gsdii.getTaxCompBase().toString() );
+ log_writer.println();
+ ++counter;
+ }
+ }
+ if ( counter > 0 ) { // no output file is produced when nothing was analyzed
+ try {
+ final PhylogenyWriter writer = new PhylogenyWriter();
+ writer.toPhyloXML( out_file, out_trees, 0, ForesterUtil.LINE_SEPARATOR ); // all result trees from this input file go into the one out_file
+ }
+ catch ( final IOException e ) {
+ ForesterUtil
+ .fatalError( PRG_NAME, // NOTE(review): calls ForesterUtil.fatalError directly, unlike the other failures in this method, which go through fatalError( type, msg, log_writer )
+ "Failed to write to [" + out_file.getCanonicalPath() + "]: " + e.getMessage() );
+ }
+ }
+ return counter;
}
- private static void execute( final ALGORITHM base_algorithm,
- final boolean most_parsimonous_duplication_model,
- final boolean allow_stripping_of_gene_tree,
- final boolean transfer_taxonomy,
- final File gene_tree_file,
- final File species_tree_file,
- final File out_file,
- final File log_file )
+ private final static void execute( final ALGORITHM base_algorithm,
+ final boolean most_parsimonous_duplication_model,
+ final boolean allow_stripping_of_gene_tree,
+ final boolean transfer_taxonomy,
+ final File gene_tree_file,
+ final File species_tree_file,
+ final File out_file,
+ final File log_file )
throws IOException {
if ( ForesterUtil.isReadableFile( gene_tree_file ) != null ) {
ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isReadableFile( gene_tree_file ) );
try {
final PhylogenyWriter writer = new PhylogenyWriter();
if ( base_algorithm == ALGORITHM.GSDIR ) {
- writer.toPhyloXML( out_file, ( ( GSDIR ) gsdii ).getMinDuplicationsSumGeneTree(), 0 );
+ final Phylogeny gt = ( ( GSDIR ) gsdii ).getMinDuplicationsSumGeneTree();
+ gt.setRerootable( false );
+ writer.toPhyloXML( out_file, gt, 0 );
}
else {
+ gene_tree.setRerootable( false );
writer.toPhyloXML( out_file, gene_tree, 0 );
}
}
log_writer.close();
}
- private static void fatalError( final String type, final String msg, final EasyWriter log_writer ) {
+ private final static void fatalError( final String type, final String msg, final EasyWriter log_writer ) {
try {
log_writer.flush();
log_writer.println();
ForesterUtil.fatalError( gsdi.PRG_NAME, msg );
}
- private static void print_help() {
+ private final static void print_help() {
System.out.println( "Usage: " + PRG_NAME
- + " [-options] <gene tree in phyloXML format, or directory with gene trees> <species tree> [outfile]" );
+ + " [-options] <gene tree file, or gene trees in-directory> <species tree> <outfile, or out-directory>" );
System.out.println();
System.out.println( "Options:" );
System.out.println( " -" + ALLOW_STRIPPING_OF_GENE_TREE_OPTION
System.out.println();
System.out.println( "Examples: gsdi -" + ALLOW_STRIPPING_OF_GENE_TREE_OPTION
+ " gene_tree.xml tree_of_life.xml out.xml" );
- System.out.println( " gsdi -" + SUFFIX_FOR_DIR_OPTION + " -" + SUFFIX_FOR_DIR_OPTION + "=.xml"
- + " gene_tree_dir tree_of_life.xml" );
+ System.out.println( " gsdi -" + ALLOW_STRIPPING_OF_GENE_TREE_OPTION + " -" + SUFFIX_FOR_DIR_OPTION
+ + "=.xml" + " gene_tree_dir tree_of_life.xml out_dir" );
+ System.out.println( " gsdi -" + ALLOW_STRIPPING_OF_GENE_TREE_OPTION + " -" + MOST_PARSIMONIOUS_OPTION
+ + " -" + GSDIR_OPTION + " -" + TRANSFER_TAXONOMY_OPTION + " -" + SUFFIX_FOR_DIR_OPTION + "=.xml"
+ + " gene_tree_dir tree_of_life.xml out_dir" );
System.out.println();
}
- private static void printMappedNodesToLog( final EasyWriter log_writer, final GSDII gsdi ) throws IOException {
+ private final static void printMappedNodesToLog( final EasyWriter log_writer, final GSDII gsdi )
+ throws IOException {
final SortedSet<String> ss = new TreeSet<String>();
for( final PhylogenyNode n : gsdi.getMappedExternalSpeciesTreeNodes() ) {
ss.add( n.toString() );
}
}
- private static void printStrippedGeneTreeNodesToLog( final EasyWriter log_writer, final GSDII gsdi )
+ private final static void printStrippedGeneTreeNodesToLog( final EasyWriter log_writer, final GSDII gsdi )
throws IOException {
final SortedMap<String, Integer> sm = new TreeMap<String, Integer>();
for( final PhylogenyNode n : gsdi.getStrippedExternalGeneTreeNodes() ) {
}
}
- private static void writeToRemappedFile( final File out_file,
- final SortedSet<String> remapped,
- final EasyWriter log_writer )
+ private final static void writeToRemappedFile( final File out_file,
+ final SortedSet<String> remapped,
+ final EasyWriter log_writer )
throws IOException {
final File file = new File( ForesterUtil.removeSuffix( out_file.toString() ) + REMAPPED_SUFFIX );
final EasyWriter remapped_writer = ForesterUtil.createEasyWriter( file );