From: cmzmasek@gmail.com Date: Wed, 9 Feb 2011 01:06:02 +0000 (+0000) Subject: initial commit X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=493e40b0c936b65da342134da37e8b856b9b80af;p=jalview.git initial commit --- diff --git a/forester/java/src/org/forester/application/confadd.java b/forester/java/src/org/forester/application/confadd.java new file mode 100644 index 0000000..efccace --- /dev/null +++ b/forester/java/src/org/forester/application/confadd.java @@ -0,0 +1,265 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.application; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.forester.io.writers.PhylogenyWriter; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.tools.ConfidenceAssessor; +import org.forester.util.CommandLineArguments; +import org.forester.util.ForesterUtil; + +public class confadd { + + final static private String HELP_OPTION_1 = "help"; + final static private String HELP_OPTION_2 = "h"; + final static private String FIRST_OPTION = "f"; + final static private String LAST_OPTION = "l"; + final static private String STRICT_OPTION = "s"; + final static private String NORMALIZE_OPTION = "n"; + final static private String PRG_NAME = "confadd"; + final static private String PRG_VERSION = "1.01"; + final static private String PRG_DATE = "2010.10.26"; + final static private String E_MAIL = "czmasek@burnham.org"; + final static private String WWW = "www.phylosoft.org/forester/"; + + public static void main( final String args[] ) { + ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE, E_MAIL, WWW ); + CommandLineArguments cla = null; + try { + cla = new CommandLineArguments( args ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); + } + if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) || ( args.length == 0 ) ) { + printHelp(); + System.exit( 0 ); + } + if ( args.length < 4 ) { + System.out.println(); + System.out.println( "[" + PRG_NAME + "] incorrect number of arguments" ); + System.out.println(); + printHelp(); + System.exit( -1 ); + } + if ( 
cla.getNumberOfNames() != 4 ) { + System.out.println(); + System.out.println( "[" + PRG_NAME + "] incorrect number of arguments" ); + System.out.println(); + printHelp(); + System.exit( -1 ); + } + final List allowed_options = new ArrayList(); + allowed_options.add( FIRST_OPTION ); + allowed_options.add( LAST_OPTION ); + allowed_options.add( STRICT_OPTION ); + allowed_options.add( NORMALIZE_OPTION ); + final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); + if ( dissallowed_options.length() > 0 ) { + ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options ); + } + final String confidence_type = cla.getName( 0 ); + final File target_file = cla.getFile( 1 ); + final File evaluators_file = cla.getFile( 2 ); + final File outfile = cla.getFile( 3 ); + if ( ForesterUtil.isEmpty( confidence_type ) ) { + ForesterUtil.fatalError( PRG_NAME, "attempt to use empty confidence type" ); + } + if ( outfile.exists() ) { + ForesterUtil.fatalError( PRG_NAME, "[" + outfile + "] already exists" ); + } + if ( !target_file.exists() ) { + ForesterUtil.fatalError( PRG_NAME, "target [" + target_file + "] does not exist" ); + } + if ( !evaluators_file.exists() ) { + ForesterUtil.fatalError( PRG_NAME, "evaluators [" + evaluators_file + "] does not exist" ); + } + boolean strict = false; + int first = 0; + int last = 0; + double norm = 0; + try { + if ( cla.isOptionSet( STRICT_OPTION ) ) { + if ( cla.isOptionHasAValue( STRICT_OPTION ) ) { + ForesterUtil.fatalError( PRG_NAME, "no value allowed for -" + STRICT_OPTION + " allowed" ); + } + strict = true; + } + if ( cla.isOptionSet( FIRST_OPTION ) ) { + first = cla.getOptionValueAsInt( FIRST_OPTION ); + } + if ( cla.isOptionSet( LAST_OPTION ) ) { + last = cla.getOptionValueAsInt( LAST_OPTION ); + } + if ( cla.isOptionSet( NORMALIZE_OPTION ) ) { + norm = cla.getOptionValueAsDouble( NORMALIZE_OPTION ); + } + } + catch ( final Exception e ) { + ForesterUtil.fatalError( PRG_NAME, "error in 
command line: " + e.getLocalizedMessage() ); + } + if ( ( first < 0 ) || ( last < 0 ) ) { + ForesterUtil + .fatalError( PRG_NAME, + "attempt to set first or last evaluator topology to use to a number less than zero" ); + } + if ( norm < 0 ) { + ForesterUtil.fatalError( PRG_NAME, "illegal value for normalizer [" + norm + "]" ); + } + Phylogeny[] targets = null; + Phylogeny[] evaluators = null; + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + try { + targets = factory.create( target_file, ForesterUtil.createParserDependingOnFileType( target_file, true ) ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( PRG_NAME, "failed to read target phylogenies from [" + target_file + "]: " + + e.getLocalizedMessage() ); + } + int counter = 0; + for( final Phylogeny target : targets ) { + try { + checkUniquenessOfExternalNodes( target, "target " + counter ); + } + catch ( final IllegalArgumentException e ) { + ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() ); + } + counter++; + } + if ( targets.length == 1 ) { + ForesterUtil.programMessage( PRG_NAME, "read in one target" ); + } + else { + ForesterUtil.programMessage( PRG_NAME, "read in a total of " + targets.length + " targets" ); + } + try { + evaluators = factory.create( evaluators_file, ForesterUtil + .createParserDependingOnFileType( evaluators_file, true ) ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( PRG_NAME, "failed to read evaluator topologies from [" + evaluators_file + "]: " + + e.getLocalizedMessage() ); + } + counter = 0; + for( final Phylogeny evaluator : evaluators ) { + try { + checkUniquenessOfExternalNodes( evaluator, "evaluator " + counter ); + } + catch ( final IllegalArgumentException e ) { + ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() ); + } + counter++; + } + ForesterUtil.programMessage( PRG_NAME, "read in a total of " + evaluators.length + " evaluator topologies" ); + System.gc(); + if ( last == 0 ) { + last = 
evaluators.length - 1; + } + if ( ( last >= evaluators.length ) || ( last <= first ) ) { + ForesterUtil.fatalError( PRG_NAME, "illegal value for first or last evaluator topology to use [" + first + + ", " + last + "]" ); + } + double value = 1; + if ( norm > 0 ) { + value = norm / ( 1 + last - first ); + } + ForesterUtil.programMessage( PRG_NAME, "first topology to use: " + first ); + String is_last = ""; + if ( last == ( evaluators.length - 1 ) ) { + is_last = " (corresponds to last topology in file)"; + } + ForesterUtil.programMessage( PRG_NAME, "last topology to use : " + last + is_last ); + ForesterUtil.programMessage( PRG_NAME, "sum of topologies used as evaluators: " + ( last - first + 1 ) ); + if ( norm > 0 ) { + ForesterUtil.programMessage( PRG_NAME, "normalizer: " + norm + " (" + ForesterUtil.round( value, 6 ) + ")" ); + } + else { + ForesterUtil.programMessage( PRG_NAME, "normalizer: n/a" ); + } + ForesterUtil.programMessage( PRG_NAME, "strict: " + strict ); + for( final Phylogeny target : targets ) { + try { + ConfidenceAssessor.evaluate( confidence_type, evaluators, target, strict, value, first, last ); + } + catch ( final IllegalArgumentException e ) { + ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() ); + } + } + try { + final PhylogenyWriter writer = new PhylogenyWriter(); + writer.toPhyloXML( targets, 0, outfile, ForesterUtil.LINE_SEPARATOR ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( PRG_NAME, "failed to write to [" + outfile + "]: " + e.getLocalizedMessage() ); + } + ForesterUtil.programMessage( PRG_NAME, "wrote output to: [" + outfile + "]" ); + ForesterUtil.programMessage( PRG_NAME, "OK" ); + System.out.println(); + } + + private static void printHelp() { + System.out.println( "Usage:" ); + System.out.println(); + System.out.println( PRG_NAME + + " [options] " ); + System.out.println(); + System.out.println( "options:" ); + System.out.println(); + System.out.println( " -" + STRICT_OPTION + + " : strict [default: 
non-strict]: all nodes between 'target' and 'evaluators' must match" ); + System.out.println( " -" + NORMALIZE_OPTION + + "=: normalize to this value (e.g. 100 for most bootstrap analyses) [default: no normalization]" ); + System.out.println( " -" + FIRST_OPTION + "=: first evaluator topology to use (0-based) [default: 0]" ); + System.out.println( " -" + LAST_OPTION + + "=: last evaluator topology to use (0-based) [default: use all until final topology]" ); + System.out.println(); + } + + private static void checkUniquenessOfExternalNodes( final Phylogeny phy, final String msg ) + throws IllegalArgumentException { + final Set ext_nodes = new HashSet( phy.getNumberOfExternalNodes() ); + for( final PhylogenyNodeIterator it = phy.iteratorExternalForward(); it.hasNext(); ) { + final PhylogenyNode node = it.next(); + if ( ext_nodes.contains( node ) ) { + throw new IllegalArgumentException( "external node [" + node.toString() + "] of " + msg + + " is not unique" ); + } + ext_nodes.add( node ); + } + } +} diff --git a/forester/java/src/org/forester/application/count_support.java b/forester/java/src/org/forester/application/count_support.java new file mode 100644 index 0000000..67dfc1b --- /dev/null +++ b/forester/java/src/org/forester/application/count_support.java @@ -0,0 +1,262 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. 
+// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.application; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.forester.io.parsers.PhylogenyParser; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.Confidence; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.tools.SupportCount; +import org.forester.util.CommandLineArguments; +import org.forester.util.ForesterUtil; +
/**
 * Command line program which reads one phylogeny and a set of evaluator
 * phylogenies, lets {@link SupportCount} count support on the phylogeny, and
 * writes the phylogeny (as phyloXML) and, optionally, the evaluator
 * phylogenies (as New Hampshire / NHX) to files.
 * <p>
 * Expected names on the command line (in this order): phylogeny infile,
 * evaluators infile, phylogeny outfile and, optionally, an outfile for the
 * evaluator phylogenies. Options: -s (strip), -t (similarity threshold),
 * -n (no branch lengths in the evaluators outfile).
 */
public class count_support {

    final static private String  PRG_NAME                = "count_support";
    final static private String  PRG_VERSION             = "1.0";
    final static private String  PRG_DATE                = "2008.03.04";
    private final static boolean WRITE_EVALUATORS_AS_NHX = false;

    /**
     * Program entry point.
     *
     * @param args the command line: up to three options plus three or four file names
     */
    public static void main( final String args[] ) {
        ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE );
        if ( ( args.length < 3 ) || ( args.length > 7 ) ) {
            exitWithUsage();
        }
        CommandLineArguments cla = null;
        try {
            cla = new CommandLineArguments( args );
        }
        catch ( final Exception e ) {
            // NOTE(review): the code below relies on fatalError() not returning
            // (cla would still be null otherwise) -- confirm in ForesterUtil.
            ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
        }
        final List<String> allowed_options = new ArrayList<String>();
        allowed_options.add( "s" );
        allowed_options.add( "t" );
        allowed_options.add( "n" );
        final String disallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
        if ( disallowed_options.length() > 0 ) {
            ForesterUtil.fatalError( PRG_NAME, "Unknown option(s): " + disallowed_options );
        }
        // args.length counts options as well, so it does not guarantee that the
        // three mandatory file names are present; check the names themselves
        // before indexing into them.
        final int number_of_names = cla.getNumberOfNames();
        if ( ( number_of_names < 3 ) || ( number_of_names > 4 ) ) {
            exitWithUsage();
        }
        final File phylogeny_infile = cla.getFile( 0 );
        final File evaluators_infile = cla.getFile( 1 );
        final File phylogeny_outfile = cla.getFile( 2 );
        File evaluators_outfile = null;
        final boolean branch_lengths_in_ev_out = !cla.isOptionSet( "n" );
        if ( number_of_names == 4 ) {
            evaluators_outfile = cla.getFile( 3 );
        }
        else if ( !branch_lengths_in_ev_out ) {
            ForesterUtil.fatalError( PRG_NAME, "Cannot use -n option if no outfile for evaluators specified" );
        }
        Phylogeny p = null;
        Phylogeny[] ev = null;
        try {
            final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
            final PhylogenyParser pp = ForesterUtil.createParserDependingOnFileType( phylogeny_infile, true );
            // Only the first phylogeny in the file is evaluated.
            p = factory.create( phylogeny_infile, pp )[ 0 ];
        }
        catch ( final Exception e ) {
            ForesterUtil.fatalError( PRG_NAME, "Could not read \"" + phylogeny_infile + "\" [" + e.getMessage()
                    + "]" );
        }
        try {
            final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
            final PhylogenyParser pp = ForesterUtil.createParserDependingOnFileType( evaluators_infile, true );
            ev = factory.create( evaluators_infile, pp );
        }
        catch ( final Exception e ) {
            ForesterUtil.fatalError( PRG_NAME, "Could not read \"" + evaluators_infile + "\" [" + e.getMessage()
                    + "]" );
        }
        final boolean strip = cla.isOptionSet( "s" );
        // A negative threshold means "no threshold given".
        double threshold = -1.0;
        if ( cla.isOptionSet( "t" ) ) {
            try {
                threshold = cla.getOptionValueAsDouble( "t" );
            }
            catch ( final Exception e ) {
                ForesterUtil.fatalError( PRG_NAME, "error in command line arguments: " + e.getMessage() );
            }
            if ( ( threshold < 0 ) || ( threshold > 1.0 ) ) {
                ForesterUtil.fatalError( PRG_NAME, "support threshold has to be between 0.0 and 1.0 (inclusive)" );
            }
        }
        List<Phylogeny> evaluator_phylogenies_above_threshold = null;
        try {
            if ( threshold >= 0 ) {
                evaluator_phylogenies_above_threshold = SupportCount.count( p, ev, strip, threshold, true );
                if ( evaluator_phylogenies_above_threshold.size() < 1 ) {
                    ForesterUtil.fatalError( PRG_NAME, "appears like threshold for similarity is set too high" );
                }
            }
            else {
                SupportCount.count( p, ev, strip, true );
            }
        }
        catch ( final Exception e ) {
            ForesterUtil.fatalError( PRG_NAME, "Failure during support counting: " + e.getMessage() );
        }
        if ( threshold >= 0 ) {
            normalizeSupport( p, 100, evaluator_phylogenies_above_threshold.size() );
            System.out.println( evaluator_phylogenies_above_threshold.size() + " out of " + ev.length
                    + " evaluator phylogenies are above threshold of " + threshold );
        }
        try {
            final PhylogenyWriter w = new PhylogenyWriter();
            w.toPhyloXML( phylogeny_outfile, p, 1 );
        }
        catch ( final IOException e ) {
            ForesterUtil.fatalError( PRG_NAME, "Failure to write output [" + e.getMessage() + "]" );
        }
        System.out.println();
        System.out.println( "Wrote phylogeny with support values to: " + phylogeny_outfile );
        if ( evaluators_outfile != null ) {
            // With a threshold only the evaluators above it are written, otherwise all of them.
            final List<Phylogeny> evaluators_to_write;
            if ( evaluator_phylogenies_above_threshold != null ) {
                evaluators_to_write = evaluator_phylogenies_above_threshold;
                System.out.println( "Writing " + evaluators_to_write.size()
                        + " evaluator phylogenies above threshold of " + threshold + " to: " + evaluators_outfile );
            }
            else {
                evaluators_to_write = Arrays.asList( ev );
                System.out.println( "Writing " + ev.length + " evaluator phylogenies to :" + evaluators_outfile );
            }
            try {
                final PhylogenyWriter w = new PhylogenyWriter();
                final String separator = ";" + ForesterUtil.getLineSeparator();
                if ( WRITE_EVALUATORS_AS_NHX ) {
                    w.toNewHampshireX( evaluators_to_write, evaluators_outfile, separator );
                }
                else {
                    w.toNewHampshire( evaluators_to_write,
                                      true,
                                      branch_lengths_in_ev_out,
                                      evaluators_outfile,
                                      separator );
                }
            }
            catch ( final IOException e ) {
                ForesterUtil.fatalError( PRG_NAME, "Failure to write output [" + e.getMessage() + "]" );
            }
        }
        System.out.println();
        System.out.println( "Done." );
        System.out.println();
    }

    /** Reports a wrong number of arguments, prints the usage text and exits with status -1. */
    private static void exitWithUsage() {
        System.out.println();
        System.out.println( PRG_NAME + ": wrong number of arguments" );
        System.out.println();
        // The file names below mirror, in order, what main() reads via getFile(0..3).
        System.out
                .println( "Usage: \"count_support [options] <file containing phylogeny to be evaluated> "
                        + "<file with phylogenies to be used for evaluation> <outfile> "
                        + "[outfile for evaluator phylogenies, "
                        + "always unstripped if -t=<d> option is used, otherwise strippedness is dependent on -s option]\"\n" );
        System.out
                .println( " Options: -s     strip external nodes from evaluator phylogenies not found in phylogeny to be evaluated" );
        System.out.println( "        : -t=<d> threshold for similarity (0.0 to 1.0)" );
        System.out.println( "        : -n     no branch lengths in outfile for evaluator phylogenies" );
        System.out.println();
        System.exit( -1 );
    }

    /**
     * Rescales the confidence value of every internal, non-root node by
     * {@code normalized_max / number_phylos} and stores the result as bootstrap
     * confidence; root and external nodes are set to
     * {@link Confidence#CONFIDENCE_DEFAULT_VALUE}. Maximum, minimum and average
     * are printed before and after rescaling.
     *
     * @param p              the phylogeny whose support values are rescaled in place
     * @param normalized_max the value which a support of {@code number_phylos} is mapped to
     * @param number_phylos  the number of phylogenies used in the support analysis, must be positive
     * @throws IllegalArgumentException if {@code number_phylos} is less than one
     */
    private static void normalizeSupport( final Phylogeny p, final double normalized_max, final int number_phylos ) {
        if ( number_phylos < 1 ) {
            // Would otherwise scale every support value by infinity or a negative factor.
            throw new IllegalArgumentException( "number of phylogenies must be positive [" + number_phylos + "]" );
        }
        double min = Double.MAX_VALUE;
        double max = -Double.MAX_VALUE;
        double sum = 0.0;
        int n = 0;
        for( final PhylogenyNodeIterator iter = p.iteratorPostorder(); iter.hasNext(); ) {
            final PhylogenyNode node = iter.next();
            if ( !node.isRoot() && !node.isExternal() ) {
                final double b = PhylogenyMethods.getConfidenceValue( node );
                if ( b > max ) {
                    max = b;
                }
                // NOTE(review): negative values are kept out of the minimum but
                // still enter sum and average -- confirm that this is intended.
                if ( ( b >= 0 ) && ( b < min ) ) {
                    min = b;
                }
                sum += b;
                ++n;
            }
        }
        printSupportSummary( "before", max, min, sum, n );
        System.out.println( "Normalizing so that theoretical maximum support value is: " + normalized_max );
        System.out.println( "Number of phylogenies used in support analysis: " + number_phylos );
        final double f = normalized_max / number_phylos;
        min = Double.MAX_VALUE;
        max = -Double.MAX_VALUE;
        sum = 0.0;
        n = 0;
        for( final PhylogenyNodeIterator iter = p.iteratorPostorder(); iter.hasNext(); ) {
            final PhylogenyNode node = iter.next();
            if ( node.isRoot() || node.isExternal() ) {
                PhylogenyMethods.setBootstrapConfidence( node, Confidence.CONFIDENCE_DEFAULT_VALUE );
            }
            else {
                final double b = f * PhylogenyMethods.getConfidenceValue( node );
                PhylogenyMethods.setBootstrapConfidence( node, b );
                if ( b > max ) {
                    max = b;
                }
                if ( ( b >= 0 ) && ( b < min ) ) {
                    min = b;
                }
                sum += b;
                ++n;
            }
        }
        printSupportSummary( "after", max, min, sum, n );
    }

    /** Prints maximum, minimum and average support; {@code when} is "before" or "after". */
    private static void printSupportSummary( final String when,
                                             final double max,
                                             final double min,
                                             final double sum,
                                             final int n ) {
        // With n == 0 the average is NaN (floating point division), which is printed as such.
        final double av = sum / n;
        System.out.println( "Max support " + when + " normalization is : " + max );
        System.out.println( "Min support " + when + " normalization is : " + min );
        System.out.println( "Average support " + when + " normalization is: " + av + " (=" + sum + "/" + n + ")" );
    }
}
diff --git 
a/forester/java/src/org/forester/application/decorator.java b/forester/java/src/org/forester/application/decorator.java new file mode 100644 index 0000000..c1a2786 --- /dev/null +++ b/forester/java/src/org/forester/application/decorator.java @@ -0,0 +1,384 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.application; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import org.forester.io.parsers.PhylogenyParser; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.data.Identifier; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.tools.PhylogenyDecorator; +import org.forester.tools.PhylogenyDecorator.FIELD; +import org.forester.util.BasicTable; +import org.forester.util.BasicTableParser; +import org.forester.util.CommandLineArguments; +import org.forester.util.ForesterUtil; + +public final class decorator { + + private static final String SEQUENCE_NAME_FIELD = "s"; + private static final String TAXONOMY_CODE_FIELD = "c"; + private static final String TAXONOMY_SCIENTIFIC_NAME_FIELD = "sn"; + private static final String DS_FILED = "d"; + private static final String SEQUENCE_ANNOTATION_DESC = "a"; + private static final String NODE_NAME_FIELD = "n"; + final static private String PICKY_OPTION = "p"; + final static private String FIELD_OPTION = "f"; + final static private String MOVE_DOMAIN_NUMBER_OPTION = "mdn"; // Hidden expert option. 
+ final static private String TREE_NAME_OPTION = "pn"; + final static private String TREE_ID_OPTION = "pi"; + final static private String TREE_DESC_OPTION = "pd"; + final static private String EXTRACT_BRACKETED_SCIENTIC_NAME_OPTION = "sn"; + final static private String PROCESS_NAME_INTELLIGENTLY_OPTION = "x"; + final static private String PROCESS_SIMILAR_TO_OPTION = "xs"; + final static private String CUT_NAME_AFTER_FIRST_SPACE_OPTION = "c"; + final static private String ALLOW_REMOVAL_OF_CHARS_OPTION = "r"; + final static private String ADVANCED_TABLE_OPTION = "table"; + final static private String KEY_COLUMN = "k"; + final static private String VALUE_COLUMN = "v"; + final static private String MAPPING_FILE_SEPARATOR_OPTION = "s"; + final static private String MAPPING_FILE_SEPARATOR_DEFAULT = ":"; + final static private boolean USE_FIRST_SEPARATOR_ONLY = true; + final static private String PRG_NAME = "decorator"; + final static private String PRG_VERSION = "1.10"; + final static private String PRG_DATE = "2009.10.08"; + + private static void argumentsError() { + System.out.println(); + System.out.println( decorator.PRG_NAME + " -" + ADVANCED_TABLE_OPTION + " | -f= " + + "[mapping table file] " ); + System.out.println(); + System.out.println( "options:" ); + System.out.println(); + System.out.println( " -" + ADVANCED_TABLE_OPTION + " : table instead of one to one map (-f=)" ); + System.out.println( " -r= : allow to remove up to n characters from the end of the names" ); + System.out.println( " in phylogenies infile if not found (in map) otherwise" ); + System.out.println( " -p : for picky, fails if node name not found in mapping table, default is off" ); + System.out.println( " -" + TREE_NAME_OPTION + "=: name for the phylogeny" ); + System.out.println( " -" + TREE_ID_OPTION + "=: identifier for the phylogeny (in the form provider:value)" ); + System.out.println( " -" + TREE_DESC_OPTION + "=: description for phylogenies" ); + System.out.println(); + 
System.out.println(); + System.out.println( "advanced options, only available if -" + ADVANCED_TABLE_OPTION + " is not used:" ); + System.out.println(); + System.out.println( " -f= : field to be replaced: " + NODE_NAME_FIELD + " : node name" ); + System.out.println( " " + SEQUENCE_ANNOTATION_DESC + + " : sequence annotation description" ); + System.out.println( " " + DS_FILED + " : domain structure" ); + System.out.println( " " + TAXONOMY_CODE_FIELD + " : taxonomy code" ); + System.out.println( " " + TAXONOMY_SCIENTIFIC_NAME_FIELD + + ": taxonomy scientific name" ); + System.out.println( " " + SEQUENCE_NAME_FIELD + " : sequence name" ); + System.out.println( " -k= : key column in mapping table (0 based)," ); + System.out.println( " names of the node to be decorated - default is 0" ); + System.out.println( " -v= : value column in mapping table (0 based)," ); + System.out.println( " data which with to decorate - default is 1" ); + System.out.println( " -" + EXTRACT_BRACKETED_SCIENTIC_NAME_OPTION + + " : to extract bracketed scientific names" ); + System.out.println( " -s= : column separator in mapping file, default is \"" + + decorator.MAPPING_FILE_SEPARATOR_DEFAULT + "\"" ); + System.out.println( " -x : process name \"intelligently\" (only for -f=n)" ); + System.out.println( " -" + decorator.PROCESS_SIMILAR_TO_OPTION + + " : process name \"intelligently\" and process information after \"similar to\" (only for -f=n)" ); + System.out.println( " -c : cut name after first space (only for -f=n)" ); + System.out.println(); + System.exit( -1 ); + } + + public static void main( final String args[] ) { + ForesterUtil.printProgramInformation( decorator.PRG_NAME, decorator.PRG_VERSION, decorator.PRG_DATE ); + if ( ( args.length < 4 ) || ( args.length > 12 ) ) { + decorator.argumentsError(); + } + CommandLineArguments cla = null; + try { + cla = new CommandLineArguments( args ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); + } + if 
( ( cla.getNumberOfNames() < 3 ) || ( cla.getNumberOfNames() > 4 ) ) { + decorator.argumentsError(); + } + final File phylogenies_infile = cla.getFile( 0 ); + final File mapping_infile = cla.getFile( 1 ); + final File phylogenies_outfile = cla.getFile( 2 ); + if ( phylogenies_outfile.exists() ) { + ForesterUtil.fatalError( PRG_NAME, "[" + phylogenies_outfile + "] already exists" ); + } + final List allowed_options = new ArrayList(); + allowed_options.add( decorator.ADVANCED_TABLE_OPTION ); + allowed_options.add( decorator.PICKY_OPTION ); + allowed_options.add( decorator.FIELD_OPTION ); + allowed_options.add( decorator.PROCESS_NAME_INTELLIGENTLY_OPTION ); + allowed_options.add( decorator.PROCESS_SIMILAR_TO_OPTION ); + allowed_options.add( decorator.CUT_NAME_AFTER_FIRST_SPACE_OPTION ); + allowed_options.add( decorator.ALLOW_REMOVAL_OF_CHARS_OPTION ); + allowed_options.add( decorator.KEY_COLUMN ); + allowed_options.add( decorator.VALUE_COLUMN ); + allowed_options.add( decorator.MAPPING_FILE_SEPARATOR_OPTION ); + allowed_options.add( decorator.EXTRACT_BRACKETED_SCIENTIC_NAME_OPTION ); + allowed_options.add( decorator.TREE_NAME_OPTION ); + allowed_options.add( decorator.TREE_ID_OPTION ); + allowed_options.add( decorator.TREE_DESC_OPTION ); + allowed_options.add( decorator.MOVE_DOMAIN_NUMBER_OPTION ); + final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); + if ( dissallowed_options.length() > 0 ) { + ForesterUtil.fatalError( decorator.PRG_NAME, "unknown option(s): " + dissallowed_options ); + } + final boolean advanced_table = cla.isOptionSet( decorator.ADVANCED_TABLE_OPTION ); + if ( !advanced_table ) { + final List mandatory_options = new ArrayList(); + mandatory_options.add( decorator.FIELD_OPTION ); + final String missing_options = cla.validateMandatoryOptionsAsString( mandatory_options ); + if ( missing_options.length() > 0 ) { + ForesterUtil.fatalError( decorator.PRG_NAME, "missing option(s): " + missing_options ); + } + } + 
final boolean picky = cla.isOptionSet( decorator.PICKY_OPTION ); + String separator = decorator.MAPPING_FILE_SEPARATOR_DEFAULT; + if ( cla.isOptionSet( decorator.MAPPING_FILE_SEPARATOR_OPTION ) ) { + if ( advanced_table ) { + argumentsError(); + } + separator = cla.getOptionValue( decorator.MAPPING_FILE_SEPARATOR_OPTION ); + } + int key_column = 0; + int value_column = 1; + String field_str = ""; + FIELD field = FIELD.NODE_NAME; + int numbers_of_chars_allowed_to_remove_if_not_found_in_map = -1; + boolean cut_name_after_space = false; + boolean process_name_intelligently = false; + boolean process_similar_to = false; + boolean extract_bracketed_scientific_name = false; + boolean move_domain_numbers_at_end_to_middle = false; + String tree_name = ""; + String tree_id = ""; + String tree_desc = ""; + try { + if ( cla.isOptionSet( decorator.TREE_NAME_OPTION ) ) { + tree_name = cla.getOptionValueAsCleanString( decorator.TREE_NAME_OPTION ); + } + if ( cla.isOptionSet( decorator.TREE_ID_OPTION ) ) { + tree_id = cla.getOptionValueAsCleanString( decorator.TREE_ID_OPTION ); + } + if ( cla.isOptionSet( decorator.TREE_DESC_OPTION ) ) { + tree_desc = cla.getOptionValueAsCleanString( decorator.TREE_DESC_OPTION ); + } + if ( cla.isOptionSet( decorator.EXTRACT_BRACKETED_SCIENTIC_NAME_OPTION ) ) { + if ( advanced_table ) { + argumentsError(); + } + extract_bracketed_scientific_name = true; + } + if ( cla.isOptionSet( decorator.KEY_COLUMN ) ) { + if ( advanced_table ) { + argumentsError(); + } + key_column = cla.getOptionValueAsInt( decorator.KEY_COLUMN ); + } + if ( cla.isOptionSet( decorator.VALUE_COLUMN ) ) { + if ( advanced_table ) { + argumentsError(); + } + value_column = cla.getOptionValueAsInt( decorator.VALUE_COLUMN ); + } + if ( cla.isOptionSet( decorator.CUT_NAME_AFTER_FIRST_SPACE_OPTION ) ) { + if ( advanced_table ) { + argumentsError(); + } + cut_name_after_space = true; + } + if ( cla.isOptionSet( decorator.PROCESS_NAME_INTELLIGENTLY_OPTION ) ) { + if ( advanced_table ) 
{ + argumentsError(); + } + process_name_intelligently = true; + } + if ( cla.isOptionSet( decorator.PROCESS_SIMILAR_TO_OPTION ) ) { + if ( advanced_table ) { + argumentsError(); + } + process_similar_to = true; + } + if ( cla.isOptionSet( decorator.ALLOW_REMOVAL_OF_CHARS_OPTION ) ) { + numbers_of_chars_allowed_to_remove_if_not_found_in_map = cla + .getOptionValueAsInt( decorator.ALLOW_REMOVAL_OF_CHARS_OPTION ); + } + if ( cla.isOptionSet( decorator.MOVE_DOMAIN_NUMBER_OPTION ) ) { + move_domain_numbers_at_end_to_middle = true; + } + if ( cla.isOptionSet( decorator.FIELD_OPTION ) ) { + field_str = cla.getOptionValue( decorator.FIELD_OPTION ); + if ( field_str.equals( NODE_NAME_FIELD ) ) { + field = FIELD.NODE_NAME; + } + else if ( field_str.equals( SEQUENCE_ANNOTATION_DESC ) ) { + field = FIELD.SEQUENCE_ANNOTATION_DESC; + } + else if ( field_str.equals( DS_FILED ) ) { + field = FIELD.DOMAIN_STRUCTURE; + extract_bracketed_scientific_name = false; + } + else if ( field_str.equals( TAXONOMY_CODE_FIELD ) ) { + field = FIELD.TAXONOMY_CODE; + } + else if ( field_str.equals( SEQUENCE_NAME_FIELD ) ) { + field = FIELD.SEQUENCE_NAME; + } + else if ( field_str.equals( TAXONOMY_SCIENTIFIC_NAME_FIELD ) ) { + field = FIELD.TAXONOMY_SCIENTIFIC_NAME; + extract_bracketed_scientific_name = false; + } + else { + ForesterUtil.fatalError( decorator.PRG_NAME, "unknown value for \"" + decorator.FIELD_OPTION + + "\" option: \"" + field_str + "\"" ); + } + } + } + catch ( final Exception e ) { + ForesterUtil.fatalError( decorator.PRG_NAME, "error in command line: " + e.getMessage() ); + } + if ( ( field != FIELD.NODE_NAME ) && ( cut_name_after_space || process_name_intelligently ) ) { + ForesterUtil.fatalError( decorator.PRG_NAME, "attempt to use -x or -c option without -f=n" ); + } + if ( ( field != FIELD.NODE_NAME ) && process_similar_to ) { + ForesterUtil.fatalError( decorator.PRG_NAME, "attempt to use -" + decorator.PROCESS_SIMILAR_TO_OPTION + + " option without -f=n" ); + } + if ( 
cut_name_after_space && process_name_intelligently ) { + ForesterUtil.fatalError( decorator.PRG_NAME, "attempt to use -x and -c option together" ); + } + if ( process_similar_to && process_name_intelligently ) { + ForesterUtil.fatalError( decorator.PRG_NAME, "attempt to use -" + decorator.PROCESS_SIMILAR_TO_OPTION + + " and -x option together" ); + } + if ( process_similar_to && cut_name_after_space ) { + ForesterUtil.fatalError( decorator.PRG_NAME, "attempt to use -" + decorator.PROCESS_SIMILAR_TO_OPTION + + " and -c option together" ); + } + Phylogeny[] phylogenies = null; + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final PhylogenyParser pp = ForesterUtil.createParserDependingOnFileType( phylogenies_infile, true ); + phylogenies = factory.create( phylogenies_infile, pp ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( decorator.PRG_NAME, "failed to read phylgenies from [" + phylogenies_infile + + "] [" + e.getMessage() + "]" ); + } + Map map = null; + if ( !advanced_table ) { + BasicTable mapping_table = null; + try { + mapping_table = BasicTableParser.parse( mapping_infile, separator, decorator.USE_FIRST_SEPARATOR_ONLY ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( decorator.PRG_NAME, "failed to read [" + mapping_infile + "] [" + + e.getMessage() + "]" ); + } + if ( ( key_column < 0 ) || ( key_column >= mapping_table.getNumberOfColumns() ) ) { + ForesterUtil.fatalError( decorator.PRG_NAME, "illegal value for key column" ); + } + if ( ( value_column < 0 ) || ( value_column >= mapping_table.getNumberOfColumns() ) ) { + ForesterUtil.fatalError( decorator.PRG_NAME, "illegal value for value column" ); + } + map = mapping_table.getColumnsAsMap( key_column, value_column ); + } + if ( !ForesterUtil.isEmpty( tree_name ) || !ForesterUtil.isEmpty( tree_id ) + || !ForesterUtil.isEmpty( tree_desc ) ) { + if ( ( phylogenies.length > 1 ) + && ( !ForesterUtil.isEmpty( tree_name ) || 
!ForesterUtil.isEmpty( tree_id ) ) ) { + ForesterUtil.fatalError( decorator.PRG_NAME, + "attempt to set same name or id on more than one phylogeny" ); + } + if ( !ForesterUtil.isEmpty( tree_name ) ) { + phylogenies[ 0 ].setName( tree_name ); + } + if ( !ForesterUtil.isEmpty( tree_id ) ) { + final String[] s_ary = tree_id.split( ":" ); + phylogenies[ 0 ].setIdentifier( new Identifier( s_ary[ 1 ], s_ary[ 0 ] ) ); + } + if ( !ForesterUtil.isEmpty( tree_desc ) ) { + for( int i = 0; i < phylogenies.length; ++i ) { + phylogenies[ i ].setDescription( tree_desc ); + } + } + } + try { + if ( advanced_table ) { + Map> table = null; + try { + table = PhylogenyDecorator.parseMappingTable( mapping_infile ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( decorator.PRG_NAME, "failed to read \"" + mapping_infile + "\" [" + + e.getMessage() + "]" ); + } + PhylogenyDecorator.decorate( phylogenies, + table, + picky, + numbers_of_chars_allowed_to_remove_if_not_found_in_map ); + } + else { + PhylogenyDecorator.decorate( phylogenies, + map, + field, + extract_bracketed_scientific_name, + picky, + cut_name_after_space, + process_name_intelligently, + process_similar_to, + numbers_of_chars_allowed_to_remove_if_not_found_in_map, + move_domain_numbers_at_end_to_middle ); + } + } + catch ( final NullPointerException e ) { + ForesterUtil.unexpectedFatalError( decorator.PRG_NAME, e ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( decorator.PRG_NAME, "failed to map [" + e + "]" ); + } + try { + final PhylogenyWriter w = new PhylogenyWriter(); + w.toPhyloXML( phylogenies, 0, phylogenies_outfile, ForesterUtil.getLineSeparator() ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( decorator.PRG_NAME, "failed to write output [" + e.getMessage() + "]" ); + } + System.out.println(); + ForesterUtil.programMessage( PRG_NAME, "wrote: " + phylogenies_outfile ); + ForesterUtil.programMessage( PRG_NAME, "OK." 
); + } +} diff --git a/forester/java/src/org/forester/application/get_distances.java b/forester/java/src/org/forester/application/get_distances.java new file mode 100644 index 0000000..8a8d328 --- /dev/null +++ b/forester/java/src/org/forester/application/get_distances.java @@ -0,0 +1,92 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.application; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.util.StringTokenizer; + +import org.forester.io.parsers.PhylogenyParser; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.util.ForesterUtil; + +public class get_distances { + + public static void main( final String args[] ) { + if ( args.length != 3 ) { + System.out.println( "\nget_distances: Wrong number of arguments.\n" ); + System.out.println( "Usage: \"get_distances \"\n" ); + System.exit( -1 ); + } + final File phylogeny_infile = new File( args[ 0 ] ); + final File names_infile = new File( args[ 1 ] ); + final File outfile = new File( args[ 2 ] ); + Phylogeny p = null; + try { + final PhylogenyParser pp = ForesterUtil.createParserDependingOnFileType( phylogeny_infile, true ); + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + p = factory.create( phylogeny_infile, pp )[ 0 ]; + } + catch ( final Exception e ) { + System.out.println( "\nCould not read \"" + phylogeny_infile + "\" [" + e.getMessage() + "]\n" ); + System.exit( -1 ); + } + String line = ""; + try { + final BufferedReader in = new BufferedReader( new FileReader( names_infile ) ); + final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) ); + while ( ( line = in.readLine() ) != null ) { + if ( line.length() < 3 ) { + continue; + } + final StringTokenizer st = new StringTokenizer( line ); + if ( st.countTokens() < 2 ) { + continue; + } + final double d = PhylogenyMethods.getInstance().calculateDistance( p.getNode( st.nextToken() ), + p.getNode( st.nextToken() ) ); + out.write( line + " " + d ); + out.newLine(); 
+ } + out.flush(); + out.close(); + in.close(); + } + catch ( final IOException e ) { + System.out.println( "\nError during processing of \"" + names_infile + "\" [" + e.getMessage() + + "] at line \"" + line + "\"\n" ); + System.exit( -1 ); + } + System.out.println( "\nDone.\n" ); + } +} diff --git a/forester/java/src/org/forester/application/goac.java b/forester/java/src/org/forester/application/goac.java new file mode 100644 index 0000000..1a6e00f --- /dev/null +++ b/forester/java/src/org/forester/application/goac.java @@ -0,0 +1,208 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2009 Christian M. Zmasek +// Copyright (C) 2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.application; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.SortedMap; + +import org.forester.go.GoId; +import org.forester.go.GoTerm; +import org.forester.go.GoUtils; +import org.forester.go.OBOparser; +import org.forester.util.BasicDescriptiveStatistics; +import org.forester.util.CommandLineArguments; +import org.forester.util.DescriptiveStatistics; +import org.forester.util.ForesterUtil; +import org.forester.util.GeneralTable; + +public class goac { + + private static final String ALL = "{ALL}"; + final static private String HELP_OPTION_1 = "help"; + final static private String HELP_OPTION_2 = "h"; + final static private String PRG_NAME = "goac"; + final static private String PRG_VERSION = "1.03"; + final static private String PRG_DATE = "2010.04.21"; + final static private String E_MAIL = "czmasek@burnham.org"; + final static private String WWW = "www.phylosoft.org/forester/goac"; + + private static void addStats( final SortedMap> data_to_be_analyzed, + final GeneralTable table ) { + for( final String go : table.getColumnIdentifiers() ) { + final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); + for( final String label : data_to_be_analyzed.keySet() ) { + if ( !label.equals( ALL ) ) { + final Double value = table.getValue( go, label ); + stats.addValue( value == null ? 
0.0 : value ); + } + } + table.setValue( go, "{AVG}", stats.arithmeticMean() ); + table.setValue( go, "{SUM}", stats.getSum() ); + table.setValue( go, "{MED}", stats.median() ); + if ( stats.getN() > 1 ) { + table.setValue( go, "{SD}", stats.sampleStandardDeviation() ); + } + else { + table.setValue( go, "{SD}", new Double( 0 ) ); + } + table.setValue( go, "{MIN}", stats.getMin() ); + table.setValue( go, "{MAX}", stats.getMax() ); + } + } + + public static void main( final String args[] ) { + CommandLineArguments cla = null; + try { + cla = new CommandLineArguments( args ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); + } + if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) || ( args.length == 0 ) ) { + printHelp(); + System.exit( 0 ); + } + final List allowed_options = new ArrayList(); + if ( cla.getNumberOfNames() != 3 ) { + printHelp(); + System.exit( -1 ); + } + final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); + if ( dissallowed_options.length() > 0 ) { + ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options ); + } + final File obofile = cla.getFile( 0 ); + final File query_superterms_file = cla.getFile( 1 ); + final File exp_file = cla.getFile( 2 ); + final OBOparser parser = new OBOparser( obofile, OBOparser.ReturnType.BASIC_GO_TERM ); + List all_go_terms = null; + try { + all_go_terms = parser.parse(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( PRG_NAME, e.toString() ); + } + final Map goid_to_term_map = GoUtils.createGoIdToGoTermMap( all_go_terms ); + final List query_superterms_ids = new ArrayList(); + SortedMap> query_superterms_id_raw = null; + try { + query_superterms_id_raw = GoUtils.parseGoIds( query_superterms_file, "#", "" ); + } + catch ( final IOException e ) { + ForesterUtil.printErrorMessage( PRG_NAME, e.getMessage() ); + System.exit( -1 ); + } + final List queries = 
query_superterms_id_raw.get( "" ); + for( final GoId id : queries ) { + if ( !goid_to_term_map.containsKey( id ) ) { + ForesterUtil.printErrorMessage( PRG_NAME, "\"" + id + "\" not present in \"" + obofile + "\"" ); + System.exit( -1 ); + } + query_superterms_ids.add( id ); + } + SortedMap> data_to_be_analyzed = null; + try { + data_to_be_analyzed = GoUtils.parseGoIds( exp_file, "#", ">" ); + } + catch ( final IOException e ) { + ForesterUtil.printErrorMessage( PRG_NAME, e.getMessage() ); + System.exit( -1 ); + } + final List all_ids = new ArrayList(); + for( final String label : data_to_be_analyzed.keySet() ) { + final List experiment_set_ids = data_to_be_analyzed.get( label ); + for( final GoId go_id : experiment_set_ids ) { + if ( !goid_to_term_map.containsKey( go_id ) ) { + ForesterUtil.printErrorMessage( PRG_NAME, "GO id [" + go_id + "] not found in GO id to term map" ); + System.exit( -1 ); + } + all_ids.add( go_id ); + } + } + if ( data_to_be_analyzed.size() > 1 ) { + data_to_be_analyzed.put( ALL, all_ids ); + } + final GeneralTable table_counts = new GeneralTable(); + final GeneralTable table_percentage = new GeneralTable(); + for( final String label : data_to_be_analyzed.keySet() ) { + System.out.println(); + System.out.println( label + "\t\t\t\t" ); + final List experiment_set_ids = data_to_be_analyzed.get( label ); + Map counts_id = null; + try { + counts_id = GoUtils.countCategoriesId( query_superterms_ids, experiment_set_ids, goid_to_term_map ); + } + catch ( final Exception e ) { + ForesterUtil.printErrorMessage( PRG_NAME, e.getMessage() ); + System.exit( -1 ); + } + int sum = 0; + for( final GoId id : counts_id.keySet() ) { + sum += counts_id.get( id ); + } + if ( sum > 0 ) { + table_counts.setValue( "{total}", label, ( double ) sum ); + } + for( final GoId id : counts_id.keySet() ) { + final int counts = counts_id.get( id ); + double percentage = 0.0; + if ( sum > 0 ) { + percentage = 100.0 * counts / ( sum ); + } + System.out.println( counts + "\t" 
+ counts + "/" + sum + "\t" + percentage + "\t" + id + "\t" + + goid_to_term_map.get( id ).getName() ); + table_counts.setValue( goid_to_term_map.get( id ).getName(), label, ( double ) counts ); + table_percentage.setValue( goid_to_term_map.get( id ).getName(), label, percentage ); + } + } + addStats( data_to_be_analyzed, table_counts ); + addStats( data_to_be_analyzed, table_percentage ); + System.out.println(); + System.out.println(); + System.out.println(); + System.out.println( table_counts.toString( ForesterUtil.FORMATTER_3 ) ); + System.out.println(); + System.out.println(); + System.out.println(); + System.out.println( table_percentage.toString( ForesterUtil.FORMATTER_3 ) ); + System.out.println(); + } + + private static void printHelp() { + ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE, E_MAIL, WWW ); + System.out.println( "Usage:" ); + System.out.println(); + System.out + .println( PRG_NAME + + " " ); + System.out.println(); + System.out.println(); + } +} diff --git a/forester/java/src/org/forester/application/meta_ontologizer.java b/forester/java/src/org/forester/application/meta_ontologizer.java new file mode 100644 index 0000000..b1125f6 --- /dev/null +++ b/forester/java/src/org/forester/application/meta_ontologizer.java @@ -0,0 +1,141 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. 
+// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org + +package org.forester.application; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.forester.go.PfamToGoMapping; +import org.forester.go.PfamToGoParser; +import org.forester.go.etc.MetaOntologizer; +import org.forester.util.CommandLineArguments; +import org.forester.util.ForesterUtil; + +public class meta_ontologizer { + + final static private String HELP_OPTION_1 = "help"; + final static private String HELP_OPTION_2 = "h"; + final static private String P_OPTION = "p"; + final static private String PRG_NAME = "meta_ontologizer"; + final static private String PRG_VERSION = "1.10"; + final static private String PRG_DATE = "2009.04.29"; + final static private String E_MAIL = "czmasek@burnham.org"; + final static private String WWW = "www.phylosoft.org/forester/"; + private static final String RESULT_FILE_PREFIX = "table"; + + public static void main( final String args[] ) { + ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE, E_MAIL, WWW ); + CommandLineArguments cla = null; + try { + cla = new CommandLineArguments( args ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); + } + if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) || ( args.length == 0 ) ) { + printHelp(); + System.exit( 0 ); + } + if ( args.length < 4 ) { + System.out.println(); + System.out.println( "[" 
+ PRG_NAME + "] incorrect number of arguments" ); + System.out.println(); + printHelp(); + System.exit( -1 ); + } + final List allowed_options = new ArrayList(); + allowed_options.add( P_OPTION ); + final List mandatory_options = new ArrayList(); + mandatory_options.add( P_OPTION ); + if ( ( cla.getNumberOfNames() != 5 ) && ( cla.getNumberOfNames() != 6 ) ) { + System.out.println(); + System.out.println( "[" + PRG_NAME + "] incorrect number of arguments" ); + System.out.println(); + printHelp(); + System.exit( -1 ); + } + final String missing = cla.validateMandatoryOptionsAsString( mandatory_options ); + if ( missing.length() > 0 ) { + ForesterUtil.fatalError( PRG_NAME, "missing option(s): " + missing ); + } + final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); + if ( dissallowed_options.length() > 0 ) { + ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options ); + } + final File obo_file = cla.getFile( 0 ); + final File pfam2go_file = cla.getFile( 1 ); + final File ontologizer_outdir = cla.getFile( 2 ); + File domain_gain_loss_file = null; + String outfile_base = null; + String comment = null; + if ( cla.getNumberOfNames() == 6 ) { + domain_gain_loss_file = cla.getFile( 3 ); + outfile_base = cla.getName( 4 ); + comment = cla.getName( 5 ); + } + else { + outfile_base = cla.getName( 3 ); + comment = cla.getName( 4 ); + } + double p_adjusted_upper_limit = -1; + try { + p_adjusted_upper_limit = cla.getOptionValueAsDouble( P_OPTION ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); + } + try { + final PfamToGoParser parser = new PfamToGoParser( pfam2go_file ); + final List pfam_to_go_mappings = parser.parse(); + ForesterUtil.programMessage( PRG_NAME, "parsed " + pfam_to_go_mappings.size() + " Pfam to GO mappings" ); + MetaOntologizer.reformat( ontologizer_outdir, + RESULT_FILE_PREFIX, + domain_gain_loss_file, + outfile_base, + obo_file, + p_adjusted_upper_limit, + 
comment, + pfam_to_go_mappings ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); + e.printStackTrace(); + } + ForesterUtil.programMessage( PRG_NAME, "OK" ); + System.out.println(); + } + + private static void printHelp() { + System.out.println( "Usage:" ); + System.out.println(); + System.out + .println( PRG_NAME + + " -p= [domain gain loss file] " ); + System.out.println(); + } +} diff --git a/forester/java/src/org/forester/application/nhx_too.java b/forester/java/src/org/forester/application/nhx_too.java new file mode 100644 index 0000000..e7b8b96 --- /dev/null +++ b/forester/java/src/org/forester/application/nhx_too.java @@ -0,0 +1,110 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.application; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.forester.io.parsers.PhylogenyParser; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.util.CommandLineArguments; +import org.forester.util.ForesterUtil; + +public class nhx_too { + + final static private String PRG_NAME = "nhx_too"; + final static private String PRG_VERSION = "0.1"; + final static private String PRG_DATE = "2008.03.04"; + final static private String INT_NODE_NAME_IS_SUPPORT = "is"; + + public static void main( final String args[] ) { + ForesterUtil.printProgramInformation( nhx_too.PRG_NAME, nhx_too.PRG_VERSION, nhx_too.PRG_DATE ); + if ( ( args.length < 3 ) || ( args.length > 3 ) ) { + System.out.println(); + System.out.println( nhx_too.PRG_NAME + ": wrong number of arguments" ); + System.out.println(); + System.out.println( "Usage: \"" + nhx_too.PRG_NAME + " [options] \n" ); + System.out.println( " Options: -" + nhx_too.INT_NODE_NAME_IS_SUPPORT + + ": internal node names are support values (i.e. 
MrBayes output)" ); + System.out.println(); + System.exit( -1 ); + } + CommandLineArguments cla = null; + try { + cla = new CommandLineArguments( args ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); + } + final List allowed_options = new ArrayList(); + allowed_options.add( nhx_too.INT_NODE_NAME_IS_SUPPORT ); + final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); + if ( dissallowed_options.length() > 0 ) { + ForesterUtil.fatalError( nhx_too.PRG_NAME, "Unknown option(s): " + dissallowed_options ); + } + final File phylogeny_infile = cla.getFile( 0 ); + final File phylogeny_outfile = cla.getFile( 1 ); + boolean int_node_name_is_support = false; + if ( cla.isOptionSet( nhx_too.INT_NODE_NAME_IS_SUPPORT ) ) { + int_node_name_is_support = true; + } + Phylogeny p = null; + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final PhylogenyParser pp = ForesterUtil.createParserDependingOnFileType( phylogeny_infile, true ); + p = factory.create( phylogeny_infile, pp )[ 0 ]; + } + catch ( final Exception e ) { + ForesterUtil.fatalError( nhx_too.PRG_NAME, "Could not read \"" + phylogeny_infile + "\" [" + e.getMessage() + + "]" ); + } + if ( int_node_name_is_support ) { + try { + ForesterUtil.transferInternalNodeNamesToConfidence( p ); + } + catch ( final Exception e ) { + ForesterUtil.unexpectedFatalError( nhx_too.PRG_NAME, + "Failure during moving of internal names to support values [" + + e.getMessage() + "]" ); + } + } + try { + final PhylogenyWriter w = new PhylogenyWriter(); + w.toNewHampshireX( p, phylogeny_outfile ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( nhx_too.PRG_NAME, "Failure to write output [" + e.getMessage() + "]" ); + } + System.out.println(); + System.out.println( "Done [wrote \"" + phylogeny_outfile + "\"]." 
); + System.out.println(); + } +} diff --git a/forester/java/src/org/forester/application/nj.java b/forester/java/src/org/forester/application/nj.java new file mode 100644 index 0000000..767edca --- /dev/null +++ b/forester/java/src/org/forester/application/nj.java @@ -0,0 +1,152 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.application; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; + +import org.forester.evoinference.distance.NeighborJoining; +import org.forester.evoinference.matrix.distance.DistanceMatrix; +import org.forester.io.parsers.SymmetricalDistanceMatrixParser; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.phylogeny.Phylogeny; +import org.forester.util.CommandLineArguments; +import org.forester.util.ForesterUtil; + +public class nj { + + final static private String HELP_OPTION_1 = "help"; + final static private String HELP_OPTION_2 = "h"; + final static private String VERBOSE_OPTION = "v"; + final static private String UPPER_TRIANGLE_OPTION = "u"; + final static private String PRG_NAME = "nj"; + final static private String PRG_VERSION = "0.0.1"; + final static private String PRG_DATE = "2008.03.04"; + final static private String E_MAIL = "czmasek@burnham.org"; + final static private String WWW = "www.phylosoft.org/forester/"; + + public static void main( final String args[] ) { + ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE, E_MAIL, WWW ); + final List allowed_options = new ArrayList(); + allowed_options.add( HELP_OPTION_1 ); + allowed_options.add( HELP_OPTION_2 ); + allowed_options.add( VERBOSE_OPTION ); + allowed_options.add( UPPER_TRIANGLE_OPTION ); + if ( ( args.length < 2 ) ) { + printHelp(); + System.exit( -1 ); + } + CommandLineArguments cla = null; + try { + cla = new CommandLineArguments( args ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); + } + if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) ) { + printHelp(); + System.exit( 0 ); + } + if ( cla.getNumberOfNames() != 2 ) { + printHelp(); + System.exit( -1 ); + } + boolean verbose = false; + boolean upper_triangle = false; + if ( cla.isOptionSet( 
VERBOSE_OPTION ) ) { + verbose = true; + } + if ( cla.isOptionSet( UPPER_TRIANGLE_OPTION ) ) { + upper_triangle = true; + } + final File infile = cla.getFile( 0 ); + final File outfile = cla.getFile( 1 ); + final String error1 = ForesterUtil.isReadableFile( infile ); + if ( !ForesterUtil.isEmpty( error1 ) ) { + ForesterUtil.fatalError( PRG_NAME, "cannot read from infile [" + infile + "]: " + error1 ); + } + if ( outfile.exists() ) { + ForesterUtil.fatalError( PRG_NAME, "outfile [" + outfile + "] already exists" ); + } + final String error2 = ForesterUtil.isWritableFile( outfile ); + if ( !ForesterUtil.isEmpty( error2 ) ) { + ForesterUtil.fatalError( PRG_NAME, "cannot write to outfile [" + outfile + "]: " + error2 ); + } + final SymmetricalDistanceMatrixParser parser = SymmetricalDistanceMatrixParser.createInstance(); + if ( upper_triangle ) { + parser.setInputMatrixType( SymmetricalDistanceMatrixParser.InputMatrixType.UPPER_TRIANGLE ); + } + else { + parser.setInputMatrixType( SymmetricalDistanceMatrixParser.InputMatrixType.LOWER_TRIANGLE ); + } + DistanceMatrix[] matrices = null; + try { + matrices = parser.parse( infile ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( PRG_NAME, "failed to read from infile [" + infile + "]: " + e.getMessage() ); + } + if ( verbose ) { + System.out.println( PRG_NAME + " > read " + matrices.length + " pairwise distance matrice(s) of size " + + matrices[ 0 ].getSize() ); + } + final List ps = new ArrayList(); + final NeighborJoining nj = NeighborJoining.createInstance(); + nj.setVerbose( verbose ); + final long start_time = new Date().getTime(); + for( final DistanceMatrix matrix : matrices ) { + ps.add( nj.execute( matrix ) ); + } + final long end_time = new Date().getTime(); + final PhylogenyWriter w = new PhylogenyWriter(); + try { + w.toPhyloXML( outfile, ps, 1, ForesterUtil.LINE_SEPARATOR ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( PRG_NAME, "failed to write to outfile [" + outfile + 
"]: " + e.getMessage() ); + } + System.out.println(); + System.out.println( PRG_NAME + " > OK [" + ( end_time - start_time ) + "ms]" ); + System.out.println(); + } + + private static void printHelp() { + System.out.println(); + System.out.println( "Usage:" ); + System.out.println(); + System.out.println( "% java -cp forester.jar org.forester.applications." + PRG_NAME + + " [options] " ); + System.out.println(); + System.out.println( " Options: " ); + System.out.println( VERBOSE_OPTION + ": verbose on" ); + System.out.println( UPPER_TRIANGLE_OPTION + ": upper triangle option on (lower triangle is default)" ); + System.out.println(); + } +} diff --git a/forester/java/src/org/forester/application/obo_tool.java b/forester/java/src/org/forester/application/obo_tool.java new file mode 100644 index 0000000..31a9b26 --- /dev/null +++ b/forester/java/src/org/forester/application/obo_tool.java @@ -0,0 +1,137 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.application; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayList; +import java.util.List; + +import org.forester.go.GoTerm; +import org.forester.go.OBOparser; +import org.forester.util.CommandLineArguments; +import org.forester.util.ForesterUtil; + +public class obo_tool { + + private static final String IDS_TO_NAMES_SUFFIX = "_ids_to_names"; + final static private String HELP_OPTION_1 = "help"; + final static private String HELP_OPTION_2 = "h"; + final static private String GO_ID_TO_NAME_OPTION = "i"; + final static private String PRG_NAME = "obo_tool"; + final static private String PRG_VERSION = "1.00"; + final static private String PRG_DATE = "2008.11.26"; + final static private String E_MAIL = "czmasek@burnham.org"; + final static private String WWW = "www.phylosoft.org/forester/"; + + public static void main( final String args[] ) { + ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE, E_MAIL, WWW ); + CommandLineArguments cla = null; + try { + cla = new CommandLineArguments( args ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); + } + if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) || ( args.length == 0 ) ) { + printHelp(); + System.exit( 0 ); + } + if ( args.length < 3 ) { + System.out.println(); + System.out.println( "[" + PRG_NAME + "] incorrect number of arguments" ); + System.out.println(); + printHelp(); + System.exit( -1 ); + } + final List allowed_options = new ArrayList(); + allowed_options.add( GO_ID_TO_NAME_OPTION ); + if ( cla.getNumberOfNames() != 2 ) { + System.out.println(); + System.out.println( "[" + PRG_NAME + "] incorrect number of arguments" ); + System.out.println(); + printHelp(); + System.exit( -1 ); + } + final String dissallowed_options = cla.validateAllowedOptionsAsString( 
allowed_options ); + if ( dissallowed_options.length() > 0 ) { + ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options ); + } + boolean output_ids_to_names = false; + if ( cla.isOptionSet( GO_ID_TO_NAME_OPTION ) ) { + output_ids_to_names = true; + } + final File infile = cla.getFile( 0 ); + final File outfile = cla.getFile( 1 ); + final OBOparser parser = new OBOparser( infile, OBOparser.ReturnType.BASIC_GO_TERM ); + List go_terms = null; + try { + go_terms = parser.parse(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( PRG_NAME, e.toString() ); + } + ForesterUtil.programMessage( PRG_NAME, "successfully read in " + go_terms.size() + " GO terms from [" + infile + + "]" ); + if ( output_ids_to_names ) { + final File outfile_ids_to_names = new File( outfile + IDS_TO_NAMES_SUFFIX ); + final String error = ForesterUtil.isWritableFile( outfile_ids_to_names ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( PRG_NAME, error ); + } + try { + final Writer out = new BufferedWriter( new FileWriter( outfile_ids_to_names ) ); + for( final GoTerm go_term : go_terms ) { + out.write( go_term.getGoId().getId() ); + out.write( "\t" ); + out.write( go_term.getDefinition() ); + out.write( ForesterUtil.LINE_SEPARATOR ); + } + out.close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( PRG_NAME, e.toString() ); + } + ForesterUtil.programMessage( PRG_NAME, "wrote: [" + outfile_ids_to_names + "]" ); + } + ForesterUtil.programMessage( PRG_NAME, "OK" ); + System.out.println(); + } + + private static void printHelp() { + System.out.println( "Usage:" ); + System.out.println(); + System.out.println( PRG_NAME + " " ); + System.out.println(); + System.out.println( " options: " ); + System.out.println(); + System.out.println( " -" + GO_ID_TO_NAME_OPTION + ": output GO id to name map file" ); + System.out.println(); + } +} diff --git a/forester/java/src/org/forester/application/pccx.java 
b/forester/java/src/org/forester/application/pccx.java new file mode 100644 index 0000000..d46d18f --- /dev/null +++ b/forester/java/src/org/forester/application/pccx.java @@ -0,0 +1,300 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.application; + +import java.io.File; +import java.io.IOException; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.forester.io.parsers.PhylogenyParser; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.pccx.BasicExternalNodeBasedCoverageExtender; +import org.forester.pccx.Coverage; +import org.forester.pccx.CoverageCalculationOptions; +import org.forester.pccx.CoverageCalculator; +import org.forester.pccx.CoverageExtender; +import org.forester.pccx.ExternalNodeBasedCoverageMethod; +import org.forester.pccx.ExternalNodeBasedCoverageMethodOptions; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.util.BasicTable; +import org.forester.util.BasicTableParser; +import org.forester.util.CommandLineArguments; +import org.forester.util.ForesterUtil; + +/* + * @author Christian M. 
Zmasek + */ +public class pccx { + + final static private int EXTEND_BY_DEFAULT = -100; + final static private String HELP_OPTION_1 = "help"; + final static private String HELP_OPTION_2 = "h"; + final static private String USE_REAL_BL_OPTION = "d"; + final static private String USE_LOG_REAL_BL_OPTION = "ld"; + final static private String EXTEND_BY_OPTION = "x"; + final static private String OUTPUT_OPTION = "o"; + final static private String INPUT_OPTION = "i"; + final static private String OUTPUT_ANNOTATED_PHYLOGENIES_OPTION = "p"; + final static private String PRG_NAME = "pccx"; + final static private String PRG_VERSION = "1.0.0"; + final static private String BRANCH_LENGTH_BASED_SCORING = "org.forester.tools.modeling.BranchLengthBasedScoringMethod"; + final static private String BRANCH_COUNTING_BASED_SCORING = "org.forester.tools.modeling.BranchCountingBasedScoringMethod"; + final static private String LOG_BRANCH_LENGTH_BASED_SCORING = "org.forester.tools.modeling.LogBranchLengthBasedScoringMethod"; + final static private String PRG_DATE = "2008.03.04"; + final static private String WWW = "www.phylosoft.org/forester/applications/pccx"; + final static private String E_MAIL = "czmasek@burnham.org"; + + public static void main( final String args[] ) { + ForesterUtil.printProgramInformation( pccx.PRG_NAME, pccx.PRG_VERSION, pccx.PRG_DATE, pccx.E_MAIL, pccx.WWW ); + CommandLineArguments cla = null; + try { + cla = new CommandLineArguments( args ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); + } + if ( cla.isOptionSet( pccx.HELP_OPTION_1 ) || cla.isOptionSet( pccx.HELP_OPTION_2 ) ) { + System.out.println(); + pccx.printHelp(); + System.exit( 0 ); + } + if ( ( args.length < 2 ) ) { + System.out.println(); + System.out.println( "Incorrect number of arguments." 
); + System.out.println(); + pccx.printHelp(); + System.exit( -1 ); + } + final List allowed_options = new ArrayList(); + boolean use_bl = false; + boolean use_log_bl = false; + int extend_by = pccx.EXTEND_BY_DEFAULT; + allowed_options.add( pccx.USE_REAL_BL_OPTION ); + allowed_options.add( pccx.USE_LOG_REAL_BL_OPTION ); + allowed_options.add( pccx.EXTEND_BY_OPTION ); + allowed_options.add( pccx.INPUT_OPTION ); + allowed_options.add( pccx.OUTPUT_OPTION ); + allowed_options.add( pccx.OUTPUT_ANNOTATED_PHYLOGENIES_OPTION ); + final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); + if ( dissallowed_options.length() > 0 ) { + ForesterUtil.fatalError( pccx.PRG_NAME, "unknown option(s): " + dissallowed_options ); + } + if ( cla.getNumberOfNames() < 1 ) { + System.out.println(); + System.out.println( "No phylogenies infile indicated." ); + System.out.println(); + pccx.printHelp(); + System.exit( -1 ); + } + final File phylogenies_infile = cla.getFile( 0 ); + final List external_otu_names = new ArrayList(); + if ( cla.getNumberOfNames() > 1 ) { + for( int i = 1; i < cla.getNumberOfNames(); ++i ) { + external_otu_names.add( cla.getName( i ) ); + } + } + if ( cla.isOptionSet( pccx.USE_REAL_BL_OPTION ) ) { + use_bl = true; + } + if ( cla.isOptionSet( pccx.USE_LOG_REAL_BL_OPTION ) ) { + use_log_bl = true; + } + if ( use_bl && use_log_bl ) { + System.out.println(); + pccx.printHelp(); + System.exit( -1 ); + } + if ( cla.isOptionSet( pccx.EXTEND_BY_OPTION ) ) { + extend_by = 0; + try { + extend_by = cla.getOptionValueAsInt( pccx.EXTEND_BY_OPTION ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( pccx.PRG_NAME, e.getMessage() ); + } + } + Phylogeny[] phylogenies = null; + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final PhylogenyParser pp = ForesterUtil.createParserDependingOnFileType( phylogenies_infile, true ); + phylogenies = factory.create( phylogenies_infile, pp ); + } + catch ( final 
IOException e ) { + ForesterUtil.fatalError( pccx.PRG_NAME, "could not read \"" + phylogenies_infile + "\": " + e.getMessage() ); + } + final List phylogenies_list = Arrays.asList( phylogenies ); + File outfile = null; + PrintStream out = System.out; + if ( cla.isOptionSet( pccx.OUTPUT_OPTION ) ) { + try { + outfile = new File( cla.getOptionValue( pccx.OUTPUT_OPTION ) ); + final String error = ForesterUtil.isWritableFile( outfile ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( pccx.PRG_NAME, error ); + } + out = new PrintStream( outfile ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( pccx.PRG_NAME, e.getMessage() ); + } + } + File infile = null; + BasicTable intable = null; + if ( cla.isOptionSet( pccx.INPUT_OPTION ) ) { + try { + infile = new File( cla.getOptionValue( pccx.INPUT_OPTION ) ); + final String error = ForesterUtil.isReadableFile( infile ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( pccx.PRG_NAME, error ); + } + intable = BasicTableParser.parse( infile, " ", false ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( pccx.PRG_NAME, "failed to read \"" + infile + "\" [" + e.getMessage() + "]" ); + } + try { + for( int row = 0; row < intable.getNumberOfRows(); ++row ) { + System.out.println( "Adding external node: " + intable.getValueAsString( 0, row ) ); + external_otu_names.add( intable.getValueAsString( 0, row ) ); + } + } + catch ( final Exception e ) { + ForesterUtil.fatalError( pccx.PRG_NAME, e.getMessage() ); + } + } + File annotated_phylogenies_outfile = null; + boolean output_annoted_phylogenies = false; + if ( cla.isOptionSet( pccx.OUTPUT_ANNOTATED_PHYLOGENIES_OPTION ) ) { + output_annoted_phylogenies = true; + annotated_phylogenies_outfile = new File( cla.getOptionValue( pccx.OUTPUT_ANNOTATED_PHYLOGENIES_OPTION ) ); + final String error = ForesterUtil.isWritableFile( annotated_phylogenies_outfile ); + if ( !ForesterUtil.isEmpty( error ) ) { + 
ForesterUtil.fatalError( pccx.PRG_NAME, error ); + } + } + try { + final CoverageCalculationOptions options; + if ( use_log_bl ) { + options = new ExternalNodeBasedCoverageMethodOptions( pccx.LOG_BRANCH_LENGTH_BASED_SCORING ); + } + else if ( use_bl ) { + options = new ExternalNodeBasedCoverageMethodOptions( pccx.BRANCH_LENGTH_BASED_SCORING ); + } + else { + options = new ExternalNodeBasedCoverageMethodOptions( pccx.BRANCH_COUNTING_BASED_SCORING ); + } + final int s = phylogenies_list.get( 0 ).getNumberOfExternalNodes() - external_otu_names.size(); + if ( extend_by > s ) { + extend_by = s; + } + System.out.println(); + System.out.println( "Options: " + options.asString() ); + System.out.println(); + if ( extend_by != pccx.EXTEND_BY_DEFAULT ) { + if ( extend_by > 0 ) { + System.out.println( "Printing " + extend_by + " names to extend coverage in an optimal manner:" ); + } + else { + System.out.println( "Printing names to completely extend coverage in an optimal manner:" ); + } + System.out.println(); + final CoverageCalculator cc = CoverageCalculator.getInstance( new ExternalNodeBasedCoverageMethod(), + options ); + final CoverageExtender ce = new BasicExternalNodeBasedCoverageExtender(); + Coverage cov = cc.calculateCoverage( phylogenies_list, external_otu_names, false ); + System.out.println( " before:" ); + System.out.println( cov.asString() ); + System.out.println(); + final List result = ce.find( phylogenies_list, external_otu_names, extend_by, options, out ); + final List new_names = new ArrayList( external_otu_names ); + for( final Object element : result ) { + final String n = ( String ) element; + new_names.add( n ); + } + cov = cc.calculateCoverage( phylogenies_list, new_names, output_annoted_phylogenies ); + System.out.println(); + System.out.println( " after:" ); + System.out.println( cov.asString() ); + } + else { + final CoverageCalculator cc = CoverageCalculator.getInstance( new ExternalNodeBasedCoverageMethod(), + options ); + final Coverage cov = 
cc.calculateCoverage( phylogenies_list, + external_otu_names, + output_annoted_phylogenies ); + System.out.println( cov.asString() ); + } + System.out.println(); + if ( output_annoted_phylogenies ) { + try { + final PhylogenyWriter writer = new PhylogenyWriter(); + writer.toPhyloXML( annotated_phylogenies_outfile, phylogenies_list.get( 0 ), 1 ); + System.out.println( "Wrote annotated phylogeny to \"" + annotated_phylogenies_outfile + "\"" ); + System.out.println(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( pccx.PRG_NAME, "Failed to write to \"" + annotated_phylogenies_outfile + + "\" [" + e.getMessage() + "]" ); + } + } + } + catch ( final Exception e ) { + ForesterUtil.fatalError( pccx.PRG_NAME, e.toString() ); + } + System.out.println(); + System.out.println( pccx.PRG_NAME + ": successfully completed" ); + System.out.println( "If this application is useful to you, please cite:" ); + System.out.println( pccx.WWW ); + System.out.println(); + out.flush(); + out.close(); + } + + private static void printHelp() { + System.out.println( "Usage:" ); + System.out.println(); + System.out.println( pccx.PRG_NAME + + " [options] [external node name 1] [name 2] ... [name n]" ); + System.out.println(); + System.out.println( " Options: " ); + System.out.println(); + System.out.println( " -d : 1/distance based scoring method (instead of branch counting based)" ); + System.out.println( " -ld : -ln(distance) based scoring method (instead of branch counting based)" ); + System.out.println( " -x[=] : optimally extend coverage by external nodes. Use none, 0," ); + System.out.println( " or negative value for complete coverage extension." 
); + System.out.println( " -o= : write output to " ); + System.out.println( " -i= : read (new-line separated) external node names from " ); + System.out.println( " -" + pccx.OUTPUT_ANNOTATED_PHYLOGENIES_OPTION + + "= : write output as annotated phylogeny to (only first" ); + System.out.println( " phylogeny in phylogenies infile is used)" ); + System.out.println(); + } +} diff --git a/forester/java/src/org/forester/application/perfume.java b/forester/java/src/org/forester/application/perfume.java new file mode 100644 index 0000000..a24f0a6 --- /dev/null +++ b/forester/java/src/org/forester/application/perfume.java @@ -0,0 +1,85 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org + +package org.forester.application; + +import java.io.File; + +import org.forester.io.parsers.phyloxml.PhyloXmlParser; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.util.CommandLineArguments; + +public class perfume { + + private final static String BASE = "b_"; + + public static void main( final String args[] ) { + if ( ( args.length != 2 ) ) { + System.exit( -1 ); + } + try { + CommandLineArguments cla = null; + cla = new CommandLineArguments( args ); + final File in = cla.getFile( 0 ); + final File out = cla.getFile( 1 ); + if ( out.exists() ) { + System.out.println( out + " already exists" ); + System.exit( -1 ); + } + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final PhyloXmlParser xml_parser = new PhyloXmlParser(); + final Phylogeny[] phylogenies_0 = factory.create( in, xml_parser ); + final Phylogeny phylogeny_0 = phylogenies_0[ 0 ]; + final PhylogenyNodeIterator it = phylogeny_0.iteratorPostorder(); + int i = 0; + while ( it.hasNext() ) { + final PhylogenyNode node = it.next(); + processNode( node, i ); + i++; + } + final PhylogenyWriter writer = new PhylogenyWriter(); + writer.toPhyloXML( out, phylogeny_0, 0 ); + } + catch ( final Exception e ) { + System.out.println( e.getLocalizedMessage() ); + e.printStackTrace(); + System.exit( -1 ); + } + } + + private static void processNode( final PhylogenyNode node, final int i ) { + node.setDistanceToParent( PhylogenyNode.DISTANCE_DEFAULT ); + if ( !node.isExternal() ) { + if ( ( node.getName() == null ) || node.getName().isEmpty() ) { + node.setName( BASE + i ); + } + } + } +} diff --git 
a/forester/java/src/org/forester/application/pfam2go_extractor.java b/forester/java/src/org/forester/application/pfam2go_extractor.java new file mode 100644 index 0000000..164a170 --- /dev/null +++ b/forester/java/src/org/forester/application/pfam2go_extractor.java @@ -0,0 +1,103 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2009 Christian M. Zmasek +// Copyright (C) 2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.application; + +import java.io.File; +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedSet; +import java.util.TreeSet; + +import org.forester.go.GoId; +import org.forester.go.GoTerm; +import org.forester.go.GoUtils; +import org.forester.go.OBOparser; +import org.forester.go.PfamToGoMapping; +import org.forester.go.PfamToGoParser; +import org.forester.surfacing.DomainId; + +public class pfam2go_extractor { + + final static private String PRG_NAME = "pfam2go_extractor"; + + public static void main( final String args[] ) { + if ( args.length < 3 ) { + printHelp(); + } + final PfamToGoParser p = new PfamToGoParser( new File( args[ 0 ] ) ); + List pfam2go = null; + try { + pfam2go = p.parse(); + } + catch ( final IOException e ) { + printHelp(); + e.printStackTrace(); + } + final OBOparser parser = new OBOparser( new File( args[ 1 ] ), OBOparser.ReturnType.BASIC_GO_TERM ); + List all_go_terms = null; + try { + all_go_terms = parser.parse(); + } + catch ( final IOException e ) { + printHelp(); + e.printStackTrace(); + } + final Map goid_to_term_map = GoUtils.createGoIdToGoTermMap( all_go_terms ); + System.out.println( "# pfam2go : " + args[ 0 ] ); + System.out.println( "# OBO file: " + args[ 1 ] ); + final GoId[] queries = new GoId[ args.length - 2 ]; + for( int i = 2; i < args.length; ++i ) { + queries[ i - 2 ] = new GoId( args[ i ] ); + System.out.println( "# " + ( i - 2 ) + ": " + queries[ i - 2 ].getId() + " = " + + goid_to_term_map.get( queries[ i - 2 ] ).getName() + " (" + + goid_to_term_map.get( queries[ i - 2 ] ).getDefinition() + ")" ); + } + final SortedSet pfams = new TreeSet(); + for( final PfamToGoMapping pfam_to_go_mapping : pfam2go ) { + final DomainId domain_id = pfam_to_go_mapping.getKey(); + final GoId go_id = pfam_to_go_mapping.getValue(); + final Set supers = GoUtils.getAllSuperGoIds( go_id, goid_to_term_map ); + 
supers.add( go_id ); + for( int i = 0; i < queries.length; ++i ) { + if ( supers.contains( queries[ i ] ) ) { + pfams.add( domain_id.toString() ); + } + } + } + for( final String pfam : pfams ) { + System.out.println( pfam ); + } + } + + private static void printHelp() { + System.out.println(); + System.out.println( PRG_NAME + + " [more GO ids]" ); + System.out.println(); + } +} diff --git a/forester/java/src/org/forester/application/pfam_go.java b/forester/java/src/org/forester/application/pfam_go.java new file mode 100644 index 0000000..8d95d86 --- /dev/null +++ b/forester/java/src/org/forester/application/pfam_go.java @@ -0,0 +1,142 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2009 Christian M. Zmasek +// Copyright (C) 2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.application; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.forester.go.PfamToGoMapping; +import org.forester.go.PfamToGoParser; +import org.forester.util.CommandLineArguments; +import org.forester.util.ForesterUtil; + +public class pfam_go { + + final static private String HELP_OPTION_1 = "help"; + final static private String HELP_OPTION_2 = "h"; + final static private String PRG_NAME = "pfam2go"; + final static private String PRG_VERSION = "1.00"; + final static private String PRG_DATE = "2010.02.02"; + final static private String E_MAIL = "czmasek@burnham.org"; + final static private String WWW = "www.phylosoft.org"; + + private static void doit( final File pfams_file, final List mappings ) throws IOException { + final BufferedReader reader = ForesterUtil.obtainReader( pfams_file ); + String line = ""; + int found_count = 0; + int not_found_count = 0; + final Set encountered_domains = new HashSet(); + while ( ( line = reader.readLine() ) != null ) { + line = line.trim(); + if ( ForesterUtil.isEmpty( line ) || line.startsWith( "##" ) ) { + continue; + } + else if ( line.startsWith( "#" ) ) { + encountered_domains.clear(); + line = line.replace( '#', '>' ); + System.out.println( line ); + } + else { + if ( !encountered_domains.contains( line ) ) { + encountered_domains.add( line ); + boolean found = false; + for( final PfamToGoMapping mapping : mappings ) { + if ( mapping.getKey().getId().equals( line ) ) { + System.out.println( mapping.getValue() ); + found = true; + } + } + if ( found ) { + found_count++; + } + else { + not_found_count++; + } + } + else { + System.err.println( "# duplicate domain: " + line ); + } + } + } + System.out.println( "# pfams with mapping to GO : " + found_count ); + System.out.println( "# pfams without mapping to GO: " + 
not_found_count ); + reader.close(); + } + + public static void main( final String args[] ) { + CommandLineArguments cla = null; + try { + cla = new CommandLineArguments( args ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); + } + if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) || ( args.length == 0 ) ) { + printHelp(); + System.exit( 0 ); + } + final List allowed_options = new ArrayList(); + if ( cla.getNumberOfNames() != 2 ) { + printHelp(); + System.exit( -1 ); + } + final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); + if ( dissallowed_options.length() > 0 ) { + ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options ); + } + final File pfam2go_file = cla.getFile( 0 ); + final File pfams_file = cla.getFile( 1 ); + final PfamToGoParser pfam2go_parser = new PfamToGoParser( pfam2go_file ); + List mappings = null; + try { + mappings = pfam2go_parser.parse(); + } + catch ( final IOException e ) { + e.printStackTrace(); + } + try { + doit( pfams_file, mappings ); + } + catch ( final IOException e ) { + e.printStackTrace(); + } + System.out.println(); + } + + private static void printHelp() { + ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE, E_MAIL, WWW ); + System.out.println( "Usage:" ); + System.out.println(); + System.out.println( PRG_NAME + " " ); + System.out.println(); + System.out.println(); + } +} \ No newline at end of file diff --git a/forester/java/src/org/forester/application/pfamacc2go.java b/forester/java/src/org/forester/application/pfamacc2go.java new file mode 100644 index 0000000..1d6ab2f --- /dev/null +++ b/forester/java/src/org/forester/application/pfamacc2go.java @@ -0,0 +1,111 @@ +// $Id: +// +// forester -- software libraries and applications +// for genomics and evolutionary biology research. 
+// +// Copyright (C) 2010 Christian M Zmasek +// Copyright (C) 2010 Sanford-Burnham Medical Research Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.application; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.List; + +import org.forester.go.PfamToGoMapping; +import org.forester.go.PfamToGoParser; + +public class pfamacc2go { + + final static private String PRG_NAME = "pfamacc2go"; + + public static void main( final String args[] ) { + if ( args.length != 2 ) { + printHelp(); + System.exit( -1 ); + } + final PfamToGoParser p = new PfamToGoParser( new File( args[ 0 ] ) ); + p.setUseAccessors( true ); + List pfam2go = null; + try { + pfam2go = p.parse(); + } + catch ( final IOException e ) { + printHelp(); + e.printStackTrace(); + } + BufferedReader br = null; + try { + br = new BufferedReader( new FileReader( args[ 1 ] ) ); + } + catch ( final FileNotFoundException e ) { + printHelp(); + e.printStackTrace(); + } + String line; + int total_pfam_ids = 0; + int mapped_pfam_ids = 0; + try { + while ( ( line = br.readLine() 
) != null ) { + line = line.trim(); + if ( ( line.length() > 0 ) && !line.startsWith( "#" ) ) { + String[] pfam_ids = null; + if ( line.contains( "," ) ) { + pfam_ids = line.split( "," ); + } + else { + pfam_ids = new String[ 1 ]; + pfam_ids[ 0 ] = line; + } + for( final String pfam_id : pfam_ids ) { + total_pfam_ids++; + boolean mapped = false; + for( final PfamToGoMapping pfam_to_go_mapping : pfam2go ) { + if ( pfam_to_go_mapping.getKey().getId().equals( pfam_id ) ) { + mapped = true; + System.out.println( pfam_to_go_mapping.getValue().toString() ); + } + } + if ( mapped ) { + mapped_pfam_ids++; + } + } + } + } + } + catch ( final Exception e ) { + printHelp(); + e.printStackTrace(); + } + System.out.println( "# total pfam ids : " + total_pfam_ids ); + System.out.println( "# pfam ids mapped: " + mapped_pfam_ids ); + } + + private static void printHelp() { + System.out.println(); + System.out.println( PRG_NAME + + " " ); + System.out.println(); + } +} diff --git a/forester/java/src/org/forester/application/pfamacc2pfamid.java b/forester/java/src/org/forester/application/pfamacc2pfamid.java new file mode 100644 index 0000000..2a4ec9c --- /dev/null +++ b/forester/java/src/org/forester/application/pfamacc2pfamid.java @@ -0,0 +1,131 @@ +// $Id: +// +// forester -- software libraries and applications +// for genomics and evolutionary biology research. +// +// Copyright (C) 2010 Christian M Zmasek +// Copyright (C) 2010 Sanford-Burnham Medical Research Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
/**
 * Command line tool which translates Pfam accessions into Pfam identifiers.
 * <p>
 * Arguments: [0] a file containing "#=GF ID" / "#=GF AC" line pairs (each
 * "#=GF ID" line must be followed by its "#=GF AC" line before the next
 * "#=GF ID" or "//" line), [1] a text file with Pfam accessions (one per line
 * and/or comma separated; empty lines and lines starting with '#' are ignored).
 * <p>
 * For every accession which is known, the corresponding identifier is printed
 * to standard out; the number of unknown accessions is reported on standard
 * error.
 */
public class pfamacc2pfamid {

    final static private String PRG_NAME = "pfamacc2pfamid";

    /**
     * Program entry point.
     *
     * @param args
     *            exactly two arguments: Pfam file and accession file
     */
    public static void main( final String args[] ) {
        if ( args.length != 2 ) {
            printHelp();
            System.exit( -1 );
        }
        try {
            final Map<String, String> acc_id = readAccessionToIdMap( args[ 0 ] );
            final int not_found = printIds( args[ 1 ], acc_id );
            System.err.println( "# not found: " + not_found );
        }
        catch ( final Exception e ) {
            // A missing or unreadable input file makes any further output
            // meaningless, so stop with a non-zero exit code.
            printHelp();
            e.printStackTrace();
            System.exit( -1 );
        }
    }

    /**
     * Reads all "#=GF ID" / "#=GF AC" pairs from the given file. A version
     * suffix of the accession (everything from the first '.') is dropped.
     * Terminates the program with "illegal format" if ID and AC lines do not
     * strictly alternate.
     *
     * @param pfam_file
     *            path of the file to read
     * @return map from accession (without version) to identifier
     */
    private static Map<String, String> readAccessionToIdMap( final String pfam_file ) throws java.io.IOException {
        final Map<String, String> acc_id = new HashMap<String, String>();
        final BufferedReader br = new BufferedReader( new FileReader( pfam_file ) );
        try {
            String id = null; // identifier waiting for its accession line
            String line;
            while ( ( line = br.readLine() ) != null ) {
                if ( line.startsWith( "#=GF ID" ) ) {
                    if ( id != null ) {
                        exitOnIllegalFormat();
                    }
                    id = line.substring( 7 ).trim();
                }
                else if ( line.startsWith( "#=GF AC" ) ) {
                    if ( id == null ) {
                        exitOnIllegalFormat();
                    }
                    String acc = line.substring( 7 ).trim();
                    final int dot = acc.indexOf( '.' );
                    if ( dot > 0 ) {
                        acc = acc.substring( 0, dot );
                    }
                    acc_id.put( acc, id );
                    id = null;
                }
                else if ( line.startsWith( "//" ) ) {
                    if ( id != null ) {
                        exitOnIllegalFormat();
                    }
                }
            }
        }
        finally {
            br.close();
        }
        return acc_id;
    }

    /**
     * Prints the identifier of every known accession listed in the given file.
     *
     * @param acc_file
     *            path of the file with the accessions to translate
     * @param acc_id
     *            map from accession to identifier
     * @return number of listed accessions which are not in the map
     */
    private static int printIds( final String acc_file, final Map<String, String> acc_id )
            throws java.io.IOException {
        int not_found = 0;
        final BufferedReader br = new BufferedReader( new FileReader( acc_file ) );
        try {
            String line;
            while ( ( line = br.readLine() ) != null ) {
                line = line.trim();
                if ( ( line.length() < 1 ) || line.startsWith( "#" ) ) {
                    continue;
                }
                // split() on a non-empty line without a comma yields the line itself.
                for( final String token : line.split( "," ) ) {
                    // Allow "PF00001, PF00002": blanks around a comma are not part of the accession.
                    final String pfam_acc = token.trim();
                    if ( acc_id.containsKey( pfam_acc ) ) {
                        System.out.println( acc_id.get( pfam_acc ) );
                    }
                    else {
                        not_found++;
                    }
                }
            }
        }
        finally {
            br.close();
        }
        return not_found;
    }

    /** Reports a malformed Pfam file and terminates with exit code -1. */
    private static void exitOnIllegalFormat() {
        System.err.println( "illegal format" );
        System.exit( -1 );
    }

    /** Prints a short usage message to standard out. */
    private static void printHelp() {
        System.out.println();
        System.out.println( PRG_NAME + " " );
        System.out.println();
    }
}
Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.application; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.forester.io.parsers.PhylogenyParser; +import org.forester.io.parsers.nexus.NexusPhylogeniesParser; +import org.forester.io.parsers.nhx.NHXParser; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.util.CommandLineArguments; +import org.forester.util.ForesterUtil; +import org.forester.util.ForesterUtil.PhylogenyNodeField; + +public class phyloxml_converter { + + final static private String HELP_OPTION_1 = "help"; + final static private String HELP_OPTION_2 = "h"; + final static private String FIELD_OPTION = "f"; + final static private String 
FIELD_CLADE_NAME = "nn"; + final static private String FIELD_TAXONOMY_CODE = "tc"; + final static private String FIELD_TAXONOMY_SCI_NAME = "sn"; + final static private String FIELD_TAXONOMY_COMM_NAME = "cn"; + final static private String FIELD_SEQUENCE_GENE_NAME = "gn"; + final static private String FIELD_SEQUENCE_SYMBOL = "sy"; + final static private String FIELD_DUMMY = "dummy"; + final static private String INTERNAL_NAMES_ARE_BOOT_SUPPPORT = "i"; + final static private String MIDPOINT_REROOT = "m"; + final static private String EXTRACT_TAXONOMY = "xt"; + final static private String EXTRACT_TAXONOMY_PF = "xp"; + final static private String ORDER_SUBTREES = "o"; + final static private String NO_TREE_LEVEL_INDENDATION = "ni"; + final static private String REPLACE_UNDER_SCORES = "ru"; + final static private String PRG_NAME = "phyloxml_converter"; + final static private String PRG_VERSION = "1.21"; + final static private String PRG_DATE = "2010.10.02"; + final static private String E_MAIL = "czmasek@burnham.org"; + final static private String WWW = "www.phylosoft.org/forester/"; + final static private boolean SPECIAL = false; + + public static void main( final String args[] ) { + ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE, E_MAIL, WWW ); + CommandLineArguments cla = null; + try { + cla = new CommandLineArguments( args ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); + } + if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) || ( args.length == 0 ) ) { + printHelp(); + System.exit( 0 ); + } + if ( args.length < 3 ) { + System.out.println(); + System.out.println( "[" + PRG_NAME + "] incorrect number of arguments" ); + System.out.println(); + printHelp(); + System.exit( -1 ); + } + final List allowed_options = new ArrayList(); + allowed_options.add( NO_TREE_LEVEL_INDENDATION ); + allowed_options.add( FIELD_OPTION ); + allowed_options.add( MIDPOINT_REROOT ); + 
allowed_options.add( ORDER_SUBTREES ); + allowed_options.add( INTERNAL_NAMES_ARE_BOOT_SUPPPORT ); + allowed_options.add( REPLACE_UNDER_SCORES ); + allowed_options.add( EXTRACT_TAXONOMY ); + allowed_options.add( EXTRACT_TAXONOMY_PF ); + if ( cla.getNumberOfNames() != 2 ) { + System.out.println(); + System.out.println( "[" + PRG_NAME + "] incorrect number of arguments" ); + System.out.println(); + printHelp(); + System.exit( -1 ); + } + final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); + if ( dissallowed_options.length() > 0 ) { + ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options ); + } + final List mandatory_options = new ArrayList(); + mandatory_options.add( FIELD_OPTION ); + final String missing_options = cla.validateMandatoryOptionsAsString( mandatory_options ); + if ( missing_options.length() > 0 ) { + ForesterUtil.fatalError( PRG_NAME, "missing option(s): " + missing_options ); + } + if ( !cla.isOptionValueSet( FIELD_OPTION ) ) { + System.out.println(); + printHelp(); + System.exit( -1 ); + } + final String field_option_value = cla.getOptionValue( FIELD_OPTION ); + PhylogenyNodeField field = null; + if ( field_option_value.equals( FIELD_CLADE_NAME ) ) { + field = PhylogenyNodeField.CLADE_NAME; + } + else if ( field_option_value.equals( FIELD_TAXONOMY_CODE ) ) { + field = PhylogenyNodeField.TAXONOMY_CODE; + } + else if ( field_option_value.equals( FIELD_TAXONOMY_SCI_NAME ) ) { + field = PhylogenyNodeField.TAXONOMY_SCIENTIFIC_NAME; + } + else if ( field_option_value.equals( FIELD_TAXONOMY_COMM_NAME ) ) { + field = PhylogenyNodeField.TAXONOMY_COMMON_NAME; + } + else if ( field_option_value.equals( FIELD_SEQUENCE_GENE_NAME ) ) { + field = PhylogenyNodeField.SEQUENCE_NAME; + } + else if ( field_option_value.equals( FIELD_SEQUENCE_SYMBOL ) ) { + field = PhylogenyNodeField.SEQUENCE_SYMBOL; + } + else if ( field_option_value.equals( FIELD_DUMMY ) ) { + } + else { + ForesterUtil.fatalError( PRG_NAME, 
"unknown value for -\"" + FIELD_OPTION + "\" option: \"" + + field_option_value + "\"" ); + } + boolean int_values_are_boots = false; + if ( cla.isOptionSet( INTERNAL_NAMES_ARE_BOOT_SUPPPORT ) ) { + int_values_are_boots = true; + } + boolean midpoint_reroot = false; + if ( cla.isOptionSet( MIDPOINT_REROOT ) ) { + midpoint_reroot = true; + } + boolean order_subtrees = false; + if ( cla.isOptionSet( ORDER_SUBTREES ) ) { + order_subtrees = true; + } + boolean replace_underscores = false; + if ( cla.isOptionSet( REPLACE_UNDER_SCORES ) ) { + replace_underscores = true; + } + boolean no_indendation = false; + if ( cla.isOptionSet( NO_TREE_LEVEL_INDENDATION ) ) { + no_indendation = true; + } + boolean extr_taxonomy = false; + if ( cla.isOptionSet( EXTRACT_TAXONOMY ) ) { + extr_taxonomy = true; + } + boolean extr_taxonomy_pf_only = false; + if ( cla.isOptionSet( EXTRACT_TAXONOMY_PF ) ) { + extr_taxonomy_pf_only = true; + } + final File infile = cla.getFile( 0 ); + final File outfile = cla.getFile( 1 ); + if ( outfile.exists() ) { + ForesterUtil.fatalError( PRG_NAME, "[" + outfile + "] already exists" ); + } + if ( !infile.exists() ) { + ForesterUtil.fatalError( PRG_NAME, "[" + infile + "] does not exist" ); + } + Phylogeny[] phys = null; + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final PhylogenyParser parser = ForesterUtil.createParserDependingOnFileType( infile, true ); + if ( parser instanceof NHXParser ) { + if ( ( field != PhylogenyNodeField.TAXONOMY_CODE ) + && ( field != PhylogenyNodeField.TAXONOMY_COMMON_NAME ) + && ( field != PhylogenyNodeField.TAXONOMY_SCIENTIFIC_NAME ) ) { + if ( extr_taxonomy_pf_only ) { + ( ( NHXParser ) parser ) + .setTaxonomyExtraction( ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + replace_underscores = false; + } + else if ( extr_taxonomy ) { + ( ( NHXParser ) parser ).setTaxonomyExtraction( ForesterUtil.TAXONOMY_EXTRACTION.YES ); + replace_underscores = false; + } + } + else { + ( ( 
NHXParser ) parser ).setTaxonomyExtraction( ForesterUtil.TAXONOMY_EXTRACTION.NO ); + } + ( ( NHXParser ) parser ).setReplaceUnderscores( replace_underscores ); + ( ( NHXParser ) parser ).setIgnoreQuotes( false ); + } + else if ( parser instanceof NexusPhylogeniesParser ) { + ( ( NexusPhylogeniesParser ) parser ).setReplaceUnderscores( replace_underscores ); + ( ( NexusPhylogeniesParser ) parser ).setIgnoreQuotes( false ); + } + phys = factory.create( infile, parser ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( PRG_NAME, "failed to read phylogeny from [" + infile + "]: " + e.getMessage() ); + } + if ( SPECIAL ) { + for( final Phylogeny phy : phys ) { + performSpecialProcessing( phy ); + } + } + if ( int_values_are_boots ) { + for( final Phylogeny phy : phys ) { + ForesterUtil.transferInternalNamesToBootstrapSupport( phy ); + } + } + if ( field != null ) { + for( final Phylogeny phy : phys ) { + ForesterUtil.transferNodeNameToField( phy, field ); + } + } + if ( midpoint_reroot ) { + try { + for( final Phylogeny phy : phys ) { + PhylogenyMethods.midpointRoot( phy ); + } + } + catch ( final Exception e ) { + System.out.println( "" ); + ForesterUtil.printWarningMessage( PRG_NAME, "midpoint rerooting failed: " + e.getLocalizedMessage() ); + } + } + if ( order_subtrees ) { + for( final Phylogeny phy : phys ) { + phy.orderAppearance( true ); + } + } + try { + final PhylogenyWriter writer = new PhylogenyWriter(); + if ( no_indendation ) { + writer.setIndentPhyloxml( false ); + } + writer.toPhyloXML( phys, 0, outfile, ForesterUtil.LINE_SEPARATOR ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( PRG_NAME, "failed to write to [" + outfile + "]: " + e.getMessage() ); + } + System.out.println( "[" + PRG_NAME + "] wrote: [" + outfile + "]" ); + System.out.println( "[" + PRG_NAME + "] OK" ); + System.out.println(); + } + + private static void performSpecialProcessing( final Phylogeny phy ) { + // Can place some kind of custom processing 
here. + // final List remove_us = new ArrayList(); + // int counter = 0; + // for( final PhylogenyNodeIterator it = phy.iteratorPostorder(); it.hasNext(); ) { + // final PhylogenyNode node = it.next(); + // final String name = node.getNodeName().toLowerCase(); + // if ( name.startsWith( "environmental_samples" ) || name.startsWith( "unclassified" ) + // || name.startsWith( "bacteria" ) || name.startsWith( "other" ) + // || name.startsWith( "viroids" ) || name.startsWith( "viruses" ) ) { + // remove_us.add( node ); + // System.out.println( counter++ ); + // } + // } + // phy.hashIDs(); + // for( final PhylogenyNode node : remove_us ) { + // if ( phy.getNode( node.getNodeId() ) != null ) { + // phy.deleteSubtree( node ); + // System.out.println( "deleted: " + node ); + // } + // } + // phy.hashIDs(); + // + // for( final PhylogenyNodeIterator it = phy.iteratorPostorder(); it.hasNext(); ) { + // final PhylogenyNode node = it.next(); + // node.getNodeData().setTaxonomy( null ); + // } + // phy.reRoot( phy.getFirstExternalNode() ); + // PhylogenyMethods.midpointRoot( phy ); + // phy.orderAppearance( true ); + for( final PhylogenyNodeIterator it = phy.iteratorPostorder(); it.hasNext(); ) { + final PhylogenyNode node = it.next(); + final String name = node.getName(); + if ( !ForesterUtil.isEmpty( name ) ) { + // final Taxonomy taxo = new Taxonomy(); + // if ( node.isExternal() ) { + // taxo.setTaxonomyCode( name ); + // node.getNodeData().setTaxonomy( taxo ); + // } + // else if ( name.indexOf( '_' ) == -1 || name.length() > 6 ) { + // taxo.setScientificName( name ); + // node.getNodeData().setTaxonomy( taxo ); + // } + // node.setName( "" ); + // if ( name.indexOf( "BF" ) >= 0 ) { + // taxo.setTaxonomyCode( "BACFR" ); + // } + // else if ( name.indexOf( "BT" ) >= 0 ) { + // taxo.setTaxonomyCode( "BACTN" ); + // } + // else if ( name.indexOf( "MXAN" ) >= 0 ) { + // taxo.setTaxonomyCode( "MYXXD" ); + // } + // else if ( name.indexOf( "STIAU" ) >= 0 ) { + // 
taxo.setTaxonomyCode( "STIAU" ); + // } + // else if ( name.indexOf( "BOVA" ) >= 0 ) { + // taxo.setTaxonomyCode( "BACOV" ); + // } + // else if ( name.indexOf( "BUNI" ) >= 0 ) { + // taxo.setTaxonomyCode( "BACUN" ); + // } + // else if ( name.indexOf( "Pgin" ) >= 0 ) { + // taxo.setTaxonomyCode( "PORGI" ); + // } + // else if ( name.equals( "3CGH" ) || name.equals( "3CK7" ) ) { + // taxo.setTaxonomyCode( "BACTN" ); + // } + // node.getNodeData().setTaxonomy( taxo ); + } + } + } + + private static void printHelp() { + System.out.println( "Usage:" ); + System.out.println(); + System.out + .println( PRG_NAME + + " -" + + FIELD_OPTION + + "= [options] " ); + System.out.println(); + System.out.println( " field options: " ); + System.out.println(); + System.out.println( " " + FIELD_CLADE_NAME + ": transfer name to node/clade name" ); + System.out.println( " " + FIELD_TAXONOMY_CODE + ": transfer name to taxonomy code" ); + System.out.println( " " + FIELD_TAXONOMY_SCI_NAME + ": transfer name to taxonomy scientific name" ); + System.out.println( " " + FIELD_TAXONOMY_COMM_NAME + ": transfer name to taxonomy common name" ); + System.out.println( " " + FIELD_SEQUENCE_GENE_NAME + ": transfer name to sequence name" ); + System.out.println( " " + FIELD_SEQUENCE_SYMBOL + ": transfer name to sequence symbol" ); + System.out.println(); + System.out.println( " options: " ); + System.out.println( " -" + INTERNAL_NAMES_ARE_BOOT_SUPPPORT + + " : internal names in NH or NHX tree are bootstrap support values" ); + System.out.println( " -" + REPLACE_UNDER_SCORES + ": replace all underscores with spaces" ); + System.out.println( " -" + MIDPOINT_REROOT + " : midpoint reroot" ); + System.out.println( " -" + ORDER_SUBTREES + " : order subtrees" ); + System.out + .println( " -" + + EXTRACT_TAXONOMY + + ": extract taxonomy to taxonomy code from \"seqname_TAXON\"-style names (cannot be used with the following field options: " + + FIELD_TAXONOMY_CODE + ", " + FIELD_TAXONOMY_COMM_NAME + ", " + 
FIELD_TAXONOMY_SCI_NAME + ")" ); + System.out + .println( " -" + + EXTRACT_TAXONOMY_PF + + ": extract taxonomy to taxonomy code from Pfam (\"seqname_TAXON/x-y\") style names only (cannot be used with the following field options: " + + FIELD_TAXONOMY_CODE + ", " + FIELD_TAXONOMY_COMM_NAME + ", " + FIELD_TAXONOMY_SCI_NAME + ")" ); + System.out.println( " -" + NO_TREE_LEVEL_INDENDATION + ": no tree level indendation in phyloXML output" ); + System.out.println(); + } +} diff --git a/forester/java/src/org/forester/application/printAllSpecies.java b/forester/java/src/org/forester/application/printAllSpecies.java new file mode 100644 index 0000000..734239b --- /dev/null +++ b/forester/java/src/org/forester/application/printAllSpecies.java @@ -0,0 +1,79 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.application; + +import java.io.File; +import java.io.FileWriter; +import java.io.PrintWriter; + +import org.forester.io.parsers.PhylogenyParser; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.util.ForesterUtil; + +public class printAllSpecies { + + public static void main( final String args[] ) { + Phylogeny tree = null; + PhylogenyNode node = null; + PrintWriter out = null; + File infile = null, outfile = null; + if ( args.length != 2 ) { + System.err.println( "\nprintAllSpecies: Wrong number of arguments." ); + System.err.println( "Usage: \"java printAllSpecies \"\n" ); + System.exit( -1 ); + } + infile = new File( args[ 0 ] ); + outfile = new File( args[ 1 ] ); + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final PhylogenyParser pp = ForesterUtil.createParserDependingOnFileType( infile, true ); + tree = factory.create( infile, pp )[ 0 ]; + } + catch ( final Exception e ) { + System.err.println( e + "\nCould not read " + infile + "\n" ); + System.exit( -1 ); + } + node = tree.getFirstExternalNode(); + try { + out = new PrintWriter( new FileWriter( outfile ), true ); + while ( node != null ) { + out.println( PhylogenyMethods.getSpecies( node ) ); + node = node.getNextExternalNode(); + } + } + catch ( final Exception e ) { + System.err.println( e + "\nException during writing.\n" ); + System.exit( -1 ); + } + finally { + out.close(); + } + } +} diff --git a/forester/java/src/org/forester/application/printSameOrder.java b/forester/java/src/org/forester/application/printSameOrder.java new file mode 100644 index 0000000..530a0fe --- /dev/null +++ b/forester/java/src/org/forester/application/printSameOrder.java @@ -0,0 +1,78 @@ 
+// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.application; + +import java.io.File; +import java.io.FileWriter; +import java.io.PrintWriter; + +import org.forester.io.parsers.PhylogenyParser; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.util.ForesterUtil; + +public class printSameOrder { + + public static void main( final String args[] ) { + Phylogeny tree = null; + PhylogenyNode node = null; + PrintWriter out = null; + File infile = null, outfile = null; + if ( args.length != 2 ) { + System.err.println( "\nprintSameOrder: Wrong number of arguments." 
); + System.err.println( "Usage: \"java printSameOrder \"\n" ); + System.exit( -1 ); + } + infile = new File( args[ 0 ] ); + outfile = new File( args[ 1 ] ); + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final PhylogenyParser pp = ForesterUtil.createParserDependingOnFileType( infile, true ); + tree = factory.create( infile, pp )[ 0 ]; + } + catch ( final Exception e ) { + System.err.println( e + "\nCould not read " + infile + "\n" ); + System.exit( -1 ); + } + node = tree.getFirstExternalNode(); + try { + out = new PrintWriter( new FileWriter( outfile ), true ); + while ( node != null ) { + out.println( node.getName() ); + node = node.getNextExternalNode(); + } + } + catch ( final Exception e ) { + System.err.println( e + "\nException during writing.\n" ); + System.exit( -1 ); + } + finally { + out.close(); + } + } +} diff --git a/forester/java/src/org/forester/application/rio.java b/forester/java/src/org/forester/application/rio.java new file mode 100644 index 0000000..b4902b1 --- /dev/null +++ b/forester/java/src/org/forester/application/rio.java @@ -0,0 +1,501 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// Copyright (C) 2000-2001 Washington University School of Medicine +// and Howard Hughes Medical Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.application; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Vector; + +import org.forester.io.parsers.phyloxml.PhyloXmlParser; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.phylogeny.iterators.PreorderTreeIterator; +import org.forester.sdi.DistanceCalculator; +import org.forester.sdi.RIO; +import org.forester.sdi.SDIR; +import org.forester.util.ForesterUtil; + +public class rio { + + final static private String PRG_NAME = "RIO"; + final static private String PRG_VERSION = "2.03 ALPHA"; + final static private String PRG_DATE = "2010.01.15"; + final static private String E_MAIL = "czmasek@burnham.org"; + final static private String WWW = "www.phylosoft.org/forester/"; + final static private boolean TIME = true; + final static private boolean VERBOSE = true; + // For method getDistances -- calculation of distances. + final static private boolean MINIMIZE_COST = false; + // For method getDistances -- calculation of distances. + final static private boolean MINIMIZE_DUPS = true; + // For method getDistances -- calculation of distances. + final static private boolean MINIMIZE_HEIGHT = true; + final static private int WARN_NO_ORTHOS_DEFAULT = 2; + final static private int + // How many sd away from mean to root. 
+ WARN_MORE_THAN_ONE_ORTHO_DEFAULT = 2; + // How many sd away from mean to LCA of orthos. + final static private double THRESHOLD_ULTRA_PARALOGS_DEFAULT = 50; + // How many sd away from mean to LCA of orthos. + final static private double WARN_ONE_ORTHO_DEFAULT = 2; + + // Factor between the two distances to their LCA + // (larger/smaller). + // Factor between the two distances to their LCA + // (larger/smaller). + /** + * Calculates the mean and standard deviation of all nodes of Phylogeny t + * which have a bootstrap values zero or more. Returns null in case of + * failure (e.g t has no bootstrap values, or just one). + *

+ * + * @param t + * reference to a tree with bootstrap values + * @return Array of doubles, [0] is the mean, [1] the standard deviation + */ + private static double[] calculateMeanBoostrapValue( final Phylogeny t ) { + double b = 0; + int n = 0; + long sum = 0; + double x = 0.0, mean = 0.0; + final double[] da = new double[ 2 ]; + final Vector bv = new Vector(); + PhylogenyNode node = null; + PreorderTreeIterator i = null; + i = new PreorderTreeIterator( t ); + // Calculates the mean. + while ( i.hasNext() ) { + node = i.next(); + if ( !( ( node.getParent() != null ) && node.getParent().isRoot() + && ( PhylogenyMethods.getConfidenceValue( node.getParent().getChildNode1() ) > 0 ) + && ( PhylogenyMethods.getConfidenceValue( node.getParent().getChildNode2() ) > 0 ) && ( node + .getParent().getChildNode2() == node ) ) ) { + b = PhylogenyMethods.getConfidenceValue( node ); + if ( b > 0 ) { + sum += b; + bv.addElement( new Double( b ) ); + n++; + } + } + // i.next(); + } + if ( n < 2 ) { + return null; + } + mean = ( double ) sum / n; + // Calculates the standard deviation. + sum = 0; + for( int j = 0; j < n; ++j ) { + b = ( bv.elementAt( j ) ).intValue(); + x = b - mean; + sum += ( x * x ); + } + da[ 0 ] = mean; + da[ 1 ] = java.lang.Math.sqrt( sum / ( n - 1.0 ) ); + return da; + } + + private final static void errorInCommandLine() { + System.out.println( "\nrio: Error in command line.\n" ); + printHelp(); + System.exit( -1 ); + } + + // Uses DistanceCalculator to calculate distances. 
+    /**
+     * Builds the "Distance values" section of the report for the query sequence.
+     *
+     * The first phylogeny in tree_file_for_dist_val is read, its external nodes are
+     * reduced against species_tree via
+     * PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes, and SDIR.infer is run on
+     * it. Distances are then taken from the first tree returned by infer. Which
+     * statistics and warnings are appended depends on the number of names in
+     * al_ortholog_names_for_dc: none (distance to root), exactly one (distances of
+     * query and ortholog to their LCA), or several (distance of query to LCA versus
+     * mean and sd).
+     *
+     * NOTE(review): in this copy of the patch the generic type arguments appear to
+     * have been stripped. "String x = list.get( 0 )" below only compiles with
+     * ArrayList&lt;String&gt;, so that one is restored; the element types of the two
+     * HashMap parameters and of al_ortholog_nodes cannot be determined from here --
+     * confirm against the repository.
+     *
+     * @param tree_file_for_dist_val
+     *            phyloXML file whose first phylogeny is used for the distances
+     * @param outfile
+     *            not read by this method
+     * @param species_tree
+     *            species tree handed to the node deletion and to SDIR.infer
+     * @param seq_name
+     *            name of the query sequence
+     * @param al_ortholog_names_for_dc
+     *            names of the sequences treated as orthologs of the query
+     * @param ortholog_hashmap
+     *            not read by this method
+     * @param super_ortholog_hashmap
+     *            not read by this method
+     * @param warn_more_than_one_ortho
+     *            number of sd used for the warning in the several-orthologs case
+     * @param warn_no_orthos
+     *            number of sd used for the warning in the no-ortholog case
+     * @param warn_one_ortho
+     *            ratio threshold used for the warning in the one-ortholog case
+     * @param bootstraps
+     *            not read by this method
+     * @param t_orthologs_dc
+     *            threshold echoed (rounded) in the first line of the returned text
+     * @return the text to append to the report
+     * @throws IOException
+     *             declared by this method; presumably raised while reading the
+     *             tree file -- confirm against PhylogenyFactory.create
+     */
+    private final static StringBuffer getDistances( final File tree_file_for_dist_val,
+                                                    final File outfile,
+                                                    final Phylogeny species_tree,
+                                                    final String seq_name,
+                                                    final ArrayList<String> al_ortholog_names_for_dc,
+                                                    final HashMap ortholog_hashmap,
+                                                    final HashMap super_ortholog_hashmap,
+                                                    final int warn_more_than_one_ortho,
+                                                    final int warn_no_orthos,
+                                                    final double warn_one_ortho,
+                                                    final int bootstraps,
+                                                    final double t_orthologs_dc ) throws IOException {
+        // Tree read from tree_file_for_dist_val (original remark: "to be a consensus tree").
+        Phylogeny consensus_tree = null;
+        // First tree returned by SDIR.infer; every distance below is measured on it.
+        Phylogeny assigned_cons_tree = null;
+        final SDIR sdiunrooted = new SDIR();
+        final ArrayList al_ortholog_nodes = new ArrayList();
+        double m = 0.0;
+        double sd = 0.0;
+        double d = 0.0;
+        int n = 0;
+        final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
+        consensus_tree = factory.create( tree_file_for_dist_val, new PhyloXmlParser() )[ 0 ];
+        PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, consensus_tree );
+        assigned_cons_tree = sdiunrooted.infer( consensus_tree,
+                                                species_tree,
+                                                rio.MINIMIZE_COST,
+                                                rio.MINIMIZE_DUPS,
+                                                rio.MINIMIZE_HEIGHT,
+                                                true,
+                                                1 )[ 0 ];
+        final DistanceCalculator dc = new DistanceCalculator();
+        final StringBuffer sb = new StringBuffer();
+        sb.append( "Given the threshold for distance calculations (" + ForesterUtil.roundToInt( t_orthologs_dc )
+                + "): " );
+        // No orthologs.
+        if ( al_ortholog_names_for_dc.size() == 0 ) {
+            dc.setTree( assigned_cons_tree );
+            // Remark. Calculation of mean and sd _does_ include the node
+            // with seq_name.
+            m = dc.getMean();
+            sd = dc.getStandardDeviation();
+            d = dc.getDistanceToRoot( seq_name );
+            n = dc.getN();
+            sb.append( "No sequence is considered orthologous to query."
+                    + "\ndistance of query to root = " + ForesterUtil.FORMATTER_06.format( d )
+                    + "\nmean of distances (for all sequences) to root = " + ForesterUtil.FORMATTER_06.format( m )
+                    + "\nsd of distances (for all sequences) to root = " + ForesterUtil.FORMATTER_06.format( sd )
+                    + "\nn (sum of sequences in alignment plus query) = " + n );
+            // Warn when d is not strictly inside ( m - k*sd, m + k*sd ).
+            if ( !( ( ( m - ( warn_no_orthos * sd ) ) < d ) && ( ( m + ( warn_no_orthos * sd ) ) > d ) ) ) {
+                sb.append( "\nWARNING: distance of query to root is outside of mean+/-" + warn_no_orthos + "*sd!" );
+            }
+        }
+        // One ortholog.
+        else if ( al_ortholog_names_for_dc.size() == 1 ) {
+            final String name_of_ortholog = al_ortholog_names_for_dc.get( 0 );
+            al_ortholog_nodes.add( assigned_cons_tree.getNode( name_of_ortholog ) );
+            al_ortholog_nodes.add( assigned_cons_tree.getNode( seq_name ) );
+            dc.setTreeAndExtNodes( assigned_cons_tree, al_ortholog_nodes );
+            // Remark. Calculation of mean _does_ include the node
+            // with seq_name.
+            d = dc.getDistanceToLCA( seq_name );
+            final double d_o = dc.getDistanceToLCA( name_of_ortholog );
+            sb.append( "One sequence is considered orthologous to query." + "\nLCA is LCA of query and its ortholog."
+                    + "\ndistance of query to LCA = " + ForesterUtil.FORMATTER_06.format( d )
+                    + "\ndistance of ortholog to LCA = " + ForesterUtil.FORMATTER_06.format( d_o ) );
+            // Both distances positive: warn when larger/smaller exceeds the threshold.
+            if ( ( d_o > 0.0 )
+                    && ( d > 0.0 )
+                    && ( ( ( d_o >= d ) && ( ( d_o / d ) > warn_one_ortho ) ) || ( ( d_o < d ) && ( ( d / d_o ) > warn_one_ortho ) ) ) ) {
+                sb.append( "\nWARNING: Ratio of distances to LCA is greater than " + warn_one_ortho + "!" );
+            }
+            // Exactly one of the two distances is 0.0: the ratio is undefined.
+            else if ( ( ( d_o == 0.0 ) || ( d == 0.0 ) ) && ( ( d_o != 0.0 ) || ( d != 0.0 ) ) ) {
+                sb.append( "\nWARNING: Ratio could not be calculated, " + " one distance is 0.0!" );
+            }
+        }
+        // More than one ortholog.
+        else {
+            // NOTE(review): this branch looks nodes up via sequence name while the
+            // one-ortholog branch uses getNode( name ) -- confirm which is intended.
+            for( int i = 0; i < al_ortholog_names_for_dc.size(); ++i ) {
+                al_ortholog_nodes.add( assigned_cons_tree.getNodeViaSequenceName( al_ortholog_names_for_dc.get( i ) ) );
+            }
+            al_ortholog_nodes.add( assigned_cons_tree.getNodesViaSequenceName( seq_name ).get( 0 ) );
+            dc.setTreeAndExtNodes( assigned_cons_tree, al_ortholog_nodes );
+            // Remark. Calculation of mean and sd _does_ include the node
+            // with seq_name.
+            m = dc.getMean();
+            sd = dc.getStandardDeviation();
+            d = dc.getDistanceToLCA( seq_name );
+            n = dc.getN();
+            sb.append( "More than one sequence is considered orthologous to query."
+                    + "\nLCA is LCA of query and its orthologs."
+                    + "\ndistance of query to LCA = "
+                    + ForesterUtil.FORMATTER_06.format( d )
+                    + "\nmean of distances (for query and its orthologs) to LCA = "
+                    + ForesterUtil.FORMATTER_06.format( m )
+                    + "\nsd of distances (for query and its orthologs) to LCA = "
+                    + ForesterUtil.FORMATTER_06.format( sd )
+                    + "\nn (sum of orthologs plus query) = " + n );
+            if ( !( ( ( m - ( warn_more_than_one_ortho * sd ) ) < d ) && ( ( m + ( warn_more_than_one_ortho * sd ) ) > d ) ) ) {
+                // NOTE(review): the leading '!' differs from the other warnings; kept as is.
+                sb.append( "\n!WARNING: distance of query to LCA is outside of mean+/-" + warn_more_than_one_ortho
+                        + "*sd!" );
+            }
+        }
+        return sb;
+    }
+
+    /**
+     * Command line entry point.
+     *
+     * Every argument except the bare flag "p" has the form "X=value": the first
+     * character selects the option and the text from index 2 on is its value.
+     * M, N, S and O are mandatory; a sort priority above 2 additionally requires D.
+     * The report is assembled in memory and written to the O file at the end.
+     *
+     * @param args
+     *            the command line arguments
+     */
+    public static void main( final String[] args ) {
+        ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE, E_MAIL, WWW );
+        File species_tree_file = null;
+        File multiple_trees_file = null;
+        File outfile = null;
+        File distance_matrix_file = null;
+        File tree_file_for_dist_val = null;
+        File tree_file_for_avg_bs = null;
+        String seq_name = "";
+        String arg = "";
+        boolean output_ultraparalogs = false;
+        ArrayList<String> orthologs_al_for_dc = null;
+        double t_orthologs = 0.0;
+        double t_sn = 0.0;
+        double t_orthologs_dc = 0.0;
+        double[] bs_mean_sd = null;
+        int sort = 13;
+        Phylogeny species_tree = null;
+        RIO rio_instance = null;
+        PrintWriter out = null;
+        long time = 0;
+        int warn_no_orthos = WARN_NO_ORTHOS_DEFAULT;
+        int warn_more_than_one_ortho = WARN_MORE_THAN_ONE_ORTHO_DEFAULT;
+        double warn_one_ortho = WARN_ONE_ORTHO_DEFAULT;
+        double threshold_ultra_paralogs = THRESHOLD_ULTRA_PARALOGS_DEFAULT;
+        if ( args.length < 2 ) {
+            printHelp();
+            System.exit( 0 );
+        }
+        else if ( ( args.length < 3 ) || ( args.length > 18 ) ) {
+            errorInCommandLine();
+        }
+        for( int i = 0; i < args.length; ++i ) {
+            final String current = args[ i ].trim();
+            if ( current.length() == 0 ) {
+                // An empty argument has no option letter; charAt( 0 ) would throw
+                // outside of the try block below instead of reporting a usage error.
+                errorInCommandLine();
+                continue;
+            }
+            final char option = current.charAt( 0 );
+            if ( option != 'p' ) {
+                // "X=" plus at least one character of value.
+                if ( current.length() < 3 ) {
+                    errorInCommandLine();
+                }
+                else {
+                    arg = current.substring( 2 );
+                }
+            }
+            try {
+                switch ( option ) {
+                    case 'M':
+                        multiple_trees_file = new File( arg );
+                        break;
+                    case 'N':
+                        seq_name = arg;
+                        break;
+                    case 'S':
+                        species_tree_file = new File( arg );
+                        break;
+                    case 'O':
+                        outfile = new File( arg );
+                        break;
+                    case 'D':
+                        distance_matrix_file = new File( arg );
+                        break;
+                    case 'T':
+                        tree_file_for_dist_val = new File( arg );
+                        break;
+                    case 't':
+                        tree_file_for_avg_bs = new File( arg );
+                        break;
+                    case 'p':
+                        output_ultraparalogs = true;
+                        break;
+                    case 'P':
+                        sort = Integer.parseInt( arg );
+                        if ( ( sort < 0 ) || ( sort > 17 ) ) {
+                            errorInCommandLine();
+                        }
+                        break;
+                    case 'L':
+                        t_orthologs = Double.parseDouble( arg );
+                        break;
+                    case 'B':
+                        t_sn = Double.parseDouble( arg );
+                        break;
+                    case 'U':
+                        t_orthologs_dc = Double.parseDouble( arg );
+                        break;
+                    case 'v':
+                        threshold_ultra_paralogs = Double.parseDouble( arg );
+                        break;
+                    case 'X':
+                        warn_more_than_one_ortho = Integer.parseInt( arg );
+                        break;
+                    case 'Y':
+                        warn_no_orthos = Integer.parseInt( arg );
+                        break;
+                    case 'Z':
+                        warn_one_ortho = Double.parseDouble( arg );
+                        break;
+                    default:
+                        errorInCommandLine();
+                }
+            }
+            catch ( final Exception e ) {
+                // Number format problems and the like are reported as usage errors.
+                errorInCommandLine();
+            }
+        }
+        // Compare by content: "==" on strings only tests reference identity.
+        if ( ( seq_name.length() == 0 ) || ( species_tree_file == null ) || ( multiple_trees_file == null )
+                || ( outfile == null ) ) {
+            errorInCommandLine();
+        }
+        if ( ( sort < 0 ) || ( sort > 17 ) ) {
+            errorInCommandLine();
+        }
+        if ( ( sort > 2 ) && ( distance_matrix_file == null ) ) {
+            errorInCommandLine();
+        }
+        if ( VERBOSE ) {
+            System.out.println( "\nMultiple trees file: " + multiple_trees_file );
+            System.out.println( "Seq name: " + seq_name );
+            System.out.println( "Species tree file: " + species_tree_file );
+            System.out.println( "Outfile: " + outfile );
+            if ( distance_matrix_file != null ) {
+                System.out.println( "Distance matrix file: " + distance_matrix_file );
+            }
+            if ( tree_file_for_dist_val != null ) {
+                if ( tree_file_for_avg_bs == null ) {
+                    System.out.println( "Phy to read dists and calc mean support from: " + tree_file_for_dist_val );
+                }
+                else {
+                    System.out.println( "Phylogeny to read dist values from: " + tree_file_for_dist_val );
+                }
+            }
+            if ( tree_file_for_avg_bs != null ) {
+                System.out.println( "Phylogeny to calc mean bootstrap from: " + tree_file_for_avg_bs );
+            }
+            System.out.println( "Sort: " + sort );
+            System.out.println( "Threshold orthologs: " + t_orthologs );
+            System.out.println( "Threshold subtree neighborings: " + t_sn );
+            System.out.println( "Threshold orthologs for distance calc.: " + t_orthologs_dc );
+            if ( output_ultraparalogs ) {
+                System.out.println( "Threshold ultra paralogs: " + threshold_ultra_paralogs );
+            }
+            System.out.println( "More than one ortholog sd diff: " + warn_more_than_one_ortho );
+            System.out.println( "No orthologs sd diff: " + warn_no_orthos );
+            System.out.println( "One ortholog factor : " + warn_one_ortho + "\n" );
+        }
+        if ( TIME && VERBOSE ) {
+            time = System.currentTimeMillis();
+        }
+        try {
+            final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
+            species_tree = factory.create( species_tree_file, new PhyloXmlParser() )[ 0 ];
+        }
+        catch ( final Exception e ) {
+            e.printStackTrace();
+            System.exit( -1 );
+        }
+        if ( !species_tree.isRooted() ) {
+            ForesterUtil.printErrorMessage( PRG_NAME, "Species tree is not rooted" );
+            System.exit( -1 );
+        }
+        if ( !species_tree.isCompletelyBinary() ) {
+            ForesterUtil.printErrorMessage( PRG_NAME, "Species tree is not completely binary" );
+            System.exit( -1 );
+        }
+        rio_instance = new RIO();
+        final StringBuffer output = new StringBuffer();
+        try {
+            if ( distance_matrix_file != null ) {
+                rio_instance.readDistanceMatrix( distance_matrix_file );
+            }
+            // A copy of the species tree is handed over; the original is reused below.
+            rio_instance.inferOrthologs( multiple_trees_file, species_tree.copy(), seq_name );
+            output.append( rio_instance.inferredOrthologsToString( seq_name, sort, t_orthologs, t_sn ) );
+            if ( tree_file_for_dist_val != null ) {
+                orthologs_al_for_dc = rio_instance.inferredOrthologsToArrayList( seq_name, t_orthologs_dc );
+                final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
+                // The mean bootstrap value comes from the t= tree when given,
+                // otherwise from the T= tree.
+                if ( tree_file_for_avg_bs != null ) {
+                    final Phylogeny p = factory.create( tree_file_for_avg_bs, new PhyloXmlParser() )[ 0 ];
+                    bs_mean_sd = calculateMeanBoostrapValue( p );
+                }
+                else {
+                    final Phylogeny p = factory.create( tree_file_for_dist_val, new PhyloXmlParser() )[ 0 ];
+                    bs_mean_sd = calculateMeanBoostrapValue( p );
+                }
+                if ( ( bs_mean_sd != null ) && ( bs_mean_sd.length == 2 ) ) {
+                    final double bs_mean = bs_mean_sd[ 0 ];
+                    final double bs_sd = bs_mean_sd[ 1 ];
+                    output.append( "\n\nMean bootstrap value of consensus tree (sd): "
+                            + ForesterUtil.roundToInt( ( bs_mean * 100.0 ) / rio_instance.getBootstraps() ) + "% (+/-"
+                            + ForesterUtil.roundToInt( ( bs_sd * 100.0 ) / rio_instance.getBootstraps() ) + "%)\n" );
+                }
+                output.append( "\n\nDistance values:\n" );
+                output.append( getDistances( tree_file_for_dist_val,
+                                             outfile,
+                                             species_tree,
+                                             seq_name,
+                                             orthologs_al_for_dc,
+                                             rio_instance.getInferredOrthologs( seq_name ),
+                                             rio_instance.getInferredSuperOrthologs( seq_name ),
+                                             warn_more_than_one_ortho,
+                                             warn_no_orthos,
+                                             warn_one_ortho,
+                                             rio_instance.getBootstraps(),
+                                             t_orthologs_dc ) );
+            }
+            if ( output_ultraparalogs ) {
+                output.append( "\n\nUltra paralogs:\n" );
+                output.append( rio_instance
+                        .inferredUltraParalogsToString( seq_name, sort > 2, threshold_ultra_paralogs ) );
+            }
+            output.append( "\n\nSort priority: " + RIO.getOrder( sort ) );
+            output.append( "\nExt nodes : " + rio_instance.getExtNodesOfAnalyzedGeneTrees() );
+            output.append( "\nSamples : " + rio_instance.getBootstraps() + "\n" );
+            // The output file is only opened once the whole report has been built.
+            out = new PrintWriter( new FileWriter( outfile ), true );
+        }
+        catch ( final Exception e ) {
+            ForesterUtil.printErrorMessage( PRG_NAME, e.getLocalizedMessage() );
+            e.printStackTrace();
+            System.exit( -1 );
+        }
+        out.println( output );
+        out.close();
+        ForesterUtil.programMessage( PRG_NAME, "wrote results to \"" + outfile + "\"" );
+        if ( TIME && VERBOSE ) {
+            time = System.currentTimeMillis() - time;
+            ForesterUtil.programMessage( PRG_NAME, "time: " + time + "ms" );
+        }
+        ForesterUtil.programMessage( PRG_NAME, "OK." );
+        System.exit( 0 );
+    }
+
+    /**
+     * Prints the list of command line options, the sort priority help obtained
+     * from RIO.getOrderHelp(), and an example invocation to standard output.
+     */
+    private final static void printHelp() {
+        System.out.println( "M= (String) Multiple gene tree file (mandatory)" );
+        System.out.println( "N= (String) Query sequence name (mandatory)" );
+        System.out.println( "S= (String) Species tree file (mandatory)" );
+        System.out.println( "O= (String) Output file name -- overwritten without warning! (mandatory)" );
+        System.out.println( "D= (String) Distance matrix file for pairwise distances" );
+        System.out.println( "T= (String) Phylogeny file for distances of query to LCA" );
+        System.out.println( " of orthologs and for mean bootstrap value (if t= is not used)," );
+        System.out.println( " must be binary )" );
+        System.out.println( "t= (String) Phylogeny file for mean bootstrap value (if this option is used," );
+        System.out.println( " the mean bootstrap value is not calculated from the tree read in" );
+        System.out.println( " with T=), not necessary binary" );
+        System.out.println( "p To output ultra paralogs" );
+        System.out.println( "P= (int) Sort priority" );
+        System.out.println( "L= (double) Threshold orthologs for output" );
+        // B= and v= are accepted by main() but were missing from this listing.
+        System.out.println( "B= (double) Threshold subtree neighborings for output" );
+        System.out.println( "U= (double) Threshold orthologs for distance calculation" );
+        System.out.println( "v= (double) Threshold ultra paralogs" );
+        System.out.println( "X= (int) More than one ortholog: " );
+        System.out.println( " numbers of sd the dist. to LCA has to differ from mean to generate a warning" );
+        System.out.println( "Y= (int) No orthologs:" );
+        System.out.println( " Numbers of sd the dist to root has to differ from mean to generate a warning" );
+        System.out.println( "Z= (double) One ortholog:" );
+        System.out.println( " threshold for factor between the two distances to their LCA (larger/smaller)" );
+        System.out.println( " to generate a warning" );
+        System.out.println();
+        System.out.println( " Sort priority (\"P=\"):" );
+        System.out.println( RIO.getOrderHelp().toString() );
+        System.out.println();
+        System.out
+                .println( " Example: \"rio M=gene_trees.xml N=bcl2_NEMVE S=species_tree.xml D=distances P=13 p O=out\"" );
+        System.out.println();
+    }
+}
diff --git a/forester/java/src/org/forester/application/sdi.java b/forester/java/src/org/forester/application/sdi.java
new file mode 100644
index 0000000..08d94fb
--- /dev/null
+++ b/forester/java/src/org/forester/application/sdi.java
@@ -0,0 +1,238 @@
+// $Id:
+// FORESTER -- software libraries and applications
+// for
evolutionary biology research and applications.
+//
+// Copyright (C) 2008-2009 Christian M. Zmasek
+// Copyright (C) 2008-2009 Burnham Institute for Medical Research
+// All rights reserved
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+//
+// Contact: phylosoft @ gmail . com
+// WWW: www.phylosoft.org/forester
+
+package org.forester.application;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+
+import org.forester.io.parsers.phyloxml.PhyloXmlParser;
+import org.forester.io.writers.PhylogenyWriter;
+import org.forester.phylogeny.Phylogeny;
+import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
+import org.forester.phylogeny.factories.PhylogenyFactory;
+import org.forester.sdi.GSDI;
+import org.forester.sdi.SDI;
+import org.forester.sdi.SDIse;
+import org.forester.util.CommandLineArguments;
+import org.forester.util.ForesterUtil;
+
+/**
+ * Command line application that reads one gene tree and one species tree (both
+ * phyloXML), runs either SDIse (default) or GSDI (option -g) on them, writes the
+ * gene tree to an output file and prints summary counts.
+ */
+public final class sdi {
+
+    final static private String STRIP_OPTION             = "s";
+    final static private String GSDI_OPTION              = "g";
+    final static private String MOST_PARSIMONIOUS_OPTION = "m";
+    final static private String HELP_OPTION_1            = "help";
+    final static private String HELP_OPTION_2            = "h";
+    final static private String DEFAULT_OUTFILE          = "sdi_out.xml";
+    final static private String PRG_NAME                 = "sdi";
+    final static private String PRG_VERSION              = "beta 0.4";
+    final static private String PRG_DATE                 = "2009.01.22";
+
+    /**
+     * Command line entry point.
+     *
+     * Expects two or three names after the options: gene tree file, species tree
+     * file and, optionally, the output file (default: DEFAULT_OUTFILE).
+     *
+     * @param args
+     *            the command line arguments
+     */
+    public static void main( final String args[] ) {
+        ForesterUtil.printProgramInformation( sdi.PRG_NAME, sdi.PRG_VERSION, sdi.PRG_DATE );
+        CommandLineArguments cla = null;
+        try {
+            cla = new CommandLineArguments( args );
+        }
+        catch ( final Exception e ) {
+            ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
+        }
+        if ( cla.isOptionSet( sdi.HELP_OPTION_1 ) || cla.isOptionSet( sdi.HELP_OPTION_2 ) ) {
+            System.out.println();
+            sdi.print_help();
+            System.exit( 0 );
+        }
+        else if ( ( args.length < 2 ) || ( cla.getNumberOfNames() < 2 ) || ( cla.getNumberOfNames() > 3 ) ) {
+            System.out.println();
+            System.out.println( "Wrong number of arguments." );
+            System.out.println();
+            sdi.print_help();
+            System.exit( -1 );
+        }
+        final List<String> allowed_options = new ArrayList<String>();
+        allowed_options.add( sdi.STRIP_OPTION );
+        allowed_options.add( sdi.GSDI_OPTION );
+        allowed_options.add( sdi.MOST_PARSIMONIOUS_OPTION );
+        final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
+        if ( dissallowed_options.length() > 0 ) {
+            ForesterUtil.fatalError( sdi.PRG_NAME, "unknown option(s): " + dissallowed_options );
+        }
+        boolean use_sdise = true;
+        boolean strip = false;
+        boolean most_parsimonous_duplication_model = false;
+        // NOTE(review): "strip" is parsed and echoed below but nothing in this
+        // method acts on it -- confirm whether stripping was meant to happen here.
+        if ( cla.isOptionSet( sdi.STRIP_OPTION ) ) {
+            strip = true;
+        }
+        if ( cla.isOptionSet( sdi.GSDI_OPTION ) ) {
+            use_sdise = false;
+        }
+        if ( cla.isOptionSet( sdi.MOST_PARSIMONIOUS_OPTION ) ) {
+            if ( use_sdise ) {
+                ForesterUtil.fatalError( sdi.PRG_NAME, "Can only use most parsimonious duplication mode with GSDI" );
+            }
+            most_parsimonous_duplication_model = true;
+        }
+        Phylogeny species_tree = null;
+        Phylogeny gene_tree = null;
+        File gene_tree_file = null;
+        File species_tree_file = null;
+        File out_file = null;
+        try {
+            gene_tree_file = cla.getFile( 0 );
+            species_tree_file = cla.getFile( 1 );
+            if ( cla.getNumberOfNames() == 3 ) {
+                out_file = cla.getFile( 2 );
+            }
+            else {
+                out_file = new File( sdi.DEFAULT_OUTFILE );
+            }
+        }
+        catch ( final IllegalArgumentException e ) {
+            ForesterUtil.fatalError( sdi.PRG_NAME, "error in command line: " + e.getMessage() );
+        }
+        // The isReadableFile / isWritableFile results are used as error messages
+        // when they are not null.
+        if ( ForesterUtil.isReadableFile( gene_tree_file ) != null ) {
+            ForesterUtil.fatalError( sdi.PRG_NAME, ForesterUtil.isReadableFile( gene_tree_file ) );
+        }
+        if ( ForesterUtil.isReadableFile( species_tree_file ) != null ) {
+            ForesterUtil.fatalError( sdi.PRG_NAME, ForesterUtil.isReadableFile( species_tree_file ) );
+        }
+        if ( ForesterUtil.isWritableFile( out_file ) != null ) {
+            ForesterUtil.fatalError( sdi.PRG_NAME, ForesterUtil.isWritableFile( out_file ) );
+        }
+        try {
+            final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
+            species_tree = factory.create( species_tree_file, new PhyloXmlParser() )[ 0 ];
+        }
+        catch ( final IOException e ) {
+            // Report the file that actually failed (was: gene_tree_file).
+            ForesterUtil.fatalError( sdi.PRG_NAME, "Failed to read species tree from \"" + species_tree_file + "\" ["
+                    + e.getMessage() + "]" );
+        }
+        try {
+            final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
+            gene_tree = factory.create( gene_tree_file, new PhyloXmlParser() )[ 0 ];
+        }
+        catch ( final IOException e ) {
+            ForesterUtil.fatalError( sdi.PRG_NAME, "Failed to read gene tree from \"" + gene_tree_file + "\" ["
+                    + e.getMessage() + "]" );
+        }
+        gene_tree.setRooted( true );
+        species_tree.setRooted( true );
+        if ( !gene_tree.isCompletelyBinary() ) {
+            ForesterUtil.fatalError( sdi.PRG_NAME, "gene tree (\"" + gene_tree_file + "\") is not completely binary." );
+        }
+        // Only the SDIse path insists on a completely binary species tree.
+        if ( use_sdise ) {
+            if ( !species_tree.isCompletelyBinary() ) {
+                ForesterUtil.fatalError( sdi.PRG_NAME, "species tree (\"" + species_tree_file
+                        + "\") is not completely binary." );
+            }
+        }
+        // For timing.
+        // gene_tree = Helper.createBalancedTree( 10 );
+        // species_tree = Helper.createBalancedTree( 13 );
+        // species_tree = Helper.createUnbalancedTree( 1024 );
+        // gene_tree = Helper.createUnbalancedTree( 8192 );
+        // species_tree = gene_tree.copyTree();
+        // gene_tree = species_tree.copyTree();
+        // Helper.numberSpeciesInOrder( species_tree );
+        // Helper.numberSpeciesInOrder( gene_tree );
+        // Helper.randomizeSpecies( 1, 8192, gene_tree );
+        // Helper.intervalNumberSpecies( gene_tree, 4096 );
+        // Helper.numberSpeciesInDescOrder( gene_tree );
+        System.out.println();
+        System.out.println( "Strip species tree: " + strip );
+        // From here on the local "sdi" hides the class name, hence the
+        // unqualified PRG_NAME below.
+        SDI sdi = null;
+        final long start_time = new Date().getTime();
+        try {
+            if ( use_sdise ) {
+                System.out.println();
+                System.out.println( "Using SDIse algorithm." );
+                sdi = new SDIse( gene_tree, species_tree );
+            }
+            else {
+                System.out.println();
+                System.out.println( "Using GSDI algorithm." );
+                System.out.println();
+                System.out.println( "Use most parsimonous duplication model: " + most_parsimonous_duplication_model );
+                sdi = new GSDI( gene_tree, species_tree, most_parsimonous_duplication_model );
+            }
+        }
+        catch ( final Exception e ) {
+            ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() );
+        }
+        System.out.println();
+        System.out.println( "Running time (excluding I/O): " + ( new Date().getTime() - start_time ) + "ms" );
+        try {
+            final PhylogenyWriter writer = new PhylogenyWriter();
+            writer.toPhyloXML( out_file, gene_tree, 1 );
+        }
+        catch ( final IOException e ) {
+            ForesterUtil.fatalError( PRG_NAME, "Failed to write to \"" + out_file + "\" [" + e.getMessage() + "]" );
+        }
+        System.out.println();
+        System.out.println( "Successfully wrote resulting gene tree to: " + out_file );
+        System.out.println();
+        if ( use_sdise ) {
+            // Computed once; the original called computeMappingCostL() twice and
+            // discarded the first result.
+            System.out.println( "Mapping cost : " + sdi.computeMappingCostL() );
+        }
+        System.out.println( "Number of duplications : " + sdi.getDuplicationsSum() );
+        if ( !use_sdise && !most_parsimonous_duplication_model ) {
+            System.out.println( "Number of potential duplications: "
+                    + ( ( GSDI ) sdi ).getSpeciationOrDuplicationEventsSum() );
+        }
+        if ( !use_sdise ) {
+            System.out.println( "Number speciations : " + ( ( GSDI ) sdi ).getSpeciationsSum() );
+        }
+        System.out.println();
+    } // main( final String args[] )
+
+    /**
+     * Prints usage, the available options and the expected input formats to
+     * standard output.
+     */
+    private static void print_help() {
+        // main() requires the gene tree file and the species tree file, in that
+        // order, before the optional outfile; the usage line now says so.
+        System.out.println( "Usage: \"" + sdi.PRG_NAME
+                + " [-options] gene_tree_file species_tree_file [outfile]\"" );
+        System.out.println();
+        System.out.println( "Options:" );
+        System.out.println( " -" + sdi.STRIP_OPTION + ": to strip the species tree prior to duplication inference" );
+        System.out.println( " -" + sdi.GSDI_OPTION
+                + ": to use GSDI algorithm instead of SDIse algorithm (under development, not recommended)" );
+        System.out
+                .println( " -" + sdi.MOST_PARSIMONIOUS_OPTION + ": use most parsimonious duplication model for GSDI: " );
+        System.out.println( " assign nodes as speciations which would otherwise be assigned" );
+        System.out.println( " as unknown because of polytomies in the species tree" );
+        System.out.println();
+        System.out.println( "Species tree:" );
+        System.out.println( " In phyloXML format, with taxonomy data in appropriate fields." );
+        System.out.println();
+        System.out.println( "Gene tree:" );
+        System.out.println( " In phyloXML format, with taxonomy and sequence data in appropriate fields." );
+        System.out.println();
+        System.out
+                .println( "!! WARNING: GSDI algorithm is under development (and possibly not correct), please use SDIse instead !!" );
+        System.out.println();
+    }
+}
diff --git a/forester/java/src/org/forester/application/sdi_dir.java b/forester/java/src/org/forester/application/sdi_dir.java
new file mode 100644
index 0000000..fddb6e4
--- /dev/null
+++ b/forester/java/src/org/forester/application/sdi_dir.java
@@ -0,0 +1,460 @@
+// $Id:
+// FORESTER -- software libraries and applications
+// for evolutionary biology research and applications.
+//
+// Copyright (C) 2008-2009 Christian M.
Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// Copyright (C) 2000-2001 Washington University School of Medicine +// and Howard Hughes Medical Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.application; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.Arrays; + +import org.forester.io.parsers.phyloxml.PhyloXmlParser; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.sdi.SDIR; +import org.forester.sdi.SDIse; +import org.forester.util.ForesterUtil; + +/* + * Allows to infer duplications - speciations on all (rooted or unrooted) gene + * trees in a directory by using method "infer" of class SDIunrooted.

The + * output of this is a (re)rooted tree with speciation - duplication assigned + * for each tree (in "gene tree directory" with suffix "suffix for gene trees"), + * as well as a summary list ("outputfile name").

The summary list contains + * the following. The number in brackets indicates how many external nodes of + * the gene tree had to be removed since the associated species was not found in + * the species tree. "en" indicates the number of external nodes in the + * resulting (analyzed and returned) gene tree. "d" are the number of + * duplications, "L=" the mapping cost, "h=" the height, "d=" the minimal + * difference in tree heights (of the two subtrees at the root; this number is + * 0.0 for a midpoint rooted tree) of the resulting, analyzed and rooted gene + * tree(s).

The output file ending with "_Sdist" is a file which lists the
+ * distribution of trees sizes, "_Ddist" lists the distribution of the sums of
+ * duplications (up to a certain maximal size, set with final variables
+ * MAX_EXT_NODE_DIST and MAX_DUP_DIST).
+ *
+ * @see SDIunrooted
+ *
+ * @author Christian M. Zmasek
+ */
+public class sdi_dir {
+
+    final static private String E_MAIL      = "czmasek@burnham.org";
+    final static private String WWW         = "www.phylosoft.org";
+    final static private String PRG_NAME    = "sdi_dir";
+    final static private String PRG_VERSION = "2.00";
+    final static private String PRG_DATE    = "2010.04.26";
+
+    /**
+     * Prints an error banner followed by the usage text to standard output and
+     * terminates the JVM with exit status -1.
+     */
+    private static void errorInCommandLine() {
+        System.out.println( "\nsdi_dir: Error in command line.\n" );
+        // NOTE(review): the usage line seems to have lost its argument
+        // placeholders in this copy of the patch; the text is kept exactly as found.
+        System.out.print( "Usage: % sdi_dir [-options] " );
+        // Everything after the usage prefix is printed one entry per line.
+        final String[] usage_lines = {
+                " ",
+                "\nOptions:",
+                " -l to root by minimizing the mapping cost L (and also the sum of duplications)",
+                " -d to root by minimizing the sum of duplications",
+                " -h to root by minimizing tree height (can be used together with -l or -d)",
+                " -w to write assigned gene trees into output directory",
+                "\nGene tree directory",
+                " The directory from which to read phyloXML formatted gene trees which",
+                " contain taxonomic information in appropriate sub-elements of taxonomy",
+                " (see: www.phyloxml.org).",
+                " The gene trees can either be rooted, in which case no rooting with -l, -d, or -h ",
+                " is necessary, or they can be unrooted, in which case rooting is mandatory.",
+                "\nSuffix for gene trees",
+                " Suffix of the gene trees to analyze (e.g. \".phyloxml\").",
+                "\nSpecies tree file",
+                " In phyloXML format, taxonomic information in appropriate sub-elements of taxonomy.",
+                " (see: www.phyloxml.org).",
+                "\nOutput directory",
+                " The directory into which the assigned gene trees will be written.",
+                "\nOutputfile name",
+                " File name for summary output files.",
+                "" };
+        for( int k = 0; k < usage_lines.length; ++k ) {
+            System.out.println( usage_lines[ k ] );
+        }
+        System.exit( -1 );
+    }
+
+    /**
+     * Runs method "infer" of class SDIunrooted on all gene trees in directory
+     * indir.
+     *

+ * Trees are rooted by minimizing either the sum of duplications, the + * mapping cost L, or the tree height (or combinations thereof). One + * resulting tree for each (out of possibly many) is stored in outdir and a + * summary outfile is created. The distributions of the tree sizes (name of + * outfile + _Sdist) and the distributions of the sum of duplications per + * tree (name of outfile + _Ddist) are written out as well. + *

+ * If both minimize_sum_of_dup and minimize_mapping_cost are true, trees are + * rooted by minimizing the mapping cost L. + *

+ * If minimize_sum_of_dup, minimize_mapping_cost, and minimize_height are + * false, trees are assumed to be already rooted. + *

+ * (Last modified: 02/02/01) + * + * @see SDIR#infer(Phylogeny,Phylogeny,boolean,boolean,boolean,boolean,int,boolean) + * @param indir + * a directory containing gene trees in NHX format + * @param species_tree_file + * a species tree file in NHX format + * @param outdir + * a directory where to write trees + * @param outfile + * a file name for the summary file + * @param suffix + * a suffix for the trees to read (e.g. nhx), is case sensitive + * @param write_trees + * set to true to write out one tree with minmal duplications or + * L each + * @param minimize_mapping_cost + * set to true to root by minimizing the mapping cost L + * @param minimize_sum_of_dup + * set to true to root by minimizing the sum of duplications + * @param minimize_height + * set to true to root by minimizing the tree height -- if + * minimize_mapping_cost is set to true or minimize_sum_of_dup is + * set to true, then out of the resulting trees with minimal + * mapping cost or minimal number of duplications the tree with + * the minimal height is chosen + */ + public static void infer( final File indir, + final File species_tree_file, + final File outdir, + final File outfile, + String suffix, + final boolean write_trees, + final boolean minimize_mapping_cost, + boolean minimize_sum_of_dup, + final boolean minimize_height ) throws IOException { + final int MIN_EXT_NODES = 4; // Minimal size of trees [in ext nodes] + // to be analyzed. + final int MAX_EXT_NODES = 5000; // Maximal size of trees [in ext nodes] + // to be analyzed. + final int MAX_DUP_DIST = 50; // Max number of dups to output in dup + // distribution ("_Ddist"). + final int MAX_EXT_NODE_DIST = 1000; // Max number of ext nodes to output + // in size + // distribution ("_Sdist"). 
+ int successful = 0, number_of_too_small_trees = 0, number_of_too_large_trees = 0, dups = 0, c = 0, ext_nodes = 0, removed = 0; + final int nodecount0 = 0; + int j = 0; + long total_number_of_d = 0, total_number_of_ext_nodes = 0, sum_costs = 0; + double sum_tree_heights = 0.0, sum_subtree_diff = 0.0; + Phylogeny species_tree = null; + String filename = null; + String[] filenames = null; + Phylogeny[] trees = null; + final int[] duplications = new int[ MAX_EXT_NODES - 1 ], // For dup + // distribution. + sizes = new int[ MAX_EXT_NODES - 1 ]; // For ext nodes dist.of + // successfully assigned trees. + File outtree = null; + PrintWriter out = null, out_ddist = null, out_sdist = null; + final File ddist_outfile = new File( outfile + "_Ddist" ), sdist_outfile = new File( outfile + "_Sdist" ); + final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.0#####" ); + df.setDecimalSeparatorAlwaysShown( true ); + if ( !indir.exists() || !indir.isDirectory() ) { + throw new IllegalArgumentException( indir + " does not exist or is not a directory." ); + } + if ( !outdir.exists() || !outdir.isDirectory() ) { + throw new IllegalArgumentException( outdir + " does not exist or is not a directory." ); + } + if ( outfile.exists() ) { + throw new IllegalArgumentException( outfile + " does already exist." ); + } + if ( ddist_outfile.exists() ) { + throw new IllegalArgumentException( ddist_outfile + " does already exist." ); + } + if ( sdist_outfile.exists() ) { + throw new IllegalArgumentException( sdist_outfile + " does already exist." ); + } + if ( !species_tree_file.exists() || !species_tree_file.isFile() ) { + throw new IllegalArgumentException( species_tree_file + " does not exist or is not a file." 
); + } + if ( minimize_mapping_cost && minimize_sum_of_dup ) { + minimize_sum_of_dup = false; + } + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + species_tree = factory.create( species_tree_file, new PhyloXmlParser() )[ 0 ]; + filenames = indir.list(); + Arrays.sort( filenames ); + suffix = suffix.trim(); + out = new PrintWriter( new FileWriter( outfile ), true ); + //nodecount0 = PhylogenyNode.getNodeCount(); + for( int i = 0; i < filenames.length; ++i ) { + filename = filenames[ i ]; + if ( ( suffix.length() < 1 ) || filename.endsWith( suffix ) ) { + final File gene_tree_file = new File( indir.getPath(), filename ); + if ( gene_tree_file.exists() && gene_tree_file.isFile() ) { + out.print( j + "\t" + filename ); + System.out.println( j + ": " + filename ); + j++; + Phylogeny gene_tree = null; + gene_tree = factory.create( gene_tree_file, new PhyloXmlParser() )[ 0 ]; + // Removes from gene_tree all species not found in + // species_tree. + removed = PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gene_tree ); + if ( filename.length() < 8 ) { + out.print( "\t\t\t[-" + removed + "]" ); + } + else if ( filename.length() < 16 ) { + out.print( "\t\t[-" + removed + "]" ); + } + else { + out.print( "\t[-" + removed + "]" ); + } + if ( gene_tree.getNumberOfExternalNodes() < MIN_EXT_NODES ) { + out.print( "\t<" + MIN_EXT_NODES + "en\n" ); + number_of_too_small_trees++; + } + else if ( gene_tree.getNumberOfExternalNodes() > MAX_EXT_NODES ) { + out.print( "\t>" + MAX_EXT_NODES + "en\n" ); + number_of_too_large_trees++; + } + else { + SDIR sdiunrooted = null; + // PhylogenyNode.setNodeCount( nodecount0 ); + sdiunrooted = new SDIR(); + if ( minimize_mapping_cost || minimize_sum_of_dup || minimize_height ) { + trees = sdiunrooted.infer( gene_tree, + species_tree, + minimize_mapping_cost, + minimize_sum_of_dup, + minimize_height, + write_trees, + 1 ); + dups = sdiunrooted.getMinimalDuplications(); + } + else { + final SDIse 
sdi = new SDIse( gene_tree, species_tree ); + trees = new Phylogeny[ 1 ]; + trees[ 0 ] = gene_tree; + dups = sdi.getDuplicationsSum(); + c = sdi.computeMappingCostL(); + sum_costs += c; + out.print( "\t L=" + c ); + } + successful++; + ext_nodes = gene_tree.getNumberOfExternalNodes(); + total_number_of_ext_nodes += ext_nodes; + sizes[ ext_nodes ]++; + out.print( "\t " + ext_nodes + "en" ); + total_number_of_d += dups; + duplications[ dups ]++; + out.print( "\t " + dups + "d" ); + if ( minimize_mapping_cost ) { + c = sdiunrooted.getMinimalMappingCost(); + sum_costs += c; + out.print( "\t L=" + c ); + } + if ( minimize_height ) { + out.print( "\t h=" + df.format( sdiunrooted.getMinimalTreeHeight() ) ); + out.print( "\t d=" + df.format( sdiunrooted.getMinimalDiffInSubTreeHeights() ) ); + sum_tree_heights += sdiunrooted.getMinimalTreeHeight(); + sum_subtree_diff += sdiunrooted.getMinimalDiffInSubTreeHeights(); + } + out.println(); + if ( write_trees ) { + outtree = new File( outdir, new File( filenames[ i ] ).getName() ); + final PhylogenyWriter writer = new PhylogenyWriter(); + writer.toPhyloXML( outtree, trees[ 0 ], 1 ); + } + } + } + } + } + //PhylogenyNode.setNodeCount( nodecount0 ); + if ( minimize_mapping_cost ) { + out.println( "\nRooted by minimizing mapping cost L." ); + System.out.println( "\nRooted by minimizing mapping cost L." ); + if ( minimize_height ) { + out.println( "Selected tree(s) with minimal height out of resulting trees." ); + System.out.println( "Selected tree(s) with minimal height out of resulting trees." ); + } + } + else if ( minimize_sum_of_dup ) { + out.println( "\nRooted by minimizing sum of duplications." ); + System.out.println( "\nRooted by minimizing sum of duplications." ); + if ( minimize_height ) { + out.println( "Selected tree(s) with minimal height out of resulting trees." ); + System.out.println( "Selected tree(s) with minimal height out of resulting trees." 
); + } + } + else if ( minimize_height ) { + out.println( "\nRooted by minimizing tree height." ); + System.out.println( "\nRooted by minimizing tree height." ); + } + else { + out.println( "\nNo (re) rooting was performed." ); + System.out.println( "\nNo (re) rooting was performed." ); + } + out.println( "\nTrees directory : " + indir ); + out.println( "Suffix for trees : " + suffix ); + out.println( "Species tree : " + species_tree_file ); + out.println( "Output directory : " + outdir ); + out.println( "Output file : " + outfile ); + out.println( "\nTotal number of attempts (tree files read) : " + j ); + out.println( "Total number of successfully assigned trees : " + successful ); + out.println( "Number of too small trees : " + number_of_too_small_trees ); + out.println( "Number of too large trees : " + number_of_too_large_trees ); + out.println( "\nSum of duplications : " + total_number_of_d ); + if ( minimize_mapping_cost ) { + out.println( "Sum of mapping costs L : " + sum_costs ); + } + if ( minimize_height ) { + out.println( "Sum of tree heights : " + sum_tree_heights ); + out.println( "Sum of differences in subtree heights : " + sum_subtree_diff ); + } + out.println( "Sum of external nodes (in successfully assigned trees): " + total_number_of_ext_nodes ); + out.close(); + System.out.println( "\nTotal number of attempts (tree files read) : " + j ); + System.out.println( "Total number of successfully assigned trees : " + successful ); + System.out.println( "Number of too small trees : " + number_of_too_small_trees ); + System.out.println( "Number of too large trees : " + number_of_too_large_trees ); + System.out.println( "\nSum of duplications : " + total_number_of_d ); + if ( minimize_mapping_cost ) { + System.out.println( "Sum of mapping costs L : " + sum_costs ); + } + if ( minimize_height ) { + System.out.println( "Sum of tree heights : " + sum_tree_heights ); + System.out.println( "Sum of differences in subtree heights : " + sum_subtree_diff ); + } + 
System.out.println( "Sum of external nodes (in successfully assigned trees): " + total_number_of_ext_nodes ); + out_ddist = new PrintWriter( new FileWriter( ddist_outfile ), true ); + for( int i = 0; ( i < duplications.length ) && ( i <= MAX_DUP_DIST ); ++i ) { + out_ddist.println( i + " " + duplications[ i ] ); + } + out_ddist.close(); + out_sdist = new PrintWriter( new FileWriter( sdist_outfile ), true ); + for( int i = 0; ( i < sizes.length ) && ( i <= MAX_EXT_NODE_DIST ); ++i ) { + out_sdist.println( i + " " + sizes[ i ] ); + } + out_sdist.close(); + } // infer + + /** + * Main method for this class. + *

+ * (Last modified: 04/26/10) + * + * @param [args[0] + * -l to root by minimizing mapping cost L] + * @param [args[0] + * -d to root by minimizing sum of duplications] + * @param [args[0] + * -w to write out trees into outdir] + * @param [args[0] + * -h to root by minimizing tree height] + * @param [args[0] + * -n input trees are in New Hampshire format instead of NHX -- + * or gene tree is in NHX, but species information in gene tree + * is only present in the form of SWISS-PROT sequence names] + * @param args[0or1] + * trees directory name + * @param args[1or2] + * suffix for gene trees + * @param args[2or3] + * speciestree file name + * @param args[3or4] + * output directory name + * @param args[4or5] + * output file name + */ + public static void main( final String args[] ) { + ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE, E_MAIL, WWW ); + // These are the default values. + boolean minimize_mapping_cost = false; + boolean minimize_sum_of_dup = false; + boolean minimize_height = false; + boolean write_trees = false; + File indir = null; + File speciestree_file = null; + File outdir = null; + File outfile = null; + String suffix = null; + if ( args.length == 5 ) { + indir = new File( args[ 0 ] ); + suffix = args[ 1 ]; + speciestree_file = new File( args[ 2 ] ); + outdir = new File( args[ 3 ] ); + outfile = new File( args[ 4 ] ); + } + else if ( args.length == 6 ) { + if ( args[ 0 ].startsWith( "-" ) ) { + minimize_mapping_cost = false; + minimize_sum_of_dup = false; + minimize_height = false; + write_trees = false; + if ( args[ 0 ].toLowerCase().indexOf( "w" ) != -1 ) { + write_trees = true; + } + if ( args[ 0 ].toLowerCase().indexOf( "l" ) != -1 ) { + minimize_mapping_cost = true; + } + if ( args[ 0 ].toLowerCase().indexOf( "d" ) != -1 ) { + minimize_sum_of_dup = true; + } + if ( args[ 0 ].toLowerCase().indexOf( "h" ) != -1 ) { + minimize_height = true; + } + } + else { + sdi_dir.errorInCommandLine(); + } + indir = new File( args[ 1 ] 
); + suffix = args[ 2 ]; + speciestree_file = new File( args[ 3 ] ); + outdir = new File( args[ 4 ] ); + outfile = new File( args[ 5 ] ); + } + else { + sdi_dir.errorInCommandLine(); + } + if ( minimize_mapping_cost && minimize_sum_of_dup ) { + minimize_sum_of_dup = false; + } + try { + sdi_dir.infer( indir, + speciestree_file, + outdir, + outfile, + suffix, + write_trees, + minimize_mapping_cost, + minimize_sum_of_dup, + minimize_height ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( PRG_NAME, "error: " + e.getLocalizedMessage() ); + } + ForesterUtil.programMessage( PRG_NAME, "OK." ); + } +} diff --git a/forester/java/src/org/forester/application/sdi_r.java b/forester/java/src/org/forester/application/sdi_r.java new file mode 100644 index 0000000..258e0f4 --- /dev/null +++ b/forester/java/src/org/forester/application/sdi_r.java @@ -0,0 +1,245 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org + +package org.forester.application; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; + +import org.forester.io.parsers.PhylogenyParser; +import org.forester.io.parsers.phyloxml.PhyloXmlParser; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.sdi.SDIR; +import org.forester.util.CommandLineArguments; +import org.forester.util.ForesterUtil; + +public class sdi_r { + + final static private String HELP_OPTION_1 = "help"; + final static private String HELP_OPTION_2 = "h"; + final static private String MIN_MAPPING_COST_OPTION = "ml"; + final static private String MIN_DUPS_OPTION = "md"; + final static private String MIN_HEIGHT_OPTION = "mh"; + final static private String PRG_NAME = "sdi_r"; + final static private String PRG_VERSION = "1.11"; + final static private String PRG_DATE = "2009.06.19"; + final static private String E_MAIL = "czmasek@burnham.org"; + final static private String WWW = "www.phylosoft.org"; + // How many resulting trees "main" should return/display. 
+ private final static int TREES_TO_RETURN = 5; + + public static void main( final String args[] ) { + ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE, E_MAIL, WWW ); + CommandLineArguments cla = null; + try { + cla = new CommandLineArguments( args ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); + } + if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) || ( args.length == 0 ) ) { + printHelp(); + System.exit( 0 ); + } + if ( ( args.length < 3 ) || ( cla.getNumberOfNames() != 2 ) ) { + System.out.println(); + System.out.println( "[" + PRG_NAME + "] incorrect number of arguments" ); + System.out.println(); + printHelp(); + System.exit( -1 ); + } + final List allowed_options = new ArrayList(); + allowed_options.add( MIN_MAPPING_COST_OPTION ); + allowed_options.add( MIN_DUPS_OPTION ); + allowed_options.add( MIN_HEIGHT_OPTION ); + final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); + if ( dissallowed_options.length() > 0 ) { + ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options ); + } + final File outfile = new File( "sdir_outfile.xml" ); + if ( outfile.exists() ) { + ForesterUtil.fatalError( PRG_NAME, "outfile \"" + outfile + "\" already exists" ); + } + final File gene_tree_file = cla.getFile( 0 ); + final File species_tree_file = cla.getFile( 1 ); + boolean minimize_cost = false; + if ( cla.isOptionSet( MIN_MAPPING_COST_OPTION ) ) { + minimize_cost = true; + } + boolean minimize_sum_of_dup = false; + if ( cla.isOptionSet( MIN_DUPS_OPTION ) ) { + minimize_sum_of_dup = true; + } + boolean minimize_height = false; + if ( cla.isOptionSet( MIN_HEIGHT_OPTION ) ) { + minimize_height = true; + } + int r = 0; + Phylogeny[] gene_trees = null; + Phylogeny species_tree = null; + if ( minimize_cost && minimize_sum_of_dup ) { + minimize_sum_of_dup = false; + } + final PhylogenyFactory factory = 
ParserBasedPhylogenyFactory.getInstance(); + try { + final PhylogenyParser pp = new PhyloXmlParser(); + species_tree = factory.create( species_tree_file, pp )[ 0 ]; + } + catch ( final IOException e ) { + ForesterUtil.fatalError( PRG_NAME, "failed to read species tree [" + species_tree_file + "]: " + + e.getLocalizedMessage() ); + } + if ( !species_tree.isRooted() ) { + ForesterUtil.fatalError( PRG_NAME, "species tree [" + species_tree_file + "] is not rooted" ); + } + try { + final PhylogenyParser pp = new PhyloXmlParser(); + gene_trees = factory.create( gene_tree_file, pp ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( PRG_NAME, "failed to read gene trees [" + gene_tree_file + "]: " + + e.getLocalizedMessage() ); + } + // Removes from gene_tree all species not found in species_tree. + int gene_tree_counter = 0; + final List all_result_trees = new ArrayList(); + for( final Phylogeny gene_tree : gene_trees ) { + r = PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gene_tree ); + ForesterUtil.programMessage( PRG_NAME, "Removed " + r + " external nodes from gene tree" ); + final SDIR sdiunrooted = new SDIR(); + final long start_time = new Date().getTime(); + Phylogeny[] result_trees = null; + try { + result_trees = sdiunrooted.infer( gene_tree, + species_tree, + minimize_cost, + minimize_sum_of_dup, + minimize_height, + true, + sdi_r.TREES_TO_RETURN ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() ); + } + final long time_req = new Date().getTime() - start_time; + if ( minimize_cost ) { + ForesterUtil.programMessage( PRG_NAME, "Rooted by minimizing mapping cost L" ); + if ( minimize_height ) { + ForesterUtil.programMessage( PRG_NAME, + "Selected tree(s) with minimal height out of resulting trees" ); + } + ForesterUtil.programMessage( PRG_NAME, "Number differently rooted trees minimizing criterion : " + + sdiunrooted.getCount() ); + ForesterUtil.programMessage( PRG_NAME, 
"Minimal cost : " + + sdiunrooted.getMinimalMappingCost() ); + ForesterUtil.programMessage( PRG_NAME, "Minimal duplications : " + + sdiunrooted.getMinimalDuplications() ); + if ( minimize_height ) { + ForesterUtil.programMessage( PRG_NAME, "Phylogeny height : " + + ForesterUtil.FORMATTER_06.format( sdiunrooted.getMinimalTreeHeight() ) ); + ForesterUtil.programMessage( PRG_NAME, "Difference in subtree heights : " + + ForesterUtil.FORMATTER_06.format( sdiunrooted.getMinimalDiffInSubTreeHeights() ) ); + } + } + else if ( minimize_sum_of_dup ) { + ForesterUtil.programMessage( PRG_NAME, "Rooted by minimizing sum of duplications" ); + if ( minimize_height ) { + ForesterUtil.programMessage( PRG_NAME, + "Selected tree(s) with minimal height out of resulting trees" ); + } + ForesterUtil.programMessage( PRG_NAME, "Number differently rooted trees minimizing criterion : " + + sdiunrooted.getCount() ); + ForesterUtil.programMessage( PRG_NAME, "Minimal duplications : " + + sdiunrooted.getMinimalDuplications() ); + if ( minimize_height ) { + ForesterUtil.programMessage( PRG_NAME, + "Phylogeny height : " + + ForesterUtil.FORMATTER_06.format( sdiunrooted + .getMinimalTreeHeight() ) ); + ForesterUtil.programMessage( PRG_NAME, + "Difference in subtree heights : " + + ForesterUtil.FORMATTER_06.format( sdiunrooted + .getMinimalDiffInSubTreeHeights() ) ); + } + } + else if ( minimize_height ) { + ForesterUtil.programMessage( PRG_NAME, "Rooted by minimizing tree height (midpoint rooting)." 
); + ForesterUtil.programMessage( PRG_NAME, "Minimal tree height : " + + ForesterUtil.FORMATTER_06.format( sdiunrooted.getMinimalTreeHeight() ) ); + ForesterUtil.programMessage( PRG_NAME, "Minimal difference in subtree heights: " + + ForesterUtil.FORMATTER_06.format( sdiunrooted.getMinimalDiffInSubTreeHeights() ) ); + ForesterUtil.programMessage( PRG_NAME, "Duplications in midpoint rooted tree : " + + sdiunrooted.getMinimalDuplications() ); + } + else { + ForesterUtil.programMessage( PRG_NAME, "No (re) rooting was performed." ); + ForesterUtil.programMessage( PRG_NAME, "Duplications in tree: " + sdiunrooted.getMinimalDuplications() ); + } + ForesterUtil.programMessage( PRG_NAME, "Time requirement (minus I/O) : " + + time_req + "ms" ); + for( int i = 0; i < result_trees.length; ++i ) { + final String name = result_trees[ i ].getName(); + if ( ForesterUtil.isEmpty( name ) ) { + result_trees[ i ].setName( "SDIR result [gene tree + " + gene_tree_counter + "]" + " " + i ); + } + else { + result_trees[ i ].setName( name + " SDIR result [gene tree + " + gene_tree_counter + "]" + " " + i ); + } + all_result_trees.add( result_trees[ i ] ); + } + ++gene_tree_counter; + } // for( final Phylogeny gene_tree : gene_trees ) + try { + final PhylogenyWriter w = new PhylogenyWriter(); + w.toPhyloXML( outfile, all_result_trees, 0, ForesterUtil.LINE_SEPARATOR ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( PRG_NAME, "failure to write output to [" + outfile + "]: " + + e.getLocalizedMessage() ); + } + ForesterUtil.programMessage( PRG_NAME, "Wrote: " + outfile ); + ForesterUtil.programMessage( PRG_NAME, "OK." 
); + } + + private static void printHelp() { + System.out.println( "Usage: " + PRG_NAME + + " \"" ); + System.out.println( "\nOptions:" ); + System.out.println( " -" + MIN_MAPPING_COST_OPTION + + " to root by minimizing the mapping cost L (and also the sum of duplications)" ); + System.out.println( " -" + MIN_DUPS_OPTION + " to root by minimizing the sum of duplications" ); + System.out.println( " -" + MIN_HEIGHT_OPTION + + " to root by minimizing tree height (can be used together with -" + MIN_MAPPING_COST_OPTION + " or -" + + MIN_DUPS_OPTION + ")" ); + System.out.println( "" ); + } +} diff --git a/forester/java/src/org/forester/application/shin.java b/forester/java/src/org/forester/application/shin.java new file mode 100644 index 0000000..5e03b47 --- /dev/null +++ b/forester/java/src/org/forester/application/shin.java @@ -0,0 +1,194 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.application; + +import java.io.File; +import java.io.FilenameFilter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.forester.io.parsers.phyloxml.PhyloXmlParser; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.sdi.Shin; +import org.forester.util.CommandLineArguments; +import org.forester.util.ForesterUtil; + +public class shin { + + final static private String HELP_OPTION_1 = "help"; + final static private String HELP_OPTION_2 = "h"; + final static private String DEFAULT_OUTFILE = "out"; + final static private String PRG_NAME = "shin"; + final static private String PRG_VERSION = "0.001 alpha"; + final static private String PRG_DATE = "2009.10.14"; + + public static void main( final String args[] ) { + ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE ); + System.out.println(); + CommandLineArguments cla = null; + try { + cla = new CommandLineArguments( args ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); + } + if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) ) { + System.out.println(); + print_help(); + System.exit( 0 ); + } + else if ( ( args.length != 3 ) ) { + System.out.println(); + System.out.println( "wrong number of arguments" ); + System.out.println(); + print_help(); + System.exit( -1 ); + } + final List allowed_options = new ArrayList(); + final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); + if ( dissallowed_options.length() > 0 ) { + ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options ); + } + File gene_trees_dir = null; + File species_trees_file = null; + //File out_file = null; + File out_dir = null; + Phylogeny[] 
species_trees = null; + try { + gene_trees_dir = cla.getFile( 0 ); + species_trees_file = cla.getFile( 1 ); + out_dir = cla.getFile( 2 ); + } + catch ( final IllegalArgumentException e ) { + ForesterUtil.fatalError( PRG_NAME, "error in command line: " + e.getMessage() ); + } + if ( ForesterUtil.isReadableFile( species_trees_file ) != null ) { + ForesterUtil.fatalError( PRG_NAME, ForesterUtil.isReadableFile( species_trees_file ) ); + } + if ( !gene_trees_dir.isDirectory() || !gene_trees_dir.canRead() ) { + ForesterUtil.fatalError( PRG_NAME, "cannot read gene trees from [" + gene_trees_dir + "]" ); + } + // if ( ForesterUtil.isWritableFile( out_file ) != null ) { + // ForesterUtil.fatalError( PRG_NAME, ForesterUtil.isWritableFile( out_file ) ); + // } + if ( !out_dir.exists() ) { + boolean success = false; + try { + success = out_dir.mkdir(); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( PRG_NAME, "failed to create [" + out_dir + "] [" + e.getMessage() + "]" ); + } + if ( !success ) { + ForesterUtil.fatalError( PRG_NAME, "failed to create [" + out_dir + "]" ); + } + } + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + species_trees = factory.create( species_trees_file, new PhyloXmlParser() ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( PRG_NAME, "failed to read species trees from [" + species_trees_file + "] [" + + e.getMessage() + "]" ); + } + if ( ( species_trees == null ) || ( species_trees.length < 1 ) ) { + ForesterUtil.fatalError( PRG_NAME, "failed to read species trees from [" + species_trees_file + "]" ); + } + ForesterUtil.programMessage( PRG_NAME, "read in " + species_trees.length + " species trees from [" + + species_trees_file + "]" ); + final FilenameFilter filter = new FilenameFilter() { + + public boolean accept( final File dir, final String name ) { + return ( !name.startsWith( "." 
) && !name.startsWith( "00_" ) && name.endsWith( ".xml" ) ); + } + }; + final String[] gene_tree_names = gene_trees_dir.list( filter ); + Arrays.sort( gene_tree_names ); + final List gene_tree_files = new ArrayList(); + for( final String gene_tree_name : gene_tree_names ) { + final File gene_tree_file = new File( gene_trees_dir + ForesterUtil.FILE_SEPARATOR + gene_tree_name ); + if ( !gene_tree_file.isDirectory() ) { + gene_tree_files.add( gene_tree_file ); + } + } + ForesterUtil.programMessage( PRG_NAME, "going to analyze " + gene_tree_files.size() + " gene trees from [" + + gene_trees_dir + "]" ); + final Shin shin = new Shin(); + try { + shin.method1( gene_tree_files, species_trees, out_dir ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); + e.printStackTrace(); + } + ForesterUtil.programMessage( PRG_NAME, "OK" ); + // System.out.println(); + // System.out.println( "Strip species tree: " + strip ); + // SDI sdi = null; + // final long start_time = new Date().getTime(); + // try { + // if ( use_sdise ) { + // System.out.println(); + // System.out.println( "Using SDIse algorithm." ); + // sdi = new SDIse( gene_tree, species_tree ); + // } + // else { + // System.out.println(); + // System.out.println( "Using GSDI algorithm." 
); + // System.out.println(); + // System.out.println( "Use most parsimonous duplication model: " + most_parsimonous_duplication_model ); + // sdi = new GSDI( gene_tree, species_tree, most_parsimonous_duplication_model ); + // } + // } + // catch ( final Exception e ) { + // ForesterUtil.unexpectedFatalError( PRG_NAME, e ); + // } + // System.out.println(); + // System.out.println( "Running time (excluding I/O): " + ( new Date().getTime() - start_time ) + "ms" ); + // try { + // final PhylogenyWriter writer = new PhylogenyWriter(); + // writer.toPhyloXML( out_file, gene_tree, 1 ); + // } + // catch ( final IOException e ) { + // ForesterUtil.fatalError( PRG_NAME, "Failed to write to \"" + out_file + "\" [" + e.getMessage() + "]" ); + // } + // System.out.println(); + // System.out.println( "Successfully wrote resulting gene tree to: " + out_file ); + // System.out.println(); + // System.out.println(); + } + + private static void print_help() { + System.out.println( "Usage: " + PRG_NAME + " [-options] " ); + System.out.println(); + System.out.println( "Options:" ); + System.out.println(); + } +} diff --git a/forester/java/src/org/forester/application/strip.java b/forester/java/src/org/forester/application/strip.java new file mode 100644 index 0000000..308878f --- /dev/null +++ b/forester/java/src/org/forester/application/strip.java @@ -0,0 +1,121 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. 
+// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.application; + +import java.io.File; +import java.io.IOException; + +import org.forester.io.parsers.PhylogenyParser; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.util.ForesterUtil; + +public class strip { + + public static void main( final String args[] ) { + if ( args.length < 4 ) { + System.out.println( "\nstrip: Wrong number of arguments.\n" ); + System.out + .println( "Usage: \"strip [name1] [name2] ... 
OR [phylogenyfile]\"\n" ); + System.out.println( " Options: -k to keep listed nodes" ); + System.out.println( " -r to remove listed nodes" ); + System.out.println( " -kp to keep nodes found in [phylogenyfile]" ); + System.out.println( " -rp to remove nodes found in [phylogenyfile]\n" ); + System.exit( -1 ); + } + final File infile = new File( args[ 0 ] ); + final File outfile = new File( args[ 1 ] ); + final String options = args[ 2 ]; + Phylogeny p = null; + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final PhylogenyParser pp = ForesterUtil.createParserDependingOnFileType( infile, true ); + p = factory.create( infile, pp )[ 0 ]; + } + catch ( final Exception e ) { + System.out.println( "\nCould not read \"" + infile + "\" [" + e.getMessage() + "]\n" ); + System.exit( -1 ); + } + boolean keep = false; + boolean from_p0 = false; + if ( options.trim().toLowerCase().equals( "-k" ) ) { + keep = true; + } + else if ( options.trim().toLowerCase().equals( "-kp" ) ) { + keep = true; + from_p0 = true; + } + else if ( options.trim().toLowerCase().equals( "-rp" ) ) { + from_p0 = true; + } + else if ( !options.trim().toLowerCase().equals( "-r" ) ) { + System.out.println( "\nUnknown option \"" + options + "\"\n" ); + System.exit( -1 ); + } + String[] names = null; + if ( from_p0 ) { + names = strip.readInNamesFromPhylogeny( args[ 3 ] ); + } + else { + names = new String[ args.length - 3 ]; + for( int i = 0; i < args.length - 3; ++i ) { + names[ i ] = args[ i + 3 ]; + } + } + if ( keep ) { + PhylogenyMethods.deleteExternalNodesPositiveSelection( names, p ); + } + else { + PhylogenyMethods.deleteExternalNodesNegativeSelection( names, p ); + } + try { + final PhylogenyWriter w = new PhylogenyWriter(); + w.toPhyloXML( outfile, p, 1 ); + } + catch ( final IOException e ) { + System.out.println( "\nFailure to write output [" + e.getMessage() + "]\n" ); + System.exit( -1 ); + } + } + + private static String[] readInNamesFromPhylogeny( final 
String file ) { + Phylogeny p0 = null; + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final File f = new File( file ); + final PhylogenyParser pp = ForesterUtil.createParserDependingOnFileType( f, true ); + p0 = factory.create( f, pp )[ 0 ]; + } + catch ( final Exception e ) { + System.out.println( "\nCould not read \"" + file + "\" [" + e.getMessage() + "]\n" ); + System.exit( -1 ); + } + return p0.getAllExternalNodeNames(); + } +} diff --git a/forester/java/src/org/forester/application/support_statistics.java b/forester/java/src/org/forester/application/support_statistics.java new file mode 100644 index 0000000..7e26be7 --- /dev/null +++ b/forester/java/src/org/forester/application/support_statistics.java @@ -0,0 +1,219 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.application; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.List; + +import org.forester.io.parsers.PhylogenyParser; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.util.BasicDescriptiveStatistics; +import org.forester.util.CommandLineArguments; +import org.forester.util.DescriptiveStatistics; +import org.forester.util.ForesterUtil; + +public final class support_statistics { + + final static private int PLACES = 2; + final static private String HELP_OPTION = "help"; + final static private String OUTPUTFILE_OPTION = "o"; + final static private String PRG_NAME = "support_statistics"; + final static private String PRG_VERSION = "1.0"; + final static private String PRG_DATE = "2008.08.29"; + + private static StringBuffer analyze( final File[] phylogenies_infiles, final Phylogeny[] phylogenies ) { + final DescriptiveStatistics[] dss = new DescriptiveStatistics[ phylogenies.length ]; + for( int i = 0; i < phylogenies.length; i++ ) { + dss[ i ] = new BasicDescriptiveStatistics(); + final Phylogeny p = phylogenies[ i ]; + for( final PhylogenyNodeIterator iter = p.iteratorPostorder(); iter.hasNext(); ) { + final PhylogenyNode node = iter.next(); + if ( !node.isRoot() && !node.isExternal() ) { + double s = PhylogenyMethods.getConfidenceValue( node ); + if ( s < 0.0 ) { + s = 0.0; + } + dss[ i ].addValue( s ); + } + } + } + DescriptiveStatistics dss_comp = null; + if ( dss.length > 2 ) { + dss_comp = new BasicDescriptiveStatistics(); + for( final DescriptiveStatistics element : dss ) { + dss_comp.addValue( element.arithmeticMean() ); + 
} + } + int max_length = 30; + for( int i = 0; i < phylogenies.length; i++ ) { + final int l = phylogenies_infiles[ i ].getName().length(); + if ( l > max_length ) { + max_length = l; + } + } + final StringBuffer sb = new StringBuffer(); + sb.append( "\t" + ForesterUtil.normalizeString( "name:", max_length, true, ' ' ) + "\t" ); + sb.append( "median:" + "\t" ); + sb.append( "mean:" + "\t" ); + sb.append( "sd:" + "\t" ); + sb.append( "min:" + "\t" ); + sb.append( "max:" + "\t" ); + sb.append( "n:" + "\t" ); + if ( dss_comp != null ) { + sb.append( "\"z-score\":" ); + } + sb.append( ForesterUtil.getLineSeparator() ); + for( int i = 0; i < phylogenies.length; i++ ) { + sb.append( i + 1 + ":\t" + + ForesterUtil.normalizeString( phylogenies_infiles[ i ].getName(), max_length, true, ' ' ) + "\t" ); + sb.append( ForesterUtil.round( dss[ i ].median(), support_statistics.PLACES ) + "\t" ); + sb.append( ForesterUtil.round( dss[ i ].arithmeticMean(), support_statistics.PLACES ) + "\t" ); + try { + sb.append( ForesterUtil.round( dss[ i ].sampleStandardDeviation(), support_statistics.PLACES ) + "\t" ); + } + catch ( final ArithmeticException ex ) { + sb.append( "n/a\t" ); + } + sb.append( ForesterUtil.round( dss[ i ].getMin(), support_statistics.PLACES ) + "\t" ); + sb.append( ForesterUtil.round( dss[ i ].getMax(), support_statistics.PLACES ) + "\t" ); + sb.append( dss[ i ].getN() + "\t" ); + if ( dss_comp != null ) { + final double z_score = dss_comp.sampleStandardUnit( dss[ i ].arithmeticMean() ); + sb.append( ForesterUtil.round( z_score, support_statistics.PLACES ) + "\t" ); + } + sb.append( ForesterUtil.getLineSeparator() ); + } + if ( dss_comp != null ) { + sb.append( ForesterUtil.getLineSeparator() ); + sb.append( "\t" + ForesterUtil.normalizeString( "values for support means:", max_length, true, ' ' ) + + "\t\t" ); + sb.append( ForesterUtil.round( dss_comp.arithmeticMean(), support_statistics.PLACES ) + "\t" ); + sb.append( ForesterUtil.round( 
dss_comp.sampleStandardDeviation(), support_statistics.PLACES ) + "\t" ); + sb.append( ForesterUtil.round( dss_comp.getMin(), support_statistics.PLACES ) + "\t" ); + sb.append( ForesterUtil.round( dss_comp.getMax(), support_statistics.PLACES ) + "\t" ); + } + return sb; + } + + public static void main( final String args[] ) { + ForesterUtil.printProgramInformation( support_statistics.PRG_NAME, + support_statistics.PRG_VERSION, + support_statistics.PRG_DATE ); + if ( ( args.length < 1 ) ) { + System.out.println(); + System.out.println( "wrong number of arguments" ); + System.out.println(); + support_statistics.printHelp(); + System.exit( -1 ); + } + CommandLineArguments cla = null; + try { + cla = new CommandLineArguments( args ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); + } + if ( cla.isOptionSet( support_statistics.HELP_OPTION ) ) { + System.out.println(); + support_statistics.printHelp(); + System.exit( 0 ); + } + final List allowed_options = new ArrayList(); + allowed_options.add( support_statistics.OUTPUTFILE_OPTION ); + final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); + if ( dissallowed_options.length() > 0 ) { + ForesterUtil.fatalError( support_statistics.PRG_NAME, "unknown option(s): " + dissallowed_options ); + } + final File[] phylogenies_infiles = new File[ cla.getNumberOfNames() ]; + for( int i = 0; i < phylogenies_infiles.length; ++i ) { + phylogenies_infiles[ i ] = cla.getFile( i ); + } + File outfile = null; + if ( cla.isOptionSet( support_statistics.OUTPUTFILE_OPTION ) ) { + try { + outfile = new File( cla.getOptionValue( support_statistics.OUTPUTFILE_OPTION ) ); + } + catch ( final IllegalArgumentException e ) { + ForesterUtil.fatalError( support_statistics.PRG_NAME, "error in command line: " + e.getMessage() ); + } + final String error = ForesterUtil.isWritableFile( outfile ); + if ( error != null ) { + ForesterUtil.fatalError( support_statistics.PRG_NAME, 
error ); + } + } + final Phylogeny[] phylogenies = new Phylogeny[ phylogenies_infiles.length ]; + for( int i = 0; i < phylogenies_infiles.length; i++ ) { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final PhylogenyParser pp = ForesterUtil + .createParserDependingOnFileType( phylogenies_infiles[ i ], true ); + phylogenies[ i ] = factory.create( phylogenies_infiles[ i ], pp )[ 0 ]; + } + catch ( final IOException e ) { + ForesterUtil.fatalError( support_statistics.PRG_NAME, "could not read \"" + phylogenies_infiles[ i ] + + "\": " + e.getMessage() ); + } + } + final StringBuffer sb = support_statistics.analyze( phylogenies_infiles, phylogenies ); + System.out.println(); + System.out.println( sb ); + System.out.println(); + if ( outfile != null ) { + try { + final PrintWriter out = new PrintWriter( outfile ); + out.println( sb ); + out.flush(); + out.close(); + System.out.println( "wrote file: " + outfile ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( support_statistics.PRG_NAME, "failed to write output: " + e.getMessage() ); + } + } + System.out.println( support_statistics.PRG_NAME + ": successfully completed" ); + System.out.println(); + } + + private static void printHelp() { + System.out.println( "usage:" ); + System.out.println(); + System.out.println( support_statistics.PRG_NAME + " [-o=] " + + " ..." ); + System.out.println(); + System.out.println( " options: " ); + System.out.println(); + System.out.println( " -o= : write output to file" ); + System.out.println(); + } +} diff --git a/forester/java/src/org/forester/application/support_transfer.java b/forester/java/src/org/forester/application/support_transfer.java new file mode 100644 index 0000000..12e0a41 --- /dev/null +++ b/forester/java/src/org/forester/application/support_transfer.java @@ -0,0 +1,175 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. 
+// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.application; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +import org.forester.io.parsers.PhylogenyParser; +import org.forester.io.parsers.nhx.NHXParser; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.Confidence; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.util.ForesterUtil; + +public final class support_transfer { + + /** + * Transfers branch length values from one Phylogeny to another. It is + * mainly a "main method" for method "copyBranchLengthValuesFrom( Phylogeny )" + * of org.forester.phylogeny.Phylogeny, to be used in other (Perl) programs. 
+ * + * @param args[0] + * Filename (String) for Phylogeny which has correct branch + * length values + * @param args[1] + * String Filename (String) for Phylogeny to which the branch + * lengths of the first Phylogeny are to be copied, both Trees + * must only differ in their branch length values, i.e. topology + * and sequence names, etc. must be the same + * @param args[2] + * String Filename (String) for outputfile + * @param args[3] + * String [number of tree with correct bl to use in case treefile contains more than one, default 0] + + */ + public static void main( final String args[] ) { + Phylogeny phylogeny_w_bl = null; // Has correct branch lengths + Phylogeny phylogeny_w_support_vals = null; // Has bootsrap in the b.l. + // field (will be + // transferred + // to the bootstrap field by the Phylogeny constructor) or + // has regular boostraps (NHX, :B=...). + File infile_bl = null; + File infile_support_vals = null; + File outfile = null; + int index_of_tree_w_bl = 0; + if ( ( args.length != 3 ) && ( args.length != 4 ) ) { + System.err.println( "SupportTransfer: Wrong number" + " of arguments. Usage: \"java transfersBranchLenghts" + + " " + " " + + "[number of tree with correct bl to use in case treefile contains more than one, default 0]\"" ); + System.exit( -1 ); + } + if ( args.length == 4 ) { + index_of_tree_w_bl = ( new Integer( args[ 3 ] ) ).intValue(); + } + try { + infile_bl = new File( args[ 0 ] ); + infile_support_vals = new File( args[ 1 ] ); + outfile = new File( args[ 2 ] ); + if ( outfile.exists() ) { + System.out.println( "transfersBranchLenghts: " + outfile.getAbsolutePath() + " does already exist." 
); + System.exit( -1 ); + } + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final PhylogenyParser pp_bl = ForesterUtil.createParserDependingOnFileType( infile_bl, true ); + final PhylogenyParser pp_s = ForesterUtil.createParserDependingOnFileType( infile_support_vals, true ); + if ( pp_bl instanceof NHXParser ) { + ( ( NHXParser ) pp_bl ).setTaxonomyExtraction( ForesterUtil.TAXONOMY_EXTRACTION.YES ); + } + phylogeny_w_bl = factory.create( infile_bl, pp_bl )[ index_of_tree_w_bl ]; + phylogeny_w_support_vals = factory.create( infile_support_vals, pp_s )[ 0 ]; + } + catch ( final IOException e ) { + System.out.println( "SupportTransfer: Could not read tree(s): " + e ); + System.exit( -1 ); + } + try { + final double max_bs = PhylogenyMethods.getMaximumConfidenceValue( phylogeny_w_support_vals ); + PhylogenyMethods.normalizeBootstrapValues( phylogeny_w_support_vals, max_bs, 100 ); + support_transfer.transferSupportValues( phylogeny_w_support_vals, phylogeny_w_bl ); + } + catch ( final IllegalArgumentException e ) { + System.out.println( e.getMessage() ); + System.exit( -1 ); + } + try { + final PhylogenyWriter writer = new PhylogenyWriter(); + writer.toPhyloXML( outfile, phylogeny_w_bl, 0 ); + } + catch ( final IOException e ) { + System.out.println( "Failure to write phylogeny \'" + outfile + "\" [" + e.getMessage() + "]" ); + System.exit( -1 ); + } + } + + /** + * Moves the values in the branch length field to the bootstrap field, for + * each PhylogenyNode of this Phylogeny. Converts a Phylogeny originating + * from a phylip treefile after bootstrapping and which therefore has its + * bootstrap values where the branch lenghts would be. 
+ */ + public final static void moveBranchLengthsToBootstrap( final Phylogeny p ) { + for( final PhylogenyNodeIterator iter = p.iteratorPreorder(); iter.hasNext(); ) { + final PhylogenyNode node = iter.next(); + if ( node.isInternal() && ( node.getDistanceToParent() > 0 ) ) { + PhylogenyMethods.setBootstrapConfidence( node, node.getDistanceToParent() ); + } + else { + PhylogenyMethods.setBootstrapConfidence( node, Confidence.CONFIDENCE_DEFAULT_VALUE ); + } + node.setDistanceToParent( PhylogenyNode.DISTANCE_DEFAULT ); + } + } // moveBranchLengthsToBootstrap() + + /** + * Modifies Phylogeny to with the support values from Phylogeny from. + * Important (but obvious): The topology of both trees needs to be the same. + * The method is not robust, and might produce wrong results if the internal + * topology differs or if the external node names are not unique. + * + * @param from + * the Phylogeny to copy the support values from + * @param to + * the Phylogeny to copy the support values to + */ + public final static void transferSupportValues( final Phylogeny from, final Phylogeny to ) { + to: for( final PhylogenyNodeIterator it_to = to.iteratorPostorder(); it_to.hasNext(); ) { + final PhylogenyNode node_to = it_to.next(); + if ( !node_to.isExternal() ) { + final List ext_children_to = node_to.getAllExternalDescendantsNames(); + for( final PhylogenyNodeIterator it_from = from.iteratorPostorder(); it_from.hasNext(); ) { + final PhylogenyNode node_from = it_from.next(); + final List ext_children_from = node_from.getAllExternalDescendantsNames(); + if ( ( ext_children_from.size() == ext_children_to.size() ) + && ext_children_from.containsAll( ext_children_to ) ) { + PhylogenyMethods.setBootstrapConfidence( node_to, PhylogenyMethods + .getConfidenceValue( node_from ) ); + continue to; + } + } + final String message = "Attempt to transfer support values from nonidentical topologies"; + throw new IllegalArgumentException( message ); + } + } + } +} \ No newline at end of file 
diff --git a/forester/java/src/org/forester/application/surf_paup.java b/forester/java/src/org/forester/application/surf_paup.java new file mode 100644 index 0000000..297e16f --- /dev/null +++ b/forester/java/src/org/forester/application/surf_paup.java @@ -0,0 +1,169 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org + +package org.forester.application; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.forester.evoinference.matrix.character.CharacterStateMatrix; +import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates; +import org.forester.evoinference.matrix.character.CharacterStateMatrix.Format; +import org.forester.io.parsers.nexus.NexusCharactersParser; +import org.forester.io.parsers.nexus.NexusPhylogeniesParser; +import org.forester.io.parsers.nexus.PaupLogParser; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.surfacing.DomainParsimonyCalculator; +import org.forester.surfacing.SurfacingUtil; +import org.forester.util.CommandLineArguments; +import org.forester.util.ForesterUtil; + +public class surf_paup { + + final static private String PRG_VERSION = "0.90"; + final static private String PRG_DATE = "2008.03.28"; + final static private String E_MAIL = "czmasek@burnham.org"; + final static private String WWW = "www.phylosoft.org/forester/applications/surfacing"; + final static private String HELP_OPTION_1 = "help"; + final static private String HELP_OPTION_2 = "h"; + private static final String PRG_NAME = "surf_paup"; + + public static void main( final String args[] ) { + ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE, E_MAIL, WWW ); + final List allowed_options = new ArrayList(); + allowed_options.add( HELP_OPTION_1 ); + allowed_options.add( HELP_OPTION_2 ); + if ( ( args.length < 2 ) ) { + printHelp(); + System.exit( -1 ); + } + CommandLineArguments cla = null; + try { + cla = new CommandLineArguments( args ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); + } + if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) ) 
{ + printHelp(); + System.exit( 0 ); + } + if ( cla.getNumberOfNames() != 3 ) { + printHelp(); + System.exit( -1 ); + } + final File surfacing_nexus_outfile = cla.getFile( 0 ); + final File paup_log_file = cla.getFile( 1 ); + final String outfile_name = cla.getFile( 2 ).toString(); + final NexusCharactersParser nex_char_parser = new NexusCharactersParser(); + try { + nex_char_parser.setSource( surfacing_nexus_outfile ); + nex_char_parser.parse(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( PRG_NAME, "problem with parsing character labels from [" + + surfacing_nexus_outfile + "]: " + e.getMessage() ); + e.printStackTrace(); + } + final String[] labels = nex_char_parser.getCharStateLabels(); + ForesterUtil.programMessage( PRG_NAME, "read in " + labels.length + " character labels" ); + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final NexusPhylogeniesParser phylogeny_parser = new NexusPhylogeniesParser(); + Phylogeny[] phylogenies = null; + try { + phylogenies = factory.create( surfacing_nexus_outfile, phylogeny_parser ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( PRG_NAME, "problem with parsing phylogeny [" + surfacing_nexus_outfile + "]: " + + e.getMessage() ); + e.printStackTrace(); + } + if ( phylogenies.length != 1 ) { + ForesterUtil.fatalError( PRG_NAME, "failed to parse one phylogeny from [" + surfacing_nexus_outfile + + "], got " + phylogenies.length + " instead" ); + } + final Phylogeny phylogeny = phylogenies[ 0 ]; + if ( !phylogeny.isRooted() ) { + ForesterUtil.fatalError( PRG_NAME, "phylogeny from [" + surfacing_nexus_outfile + "] is not rooted" ); + } + ForesterUtil.postOrderRelabelInternalNodes( phylogeny, phylogeny.getNumberOfExternalNodes() + 1 ); + CharacterStateMatrix matrix = null; + final PaupLogParser paup_log_parser = new PaupLogParser(); + try { + paup_log_parser.setSource( paup_log_file ); + matrix = paup_log_parser.parse(); + } + catch ( final IOException e ) { + 
ForesterUtil.fatalError( PRG_NAME, "failed to parse matrix from [" + paup_log_file + "]: " + + e.getMessage() ); + } + ForesterUtil.programMessage( PRG_NAME, "read in character state matrix of size " + + matrix.getNumberOfIdentifiers() + "x" + matrix.getNumberOfCharacters() ); + final DomainParsimonyCalculator domain_parsimony = DomainParsimonyCalculator.createInstance( phylogeny ); + domain_parsimony.executeOnGivenBinaryStatesMatrix( matrix, labels ); + final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR; + SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossMatrix(), + outfile_name + "_paup_gl", + Format.FORESTER ); + SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossCountsMatrix(), + outfile_name + "_paup_glc", + Format.FORESTER ); + SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), + CharacterStateMatrix.GainLossStates.GAIN, + outfile_name + "_paup_gains", + sep, + ForesterUtil.LINE_SEPARATOR, + null ); + SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), + CharacterStateMatrix.GainLossStates.LOSS, + outfile_name + "_paup_losses", + sep, + ForesterUtil.LINE_SEPARATOR, + null ); + SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), null, outfile_name + + "_paup_present", sep, ForesterUtil.LINE_SEPARATOR, null ); + final String date_time = ForesterUtil.getCurrentDateTime(); + SurfacingUtil.preparePhylogeny( phylogeny, domain_parsimony, date_time, "parsimony (paup)", "paup_" + + outfile_name, "" ); + SurfacingUtil.writePhylogenyToFile( phylogeny, outfile_name + "_paup.xml" ); + ForesterUtil.programMessage( PRG_NAME, "OK" ); + } + + private static void printHelp() { + System.out.println(); + System.out.println( "Usage:" ); + System.out.println(); + System.out + .println( "% java -cp forester.jar org.forester.applications." 
+ + PRG_NAME + + " " ); + System.out.println(); + } +} \ No newline at end of file diff --git a/forester/java/src/org/forester/application/surfacing.java b/forester/java/src/org/forester/application/surfacing.java new file mode 100644 index 0000000..4f1073c --- /dev/null +++ b/forester/java/src/org/forester/application/surfacing.java @@ -0,0 +1,2658 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.application; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedMap; +import java.util.SortedSet; +import java.util.TreeMap; +import java.util.TreeSet; + +import org.forester.evoinference.distance.NeighborJoining; +import org.forester.evoinference.matrix.character.CharacterStateMatrix.Format; +import org.forester.evoinference.matrix.distance.DistanceMatrix; +import org.forester.go.GoId; +import org.forester.go.GoNameSpace; +import org.forester.go.GoTerm; +import org.forester.go.GoUtils; +import org.forester.go.OBOparser; +import org.forester.go.PfamToGoMapping; +import org.forester.go.PfamToGoParser; +import org.forester.io.parsers.HmmscanPerDomainTableParser; +import org.forester.io.parsers.HmmscanPerDomainTableParser.INDIVIDUAL_SCORE_CUTOFF; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.surfacing.BasicDomainSimilarityCalculator; +import org.forester.surfacing.BasicGenomeWideCombinableDomains; +import org.forester.surfacing.BasicSpecies; +import org.forester.surfacing.BinaryDomainCombination; +import org.forester.surfacing.CombinationsBasedPairwiseDomainSimilarityCalculator; +import org.forester.surfacing.DomainCountsBasedPairwiseSimilarityCalculator; +import org.forester.surfacing.DomainCountsDifferenceUtil; +import org.forester.surfacing.DomainId; +import org.forester.surfacing.DomainLengthsTable; +import org.forester.surfacing.DomainParsimonyCalculator; 
+import org.forester.surfacing.DomainSimilarity; +import org.forester.surfacing.DomainSimilarityCalculator; +import org.forester.surfacing.GenomeWideCombinableDomains; +import org.forester.surfacing.MappingResults; +import org.forester.surfacing.PairwiseDomainSimilarityCalculator; +import org.forester.surfacing.PairwiseGenomeComparator; +import org.forester.surfacing.PrintableDomainSimilarity; +import org.forester.surfacing.Protein; +import org.forester.surfacing.ProteinCountsBasedPairwiseDomainSimilarityCalculator; +import org.forester.surfacing.Species; +import org.forester.surfacing.SurfacingUtil; +import org.forester.surfacing.DomainSimilarity.DomainSimilarityScoring; +import org.forester.surfacing.DomainSimilarity.DomainSimilaritySortField; +import org.forester.surfacing.DomainSimilarityCalculator.Detailedness; +import org.forester.surfacing.GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder; +import org.forester.surfacing.PrintableDomainSimilarity.PRINT_OPTION; +import org.forester.util.BasicTable; +import org.forester.util.BasicTableParser; +import org.forester.util.CommandLineArguments; +import org.forester.util.DescriptiveStatistics; +import org.forester.util.ForesterConstants; +import org.forester.util.ForesterUtil; + +public class surfacing { + + public final static String DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS = "graph_analysis_out"; + public final static String DOMAIN_COMBINITONS_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS = "_dc.dot"; + public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_BC_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS = "_fitch_present_dc.dot"; + public final static String DOMAIN_COMBINITON_COUNTS_OUTPUTFILE_SUFFIX = ".dcc"; + // gain/loss: + public final static String PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_DOMAINS = "_dollo_gl_d"; + public final static String PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_BINARY_COMBINATIONS = "_dollo_gl_dc"; + public final static String PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_DOMAINS = "_fitch_gl_d"; + 
public final static String PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_BINARY_COMBINATIONS = "_fitch_gl_dc"; + // gain/loss counts: + public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_DOMAINS = "_dollo_glc_d"; + public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_BINARY_COMBINATIONS = "_dollo_glc_dc"; + public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_DOMAINS = "_fitch_glc_d"; + public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_BINARY_COMBINATIONS = "_fitch_glc_dc"; + // tables: + // public final static String PARSIMONY_OUTPUT_FITCH_GAINS_BC = "_fitch_gains_dc"; + // public final static String PARSIMONY_OUTPUT_FITCH_GAINS_HTML_BC = "_fitch_gains_dc.html"; + // public final static String PARSIMONY_OUTPUT_FITCH_LOSSES_BC = "_fitch_losses_dc"; + // public final static String PARSIMONY_OUTPUT_FITCH_LOSSES_HTML_BC = "_fitch_losses_dc.html"; + // public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_BC = "_fitch_present_dc"; + // public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_HTML_BC = "_fitch_present_dc.html"; + // public final static String PARSIMONY_OUTPUT_DOLLO_GAINS_D = "_dollo_gains_d"; + // public final static String PARSIMONY_OUTPUT_DOLLO_GAINS_GOID_D = "_dollo_gains_goid_d"; + // public final static String PARSIMONY_OUTPUT_DOLLO_GAINS_HTML_D = "_dollo_gains_d.html"; + // public final static String PARSIMONY_OUTPUT_DOLLO_LOSSES_D = "_dollo_losses_d"; + //public final static String PARSIMONY_OUTPUT_DOLLO_LOSSES_HTML_D = "_dollo_losses_d.html"; + // public final static String PARSIMONY_OUTPUT_DOLLO_PRESENT_D = "_dollo_present_d"; + public final static String PARSIMONY_OUTPUT_DOLLO_PRESENT_GOID_D = "_dollo_present_goid_d"; + //public final static String PARSIMONY_OUTPUT_DOLLO_PRESENT_HTML_D = "_dollo_present_d.html"; + public final static String DOMAINS_PRESENT_NEXUS = "_dom.nex"; + public final static String BDC_PRESENT_NEXUS = "_dc.nex"; + // --- + public final static String PRG_NAME = 
"surfacing"; + public static final String DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO = "_d_dollo" + + ForesterConstants.PHYLO_XML_SUFFIX; + public static final String DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH = "_d_fitch" + + ForesterConstants.PHYLO_XML_SUFFIX; + public static final String BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO = "_dc_dollo" + + ForesterConstants.PHYLO_XML_SUFFIX; + public static final String BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH = "_dc_fitch" + + ForesterConstants.PHYLO_XML_SUFFIX; + public static final String NEXUS_EXTERNAL_DOMAINS = "_dom.nex"; + public static final String NEXUS_EXTERNAL_DOMAIN_COMBINATIONS = "_dc.nex"; + public static final String NEXUS_SECONDARY_FEATURES = "_secondary_features.nex"; + public static final String PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_SECONDARY_FEATURES = "_dollo_gl_secondary_features"; + public static final String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_SECONDARY_FEATURES = "_dollo_glc_secondary_features"; + public static final String PARSIMONY_OUTPUT_DOLLO_GAINS_SECONDARY_FEATURES = "_dollo_gains_secondary_features"; + public static final String PARSIMONY_OUTPUT_DOLLO_LOSSES_SECONDARY_FEATURES = "_dollo_losses_secondary_features"; + public static final String PARSIMONY_OUTPUT_DOLLO_PRESENT_SECONDARY_FEATURES = "_dollo_present_secondary_features"; + public static final String SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO = "_secondary_features_dollo" + + ForesterConstants.PHYLO_XML_SUFFIX; + public static final String PARSIMONY_OUTPUT_DOLLO_ALL_GOID_D_BIOLOGICAL_PROCESS = "_dollo_biol_proc_goid_d"; + public static final String PARSIMONY_OUTPUT_DOLLO_ALL_GOID_D_CELLULAR_COMPONENT = "_dollo_cell_comp_goid_d"; + public static final String PARSIMONY_OUTPUT_DOLLO_ALL_GOID_D_MOLECULAR_FUNCTION = "_dollo_mol_funct_goid_d"; + public static final String PARSIMONY_OUTPUT_DOLLO_ALL_GOID_D_ALL_NAMESPACES = "_dollo_goid_d"; + public static final String 
PARSIMONY_OUTPUT_FITCH_ALL_GOID_BC_BIOLOGICAL_PROCESS = "_fitch_biol_proc_goid_dc"; + public static final String PARSIMONY_OUTPUT_FITCH_ALL_GOID_BC_CELLULAR_COMPONENT = "_fitch_cell_comp_goid_dc"; + public static final String PARSIMONY_OUTPUT_FITCH_ALL_GOID_BC_MOLECULAR_FUNCTION = "_fitch_mol_funct_goid_dc"; + public static final String PARSIMONY_OUTPUT_FITCH_ALL_GOID_BC_ALL_NAMESPACES = "_fitch_goid_dc"; + final static private String HELP_OPTION_1 = "help"; + final static private String HELP_OPTION_2 = "h"; + final static private String OUTPUT_DIR_OPTION = "out_dir"; + final static private String SCORING_OPTION = "scoring"; + private static final DomainSimilarityScoring SCORING_DEFAULT = DomainSimilarity.DomainSimilarityScoring.COMBINATIONS; + final static private String SCORING_DOMAIN_COUNT_BASED = "domains"; + final static private String SCORING_PROTEIN_COUNT_BASED = "proteins"; + final static private String SCORING_COMBINATION_BASED = "combinations"; + final static private String DETAILEDNESS_OPTION = "detail"; + private final static Detailedness DETAILEDNESS_DEFAULT = DomainSimilarityCalculator.Detailedness.PUNCTILIOUS; + final static private String SPECIES_MATRIX_OPTION = "smatrix"; + final static private String DETAILEDNESS_BASIC = "basic"; + final static private String DETAILEDNESS_LIST_IDS = "list_ids"; + final static private String DETAILEDNESS_PUNCTILIOUS = "punctilious"; + final static private String DOMAIN_SIMILARITY_SORT_OPTION = "sort"; + private static final DomainSimilaritySortField DOMAIN_SORT_FILD_DEFAULT = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + final static private String DOMAIN_SIMILARITY_SORT_MIN = "min"; + final static private String DOMAIN_SIMILARITY_SORT_MAX = "max"; + final static private String DOMAIN_SIMILARITY_SORT_SD = "sd"; + final static private String DOMAIN_SIMILARITY_SORT_MEAN = "mean"; + final static private String DOMAIN_SIMILARITY_SORT_DIFF = "diff"; + final static private String 
DOMAIN_SIMILARITY_SORT_COUNTS_DIFF = "count_diff"; + final static private String DOMAIN_SIMILARITY_SORT_ABS_COUNTS_DIFF = "abs_count_diff"; + final static private String DOMAIN_SIMILARITY_SORT_SPECIES_COUNT = "species"; + final static private String DOMAIN_SIMILARITY_SORT_ALPHA = "alpha"; + final static private String DOMAIN_SIMILARITY_SORT_BY_SPECIES_COUNT_FIRST_OPTION = "species_first"; + final static private String DOMAIN_COUNT_SORT_OPTION = "dc_sort"; + private static final GenomeWideCombinableDomainsSortOrder DOMAINS_SORT_ORDER_DEFAULT = GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder.ALPHABETICAL_KEY_ID; + final static private String DOMAIN_COUNT_SORT_ALPHA = "alpha"; + final static private String DOMAIN_COUNT_SORT_KEY_DOMAIN_COUNT = "dom"; + final static private String DOMAIN_COUNT_SORT_KEY_DOMAIN_PROTEINS_COUNT = "prot"; + final static private String DOMAIN_COUNT_SORT_COMBINATIONS_COUNT = "comb"; + final static private String CUTOFF_SCORE_FILE_OPTION = "cos"; + final static private String NOT_IGNORE_DUFS_OPTION = "dufs"; + final static private String MAX_E_VALUE_OPTION = "e"; + final static private String MAX_ALLOWED_OVERLAP_OPTION = "mo"; + final static private String NO_ENGULFING_OVERLAP_OPTION = "no_eo"; + final static private String IGNORE_COMBINATION_WITH_SAME_OPTION = "ignore_self_comb"; + final static private String PAIRWISE_DOMAIN_COMPARISONS_PREFIX = "pwc_"; + final static private String PAIRWISE_DOMAIN_COMPARISONS_OPTION = "pwc"; + final static private String OUTPUT_FILE_OPTION = "o"; + final static private String PFAM_TO_GO_FILE_USE_OPTION = "p2g"; + final static private String GO_OBO_FILE_USE_OPTION = "obo"; + final static private String GO_NAMESPACE_LIMIT_OPTION = "go_namespace"; + final static private String GO_NAMESPACE_LIMIT_OPTION_MOLECULAR_FUNCTION = "molecular_function"; + final static private String GO_NAMESPACE_LIMIT_OPTION_BIOLOGICAL_PROCESS = "biological_process"; + final static private String 
GO_NAMESPACE_LIMIT_OPTION_CELLULAR_COMPONENT = "cellular_component"; + final static private String SECONDARY_FEATURES_PARSIMONY_MAP_FILE = "secondary"; + final static private String DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_TAB_DELIMITED = "simple_tab"; + final static private String DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_HTML = "simple_html"; + final static private String DOMAIN_SIMILARITY_PRINT_OPTION_DETAILED_HTML = "detailed_html"; + final static private String DOMAIN_SIMILARITY_PRINT_OPTION = "ds_output"; + private static final PRINT_OPTION DOMAIN_SIMILARITY_PRINT_OPTION_DEFAULT = PrintableDomainSimilarity.PRINT_OPTION.HTML; + final static private String IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_OPTION = "ignore_singlet_domains"; + final static private String IGNORE_VIRAL_IDS = "ignore_viral_ids"; + final static private boolean IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_DEFAULT = false; + final static private String IGNORE_DOMAINS_SPECIFIC_TO_ONE_SPECIES_OPTION = "ignore_species_specific_domains"; + final static private boolean IGNORE_DOMAINS_SPECIFIC_TO_ONE_SPECIES_OPTION_DEFAULT = false; + final static private String MATRIX_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX = "_mean_score.pwd"; + final static private String MATRIX_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX = "_domains.pwd"; + final static private String MATRIX_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX = "_bin_combinations.pwd"; + final static private String NJ_TREE_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX = "_mean_score_NJ" + + ForesterConstants.PHYLO_XML_SUFFIX; + final static private String NJ_TREE_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX = "_domains_NJ" + + ForesterConstants.PHYLO_XML_SUFFIX; + final static private String NJ_TREE_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX = "_bin_combinations_NJ" + + ForesterConstants.PHYLO_XML_SUFFIX; + final static private String DISPLAY_M_HISTOGRAMS_OPTION = "mhisto"; + // final static private boolean 
DISPLAY_M_HISTOGRAMS_OPTION_DEFAULT = false; + final static private String JACKNIFE_OPTION = "jack"; + final static private String JACKNIFE_RANDOM_SEED_OPTION = "seed"; + final static private String JACKNIFE_RATIO_OPTION = "jack_ratio"; + private static final int JACKNIFE_NUMBER_OF_RESAMPLINGS_DEFAULT = 100; + final static private long JACKNIFE_RANDOM_SEED_DEFAULT = 19; + final static private double JACKNIFE_RATIO_DEFAULT = 0.5; + //final static private String INFER_SPECIES_TREES_OPTION = "species_tree_inference"; + final static private String INFERRED_SD_BASED_NJ_SPECIES_TREE_SUFFIX = "_sd_nj.nh"; + final static private String INFERRED_SBC_BASED_NJ_SPECIES_TREE_SUFFIX = "_sbc_nj.nh"; + final static private String FILTER_POSITIVE_OPTION = "pos_filter"; + final static private String FILTER_NEGATIVE_OPTION = "neg_filter"; + final static private String FILTER_NEGATIVE_DOMAINS_OPTION = "neg_dom_filter"; + final static private String INPUT_FILES_FROM_FILE_OPTION = "input"; + final static private String INPUT_SPECIES_TREE_OPTION = "species_tree"; + final static private String SEQ_EXTRACT_OPTION = "prot_extract"; + final static private char SEPARATOR_FOR_INPUT_VALUES = '#'; + final static private String PRG_VERSION = "2.003"; + final static private String PRG_DATE = "2010.12.03"; + final static private String E_MAIL = "czmasek@burnham.org"; + final static private String WWW = "www.phylosoft.org/forester/applications/surfacing"; + final static private boolean IGNORE_DUFS_DEFAULT = true; + final static private boolean IGNORE_COMBINATION_WITH_SAME_DEFAULLT = false; + final static private double MAX_E_VALUE_DEFAULT = -1; + final static private int MAX_ALLOWED_OVERLAP_DEFAULT = -1; + final static private String DEFAULT_SEARCH_PARAMETER = "ls"; + final private static boolean VERBOSE_DEFAULT = true; + private static final String RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION = "random_seed"; + private static final String CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS = 
"consider_bdc_direction"; + private static final String CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS_AND_ADJACENCY = "consider_bdc_adj"; + private static final String SEQ_EXTRACT_SUFFIX = ".prot"; + private static final String PLUS_MINUS_ANALYSIS_OPTION = "plus_minus"; + private static final String PLUS_MINUS_DOM_SUFFIX = "_plus_minus_dom.txt"; + private static final String PLUS_MINUS_DOM_SUFFIX_HTML = "_plus_minus_dom.html"; + private static final String PLUS_MINUS_DC_SUFFIX_HTML = "_plus_minus_dc.html"; + private static final int PLUS_MINUS_ANALYSIS_MIN_DIFF_DEFAULT = 0; + private static final double PLUS_MINUS_ANALYSIS_FACTOR_DEFAULT = 1.0; + private static final String PLUS_MINUS_ALL_GO_IDS_DOM_SUFFIX = "_plus_minus_go_ids_all.txt"; + private static final String PLUS_MINUS_PASSING_GO_IDS_DOM_SUFFIX = "_plus_minus_go_ids_passing.txt"; + private static final String OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS = "all_prot"; + private static final boolean VERBOSE = false; + private static final String OUTPUT_DOMAIN_COMBINATIONS_GAINED_MORE_THAN_ONCE_ANALYSIS_SUFFIX = "_fitch_dc_gains_counts"; + private static final String OUTPUT_DOMAIN_COMBINATIONS_LOST_MORE_THAN_ONCE_ANALYSIS_SUFFIX = "_fitch_dc_losses_counts"; + private static final String DOMAIN_LENGTHS_ANALYSIS_SUFFIX = "_domain_lengths_analysis"; + private static final boolean PERFORM_DOMAIN_LENGTH_ANALYSIS = true; + public static final String ALL_PFAMS_ENCOUNTERED_SUFFIX = "_all_encountered_pfams"; + public static final String ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX = "_all_encountered_pfams_with_go_annotation"; + public static final String ENCOUNTERED_PFAMS_SUMMARY_SUFFIX = "_encountered_pfams_summary"; + public static final String ALL_PFAMS_GAINED_AS_DOMAINS_SUFFIX = "_all_pfams_gained_as_domains"; + public static final String ALL_PFAMS_LOST_AS_DOMAINS_SUFFIX = "_all_pfams_lost_as_domains"; + public static final String ALL_PFAMS_GAINED_AS_DC_SUFFIX = "_all_pfams_gained_as_dc"; + public static final String 
ALL_PFAMS_LOST_AS_DC_SUFFIX = "_all_pfams_lost_as_dc"; + public static final String BASE_DIRECTORY_PER_NODE_DOMAIN_GAIN_LOSS_FILES = "PER_NODE_EVENTS"; + public static final String BASE_DIRECTORY_PER_SUBTREE_DOMAIN_GAIN_LOSS_FILES = "PER_SUBTREE_EVENTS"; + public static final String D_PROMISCUITY_FILE_SUFFIX = "_domain_promiscuities"; + private static final String LOG_FILE_SUFFIX = "_log.txt"; + private static final String DATA_FILE_SUFFIX = "_domain_combination_data.txt"; + private static final String DATA_FILE_DESC = "#SPECIES\tPRTEIN_ID\tN_TERM_DOMAIN\tC_TERM_DOMAIN\tN_TERM_DOMAIN_PER_DOMAIN_E_VALUE\tC_TERM_DOMAIN_PER_DOMAIN_E_VALUE\tN_TERM_DOMAIN_COUNTS_PER_PROTEIN\tC_TERM_DOMAIN_COUNTS_PER_PROTEIN"; + private static final INDIVIDUAL_SCORE_CUTOFF INDIVIDUAL_SCORE_CUTOFF_DEFAULT = INDIVIDUAL_SCORE_CUTOFF.FULL_SEQUENCE; + + // final String error = ForesterUtil.isReadableFile( new File( + // input_file_properties[ i ][ 0 ] ) ); + // if ( !ForesterUtil.isEmpty( error ) ) { + // ForesterUtil.fatalError( surfacing.PRG_NAME, error ); + // } + private static void checkWriteabilityForPairwiseComparisons( final PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option, + final String[][] input_file_properties, + final String automated_pairwise_comparison_suffix, + final File outdir ) { + for( int i = 0; i < input_file_properties.length; ++i ) { + for( int j = 0; j < i; ++j ) { + final String species_i = input_file_properties[ i ][ 1 ]; + final String species_j = input_file_properties[ j ][ 1 ]; + String pairwise_similarities_output_file_str = PAIRWISE_DOMAIN_COMPARISONS_PREFIX + species_i + "_" + + species_j + automated_pairwise_comparison_suffix; + switch ( domain_similarity_print_option ) { + case HTML: + if ( !pairwise_similarities_output_file_str.endsWith( ".html" ) ) { + pairwise_similarities_output_file_str += ".html"; + } + break; + } + final String error = ForesterUtil + .isWritableFile( new File( outdir == null ? 
pairwise_similarities_output_file_str : outdir + + ForesterUtil.FILE_SEPARATOR + pairwise_similarities_output_file_str ) ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, error ); + } + } + } + } + + private static StringBuilder createParametersAsString( final boolean ignore_dufs, + final double e_value_max, + final int max_allowed_overlap, + final boolean no_engulfing_overlaps, + final File cutoff_scores_file, + final BinaryDomainCombination.DomainCombinationType dc_type ) { + final StringBuilder parameters_sb = new StringBuilder(); + parameters_sb.append( "E-value: " + e_value_max ); + if ( cutoff_scores_file != null ) { + parameters_sb.append( ", Cutoff-scores-file: " + cutoff_scores_file ); + } + else { + parameters_sb.append( ", Cutoff-scores-file: not-set" ); + } + if ( max_allowed_overlap != surfacing.MAX_ALLOWED_OVERLAP_DEFAULT ) { + parameters_sb.append( ", Max-overlap: " + max_allowed_overlap ); + } + else { + parameters_sb.append( ", Max-overlap: not-set" ); + } + if ( no_engulfing_overlaps ) { + parameters_sb.append( ", Engulfing-overlaps: not-allowed" ); + } + else { + parameters_sb.append( ", Engulfing-overlaps: allowed" ); + } + if ( ignore_dufs ) { + parameters_sb.append( ", Ignore-dufs: true" ); + } + else { + parameters_sb.append( ", Ignore-dufs: false" ); + } + parameters_sb.append( ", DC type (if applicable): " + dc_type ); + return parameters_sb; + } + + /** + * Warning: This sideeffects 'all_bin_domain_combinations_encountered'! 
+ * + * + * @param output_file + * @param all_bin_domain_combinations_changed + * @param sum_of_all_domains_encountered + * @param all_bin_domain_combinations_encountered + * @param is_gains_analysis + * @throws IOException + */ + private static void executeFitchGainsAnalysis( final File output_file, + final List all_bin_domain_combinations_changed, + final int sum_of_all_domains_encountered, + final SortedSet all_bin_domain_combinations_encountered, + final boolean is_gains_analysis ) throws IOException { + SurfacingUtil.checkForOutputFileWriteability( output_file ); + final Writer out = ForesterUtil.createBufferedWriter( output_file ); + final SortedMap bdc_to_counts = ForesterUtil + .listToSortedCountsMap( all_bin_domain_combinations_changed ); + final SortedSet all_domains_in_combination_changed_more_than_once = new TreeSet(); + final SortedSet all_domains_in_combination_changed_only_once = new TreeSet(); + int above_one = 0; + int one = 0; + for( final Object bdc_object : bdc_to_counts.keySet() ) { + final BinaryDomainCombination bdc = ( BinaryDomainCombination ) bdc_object; + final int count = bdc_to_counts.get( bdc_object ); + if ( count < 1 ) { + ForesterUtil.unexpectedFatalError( PRG_NAME, "count < 1 " ); + } + out.write( bdc + "\t" + count + ForesterUtil.LINE_SEPARATOR ); + if ( count > 1 ) { + all_domains_in_combination_changed_more_than_once.add( bdc.getId0() ); + all_domains_in_combination_changed_more_than_once.add( bdc.getId1() ); + above_one++; + } + else if ( count == 1 ) { + all_domains_in_combination_changed_only_once.add( bdc.getId0() ); + all_domains_in_combination_changed_only_once.add( bdc.getId1() ); + one++; + } + } + final int all = all_bin_domain_combinations_encountered.size(); + int never_lost = -1; + if ( !is_gains_analysis ) { + all_bin_domain_combinations_encountered.removeAll( all_bin_domain_combinations_changed ); + never_lost = all_bin_domain_combinations_encountered.size(); + for( final BinaryDomainCombination bdc : 
all_bin_domain_combinations_encountered ) { + out.write( bdc + "\t" + "0" + ForesterUtil.LINE_SEPARATOR ); + } + } + if ( is_gains_analysis ) { + out.write( "Sum of all distinct domain combinations appearing once : " + one + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domain combinations appearing more than once : " + above_one + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domains in combinations apppearing only once : " + + all_domains_in_combination_changed_only_once.size() + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domains in combinations apppearing more than once: " + + all_domains_in_combination_changed_more_than_once.size() + ForesterUtil.LINE_SEPARATOR ); + } + else { + out.write( "Sum of all distinct domain combinations never lost : " + never_lost + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domain combinations lost once : " + one + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domain combinations lost more than once : " + above_one + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domains in combinations lost only once : " + + all_domains_in_combination_changed_only_once.size() + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domains in combinations lost more than once: " + + all_domains_in_combination_changed_more_than_once.size() + ForesterUtil.LINE_SEPARATOR ); + } + out.write( "All binary combinations : " + all + + ForesterUtil.LINE_SEPARATOR ); + out.write( "All domains : " + + sum_of_all_domains_encountered ); + out.close(); + ForesterUtil.programMessage( surfacing.PRG_NAME, + "Wrote fitch domain combination dynamics counts analysis to \"" + output_file + + "\"" ); + } + + private static void executePlusMinusAnalysis( final File output_file, + final List plus_minus_analysis_high_copy_base, + final List plus_minus_analysis_high_copy_target, + final List plus_minus_analysis_low_copy, + final List 
gwcd_list, + final SortedMap> protein_lists_per_species, + final Map> domain_id_to_go_ids_map, + final Map go_id_to_term_map, + final List plus_minus_analysis_numbers ) { + final Set all_spec = new HashSet(); + for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { + all_spec.add( gwcd.getSpecies().getSpeciesId() ); + } + final File html_out_dom = new File( output_file + PLUS_MINUS_DOM_SUFFIX_HTML ); + final File plain_out_dom = new File( output_file + PLUS_MINUS_DOM_SUFFIX ); + final File html_out_dc = new File( output_file + PLUS_MINUS_DC_SUFFIX_HTML ); + final File all_domains_go_ids_out_dom = new File( output_file + PLUS_MINUS_ALL_GO_IDS_DOM_SUFFIX ); + final File passing_domains_go_ids_out_dom = new File( output_file + PLUS_MINUS_PASSING_GO_IDS_DOM_SUFFIX ); + final File proteins_file_base = new File( output_file + "" ); + final int min_diff = ( ( Integer ) plus_minus_analysis_numbers.get( 0 ) ).intValue(); + final double factor = ( ( Double ) plus_minus_analysis_numbers.get( 1 ) ).doubleValue(); + try { + DomainCountsDifferenceUtil.calculateCopyNumberDifferences( gwcd_list, + protein_lists_per_species, + plus_minus_analysis_high_copy_base, + plus_minus_analysis_high_copy_target, + plus_minus_analysis_low_copy, + min_diff, + factor, + plain_out_dom, + html_out_dom, + html_out_dc, + domain_id_to_go_ids_map, + go_id_to_term_map, + all_domains_go_ids_out_dom, + passing_domains_go_ids_out_dom, + proteins_file_base ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); + } + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis results to \"" + + html_out_dom + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis results to \"" + + plain_out_dom + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis results to \"" + html_out_dc + + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, 
"Wrote plus minus domain analysis based passing GO ids to \"" + + passing_domains_go_ids_out_dom + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis based all GO ids to \"" + + all_domains_go_ids_out_dom + "\"" ); + } + + private static Phylogeny[] getIntrees( final File[] intree_files, + final int number_of_genomes, + final String[][] input_file_properties ) { + final Phylogeny[] intrees = new Phylogeny[ intree_files.length ]; + int i = 0; + for( final File intree_file : intree_files ) { + Phylogeny intree = null; + final String error = ForesterUtil.isReadableFile( intree_file ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read input tree file [" + intree_file + "]: " + + error ); + } + try { + final Phylogeny[] p_array = ParserBasedPhylogenyFactory.getInstance() + .create( intree_file, ForesterUtil.createParserDependingOnFileType( intree_file, true ) ); + if ( p_array.length < 1 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "file [" + intree_file + + "] does not contain any phylogeny in phyloXML format" ); + } + else if ( p_array.length > 1 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "file [" + intree_file + + "] contains more than one phylogeny in phyloXML format" ); + } + intree = p_array[ 0 ]; + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "failed to read input tree from file [" + intree_file + + "]: " + error ); + } + if ( ( intree == null ) || intree.isEmpty() ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] is empty" ); + } + if ( !intree.isRooted() ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] is not rooted" ); + } + if ( intree.getNumberOfExternalNodes() < number_of_genomes ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "number of external nodes [" + + intree.getNumberOfExternalNodes() + "] of input tree [" + intree_file + + "] is 
smaller than the number of genomes the be analyzed [" + number_of_genomes + "]" ); + } + final StringBuilder parent_names = new StringBuilder(); + final int nodes_lacking_name = SurfacingUtil.getNumberOfNodesLackingName( intree, parent_names ); + if ( nodes_lacking_name > 0 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] has " + + nodes_lacking_name + " node(s) lacking a name [parent names:" + parent_names + "]" ); + } + preparePhylogenyForParsimonyAnalyses( intree, input_file_properties ); + if ( !intree.isCompletelyBinary() ) { + ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "input tree [" + intree_file + + "] is not completely binary" ); + } + intrees[ i++ ] = intree; + } + return intrees; + } + + private static List inferSpeciesTrees( final File outfile, final List distances_list ) { + final NeighborJoining nj = NeighborJoining.createInstance(); + final List phylogenies = nj.execute( distances_list ); + final PhylogenyWriter w = new PhylogenyWriter(); + try { + w.toNewHampshire( phylogenies, true, true, outfile, ";" ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( PRG_NAME, "failed to write to outfile [" + outfile + "]: " + e.getMessage() ); + } + return phylogenies; + } + + private static void log( final String msg, final Writer w ) { + try { + w.write( msg ); + w.write( ForesterUtil.LINE_SEPARATOR ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); + } + } + + public static void main( final String args[] ) { + final long start_time = new Date().getTime(); + // final StringBuffer log = new StringBuffer(); + final StringBuilder html_desc = new StringBuilder(); + ForesterUtil.printProgramInformation( surfacing.PRG_NAME, + surfacing.PRG_VERSION, + surfacing.PRG_DATE, + surfacing.E_MAIL, + surfacing.WWW ); + final String nl = ForesterUtil.LINE_SEPARATOR; + html_desc.append( "" + nl ); + html_desc.append( "" + nl ); + html_desc.append( "" + 
nl ); + html_desc.append( "" + nl ); + html_desc.append( "" + nl ); + html_desc.append( "" + nl ); + CommandLineArguments cla = null; + try { + cla = new CommandLineArguments( args ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); + } + if ( cla.isOptionSet( surfacing.HELP_OPTION_1 ) || cla.isOptionSet( surfacing.HELP_OPTION_2 ) ) { + surfacing.printHelp(); + System.exit( 0 ); + } + if ( ( args.length < 1 ) ) { + surfacing.printHelp(); + System.exit( -1 ); + } + final List allowed_options = new ArrayList(); + allowed_options.add( surfacing.NOT_IGNORE_DUFS_OPTION ); + allowed_options.add( surfacing.MAX_E_VALUE_OPTION ); + allowed_options.add( surfacing.DETAILEDNESS_OPTION ); + allowed_options.add( surfacing.OUTPUT_FILE_OPTION ); + allowed_options.add( surfacing.DOMAIN_SIMILARITY_SORT_OPTION ); + allowed_options.add( surfacing.SPECIES_MATRIX_OPTION ); + allowed_options.add( surfacing.SCORING_OPTION ); + allowed_options.add( surfacing.MAX_ALLOWED_OVERLAP_OPTION ); + allowed_options.add( surfacing.NO_ENGULFING_OVERLAP_OPTION ); + allowed_options.add( surfacing.DOMAIN_COUNT_SORT_OPTION ); + allowed_options.add( surfacing.CUTOFF_SCORE_FILE_OPTION ); + allowed_options.add( surfacing.DOMAIN_SIMILARITY_SORT_BY_SPECIES_COUNT_FIRST_OPTION ); + allowed_options.add( surfacing.OUTPUT_DIR_OPTION ); + allowed_options.add( surfacing.IGNORE_COMBINATION_WITH_SAME_OPTION ); + allowed_options.add( surfacing.PFAM_TO_GO_FILE_USE_OPTION ); + allowed_options.add( surfacing.GO_OBO_FILE_USE_OPTION ); + allowed_options.add( surfacing.DOMAIN_SIMILARITY_PRINT_OPTION ); + allowed_options.add( surfacing.GO_NAMESPACE_LIMIT_OPTION ); + allowed_options.add( surfacing.PAIRWISE_DOMAIN_COMPARISONS_OPTION ); + allowed_options.add( surfacing.IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_OPTION ); + allowed_options.add( surfacing.DISPLAY_M_HISTOGRAMS_OPTION ); + allowed_options.add( surfacing.CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS ); + 
allowed_options.add( JACKNIFE_OPTION ); + allowed_options.add( JACKNIFE_RANDOM_SEED_OPTION ); + allowed_options.add( JACKNIFE_RATIO_OPTION ); + allowed_options.add( INPUT_SPECIES_TREE_OPTION ); + //allowed_options.add( INFER_SPECIES_TREES_OPTION ); + allowed_options.add( FILTER_POSITIVE_OPTION ); + allowed_options.add( FILTER_NEGATIVE_OPTION ); + allowed_options.add( INPUT_FILES_FROM_FILE_OPTION ); + allowed_options.add( RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION ); + allowed_options.add( FILTER_NEGATIVE_DOMAINS_OPTION ); + allowed_options.add( IGNORE_VIRAL_IDS ); + allowed_options.add( SEQ_EXTRACT_OPTION ); + allowed_options.add( SECONDARY_FEATURES_PARSIMONY_MAP_FILE ); + allowed_options.add( PLUS_MINUS_ANALYSIS_OPTION ); + allowed_options.add( DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS ); + allowed_options.add( OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS ); + allowed_options.add( CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS_AND_ADJACENCY ); + boolean ignore_dufs = surfacing.IGNORE_DUFS_DEFAULT; + boolean ignore_combination_with_same = surfacing.IGNORE_COMBINATION_WITH_SAME_DEFAULLT; + double e_value_max = surfacing.MAX_E_VALUE_DEFAULT; + int max_allowed_overlap = surfacing.MAX_ALLOWED_OVERLAP_DEFAULT; + final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); + if ( dissallowed_options.length() > 0 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "unknown option(s): " + dissallowed_options ); + } + boolean output_binary_domain_combinationsfor_graph_analysis = false; + if ( cla.isOptionSet( DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS ) ) { + output_binary_domain_combinationsfor_graph_analysis = true; + } + if ( cla.isOptionSet( surfacing.MAX_E_VALUE_OPTION ) ) { + try { + e_value_max = cla.getOptionValueAsDouble( surfacing.MAX_E_VALUE_OPTION ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no acceptable value for E-value maximum" ); + } + } + if ( cla.isOptionSet( 
surfacing.MAX_ALLOWED_OVERLAP_OPTION ) ) { + try { + max_allowed_overlap = cla.getOptionValueAsInt( surfacing.MAX_ALLOWED_OVERLAP_OPTION ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no acceptable value for maximal allowed domain overlap" ); + } + } + boolean no_engulfing_overlaps = false; + if ( cla.isOptionSet( surfacing.NO_ENGULFING_OVERLAP_OPTION ) ) { + no_engulfing_overlaps = true; + } + boolean ignore_virus_like_ids = false; + if ( cla.isOptionSet( surfacing.IGNORE_VIRAL_IDS ) ) { + ignore_virus_like_ids = true; + } + if ( cla.isOptionSet( surfacing.NOT_IGNORE_DUFS_OPTION ) ) { + ignore_dufs = false; + } + if ( cla.isOptionSet( surfacing.IGNORE_COMBINATION_WITH_SAME_OPTION ) ) { + ignore_combination_with_same = true; + } + boolean ignore_domains_without_combs_in_all_spec = IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_DEFAULT; + if ( cla.isOptionSet( surfacing.IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_OPTION ) ) { + ignore_domains_without_combs_in_all_spec = true; + } + boolean ignore_species_specific_domains = IGNORE_DOMAINS_SPECIFIC_TO_ONE_SPECIES_OPTION_DEFAULT; + if ( cla.isOptionSet( surfacing.IGNORE_DOMAINS_SPECIFIC_TO_ONE_SPECIES_OPTION ) ) { + ignore_species_specific_domains = true; + } + File output_file = null; + if ( cla.isOptionSet( surfacing.OUTPUT_FILE_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing.OUTPUT_FILE_OPTION ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, + "no value for domain combinations similarities output file: -" + + surfacing.OUTPUT_FILE_OPTION + "=" ); + } + output_file = new File( cla.getOptionValue( surfacing.OUTPUT_FILE_OPTION ) ); + SurfacingUtil.checkForOutputFileWriteability( output_file ); + } + File cutoff_scores_file = null; + Map individual_score_cutoffs = null; + if ( cla.isOptionSet( surfacing.CUTOFF_SCORE_FILE_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing.CUTOFF_SCORE_FILE_OPTION ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no value 
for individual domain score cutoffs file: -" + + surfacing.CUTOFF_SCORE_FILE_OPTION + "=" ); + } + cutoff_scores_file = new File( cla.getOptionValue( surfacing.CUTOFF_SCORE_FILE_OPTION ) ); + final String error = ForesterUtil.isReadableFile( cutoff_scores_file ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read individual domain score cutoffs file: " + + error ); + } + try { + final BasicTable scores_table = BasicTableParser.parse( cutoff_scores_file, " " ); + individual_score_cutoffs = scores_table.getColumnsAsMapDouble( 0, 1 ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read from individual score cutoffs file: " + e ); + } + } + BinaryDomainCombination.DomainCombinationType dc_type = BinaryDomainCombination.DomainCombinationType.BASIC; + if ( cla.isOptionSet( surfacing.CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS ) ) { + dc_type = BinaryDomainCombination.DomainCombinationType.DIRECTED; + } + if ( cla.isOptionSet( surfacing.CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS_AND_ADJACENCY ) ) { + dc_type = BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT; + } + File out_dir = null; + if ( cla.isOptionSet( surfacing.OUTPUT_DIR_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing.OUTPUT_DIR_OPTION ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for output directory: -" + + surfacing.OUTPUT_DIR_OPTION + "=" ); + } + out_dir = new File( cla.getOptionValue( surfacing.OUTPUT_DIR_OPTION ) ); + if ( out_dir.exists() && ( out_dir.listFiles().length > 0 ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "\"" + out_dir + "\" aready exists and is not empty" ); + } + if ( !out_dir.exists() ) { + final boolean success = out_dir.mkdir(); + if ( !success || !out_dir.exists() ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "failed to create \"" + out_dir + "\"" ); + } + } + if ( !out_dir.canWrite() ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot 
write to \"" + out_dir + "\"" ); + } + } + File positive_filter_file = null; + File negative_filter_file = null; + File negative_domains_filter_file = null; + if ( cla.isOptionSet( surfacing.FILTER_NEGATIVE_OPTION ) && cla.isOptionSet( surfacing.FILTER_POSITIVE_OPTION ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "attempt to use both negative and positive protein filter" ); + } + if ( cla.isOptionSet( surfacing.FILTER_NEGATIVE_DOMAINS_OPTION ) + && ( cla.isOptionSet( surfacing.FILTER_NEGATIVE_OPTION ) || cla + .isOptionSet( surfacing.FILTER_POSITIVE_OPTION ) ) ) { + ForesterUtil + .fatalError( surfacing.PRG_NAME, + "attempt to use both negative or positive protein filter together wirh a negative domains filter" ); + } + if ( cla.isOptionSet( surfacing.FILTER_NEGATIVE_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing.FILTER_NEGATIVE_OPTION ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for negative filter: -" + + surfacing.FILTER_NEGATIVE_OPTION + "=" ); + } + negative_filter_file = new File( cla.getOptionValue( surfacing.FILTER_NEGATIVE_OPTION ) ); + final String msg = ForesterUtil.isReadableFile( negative_filter_file ); + if ( !ForesterUtil.isEmpty( msg ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "can not read from \"" + negative_filter_file + "\": " + + msg ); + } + } + else if ( cla.isOptionSet( surfacing.FILTER_POSITIVE_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing.FILTER_POSITIVE_OPTION ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for positive filter: -" + + surfacing.FILTER_POSITIVE_OPTION + "=" ); + } + positive_filter_file = new File( cla.getOptionValue( surfacing.FILTER_POSITIVE_OPTION ) ); + final String msg = ForesterUtil.isReadableFile( positive_filter_file ); + if ( !ForesterUtil.isEmpty( msg ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "can not read from \"" + positive_filter_file + "\": " + + msg ); + } + } + else if ( cla.isOptionSet( surfacing.FILTER_NEGATIVE_DOMAINS_OPTION ) ) { + 
if ( !cla.isOptionValueSet( surfacing.FILTER_NEGATIVE_DOMAINS_OPTION ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for negative domains filter: -" + + surfacing.FILTER_NEGATIVE_DOMAINS_OPTION + "=" ); + } + negative_domains_filter_file = new File( cla.getOptionValue( surfacing.FILTER_NEGATIVE_DOMAINS_OPTION ) ); + final String msg = ForesterUtil.isReadableFile( negative_domains_filter_file ); + if ( !ForesterUtil.isEmpty( msg ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "can not read from \"" + negative_domains_filter_file + + "\": " + msg ); + } + } + final List plus_minus_analysis_high_copy_base_species = new ArrayList(); + final List plus_minus_analysis_high_copy_target_species = new ArrayList(); + final List plus_minus_analysis_high_low_copy_species = new ArrayList(); + final List plus_minus_analysis_numbers = new ArrayList(); + processPlusMinusAnalysisOption( cla, + plus_minus_analysis_high_copy_base_species, + plus_minus_analysis_high_copy_target_species, + plus_minus_analysis_high_low_copy_species, + plus_minus_analysis_numbers ); + File input_files_file = null; + String[] input_file_names_from_file = null; + if ( cla.isOptionSet( surfacing.INPUT_FILES_FROM_FILE_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing.INPUT_FILES_FROM_FILE_OPTION ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for input files file: -" + + surfacing.INPUT_FILES_FROM_FILE_OPTION + "=" ); + } + input_files_file = new File( cla.getOptionValue( surfacing.INPUT_FILES_FROM_FILE_OPTION ) ); + final String msg = ForesterUtil.isReadableFile( input_files_file ); + if ( !ForesterUtil.isEmpty( msg ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "can not read from \"" + input_files_file + "\": " + msg ); + } + try { + input_file_names_from_file = ForesterUtil.file2array( input_files_file ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "failed to read from \"" + input_files_file + "\": " + e ); + } + } + if ( 
( cla.getNumberOfNames() < 1 ) + && ( ( input_file_names_from_file == null ) || ( input_file_names_from_file.length < 1 ) ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, + "No hmmpfam output file indicated is input: use comand line directly or " + + surfacing.INPUT_FILES_FROM_FILE_OPTION + "=" ); + } + DomainSimilarity.DomainSimilarityScoring scoring = SCORING_DEFAULT; + if ( cla.isOptionSet( surfacing.SCORING_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing.SCORING_OPTION ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, + "no value for scoring method for domain combinations similarity calculation: -" + + surfacing.SCORING_OPTION + "=<" + + surfacing.SCORING_DOMAIN_COUNT_BASED + "|" + + surfacing.SCORING_PROTEIN_COUNT_BASED + "|" + + surfacing.SCORING_COMBINATION_BASED + ">\"" ); + } + final String scoring_str = cla.getOptionValue( surfacing.SCORING_OPTION ); + if ( scoring_str.equals( surfacing.SCORING_DOMAIN_COUNT_BASED ) ) { + scoring = DomainSimilarity.DomainSimilarityScoring.DOMAINS; + } + else if ( scoring_str.equals( surfacing.SCORING_COMBINATION_BASED ) ) { + scoring = DomainSimilarity.DomainSimilarityScoring.COMBINATIONS; + } + else if ( scoring_str.equals( surfacing.SCORING_PROTEIN_COUNT_BASED ) ) { + scoring = DomainSimilarity.DomainSimilarityScoring.PROTEINS; + } + else { + ForesterUtil.fatalError( surfacing.PRG_NAME, "unknown value \"" + scoring_str + + "\" for scoring method for domain combinations similarity calculation: \"-" + + surfacing.SCORING_OPTION + "=<" + surfacing.SCORING_DOMAIN_COUNT_BASED + "|" + + surfacing.SCORING_PROTEIN_COUNT_BASED + "|" + surfacing.SCORING_COMBINATION_BASED + ">\"" ); + } + } + boolean sort_by_species_count_first = false; + if ( cla.isOptionSet( surfacing.DOMAIN_SIMILARITY_SORT_BY_SPECIES_COUNT_FIRST_OPTION ) ) { + sort_by_species_count_first = true; + } + boolean species_matrix = false; + if ( cla.isOptionSet( surfacing.SPECIES_MATRIX_OPTION ) ) { + species_matrix = true; + } + boolean 
output_protein_lists_for_all_domains = false; + if ( cla.isOptionSet( surfacing.OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS ) ) { + output_protein_lists_for_all_domains = true; + } + Detailedness detailedness = DETAILEDNESS_DEFAULT; + if ( cla.isOptionSet( surfacing.DETAILEDNESS_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing.DETAILEDNESS_OPTION ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for -" + surfacing.DETAILEDNESS_OPTION + "=<" + + surfacing.DETAILEDNESS_BASIC + "|" + surfacing.DETAILEDNESS_LIST_IDS + "|" + + surfacing.DETAILEDNESS_PUNCTILIOUS + ">\"" ); + } + final String detness = cla.getOptionValue( surfacing.DETAILEDNESS_OPTION ).toLowerCase(); + if ( detness.equals( surfacing.DETAILEDNESS_BASIC ) ) { + detailedness = DomainSimilarityCalculator.Detailedness.BASIC; + } + else if ( detness.equals( surfacing.DETAILEDNESS_LIST_IDS ) ) { + detailedness = DomainSimilarityCalculator.Detailedness.LIST_COMBINING_DOMAIN_FOR_EACH_SPECIES; + } + else if ( detness.equals( surfacing.DETAILEDNESS_PUNCTILIOUS ) ) { + detailedness = DomainSimilarityCalculator.Detailedness.PUNCTILIOUS; + } + else { + ForesterUtil.fatalError( surfacing.PRG_NAME, "unknown value \"" + detness + "\" for detailedness: \"-" + + surfacing.DETAILEDNESS_OPTION + "=<" + surfacing.DETAILEDNESS_BASIC + "|" + + surfacing.DETAILEDNESS_LIST_IDS + "|" + surfacing.DETAILEDNESS_PUNCTILIOUS + ">\"" ); + } + } + String automated_pairwise_comparison_suffix = null; + boolean perform_pwc = false; + boolean write_pwc_files = false; + if ( cla.isOptionSet( surfacing.PAIRWISE_DOMAIN_COMPARISONS_OPTION ) ) { + perform_pwc = true; + if ( !cla.isOptionValueSet( surfacing.PAIRWISE_DOMAIN_COMPARISONS_OPTION ) ) { + write_pwc_files = false; + } + else { + write_pwc_files = true; + automated_pairwise_comparison_suffix = "_" + + cla.getOptionValue( surfacing.PAIRWISE_DOMAIN_COMPARISONS_OPTION ); + } + } + String query_domain_ids = null; + if ( cla.isOptionSet( surfacing.SEQ_EXTRACT_OPTION ) ) { + if ( 
!cla.isOptionValueSet( surfacing.SEQ_EXTRACT_OPTION ) ) { + ForesterUtil + .fatalError( surfacing.PRG_NAME, + "no domain ids given for sequences with given domains to be extracted : -" + + surfacing.SEQ_EXTRACT_OPTION + + "=" ); + } + query_domain_ids = cla.getOptionValue( surfacing.SEQ_EXTRACT_OPTION ); + } + DomainSimilarity.DomainSimilaritySortField domain_similarity_sort_field = DOMAIN_SORT_FILD_DEFAULT; + DomainSimilarity.DomainSimilaritySortField domain_similarity_sort_field_for_automated_pwc = DOMAIN_SORT_FILD_DEFAULT; + if ( cla.isOptionSet( surfacing.DOMAIN_SIMILARITY_SORT_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing.DOMAIN_SIMILARITY_SORT_OPTION ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for domain combinations similarities sorting: -" + + surfacing.DOMAIN_SIMILARITY_SORT_OPTION + "=<" + surfacing.DOMAIN_SIMILARITY_SORT_ALPHA + "|" + + surfacing.DOMAIN_SIMILARITY_SORT_MAX + "|" + surfacing.DOMAIN_SIMILARITY_SORT_MIN + "|" + + surfacing.DOMAIN_SIMILARITY_SORT_MEAN + "|" + surfacing.DOMAIN_SIMILARITY_SORT_DIFF + "|" + + surfacing.DOMAIN_SIMILARITY_SORT_ABS_COUNTS_DIFF + "|" + + surfacing.DOMAIN_SIMILARITY_SORT_COUNTS_DIFF + "|" + + surfacing.DOMAIN_SIMILARITY_SORT_SPECIES_COUNT + "|" + surfacing.DOMAIN_SIMILARITY_SORT_SD + + ">\"" ); + } + final String sort_str = cla.getOptionValue( surfacing.DOMAIN_SIMILARITY_SORT_OPTION ).toLowerCase(); + if ( sort_str.equals( surfacing.DOMAIN_SIMILARITY_SORT_ALPHA ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + } + else if ( sort_str.equals( surfacing.DOMAIN_SIMILARITY_SORT_MAX ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.MAX; + domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + } + else if ( sort_str.equals( surfacing.DOMAIN_SIMILARITY_SORT_MIN ) ) { + 
domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.MIN; + domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + } + else if ( sort_str.equals( surfacing.DOMAIN_SIMILARITY_SORT_MEAN ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.MEAN; + domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.MEAN; + } + else if ( sort_str.equals( surfacing.DOMAIN_SIMILARITY_SORT_SPECIES_COUNT ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.SPECIES_COUNT; + domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + } + else if ( sort_str.equals( surfacing.DOMAIN_SIMILARITY_SORT_SD ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.SD; + domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + } + else if ( sort_str.equals( surfacing.DOMAIN_SIMILARITY_SORT_DIFF ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.MAX_DIFFERENCE; + domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.MAX_DIFFERENCE; + } + else if ( sort_str.equals( surfacing.DOMAIN_SIMILARITY_SORT_ABS_COUNTS_DIFF ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.ABS_MAX_COUNTS_DIFFERENCE; + domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.ABS_MAX_COUNTS_DIFFERENCE; + } + else if ( sort_str.equals( surfacing.DOMAIN_SIMILARITY_SORT_COUNTS_DIFF ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.MAX_COUNTS_DIFFERENCE; + domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.MAX_COUNTS_DIFFERENCE; + } + else { + ForesterUtil.fatalError( surfacing.PRG_NAME, "unknown value \"" + sort_str + + "\" for domain combinations 
similarities sorting: \"-" + + surfacing.DOMAIN_SIMILARITY_SORT_OPTION + "=<" + surfacing.DOMAIN_SIMILARITY_SORT_ALPHA + "|" + + surfacing.DOMAIN_SIMILARITY_SORT_MAX + "|" + surfacing.DOMAIN_SIMILARITY_SORT_MIN + "|" + + surfacing.DOMAIN_SIMILARITY_SORT_MEAN + "|" + surfacing.DOMAIN_SIMILARITY_SORT_DIFF + "|" + + surfacing.DOMAIN_SIMILARITY_SORT_ABS_COUNTS_DIFF + "|" + + surfacing.DOMAIN_SIMILARITY_SORT_COUNTS_DIFF + "|" + "|" + + surfacing.DOMAIN_SIMILARITY_SORT_SPECIES_COUNT + "|" + surfacing.DOMAIN_SIMILARITY_SORT_SD + + ">\"" ); + } + } + PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option = DOMAIN_SIMILARITY_PRINT_OPTION_DEFAULT; + if ( cla.isOptionSet( surfacing.DOMAIN_SIMILARITY_PRINT_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing.DOMAIN_SIMILARITY_PRINT_OPTION ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for print option: -" + + surfacing.DOMAIN_SIMILARITY_PRINT_OPTION_DETAILED_HTML + "|" + + surfacing.DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_HTML + "|" + + surfacing.DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_TAB_DELIMITED + ">\"" ); + } + final String sort = cla.getOptionValue( surfacing.DOMAIN_SIMILARITY_PRINT_OPTION ).toLowerCase(); + if ( sort.equals( surfacing.DOMAIN_SIMILARITY_PRINT_OPTION_DETAILED_HTML ) ) { + domain_similarity_print_option = PrintableDomainSimilarity.PRINT_OPTION.HTML; + } + else if ( sort.equals( surfacing.DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_HTML ) ) { + // domain_similarity_print_option = + // DomainSimilarity.PRINT_OPTION.SIMPLE_HTML; + ForesterUtil.fatalError( surfacing.PRG_NAME, "simple HTML output not implemented yet :(" ); + } + else if ( sort.equals( surfacing.DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_TAB_DELIMITED ) ) { + domain_similarity_print_option = PrintableDomainSimilarity.PRINT_OPTION.SIMPLE_TAB_DELIMITED; + } + else { + ForesterUtil.fatalError( surfacing.PRG_NAME, "unknown value \"" + sort + "\" for print option: -" + + surfacing.DOMAIN_SIMILARITY_PRINT_OPTION_DETAILED_HTML + "|" + + 
surfacing.DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_HTML + "|" + + surfacing.DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_TAB_DELIMITED + ">\"" ); + } + } + GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder dc_sort_order = DOMAINS_SORT_ORDER_DEFAULT; + if ( cla.isOptionSet( surfacing.DOMAIN_COUNT_SORT_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing.DOMAIN_COUNT_SORT_OPTION ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for sorting of domain counts: -" + + surfacing.DOMAIN_COUNT_SORT_OPTION + "=<" + surfacing.DOMAIN_COUNT_SORT_ALPHA + "|" + + surfacing.DOMAIN_COUNT_SORT_KEY_DOMAIN_COUNT + "|" + + surfacing.DOMAIN_COUNT_SORT_KEY_DOMAIN_PROTEINS_COUNT + "|" + + surfacing.DOMAIN_COUNT_SORT_COMBINATIONS_COUNT + ">\"" ); + } + final String sort = cla.getOptionValue( surfacing.DOMAIN_COUNT_SORT_OPTION ).toLowerCase(); + if ( sort.equals( surfacing.DOMAIN_COUNT_SORT_ALPHA ) ) { + dc_sort_order = GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder.ALPHABETICAL_KEY_ID; + } + else if ( sort.equals( surfacing.DOMAIN_COUNT_SORT_KEY_DOMAIN_COUNT ) ) { + dc_sort_order = GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder.KEY_DOMAIN_COUNT; + } + else if ( sort.equals( surfacing.DOMAIN_COUNT_SORT_KEY_DOMAIN_PROTEINS_COUNT ) ) { + dc_sort_order = GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder.KEY_DOMAIN_PROTEINS_COUNT; + } + else if ( sort.equals( surfacing.DOMAIN_COUNT_SORT_COMBINATIONS_COUNT ) ) { + dc_sort_order = GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder.COMBINATIONS_COUNT; + } + else { + ForesterUtil.fatalError( surfacing.PRG_NAME, "unknown value \"" + sort + + "\" for sorting of domain counts: \"-" + surfacing.DOMAIN_COUNT_SORT_OPTION + "=<" + + surfacing.DOMAIN_COUNT_SORT_ALPHA + "|" + surfacing.DOMAIN_COUNT_SORT_KEY_DOMAIN_COUNT + "|" + + surfacing.DOMAIN_COUNT_SORT_KEY_DOMAIN_PROTEINS_COUNT + "|" + + surfacing.DOMAIN_COUNT_SORT_COMBINATIONS_COUNT + ">\"" ); + } + } + String[][] 
input_file_properties = null; + if ( input_file_names_from_file != null ) { + input_file_properties = surfacing.processInputFileNames( input_file_names_from_file ); + } + else { + input_file_properties = surfacing.processInputFileNames( cla.getNames() ); + } + final int number_of_genomes = input_file_properties.length; + if ( number_of_genomes < 2 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot analyze less than two files" ); + } + if ( ( number_of_genomes < 3 ) && perform_pwc ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot use : -" + + surfacing.PAIRWISE_DOMAIN_COMPARISONS_OPTION + + "= to turn on pairwise analyses with less than three input files" ); + } + checkWriteabilityForPairwiseComparisons( domain_similarity_print_option, + input_file_properties, + automated_pairwise_comparison_suffix, + out_dir ); + for( int i = 0; i < number_of_genomes; i++ ) { + File dcc_outfile = new File( input_file_properties[ i ][ 0 ] + + surfacing.DOMAIN_COMBINITON_COUNTS_OUTPUTFILE_SUFFIX ); + if ( out_dir != null ) { + dcc_outfile = new File( out_dir + ForesterUtil.FILE_SEPARATOR + dcc_outfile ); + } + SurfacingUtil.checkForOutputFileWriteability( dcc_outfile ); + } + File pfam_to_go_file = null; + Map> domain_id_to_go_ids_map = null; + int domain_id_to_go_ids_count = 0; + if ( cla.isOptionSet( surfacing.PFAM_TO_GO_FILE_USE_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing.PFAM_TO_GO_FILE_USE_OPTION ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for Pfam to GO mapping file: -" + + surfacing.PFAM_TO_GO_FILE_USE_OPTION + "=" ); + } + pfam_to_go_file = new File( cla.getOptionValue( surfacing.PFAM_TO_GO_FILE_USE_OPTION ) ); + final String error = ForesterUtil.isReadableFile( pfam_to_go_file ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read Pfam to GO mapping file: " + error ); + } + try { + final PfamToGoParser parser = new PfamToGoParser( pfam_to_go_file ); + final List pfam_to_go_mappings 
= parser.parse(); + domain_id_to_go_ids_map = SurfacingUtil.createDomainIdToGoIdMap( pfam_to_go_mappings ); + if ( parser.getMappingCount() < domain_id_to_go_ids_map.size() ) { + ForesterUtil.unexpectedFatalError( surfacing.PRG_NAME, + "parser.getMappingCount() < domain_id_to_go_ids_map.size()" ); + } + domain_id_to_go_ids_count = parser.getMappingCount(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read from Pfam to GO mapping file: " + e ); + } + } + File go_obo_file = null; + List go_terms = null; + if ( cla.isOptionSet( surfacing.GO_OBO_FILE_USE_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing.GO_OBO_FILE_USE_OPTION ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for GO OBO file: -" + + surfacing.GO_OBO_FILE_USE_OPTION + "=" ); + } + if ( ( domain_id_to_go_ids_map == null ) || ( domain_id_to_go_ids_map.size() < 1 ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot use GO OBO file (-" + + surfacing.GO_OBO_FILE_USE_OPTION + "=) without Pfam to GO mapping file (" + + surfacing.PFAM_TO_GO_FILE_USE_OPTION + "=)" ); + } + go_obo_file = new File( cla.getOptionValue( surfacing.GO_OBO_FILE_USE_OPTION ) ); + final String error = ForesterUtil.isReadableFile( go_obo_file ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read GO OBO file: " + error ); + } + try { + final OBOparser parser = new OBOparser( go_obo_file, OBOparser.ReturnType.BASIC_GO_TERM ); + go_terms = parser.parse(); + if ( parser.getGoTermCount() != go_terms.size() ) { + ForesterUtil + .unexpectedFatalError( surfacing.PRG_NAME, "parser.getGoTermCount() != go_terms.size()" ); + } + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read from GO OBO file: " + e ); + } + } + Map go_id_to_term_map = null; + if ( ( ( domain_id_to_go_ids_map != null ) && ( domain_id_to_go_ids_map.size() > 0 ) ) + && ( ( go_terms != null ) && ( go_terms.size() > 0 ) 
) ) { + go_id_to_term_map = GoUtils.createGoIdToGoTermMap( go_terms ); + } + GoNameSpace go_namespace_limit = null; + if ( cla.isOptionSet( surfacing.GO_NAMESPACE_LIMIT_OPTION ) ) { + if ( ( go_id_to_term_map == null ) || go_id_to_term_map.isEmpty() ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot use GO namespace limit (-" + + surfacing.GO_NAMESPACE_LIMIT_OPTION + "=) without Pfam to GO mapping file (" + + surfacing.PFAM_TO_GO_FILE_USE_OPTION + "=) and GO OBO file (-" + + surfacing.GO_OBO_FILE_USE_OPTION + "=)" ); + } + if ( !cla.isOptionValueSet( surfacing.GO_NAMESPACE_LIMIT_OPTION ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for GO namespace limit: \"-" + + surfacing.GO_NAMESPACE_LIMIT_OPTION + "=<" + + surfacing.GO_NAMESPACE_LIMIT_OPTION_MOLECULAR_FUNCTION + "|" + + surfacing.GO_NAMESPACE_LIMIT_OPTION_BIOLOGICAL_PROCESS + "|" + + surfacing.GO_NAMESPACE_LIMIT_OPTION_CELLULAR_COMPONENT + ">\"" ); + } + final String go_namespace_limit_str = cla.getOptionValue( surfacing.GO_NAMESPACE_LIMIT_OPTION ) + .toLowerCase(); + if ( go_namespace_limit_str.equals( surfacing.GO_NAMESPACE_LIMIT_OPTION_MOLECULAR_FUNCTION ) ) { + go_namespace_limit = GoNameSpace.createMolecularFunction(); + } + else if ( go_namespace_limit_str.equals( surfacing.GO_NAMESPACE_LIMIT_OPTION_BIOLOGICAL_PROCESS ) ) { + go_namespace_limit = GoNameSpace.createBiologicalProcess(); + } + else if ( go_namespace_limit_str.equals( surfacing.GO_NAMESPACE_LIMIT_OPTION_CELLULAR_COMPONENT ) ) { + go_namespace_limit = GoNameSpace.createCellularComponent(); + } + else { + ForesterUtil.fatalError( surfacing.PRG_NAME, "unknown value \"" + go_namespace_limit_str + + "\" for GO namespace limit: \"-" + surfacing.GO_NAMESPACE_LIMIT_OPTION + "=<" + + surfacing.GO_NAMESPACE_LIMIT_OPTION_MOLECULAR_FUNCTION + "|" + + surfacing.GO_NAMESPACE_LIMIT_OPTION_BIOLOGICAL_PROCESS + "|" + + surfacing.GO_NAMESPACE_LIMIT_OPTION_CELLULAR_COMPONENT + ">\"" ); + } + } + if ( ( domain_similarity_sort_field == 
DomainSimilarity.DomainSimilaritySortField.MAX_COUNTS_DIFFERENCE ) + && ( number_of_genomes > 2 ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.ABS_MAX_COUNTS_DIFFERENCE; + } + boolean jacknifed_distances = false; + int jacknife_resamplings = JACKNIFE_NUMBER_OF_RESAMPLINGS_DEFAULT; + double jacknife_ratio = JACKNIFE_RATIO_DEFAULT; + long random_seed = JACKNIFE_RANDOM_SEED_DEFAULT; + if ( cla.isOptionSet( surfacing.JACKNIFE_OPTION ) ) { + if ( ( number_of_genomes < 3 ) || !perform_pwc ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot use jacknife resampling analysis (-" + + surfacing.JACKNIFE_OPTION + "[=]) without pairwise analyses (" + + surfacing.PAIRWISE_DOMAIN_COMPARISONS_OPTION + + "=)" ); + } + jacknifed_distances = true; + if ( cla.isOptionHasAValue( surfacing.JACKNIFE_OPTION ) ) { + try { + jacknife_resamplings = cla.getOptionValueAsInt( surfacing.JACKNIFE_OPTION ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "illegal format for number of resamplings" ); + } + if ( jacknife_resamplings < 2 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "attempt to use less than 2 resamplings" ); + } + } + if ( cla.isOptionSet( surfacing.JACKNIFE_RATIO_OPTION ) + && cla.isOptionHasAValue( surfacing.JACKNIFE_RATIO_OPTION ) ) { + try { + jacknife_ratio = cla.getOptionValueAsDouble( surfacing.JACKNIFE_RATIO_OPTION ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "illegal format for jacknife ratio" ); + } + if ( ( jacknife_ratio <= 0.0 ) || ( jacknife_ratio >= 1.0 ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "attempt to use illegal value for jacknife ratio: " + + jacknife_ratio ); + } + } + if ( cla.isOptionSet( surfacing.JACKNIFE_RANDOM_SEED_OPTION ) + && cla.isOptionHasAValue( surfacing.JACKNIFE_RANDOM_SEED_OPTION ) ) { + try { + random_seed = cla.getOptionValueAsLong( surfacing.JACKNIFE_RANDOM_SEED_OPTION ); + } + catch ( final IOException e ) 
{ + ForesterUtil.fatalError( surfacing.PRG_NAME, "illegal format for random generator seed" ); + } + } + } + // boolean infer_species_trees = false; + // if ( cla.isOptionSet( surfacing.INFER_SPECIES_TREES_OPTION ) ) { + // if ( ( output_file == null ) || ( number_of_genomes < 3 ) + // || ForesterUtil.isEmpty( automated_pairwise_comparison_suffix ) ) { + // ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot infer species trees (-" + // + surfacing.INFER_SPECIES_TREES_OPTION + " without pairwise analyses (" + // + surfacing.PAIRWISE_DOMAIN_COMPARISONS_OPTION + // + "=)" ); + // } + // infer_species_trees = true; + // } + File[] intree_files = null; + Phylogeny[] intrees = null; + if ( cla.isOptionSet( surfacing.INPUT_SPECIES_TREE_OPTION ) ) { + // TODO FIXME if jacknife.... maybe not + if ( number_of_genomes < 3 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot infer gains and losses on input species trees (-" + + surfacing.INPUT_SPECIES_TREE_OPTION + " without pairwise analyses (" + + surfacing.PAIRWISE_DOMAIN_COMPARISONS_OPTION + + "=)" ); + } + if ( !cla.isOptionValueSet( surfacing.INPUT_SPECIES_TREE_OPTION ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for input tree: -" + + surfacing.INPUT_SPECIES_TREE_OPTION + "=" ); + } + final String intrees_str = cla.getOptionValue( surfacing.INPUT_SPECIES_TREE_OPTION ); + if ( intrees_str.indexOf( "#" ) > 0 ) { + final String[] intrees_strs = intrees_str.split( "#" ); + intree_files = new File[ intrees_strs.length ]; + int i = 0; + for( final String s : intrees_strs ) { + intree_files[ i++ ] = new File( s.trim() ); + } + } + else { + intree_files = new File[ 1 ]; + intree_files[ 0 ] = new File( intrees_str ); + } + intrees = getIntrees( intree_files, number_of_genomes, input_file_properties ); + } + long random_number_seed_for_fitch_parsimony = 0l; + boolean radomize_fitch_parsimony = false; + if ( cla.isOptionSet( surfacing.RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION ) ) { + if ( 
!cla.isOptionValueSet( surfacing.RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for random number seed: -" + + surfacing.RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION + "=" ); + } + try { + random_number_seed_for_fitch_parsimony = cla + .getOptionValueAsLong( RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); + } + radomize_fitch_parsimony = true; + } + SortedSet filter = null; + if ( ( positive_filter_file != null ) || ( negative_filter_file != null ) + || ( negative_domains_filter_file != null ) ) { + filter = new TreeSet(); + if ( positive_filter_file != null ) { + processFilter( positive_filter_file, filter ); + } + else if ( negative_filter_file != null ) { + processFilter( negative_filter_file, filter ); + } + else if ( negative_domains_filter_file != null ) { + processFilter( negative_domains_filter_file, filter ); + } + } + Map>[] domain_id_to_secondary_features_maps = null; + File[] secondary_features_map_files = null; + final File domain_lengths_analysis_outfile = new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file + + DOMAIN_LENGTHS_ANALYSIS_SUFFIX ); + if ( PERFORM_DOMAIN_LENGTH_ANALYSIS ) { + SurfacingUtil.checkForOutputFileWriteability( domain_lengths_analysis_outfile ); + } + if ( cla.isOptionSet( surfacing.SECONDARY_FEATURES_PARSIMONY_MAP_FILE ) ) { + if ( !cla.isOptionValueSet( surfacing.SECONDARY_FEATURES_PARSIMONY_MAP_FILE ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for secondary features map file: -" + + surfacing.SECONDARY_FEATURES_PARSIMONY_MAP_FILE + "=" ); + } + final String[] secondary_features_map_files_strs = cla + .getOptionValue( surfacing.SECONDARY_FEATURES_PARSIMONY_MAP_FILE ).split( "#" ); + secondary_features_map_files = new File[ secondary_features_map_files_strs.length ]; + domain_id_to_secondary_features_maps = new Map[ secondary_features_map_files_strs.length 
]; + int i = 0; + for( final String secondary_features_map_files_str : secondary_features_map_files_strs ) { + secondary_features_map_files[ i ] = new File( secondary_features_map_files_str ); + final String error = ForesterUtil.isReadableFile( secondary_features_map_files[ i ] ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read secondary features map file: " + error ); + } + try { + domain_id_to_secondary_features_maps[ i ] = SurfacingUtil + .createDomainIdToSecondaryFeaturesMap( secondary_features_map_files[ i ] ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read secondary features map file: " + + e.getMessage() ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "problem with contents of features map file [" + + secondary_features_map_files[ i ] + "]: " + e.getMessage() ); + } + i++; + } + } + if ( out_dir == null ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no output directory indicated (-" + + surfacing.OUTPUT_DIR_OPTION + "=)" ); + } + if ( output_file == null ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no name for (main) output file indicated (-" + + surfacing.OUTPUT_FILE_OPTION + "=)" ); + } + if ( ( domain_id_to_go_ids_map == null ) || domain_id_to_go_ids_map.isEmpty() ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, + "no (acceptable) Pfam to GO id mapping file provided ('pfam2go file') (-" + + surfacing.PFAM_TO_GO_FILE_USE_OPTION + "=)" ); + } + if ( ( go_id_to_term_map == null ) || go_id_to_term_map.isEmpty() ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, + "no (acceptable) go id to term mapping file provided ('GO OBO file') (-" + + surfacing.GO_OBO_FILE_USE_OPTION + "=)" ); + } + boolean display_histograms = false; + if ( cla.isOptionSet( surfacing.DISPLAY_M_HISTOGRAMS_OPTION ) ) { + display_histograms = true; + } + System.out.println( "Output directory : " + out_dir ); + if ( 
input_file_names_from_file != null ) { + System.out.println( "Input files names from : " + input_files_file + " [" + + input_file_names_from_file.length + " input files]" ); + html_desc.append( "" + nl ); + } + if ( positive_filter_file != null ) { + final int filter_size = filter.size(); + System.out.println( "Positive protein filter : " + positive_filter_file + " [" + filter_size + + " domain ids]" ); + html_desc.append( "" + nl ); + } + if ( negative_filter_file != null ) { + final int filter_size = filter.size(); + System.out.println( "Negative protein filter : " + negative_filter_file + " [" + filter_size + + " domain ids]" ); + html_desc.append( "" + nl ); + } + if ( negative_domains_filter_file != null ) { + final int filter_size = filter.size(); + System.out.println( "Negative domain filter : " + negative_domains_filter_file + " [" + filter_size + + " domain ids]" ); + html_desc.append( "" + nl ); + } + if ( plus_minus_analysis_high_copy_base_species.size() > 0 ) { + String plus0 = ""; + for( final String s : plus_minus_analysis_high_copy_base_species ) { + plus0 += "+" + s + " "; + } + String plus1 = ""; + for( final String s : plus_minus_analysis_high_copy_target_species ) { + plus1 += "*" + s + " "; + } + String minus = ""; + for( final String s : plus_minus_analysis_high_low_copy_species ) { + minus += "-" + s + " "; + } + System.out.println( "Plus-minus analysis : " + plus1 + "&& " + plus0 + "&& " + minus ); + html_desc.append( "" + nl ); + } + if ( cutoff_scores_file != null ) { + System.out.println( "Cutoff scores file : " + cutoff_scores_file ); + html_desc.append( "" + nl ); + } + if ( e_value_max >= 0.0 ) { + System.out.println( "E-value maximum (inclusive) : " + e_value_max ); + html_desc.append( "" + nl ); + } + System.out.println( "Ignore DUFs : " + ignore_dufs ); + if ( ignore_virus_like_ids ) { + System.out.println( "Ignore virus like ids : " + ignore_virus_like_ids ); + html_desc.append( "" + nl ); + } + html_desc.append( "" + nl ); + if ( 
max_allowed_overlap != surfacing.MAX_ALLOWED_OVERLAP_DEFAULT ) { + System.out.println( "Max allowed domain overlap : " + max_allowed_overlap ); + html_desc.append( "" + nl ); + } + if ( no_engulfing_overlaps ) { + System.out.println( "Ignore engulfed domains : " + no_engulfing_overlaps ); + html_desc.append( "" + nl ); + } + System.out.println( "Ignore singlet domains : " + ignore_domains_without_combs_in_all_spec ); + html_desc + .append( "" + nl ); + System.out.println( "Ignore species specific doms: " + ignore_species_specific_domains ); + html_desc + .append( "" + nl ); + System.out.println( "Ignore combination with self: " + ignore_combination_with_same ); + html_desc.append( "" + nl ); + ; + System.out.println( "Consider directedness : " + + ( dc_type != BinaryDomainCombination.DomainCombinationType.BASIC ) ); + html_desc.append( "" + nl ); + if ( dc_type != BinaryDomainCombination.DomainCombinationType.BASIC ) { + System.out.println( "Consider adjacency : " + + ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT ) ); + html_desc.append( "" + + nl ); + } + System.out.print( "Domain counts sort order : " ); + switch ( dc_sort_order ) { + case ALPHABETICAL_KEY_ID: + System.out.println( "alphabetical" ); + break; + case KEY_DOMAIN_COUNT: + System.out.println( "domain count" ); + break; + case KEY_DOMAIN_PROTEINS_COUNT: + System.out.println( "domain proteins count" ); + break; + case COMBINATIONS_COUNT: + System.out.println( "domain combinations count" ); + break; + default: + ForesterUtil.unexpectedFatalError( surfacing.PRG_NAME, "unknown value for dc sort order" ); + } + if ( domain_id_to_go_ids_map != null ) { + System.out.println( "Pfam to GO mappings from : " + pfam_to_go_file + " [" + domain_id_to_go_ids_count + + " mappings]" ); + html_desc.append( "" + nl ); + } + if ( go_terms != null ) { + System.out.println( "GO terms from : " + go_obo_file + " [" + go_terms.size() + " terms]" ); + html_desc.append( "" + nl ); + } + if ( 
go_namespace_limit != null ) { + System.out.println( "Limit GO terms to : " + go_namespace_limit.toString() ); + html_desc.append( "" + nl ); + } + if ( perform_pwc ) { + System.out.println( "Suffix for PWC files : " + automated_pairwise_comparison_suffix ); + html_desc.append( "" + nl ); + } + if ( out_dir != null ) { + System.out.println( "Output directory : " + out_dir ); + } + if ( query_domain_ids != null ) { + System.out.println( "Query domains (ordered) : " + query_domain_ids ); + html_desc.append( "" + nl ); + } + System.out.println( "Write similarities to : " + output_file ); + System.out.print( " Scoring method : " ); + html_desc.append( "" + nl ); + break; + case DOMAINS: + System.out.println( "domain counts based" ); + html_desc.append( "domain counts based" + "" + nl ); + break; + case PROTEINS: + System.out.println( "domain proteins counts based" ); + html_desc.append( "domain proteins counts based" + "" + nl ); + break; + default: + ForesterUtil.unexpectedFatalError( surfacing.PRG_NAME, "unknown value for sorting for scoring" ); + } + System.out.print( " Sort by : " ); + html_desc.append( "" + nl ); + System.out.print( " Detailedness : " ); + switch ( detailedness ) { + case BASIC: + System.out.println( "basic" ); + break; + case LIST_COMBINING_DOMAIN_FOR_EACH_SPECIES: + System.out.println( "list combining domains for each species" ); + break; + case PUNCTILIOUS: + System.out.println( "punctilious" ); + break; + default: + ForesterUtil.unexpectedFatalError( surfacing.PRG_NAME, "unknown value for sorting for detailedness" ); + } + System.out.print( " Print option : " ); + switch ( domain_similarity_print_option ) { + case HTML: + System.out.println( "HTML" ); + break; + case SIMPLE_TAB_DELIMITED: + System.out.println( "simple tab delimited" ); + break; + default: + ForesterUtil.unexpectedFatalError( surfacing.PRG_NAME, "unknown value for print option" ); + } + System.out.print( " Species matrix : " + species_matrix ); + System.out.println(); + final 
File dc_data_file = new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file + DATA_FILE_SUFFIX ); + System.out.println( "Domain comb data output : " + dc_data_file ); + html_desc.append( "" ); + System.out.println(); + if ( perform_pwc ) { + System.out.println( "Pairwise comparisons: " ); + html_desc.append( "" ); + System.out.print( " Sort by : " ); + html_desc.append( "" + nl ); + if ( jacknifed_distances ) { + html_desc.append( "" + nl ); + html_desc.append( "" + nl ); + html_desc.append( "" + nl ); + System.out.println( " Jacknife : " + jacknife_resamplings + " resamplings" ); + System.out.println( " Ratio : " + ForesterUtil.round( jacknife_ratio, 2 ) ); + System.out.println( " Random number seed : " + random_seed ); + } + // if ( infer_species_trees ) { + // html_desc.append( "" + nl ); + // System.out.println( " Infer species trees : true" ); + // } + if ( ( intrees != null ) && ( intrees.length > 0 ) ) { + for( final File intree_file : intree_files ) { + html_desc.append( "" + nl ); + System.out.println( " Intree for gain/loss pars.: " + intree_file ); + } + } + if ( radomize_fitch_parsimony ) { + html_desc.append( "" + nl ); + System.out.println( " Random number seed : " + random_number_seed_for_fitch_parsimony ); + } + if ( ( domain_id_to_secondary_features_maps != null ) && ( domain_id_to_secondary_features_maps.length > 0 ) ) { + for( int i = 0; i < secondary_features_map_files.length; i++ ) { + html_desc.append( "" + nl ); + System.out.println( "Secondary features map file : " + secondary_features_map_files[ i ] + + " [mappings for " + domain_id_to_secondary_features_maps[ i ].size() + " domain ids]" ); + if ( VERBOSE ) { + System.out.println(); + System.out.println( "Domain ids to secondary features map:" ); + for( final DomainId domain_id : domain_id_to_secondary_features_maps[ i ].keySet() ) { + System.out.print( domain_id.getId() ); + System.out.print( " => " ); + for( final String sec : domain_id_to_secondary_features_maps[ i ].get( domain_id 
) ) { + System.out.print( sec ); + System.out.print( " " ); + } + System.out.println(); + } + } + } + } + } // if ( perform_pwc ) { + System.out.println(); + html_desc.append( "" + nl ); + System.out.println( "Command line : " + cla.getCommandLineArgsAsString() ); + BufferedWriter[] query_domains_writer_ary = null; + List[] query_domain_ids_array = null; + if ( query_domain_ids != null ) { + final String[] query_domain_ids_str_array = query_domain_ids.split( "#" ); + query_domain_ids_array = new ArrayList[ query_domain_ids_str_array.length ]; + query_domains_writer_ary = new BufferedWriter[ query_domain_ids_str_array.length ]; + for( int i = 0; i < query_domain_ids_str_array.length; i++ ) { + String query_domain_ids_str = query_domain_ids_str_array[ i ]; + final String[] query_domain_ids_str_ary = query_domain_ids_str.split( "~" ); + final List query = new ArrayList(); + for( final String element : query_domain_ids_str_ary ) { + query.add( new DomainId( element ) ); + } + query_domain_ids_array[ i ] = query; + query_domain_ids_str = query_domain_ids_str.replace( '~', '_' ); + String protein_names_writer_str = query_domain_ids_str + surfacing.SEQ_EXTRACT_SUFFIX; + if ( out_dir != null ) { + protein_names_writer_str = out_dir + ForesterUtil.FILE_SEPARATOR + protein_names_writer_str; + } + try { + query_domains_writer_ary[ i ] = new BufferedWriter( new FileWriter( protein_names_writer_str ) ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "Could not open [" + protein_names_writer_str + "]: " + + e.getLocalizedMessage() ); + } + } + } + SortedMap> protein_lists_per_species = null; //This will only be created if neede. 
+ boolean need_protein_lists_per_species = false; + if ( ( plus_minus_analysis_high_copy_base_species.size() > 0 ) || output_protein_lists_for_all_domains ) { + need_protein_lists_per_species = true; + } + if ( need_protein_lists_per_species ) { + protein_lists_per_species = new TreeMap>(); + } + final List gwcd_list = new ArrayList( number_of_genomes ); + final SortedSet all_domains_encountered = new TreeSet(); + final SortedSet all_bin_domain_combinations_encountered = new TreeSet(); + List all_bin_domain_combinations_gained_fitch = null; + List all_bin_domain_combinations_lost_fitch = null; + if ( ( intrees != null ) && ( intrees.length == 1 ) ) { + all_bin_domain_combinations_gained_fitch = new ArrayList(); + all_bin_domain_combinations_lost_fitch = new ArrayList(); + } + final DomainLengthsTable domain_lengths_table = new DomainLengthsTable(); + final File per_genome_domain_promiscuity_statistics_file = new File( out_dir + ForesterUtil.FILE_SEPARATOR + + output_file + D_PROMISCUITY_FILE_SUFFIX ); + BufferedWriter per_genome_domain_promiscuity_statistics_writer = null; + try { + per_genome_domain_promiscuity_statistics_writer = new BufferedWriter( new FileWriter( per_genome_domain_promiscuity_statistics_file ) ); + per_genome_domain_promiscuity_statistics_writer.write( "Species:\t" ); + per_genome_domain_promiscuity_statistics_writer.write( "Mean:\t" ); + per_genome_domain_promiscuity_statistics_writer.write( "SD:\t" ); + per_genome_domain_promiscuity_statistics_writer.write( "Median:\t" ); + per_genome_domain_promiscuity_statistics_writer.write( "Min:\t" ); + per_genome_domain_promiscuity_statistics_writer.write( "Max:\t" ); + per_genome_domain_promiscuity_statistics_writer.write( "N:\t" ); + per_genome_domain_promiscuity_statistics_writer.write( "Max Promiscuous Domains:" + + ForesterUtil.LINE_SEPARATOR ); + } + catch ( final IOException e2 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getMessage() ); + } + final File log_file = new File( out_dir + 
ForesterUtil.FILE_SEPARATOR + output_file + LOG_FILE_SUFFIX ); + BufferedWriter log_writer = null; + try { + log_writer = new BufferedWriter( new FileWriter( log_file ) ); + } + catch ( final IOException e2 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getMessage() ); + } + BufferedWriter dc_data_writer = null; + try { + dc_data_writer = new BufferedWriter( new FileWriter( dc_data_file ) ); + dc_data_writer.write( DATA_FILE_DESC ); + dc_data_writer.write( ForesterUtil.LINE_SEPARATOR ); + } + catch ( final IOException e2 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getMessage() ); + } + for( int i = 0; i < number_of_genomes; ++i ) { + System.out.println(); + System.out.println( ( i + 1 ) + "/" + number_of_genomes ); + log( ( i + 1 ) + "/" + number_of_genomes, log_writer ); + System.out.println( "Processing : " + input_file_properties[ i ][ 0 ] ); + log( "Genome : " + input_file_properties[ i ][ 0 ], log_writer ); + HmmscanPerDomainTableParser parser = null; + INDIVIDUAL_SCORE_CUTOFF ind_score_cutoff = INDIVIDUAL_SCORE_CUTOFF.NONE; + if ( individual_score_cutoffs != null ) { + ind_score_cutoff = INDIVIDUAL_SCORE_CUTOFF_DEFAULT; + } + if ( ( positive_filter_file != null ) || ( negative_filter_file != null ) + || ( negative_domains_filter_file != null ) ) { + HmmscanPerDomainTableParser.FilterType filter_type = HmmscanPerDomainTableParser.FilterType.NONE; + if ( positive_filter_file != null ) { + filter_type = HmmscanPerDomainTableParser.FilterType.POSITIVE_PROTEIN; + } + else if ( negative_filter_file != null ) { + filter_type = HmmscanPerDomainTableParser.FilterType.NEGATIVE_PROTEIN; + } + else if ( negative_domains_filter_file != null ) { + filter_type = HmmscanPerDomainTableParser.FilterType.NEGATIVE_DOMAIN; + } + parser = new HmmscanPerDomainTableParser( new File( input_file_properties[ i ][ 0 ] ), + input_file_properties[ i ][ 1 ], + filter, + filter_type, + ind_score_cutoff ); + } + else { + parser = new HmmscanPerDomainTableParser( new File( 
input_file_properties[ i ][ 0 ] ), + input_file_properties[ i ][ 1 ], + ind_score_cutoff ); + } + if ( e_value_max >= 0.0 ) { + parser.setEValueMaximum( e_value_max ); + } + parser.setIgnoreDufs( ignore_dufs ); + parser.setIgnoreVirusLikeIds( ignore_virus_like_ids ); + parser.setIgnoreEngulfedDomains( no_engulfing_overlaps ); + if ( max_allowed_overlap != surfacing.MAX_ALLOWED_OVERLAP_DEFAULT ) { + parser.setMaxAllowedOverlap( max_allowed_overlap ); + } + parser + .setReturnType( HmmscanPerDomainTableParser.ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ); + if ( individual_score_cutoffs != null ) { + parser.setIndividualScoreCutoffs( individual_score_cutoffs ); + } + List protein_list = null; + try { + protein_list = parser.parse(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); + } + catch ( final Exception e ) { + ForesterUtil.unexpectedFatalError( surfacing.PRG_NAME, e.getMessage(), e ); + } + if ( VERBOSE ) { + System.out.println( "Domains ignored due to negative domain filter: " ); + ForesterUtil.printCountingMap( parser.getDomainsIgnoredDueToNegativeDomainFilterCountsMap() ); + System.out.println( "Domains ignored due to virus like id: " ); + ForesterUtil.printCountingMap( parser.getDomainsIgnoredDueToVirusLikeIdCountsMap() ); + } + System.out.println( "Number of proteins encountered : " + parser.getProteinsEncountered() ); + log( "Number of proteins encountered : " + parser.getProteinsEncountered(), log_writer ); + System.out.println( "Number of proteins stored : " + protein_list.size() ); + log( "Number of proteins stored : " + protein_list.size(), log_writer ); + System.out.println( "Domains encountered : " + parser.getDomainsEncountered() ); + log( "Domains encountered : " + parser.getDomainsEncountered(), log_writer ); + System.out.println( "Domains stored : " + parser.getDomainsStored() ); + log( "Domains stored : " + parser.getDomainsStored(), log_writer ); + System.out.println( 
"Distinct domains stored : " + + parser.getDomainsStoredSet().size() ); + log( "Distinct domains stored : " + parser.getDomainsStoredSet().size(), log_writer ); + System.out.println( "Domains ignored due to individual score cutoffs: " + + parser.getDomainsIgnoredDueToIndividualScoreCutoff() ); + log( "Domains ignored due to individual score cutoffs: " + + parser.getDomainsIgnoredDueToIndividualScoreCutoff(), log_writer ); + System.out.println( "Domains ignored due to E-value : " + + parser.getDomainsIgnoredDueToEval() ); + log( "Domains ignored due to E-value : " + parser.getDomainsIgnoredDueToEval(), log_writer ); + System.out.println( "Domains ignored due to DUF designation : " + + parser.getDomainsIgnoredDueToDuf() ); + log( "Domains ignored due to DUF designation : " + parser.getDomainsIgnoredDueToDuf(), log_writer ); + if ( ignore_virus_like_ids ) { + System.out.println( "Domains ignored due virus like ids : " + + parser.getDomainsIgnoredDueToVirusLikeIds() ); + log( "Domains ignored due virus like ids : " + parser.getDomainsIgnoredDueToVirusLikeIds(), + log_writer ); + } + System.out.println( "Domains ignored due negative domain filter : " + + parser.getDomainsIgnoredDueToNegativeDomainFilter() ); + log( "Domains ignored due negative domain filter : " + + parser.getDomainsIgnoredDueToNegativeDomainFilter(), log_writer ); + System.out.println( "Domains ignored due to overlap : " + + parser.getDomainsIgnoredDueToOverlap() ); + log( "Domains ignored due to overlap : " + parser.getDomainsIgnoredDueToOverlap(), + log_writer ); + if ( negative_filter_file != null ) { + System.out.println( "Proteins ignored due to negative filter : " + + parser.getProteinsIgnoredDueToFilter() ); + log( "Proteins ignored due to negative filter : " + parser.getProteinsIgnoredDueToFilter(), + log_writer ); + } + if ( positive_filter_file != null ) { + System.out.println( "Proteins ignored due to positive filter : " + + parser.getProteinsIgnoredDueToFilter() ); + log( "Proteins ignored 
due to positive filter : " + parser.getProteinsIgnoredDueToFilter(), + log_writer ); + } + System.out.println( "Time for processing : " + parser.getTime() + "ms" ); + log( "", log_writer ); + html_desc.append( "" + nl ); + // domain_partner_counts_array[ i ] = + // Methods.getDomainPartnerCounts( protein_domain_collections_array[ + // i ], + // false, input_file_properties[ i ][ 1 ] ); + try { + int count = 0; + for( final Protein protein : protein_list ) { + dc_data_writer.write( SurfacingUtil.proteinToDomainCombinations( protein, count + "", "\t" ) + .toString() ); + ++count; + } + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.toString() ); + } + gwcd_list.add( BasicGenomeWideCombinableDomains + .createInstance( protein_list, + ignore_combination_with_same, + new BasicSpecies( input_file_properties[ i ][ 1 ] ), + domain_id_to_go_ids_map, + dc_type ) ); + domain_lengths_table.addLengths( protein_list ); + if ( gwcd_list.get( i ).getSize() > 0 ) { + SurfacingUtil.writeDomainCombinationsCountsFile( input_file_properties, + out_dir, + per_genome_domain_promiscuity_statistics_writer, + gwcd_list.get( i ), + i, + dc_sort_order ); + if ( output_binary_domain_combinationsfor_graph_analysis ) { + SurfacingUtil.writeBinaryDomainCombinationsFileForGraphAnalysis( input_file_properties, + out_dir, + gwcd_list.get( i ), + i, + dc_sort_order ); + } + SurfacingUtil.addAllDomainIdsToSet( gwcd_list.get( i ), all_domains_encountered ); + SurfacingUtil.addAllBinaryDomainCombinationToSet( gwcd_list.get( i ), + all_bin_domain_combinations_encountered ); + } + if ( query_domains_writer_ary != null ) { + for( int j = 0; j < query_domain_ids_array.length; j++ ) { + try { + SurfacingUtil.extractProteinNames( protein_list, + query_domain_ids_array[ j ], + query_domains_writer_ary[ j ], + "\t" ); + query_domains_writer_ary[ j ].flush(); + } + catch ( final IOException e ) { + e.printStackTrace(); + } + } + } + if ( need_protein_lists_per_species ) { + 
protein_lists_per_species.put( new BasicSpecies( input_file_properties[ i ][ 1 ] ), protein_list ); + } + try { + log_writer.flush(); + } + catch ( final IOException e2 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getLocalizedMessage() ); + } + System.gc(); + } // for( int i = 0; i < number_of_genomes; ++i ) { + try { + per_genome_domain_promiscuity_statistics_writer.flush(); + per_genome_domain_promiscuity_statistics_writer.close(); + dc_data_writer.flush(); + dc_data_writer.close(); + log_writer.flush(); + log_writer.close(); + } + catch ( final IOException e2 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getLocalizedMessage() ); + } + ForesterUtil.programMessage( PRG_NAME, "Wrote domain promiscuities to: " + + per_genome_domain_promiscuity_statistics_file ); + if ( query_domains_writer_ary != null ) { + for( int j = 0; j < query_domain_ids_array.length; j++ ) { + try { + query_domains_writer_ary[ j ].close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.toString() ); + } + } + } + if ( PERFORM_DOMAIN_LENGTH_ANALYSIS ) { + try { + SurfacingUtil.executeDomainLengthAnalysis( input_file_properties, + number_of_genomes, + domain_lengths_table, + domain_lengths_analysis_outfile ); + } + catch ( final IOException e1 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e1.toString() ); + } + System.out.println(); + ForesterUtil.programMessage( PRG_NAME, "Wrote domain length data to: " + domain_lengths_analysis_outfile ); + System.out.println(); + } + final long analysis_start_time = new Date().getTime(); + PairwiseDomainSimilarityCalculator pw_calc = null; + // double[] values_for_all_scores_histogram = null; + final DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( domain_similarity_sort_field, + sort_by_species_count_first, + number_of_genomes == 2 ); + switch ( scoring ) { + case COMBINATIONS: + pw_calc = new CombinationsBasedPairwiseDomainSimilarityCalculator(); + break; + case DOMAINS: + 
pw_calc = new DomainCountsBasedPairwiseSimilarityCalculator(); + break; + case PROTEINS: + pw_calc = new ProteinCountsBasedPairwiseDomainSimilarityCalculator(); + break; + default: + ForesterUtil.unexpectedFatalError( surfacing.PRG_NAME, "unknown value for sorting for scoring" ); + } + DomainSimilarityCalculator.GoAnnotationOutput go_annotation_output = DomainSimilarityCalculator.GoAnnotationOutput.NONE; + if ( domain_id_to_go_ids_map != null ) { + go_annotation_output = DomainSimilarityCalculator.GoAnnotationOutput.ALL; + } + final SortedSet similarities = calc + .calculateSimilarities( pw_calc, + gwcd_list, + ignore_domains_without_combs_in_all_spec, + ignore_species_specific_domains ); + SurfacingUtil.decoratePrintableDomainSimilarities( similarities, + detailedness, + go_annotation_output, + go_id_to_term_map, + go_namespace_limit ); + DescriptiveStatistics pw_stats = null; + try { + String my_outfile = output_file.toString(); + if ( !my_outfile.endsWith( ".html" ) ) { + my_outfile += ".html"; + } + final Writer writer = new BufferedWriter( new FileWriter( out_dir == null ? my_outfile : out_dir + + ForesterUtil.FILE_SEPARATOR + my_outfile ) ); + List species_order = null; + if ( species_matrix ) { + species_order = new ArrayList(); + for( int i = 0; i < number_of_genomes; i++ ) { + species_order.add( new BasicSpecies( input_file_properties[ i ][ 1 ] ) ); + } + } + html_desc.append( "" + nl ); + html_desc.append( "" + nl ); + html_desc.append( "" + nl ); + html_desc.append( "
Produced by:" + surfacing.PRG_NAME + "
Version:" + surfacing.PRG_VERSION + "
Release Date:" + surfacing.PRG_DATE + "
Contact:" + surfacing.E_MAIL + "
WWW:" + surfacing.WWW + "
Input files names from:" + input_files_file + " [" + + input_file_names_from_file.length + " input files]
Positive protein filter:" + positive_filter_file + " [" + filter_size + + " domain ids]
Negative protein filter:" + negative_filter_file + " [" + filter_size + + " domain ids]
Negative domain filter:" + negative_domains_filter_file + " [" + + filter_size + " domain ids]
Plus-minus analysis:" + plus1 + "&& " + plus0 + "&& " + minus + + "
Cutoff scores file:" + cutoff_scores_file + "
E-value maximum (inclusive):" + e_value_max + "
Ignore virus, phage, transposition related ids:" + + ignore_virus_like_ids + "
Ignore DUFs:" + ignore_dufs + "
Max allowed domain overlap:" + max_allowed_overlap + "
Ignore (lower confidence) engulfed domains:" + no_engulfing_overlaps + + "
Ignore singlet domains for domain combination similarity analyses (not for parsimony analyses):" + + ignore_domains_without_combs_in_all_spec + "
Ignore species specific domains for domain combination similarity analyses (not for parsimony analyses):" + + ignore_species_specific_domains + "
Ignore combination with self for domain combination similarity analyses:" + + ignore_combination_with_same + "
Consider directedness of binary domain combinations:" + + ( dc_type != BinaryDomainCombination.DomainCombinationType.BASIC ) + "
Consider djacency of binary domain combinations:" + + ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT ) + "
Pfam to GO mappings from:" + pfam_to_go_file + " [" + + domain_id_to_go_ids_count + " mappings]" + "
GO terms from:" + go_obo_file + " [" + go_terms.size() + " terms]" + + "
Limit GO terms to" + go_namespace_limit + "
Suffix for PWC files" + automated_pairwise_comparison_suffix + + "
" + query_domain_ids + "
Scoring method:" ); + switch ( scoring ) { + case COMBINATIONS: + System.out.println( "domain combinations based" ); + html_desc.append( "domain combinations based" + "
Sort by:" ); + switch ( domain_similarity_sort_field ) { + case MIN: + System.out.print( "score minimum" ); + html_desc.append( "score minimum" ); + break; + case MAX: + System.out.print( "score maximum" ); + html_desc.append( "score maximum" ); + break; + case MEAN: + System.out.print( "score mean" ); + html_desc.append( "score mean" ); + break; + case SD: + System.out.print( "score standard deviation" ); + html_desc.append( "score standard deviation" ); + break; + case SPECIES_COUNT: + System.out.print( "species number" ); + html_desc.append( "species number" ); + break; + case DOMAIN_ID: + System.out.print( "alphabetical domain identifier" ); + html_desc.append( "alphabetical domain identifier" ); + break; + case MAX_DIFFERENCE: + System.out.print( "(maximal) difference" ); + html_desc.append( "(maximal) difference" ); + break; + case ABS_MAX_COUNTS_DIFFERENCE: + System.out.print( "absolute (maximal) counts difference" ); + html_desc.append( "absolute (maximal) counts difference" ); + break; + case MAX_COUNTS_DIFFERENCE: + System.out.print( "(maximal) counts difference" ); + html_desc.append( "(maximal) counts difference" ); + break; + default: + ForesterUtil.unexpectedFatalError( surfacing.PRG_NAME, "unknown value for sorting for similarities" ); + } + if ( sort_by_species_count_first ) { + System.out.println( " (sort by species count first)" ); + html_desc.append( " (sort by species count first)" ); + } + else { + System.out.println(); + } + html_desc.append( "
Domain combination data output: " + dc_data_file + "
Pairwise comparisons:
Sort by:" ); + switch ( domain_similarity_sort_field_for_automated_pwc ) { + case MEAN: + System.out.print( "score mean" ); + html_desc.append( "score mean" ); + break; + case DOMAIN_ID: + System.out.print( "alphabetical domain identifier" ); + html_desc.append( "alphabetical domain identifier" ); + break; + case MAX_DIFFERENCE: + System.out.print( "difference" ); + html_desc.append( "difference" ); + break; + case ABS_MAX_COUNTS_DIFFERENCE: + System.out.print( "absolute counts difference" ); + html_desc.append( "absolute counts difference" ); + break; + case MAX_COUNTS_DIFFERENCE: + System.out.print( "counts difference" ); + html_desc.append( "counts difference" ); + break; + default: + ForesterUtil + .unexpectedFatalError( surfacing.PRG_NAME, "unknown value for sorting for similarities" ); + } + System.out.println(); + html_desc.append( "
Jacknife:" + jacknife_resamplings + " resamplings
Jacknife ratio:" + ForesterUtil.round( jacknife_ratio, 2 ) + + "
Jacknife random number seed:" + random_seed + "
Infer species trees:true
Intree for gain/loss parsimony analysis:" + intree_file + + "
Random number seed for Fitch parsimony analysis:" + + random_number_seed_for_fitch_parsimony + "
Secondary features map file:" + + secondary_features_map_files[ i ] + "
Command line:" + cla.getCommandLineArgsAsString() + "
" + input_file_properties[ i ][ 0 ] + " [species: " + + input_file_properties[ i ][ 1 ] + "]" + ":domains analyzed: " + + parser.getDomainsStored() + "; domains ignored: [ind score cutoffs: " + + parser.getDomainsIgnoredDueToIndividualScoreCutoff() + "] [E-value cutoff: " + + parser.getDomainsIgnoredDueToEval() + "] [DUF: " + parser.getDomainsIgnoredDueToDuf() + + "] [virus like ids: " + parser.getDomainsIgnoredDueToVirusLikeIds() + + "] [negative domain filter: " + parser.getDomainsIgnoredDueToNegativeDomainFilter() + + "] [overlap: " + parser.getDomainsIgnoredDueToOverlap() + "]" ); + if ( negative_filter_file != null ) { + html_desc.append( "; proteins ignored due to negative filter: " + + parser.getProteinsIgnoredDueToFilter() ); + } + if ( positive_filter_file != null ) { + html_desc.append( "; proteins ignored due to positive filter: " + + parser.getProteinsIgnoredDueToFilter() ); + } + html_desc.append( "
Sum of all distinct binary combinations:" + + all_bin_domain_combinations_encountered.size() + "
Sum of all distinct domains:" + all_domains_encountered.size() + + "
Analysis date/time:" + + new java.text.SimpleDateFormat( "yyyy.MM.dd HH:mm:ss" ).format( new java.util.Date() ) + + "
" + nl ); + pw_stats = SurfacingUtil + .writeDomainSimilaritiesToFile( html_desc, + new StringBuilder( number_of_genomes + " genomes" ), + writer, + similarities, + number_of_genomes == 2, + species_order, + domain_similarity_print_option, + domain_similarity_sort_field, + scoring, + true ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote main output (includes domain similarities) to: \"" + + ( out_dir == null ? my_outfile : out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile ) + "\"" ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "Failed to write similarites to: \"" + output_file + "\" [" + + e.getMessage() + "]" ); + } + System.out.println(); + // values_for_all_scores_histogram = pw_stats.getDataAsDoubleArray(); + final Species[] species = new Species[ number_of_genomes ]; + for( int i = 0; i < number_of_genomes; ++i ) { + species[ i ] = new BasicSpecies( input_file_properties[ i ][ 1 ] ); + } + List inferred_trees = null; + if ( ( number_of_genomes > 2 ) && perform_pwc ) { + final PairwiseGenomeComparator pwgc = new PairwiseGenomeComparator(); + pwgc.performPairwiseComparisons( html_desc, + sort_by_species_count_first, + detailedness, + ignore_domains_without_combs_in_all_spec, + ignore_species_specific_domains, + domain_similarity_sort_field_for_automated_pwc, + domain_similarity_print_option, + scoring, + domain_id_to_go_ids_map, + go_id_to_term_map, + go_namespace_limit, + species, + number_of_genomes, + gwcd_list, + pw_calc, + automated_pairwise_comparison_suffix, + true, + surfacing.PAIRWISE_DOMAIN_COMPARISONS_PREFIX, + surfacing.PRG_NAME, + display_histograms, + out_dir, + write_pwc_files ); + String matrix_output_file = new String( output_file.toString() ); + if ( matrix_output_file.indexOf( '.' ) > 1 ) { + matrix_output_file = matrix_output_file.substring( 0, matrix_output_file.indexOf( '.' 
) ); + } + if ( out_dir != null ) { + matrix_output_file = out_dir + ForesterUtil.FILE_SEPARATOR + matrix_output_file; + output_file = new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file ); + } + SurfacingUtil.writeMatrixToFile( new File( matrix_output_file + + surfacing.MATRIX_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX ), pwgc.getDomainDistanceScoresMeans() ); + SurfacingUtil.writeMatrixToFile( new File( matrix_output_file + + surfacing.MATRIX_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX ), pwgc + .getSharedBinaryCombinationsBasedDistances() ); + SurfacingUtil.writeMatrixToFile( new File( matrix_output_file + + surfacing.MATRIX_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX ), pwgc + .getSharedDomainsBasedDistances() ); + final Phylogeny nj_gd = SurfacingUtil.createNjTreeBasedOnMatrixToFile( new File( matrix_output_file + + surfacing.NJ_TREE_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX ), pwgc.getDomainDistanceScoresMeans() + .get( 0 ) ); + final Phylogeny nj_bc = SurfacingUtil.createNjTreeBasedOnMatrixToFile( new File( matrix_output_file + + surfacing.NJ_TREE_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX ), pwgc + .getSharedBinaryCombinationsBasedDistances().get( 0 ) ); + final Phylogeny nj_d = SurfacingUtil.createNjTreeBasedOnMatrixToFile( new File( matrix_output_file + + surfacing.NJ_TREE_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX ), pwgc + .getSharedDomainsBasedDistances().get( 0 ) ); + inferred_trees = new ArrayList(); + inferred_trees.add( nj_gd ); + inferred_trees.add( nj_bc ); + inferred_trees.add( nj_d ); + // final List histogram_datas = pwgc.getHistogramDatas(); + // if ( infer_species_trees ) { + // inferred_trees = new ArrayList(); + // final List inferred_trees_bc = inferSpeciesTrees( new File( output_file + INFERRED_SBC_BASED_NJ_SPECIES_TREE_SUFFIX ), pwgc + // .getSharedBinaryCombinationsBasedDistances() ); + // final List inferred_trees_d = inferSpeciesTrees( new File( output_file + INFERRED_SD_BASED_NJ_SPECIES_TREE_SUFFIX ), pwgc + 
// .getSharedDomainsBasedDistances() ); + // inferred_trees.addAll( inferred_trees_bc ); + // inferred_trees.addAll( inferred_trees_d ); + // } + if ( jacknifed_distances ) { + pwgc.performPairwiseComparisonsJacknifed( species, + number_of_genomes, + gwcd_list, + true, + jacknife_resamplings, + jacknife_ratio, + random_seed ); + SurfacingUtil.writeMatrixToFile( new File( matrix_output_file + "_" + + ForesterUtil.round( jacknife_ratio, 2 ) + "_" + jacknife_resamplings + + surfacing.MATRIX_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX ), pwgc + .getSharedBinaryCombinationsBasedDistances() ); + SurfacingUtil.writeMatrixToFile( new File( matrix_output_file + "_" + + ForesterUtil.round( jacknife_ratio, 2 ) + "_" + jacknife_resamplings + + surfacing.MATRIX_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX ), pwgc + .getSharedDomainsBasedDistances() ); + // if ( infer_species_trees ) { + // inferSpeciesTrees( new File( output_file + "_" + jacknife_resamplings + // + INFERRED_SBC_BASED_NJ_SPECIES_TREE_SUFFIX ), pwgc + // .getSharedBinaryCombinationsBasedDistances() ); + // inferSpeciesTrees( new File( output_file + "_" + jacknife_resamplings + // + INFERRED_SD_BASED_NJ_SPECIES_TREE_SUFFIX ), pwgc.getSharedDomainsBasedDistances() ); + // } + } + if ( display_histograms ) { + // final List histogram_datas_all = new ArrayList(); + // histogram_datas_all.add( new HistogramData( "all", + // values_for_all_scores_histogram, + // null, + // 20 ) ); + // final HistogramsFrame hf_all = new HistogramsFrame( histogram_datas_all ); + // final HistogramsFrame hf = new HistogramsFrame( histogram_datas ); + // hf_all.setVisible( true ); + // hf.setVisible( true ); + } + } // if ( ( output_file != null ) && ( number_of_genomes > 2 ) && !isEmpty( automated_pairwise_comparison_suffix ) ) + if ( ( out_dir != null ) && ( !perform_pwc ) ) { + output_file = new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file ); + } + writePresentToNexus( output_file, positive_filter_file, filter, 
gwcd_list ); + if ( ( ( intrees != null ) && ( intrees.length > 0 ) ) && ( number_of_genomes > 2 ) ) { + final StringBuilder parameters_sb = createParametersAsString( ignore_dufs, + e_value_max, + max_allowed_overlap, + no_engulfing_overlaps, + cutoff_scores_file, + dc_type ); + String s = "_"; + if ( radomize_fitch_parsimony ) { + s += random_number_seed_for_fitch_parsimony + "_"; + } + int i = 0; + for( final Phylogeny intree : intrees ) { + final String outfile_name = ForesterUtil.removeSuffix( output_file.toString() ) + s + + ForesterUtil.removeSuffix( intree_files[ i ].toString() ); + final DomainParsimonyCalculator domain_parsimony = DomainParsimonyCalculator.createInstance( intree, + gwcd_list ); + SurfacingUtil.executeParsimonyAnalysis( random_number_seed_for_fitch_parsimony, + radomize_fitch_parsimony, + outfile_name, + domain_parsimony, + intree, + domain_id_to_go_ids_map, + go_id_to_term_map, + go_namespace_limit, + parameters_sb.toString(), + domain_id_to_secondary_features_maps, + positive_filter_file == null ? null : filter, + output_binary_domain_combinationsfor_graph_analysis, + all_bin_domain_combinations_gained_fitch, + all_bin_domain_combinations_lost_fitch, + dc_type ); + // Listing of all domain combinations gained is only done if only one input tree is used. 
+ if ( ( domain_id_to_secondary_features_maps != null ) + && ( domain_id_to_secondary_features_maps.length > 0 ) ) { + int j = 0; + for( final Map> domain_id_to_secondary_features_map : domain_id_to_secondary_features_maps ) { + final Map mapping_results_map = new TreeMap(); + final DomainParsimonyCalculator secondary_features_parsimony = DomainParsimonyCalculator + .createInstance( intree, gwcd_list, domain_id_to_secondary_features_map ); + SurfacingUtil + .executeParsimonyAnalysisForSecondaryFeatures( outfile_name + + "_" + + secondary_features_map_files[ j++ ], + secondary_features_parsimony, + intree, + parameters_sb.toString(), + mapping_results_map ); + if ( i == 0 ) { + System.out.println(); + System.out.println( "Mapping to secondary features:" ); + for( final Species spec : mapping_results_map.keySet() ) { + final MappingResults mapping_results = mapping_results_map.get( spec ); + final int total_domains = mapping_results.getSumOfFailures() + + mapping_results.getSumOfSuccesses(); + System.out.print( spec + ":" ); + System.out.print( " mapped domains = " + mapping_results.getSumOfSuccesses() ); + System.out.print( ", not mapped domains = " + mapping_results.getSumOfFailures() ); + if ( total_domains > 0 ) { + System.out.println( ", mapped ratio = " + + ( 100 * mapping_results.getSumOfSuccesses() / total_domains ) + "%" ); + } + else { + System.out.println( ", mapped ratio = n/a (total domains = 0 )" ); + } + } + } + } + } + i++; + } // for( final Phylogeny intree : intrees ) { + } + if ( plus_minus_analysis_high_copy_base_species.size() > 0 ) { + executePlusMinusAnalysis( output_file, + plus_minus_analysis_high_copy_base_species, + plus_minus_analysis_high_copy_target_species, + plus_minus_analysis_high_low_copy_species, + gwcd_list, + protein_lists_per_species, + domain_id_to_go_ids_map, + go_id_to_term_map, + plus_minus_analysis_numbers ); + } + if ( output_protein_lists_for_all_domains ) { + writeProteinListsForAllSpecies( out_dir, 
protein_lists_per_species, gwcd_list ); + } + // if ( ( intrees != null ) && ( intrees.length > 0 ) && ( inferred_trees != null ) && ( inferred_trees.size() > 0 ) ) { + // final StringBuilder parameters_sb = createParametersAsString( ignore_dufs, + // e_value_max, + // max_allowed_overlap, + // no_engulfing_overlaps, + // cutoff_scores_file ); + // String s = "_"; + // if ( radomize_fitch_parsimony ) { + // s += random_number_seed_for_fitch_parsimony + "_"; + // } + // int i = 0; + // for( final Phylogeny inferred_tree : inferred_trees ) { + // if ( !inferred_tree.isRooted() ) { + // intrees[ 0 ].getRoot().getName(); + // inferred_tree.r + // } + // final String outfile_name = ForesterUtil.removeSuffix( inferred_tree.getName() ) + s; + // final DomainParsimonyCalculator domain_parsimony = DomainParsimonyCalculator + // .createInstance( inferred_tree, gwcd_list ); + // SurfacingUtil.executeParsimonyAnalysis( random_number_seed_for_fitch_parsimony, + // radomize_fitch_parsimony, + // outfile_name, + // domain_parsimony, + // inferred_tree, + // domain_id_to_go_ids_map, + // go_id_to_term_map, + // go_namespace_limit, + // parameters_sb.toString() ); + // i++; + // } + // } + if ( all_bin_domain_combinations_gained_fitch != null ) { + try { + executeFitchGainsAnalysis( new File( output_file + + surfacing.OUTPUT_DOMAIN_COMBINATIONS_GAINED_MORE_THAN_ONCE_ANALYSIS_SUFFIX ), + all_bin_domain_combinations_gained_fitch, + all_domains_encountered.size(), + all_bin_domain_combinations_encountered, + true ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() ); + } + } + if ( all_bin_domain_combinations_lost_fitch != null ) { + try { + executeFitchGainsAnalysis( new File( output_file + + surfacing.OUTPUT_DOMAIN_COMBINATIONS_LOST_MORE_THAN_ONCE_ANALYSIS_SUFFIX ), + all_bin_domain_combinations_lost_fitch, + all_domains_encountered.size(), + all_bin_domain_combinations_encountered, + false ); + } + catch ( final IOException e ) { + 
ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() ); + } + } + final Runtime rt = java.lang.Runtime.getRuntime(); + final long free_memory = rt.freeMemory() / 1000000; + final long total_memory = rt.totalMemory() / 1000000; + System.out.println(); + System.out.println( "Time for analysis : " + ( new Date().getTime() - analysis_start_time ) + "ms" ); + System.out.println( "Total running time: " + ( new Date().getTime() - start_time ) + "ms " ); + System.out.println( "Free memory : " + free_memory + "MB, total memory: " + total_memory + "MB" ); + System.out.println(); + System.out.println( "If this application is useful to you, please cite:" ); + System.out.println( surfacing.WWW ); + System.out.println(); + ForesterUtil.programMessage( PRG_NAME, "OK" ); + System.out.println(); + } + + private static void preparePhylogenyForParsimonyAnalyses( final Phylogeny intree, + final String[][] input_file_properties ) { + final String[] genomes = new String[ input_file_properties.length ]; + for( int i = 0; i < input_file_properties.length; ++i ) { + if ( intree.getNodes( input_file_properties[ i ][ 1 ] ).size() > 1 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "node named [" + input_file_properties[ i ][ 1 ] + + "] is not unique in input tree " + intree.getName() ); + } + genomes[ i ] = input_file_properties[ i ][ 1 ]; + } + // + final PhylogenyNodeIterator it = intree.iteratorPostorder(); + while ( it.hasNext() ) { + final PhylogenyNode n = it.next(); + if ( ForesterUtil.isEmpty( n.getName() ) ) { + if ( n.getNodeData().isHasTaxonomy() + && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getScientificName() ) ) { + n.setName( n.getNodeData().getTaxonomy().getScientificName() ); + } + else { + ForesterUtil.fatalError( surfacing.PRG_NAME, + "node without both name and scientific taxonomy name found" ); + } + } + } + // + final List igns = PhylogenyMethods.deleteExternalNodesPositiveSelection( genomes, intree ); + if ( igns.size() > 0 ) { + 
System.out.println( "Not using the following " + igns.size() + " nodes:" ); + for( int i = 0; i < igns.size(); ++i ) { + System.out.println( " " + i + ": " + igns.get( i ) ); + } + System.out.println( "--" ); + } + for( int i = 0; i < input_file_properties.length; ++i ) { + try { + intree.getNode( input_file_properties[ i ][ 1 ] ); + } + catch ( final IllegalArgumentException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "node named [" + input_file_properties[ i ][ 1 ] + + "] not present/not unique in input tree" ); + } + } + } + + // public static StringBuffer stringCombinableDomainsMapToStringBuffer( + // final SortedMap map ) { + // final StringBuffer sb = new StringBuffer(); + // for( final Iterator iter = map.keySet().iterator(); + // iter.hasNext(); ) { + // final Object key = iter.next(); + // sb.append( ForesterUtil.pad( new StringBuffer( key.toString() ), 18, ' ', + // false ) ); + // final CombinableDomains domain_combination = map.get( key ); + // sb.append( ForesterUtil.pad( new StringBuffer( "" + + // domain_combination.getNumberOfCombiningDomains() ), 8, + // ' ', false ) ); + // sb.append( domain_combination.toStringBuffer() ); + // sb.append( ForesterUtil.getLineSeparator() ); + // } + // return sb; + // } + private static void printHelp() { + System.out.println(); + System.out.println( "Usage:" ); + System.out.println(); + System.out.println( "% java -Xms256m -Xmx512m -cp forester.jar org.forester.applications." + surfacing.PRG_NAME + + " [options] [external node name 1] [name 2] ... [name n]" ); + System.out.println(); + System.out.println( " Note: This software might need a significant amount of memory (heap space);" ); + System.out + .println( " hence use \"-Xms128m -Xmx512m\" (or more) to prevent a \"java.lang.OutOfMemoryError\"." 
); + System.out.println(); + System.out.println( " Options: " ); + System.out.println( surfacing.DETAILEDNESS_OPTION + ": level of detail for similarities output file (default:" + + DETAILEDNESS_DEFAULT + ")" ); + System.out.println( surfacing.IGNORE_COMBINATION_WITH_SAME_OPTION + + ": to ignore combinations with self (default: not to ignore)" ); + System.out + .println( surfacing.IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_OPTION + + ": to ignore domains without combinations in any species (for similarity calc purposes, not for parsimony analyses) (default: not to ignore)" ); + System.out + .println( surfacing.IGNORE_DOMAINS_SPECIFIC_TO_ONE_SPECIES_OPTION + + ": to ignore domains specific to one species (for similarity calc purposes, not for parsimony analyses) (default: not to ignore)" ); + System.out.println( surfacing.NOT_IGNORE_DUFS_OPTION + + ": to _not_ ignore DUFs (domains with unknown function) (default: ignore DUFs)" ); + System.out + .println( surfacing.IGNORE_VIRAL_IDS + + ": to ignore domains with ids containing 'vir', 'retro', 'transpos', 'phage', or starting with 'rv' or 'gag_'" ); + System.out.println( surfacing.DOMAIN_SIMILARITY_SORT_OPTION + ": sorting for similarities (default: " + + DOMAIN_SORT_FILD_DEFAULT + ")" ); + System.out.println( surfacing.OUTPUT_FILE_OPTION + ": name for (main) output file (mandatory)" ); + System.out.println( surfacing.MAX_E_VALUE_OPTION + ": max (inclusive) E-value" ); + System.out.println( surfacing.MAX_ALLOWED_OVERLAP_OPTION + ": maximal allowed domain overlap" ); + System.out.println( surfacing.NO_ENGULFING_OVERLAP_OPTION + ": to ignore engulfed lower confidence domains" ); + System.out.println( surfacing.SPECIES_MATRIX_OPTION + ": species matrix" ); + System.out.println( surfacing.SCORING_OPTION + ": scoring (default:" + SCORING_DEFAULT + ")" ); + System.out.println( surfacing.DOMAIN_COUNT_SORT_OPTION + ": sorting for domain counts (default:" + + DOMAINS_SORT_ORDER_DEFAULT + ")" ); + System.out.println( 
surfacing.DOMAIN_SIMILARITY_PRINT_OPTION + ": domain similarity print option (default:" + + DOMAIN_SIMILARITY_PRINT_OPTION_DEFAULT + ")" ); + System.out.println( surfacing.CUTOFF_SCORE_FILE_OPTION + ": cutoff score file" ); + System.out.println( surfacing.DOMAIN_SIMILARITY_SORT_BY_SPECIES_COUNT_FIRST_OPTION + + ": sort by species count first" ); + System.out.println( surfacing.OUTPUT_DIR_OPTION + ": output directory" ); + System.out.println( surfacing.PFAM_TO_GO_FILE_USE_OPTION + ": Pfam to GO mapping file" ); + System.out.println( surfacing.GO_OBO_FILE_USE_OPTION + ": GO terms file (OBO format)" ); + System.out.println( surfacing.GO_NAMESPACE_LIMIT_OPTION + ": limit GO term to one GO namespace" ); + System.out.println( surfacing.PAIRWISE_DOMAIN_COMPARISONS_OPTION + + "[=]: to perform pairwise comparison based analyses" ); + System.out.println( surfacing.INPUT_SPECIES_TREE_OPTION + + ": species tree, to perform (Dollo, Fitch) parismony analyses" ); + System.out + .println( surfacing.DISPLAY_M_HISTOGRAMS_OPTION + ": to display multiple histograms (using fluorite)" ); + System.out + .println( JACKNIFE_OPTION + + ": perform jacknife resampling for domain and binary domain combination based distance matrices [default resamplings: " + + JACKNIFE_NUMBER_OF_RESAMPLINGS_DEFAULT + "]" ); + System.out.println( JACKNIFE_RATIO_OPTION + ": ratio for jacknife resampling [default: " + + JACKNIFE_RATIO_DEFAULT + "]" ); + System.out.println( JACKNIFE_RANDOM_SEED_OPTION + + ": seed for random number generator for jacknife resampling [default: " + + JACKNIFE_RANDOM_SEED_DEFAULT + "]" ); + // System.out.println( surfacing.INFER_SPECIES_TREES_OPTION + // + ": to infer NJ species trees based on shared domains/binary domain combinations" ); + System.out + .println( surfacing.INPUT_SPECIES_TREE_OPTION + + "=: to infer domain/binary domain combination gains/losses on given species trees" ); + System.out.println( surfacing.FILTER_POSITIVE_OPTION + + "=: to filter out proteins not containing 
at least one domain listed in " ); + System.out.println( surfacing.FILTER_NEGATIVE_OPTION + + "=: to filter out proteins containing at least one domain listed in " ); + System.out.println( surfacing.FILTER_NEGATIVE_DOMAINS_OPTION + + "=: to filter out (ignore) domains listed in " ); + System.out.println( surfacing.INPUT_FILES_FROM_FILE_OPTION + "=: to read input files from " ); + System.out + .println( surfacing.RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION + + "=: seed for random number generator for Fitch Parsimony analysis (type: long, default: no randomization - given a choice, prefer absence" ); + System.out.println( surfacing.CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS + + ": to consider directedness in binary combinations: e.g. A-B != B-A" ); + System.out.println( surfacing.CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS_AND_ADJACENCY + + ": to consider directedness and adjacency in binary combinations" ); + System.out + .println( surfacing.SEQ_EXTRACT_OPTION + + "=: to extract sequence names of sequences containing matching domains and/or domain-sequences (order N to C) (domain separator: '~', domain sequences speparator: '#', e.g. 
'NACHT#BIR~CARD')" ); + System.out.println( surfacing.SECONDARY_FEATURES_PARSIMONY_MAP_FILE + + "=: to perfom parsimony analysis on secondary features" ); + System.out.println( surfacing.PLUS_MINUS_ANALYSIS_OPTION + "=: to presence/absence genome analysis" ); + System.out.println( surfacing.DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS + + ": to output binary domain combinations for (downstream) graph analysis" ); + System.out.println( surfacing.OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS + ": to output all proteins per domain" ); + System.out.println(); + System.out.println(); + System.out.println( "Example: java -Xms128m -Xmx512m -cp path/to/forester.jar" + + "org.forester.application.surfacing -detail=punctilious -o=TEST.html -pwc=TEST" + + " -cos=Pfam_ls_22_TC2 -p2g=pfam2go -obo=gene_ontology_edit.obo " + + "-dc_sort=dom -ignore_with_self -no_singles -e=0.001 -mo=1 -no_eo " + + "-ds_output=detailed_html -scoring=domains -sort=alpha -" + JACKNIFE_OPTION + + "=50 human mouse brafl strpu" ); + System.out.println(); + } + + private static void processFilter( final File filter_file, final SortedSet filter ) { + SortedSet filter_str = null; + try { + filter_str = ForesterUtil.file2set( filter_file ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); + } + if ( filter_str != null ) { + for( final String string : filter_str ) { + filter.add( new DomainId( string ) ); + } + } + if ( VERBOSE ) { + System.out.println( "Filter:" ); + for( final DomainId domainId : filter ) { + System.out.println( domainId.getId() ); + } + } + } + + private static String[][] processInputFileNames( final String[] names ) { + final String[][] input_file_properties = new String[ names.length ][]; + for( int i = 0; i < names.length; ++i ) { + if ( names[ i ].indexOf( SEPARATOR_FOR_INPUT_VALUES ) < 0 ) { + input_file_properties[ i ] = new String[ 2 ]; + input_file_properties[ i ][ 0 ] = names[ i ]; + input_file_properties[ i ][ 1 ] = names[ i ]; 
+ } + else { + input_file_properties[ i ] = names[ i ].split( surfacing.SEPARATOR_FOR_INPUT_VALUES + "" ); + if ( input_file_properties[ i ].length != 3 ) { + ForesterUtil + .fatalError( surfacing.PRG_NAME, + "properties for the input files (hmmpfam output) are expected " + + "to be in the following format \"#\" (or just one word, which is both the filename and the species id), instead received \"" + + names[ i ] + "\"" ); + } + } + final String error = ForesterUtil.isReadableFile( new File( input_file_properties[ i ][ 0 ] ) ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, error ); + } + } + return input_file_properties; + } + + private static void processPlusMinusAnalysisOption( final CommandLineArguments cla, + final List high_copy_base, + final List high_copy_target, + final List low_copy, + final List numbers ) { + if ( cla.isOptionSet( surfacing.PLUS_MINUS_ANALYSIS_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing.PLUS_MINUS_ANALYSIS_OPTION ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for 'plus-minus' file: -" + + surfacing.PLUS_MINUS_ANALYSIS_OPTION + "=" ); + } + final File plus_minus_file = new File( cla.getOptionValue( surfacing.PLUS_MINUS_ANALYSIS_OPTION ) ); + final String msg = ForesterUtil.isReadableFile( plus_minus_file ); + if ( !ForesterUtil.isEmpty( msg ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "can not read from \"" + plus_minus_file + "\": " + msg ); + } + processPlusMinusFile( plus_minus_file, high_copy_base, high_copy_target, low_copy, numbers ); + } + } + + // First numbers is minimal difference, second is factor. 
+ private static void processPlusMinusFile( final File plus_minus_file, + final List high_copy_base, + final List high_copy_target, + final List low_copy, + final List numbers ) { + Set species_set = null; + int min_diff = PLUS_MINUS_ANALYSIS_MIN_DIFF_DEFAULT; + double factor = PLUS_MINUS_ANALYSIS_FACTOR_DEFAULT; + try { + species_set = ForesterUtil.file2set( plus_minus_file ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); + } + if ( species_set != null ) { + for( final String species : species_set ) { + final String species_trimmed = species.substring( 1 ); + if ( species.startsWith( "+" ) ) { + if ( low_copy.contains( species_trimmed ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, + "species/genome names can not appear with both '+' and '-' suffix, as appears the case for: \"" + + species_trimmed + "\"" ); + } + high_copy_base.add( species_trimmed ); + } + else if ( species.startsWith( "*" ) ) { + if ( low_copy.contains( species_trimmed ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, + "species/genome names can not appear with both '*' and '-' suffix, as appears the case for: \"" + + species_trimmed + "\"" ); + } + high_copy_target.add( species_trimmed ); + } + else if ( species.startsWith( "-" ) ) { + if ( high_copy_base.contains( species_trimmed ) || high_copy_target.contains( species_trimmed ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, + "species/genome names can not appear with both '+' or '*' and '-' suffix, as appears the case for: \"" + + species_trimmed + "\"" ); + } + low_copy.add( species_trimmed ); + } + else if ( species.startsWith( "$D" ) ) { + try { + min_diff = Integer.parseInt( species.substring( 3 ) ); + } + catch ( final NumberFormatException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, + "could not parse integer value for minimal difference from: \"" + + species.substring( 3 ) + "\"" ); + } + } + else if ( species.startsWith( "$F" ) ) { + try { + factor = 
Double.parseDouble( species.substring( 3 ) ); + } + catch ( final NumberFormatException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "could not parse double value for factor from: \"" + + species.substring( 3 ) + "\"" ); + } + } + else if ( species.startsWith( "#" ) ) { + // Comment, ignore. + } + else { + ForesterUtil + .fatalError( surfacing.PRG_NAME, + "species/genome names in 'plus minus' file must begin with '*' (high copy target genome), '+' (high copy base genomes), '-' (low copy genomes), '$D=' minimal Difference (default is 1), '$F=' factor (default is 1.0), double), or '#' (ignore) suffix, encountered: \"" + + species + "\"" ); + } + numbers.add( new Integer( min_diff + "" ) ); + numbers.add( new Double( factor + "" ) ); + } + } + else { + ForesterUtil.fatalError( surfacing.PRG_NAME, "'plus minus' file [" + plus_minus_file + "] appears empty" ); + } + } + + private static void writePresentToNexus( final File output_file, + final File positive_filter_file, + final SortedSet filter, + final List gwcd_list ) { + try { + SurfacingUtil + .writeMatrixToFile( DomainParsimonyCalculator + .createMatrixOfDomainPresenceOrAbsence( gwcd_list, positive_filter_file == null ? 
null + : filter ), output_file + DOMAINS_PRESENT_NEXUS, Format.NEXUS_BINARY ); + SurfacingUtil.writeMatrixToFile( DomainParsimonyCalculator + .createMatrixOfBinaryDomainCombinationPresenceOrAbsence( gwcd_list ), output_file + + BDC_PRESENT_NEXUS, Format.NEXUS_BINARY ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); + } + } + + private static void writeProteinListsForAllSpecies( final File output_dir, + final SortedMap> protein_lists_per_species, + final List gwcd_list ) { + final SortedSet all_domains = new TreeSet(); + for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { + all_domains.addAll( gwcd.getAllDomainIds() ); + } + for( final DomainId domain : all_domains ) { + final File out = new File( output_dir + ForesterUtil.FILE_SEPARATOR + domain + SEQ_EXTRACT_SUFFIX ); + SurfacingUtil.checkForOutputFileWriteability( out ); + try { + final Writer proteins_file_writer = new BufferedWriter( new FileWriter( out ) ); + SurfacingUtil.extractProteinNames( protein_lists_per_species, domain, proteins_file_writer, "\t" ); + proteins_file_writer.close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); + } + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote proteins list to \"" + out + "\"" ); + } + } +} diff --git a/forester/java/src/org/forester/application/surfacing_hmmpfam.java b/forester/java/src/org/forester/application/surfacing_hmmpfam.java new file mode 100644 index 0000000..9185725 --- /dev/null +++ b/forester/java/src/org/forester/application/surfacing_hmmpfam.java @@ -0,0 +1,2575 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. 
Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.application; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedMap; +import java.util.SortedSet; +import java.util.TreeMap; +import java.util.TreeSet; + +import org.forester.evoinference.distance.NeighborJoining; +import org.forester.evoinference.matrix.distance.DistanceMatrix; +import org.forester.go.GoId; +import org.forester.go.GoNameSpace; +import org.forester.go.GoTerm; +import org.forester.go.GoUtils; +import org.forester.go.OBOparser; +import org.forester.go.PfamToGoMapping; +import org.forester.go.PfamToGoParser; +import org.forester.io.parsers.HmmPfamOutputParser; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import 
org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.surfacing.BasicDomainSimilarityCalculator; +import org.forester.surfacing.BasicGenomeWideCombinableDomains; +import org.forester.surfacing.BasicSpecies; +import org.forester.surfacing.BinaryDomainCombination; +import org.forester.surfacing.CombinationsBasedPairwiseDomainSimilarityCalculator; +import org.forester.surfacing.DomainCountsBasedPairwiseSimilarityCalculator; +import org.forester.surfacing.DomainCountsDifferenceUtil; +import org.forester.surfacing.DomainId; +import org.forester.surfacing.DomainLengthsTable; +import org.forester.surfacing.DomainParsimonyCalculator; +import org.forester.surfacing.DomainSimilarity; +import org.forester.surfacing.DomainSimilarityCalculator; +import org.forester.surfacing.GenomeWideCombinableDomains; +import org.forester.surfacing.MappingResults; +import org.forester.surfacing.PairwiseDomainSimilarityCalculator; +import org.forester.surfacing.PairwiseGenomeComparator; +import org.forester.surfacing.PrintableDomainSimilarity; +import org.forester.surfacing.Protein; +import org.forester.surfacing.ProteinCountsBasedPairwiseDomainSimilarityCalculator; +import org.forester.surfacing.Species; +import org.forester.surfacing.SurfacingUtil; +import org.forester.surfacing.DomainSimilarity.DomainSimilarityScoring; +import org.forester.surfacing.DomainSimilarity.DomainSimilaritySortField; +import org.forester.surfacing.DomainSimilarityCalculator.Detailedness; +import org.forester.surfacing.GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder; +import org.forester.surfacing.PrintableDomainSimilarity.PRINT_OPTION; +import org.forester.util.BasicTable; +import org.forester.util.BasicTableParser; +import org.forester.util.CommandLineArguments; +import org.forester.util.DescriptiveStatistics; +import org.forester.util.ForesterConstants; +import org.forester.util.ForesterUtil; + +public class surfacing_hmmpfam { + + public final static String 
DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS = "graph_analysis_out"; + public final static String DOMAIN_COMBINITONS_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS = "_dc.dot"; + public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_BC_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS = "_fitch_present_dc.dot"; + public final static String DOMAIN_COMBINITON_COUNTS_OUTPUTFILE_SUFFIX = ".dcc"; + // gain/loss: + public final static String PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_DOMAINS = "_dollo_gl_d"; + public final static String PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_BINARY_COMBINATIONS = "_dollo_gl_dc"; + public final static String PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_DOMAINS = "_fitch_gl_d"; + public final static String PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_BINARY_COMBINATIONS = "_fitch_gl_dc"; + // gain/loss counts: + public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_DOMAINS = "_dollo_glc_d"; + public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_BINARY_COMBINATIONS = "_dollo_glc_dc"; + public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_DOMAINS = "_fitch_glc_d"; + public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_BINARY_COMBINATIONS = "_fitch_glc_dc"; + // tables: + public final static String PARSIMONY_OUTPUT_FITCH_GAINS_BC = "_fitch_gains_dc"; + public final static String PARSIMONY_OUTPUT_FITCH_GAINS_HTML_BC = "_fitch_gains_dc.html"; + public final static String PARSIMONY_OUTPUT_FITCH_LOSSES_BC = "_fitch_losses_dc"; + public final static String PARSIMONY_OUTPUT_FITCH_LOSSES_HTML_BC = "_fitch_losses_dc.html"; + public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_BC = "_fitch_present_dc"; + public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_HTML_BC = "_fitch_present_dc.html"; + public final static String PARSIMONY_OUTPUT_DOLLO_GAINS_D = "_dollo_gains_d"; + public final static String PARSIMONY_OUTPUT_DOLLO_GAINS_GOID_D = "_dollo_gains_goid_d"; + public final static String PARSIMONY_OUTPUT_DOLLO_GAINS_HTML_D = 
"_dollo_gains_d.html"; + public final static String PARSIMONY_OUTPUT_DOLLO_LOSSES_D = "_dollo_losses_d"; + public final static String PARSIMONY_OUTPUT_DOLLO_LOSSES_HTML_D = "_dollo_losses_d.html"; + public final static String PARSIMONY_OUTPUT_DOLLO_PRESENT_D = "_dollo_present_d"; + public final static String PARSIMONY_OUTPUT_DOLLO_PRESENT_GOID_D = "_dollo_present_goid_d"; + public final static String PARSIMONY_OUTPUT_DOLLO_PRESENT_HTML_D = "_dollo_present_d.html"; + public final static String DOMAINS_PRESENT_NEXUS = "_dom.nex"; + public final static String BDC_PRESENT_NEXUS = "_dc.nex"; + // --- + public final static String PRG_NAME = "surfacing"; + public static final String DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO = "_d_dollo" + + ForesterConstants.PHYLO_XML_SUFFIX; + public static final String DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH = "_d_fitch" + + ForesterConstants.PHYLO_XML_SUFFIX; + public static final String BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO = "_dc_dollo" + + ForesterConstants.PHYLO_XML_SUFFIX; + public static final String BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH = "_dc_fitch" + + ForesterConstants.PHYLO_XML_SUFFIX; + public static final String NEXUS_EXTERNAL_DOMAINS = "_dom.nex"; + public static final String NEXUS_EXTERNAL_DOMAIN_COMBINATIONS = "_dc.nex"; + public static final String NEXUS_SECONDARY_FEATURES = "_secondary_features.nex"; + public static final String PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_SECONDARY_FEATURES = "_dollo_gl_secondary_features"; + public static final String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_SECONDARY_FEATURES = "_dollo_glc_secondary_features"; + public static final String PARSIMONY_OUTPUT_DOLLO_GAINS_SECONDARY_FEATURES = "_dollo_gains_secondary_features"; + public static final String PARSIMONY_OUTPUT_DOLLO_LOSSES_SECONDARY_FEATURES = "_dollo_losses_secondary_features"; + public static final String PARSIMONY_OUTPUT_DOLLO_PRESENT_SECONDARY_FEATURES = 
"_dollo_present_secondary_features"; + public static final String SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO = "_secondary_features_dollo" + + ForesterConstants.PHYLO_XML_SUFFIX; + public static final String PARSIMONY_OUTPUT_DOLLO_ALL_GOID_D_BIOLOGICAL_PROCESS = "_dollo_biol_proc_goid_d"; + public static final String PARSIMONY_OUTPUT_DOLLO_ALL_GOID_D_CELLULAR_COMPONENT = "_dollo_cell_comp_goid_d"; + public static final String PARSIMONY_OUTPUT_DOLLO_ALL_GOID_D_MOLECULAR_FUNCTION = "_dollo_mol_funct_goid_d"; + public static final String PARSIMONY_OUTPUT_DOLLO_ALL_GOID_D_ALL_NAMESPACES = "_dollo_goid_d"; + public static final String PARSIMONY_OUTPUT_FITCH_ALL_GOID_BC_BIOLOGICAL_PROCESS = "_fitch_biol_proc_goid_dc"; + public static final String PARSIMONY_OUTPUT_FITCH_ALL_GOID_BC_CELLULAR_COMPONENT = "_fitch_cell_comp_goid_dc"; + public static final String PARSIMONY_OUTPUT_FITCH_ALL_GOID_BC_MOLECULAR_FUNCTION = "_fitch_mol_funct_goid_dc"; + public static final String PARSIMONY_OUTPUT_FITCH_ALL_GOID_BC_ALL_NAMESPACES = "_fitch_goid_dc"; + final static private String HELP_OPTION_1 = "help"; + final static private String HELP_OPTION_2 = "h"; + final static private String OUTPUT_DIR_OPTION = "out_dir"; + final static private String SCORING_OPTION = "scoring"; + private static final DomainSimilarityScoring SCORING_DEFAULT = DomainSimilarity.DomainSimilarityScoring.COMBINATIONS; + final static private String SCORING_DOMAIN_COUNT_BASED = "domains"; + final static private String SCORING_PROTEIN_COUNT_BASED = "proteins"; + final static private String SCORING_COMBINATION_BASED = "combinations"; + final static private String DETAILEDNESS_OPTION = "detail"; + private final static Detailedness DETAILEDNESS_DEFAULT = DomainSimilarityCalculator.Detailedness.PUNCTILIOUS; + final static private String SPECIES_MATRIX_OPTION = "smatrix"; + final static private String DETAILEDNESS_BASIC = "basic"; + final static private String DETAILEDNESS_LIST_IDS = "list_ids"; + final static 
private String DETAILEDNESS_PUNCTILIOUS = "punctilious"; + final static private String DOMAIN_SIMILARITY_SORT_OPTION = "sort"; + private static final DomainSimilaritySortField DOMAIN_SORT_FILD_DEFAULT = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + final static private String DOMAIN_SIMILARITY_SORT_MIN = "min"; + final static private String DOMAIN_SIMILARITY_SORT_MAX = "max"; + final static private String DOMAIN_SIMILARITY_SORT_SD = "sd"; + final static private String DOMAIN_SIMILARITY_SORT_MEAN = "mean"; + final static private String DOMAIN_SIMILARITY_SORT_DIFF = "diff"; + final static private String DOMAIN_SIMILARITY_SORT_COUNTS_DIFF = "count_diff"; + final static private String DOMAIN_SIMILARITY_SORT_ABS_COUNTS_DIFF = "abs_count_diff"; + final static private String DOMAIN_SIMILARITY_SORT_SPECIES_COUNT = "species"; + final static private String DOMAIN_SIMILARITY_SORT_ALPHA = "alpha"; + final static private String DOMAIN_SIMILARITY_SORT_BY_SPECIES_COUNT_FIRST_OPTION = "species_first"; + final static private String DOMAIN_COUNT_SORT_OPTION = "dc_sort"; + private static final GenomeWideCombinableDomainsSortOrder DOMAINS_SORT_ORDER_DEFAULT = GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder.ALPHABETICAL_KEY_ID; + final static private String DOMAIN_COUNT_SORT_ALPHA = "alpha"; + final static private String DOMAIN_COUNT_SORT_KEY_DOMAIN_COUNT = "dom"; + final static private String DOMAIN_COUNT_SORT_KEY_DOMAIN_PROTEINS_COUNT = "prot"; + final static private String DOMAIN_COUNT_SORT_COMBINATIONS_COUNT = "comb"; + final static private String CUTOFF_SCORE_FILE_OPTION = "cos"; + final static private String NOT_IGNORE_DUFS_OPTION = "dufs"; + final static private String MAX_E_VALUE_OPTION = "e"; + final static private String MAX_ALLOWED_OVERLAP_OPTION = "mo"; + final static private String NO_ENGULFING_OVERLAP_OPTION = "no_eo"; + final static private String IGNORE_COMBINATION_WITH_SAME_OPTION = "ignore_self_comb"; + final static private String 
PAIRWISE_DOMAIN_COMPARISONS_PREFIX = "pwc_"; + final static private String PAIRWISE_DOMAIN_COMPARISONS_OPTION = "pwc"; + final static private String OUTPUT_FILE_OPTION = "o"; + final static private String PFAM_TO_GO_FILE_USE_OPTION = "p2g"; + final static private String GO_OBO_FILE_USE_OPTION = "obo"; + final static private String GO_NAMESPACE_LIMIT_OPTION = "go_namespace"; + final static private String GO_NAMESPACE_LIMIT_OPTION_MOLECULAR_FUNCTION = "molecular_function"; + final static private String GO_NAMESPACE_LIMIT_OPTION_BIOLOGICAL_PROCESS = "biological_process"; + final static private String GO_NAMESPACE_LIMIT_OPTION_CELLULAR_COMPONENT = "cellular_component"; + final static private String SECONDARY_FEATURES_PARSIMONY_MAP_FILE = "secondary"; + final static private String DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_TAB_DELIMITED = "simple_tab"; + final static private String DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_HTML = "simple_html"; + final static private String DOMAIN_SIMILARITY_PRINT_OPTION_DETAILED_HTML = "detailed_html"; + final static private String DOMAIN_SIMILARITY_PRINT_OPTION = "ds_output"; + private static final PRINT_OPTION DOMAIN_SIMILARITY_PRINT_OPTION_DEFAULT = PrintableDomainSimilarity.PRINT_OPTION.HTML; + final static private String IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_OPTION = "ignore_singlet_domains"; + final static private String IGNORE_VIRAL_IDS = "ignore_viral_ids"; + final static private boolean IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_DEFAULT = false; + final static private String IGNORE_DOMAINS_SPECIFIC_TO_ONE_SPECIES_OPTION = "ignore_species_specific_domains"; + final static private boolean IGNORE_DOMAINS_SPECIFIC_TO_ONE_SPECIES_OPTION_DEFAULT = false; + final static private String MATRIX_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX = "_mean_score.pwd"; + final static private String MATRIX_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX = "_domains.pwd"; + final static private String 
MATRIX_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX = "_bin_combinations.pwd"; + final static private String NJ_TREE_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX = "_mean_score_NJ" + + ForesterConstants.PHYLO_XML_SUFFIX; + final static private String NJ_TREE_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX = "_domains_NJ" + + ForesterConstants.PHYLO_XML_SUFFIX; + final static private String NJ_TREE_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX = "_bin_combinations_NJ" + + ForesterConstants.PHYLO_XML_SUFFIX; + final static private String DISPLAY_M_HISTOGRAMS_OPTION = "mhisto"; + // final static private boolean DISPLAY_M_HISTOGRAMS_OPTION_DEFAULT = false; + final static private String JACKNIFE_OPTION = "jack"; + final static private String JACKNIFE_RANDOM_SEED_OPTION = "seed"; + final static private String JACKNIFE_RATIO_OPTION = "jack_ratio"; + private static final int JACKNIFE_NUMBER_OF_RESAMPLINGS_DEFAULT = 100; + final static private long JACKNIFE_RANDOM_SEED_DEFAULT = 19; + final static private double JACKNIFE_RATIO_DEFAULT = 0.5; + //final static private String INFER_SPECIES_TREES_OPTION = "species_tree_inference"; + final static private String INFERRED_SD_BASED_NJ_SPECIES_TREE_SUFFIX = "_sd_nj.nh"; + final static private String INFERRED_SBC_BASED_NJ_SPECIES_TREE_SUFFIX = "_sbc_nj.nh"; + final static private String FILTER_POSITIVE_OPTION = "pos_filter"; + final static private String FILTER_NEGATIVE_OPTION = "neg_filter"; + final static private String FILTER_NEGATIVE_DOMAINS_OPTION = "neg_dom_filter"; + final static private String INPUT_FILES_FROM_FILE_OPTION = "input"; + final static private String INPUT_SPECIES_TREE_OPTION = "species_tree"; + final static private String SEQ_EXTRACT_OPTION = "prot_extract"; + final static private char SEPARATOR_FOR_INPUT_VALUES = '#'; + final static private String PRG_VERSION = "1.00"; + final static private String PRG_DATE = "2009.07.06"; + final static private String E_MAIL = "czmasek@burnham.org"; + final static private 
String WWW = "www.phylosoft.org/forester/applications/surfacing"; + final static private boolean IGNORE_DUFS_DEFAULT = true; + final static private boolean IGNORE_COMBINATION_WITH_SAME_DEFAULLT = false; + final static private double MAX_E_VALUE_DEFAULT = -1; + final static private int MAX_ALLOWED_OVERLAP_DEFAULT = -1; + final static private String DEFAULT_SEARCH_PARAMETER = "ls"; + final private static boolean ALLOW_NON_UNIQUE_QUERY_IN_HMMPFAM_OUTPUT_DEFAULT = true; + final private static boolean VERBOSE_DEFAULT = true; + private static final String RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION = "random_seed"; + private static final String CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS = "consider_bdc_direction"; + private static final String CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS_AND_ADJACENCY = "consider_bdc_adj"; + private static final String SEQ_EXTRACT_SUFFIX = ".prot"; + private static final String PLUS_MINUS_ANALYSIS_OPTION = "plus_minus"; + private static final String PLUS_MINUS_DOM_SUFFIX = "_plus_minus_dom.txt"; + private static final String PLUS_MINUS_DOM_SUFFIX_HTML = "_plus_minus_dom.html"; + private static final String PLUS_MINUS_DC_SUFFIX_HTML = "_plus_minus_dc.html"; + private static final int PLUS_MINUS_ANALYSIS_MIN_DIFF_DEFAULT = 0; + private static final double PLUS_MINUS_ANALYSIS_FACTOR_DEFAULT = 1.0; + private static final String PLUS_MINUS_ALL_GO_IDS_DOM_SUFFIX = "_plus_minus_go_ids_all.txt"; + private static final String PLUS_MINUS_PASSING_GO_IDS_DOM_SUFFIX = "_plus_minus_go_ids_passing.txt"; + private static final String OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS = "all_prot"; + private static final boolean VERBOSE = false; + private static final String OUTPUT_DOMAIN_COMBINATIONS_GAINED_MORE_THAN_ONCE_ANALYSIS_SUFFIX = "_fitch_dc_gains_counts"; + private static final String OUTPUT_DOMAIN_COMBINATIONS_LOST_MORE_THAN_ONCE_ANALYSIS_SUFFIX = "_fitch_dc_losses_counts"; + private static final String DOMAIN_LENGTHS_ANALYSIS_SUFFIX = "_domain_lengths_analysis"; + 
private static final boolean PERFORM_DOMAIN_LENGTH_ANALYSIS = true; + public static final String ALL_PFAMS_ENCOUNTERED_SUFFIX = "_all_encountered_pfams"; + public static final String ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX = "_all_encountered_pfams_with_go_annotation"; + public static final String ENCOUNTERED_PFAMS_SUMMARY_SUFFIX = "_encountered_pfams_summary"; + public static final String ALL_PFAMS_GAINED_AS_DOMAINS_SUFFIX = "_all_pfams_gained_as_domains"; + public static final String ALL_PFAMS_LOST_AS_DOMAINS_SUFFIX = "_all_pfams_lost_as_domains"; + public static final String ALL_PFAMS_GAINED_AS_DC_SUFFIX = "_all_pfams_gained_as_dc"; + public static final String ALL_PFAMS_LOST_AS_DC_SUFFIX = "_all_pfams_lost_as_dc"; + public static final String BASE_DIRECTORY_PER_NODE_DOMAIN_GAIN_LOSS_FILES = "PER_NODE_EVENTS"; + public static final String BASE_DIRECTORY_PER_SUBTREE_DOMAIN_GAIN_LOSS_FILES = "PER_SUBTREE_EVENTS"; + public static final String D_PROMISCUITY_FILE_SUFFIX = "_domain_promiscuities"; + + // final String error = ForesterUtil.isReadableFile( new File( + // input_file_properties[ i ][ 0 ] ) ); + // if ( !ForesterUtil.isEmpty( error ) ) { + // ForesterUtil.fatalError( surfacing.PRG_NAME, error ); + // } + private static void checkWriteabilityForPairwiseComparisons( final PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option, + final String[][] input_file_properties, + final String automated_pairwise_comparison_suffix, + final File outdir ) { + for( int i = 0; i < input_file_properties.length; ++i ) { + for( int j = 0; j < i; ++j ) { + final String species_i = input_file_properties[ i ][ 1 ]; + final String species_j = input_file_properties[ j ][ 1 ]; + String pairwise_similarities_output_file_str = PAIRWISE_DOMAIN_COMPARISONS_PREFIX + species_i + "_" + + species_j + automated_pairwise_comparison_suffix; + switch ( domain_similarity_print_option ) { + case HTML: + if ( !pairwise_similarities_output_file_str.endsWith( ".html" ) ) { + 
pairwise_similarities_output_file_str += ".html"; + } + break; + } + final String error = ForesterUtil + .isWritableFile( new File( outdir == null ? pairwise_similarities_output_file_str : outdir + + ForesterUtil.FILE_SEPARATOR + pairwise_similarities_output_file_str ) ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, error ); + } + } + } + } + + private static StringBuilder createParametersAsString( final boolean ignore_dufs, + final double e_value_max, + final int max_allowed_overlap, + final boolean no_engulfing_overlaps, + final File cutoff_scores_file, + final BinaryDomainCombination.DomainCombinationType dc_type ) { + final StringBuilder parameters_sb = new StringBuilder(); + parameters_sb.append( "E-value: " + e_value_max ); + if ( cutoff_scores_file != null ) { + parameters_sb.append( ", Cutoff-scores-file: " + cutoff_scores_file ); + } + else { + parameters_sb.append( ", Cutoff-scores-file: not-set" ); + } + if ( max_allowed_overlap != surfacing_hmmpfam.MAX_ALLOWED_OVERLAP_DEFAULT ) { + parameters_sb.append( ", Max-overlap: " + max_allowed_overlap ); + } + else { + parameters_sb.append( ", Max-overlap: not-set" ); + } + if ( no_engulfing_overlaps ) { + parameters_sb.append( ", Engulfing-overlaps: not-allowed" ); + } + else { + parameters_sb.append( ", Engulfing-overlaps: allowed" ); + } + if ( ignore_dufs ) { + parameters_sb.append( ", Ignore-dufs: true" ); + } + else { + parameters_sb.append( ", Ignore-dufs: false" ); + } + parameters_sb.append( ", DC type (if applicable): " + dc_type ); + return parameters_sb; + } + + /** + * Warning: This sideeffects 'all_bin_domain_combinations_encountered'! 
+ * + * + * @param output_file + * @param all_bin_domain_combinations_changed + * @param sum_of_all_domains_encountered + * @param all_bin_domain_combinations_encountered + * @param is_gains_analysis + * @throws IOException + */ + private static void executeFitchGainsAnalysis( final File output_file, + final List all_bin_domain_combinations_changed, + final int sum_of_all_domains_encountered, + final SortedSet all_bin_domain_combinations_encountered, + final boolean is_gains_analysis ) throws IOException { + SurfacingUtil.checkForOutputFileWriteability( output_file ); + final Writer out = ForesterUtil.createBufferedWriter( output_file ); + final SortedMap bdc_to_counts = ForesterUtil + .listToSortedCountsMap( all_bin_domain_combinations_changed ); + final SortedSet all_domains_in_combination_changed_more_than_once = new TreeSet(); + final SortedSet all_domains_in_combination_changed_only_once = new TreeSet(); + int above_one = 0; + int one = 0; + for( final Object bdc_object : bdc_to_counts.keySet() ) { + final BinaryDomainCombination bdc = ( BinaryDomainCombination ) bdc_object; + final int count = bdc_to_counts.get( bdc_object ); + if ( count < 1 ) { + ForesterUtil.unexpectedFatalError( PRG_NAME, "count < 1 " ); + } + out.write( bdc + "\t" + count + ForesterUtil.LINE_SEPARATOR ); + if ( count > 1 ) { + all_domains_in_combination_changed_more_than_once.add( bdc.getId0() ); + all_domains_in_combination_changed_more_than_once.add( bdc.getId1() ); + above_one++; + } + else if ( count == 1 ) { + all_domains_in_combination_changed_only_once.add( bdc.getId0() ); + all_domains_in_combination_changed_only_once.add( bdc.getId1() ); + one++; + } + } + final int all = all_bin_domain_combinations_encountered.size(); + int never_lost = -1; + if ( !is_gains_analysis ) { + all_bin_domain_combinations_encountered.removeAll( all_bin_domain_combinations_changed ); + never_lost = all_bin_domain_combinations_encountered.size(); + for( final BinaryDomainCombination bdc : 
all_bin_domain_combinations_encountered ) { + out.write( bdc + "\t" + "0" + ForesterUtil.LINE_SEPARATOR ); + } + } + if ( is_gains_analysis ) { + out.write( "Sum of all distinct domain combinations appearing once : " + one + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domain combinations appearing more than once : " + above_one + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domains in combinations apppearing only once : " + + all_domains_in_combination_changed_only_once.size() + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domains in combinations apppearing more than once: " + + all_domains_in_combination_changed_more_than_once.size() + ForesterUtil.LINE_SEPARATOR ); + } + else { + out.write( "Sum of all distinct domain combinations never lost : " + never_lost + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domain combinations lost once : " + one + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domain combinations lost more than once : " + above_one + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domains in combinations lost only once : " + + all_domains_in_combination_changed_only_once.size() + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domains in combinations lost more than once: " + + all_domains_in_combination_changed_more_than_once.size() + ForesterUtil.LINE_SEPARATOR ); + } + out.write( "All binary combinations : " + all + + ForesterUtil.LINE_SEPARATOR ); + out.write( "All domains : " + + sum_of_all_domains_encountered ); + out.close(); + ForesterUtil.programMessage( surfacing_hmmpfam.PRG_NAME, + "Wrote fitch domain combination dynamics counts analysis to \"" + output_file + + "\"" ); + } + + private static void executePlusMinusAnalysis( final File output_file, + final List plus_minus_analysis_high_copy_base, + final List plus_minus_analysis_high_copy_target, + final List plus_minus_analysis_low_copy, + final 
List gwcd_list, + final SortedMap> protein_lists_per_species, + final Map> domain_id_to_go_ids_map, + final Map go_id_to_term_map, + final List plus_minus_analysis_numbers ) { + final Set all_spec = new HashSet(); + for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { + all_spec.add( gwcd.getSpecies().getSpeciesId() ); + } + final File html_out_dom = new File( output_file + PLUS_MINUS_DOM_SUFFIX_HTML ); + final File plain_out_dom = new File( output_file + PLUS_MINUS_DOM_SUFFIX ); + final File html_out_dc = new File( output_file + PLUS_MINUS_DC_SUFFIX_HTML ); + final File all_domains_go_ids_out_dom = new File( output_file + PLUS_MINUS_ALL_GO_IDS_DOM_SUFFIX ); + final File passing_domains_go_ids_out_dom = new File( output_file + PLUS_MINUS_PASSING_GO_IDS_DOM_SUFFIX ); + final File proteins_file_base = new File( output_file + "" ); + final int min_diff = ( ( Integer ) plus_minus_analysis_numbers.get( 0 ) ).intValue(); + final double factor = ( ( Double ) plus_minus_analysis_numbers.get( 1 ) ).doubleValue(); + try { + DomainCountsDifferenceUtil.calculateCopyNumberDifferences( gwcd_list, + protein_lists_per_species, + plus_minus_analysis_high_copy_base, + plus_minus_analysis_high_copy_target, + plus_minus_analysis_low_copy, + min_diff, + factor, + plain_out_dom, + html_out_dom, + html_out_dc, + domain_id_to_go_ids_map, + go_id_to_term_map, + all_domains_go_ids_out_dom, + passing_domains_go_ids_out_dom, + proteins_file_base ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, e.getLocalizedMessage() ); + } + ForesterUtil.programMessage( surfacing_hmmpfam.PRG_NAME, "Wrote plus minus domain analysis results to \"" + + html_out_dom + "\"" ); + ForesterUtil.programMessage( surfacing_hmmpfam.PRG_NAME, "Wrote plus minus domain analysis results to \"" + + plain_out_dom + "\"" ); + ForesterUtil.programMessage( surfacing_hmmpfam.PRG_NAME, "Wrote plus minus domain analysis results to \"" + + html_out_dc + "\"" ); + 
ForesterUtil.programMessage( surfacing_hmmpfam.PRG_NAME, + "Wrote plus minus domain analysis based passing GO ids to \"" + + passing_domains_go_ids_out_dom + "\"" ); + ForesterUtil.programMessage( surfacing_hmmpfam.PRG_NAME, + "Wrote plus minus domain analysis based all GO ids to \"" + + all_domains_go_ids_out_dom + "\"" ); + } + + private static Phylogeny[] getIntrees( final File[] intree_files, + final int number_of_genomes, + final String[][] input_file_properties ) { + final Phylogeny[] intrees = new Phylogeny[ intree_files.length ]; + int i = 0; + for( final File intree_file : intree_files ) { + Phylogeny intree = null; + final String error = ForesterUtil.isReadableFile( intree_file ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "cannot read input tree file [" + intree_file + + "]: " + error ); + } + try { + final Phylogeny[] p_array = ParserBasedPhylogenyFactory.getInstance() + .create( intree_file, ForesterUtil.createParserDependingOnFileType( intree_file, true ) ); + if ( p_array.length < 1 ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "file [" + intree_file + + "] does not contain any phylogeny in phyloXML format" ); + } + else if ( p_array.length > 1 ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "file [" + intree_file + + "] contains more than one phylogeny in phyloXML format" ); + } + intree = p_array[ 0 ]; + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "failed to read input tree from file [" + + intree_file + "]: " + error ); + } + if ( ( intree == null ) || intree.isEmpty() ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "input tree [" + intree_file + "] is empty" ); + } + if ( !intree.isRooted() ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "input tree [" + intree_file + "] is not rooted" ); + } + if ( intree.getNumberOfExternalNodes() < number_of_genomes ) { + ForesterUtil.fatalError( 
surfacing_hmmpfam.PRG_NAME, "number of external nodes [" + + intree.getNumberOfExternalNodes() + "] of input tree [" + intree_file + + "] is smaller than the number of genomes the be analyzed [" + number_of_genomes + "]" ); + } + final StringBuilder parent_names = new StringBuilder(); + final int nodes_lacking_name = SurfacingUtil.getNumberOfNodesLackingName( intree, parent_names ); + if ( nodes_lacking_name > 0 ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "input tree [" + intree_file + "] has " + + nodes_lacking_name + " node(s) lacking a name [parent names:" + parent_names + "]" ); + } + preparePhylogenyForParsimonyAnalyses( intree, input_file_properties ); + if ( !intree.isCompletelyBinary() ) { + ForesterUtil.printWarningMessage( surfacing_hmmpfam.PRG_NAME, "input tree [" + intree_file + + "] is not completely binary" ); + } + intrees[ i++ ] = intree; + } + return intrees; + } + + private static List inferSpeciesTrees( final File outfile, final List distances_list ) { + final NeighborJoining nj = NeighborJoining.createInstance(); + final List phylogenies = nj.execute( distances_list ); + final PhylogenyWriter w = new PhylogenyWriter(); + try { + w.toNewHampshire( phylogenies, true, true, outfile, ";" ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( PRG_NAME, "failed to write to outfile [" + outfile + "]: " + e.getMessage() ); + } + return phylogenies; + } + + public static void main( final String args[] ) { + final long start_time = new Date().getTime(); + // final StringBuffer log = new StringBuffer(); + final StringBuilder html_desc = new StringBuilder(); + ForesterUtil.printProgramInformation( surfacing_hmmpfam.PRG_NAME, + surfacing_hmmpfam.PRG_VERSION, + surfacing_hmmpfam.PRG_DATE, + surfacing_hmmpfam.E_MAIL, + surfacing_hmmpfam.WWW ); + final String nl = ForesterUtil.LINE_SEPARATOR; + html_desc.append( "" + nl ); + html_desc.append( "" + nl ); + html_desc.append( "" + nl ); + html_desc.append( "" + nl ); + html_desc.append( 
"" + nl ); + html_desc.append( "" + nl ); + CommandLineArguments cla = null; + try { + cla = new CommandLineArguments( args ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, e.getMessage() ); + } + if ( cla.isOptionSet( surfacing_hmmpfam.HELP_OPTION_1 ) || cla.isOptionSet( surfacing_hmmpfam.HELP_OPTION_2 ) ) { + surfacing_hmmpfam.printHelp(); + System.exit( 0 ); + } + if ( ( args.length < 1 ) ) { + surfacing_hmmpfam.printHelp(); + System.exit( -1 ); + } + final List allowed_options = new ArrayList(); + allowed_options.add( surfacing_hmmpfam.NOT_IGNORE_DUFS_OPTION ); + allowed_options.add( surfacing_hmmpfam.MAX_E_VALUE_OPTION ); + allowed_options.add( surfacing_hmmpfam.DETAILEDNESS_OPTION ); + allowed_options.add( surfacing_hmmpfam.OUTPUT_FILE_OPTION ); + allowed_options.add( surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_OPTION ); + allowed_options.add( surfacing_hmmpfam.SPECIES_MATRIX_OPTION ); + allowed_options.add( surfacing_hmmpfam.SCORING_OPTION ); + allowed_options.add( surfacing_hmmpfam.MAX_ALLOWED_OVERLAP_OPTION ); + allowed_options.add( surfacing_hmmpfam.NO_ENGULFING_OVERLAP_OPTION ); + allowed_options.add( surfacing_hmmpfam.DOMAIN_COUNT_SORT_OPTION ); + allowed_options.add( surfacing_hmmpfam.CUTOFF_SCORE_FILE_OPTION ); + allowed_options.add( surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_BY_SPECIES_COUNT_FIRST_OPTION ); + allowed_options.add( surfacing_hmmpfam.OUTPUT_DIR_OPTION ); + allowed_options.add( surfacing_hmmpfam.IGNORE_COMBINATION_WITH_SAME_OPTION ); + allowed_options.add( surfacing_hmmpfam.PFAM_TO_GO_FILE_USE_OPTION ); + allowed_options.add( surfacing_hmmpfam.GO_OBO_FILE_USE_OPTION ); + allowed_options.add( surfacing_hmmpfam.DOMAIN_SIMILARITY_PRINT_OPTION ); + allowed_options.add( surfacing_hmmpfam.GO_NAMESPACE_LIMIT_OPTION ); + allowed_options.add( surfacing_hmmpfam.PAIRWISE_DOMAIN_COMPARISONS_OPTION ); + allowed_options.add( surfacing_hmmpfam.IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_OPTION ); + 
allowed_options.add( surfacing_hmmpfam.DISPLAY_M_HISTOGRAMS_OPTION ); + allowed_options.add( surfacing_hmmpfam.CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS ); + allowed_options.add( JACKNIFE_OPTION ); + allowed_options.add( JACKNIFE_RANDOM_SEED_OPTION ); + allowed_options.add( JACKNIFE_RATIO_OPTION ); + allowed_options.add( INPUT_SPECIES_TREE_OPTION ); + //allowed_options.add( INFER_SPECIES_TREES_OPTION ); + allowed_options.add( FILTER_POSITIVE_OPTION ); + allowed_options.add( FILTER_NEGATIVE_OPTION ); + allowed_options.add( INPUT_FILES_FROM_FILE_OPTION ); + allowed_options.add( RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION ); + allowed_options.add( FILTER_NEGATIVE_DOMAINS_OPTION ); + allowed_options.add( IGNORE_VIRAL_IDS ); + allowed_options.add( SEQ_EXTRACT_OPTION ); + allowed_options.add( SECONDARY_FEATURES_PARSIMONY_MAP_FILE ); + allowed_options.add( PLUS_MINUS_ANALYSIS_OPTION ); + allowed_options.add( DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS ); + allowed_options.add( OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS ); + allowed_options.add( CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS_AND_ADJACENCY ); + boolean ignore_dufs = surfacing_hmmpfam.IGNORE_DUFS_DEFAULT; + boolean ignore_combination_with_same = surfacing_hmmpfam.IGNORE_COMBINATION_WITH_SAME_DEFAULLT; + double e_value_max = surfacing_hmmpfam.MAX_E_VALUE_DEFAULT; + int max_allowed_overlap = surfacing_hmmpfam.MAX_ALLOWED_OVERLAP_DEFAULT; + final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); + if ( dissallowed_options.length() > 0 ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "unknown option(s): " + dissallowed_options ); + } + boolean output_binary_domain_combinationsfor_graph_analysis = false; + if ( cla.isOptionSet( DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS ) ) { + output_binary_domain_combinationsfor_graph_analysis = true; + } + if ( cla.isOptionSet( surfacing_hmmpfam.MAX_E_VALUE_OPTION ) ) { + try { + e_value_max = cla.getOptionValueAsDouble( 
surfacing_hmmpfam.MAX_E_VALUE_OPTION ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "no acceptable value for E-value maximum" ); + } + } + if ( cla.isOptionSet( surfacing_hmmpfam.MAX_ALLOWED_OVERLAP_OPTION ) ) { + try { + max_allowed_overlap = cla.getOptionValueAsInt( surfacing_hmmpfam.MAX_ALLOWED_OVERLAP_OPTION ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, + "no acceptable value for maximal allowed domain overlap" ); + } + } + boolean no_engulfing_overlaps = false; + if ( cla.isOptionSet( surfacing_hmmpfam.NO_ENGULFING_OVERLAP_OPTION ) ) { + no_engulfing_overlaps = true; + } + boolean ignore_virus_like_ids = false; + if ( cla.isOptionSet( surfacing_hmmpfam.IGNORE_VIRAL_IDS ) ) { + ignore_virus_like_ids = true; + } + if ( cla.isOptionSet( surfacing_hmmpfam.NOT_IGNORE_DUFS_OPTION ) ) { + ignore_dufs = false; + } + if ( cla.isOptionSet( surfacing_hmmpfam.IGNORE_COMBINATION_WITH_SAME_OPTION ) ) { + ignore_combination_with_same = true; + } + boolean ignore_domains_without_combs_in_all_spec = IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_DEFAULT; + if ( cla.isOptionSet( surfacing_hmmpfam.IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_OPTION ) ) { + ignore_domains_without_combs_in_all_spec = true; + } + boolean ignore_species_specific_domains = IGNORE_DOMAINS_SPECIFIC_TO_ONE_SPECIES_OPTION_DEFAULT; + if ( cla.isOptionSet( surfacing_hmmpfam.IGNORE_DOMAINS_SPECIFIC_TO_ONE_SPECIES_OPTION ) ) { + ignore_species_specific_domains = true; + } + File output_file = null; + if ( cla.isOptionSet( surfacing_hmmpfam.OUTPUT_FILE_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_hmmpfam.OUTPUT_FILE_OPTION ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, + "no value for domain combinations similarities output file: -" + + surfacing_hmmpfam.OUTPUT_FILE_OPTION + "=" ); + } + output_file = new File( cla.getOptionValue( surfacing_hmmpfam.OUTPUT_FILE_OPTION ) ); + 
SurfacingUtil.checkForOutputFileWriteability( output_file ); + } + File cutoff_scores_file = null; + Map individual_domain_score_cutoffs = null; + if ( cla.isOptionSet( surfacing_hmmpfam.CUTOFF_SCORE_FILE_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_hmmpfam.CUTOFF_SCORE_FILE_OPTION ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, + "no value for individual domain score cutoffs file: -" + + surfacing_hmmpfam.CUTOFF_SCORE_FILE_OPTION + "=" ); + } + cutoff_scores_file = new File( cla.getOptionValue( surfacing_hmmpfam.CUTOFF_SCORE_FILE_OPTION ) ); + final String error = ForesterUtil.isReadableFile( cutoff_scores_file ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, + "cannot read individual domain score cutoffs file: " + error ); + } + try { + final BasicTable scores_table = BasicTableParser.parse( cutoff_scores_file, " " ); + individual_domain_score_cutoffs = scores_table.getColumnsAsMap( 0, 1 ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, + "cannot read from individual domain score cutoffs file: " + e ); + } + } + BinaryDomainCombination.DomainCombinationType dc_type = BinaryDomainCombination.DomainCombinationType.BASIC; + if ( cla.isOptionSet( surfacing_hmmpfam.CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS ) ) { + dc_type = BinaryDomainCombination.DomainCombinationType.DIRECTED; + } + if ( cla.isOptionSet( surfacing_hmmpfam.CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS_AND_ADJACENCY ) ) { + dc_type = BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT; + } + File out_dir = null; + if ( cla.isOptionSet( surfacing_hmmpfam.OUTPUT_DIR_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_hmmpfam.OUTPUT_DIR_OPTION ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "no value for output directory: -" + + surfacing_hmmpfam.OUTPUT_DIR_OPTION + "=" ); + } + out_dir = new File( cla.getOptionValue( surfacing_hmmpfam.OUTPUT_DIR_OPTION ) ); + if ( 
out_dir.exists() && ( out_dir.listFiles().length > 0 ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "\"" + out_dir + + "\" aready exists and is not empty" ); + } + if ( !out_dir.exists() ) { + final boolean success = out_dir.mkdir(); + if ( !success || !out_dir.exists() ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "failed to create \"" + out_dir + "\"" ); + } + } + if ( !out_dir.canWrite() ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "cannot write to \"" + out_dir + "\"" ); + } + } + File positive_filter_file = null; + File negative_filter_file = null; + File negative_domains_filter_file = null; + if ( cla.isOptionSet( surfacing_hmmpfam.FILTER_NEGATIVE_OPTION ) + && cla.isOptionSet( surfacing_hmmpfam.FILTER_POSITIVE_OPTION ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, + "attempt to use both negative and positive protein filter" ); + } + if ( cla.isOptionSet( surfacing_hmmpfam.FILTER_NEGATIVE_DOMAINS_OPTION ) + && ( cla.isOptionSet( surfacing_hmmpfam.FILTER_NEGATIVE_OPTION ) || cla + .isOptionSet( surfacing_hmmpfam.FILTER_POSITIVE_OPTION ) ) ) { + ForesterUtil + .fatalError( surfacing_hmmpfam.PRG_NAME, + "attempt to use both negative or positive protein filter together wirh a negative domains filter" ); + } + if ( cla.isOptionSet( surfacing_hmmpfam.FILTER_NEGATIVE_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_hmmpfam.FILTER_NEGATIVE_OPTION ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "no value for negative filter: -" + + surfacing_hmmpfam.FILTER_NEGATIVE_OPTION + "=" ); + } + negative_filter_file = new File( cla.getOptionValue( surfacing_hmmpfam.FILTER_NEGATIVE_OPTION ) ); + final String msg = ForesterUtil.isReadableFile( negative_filter_file ); + if ( !ForesterUtil.isEmpty( msg ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "can not read from \"" + negative_filter_file + + "\": " + msg ); + } + } + else if ( cla.isOptionSet( surfacing_hmmpfam.FILTER_POSITIVE_OPTION ) ) { + 
if ( !cla.isOptionValueSet( surfacing_hmmpfam.FILTER_POSITIVE_OPTION ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "no value for positive filter: -" + + surfacing_hmmpfam.FILTER_POSITIVE_OPTION + "=" ); + } + positive_filter_file = new File( cla.getOptionValue( surfacing_hmmpfam.FILTER_POSITIVE_OPTION ) ); + final String msg = ForesterUtil.isReadableFile( positive_filter_file ); + if ( !ForesterUtil.isEmpty( msg ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "can not read from \"" + positive_filter_file + + "\": " + msg ); + } + } + else if ( cla.isOptionSet( surfacing_hmmpfam.FILTER_NEGATIVE_DOMAINS_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_hmmpfam.FILTER_NEGATIVE_DOMAINS_OPTION ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "no value for negative domains filter: -" + + surfacing_hmmpfam.FILTER_NEGATIVE_DOMAINS_OPTION + "=" ); + } + negative_domains_filter_file = new File( cla + .getOptionValue( surfacing_hmmpfam.FILTER_NEGATIVE_DOMAINS_OPTION ) ); + final String msg = ForesterUtil.isReadableFile( negative_domains_filter_file ); + if ( !ForesterUtil.isEmpty( msg ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "can not read from \"" + + negative_domains_filter_file + "\": " + msg ); + } + } + final List plus_minus_analysis_high_copy_base_species = new ArrayList(); + final List plus_minus_analysis_high_copy_target_species = new ArrayList(); + final List plus_minus_analysis_high_low_copy_species = new ArrayList(); + final List plus_minus_analysis_numbers = new ArrayList(); + processPlusMinusAnalysisOption( cla, + plus_minus_analysis_high_copy_base_species, + plus_minus_analysis_high_copy_target_species, + plus_minus_analysis_high_low_copy_species, + plus_minus_analysis_numbers ); + File input_files_file = null; + String[] input_file_names_from_file = null; + if ( cla.isOptionSet( surfacing_hmmpfam.INPUT_FILES_FROM_FILE_OPTION ) ) { + if ( !cla.isOptionValueSet( 
surfacing_hmmpfam.INPUT_FILES_FROM_FILE_OPTION ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "no value for input files file: -" + + surfacing_hmmpfam.INPUT_FILES_FROM_FILE_OPTION + "=" ); + } + input_files_file = new File( cla.getOptionValue( surfacing_hmmpfam.INPUT_FILES_FROM_FILE_OPTION ) ); + final String msg = ForesterUtil.isReadableFile( input_files_file ); + if ( !ForesterUtil.isEmpty( msg ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "can not read from \"" + input_files_file + "\": " + + msg ); + } + try { + input_file_names_from_file = ForesterUtil.file2array( input_files_file ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "failed to read from \"" + input_files_file + + "\": " + e ); + } + } + if ( ( cla.getNumberOfNames() < 1 ) + && ( ( input_file_names_from_file == null ) || ( input_file_names_from_file.length < 1 ) ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, + "No hmmpfam output file indicated is input: use comand line directly or " + + surfacing_hmmpfam.INPUT_FILES_FROM_FILE_OPTION + "=" ); + } + DomainSimilarity.DomainSimilarityScoring scoring = SCORING_DEFAULT; + if ( cla.isOptionSet( surfacing_hmmpfam.SCORING_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_hmmpfam.SCORING_OPTION ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, + "no value for scoring method for domain combinations similarity calculation: -" + + surfacing_hmmpfam.SCORING_OPTION + "=<" + + surfacing_hmmpfam.SCORING_DOMAIN_COUNT_BASED + "|" + + surfacing_hmmpfam.SCORING_PROTEIN_COUNT_BASED + "|" + + surfacing_hmmpfam.SCORING_COMBINATION_BASED + ">\"" ); + } + final String scoring_str = cla.getOptionValue( surfacing_hmmpfam.SCORING_OPTION ); + if ( scoring_str.equals( surfacing_hmmpfam.SCORING_DOMAIN_COUNT_BASED ) ) { + scoring = DomainSimilarity.DomainSimilarityScoring.DOMAINS; + } + else if ( scoring_str.equals( surfacing_hmmpfam.SCORING_COMBINATION_BASED ) ) { + scoring = 
DomainSimilarity.DomainSimilarityScoring.COMBINATIONS; + } + else if ( scoring_str.equals( surfacing_hmmpfam.SCORING_PROTEIN_COUNT_BASED ) ) { + scoring = DomainSimilarity.DomainSimilarityScoring.PROTEINS; + } + else { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "unknown value \"" + scoring_str + + "\" for scoring method for domain combinations similarity calculation: \"-" + + surfacing_hmmpfam.SCORING_OPTION + "=<" + surfacing_hmmpfam.SCORING_DOMAIN_COUNT_BASED + "|" + + surfacing_hmmpfam.SCORING_PROTEIN_COUNT_BASED + "|" + + surfacing_hmmpfam.SCORING_COMBINATION_BASED + ">\"" ); + } + } + boolean sort_by_species_count_first = false; + if ( cla.isOptionSet( surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_BY_SPECIES_COUNT_FIRST_OPTION ) ) { + sort_by_species_count_first = true; + } + boolean species_matrix = false; + if ( cla.isOptionSet( surfacing_hmmpfam.SPECIES_MATRIX_OPTION ) ) { + species_matrix = true; + } + boolean output_protein_lists_for_all_domains = false; + if ( cla.isOptionSet( surfacing_hmmpfam.OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS ) ) { + output_protein_lists_for_all_domains = true; + } + Detailedness detailedness = DETAILEDNESS_DEFAULT; + if ( cla.isOptionSet( surfacing_hmmpfam.DETAILEDNESS_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_hmmpfam.DETAILEDNESS_OPTION ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "no value for -" + + surfacing_hmmpfam.DETAILEDNESS_OPTION + "=<" + surfacing_hmmpfam.DETAILEDNESS_BASIC + "|" + + surfacing_hmmpfam.DETAILEDNESS_LIST_IDS + "|" + surfacing_hmmpfam.DETAILEDNESS_PUNCTILIOUS + + ">\"" ); + } + final String detness = cla.getOptionValue( surfacing_hmmpfam.DETAILEDNESS_OPTION ).toLowerCase(); + if ( detness.equals( surfacing_hmmpfam.DETAILEDNESS_BASIC ) ) { + detailedness = DomainSimilarityCalculator.Detailedness.BASIC; + } + else if ( detness.equals( surfacing_hmmpfam.DETAILEDNESS_LIST_IDS ) ) { + detailedness = DomainSimilarityCalculator.Detailedness.LIST_COMBINING_DOMAIN_FOR_EACH_SPECIES; + 
} + else if ( detness.equals( surfacing_hmmpfam.DETAILEDNESS_PUNCTILIOUS ) ) { + detailedness = DomainSimilarityCalculator.Detailedness.PUNCTILIOUS; + } + else { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "unknown value \"" + detness + + "\" for detailedness: \"-" + surfacing_hmmpfam.DETAILEDNESS_OPTION + "=<" + + surfacing_hmmpfam.DETAILEDNESS_BASIC + "|" + surfacing_hmmpfam.DETAILEDNESS_LIST_IDS + "|" + + surfacing_hmmpfam.DETAILEDNESS_PUNCTILIOUS + ">\"" ); + } + } + String automated_pairwise_comparison_suffix = null; + boolean perform_pwc = false; + boolean write_pwc_files = false; + if ( cla.isOptionSet( surfacing_hmmpfam.PAIRWISE_DOMAIN_COMPARISONS_OPTION ) ) { + perform_pwc = true; + if ( !cla.isOptionValueSet( surfacing_hmmpfam.PAIRWISE_DOMAIN_COMPARISONS_OPTION ) ) { + write_pwc_files = false; + } + else { + write_pwc_files = true; + automated_pairwise_comparison_suffix = "_" + + cla.getOptionValue( surfacing_hmmpfam.PAIRWISE_DOMAIN_COMPARISONS_OPTION ); + } + } + String query_domain_ids = null; + if ( cla.isOptionSet( surfacing_hmmpfam.SEQ_EXTRACT_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_hmmpfam.SEQ_EXTRACT_OPTION ) ) { + ForesterUtil + .fatalError( surfacing_hmmpfam.PRG_NAME, + "no domain ids given for sequences with given domains to be extracted : -" + + surfacing_hmmpfam.SEQ_EXTRACT_OPTION + + "=" ); + } + query_domain_ids = cla.getOptionValue( surfacing_hmmpfam.SEQ_EXTRACT_OPTION ); + } + DomainSimilarity.DomainSimilaritySortField domain_similarity_sort_field = DOMAIN_SORT_FILD_DEFAULT; + DomainSimilarity.DomainSimilaritySortField domain_similarity_sort_field_for_automated_pwc = DOMAIN_SORT_FILD_DEFAULT; + if ( cla.isOptionSet( surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_OPTION ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, + "no value for domain combinations similarities sorting: -" + + surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_OPTION 
+ "=<" + + surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_ALPHA + "|" + + surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_MAX + "|" + + surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_MIN + "|" + + surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_MEAN + "|" + + surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_DIFF + "|" + + surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_ABS_COUNTS_DIFF + "|" + + surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_COUNTS_DIFF + "|" + + surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_SPECIES_COUNT + "|" + + surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_SD + ">\"" ); + } + final String sort_str = cla.getOptionValue( surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_OPTION ).toLowerCase(); + if ( sort_str.equals( surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_ALPHA ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + } + else if ( sort_str.equals( surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_MAX ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.MAX; + domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + } + else if ( sort_str.equals( surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_MIN ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.MIN; + domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + } + else if ( sort_str.equals( surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_MEAN ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.MEAN; + domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.MEAN; + } + else if ( sort_str.equals( surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_SPECIES_COUNT ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.SPECIES_COUNT; + domain_similarity_sort_field_for_automated_pwc = 
DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + } + else if ( sort_str.equals( surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_SD ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.SD; + domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + } + else if ( sort_str.equals( surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_DIFF ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.MAX_DIFFERENCE; + domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.MAX_DIFFERENCE; + } + else if ( sort_str.equals( surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_ABS_COUNTS_DIFF ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.ABS_MAX_COUNTS_DIFFERENCE; + domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.ABS_MAX_COUNTS_DIFFERENCE; + } + else if ( sort_str.equals( surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_COUNTS_DIFF ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.MAX_COUNTS_DIFFERENCE; + domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.MAX_COUNTS_DIFFERENCE; + } + else { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "unknown value \"" + sort_str + + "\" for domain combinations similarities sorting: \"-" + + surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_OPTION + "=<" + + surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_ALPHA + "|" + + surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_MAX + "|" + + surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_MIN + "|" + + surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_MEAN + "|" + + surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_DIFF + "|" + + surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_ABS_COUNTS_DIFF + "|" + + surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_COUNTS_DIFF + "|" + "|" + + surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_SPECIES_COUNT + "|" + + surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_SD + ">\"" ); + } + } 
+ PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option = DOMAIN_SIMILARITY_PRINT_OPTION_DEFAULT; + if ( cla.isOptionSet( surfacing_hmmpfam.DOMAIN_SIMILARITY_PRINT_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_hmmpfam.DOMAIN_SIMILARITY_PRINT_OPTION ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "no value for print option: -" + + surfacing_hmmpfam.DOMAIN_SIMILARITY_PRINT_OPTION_DETAILED_HTML + "|" + + surfacing_hmmpfam.DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_HTML + "|" + + surfacing_hmmpfam.DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_TAB_DELIMITED + ">\"" ); + } + final String sort = cla.getOptionValue( surfacing_hmmpfam.DOMAIN_SIMILARITY_PRINT_OPTION ).toLowerCase(); + if ( sort.equals( surfacing_hmmpfam.DOMAIN_SIMILARITY_PRINT_OPTION_DETAILED_HTML ) ) { + domain_similarity_print_option = PrintableDomainSimilarity.PRINT_OPTION.HTML; + } + else if ( sort.equals( surfacing_hmmpfam.DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_HTML ) ) { + // domain_similarity_print_option = + // DomainSimilarity.PRINT_OPTION.SIMPLE_HTML; + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "simple HTML output not implemented yet :(" ); + } + else if ( sort.equals( surfacing_hmmpfam.DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_TAB_DELIMITED ) ) { + domain_similarity_print_option = PrintableDomainSimilarity.PRINT_OPTION.SIMPLE_TAB_DELIMITED; + } + else { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "unknown value \"" + sort + + "\" for print option: -" + surfacing_hmmpfam.DOMAIN_SIMILARITY_PRINT_OPTION_DETAILED_HTML + + "|" + surfacing_hmmpfam.DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_HTML + "|" + + surfacing_hmmpfam.DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_TAB_DELIMITED + ">\"" ); + } + } + GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder dc_sort_order = DOMAINS_SORT_ORDER_DEFAULT; + if ( cla.isOptionSet( surfacing_hmmpfam.DOMAIN_COUNT_SORT_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_hmmpfam.DOMAIN_COUNT_SORT_OPTION ) ) { + 
ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "no value for sorting of domain counts: -" + + surfacing_hmmpfam.DOMAIN_COUNT_SORT_OPTION + "=<" + surfacing_hmmpfam.DOMAIN_COUNT_SORT_ALPHA + + "|" + surfacing_hmmpfam.DOMAIN_COUNT_SORT_KEY_DOMAIN_COUNT + "|" + + surfacing_hmmpfam.DOMAIN_COUNT_SORT_KEY_DOMAIN_PROTEINS_COUNT + "|" + + surfacing_hmmpfam.DOMAIN_COUNT_SORT_COMBINATIONS_COUNT + ">\"" ); + } + final String sort = cla.getOptionValue( surfacing_hmmpfam.DOMAIN_COUNT_SORT_OPTION ).toLowerCase(); + if ( sort.equals( surfacing_hmmpfam.DOMAIN_COUNT_SORT_ALPHA ) ) { + dc_sort_order = GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder.ALPHABETICAL_KEY_ID; + } + else if ( sort.equals( surfacing_hmmpfam.DOMAIN_COUNT_SORT_KEY_DOMAIN_COUNT ) ) { + dc_sort_order = GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder.KEY_DOMAIN_COUNT; + } + else if ( sort.equals( surfacing_hmmpfam.DOMAIN_COUNT_SORT_KEY_DOMAIN_PROTEINS_COUNT ) ) { + dc_sort_order = GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder.KEY_DOMAIN_PROTEINS_COUNT; + } + else if ( sort.equals( surfacing_hmmpfam.DOMAIN_COUNT_SORT_COMBINATIONS_COUNT ) ) { + dc_sort_order = GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder.COMBINATIONS_COUNT; + } + else { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "unknown value \"" + sort + + "\" for sorting of domain counts: \"-" + surfacing_hmmpfam.DOMAIN_COUNT_SORT_OPTION + "=<" + + surfacing_hmmpfam.DOMAIN_COUNT_SORT_ALPHA + "|" + + surfacing_hmmpfam.DOMAIN_COUNT_SORT_KEY_DOMAIN_COUNT + "|" + + surfacing_hmmpfam.DOMAIN_COUNT_SORT_KEY_DOMAIN_PROTEINS_COUNT + "|" + + surfacing_hmmpfam.DOMAIN_COUNT_SORT_COMBINATIONS_COUNT + ">\"" ); + } + } + String[][] input_file_properties = null; + if ( input_file_names_from_file != null ) { + input_file_properties = surfacing_hmmpfam.processInputFileNames( input_file_names_from_file ); + } + else { + input_file_properties = surfacing_hmmpfam.processInputFileNames( 
cla.getNames() ); + } + final int number_of_genomes = input_file_properties.length; + if ( number_of_genomes < 2 ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "cannot analyze less than two files" ); + } + if ( ( number_of_genomes < 3 ) && perform_pwc ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "cannot use : -" + + surfacing_hmmpfam.PAIRWISE_DOMAIN_COMPARISONS_OPTION + + "= to turn on pairwise analyses with less than three input files" ); + } + checkWriteabilityForPairwiseComparisons( domain_similarity_print_option, + input_file_properties, + automated_pairwise_comparison_suffix, + out_dir ); + for( int i = 0; i < number_of_genomes; i++ ) { + File dcc_outfile = new File( input_file_properties[ i ][ 0 ] + + surfacing_hmmpfam.DOMAIN_COMBINITON_COUNTS_OUTPUTFILE_SUFFIX ); + if ( out_dir != null ) { + dcc_outfile = new File( out_dir + ForesterUtil.FILE_SEPARATOR + dcc_outfile ); + } + SurfacingUtil.checkForOutputFileWriteability( dcc_outfile ); + } + File pfam_to_go_file = null; + Map> domain_id_to_go_ids_map = null; + int domain_id_to_go_ids_count = 0; + if ( cla.isOptionSet( surfacing_hmmpfam.PFAM_TO_GO_FILE_USE_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_hmmpfam.PFAM_TO_GO_FILE_USE_OPTION ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "no value for Pfam to GO mapping file: -" + + surfacing_hmmpfam.PFAM_TO_GO_FILE_USE_OPTION + "=" ); + } + pfam_to_go_file = new File( cla.getOptionValue( surfacing_hmmpfam.PFAM_TO_GO_FILE_USE_OPTION ) ); + final String error = ForesterUtil.isReadableFile( pfam_to_go_file ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "cannot read Pfam to GO mapping file: " + error ); + } + try { + final PfamToGoParser parser = new PfamToGoParser( pfam_to_go_file ); + final List pfam_to_go_mappings = parser.parse(); + domain_id_to_go_ids_map = SurfacingUtil.createDomainIdToGoIdMap( pfam_to_go_mappings ); + if ( parser.getMappingCount() < 
domain_id_to_go_ids_map.size() ) { + ForesterUtil.unexpectedFatalError( surfacing_hmmpfam.PRG_NAME, + "parser.getMappingCount() < domain_id_to_go_ids_map.size()" ); + } + domain_id_to_go_ids_count = parser.getMappingCount(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "cannot read from Pfam to GO mapping file: " + e ); + } + } + File go_obo_file = null; + List go_terms = null; + if ( cla.isOptionSet( surfacing_hmmpfam.GO_OBO_FILE_USE_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_hmmpfam.GO_OBO_FILE_USE_OPTION ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "no value for GO OBO file: -" + + surfacing_hmmpfam.GO_OBO_FILE_USE_OPTION + "=" ); + } + if ( ( domain_id_to_go_ids_map == null ) || ( domain_id_to_go_ids_map.size() < 1 ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "cannot use GO OBO file (-" + + surfacing_hmmpfam.GO_OBO_FILE_USE_OPTION + "=) without Pfam to GO mapping file (" + + surfacing_hmmpfam.PFAM_TO_GO_FILE_USE_OPTION + "=)" ); + } + go_obo_file = new File( cla.getOptionValue( surfacing_hmmpfam.GO_OBO_FILE_USE_OPTION ) ); + final String error = ForesterUtil.isReadableFile( go_obo_file ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "cannot read GO OBO file: " + error ); + } + try { + final OBOparser parser = new OBOparser( go_obo_file, OBOparser.ReturnType.BASIC_GO_TERM ); + go_terms = parser.parse(); + if ( parser.getGoTermCount() != go_terms.size() ) { + ForesterUtil.unexpectedFatalError( surfacing_hmmpfam.PRG_NAME, + "parser.getGoTermCount() != go_terms.size()" ); + } + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "cannot read from GO OBO file: " + + e.getLocalizedMessage() ); + } + } + Map go_id_to_term_map = null; + if ( ( ( domain_id_to_go_ids_map != null ) && ( domain_id_to_go_ids_map.size() > 0 ) ) + && ( ( go_terms != null ) && ( go_terms.size() > 0 ) ) ) { + 
go_id_to_term_map = GoUtils.createGoIdToGoTermMap( go_terms ); + } + GoNameSpace go_namespace_limit = null; + if ( cla.isOptionSet( surfacing_hmmpfam.GO_NAMESPACE_LIMIT_OPTION ) ) { + if ( ( go_id_to_term_map == null ) || go_id_to_term_map.isEmpty() ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "cannot use GO namespace limit (-" + + surfacing_hmmpfam.GO_NAMESPACE_LIMIT_OPTION + + "=) without Pfam to GO mapping file (" + + surfacing_hmmpfam.PFAM_TO_GO_FILE_USE_OPTION + "=) and GO OBO file (-" + + surfacing_hmmpfam.GO_OBO_FILE_USE_OPTION + "=)" ); + } + if ( !cla.isOptionValueSet( surfacing_hmmpfam.GO_NAMESPACE_LIMIT_OPTION ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "no value for GO namespace limit: \"-" + + surfacing_hmmpfam.GO_NAMESPACE_LIMIT_OPTION + "=<" + + surfacing_hmmpfam.GO_NAMESPACE_LIMIT_OPTION_MOLECULAR_FUNCTION + "|" + + surfacing_hmmpfam.GO_NAMESPACE_LIMIT_OPTION_BIOLOGICAL_PROCESS + "|" + + surfacing_hmmpfam.GO_NAMESPACE_LIMIT_OPTION_CELLULAR_COMPONENT + ">\"" ); + } + final String go_namespace_limit_str = cla.getOptionValue( surfacing_hmmpfam.GO_NAMESPACE_LIMIT_OPTION ) + .toLowerCase(); + if ( go_namespace_limit_str.equals( surfacing_hmmpfam.GO_NAMESPACE_LIMIT_OPTION_MOLECULAR_FUNCTION ) ) { + go_namespace_limit = GoNameSpace.createMolecularFunction(); + } + else if ( go_namespace_limit_str.equals( surfacing_hmmpfam.GO_NAMESPACE_LIMIT_OPTION_BIOLOGICAL_PROCESS ) ) { + go_namespace_limit = GoNameSpace.createBiologicalProcess(); + } + else if ( go_namespace_limit_str.equals( surfacing_hmmpfam.GO_NAMESPACE_LIMIT_OPTION_CELLULAR_COMPONENT ) ) { + go_namespace_limit = GoNameSpace.createCellularComponent(); + } + else { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "unknown value \"" + go_namespace_limit_str + + "\" for GO namespace limit: \"-" + surfacing_hmmpfam.GO_NAMESPACE_LIMIT_OPTION + "=<" + + surfacing_hmmpfam.GO_NAMESPACE_LIMIT_OPTION_MOLECULAR_FUNCTION + "|" + + 
surfacing_hmmpfam.GO_NAMESPACE_LIMIT_OPTION_BIOLOGICAL_PROCESS + "|" + + surfacing_hmmpfam.GO_NAMESPACE_LIMIT_OPTION_CELLULAR_COMPONENT + ">\"" ); + } + } + if ( ( domain_similarity_sort_field == DomainSimilarity.DomainSimilaritySortField.MAX_COUNTS_DIFFERENCE ) + && ( number_of_genomes > 2 ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.ABS_MAX_COUNTS_DIFFERENCE; + } + boolean jacknifed_distances = false; + int jacknife_resamplings = JACKNIFE_NUMBER_OF_RESAMPLINGS_DEFAULT; + double jacknife_ratio = JACKNIFE_RATIO_DEFAULT; + long random_seed = JACKNIFE_RANDOM_SEED_DEFAULT; + if ( cla.isOptionSet( surfacing_hmmpfam.JACKNIFE_OPTION ) ) { + if ( ( number_of_genomes < 3 ) || !perform_pwc ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "cannot use jacknife resampling analysis (-" + + surfacing_hmmpfam.JACKNIFE_OPTION + "[=]) without pairwise analyses (" + + surfacing_hmmpfam.PAIRWISE_DOMAIN_COMPARISONS_OPTION + + "=)" ); + } + jacknifed_distances = true; + if ( cla.isOptionHasAValue( surfacing_hmmpfam.JACKNIFE_OPTION ) ) { + try { + jacknife_resamplings = cla.getOptionValueAsInt( surfacing_hmmpfam.JACKNIFE_OPTION ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "illegal format for number of resamplings" ); + } + if ( jacknife_resamplings < 2 ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "attempt to use less than 2 resamplings" ); + } + } + if ( cla.isOptionSet( surfacing_hmmpfam.JACKNIFE_RATIO_OPTION ) + && cla.isOptionHasAValue( surfacing_hmmpfam.JACKNIFE_RATIO_OPTION ) ) { + try { + jacknife_ratio = cla.getOptionValueAsDouble( surfacing_hmmpfam.JACKNIFE_RATIO_OPTION ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "illegal format for jacknife ratio" ); + } + if ( ( jacknife_ratio <= 0.0 ) || ( jacknife_ratio >= 1.0 ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, + "attempt to use illegal value for jacknife 
ratio: " + jacknife_ratio ); + } + } + if ( cla.isOptionSet( surfacing_hmmpfam.JACKNIFE_RANDOM_SEED_OPTION ) + && cla.isOptionHasAValue( surfacing_hmmpfam.JACKNIFE_RANDOM_SEED_OPTION ) ) { + try { + random_seed = cla.getOptionValueAsLong( surfacing_hmmpfam.JACKNIFE_RANDOM_SEED_OPTION ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "illegal format for random generator seed" ); + } + } + } + // boolean infer_species_trees = false; + // if ( cla.isOptionSet( surfacing.INFER_SPECIES_TREES_OPTION ) ) { + // if ( ( output_file == null ) || ( number_of_genomes < 3 ) + // || ForesterUtil.isEmpty( automated_pairwise_comparison_suffix ) ) { + // ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot infer species trees (-" + // + surfacing.INFER_SPECIES_TREES_OPTION + " without pairwise analyses (" + // + surfacing.PAIRWISE_DOMAIN_COMPARISONS_OPTION + // + "=)" ); + // } + // infer_species_trees = true; + // } + File[] intree_files = null; + Phylogeny[] intrees = null; + if ( cla.isOptionSet( surfacing_hmmpfam.INPUT_SPECIES_TREE_OPTION ) ) { + // TODO FIXME if jacknife.... 
maybe not + if ( number_of_genomes < 3 ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, + "cannot infer gains and losses on input species trees (-" + + surfacing_hmmpfam.INPUT_SPECIES_TREE_OPTION + + " without pairwise analyses (" + + surfacing_hmmpfam.PAIRWISE_DOMAIN_COMPARISONS_OPTION + + "=)" ); + } + if ( !cla.isOptionValueSet( surfacing_hmmpfam.INPUT_SPECIES_TREE_OPTION ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "no value for input tree: -" + + surfacing_hmmpfam.INPUT_SPECIES_TREE_OPTION + "=" ); + } + final String intrees_str = cla.getOptionValue( surfacing_hmmpfam.INPUT_SPECIES_TREE_OPTION ); + if ( intrees_str.indexOf( "#" ) > 0 ) { + final String[] intrees_strs = intrees_str.split( "#" ); + intree_files = new File[ intrees_strs.length ]; + int i = 0; + for( final String s : intrees_strs ) { + intree_files[ i++ ] = new File( s.trim() ); + } + } + else { + intree_files = new File[ 1 ]; + intree_files[ 0 ] = new File( intrees_str ); + } + intrees = getIntrees( intree_files, number_of_genomes, input_file_properties ); + } + long random_number_seed_for_fitch_parsimony = 0l; + boolean radomize_fitch_parsimony = false; + if ( cla.isOptionSet( surfacing_hmmpfam.RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_hmmpfam.RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "no value for random number seed: -" + + surfacing_hmmpfam.RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION + "=" ); + } + try { + random_number_seed_for_fitch_parsimony = cla + .getOptionValueAsLong( RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, e.getMessage() ); + } + radomize_fitch_parsimony = true; + } + SortedSet filter = null; + if ( ( positive_filter_file != null ) || ( negative_filter_file != null ) + || ( negative_domains_filter_file != null ) ) { + filter = new TreeSet(); + if ( positive_filter_file != 
null ) { + processFilter( positive_filter_file, filter ); + } + else if ( negative_filter_file != null ) { + processFilter( negative_filter_file, filter ); + } + else if ( negative_domains_filter_file != null ) { + processFilter( negative_domains_filter_file, filter ); + } + } + Map>[] domain_id_to_secondary_features_maps = null; + File[] secondary_features_map_files = null; + final File domain_lengths_analysis_outfile = new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file + + DOMAIN_LENGTHS_ANALYSIS_SUFFIX ); + if ( PERFORM_DOMAIN_LENGTH_ANALYSIS ) { + SurfacingUtil.checkForOutputFileWriteability( domain_lengths_analysis_outfile ); + } + if ( cla.isOptionSet( surfacing_hmmpfam.SECONDARY_FEATURES_PARSIMONY_MAP_FILE ) ) { + if ( !cla.isOptionValueSet( surfacing_hmmpfam.SECONDARY_FEATURES_PARSIMONY_MAP_FILE ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "no value for secondary features map file: -" + + surfacing_hmmpfam.SECONDARY_FEATURES_PARSIMONY_MAP_FILE + "=" ); + } + final String[] secondary_features_map_files_strs = cla + .getOptionValue( surfacing_hmmpfam.SECONDARY_FEATURES_PARSIMONY_MAP_FILE ).split( "#" ); + secondary_features_map_files = new File[ secondary_features_map_files_strs.length ]; + domain_id_to_secondary_features_maps = new Map[ secondary_features_map_files_strs.length ]; + int i = 0; + for( final String secondary_features_map_files_str : secondary_features_map_files_strs ) { + secondary_features_map_files[ i ] = new File( secondary_features_map_files_str ); + final String error = ForesterUtil.isReadableFile( secondary_features_map_files[ i ] ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "cannot read secondary features map file: " + + error ); + } + try { + domain_id_to_secondary_features_maps[ i ] = SurfacingUtil + .createDomainIdToSecondaryFeaturesMap( secondary_features_map_files[ i ] ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( 
surfacing_hmmpfam.PRG_NAME, "cannot read secondary features map file: " + + e.getMessage() ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "problem with contents of features map file [" + + secondary_features_map_files[ i ] + "]: " + e.getMessage() ); + } + i++; + } + } + if ( out_dir == null ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "no output directory indicated (-" + + surfacing_hmmpfam.OUTPUT_DIR_OPTION + "=)" ); + } + if ( output_file == null ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "no name for (main) output file indicated (-" + + surfacing_hmmpfam.OUTPUT_FILE_OPTION + "=)" ); + } + if ( ( domain_id_to_go_ids_map == null ) || domain_id_to_go_ids_map.isEmpty() ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, + "no (acceptable) Pfam to GO id mapping file provided ('pfam2go file') (-" + + surfacing_hmmpfam.PFAM_TO_GO_FILE_USE_OPTION + "=)" ); + } + if ( ( go_id_to_term_map == null ) || go_id_to_term_map.isEmpty() ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, + "no (acceptable) go id to term mapping file provided ('GO OBO file') (-" + + surfacing_hmmpfam.GO_OBO_FILE_USE_OPTION + "=)" ); + } + boolean display_histograms = false; + if ( cla.isOptionSet( surfacing_hmmpfam.DISPLAY_M_HISTOGRAMS_OPTION ) ) { + display_histograms = true; + } + System.out.println( "Output directory : " + out_dir ); + if ( input_file_names_from_file != null ) { + System.out.println( "Input files names from : " + input_files_file + " [" + + input_file_names_from_file.length + " input files]" ); + html_desc.append( "" + nl ); + } + if ( positive_filter_file != null ) { + final int filter_size = filter.size(); + System.out.println( "Positive protein filter : " + positive_filter_file + " [" + filter_size + + " domain ids]" ); + html_desc.append( "" + nl ); + } + if ( negative_filter_file != null ) { + final int filter_size = filter.size(); + System.out.println( "Negative protein filter : " 
+ negative_filter_file + " [" + filter_size + + " domain ids]" ); + html_desc.append( "" + nl ); + } + if ( negative_domains_filter_file != null ) { + final int filter_size = filter.size(); + System.out.println( "Negative domain filter : " + negative_domains_filter_file + " [" + filter_size + + " domain ids]" ); + html_desc.append( "" + nl ); + } + if ( plus_minus_analysis_high_copy_base_species.size() > 0 ) { + String plus0 = ""; + for( final String s : plus_minus_analysis_high_copy_base_species ) { + plus0 += "+" + s + " "; + } + String plus1 = ""; + for( final String s : plus_minus_analysis_high_copy_target_species ) { + plus1 += "*" + s + " "; + } + String minus = ""; + for( final String s : plus_minus_analysis_high_low_copy_species ) { + minus += "-" + s + " "; + } + System.out.println( "Plus-minus analysis : " + plus1 + "&& " + plus0 + "&& " + minus ); + html_desc.append( "" + nl ); + } + if ( cutoff_scores_file != null ) { + System.out.println( "Cutoff scores file : " + cutoff_scores_file ); + html_desc.append( "" + nl ); + } + if ( e_value_max >= 0.0 ) { + System.out.println( "E-value maximum (inclusive) : " + e_value_max ); + html_desc.append( "" + nl ); + } + System.out.println( "Ignore DUFs : " + ignore_dufs ); + if ( ignore_virus_like_ids ) { + System.out.println( "Ignore virus like ids : " + ignore_virus_like_ids ); + html_desc.append( "" + nl ); + } + html_desc.append( "" + nl ); + if ( max_allowed_overlap != surfacing_hmmpfam.MAX_ALLOWED_OVERLAP_DEFAULT ) { + System.out.println( "Max allowed domain overlap : " + max_allowed_overlap ); + html_desc.append( "" + nl ); + } + if ( no_engulfing_overlaps ) { + System.out.println( "Ignore engulfed domains : " + no_engulfing_overlaps ); + html_desc.append( "" + nl ); + } + System.out.println( "Ignore singlet domains : " + ignore_domains_without_combs_in_all_spec ); + html_desc + .append( "" + nl ); + System.out.println( "Ignore species specific doms: " + ignore_species_specific_domains ); + html_desc + 
.append( "" + nl ); + System.out.println( "Ignore combination with self: " + ignore_combination_with_same ); + html_desc.append( "" + nl ); + ; + System.out.println( "Consider directedness : " + + ( dc_type != BinaryDomainCombination.DomainCombinationType.BASIC ) ); + html_desc.append( "" + nl ); + if ( dc_type != BinaryDomainCombination.DomainCombinationType.BASIC ) { + System.out.println( "Consider adjacency : " + + ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT ) ); + html_desc.append( "" + + nl ); + } + System.out.print( "Domain counts sort order : " ); + switch ( dc_sort_order ) { + case ALPHABETICAL_KEY_ID: + System.out.println( "alphabetical" ); + break; + case KEY_DOMAIN_COUNT: + System.out.println( "domain count" ); + break; + case KEY_DOMAIN_PROTEINS_COUNT: + System.out.println( "domain proteins count" ); + break; + case COMBINATIONS_COUNT: + System.out.println( "domain combinations count" ); + break; + default: + ForesterUtil.unexpectedFatalError( surfacing_hmmpfam.PRG_NAME, "unknown value for dc sort order" ); + } + if ( domain_id_to_go_ids_map != null ) { + System.out.println( "Pfam to GO mappings from : " + pfam_to_go_file + " [" + domain_id_to_go_ids_count + + " mappings]" ); + html_desc.append( "" + nl ); + } + if ( go_terms != null ) { + System.out.println( "GO terms from : " + go_obo_file + " [" + go_terms.size() + " terms]" ); + html_desc.append( "" + nl ); + } + if ( go_namespace_limit != null ) { + System.out.println( "Limit GO terms to : " + go_namespace_limit.toString() ); + html_desc.append( "" + nl ); + } + if ( perform_pwc ) { + System.out.println( "Suffix for PWC files : " + automated_pairwise_comparison_suffix ); + html_desc.append( "" + nl ); + } + if ( out_dir != null ) { + System.out.println( "Output directory : " + out_dir ); + } + if ( query_domain_ids != null ) { + System.out.println( "Query domains (ordered) : " + query_domain_ids ); + html_desc.append( "" + nl ); + } + System.out.println( "Write 
similarities to : " + output_file ); + System.out.print( " Scoring method : " ); + html_desc.append( "" + nl ); + break; + case DOMAINS: + System.out.println( "domain counts based" ); + html_desc.append( "domain counts based" + "" + nl ); + break; + case PROTEINS: + System.out.println( "domain proteins counts based" ); + html_desc.append( "domain proteins counts based" + "" + nl ); + break; + default: + ForesterUtil.unexpectedFatalError( surfacing_hmmpfam.PRG_NAME, "unknown value for sorting for scoring" ); + } + System.out.print( " Sort by : " ); + html_desc.append( "" + nl ); + System.out.print( " Detailedness : " ); + switch ( detailedness ) { + case BASIC: + System.out.println( "basic" ); + break; + case LIST_COMBINING_DOMAIN_FOR_EACH_SPECIES: + System.out.println( "list combining domains for each species" ); + break; + case PUNCTILIOUS: + System.out.println( "punctilious" ); + break; + default: + ForesterUtil.unexpectedFatalError( surfacing_hmmpfam.PRG_NAME, + "unknown value for sorting for detailedness" ); + } + System.out.print( " Print option : " ); + switch ( domain_similarity_print_option ) { + case HTML: + System.out.println( "HTML" ); + break; + case SIMPLE_TAB_DELIMITED: + System.out.println( "simple tab delimited" ); + break; + default: + ForesterUtil.unexpectedFatalError( surfacing_hmmpfam.PRG_NAME, "unknown value for print option" ); + } + System.out.print( " Species matrix : " + species_matrix ); + System.out.println(); + if ( perform_pwc ) { + System.out.println( "Pairwise comparisons: " ); + html_desc.append( "" ); + System.out.print( " Sort by : " ); + html_desc.append( "" + nl ); + if ( jacknifed_distances ) { + html_desc.append( "" + nl ); + html_desc.append( "" + nl ); + html_desc.append( "" + nl ); + System.out.println( " Jacknife : " + jacknife_resamplings + " resamplings" ); + System.out.println( " Ratio : " + ForesterUtil.round( jacknife_ratio, 2 ) ); + System.out.println( " Random number seed : " + random_seed ); + } + // if ( 
infer_species_trees ) { + // html_desc.append( "" + nl ); + // System.out.println( " Infer species trees : true" ); + // } + if ( ( intrees != null ) && ( intrees.length > 0 ) ) { + for( final File intree_file : intree_files ) { + html_desc.append( "" + nl ); + System.out.println( " Intree for gain/loss pars.: " + intree_file ); + } + } + if ( radomize_fitch_parsimony ) { + html_desc.append( "" + nl ); + System.out.println( " Random number seed : " + random_number_seed_for_fitch_parsimony ); + } + if ( ( domain_id_to_secondary_features_maps != null ) && ( domain_id_to_secondary_features_maps.length > 0 ) ) { + for( int i = 0; i < secondary_features_map_files.length; i++ ) { + html_desc.append( "" + nl ); + System.out.println( "Secondary features map file : " + secondary_features_map_files[ i ] + + " [mappings for " + domain_id_to_secondary_features_maps[ i ].size() + " domain ids]" ); + if ( VERBOSE ) { + System.out.println(); + System.out.println( "Domain ids to secondary features map:" ); + for( final DomainId domain_id : domain_id_to_secondary_features_maps[ i ].keySet() ) { + System.out.print( domain_id.getId() ); + System.out.print( " => " ); + for( final String sec : domain_id_to_secondary_features_maps[ i ].get( domain_id ) ) { + System.out.print( sec ); + System.out.print( " " ); + } + System.out.println(); + } + } + } + } + } // if ( perform_pwc ) { + System.out.println(); + html_desc.append( "" + nl ); + System.out.println( "Command line : " + cla.getCommandLineArgsAsString() ); + BufferedWriter[] query_domains_writer_ary = null; + List[] query_domain_ids_array = null; + if ( query_domain_ids != null ) { + final String[] query_domain_ids_str_array = query_domain_ids.split( "#" ); + query_domain_ids_array = new ArrayList[ query_domain_ids_str_array.length ]; + query_domains_writer_ary = new BufferedWriter[ query_domain_ids_str_array.length ]; + for( int i = 0; i < query_domain_ids_str_array.length; i++ ) { + String query_domain_ids_str = 
query_domain_ids_str_array[ i ]; + final String[] query_domain_ids_str_ary = query_domain_ids_str.split( "~" ); + final List query = new ArrayList(); + for( final String element : query_domain_ids_str_ary ) { + query.add( new DomainId( element ) ); + } + query_domain_ids_array[ i ] = query; + query_domain_ids_str = query_domain_ids_str.replace( '~', '_' ); + String protein_names_writer_str = query_domain_ids_str + surfacing_hmmpfam.SEQ_EXTRACT_SUFFIX; + if ( out_dir != null ) { + protein_names_writer_str = out_dir + ForesterUtil.FILE_SEPARATOR + protein_names_writer_str; + } + try { + query_domains_writer_ary[ i ] = new BufferedWriter( new FileWriter( protein_names_writer_str ) ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "Could not open [" + protein_names_writer_str + + "]: " + e.getLocalizedMessage() ); + } + } + } + SortedMap> protein_lists_per_species = null; //This will only be created if neede. + boolean need_protein_lists_per_species = false; + if ( ( plus_minus_analysis_high_copy_base_species.size() > 0 ) || output_protein_lists_for_all_domains ) { + need_protein_lists_per_species = true; + } + if ( need_protein_lists_per_species ) { + protein_lists_per_species = new TreeMap>(); + } + final List gwcd_list = new ArrayList( number_of_genomes ); + final SortedSet all_domains_encountered = new TreeSet(); + final SortedSet all_bin_domain_combinations_encountered = new TreeSet(); + List all_bin_domain_combinations_gained_fitch = null; + List all_bin_domain_combinations_lost_fitch = null; + if ( ( intrees != null ) && ( intrees.length == 1 ) ) { + all_bin_domain_combinations_gained_fitch = new ArrayList(); + all_bin_domain_combinations_lost_fitch = new ArrayList(); + } + final DomainLengthsTable domain_lengths_table = new DomainLengthsTable(); + final File per_genome_domain_promiscuity_statistics_file = new File( out_dir + ForesterUtil.FILE_SEPARATOR + + output_file + D_PROMISCUITY_FILE_SUFFIX ); + BufferedWriter 
per_genome_domain_promiscuity_statistics_writer = null; + try { + per_genome_domain_promiscuity_statistics_writer = new BufferedWriter( new FileWriter( per_genome_domain_promiscuity_statistics_file ) ); + per_genome_domain_promiscuity_statistics_writer.write( "Species:\t" ); + per_genome_domain_promiscuity_statistics_writer.write( "Mean:\t" ); + per_genome_domain_promiscuity_statistics_writer.write( "SD:\t" ); + per_genome_domain_promiscuity_statistics_writer.write( "Median:\t" ); + per_genome_domain_promiscuity_statistics_writer.write( "Min:\t" ); + per_genome_domain_promiscuity_statistics_writer.write( "Max:\t" ); + per_genome_domain_promiscuity_statistics_writer.write( "N:\t" ); + per_genome_domain_promiscuity_statistics_writer.write( "Max Promiscuous Domains:" + + ForesterUtil.LINE_SEPARATOR ); + } + catch ( final IOException e2 ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, e2.getMessage() ); + } + for( int i = 0; i < number_of_genomes; ++i ) { + System.out.println(); + System.out.println( ( i + 1 ) + "/" + number_of_genomes ); + System.out.println( "Processing : " + input_file_properties[ i ][ 0 ] ); + HmmPfamOutputParser parser = null; + if ( ( positive_filter_file != null ) || ( negative_filter_file != null ) + || ( negative_domains_filter_file != null ) ) { + HmmPfamOutputParser.FilterType filter_type = HmmPfamOutputParser.FilterType.NONE; + if ( positive_filter_file != null ) { + filter_type = HmmPfamOutputParser.FilterType.POSITIVE_PROTEIN; + } + else if ( negative_filter_file != null ) { + filter_type = HmmPfamOutputParser.FilterType.NEGATIVE_PROTEIN; + } + else if ( negative_domains_filter_file != null ) { + filter_type = HmmPfamOutputParser.FilterType.NEGATIVE_DOMAIN; + } + parser = new HmmPfamOutputParser( new File( input_file_properties[ i ][ 0 ] ), + input_file_properties[ i ][ 1 ], + input_file_properties[ i ][ 2 ], + filter, + filter_type ); + } + else { + parser = new HmmPfamOutputParser( new File( input_file_properties[ i ][ 0 ] ), 
+ input_file_properties[ i ][ 1 ], + input_file_properties[ i ][ 2 ] ); + } + if ( e_value_max >= 0.0 ) { + parser.setEValueMaximum( e_value_max ); + } + parser.setIgnoreDufs( ignore_dufs ); + parser.setIgnoreVirusLikeIds( ignore_virus_like_ids ); + parser.setIgnoreEngulfedDomains( no_engulfing_overlaps ); + if ( max_allowed_overlap != surfacing_hmmpfam.MAX_ALLOWED_OVERLAP_DEFAULT ) { + parser.setMaxAllowedOverlap( max_allowed_overlap ); + } + parser.setReturnType( HmmPfamOutputParser.ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ); + if ( individual_domain_score_cutoffs != null ) { + parser.setIndividualDomainScoreCutoffs( individual_domain_score_cutoffs ); + } + parser.setAllowNonUniqueQuery( ALLOW_NON_UNIQUE_QUERY_IN_HMMPFAM_OUTPUT_DEFAULT ); + parser.setVerbose( VERBOSE_DEFAULT ); + List protein_list = null; + try { + protein_list = parser.parse(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, e.getMessage() ); + } + catch ( final Exception e ) { + ForesterUtil.unexpectedFatalError( surfacing_hmmpfam.PRG_NAME, e.getMessage(), e ); + } + if ( VERBOSE ) { + System.out.println( "Domains ignored due to negative domain filter: " ); + ForesterUtil.printCountingMap( parser.getDomainsIgnoredDueToNegativeDomainFilterCountsMap() ); + System.out.println( "Domains ignored due to virus like id: " ); + ForesterUtil.printCountingMap( parser.getDomainsIgnoredDueToVirusLikeIdCountsMap() ); + } + System.out.println( "Number of proteins encountered : " + parser.getProteinsEncountered() ); + System.out.println( "Number of proteins stored : " + protein_list.size() ); + System.out.println( "Domains encountered : " + parser.getDomainsEncountered() ); + System.out.println( "Domains stored : " + parser.getDomainsStored() ); + System.out.println( "Distinct domains stored : " + + parser.getDomainsStoredSet().size() ); + System.out.println( "Domains ignored due to individual score cutoffs: " + + 
parser.getDomainsIgnoredDueToIndividualScoreCutoff() ); + System.out.println( "Domains ignored due to E-value : " + + parser.getDomainsIgnoredDueToEval() ); + System.out.println( "Domains ignored due to DUF designation : " + + parser.getDomainsIgnoredDueToDuf() ); + if ( ignore_virus_like_ids ) { + System.out.println( "Domains ignored due virus like ids : " + + parser.getDomainsIgnoredDueToVirusLikeIds() ); + } + System.out.println( "Domains ignored due negative domain filter : " + + parser.getDomainsIgnoredDueToNegativeDomainFilter() ); + System.out.println( "Domains ignored due to overlap : " + + parser.getDomainsIgnoredDueToOverlap() ); + if ( negative_filter_file != null ) { + System.out.println( "Proteins ignored due to negative filter : " + + parser.getProteinsIgnoredDueToFilter() ); + } + if ( positive_filter_file != null ) { + System.out.println( "Proteins ignored due to positive filter : " + + parser.getProteinsIgnoredDueToFilter() ); + } + System.out.println( "Time for processing : " + parser.getTime() + "ms" ); + html_desc.append( "" + nl ); + // domain_partner_counts_array[ i ] = + // Methods.getDomainPartnerCounts( protein_domain_collections_array[ + // i ], + // false, input_file_properties[ i ][ 1 ] ); + gwcd_list.add( BasicGenomeWideCombinableDomains + .createInstance( protein_list, + ignore_combination_with_same, + new BasicSpecies( input_file_properties[ i ][ 1 ] ), + domain_id_to_go_ids_map, + dc_type ) ); + domain_lengths_table.addLengths( protein_list ); + if ( gwcd_list.get( i ).getSize() > 0 ) { + SurfacingUtil.writeDomainCombinationsCountsFile( input_file_properties, + out_dir, + per_genome_domain_promiscuity_statistics_writer, + gwcd_list.get( i ), + i, + dc_sort_order ); + if ( output_binary_domain_combinationsfor_graph_analysis ) { + SurfacingUtil.writeBinaryDomainCombinationsFileForGraphAnalysis( input_file_properties, + out_dir, + gwcd_list.get( i ), + i, + dc_sort_order ); + } + SurfacingUtil.addAllDomainIdsToSet( gwcd_list.get( i ), 
all_domains_encountered ); + SurfacingUtil.addAllBinaryDomainCombinationToSet( gwcd_list.get( i ), + all_bin_domain_combinations_encountered ); + } + if ( query_domains_writer_ary != null ) { + for( int j = 0; j < query_domain_ids_array.length; j++ ) { + try { + SurfacingUtil.extractProteinNames( protein_list, + query_domain_ids_array[ j ], + query_domains_writer_ary[ j ], + "\t" ); + query_domains_writer_ary[ j ].flush(); + } + catch ( final IOException e ) { + e.printStackTrace(); + } + } + } + if ( need_protein_lists_per_species ) { + protein_lists_per_species.put( new BasicSpecies( input_file_properties[ i ][ 1 ] ), protein_list ); + } + System.gc(); + } // for( int i = 0; i < number_of_hmmpfam_files_to_analyze; ++i ) { + try { + per_genome_domain_promiscuity_statistics_writer.flush(); + per_genome_domain_promiscuity_statistics_writer.close(); + } + catch ( final IOException e2 ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, e2.toString() ); + } + ForesterUtil.programMessage( PRG_NAME, "Wrote domain promiscuities to: " + + per_genome_domain_promiscuity_statistics_file ); + if ( query_domains_writer_ary != null ) { + for( int j = 0; j < query_domain_ids_array.length; j++ ) { + try { + query_domains_writer_ary[ j ].close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, e.toString() ); + } + } + } + if ( PERFORM_DOMAIN_LENGTH_ANALYSIS ) { + try { + SurfacingUtil.executeDomainLengthAnalysis( input_file_properties, + number_of_genomes, + domain_lengths_table, + domain_lengths_analysis_outfile ); + } + catch ( final IOException e1 ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, e1.toString() ); + } + System.out.println(); + ForesterUtil.programMessage( PRG_NAME, "Wrote domain length data to: " + domain_lengths_analysis_outfile ); + System.out.println(); + } + final long analysis_start_time = new Date().getTime(); + PairwiseDomainSimilarityCalculator pw_calc = null; + // double[] 
values_for_all_scores_histogram = null; + final DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( domain_similarity_sort_field, + sort_by_species_count_first, + number_of_genomes == 2 ); + switch ( scoring ) { + case COMBINATIONS: + pw_calc = new CombinationsBasedPairwiseDomainSimilarityCalculator(); + break; + case DOMAINS: + pw_calc = new DomainCountsBasedPairwiseSimilarityCalculator(); + break; + case PROTEINS: + pw_calc = new ProteinCountsBasedPairwiseDomainSimilarityCalculator(); + break; + default: + ForesterUtil.unexpectedFatalError( surfacing_hmmpfam.PRG_NAME, "unknown value for sorting for scoring" ); + } + DomainSimilarityCalculator.GoAnnotationOutput go_annotation_output = DomainSimilarityCalculator.GoAnnotationOutput.NONE; + if ( domain_id_to_go_ids_map != null ) { + go_annotation_output = DomainSimilarityCalculator.GoAnnotationOutput.ALL; + } + final SortedSet similarities = calc + .calculateSimilarities( pw_calc, + gwcd_list, + ignore_domains_without_combs_in_all_spec, + ignore_species_specific_domains ); + SurfacingUtil.decoratePrintableDomainSimilarities( similarities, + detailedness, + go_annotation_output, + go_id_to_term_map, + go_namespace_limit ); + DescriptiveStatistics pw_stats = null; + try { + String my_outfile = output_file.toString(); + if ( !my_outfile.endsWith( ".html" ) ) { + my_outfile += ".html"; + } + final Writer writer = new BufferedWriter( new FileWriter( out_dir == null ? my_outfile : out_dir + + ForesterUtil.FILE_SEPARATOR + my_outfile ) ); + List species_order = null; + if ( species_matrix ) { + species_order = new ArrayList(); + for( int i = 0; i < number_of_genomes; i++ ) { + species_order.add( new BasicSpecies( input_file_properties[ i ][ 1 ] ) ); + } + } + html_desc.append( "" + nl ); + html_desc.append( "" + nl ); + html_desc.append( "" + nl ); + html_desc.append( "
Produced by:" + surfacing_hmmpfam.PRG_NAME + "
Version:" + surfacing_hmmpfam.PRG_VERSION + "
Release Date:" + surfacing_hmmpfam.PRG_DATE + "
Contact:" + surfacing_hmmpfam.E_MAIL + "
WWW:" + surfacing_hmmpfam.WWW + "
Input files names from:" + input_files_file + " [" + + input_file_names_from_file.length + " input files]
Positive protein filter:" + positive_filter_file + " [" + filter_size + + " domain ids]
Negative protein filter:" + negative_filter_file + " [" + filter_size + + " domain ids]
Negative domain filter:" + negative_domains_filter_file + " [" + + filter_size + " domain ids]
Plus-minus analysis:" + plus1 + "&& " + plus0 + "&& " + minus + + "
Cutoff scores file:" + cutoff_scores_file + "
E-value maximum (inclusive):" + e_value_max + "
Ignore virus, phage, transposition related ids:" + + ignore_virus_like_ids + "
Ignore DUFs:" + ignore_dufs + "
Max allowed domain overlap:" + max_allowed_overlap + "
Ignore (lower confidence) engulfed domains:" + no_engulfing_overlaps + + "
Ignore singlet domains for domain combination similarity analyses (not for parsimony analyses):" + + ignore_domains_without_combs_in_all_spec + "
Ignore species specific domains for domain combination similarity analyses (not for parsimony analyses):" + + ignore_species_specific_domains + "
Ignore combination with self for domain combination similarity analyses:" + + ignore_combination_with_same + "
Consider directedness of binary domain combinations:" + + ( dc_type != BinaryDomainCombination.DomainCombinationType.BASIC ) + "
Consider djacency of binary domain combinations:" + + ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT ) + "
Pfam to GO mappings from:" + pfam_to_go_file + " [" + + domain_id_to_go_ids_count + " mappings]" + "
GO terms from:" + go_obo_file + " [" + go_terms.size() + " terms]" + + "
Limit GO terms to" + go_namespace_limit + "
Suffix for PWC files" + automated_pairwise_comparison_suffix + + "
" + query_domain_ids + "
Scoring method:" ); + switch ( scoring ) { + case COMBINATIONS: + System.out.println( "domain combinations based" ); + html_desc.append( "domain combinations based" + "
Sort by:" ); + switch ( domain_similarity_sort_field ) { + case MIN: + System.out.print( "score minimum" ); + html_desc.append( "score minimum" ); + break; + case MAX: + System.out.print( "score maximum" ); + html_desc.append( "score maximum" ); + break; + case MEAN: + System.out.print( "score mean" ); + html_desc.append( "score mean" ); + break; + case SD: + System.out.print( "score standard deviation" ); + html_desc.append( "score standard deviation" ); + break; + case SPECIES_COUNT: + System.out.print( "species number" ); + html_desc.append( "species number" ); + break; + case DOMAIN_ID: + System.out.print( "alphabetical domain identifier" ); + html_desc.append( "alphabetical domain identifier" ); + break; + case MAX_DIFFERENCE: + System.out.print( "(maximal) difference" ); + html_desc.append( "(maximal) difference" ); + break; + case ABS_MAX_COUNTS_DIFFERENCE: + System.out.print( "absolute (maximal) counts difference" ); + html_desc.append( "absolute (maximal) counts difference" ); + break; + case MAX_COUNTS_DIFFERENCE: + System.out.print( "(maximal) counts difference" ); + html_desc.append( "(maximal) counts difference" ); + break; + default: + ForesterUtil.unexpectedFatalError( surfacing_hmmpfam.PRG_NAME, + "unknown value for sorting for similarities" ); + } + if ( sort_by_species_count_first ) { + System.out.println( " (sort by species count first)" ); + html_desc.append( " (sort by species count first)" ); + } + else { + System.out.println(); + } + html_desc.append( "
Pairwise comparisons:
Sort by:" ); + switch ( domain_similarity_sort_field_for_automated_pwc ) { + case MEAN: + System.out.print( "score mean" ); + html_desc.append( "score mean" ); + break; + case DOMAIN_ID: + System.out.print( "alphabetical domain identifier" ); + html_desc.append( "alphabetical domain identifier" ); + break; + case MAX_DIFFERENCE: + System.out.print( "difference" ); + html_desc.append( "difference" ); + break; + case ABS_MAX_COUNTS_DIFFERENCE: + System.out.print( "absolute counts difference" ); + html_desc.append( "absolute counts difference" ); + break; + case MAX_COUNTS_DIFFERENCE: + System.out.print( "counts difference" ); + html_desc.append( "counts difference" ); + break; + default: + ForesterUtil.unexpectedFatalError( surfacing_hmmpfam.PRG_NAME, + "unknown value for sorting for similarities" ); + } + System.out.println(); + html_desc.append( "
Jacknife:" + jacknife_resamplings + " resamplings
Jacknife ratio:" + ForesterUtil.round( jacknife_ratio, 2 ) + + "
Jacknife random number seed:" + random_seed + "
Infer species trees:true
Intree for gain/loss parsimony analysis:" + intree_file + + "
Random number seed for Fitch parsimony analysis:" + + random_number_seed_for_fitch_parsimony + "
Secondary features map file:" + + secondary_features_map_files[ i ] + "
Command line:" + cla.getCommandLineArgsAsString() + "
" + input_file_properties[ i ][ 0 ] + " [species: " + + input_file_properties[ i ][ 1 ] + "]" + ":domains analyzed: " + + parser.getDomainsStored() + "; domains ignored: [ind score cutoffs: " + + parser.getDomainsIgnoredDueToIndividualScoreCutoff() + "] [E-value cutoff: " + + parser.getDomainsIgnoredDueToEval() + "] [DUF: " + parser.getDomainsIgnoredDueToDuf() + + "] [virus like ids: " + parser.getDomainsIgnoredDueToVirusLikeIds() + + "] [negative domain filter: " + parser.getDomainsIgnoredDueToNegativeDomainFilter() + + "] [overlap: " + parser.getDomainsIgnoredDueToOverlap() + "]" ); + if ( negative_filter_file != null ) { + html_desc.append( "; proteins ignored due to negative filter: " + + parser.getProteinsIgnoredDueToFilter() ); + } + if ( positive_filter_file != null ) { + html_desc.append( "; proteins ignored due to positive filter: " + + parser.getProteinsIgnoredDueToFilter() ); + } + html_desc.append( "
Sum of all distinct binary combinations:" + + all_bin_domain_combinations_encountered.size() + "
Sum of all distinct domains:" + all_domains_encountered.size() + + "
Analysis date/time:" + + new java.text.SimpleDateFormat( "yyyy.MM.dd HH:mm:ss" ).format( new java.util.Date() ) + + "
" + nl ); + pw_stats = SurfacingUtil + .writeDomainSimilaritiesToFile( html_desc, + new StringBuilder( number_of_genomes + " genomes" ), + writer, + similarities, + number_of_genomes == 2, + species_order, + domain_similarity_print_option, + domain_similarity_sort_field, + scoring, + true ); + ForesterUtil.programMessage( surfacing_hmmpfam.PRG_NAME, + "Wrote main output (includes domain similarities) to: \"" + + ( out_dir == null ? my_outfile : out_dir + + ForesterUtil.FILE_SEPARATOR + my_outfile ) + "\"" ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "Failed to write similarites to: \"" + output_file + + "\" [" + e.getMessage() + "]" ); + } + System.out.println(); + // values_for_all_scores_histogram = pw_stats.getDataAsDoubleArray(); + final Species[] species = new Species[ number_of_genomes ]; + for( int i = 0; i < number_of_genomes; ++i ) { + species[ i ] = new BasicSpecies( input_file_properties[ i ][ 1 ] ); + } + List inferred_trees = null; + if ( ( number_of_genomes > 2 ) && perform_pwc ) { + final PairwiseGenomeComparator pwgc = new PairwiseGenomeComparator(); + pwgc.performPairwiseComparisons( html_desc, + sort_by_species_count_first, + detailedness, + ignore_domains_without_combs_in_all_spec, + ignore_species_specific_domains, + domain_similarity_sort_field_for_automated_pwc, + domain_similarity_print_option, + scoring, + domain_id_to_go_ids_map, + go_id_to_term_map, + go_namespace_limit, + species, + number_of_genomes, + gwcd_list, + pw_calc, + automated_pairwise_comparison_suffix, + true, + surfacing_hmmpfam.PAIRWISE_DOMAIN_COMPARISONS_PREFIX, + surfacing_hmmpfam.PRG_NAME, + display_histograms, + out_dir, + write_pwc_files ); + String matrix_output_file = new String( output_file.toString() ); + if ( matrix_output_file.indexOf( '.' ) > 1 ) { + matrix_output_file = matrix_output_file.substring( 0, matrix_output_file.indexOf( '.' 
) ); + } + if ( out_dir != null ) { + matrix_output_file = out_dir + ForesterUtil.FILE_SEPARATOR + matrix_output_file; + output_file = new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file ); + } + SurfacingUtil.writeMatrixToFile( new File( matrix_output_file + + surfacing_hmmpfam.MATRIX_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX ), pwgc + .getDomainDistanceScoresMeans() ); + SurfacingUtil.writeMatrixToFile( new File( matrix_output_file + + surfacing_hmmpfam.MATRIX_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX ), pwgc + .getSharedBinaryCombinationsBasedDistances() ); + SurfacingUtil.writeMatrixToFile( new File( matrix_output_file + + surfacing_hmmpfam.MATRIX_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX ), pwgc + .getSharedDomainsBasedDistances() ); + final Phylogeny nj_gd = SurfacingUtil.createNjTreeBasedOnMatrixToFile( new File( matrix_output_file + + surfacing_hmmpfam.NJ_TREE_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX ), pwgc + .getDomainDistanceScoresMeans().get( 0 ) ); + final Phylogeny nj_bc = SurfacingUtil.createNjTreeBasedOnMatrixToFile( new File( matrix_output_file + + surfacing_hmmpfam.NJ_TREE_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX ), pwgc + .getSharedBinaryCombinationsBasedDistances().get( 0 ) ); + final Phylogeny nj_d = SurfacingUtil.createNjTreeBasedOnMatrixToFile( new File( matrix_output_file + + surfacing_hmmpfam.NJ_TREE_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX ), pwgc + .getSharedDomainsBasedDistances().get( 0 ) ); + inferred_trees = new ArrayList(); + inferred_trees.add( nj_gd ); + inferred_trees.add( nj_bc ); + inferred_trees.add( nj_d ); + // final List histogram_datas = pwgc.getHistogramDatas(); + // if ( infer_species_trees ) { + // inferred_trees = new ArrayList(); + // final List inferred_trees_bc = inferSpeciesTrees( new File( output_file + INFERRED_SBC_BASED_NJ_SPECIES_TREE_SUFFIX ), pwgc + // .getSharedBinaryCombinationsBasedDistances() ); + // final List inferred_trees_d = inferSpeciesTrees( new File( output_file + 
INFERRED_SD_BASED_NJ_SPECIES_TREE_SUFFIX ), pwgc + // .getSharedDomainsBasedDistances() ); + // inferred_trees.addAll( inferred_trees_bc ); + // inferred_trees.addAll( inferred_trees_d ); + // } + if ( jacknifed_distances ) { + pwgc.performPairwiseComparisonsJacknifed( species, + number_of_genomes, + gwcd_list, + true, + jacknife_resamplings, + jacknife_ratio, + random_seed ); + SurfacingUtil.writeMatrixToFile( new File( matrix_output_file + "_" + + ForesterUtil.round( jacknife_ratio, 2 ) + "_" + jacknife_resamplings + + surfacing_hmmpfam.MATRIX_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX ), pwgc + .getSharedBinaryCombinationsBasedDistances() ); + SurfacingUtil.writeMatrixToFile( new File( matrix_output_file + "_" + + ForesterUtil.round( jacknife_ratio, 2 ) + "_" + jacknife_resamplings + + surfacing_hmmpfam.MATRIX_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX ), pwgc + .getSharedDomainsBasedDistances() ); + // if ( infer_species_trees ) { + // inferSpeciesTrees( new File( output_file + "_" + jacknife_resamplings + // + INFERRED_SBC_BASED_NJ_SPECIES_TREE_SUFFIX ), pwgc + // .getSharedBinaryCombinationsBasedDistances() ); + // inferSpeciesTrees( new File( output_file + "_" + jacknife_resamplings + // + INFERRED_SD_BASED_NJ_SPECIES_TREE_SUFFIX ), pwgc.getSharedDomainsBasedDistances() ); + // } + } + if ( display_histograms ) { + // final List histogram_datas_all = new ArrayList(); + // histogram_datas_all.add( new HistogramData( "all", + // values_for_all_scores_histogram, + // null, + // 20 ) ); + // final HistogramsFrame hf_all = new HistogramsFrame( histogram_datas_all ); + // final HistogramsFrame hf = new HistogramsFrame( histogram_datas ); + // hf_all.setVisible( true ); + // hf.setVisible( true ); + } + } // if ( ( output_file != null ) && ( number_of_genomes > 2 ) && !isEmpty( automated_pairwise_comparison_suffix ) ) + if ( ( out_dir != null ) && ( !perform_pwc ) ) { + output_file = new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file ); + } + 
// writePresentToNexus( output_file, positive_filter_file, filter, gwcd_list ); + if ( ( ( intrees != null ) && ( intrees.length > 0 ) ) && ( number_of_genomes > 2 ) ) { + final StringBuilder parameters_sb = createParametersAsString( ignore_dufs, + e_value_max, + max_allowed_overlap, + no_engulfing_overlaps, + cutoff_scores_file, + dc_type ); + String s = "_"; + if ( radomize_fitch_parsimony ) { + s += random_number_seed_for_fitch_parsimony + "_"; + } + int i = 0; + for( final Phylogeny intree : intrees ) { + final String outfile_name = ForesterUtil.removeSuffix( output_file.toString() ) + s + + ForesterUtil.removeSuffix( intree_files[ i ].toString() ); + final DomainParsimonyCalculator domain_parsimony = DomainParsimonyCalculator.createInstance( intree, + gwcd_list ); + SurfacingUtil.executeParsimonyAnalysis( random_number_seed_for_fitch_parsimony, + radomize_fitch_parsimony, + outfile_name, + domain_parsimony, + intree, + domain_id_to_go_ids_map, + go_id_to_term_map, + go_namespace_limit, + parameters_sb.toString(), + domain_id_to_secondary_features_maps, + positive_filter_file == null ? null : filter, + output_binary_domain_combinationsfor_graph_analysis, + all_bin_domain_combinations_gained_fitch, + all_bin_domain_combinations_lost_fitch, + dc_type ); + // Listing of all domain combinations gained is only done if only one input tree is used. 
+ if ( ( domain_id_to_secondary_features_maps != null ) + && ( domain_id_to_secondary_features_maps.length > 0 ) ) { + int j = 0; + for( final Map> domain_id_to_secondary_features_map : domain_id_to_secondary_features_maps ) { + final Map mapping_results_map = new TreeMap(); + final DomainParsimonyCalculator secondary_features_parsimony = DomainParsimonyCalculator + .createInstance( intree, gwcd_list, domain_id_to_secondary_features_map ); + SurfacingUtil + .executeParsimonyAnalysisForSecondaryFeatures( outfile_name + + "_" + + secondary_features_map_files[ j++ ], + secondary_features_parsimony, + intree, + parameters_sb.toString(), + mapping_results_map ); + if ( i == 0 ) { + System.out.println(); + System.out.println( "Mapping to secondary features:" ); + for( final Species spec : mapping_results_map.keySet() ) { + final MappingResults mapping_results = mapping_results_map.get( spec ); + final int total_domains = mapping_results.getSumOfFailures() + + mapping_results.getSumOfSuccesses(); + System.out.print( spec + ":" ); + System.out.print( " mapped domains = " + mapping_results.getSumOfSuccesses() ); + System.out.print( ", not mapped domains = " + mapping_results.getSumOfFailures() ); + if ( total_domains > 0 ) { + System.out.println( ", mapped ratio = " + + ( 100 * mapping_results.getSumOfSuccesses() / total_domains ) + "%" ); + } + else { + System.out.println( ", mapped ratio = n/a (total domains = 0 )" ); + } + } + } + } + } + i++; + } // for( final Phylogeny intree : intrees ) { + } + if ( plus_minus_analysis_high_copy_base_species.size() > 0 ) { + executePlusMinusAnalysis( output_file, + plus_minus_analysis_high_copy_base_species, + plus_minus_analysis_high_copy_target_species, + plus_minus_analysis_high_low_copy_species, + gwcd_list, + protein_lists_per_species, + domain_id_to_go_ids_map, + go_id_to_term_map, + plus_minus_analysis_numbers ); + } + if ( output_protein_lists_for_all_domains ) { + writeProteinListsForAllSpecies( out_dir, 
protein_lists_per_species, gwcd_list ); + } + // if ( ( intrees != null ) && ( intrees.length > 0 ) && ( inferred_trees != null ) && ( inferred_trees.size() > 0 ) ) { + // final StringBuilder parameters_sb = createParametersAsString( ignore_dufs, + // e_value_max, + // max_allowed_overlap, + // no_engulfing_overlaps, + // cutoff_scores_file ); + // String s = "_"; + // if ( radomize_fitch_parsimony ) { + // s += random_number_seed_for_fitch_parsimony + "_"; + // } + // int i = 0; + // for( final Phylogeny inferred_tree : inferred_trees ) { + // if ( !inferred_tree.isRooted() ) { + // intrees[ 0 ].getRoot().getName(); + // inferred_tree.r + // } + // final String outfile_name = ForesterUtil.removeSuffix( inferred_tree.getName() ) + s; + // final DomainParsimonyCalculator domain_parsimony = DomainParsimonyCalculator + // .createInstance( inferred_tree, gwcd_list ); + // SurfacingUtil.executeParsimonyAnalysis( random_number_seed_for_fitch_parsimony, + // radomize_fitch_parsimony, + // outfile_name, + // domain_parsimony, + // inferred_tree, + // domain_id_to_go_ids_map, + // go_id_to_term_map, + // go_namespace_limit, + // parameters_sb.toString() ); + // i++; + // } + // } + if ( all_bin_domain_combinations_gained_fitch != null ) { + try { + executeFitchGainsAnalysis( new File( output_file + + surfacing_hmmpfam.OUTPUT_DOMAIN_COMBINATIONS_GAINED_MORE_THAN_ONCE_ANALYSIS_SUFFIX ), + all_bin_domain_combinations_gained_fitch, + all_domains_encountered.size(), + all_bin_domain_combinations_encountered, + true ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() ); + } + } + if ( all_bin_domain_combinations_lost_fitch != null ) { + try { + executeFitchGainsAnalysis( new File( output_file + + surfacing_hmmpfam.OUTPUT_DOMAIN_COMBINATIONS_LOST_MORE_THAN_ONCE_ANALYSIS_SUFFIX ), + all_bin_domain_combinations_lost_fitch, + all_domains_encountered.size(), + all_bin_domain_combinations_encountered, + false ); + } + catch ( final 
IOException e ) {
                ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() );
            }
        }
        final Runtime rt = java.lang.Runtime.getRuntime();
        final long free_memory = rt.freeMemory() / 1000000;
        final long total_memory = rt.totalMemory() / 1000000;
        System.out.println();
        System.out.println( "Time for analysis : " + ( new Date().getTime() - analysis_start_time ) + "ms" );
        System.out.println( "Total running time: " + ( new Date().getTime() - start_time ) + "ms " );
        System.out.println( "Free memory : " + free_memory + "MB, total memory: " + total_memory + "MB" );
        System.out.println();
        System.out.println( "If this application is useful to you, please cite:" );
        System.out.println( surfacing_hmmpfam.WWW );
        System.out.println();
        ForesterUtil.programMessage( PRG_NAME, "OK" );
        System.out.println();
    }

    /**
     * Validates the genome names against the input tree and hands the tree to
     * {@code PhylogenyMethods.deleteExternalNodesPositiveSelection} together with those names.
     * <p>
     * The genome name of each input is taken from element {@code [ 1 ]} of its property row.
     * A name that matches more than one node before pruning, or that cannot be looked up
     * with {@code getNode} after pruning, is reported through {@code ForesterUtil.fatalError}.
     *
     * @param intree the species tree to check and prune (passed on to the pruning call)
     * @param input_file_properties one row per input file; only element {@code [ 1 ]} is read here
     */
    private static void preparePhylogenyForParsimonyAnalyses( final Phylogeny intree,
                                                              final String[][] input_file_properties ) {
        final int genome_count = input_file_properties.length;
        final String[] genomes = new String[ genome_count ];
        // Pass 1: every genome name may label at most one node of the tree.
        for( int g = 0; g < genome_count; ++g ) {
            final String genome = input_file_properties[ g ][ 1 ];
            if ( intree.getNodes( genome ).size() > 1 ) {
                ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "node named [" + genome
                        + "] is not unique in input tree " + intree.getName() );
            }
            genomes[ g ] = genome;
        }
        // NOTE(review): presumably keeps only the external nodes named in 'genomes' -- confirm in PhylogenyMethods.
        PhylogenyMethods.deleteExternalNodesPositiveSelection( genomes, intree );
        // Pass 2: after pruning, every genome name must still resolve to a node.
        for( final String[] properties : input_file_properties ) {
            final String genome = properties[ 1 ];
            try {
                intree.getNode( genome );
            }
            catch ( final IllegalArgumentException e ) {
                ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "node named [" + genome
                        + "] not present/not unique in input tree" );
            }
        }
    }

    // public static StringBuffer stringCombinableDomainsMapToStringBuffer(
    // final SortedMap map ) {
    // final StringBuffer sb = new StringBuffer();
    // for( final Iterator iter =
map.keySet().iterator(); + // iter.hasNext(); ) { + // final Object key = iter.next(); + // sb.append( ForesterUtil.pad( new StringBuffer( key.toString() ), 18, ' ', + // false ) ); + // final CombinableDomains domain_combination = map.get( key ); + // sb.append( ForesterUtil.pad( new StringBuffer( "" + + // domain_combination.getNumberOfCombiningDomains() ), 8, + // ' ', false ) ); + // sb.append( domain_combination.toStringBuffer() ); + // sb.append( ForesterUtil.getLineSeparator() ); + // } + // return sb; + // } + private static void printHelp() { + System.out.println(); + System.out.println( "Usage:" ); + System.out.println(); + System.out.println( "% java -Xms256m -Xmx512m -cp forester.jar org.forester.applications." + + surfacing_hmmpfam.PRG_NAME + + " [options] [external node name 1] [name 2] ... [name n]" ); + System.out.println(); + System.out.println( " Note: This software might need a significant amount of memory (heap space);" ); + System.out + .println( " hence use \"-Xms128m -Xmx512m\" (or more) to prevent a \"java.lang.OutOfMemoryError\"." 
); + System.out.println(); + System.out.println( " Options: " ); + System.out.println( surfacing_hmmpfam.DETAILEDNESS_OPTION + + ": level of detail for similarities output file (default:" + DETAILEDNESS_DEFAULT + ")" ); + System.out.println( surfacing_hmmpfam.IGNORE_COMBINATION_WITH_SAME_OPTION + + ": to ignore combinations with self (default: not to ignore)" ); + System.out + .println( surfacing_hmmpfam.IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_OPTION + + ": to ignore domains without combinations in any species (for similarity calc purposes, not for parsimony analyses) (default: not to ignore)" ); + System.out + .println( surfacing_hmmpfam.IGNORE_DOMAINS_SPECIFIC_TO_ONE_SPECIES_OPTION + + ": to ignore domains specific to one species (for similarity calc purposes, not for parsimony analyses) (default: not to ignore)" ); + System.out.println( surfacing_hmmpfam.NOT_IGNORE_DUFS_OPTION + + ": to _not_ ignore DUFs (domains with unknown function) (default: ignore DUFs)" ); + System.out + .println( surfacing_hmmpfam.IGNORE_VIRAL_IDS + + ": to ignore domains with ids containing 'vir', 'retro', 'transpos', 'phage', or starting with 'rv' or 'gag_'" ); + System.out.println( surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_OPTION + ": sorting for similarities (default: " + + DOMAIN_SORT_FILD_DEFAULT + ")" ); + System.out.println( surfacing_hmmpfam.OUTPUT_FILE_OPTION + ": name for (main) output file (mandatory)" ); + System.out.println( surfacing_hmmpfam.MAX_E_VALUE_OPTION + ": max (inclusive) E-value" ); + System.out.println( surfacing_hmmpfam.MAX_ALLOWED_OVERLAP_OPTION + ": maximal allowed domain overlap" ); + System.out.println( surfacing_hmmpfam.NO_ENGULFING_OVERLAP_OPTION + + ": to ignore engulfed lower confidence domains" ); + System.out.println( surfacing_hmmpfam.SPECIES_MATRIX_OPTION + ": species matrix" ); + System.out.println( surfacing_hmmpfam.SCORING_OPTION + ": scoring (default:" + SCORING_DEFAULT + ")" ); + System.out.println( 
surfacing_hmmpfam.DOMAIN_COUNT_SORT_OPTION + ": sorting for domain counts (default:" + + DOMAINS_SORT_ORDER_DEFAULT + ")" ); + System.out.println( surfacing_hmmpfam.DOMAIN_SIMILARITY_PRINT_OPTION + + ": domain similarity print option (default:" + DOMAIN_SIMILARITY_PRINT_OPTION_DEFAULT + ")" ); + System.out.println( surfacing_hmmpfam.CUTOFF_SCORE_FILE_OPTION + ": cutoff score file" ); + System.out.println( surfacing_hmmpfam.DOMAIN_SIMILARITY_SORT_BY_SPECIES_COUNT_FIRST_OPTION + + ": sort by species count first" ); + System.out.println( surfacing_hmmpfam.OUTPUT_DIR_OPTION + ": output directory" ); + System.out.println( surfacing_hmmpfam.PFAM_TO_GO_FILE_USE_OPTION + ": Pfam to GO mapping file" ); + System.out.println( surfacing_hmmpfam.GO_OBO_FILE_USE_OPTION + ": GO terms file (OBO format)" ); + System.out.println( surfacing_hmmpfam.GO_NAMESPACE_LIMIT_OPTION + ": limit GO term to one GO namespace" ); + System.out.println( surfacing_hmmpfam.PAIRWISE_DOMAIN_COMPARISONS_OPTION + + "[=]: to perform pairwise comparison based analyses" ); + System.out.println( surfacing_hmmpfam.INPUT_SPECIES_TREE_OPTION + + ": species tree, to perform (Dollo, Fitch) parismony analyses" ); + System.out.println( surfacing_hmmpfam.DISPLAY_M_HISTOGRAMS_OPTION + + ": to display multiple histograms (using fluorite)" ); + System.out + .println( JACKNIFE_OPTION + + ": perform jacknife resampling for domain and binary domain combination based distance matrices [default resamplings: " + + JACKNIFE_NUMBER_OF_RESAMPLINGS_DEFAULT + "]" ); + System.out.println( JACKNIFE_RATIO_OPTION + ": ratio for jacknife resampling [default: " + + JACKNIFE_RATIO_DEFAULT + "]" ); + System.out.println( JACKNIFE_RANDOM_SEED_OPTION + + ": seed for random number generator for jacknife resampling [default: " + + JACKNIFE_RANDOM_SEED_DEFAULT + "]" ); + // System.out.println( surfacing.INFER_SPECIES_TREES_OPTION + // + ": to infer NJ species trees based on shared domains/binary domain combinations" ); + System.out + .println( 
surfacing_hmmpfam.INPUT_SPECIES_TREE_OPTION + + "=: to infer domain/binary domain combination gains/losses on given species trees" ); + System.out.println( surfacing_hmmpfam.FILTER_POSITIVE_OPTION + + "=: to filter out proteins not containing at least one domain listed in " ); + System.out.println( surfacing_hmmpfam.FILTER_NEGATIVE_OPTION + + "=: to filter out proteins containing at least one domain listed in " ); + System.out.println( surfacing_hmmpfam.FILTER_NEGATIVE_DOMAINS_OPTION + + "=: to filter out (ignore) domains listed in " ); + System.out + .println( surfacing_hmmpfam.INPUT_FILES_FROM_FILE_OPTION + "=: to read input files from " ); + System.out + .println( surfacing_hmmpfam.RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION + + "=: seed for random number generator for Fitch Parsimony analysis (type: long, default: no randomization - given a choice, prefer absence" ); + System.out.println( surfacing_hmmpfam.CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS + + ": to consider directedness in binary combinations: e.g. A-B != B-A" ); + System.out.println( surfacing_hmmpfam.CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS_AND_ADJACENCY + + ": to consider directedness and adjacency in binary combinations" ); + System.out + .println( surfacing_hmmpfam.SEQ_EXTRACT_OPTION + + "=: to extract sequence names of sequences containing matching domains and/or domain-sequences (order N to C) (domain separator: '~', domain sequences speparator: '#', e.g. 
'NACHT#BIR~CARD')" ); + System.out.println( surfacing_hmmpfam.SECONDARY_FEATURES_PARSIMONY_MAP_FILE + + "=: to perfom parsimony analysis on secondary features" ); + System.out.println( surfacing_hmmpfam.PLUS_MINUS_ANALYSIS_OPTION + + "=: to presence/absence genome analysis" ); + System.out.println( surfacing_hmmpfam.DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS + + ": to output binary domain combinations for (downstream) graph analysis" ); + System.out.println( surfacing_hmmpfam.OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS + + ": to output all proteins per domain" ); + System.out.println(); + System.out.println(); + System.out.println( "Example: java -Xms128m -Xmx512m -cp path/to/forester.jar" + + "org.forester.application.surfacing -detail=punctilious -o=TEST.html -pwc=TEST" + + " -cos=Pfam_ls_22_TC2 -p2g=pfam2go -obo=gene_ontology_edit.obo " + + "-dc_sort=dom -ignore_with_self -no_singles -e=0.001 -mo=1 -no_eo " + + "-ds_output=detailed_html -scoring=domains -sort=alpha -" + JACKNIFE_OPTION + + "=50 human mouse brafl strpu" ); + System.out.println(); + } + + private static void processFilter( final File filter_file, final SortedSet filter ) { + SortedSet filter_str = null; + try { + filter_str = ForesterUtil.file2set( filter_file ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, e.getMessage() ); + } + if ( filter_str != null ) { + for( final String string : filter_str ) { + filter.add( new DomainId( string ) ); + } + } + if ( VERBOSE ) { + System.out.println( "Filter:" ); + for( final DomainId domainId : filter ) { + System.out.println( domainId.getId() ); + } + } + } + + private static String[][] processInputFileNames( final String[] names ) { + final String[][] input_file_properties = new String[ names.length ][]; + for( int i = 0; i < names.length; ++i ) { + if ( names[ i ].indexOf( SEPARATOR_FOR_INPUT_VALUES ) < 0 ) { + input_file_properties[ i ] = new String[ 3 ]; + input_file_properties[ i ][ 0 ] = names[ i ]; + 
input_file_properties[ i ][ 1 ] = names[ i ]; + input_file_properties[ i ][ 2 ] = DEFAULT_SEARCH_PARAMETER; + } + else { + input_file_properties[ i ] = names[ i ].split( surfacing_hmmpfam.SEPARATOR_FOR_INPUT_VALUES + "" ); + if ( input_file_properties[ i ].length != 3 ) { + ForesterUtil + .fatalError( surfacing_hmmpfam.PRG_NAME, + "properties for the input files (hmmpfam output) are expected " + + "to be in the following format \"##\" (or just one word, which is both the filename and the species id), instead received \"" + + names[ i ] + "\"" ); + } + } + final String error = ForesterUtil.isReadableFile( new File( input_file_properties[ i ][ 0 ] ) ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, error ); + } + } + return input_file_properties; + } + + private static void processPlusMinusAnalysisOption( final CommandLineArguments cla, + final List high_copy_base, + final List high_copy_target, + final List low_copy, + final List numbers ) { + if ( cla.isOptionSet( surfacing_hmmpfam.PLUS_MINUS_ANALYSIS_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_hmmpfam.PLUS_MINUS_ANALYSIS_OPTION ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "no value for 'plus-minus' file: -" + + surfacing_hmmpfam.PLUS_MINUS_ANALYSIS_OPTION + "=" ); + } + final File plus_minus_file = new File( cla.getOptionValue( surfacing_hmmpfam.PLUS_MINUS_ANALYSIS_OPTION ) ); + final String msg = ForesterUtil.isReadableFile( plus_minus_file ); + if ( !ForesterUtil.isEmpty( msg ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "can not read from \"" + plus_minus_file + "\": " + + msg ); + } + processPlusMinusFile( plus_minus_file, high_copy_base, high_copy_target, low_copy, numbers ); + } + } + + // First numbers is minimal difference, second is factor. 
+ private static void processPlusMinusFile( final File plus_minus_file, + final List high_copy_base, + final List high_copy_target, + final List low_copy, + final List numbers ) { + Set species_set = null; + int min_diff = PLUS_MINUS_ANALYSIS_MIN_DIFF_DEFAULT; + double factor = PLUS_MINUS_ANALYSIS_FACTOR_DEFAULT; + try { + species_set = ForesterUtil.file2set( plus_minus_file ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, e.getMessage() ); + } + if ( species_set != null ) { + for( final String species : species_set ) { + final String species_trimmed = species.substring( 1 ); + if ( species.startsWith( "+" ) ) { + if ( low_copy.contains( species_trimmed ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, + "species/genome names can not appear with both '+' and '-' suffix, as appears the case for: \"" + + species_trimmed + "\"" ); + } + high_copy_base.add( species_trimmed ); + } + else if ( species.startsWith( "*" ) ) { + if ( low_copy.contains( species_trimmed ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, + "species/genome names can not appear with both '*' and '-' suffix, as appears the case for: \"" + + species_trimmed + "\"" ); + } + high_copy_target.add( species_trimmed ); + } + else if ( species.startsWith( "-" ) ) { + if ( high_copy_base.contains( species_trimmed ) || high_copy_target.contains( species_trimmed ) ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, + "species/genome names can not appear with both '+' or '*' and '-' suffix, as appears the case for: \"" + + species_trimmed + "\"" ); + } + low_copy.add( species_trimmed ); + } + else if ( species.startsWith( "$D" ) ) { + try { + min_diff = Integer.parseInt( species.substring( 3 ) ); + } + catch ( final NumberFormatException e ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, + "could not parse integer value for minimal difference from: \"" + + species.substring( 3 ) + "\"" ); + } + } + else if ( 
species.startsWith( "$F" ) ) { + try { + factor = Double.parseDouble( species.substring( 3 ) ); + } + catch ( final NumberFormatException e ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, + "could not parse double value for factor from: \"" + + species.substring( 3 ) + "\"" ); + } + } + else if ( species.startsWith( "#" ) ) { + // Comment, ignore. + } + else { + ForesterUtil + .fatalError( surfacing_hmmpfam.PRG_NAME, + "species/genome names in 'plus minus' file must begin with '*' (high copy target genome), '+' (high copy base genomes), '-' (low copy genomes), '$D=' minimal Difference (default is 1), '$F=' factor (default is 1.0), double), or '#' (ignore) suffix, encountered: \"" + + species + "\"" ); + } + numbers.add( new Integer( min_diff + "" ) ); + numbers.add( new Double( factor + "" ) ); + } + } + else { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, "'plus minus' file [" + plus_minus_file + + "] appears empty" ); + } + } + + private static void writeProteinListsForAllSpecies( final File output_dir, + final SortedMap> protein_lists_per_species, + final List gwcd_list ) { + final SortedSet all_domains = new TreeSet(); + for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { + all_domains.addAll( gwcd.getAllDomainIds() ); + } + for( final DomainId domain : all_domains ) { + final File out = new File( output_dir + ForesterUtil.FILE_SEPARATOR + domain + SEQ_EXTRACT_SUFFIX ); + SurfacingUtil.checkForOutputFileWriteability( out ); + try { + final Writer proteins_file_writer = new BufferedWriter( new FileWriter( out ) ); + SurfacingUtil.extractProteinNames( protein_lists_per_species, domain, proteins_file_writer, "\t" ); + proteins_file_writer.close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_hmmpfam.PRG_NAME, e.getLocalizedMessage() ); + } + ForesterUtil.programMessage( surfacing_hmmpfam.PRG_NAME, "Wrote proteins list to \"" + out + "\"" ); + } + } +} diff --git 
a/forester/java/src/org/forester/application/surfacing_old.java b/forester/java/src/org/forester/application/surfacing_old.java new file mode 100644 index 0000000..07711ee --- /dev/null +++ b/forester/java/src/org/forester/application/surfacing_old.java @@ -0,0 +1,2583 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.application; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedMap; +import java.util.SortedSet; +import java.util.TreeMap; +import java.util.TreeSet; + +import org.forester.evoinference.distance.NeighborJoining; +import org.forester.evoinference.matrix.character.CharacterStateMatrix.Format; +import org.forester.evoinference.matrix.distance.DistanceMatrix; +import org.forester.go.GoId; +import org.forester.go.GoNameSpace; +import org.forester.go.GoTerm; +import org.forester.go.GoUtils; +import org.forester.go.OBOparser; +import org.forester.go.PfamToGoMapping; +import org.forester.go.PfamToGoParser; +import org.forester.io.parsers.HmmPfamOutputParser; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.surfacing.BasicDomainSimilarityCalculator; +import org.forester.surfacing.BasicGenomeWideCombinableDomains; +import org.forester.surfacing.BasicSpecies; +import org.forester.surfacing.BinaryDomainCombination; +import org.forester.surfacing.CombinationsBasedPairwiseDomainSimilarityCalculator; +import org.forester.surfacing.DomainCountsBasedPairwiseSimilarityCalculator; +import org.forester.surfacing.DomainCountsDifferenceUtil; +import org.forester.surfacing.DomainId; +import org.forester.surfacing.DomainLengthsTable; +import org.forester.surfacing.DomainParsimonyCalculator; +import org.forester.surfacing.DomainSimilarity; +import org.forester.surfacing.DomainSimilarityCalculator; +import org.forester.surfacing.GenomeWideCombinableDomains; +import 
org.forester.surfacing.MappingResults; +import org.forester.surfacing.PairwiseDomainSimilarityCalculator; +import org.forester.surfacing.PairwiseGenomeComparator; +import org.forester.surfacing.PrintableDomainSimilarity; +import org.forester.surfacing.Protein; +import org.forester.surfacing.ProteinCountsBasedPairwiseDomainSimilarityCalculator; +import org.forester.surfacing.Species; +import org.forester.surfacing.SurfacingUtil; +import org.forester.surfacing.DomainSimilarity.DomainSimilarityScoring; +import org.forester.surfacing.DomainSimilarity.DomainSimilaritySortField; +import org.forester.surfacing.DomainSimilarityCalculator.Detailedness; +import org.forester.surfacing.GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder; +import org.forester.surfacing.PrintableDomainSimilarity.PRINT_OPTION; +import org.forester.util.BasicTable; +import org.forester.util.BasicTableParser; +import org.forester.util.CommandLineArguments; +import org.forester.util.DescriptiveStatistics; +import org.forester.util.ForesterConstants; +import org.forester.util.ForesterUtil; + +public class surfacing_old { + + public final static String DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS = "graph_analysis_out"; + public final static String DOMAIN_COMBINITONS_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS = "_dc.dot"; + public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_BC_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS = "_fitch_present_dc.dot"; + public final static String DOMAIN_COMBINITON_COUNTS_OUTPUTFILE_SUFFIX = ".dcc"; + // gain/loss: + public final static String PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_DOMAINS = "_dollo_gl_d"; + public final static String PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_BINARY_COMBINATIONS = "_dollo_gl_dc"; + public final static String PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_DOMAINS = "_fitch_gl_d"; + public final static String PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_BINARY_COMBINATIONS = "_fitch_gl_dc"; + // gain/loss counts: + public final static String 
PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_DOMAINS = "_dollo_glc_d"; + public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_BINARY_COMBINATIONS = "_dollo_glc_dc"; + public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_DOMAINS = "_fitch_glc_d"; + public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_BINARY_COMBINATIONS = "_fitch_glc_dc"; + // tables: + public final static String PARSIMONY_OUTPUT_FITCH_GAINS_BC = "_fitch_gains_dc"; + public final static String PARSIMONY_OUTPUT_FITCH_GAINS_HTML_BC = "_fitch_gains_dc.html"; + public final static String PARSIMONY_OUTPUT_FITCH_LOSSES_BC = "_fitch_losses_dc"; + public final static String PARSIMONY_OUTPUT_FITCH_LOSSES_HTML_BC = "_fitch_losses_dc.html"; + public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_BC = "_fitch_present_dc"; + public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_HTML_BC = "_fitch_present_dc.html"; + public final static String PARSIMONY_OUTPUT_DOLLO_GAINS_D = "_dollo_gains_d"; + public final static String PARSIMONY_OUTPUT_DOLLO_GAINS_GOID_D = "_dollo_gains_goid_d"; + public final static String PARSIMONY_OUTPUT_DOLLO_GAINS_HTML_D = "_dollo_gains_d.html"; + public final static String PARSIMONY_OUTPUT_DOLLO_LOSSES_D = "_dollo_losses_d"; + public final static String PARSIMONY_OUTPUT_DOLLO_LOSSES_HTML_D = "_dollo_losses_d.html"; + public final static String PARSIMONY_OUTPUT_DOLLO_PRESENT_D = "_dollo_present_d"; + public final static String PARSIMONY_OUTPUT_DOLLO_PRESENT_GOID_D = "_dollo_present_goid_d"; + public final static String PARSIMONY_OUTPUT_DOLLO_PRESENT_HTML_D = "_dollo_present_d.html"; + public final static String DOMAINS_PRESENT_NEXUS = "_dom.nex"; + public final static String BDC_PRESENT_NEXUS = "_dc.nex"; + // --- + public final static String PRG_NAME = "surfacing"; + public static final String DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO = "_d_dollo" + + ForesterConstants.PHYLO_XML_SUFFIX; + public static final String 
DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH = "_d_fitch" + + ForesterConstants.PHYLO_XML_SUFFIX; + public static final String BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO = "_dc_dollo" + + ForesterConstants.PHYLO_XML_SUFFIX; + public static final String BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH = "_dc_fitch" + + ForesterConstants.PHYLO_XML_SUFFIX; + public static final String NEXUS_EXTERNAL_DOMAINS = "_dom.nex"; + public static final String NEXUS_EXTERNAL_DOMAIN_COMBINATIONS = "_dc.nex"; + public static final String NEXUS_SECONDARY_FEATURES = "_secondary_features.nex"; + public static final String PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_SECONDARY_FEATURES = "_dollo_gl_secondary_features"; + public static final String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_SECONDARY_FEATURES = "_dollo_glc_secondary_features"; + public static final String PARSIMONY_OUTPUT_DOLLO_GAINS_SECONDARY_FEATURES = "_dollo_gains_secondary_features"; + public static final String PARSIMONY_OUTPUT_DOLLO_LOSSES_SECONDARY_FEATURES = "_dollo_losses_secondary_features"; + public static final String PARSIMONY_OUTPUT_DOLLO_PRESENT_SECONDARY_FEATURES = "_dollo_present_secondary_features"; + public static final String SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO = "_secondary_features_dollo" + + ForesterConstants.PHYLO_XML_SUFFIX; + public static final String PARSIMONY_OUTPUT_DOLLO_ALL_GOID_D_BIOLOGICAL_PROCESS = "_dollo_biol_proc_goid_d"; + public static final String PARSIMONY_OUTPUT_DOLLO_ALL_GOID_D_CELLULAR_COMPONENT = "_dollo_cell_comp_goid_d"; + public static final String PARSIMONY_OUTPUT_DOLLO_ALL_GOID_D_MOLECULAR_FUNCTION = "_dollo_mol_funct_goid_d"; + public static final String PARSIMONY_OUTPUT_DOLLO_ALL_GOID_D_ALL_NAMESPACES = "_dollo_goid_d"; + public static final String PARSIMONY_OUTPUT_FITCH_ALL_GOID_BC_BIOLOGICAL_PROCESS = "_fitch_biol_proc_goid_dc"; + public static final String PARSIMONY_OUTPUT_FITCH_ALL_GOID_BC_CELLULAR_COMPONENT = 
"_fitch_cell_comp_goid_dc"; + public static final String PARSIMONY_OUTPUT_FITCH_ALL_GOID_BC_MOLECULAR_FUNCTION = "_fitch_mol_funct_goid_dc"; + public static final String PARSIMONY_OUTPUT_FITCH_ALL_GOID_BC_ALL_NAMESPACES = "_fitch_goid_dc"; + final static private String HELP_OPTION_1 = "help"; + final static private String HELP_OPTION_2 = "h"; + final static private String OUTPUT_DIR_OPTION = "out_dir"; + final static private String SCORING_OPTION = "scoring"; + private static final DomainSimilarityScoring SCORING_DEFAULT = DomainSimilarity.DomainSimilarityScoring.COMBINATIONS; + final static private String SCORING_DOMAIN_COUNT_BASED = "domains"; + final static private String SCORING_PROTEIN_COUNT_BASED = "proteins"; + final static private String SCORING_COMBINATION_BASED = "combinations"; + final static private String DETAILEDNESS_OPTION = "detail"; + private final static Detailedness DETAILEDNESS_DEFAULT = DomainSimilarityCalculator.Detailedness.PUNCTILIOUS; + final static private String SPECIES_MATRIX_OPTION = "smatrix"; + final static private String DETAILEDNESS_BASIC = "basic"; + final static private String DETAILEDNESS_LIST_IDS = "list_ids"; + final static private String DETAILEDNESS_PUNCTILIOUS = "punctilious"; + final static private String DOMAIN_SIMILARITY_SORT_OPTION = "sort"; + private static final DomainSimilaritySortField DOMAIN_SORT_FILD_DEFAULT = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + final static private String DOMAIN_SIMILARITY_SORT_MIN = "min"; + final static private String DOMAIN_SIMILARITY_SORT_MAX = "max"; + final static private String DOMAIN_SIMILARITY_SORT_SD = "sd"; + final static private String DOMAIN_SIMILARITY_SORT_MEAN = "mean"; + final static private String DOMAIN_SIMILARITY_SORT_DIFF = "diff"; + final static private String DOMAIN_SIMILARITY_SORT_COUNTS_DIFF = "count_diff"; + final static private String DOMAIN_SIMILARITY_SORT_ABS_COUNTS_DIFF = "abs_count_diff"; + final static private String 
DOMAIN_SIMILARITY_SORT_SPECIES_COUNT = "species"; + final static private String DOMAIN_SIMILARITY_SORT_ALPHA = "alpha"; + final static private String DOMAIN_SIMILARITY_SORT_BY_SPECIES_COUNT_FIRST_OPTION = "species_first"; + final static private String DOMAIN_COUNT_SORT_OPTION = "dc_sort"; + private static final GenomeWideCombinableDomainsSortOrder DOMAINS_SORT_ORDER_DEFAULT = GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder.ALPHABETICAL_KEY_ID; + final static private String DOMAIN_COUNT_SORT_ALPHA = "alpha"; + final static private String DOMAIN_COUNT_SORT_KEY_DOMAIN_COUNT = "dom"; + final static private String DOMAIN_COUNT_SORT_KEY_DOMAIN_PROTEINS_COUNT = "prot"; + final static private String DOMAIN_COUNT_SORT_COMBINATIONS_COUNT = "comb"; + final static private String CUTOFF_SCORE_FILE_OPTION = "cos"; + final static private String NOT_IGNORE_DUFS_OPTION = "dufs"; + final static private String MAX_E_VALUE_OPTION = "e"; + final static private String MAX_ALLOWED_OVERLAP_OPTION = "mo"; + final static private String NO_ENGULFING_OVERLAP_OPTION = "no_eo"; + final static private String IGNORE_COMBINATION_WITH_SAME_OPTION = "ignore_self_comb"; + final static private String PAIRWISE_DOMAIN_COMPARISONS_PREFIX = "pwc_"; + final static private String PAIRWISE_DOMAIN_COMPARISONS_OPTION = "pwc"; + final static private String OUTPUT_FILE_OPTION = "o"; + final static private String PFAM_TO_GO_FILE_USE_OPTION = "p2g"; + final static private String GO_OBO_FILE_USE_OPTION = "obo"; + final static private String GO_NAMESPACE_LIMIT_OPTION = "go_namespace"; + final static private String GO_NAMESPACE_LIMIT_OPTION_MOLECULAR_FUNCTION = "molecular_function"; + final static private String GO_NAMESPACE_LIMIT_OPTION_BIOLOGICAL_PROCESS = "biological_process"; + final static private String GO_NAMESPACE_LIMIT_OPTION_CELLULAR_COMPONENT = "cellular_component"; + final static private String SECONDARY_FEATURES_PARSIMONY_MAP_FILE = "secondary"; + final static private String 
DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_TAB_DELIMITED = "simple_tab"; + final static private String DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_HTML = "simple_html"; + final static private String DOMAIN_SIMILARITY_PRINT_OPTION_DETAILED_HTML = "detailed_html"; + final static private String DOMAIN_SIMILARITY_PRINT_OPTION = "ds_output"; + private static final PRINT_OPTION DOMAIN_SIMILARITY_PRINT_OPTION_DEFAULT = PrintableDomainSimilarity.PRINT_OPTION.HTML; + final static private String IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_OPTION = "ignore_singlet_domains"; + final static private String IGNORE_VIRAL_IDS = "ignore_viral_ids"; + final static private boolean IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_DEFAULT = false; + final static private String IGNORE_DOMAINS_SPECIFIC_TO_ONE_SPECIES_OPTION = "ignore_species_specific_domains"; + final static private boolean IGNORE_DOMAINS_SPECIFIC_TO_ONE_SPECIES_OPTION_DEFAULT = false; + final static private String MATRIX_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX = "_mean_score.pwd"; + final static private String MATRIX_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX = "_domains.pwd"; + final static private String MATRIX_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX = "_bin_combinations.pwd"; + final static private String NJ_TREE_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX = "_mean_score_NJ" + + ForesterConstants.PHYLO_XML_SUFFIX; + final static private String NJ_TREE_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX = "_domains_NJ" + + ForesterConstants.PHYLO_XML_SUFFIX; + final static private String NJ_TREE_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX = "_bin_combinations_NJ" + + ForesterConstants.PHYLO_XML_SUFFIX; + final static private String DISPLAY_M_HISTOGRAMS_OPTION = "mhisto"; + // final static private boolean DISPLAY_M_HISTOGRAMS_OPTION_DEFAULT = false; + final static private String JACKNIFE_OPTION = "jack"; + final static private String JACKNIFE_RANDOM_SEED_OPTION = "seed"; + final static private String 
JACKNIFE_RATIO_OPTION = "jack_ratio"; + private static final int JACKNIFE_NUMBER_OF_RESAMPLINGS_DEFAULT = 100; + final static private long JACKNIFE_RANDOM_SEED_DEFAULT = 19; + final static private double JACKNIFE_RATIO_DEFAULT = 0.5; + //final static private String INFER_SPECIES_TREES_OPTION = "species_tree_inference"; + final static private String INFERRED_SD_BASED_NJ_SPECIES_TREE_SUFFIX = "_sd_nj.nh"; + final static private String INFERRED_SBC_BASED_NJ_SPECIES_TREE_SUFFIX = "_sbc_nj.nh"; + final static private String FILTER_POSITIVE_OPTION = "pos_filter"; + final static private String FILTER_NEGATIVE_OPTION = "neg_filter"; + final static private String FILTER_NEGATIVE_DOMAINS_OPTION = "neg_dom_filter"; + final static private String INPUT_FILES_FROM_FILE_OPTION = "input"; + final static private String INPUT_SPECIES_TREE_OPTION = "species_tree"; + final static private String SEQ_EXTRACT_OPTION = "prot_extract"; + final static private char SEPARATOR_FOR_INPUT_VALUES = '#'; + final static private String PRG_VERSION = "1.00"; + final static private String PRG_DATE = "2009.07.06"; + final static private String E_MAIL = "czmasek@burnham.org"; + final static private String WWW = "www.phylosoft.org/forester/applications/surfacing"; + final static private boolean IGNORE_DUFS_DEFAULT = true; + final static private boolean IGNORE_COMBINATION_WITH_SAME_DEFAULLT = false; + final static private double MAX_E_VALUE_DEFAULT = -1; + final static private int MAX_ALLOWED_OVERLAP_DEFAULT = -1; + final static private String DEFAULT_SEARCH_PARAMETER = "ls"; + final private static boolean ALLOW_NON_UNIQUE_QUERY_IN_HMMPFAM_OUTPUT_DEFAULT = true; + final private static boolean VERBOSE_DEFAULT = true; + private static final String RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION = "random_seed"; + private static final String CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS = "consider_bdc_direction"; + private static final String CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS_AND_ADJACENCY = "consider_bdc_adj"; + 
private static final String SEQ_EXTRACT_SUFFIX = ".prot"; + private static final String PLUS_MINUS_ANALYSIS_OPTION = "plus_minus"; + private static final String PLUS_MINUS_DOM_SUFFIX = "_plus_minus_dom.txt"; + private static final String PLUS_MINUS_DOM_SUFFIX_HTML = "_plus_minus_dom.html"; + private static final String PLUS_MINUS_DC_SUFFIX_HTML = "_plus_minus_dc.html"; + private static final int PLUS_MINUS_ANALYSIS_MIN_DIFF_DEFAULT = 0; + private static final double PLUS_MINUS_ANALYSIS_FACTOR_DEFAULT = 1.0; + private static final String PLUS_MINUS_ALL_GO_IDS_DOM_SUFFIX = "_plus_minus_go_ids_all.txt"; + private static final String PLUS_MINUS_PASSING_GO_IDS_DOM_SUFFIX = "_plus_minus_go_ids_passing.txt"; + private static final String OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS = "all_prot"; + private static final boolean VERBOSE = false; + private static final String OUTPUT_DOMAIN_COMBINATIONS_GAINED_MORE_THAN_ONCE_ANALYSIS_SUFFIX = "_fitch_dc_gains_counts"; + private static final String OUTPUT_DOMAIN_COMBINATIONS_LOST_MORE_THAN_ONCE_ANALYSIS_SUFFIX = "_fitch_dc_losses_counts"; + private static final String DOMAIN_LENGTHS_ANALYSIS_SUFFIX = "_domain_lengths_analysis"; + private static final boolean PERFORM_DOMAIN_LENGTH_ANALYSIS = true; + public static final String ALL_PFAMS_ENCOUNTERED_SUFFIX = "_all_encountered_pfams"; + public static final String ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX = "_all_encountered_pfams_with_go_annotation"; + public static final String ENCOUNTERED_PFAMS_SUMMARY_SUFFIX = "_encountered_pfams_summary"; + public static final String ALL_PFAMS_GAINED_AS_DOMAINS_SUFFIX = "_all_pfams_gained_as_domains"; + public static final String ALL_PFAMS_LOST_AS_DOMAINS_SUFFIX = "_all_pfams_lost_as_domains"; + public static final String ALL_PFAMS_GAINED_AS_DC_SUFFIX = "_all_pfams_gained_as_dc"; + public static final String ALL_PFAMS_LOST_AS_DC_SUFFIX = "_all_pfams_lost_as_dc"; + public static final String BASE_DIRECTORY_PER_NODE_DOMAIN_GAIN_LOSS_FILES = 
"PER_NODE_EVENTS"; + public static final String BASE_DIRECTORY_PER_SUBTREE_DOMAIN_GAIN_LOSS_FILES = "PER_SUBTREE_EVENTS"; + public static final String D_PROMISCUITY_FILE_SUFFIX = "_domain_promiscuities"; + + // final String error = ForesterUtil.isReadableFile( new File( + // input_file_properties[ i ][ 0 ] ) ); + // if ( !ForesterUtil.isEmpty( error ) ) { + // ForesterUtil.fatalError( surfacing.PRG_NAME, error ); + // } + private static void checkWriteabilityForPairwiseComparisons( final PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option, + final String[][] input_file_properties, + final String automated_pairwise_comparison_suffix, + final File outdir ) { + for( int i = 0; i < input_file_properties.length; ++i ) { + for( int j = 0; j < i; ++j ) { + final String species_i = input_file_properties[ i ][ 1 ]; + final String species_j = input_file_properties[ j ][ 1 ]; + String pairwise_similarities_output_file_str = PAIRWISE_DOMAIN_COMPARISONS_PREFIX + species_i + "_" + + species_j + automated_pairwise_comparison_suffix; + switch ( domain_similarity_print_option ) { + case HTML: + if ( !pairwise_similarities_output_file_str.endsWith( ".html" ) ) { + pairwise_similarities_output_file_str += ".html"; + } + break; + } + final String error = ForesterUtil + .isWritableFile( new File( outdir == null ? 
pairwise_similarities_output_file_str : outdir + + ForesterUtil.FILE_SEPARATOR + pairwise_similarities_output_file_str ) ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, error ); + } + } + } + } + + private static StringBuilder createParametersAsString( final boolean ignore_dufs, + final double e_value_max, + final int max_allowed_overlap, + final boolean no_engulfing_overlaps, + final File cutoff_scores_file, + final BinaryDomainCombination.DomainCombinationType dc_type ) { + final StringBuilder parameters_sb = new StringBuilder(); + parameters_sb.append( "E-value: " + e_value_max ); + if ( cutoff_scores_file != null ) { + parameters_sb.append( ", Cutoff-scores-file: " + cutoff_scores_file ); + } + else { + parameters_sb.append( ", Cutoff-scores-file: not-set" ); + } + if ( max_allowed_overlap != surfacing_old.MAX_ALLOWED_OVERLAP_DEFAULT ) { + parameters_sb.append( ", Max-overlap: " + max_allowed_overlap ); + } + else { + parameters_sb.append( ", Max-overlap: not-set" ); + } + if ( no_engulfing_overlaps ) { + parameters_sb.append( ", Engulfing-overlaps: not-allowed" ); + } + else { + parameters_sb.append( ", Engulfing-overlaps: allowed" ); + } + if ( ignore_dufs ) { + parameters_sb.append( ", Ignore-dufs: true" ); + } + else { + parameters_sb.append( ", Ignore-dufs: false" ); + } + parameters_sb.append( ", DC type (if applicable): " + dc_type ); + return parameters_sb; + } + + /** + * Warning: This sideeffects 'all_bin_domain_combinations_encountered'! 
+ * + * + * @param output_file + * @param all_bin_domain_combinations_changed + * @param sum_of_all_domains_encountered + * @param all_bin_domain_combinations_encountered + * @param is_gains_analysis + * @throws IOException + */ + private static void executeFitchGainsAnalysis( final File output_file, + final List all_bin_domain_combinations_changed, + final int sum_of_all_domains_encountered, + final SortedSet all_bin_domain_combinations_encountered, + final boolean is_gains_analysis ) throws IOException { + SurfacingUtil.checkForOutputFileWriteability( output_file ); + final Writer out = ForesterUtil.createBufferedWriter( output_file ); + final SortedMap bdc_to_counts = ForesterUtil + .listToSortedCountsMap( all_bin_domain_combinations_changed ); + final SortedSet all_domains_in_combination_changed_more_than_once = new TreeSet(); + final SortedSet all_domains_in_combination_changed_only_once = new TreeSet(); + int above_one = 0; + int one = 0; + for( final Object bdc_object : bdc_to_counts.keySet() ) { + final BinaryDomainCombination bdc = ( BinaryDomainCombination ) bdc_object; + final int count = bdc_to_counts.get( bdc_object ); + if ( count < 1 ) { + ForesterUtil.unexpectedFatalError( PRG_NAME, "count < 1 " ); + } + out.write( bdc + "\t" + count + ForesterUtil.LINE_SEPARATOR ); + if ( count > 1 ) { + all_domains_in_combination_changed_more_than_once.add( bdc.getId0() ); + all_domains_in_combination_changed_more_than_once.add( bdc.getId1() ); + above_one++; + } + else if ( count == 1 ) { + all_domains_in_combination_changed_only_once.add( bdc.getId0() ); + all_domains_in_combination_changed_only_once.add( bdc.getId1() ); + one++; + } + } + final int all = all_bin_domain_combinations_encountered.size(); + int never_lost = -1; + if ( !is_gains_analysis ) { + all_bin_domain_combinations_encountered.removeAll( all_bin_domain_combinations_changed ); + never_lost = all_bin_domain_combinations_encountered.size(); + for( final BinaryDomainCombination bdc : 
all_bin_domain_combinations_encountered ) { + out.write( bdc + "\t" + "0" + ForesterUtil.LINE_SEPARATOR ); + } + } + if ( is_gains_analysis ) { + out.write( "Sum of all distinct domain combinations appearing once : " + one + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domain combinations appearing more than once : " + above_one + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domains in combinations apppearing only once : " + + all_domains_in_combination_changed_only_once.size() + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domains in combinations apppearing more than once: " + + all_domains_in_combination_changed_more_than_once.size() + ForesterUtil.LINE_SEPARATOR ); + } + else { + out.write( "Sum of all distinct domain combinations never lost : " + never_lost + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domain combinations lost once : " + one + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domain combinations lost more than once : " + above_one + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domains in combinations lost only once : " + + all_domains_in_combination_changed_only_once.size() + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domains in combinations lost more than once: " + + all_domains_in_combination_changed_more_than_once.size() + ForesterUtil.LINE_SEPARATOR ); + } + out.write( "All binary combinations : " + all + + ForesterUtil.LINE_SEPARATOR ); + out.write( "All domains : " + + sum_of_all_domains_encountered ); + out.close(); + ForesterUtil.programMessage( surfacing_old.PRG_NAME, + "Wrote fitch domain combination dynamics counts analysis to \"" + output_file + + "\"" ); + } + + private static void executePlusMinusAnalysis( final File output_file, + final List plus_minus_analysis_high_copy_base, + final List plus_minus_analysis_high_copy_target, + final List plus_minus_analysis_low_copy, + final List 
gwcd_list, + final SortedMap> protein_lists_per_species, + final Map> domain_id_to_go_ids_map, + final Map go_id_to_term_map, + final List plus_minus_analysis_numbers ) { + final Set all_spec = new HashSet(); + for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { + all_spec.add( gwcd.getSpecies().getSpeciesId() ); + } + final File html_out_dom = new File( output_file + PLUS_MINUS_DOM_SUFFIX_HTML ); + final File plain_out_dom = new File( output_file + PLUS_MINUS_DOM_SUFFIX ); + final File html_out_dc = new File( output_file + PLUS_MINUS_DC_SUFFIX_HTML ); + final File all_domains_go_ids_out_dom = new File( output_file + PLUS_MINUS_ALL_GO_IDS_DOM_SUFFIX ); + final File passing_domains_go_ids_out_dom = new File( output_file + PLUS_MINUS_PASSING_GO_IDS_DOM_SUFFIX ); + final File proteins_file_base = new File( output_file + "" ); + final int min_diff = ( ( Integer ) plus_minus_analysis_numbers.get( 0 ) ).intValue(); + final double factor = ( ( Double ) plus_minus_analysis_numbers.get( 1 ) ).doubleValue(); + try { + DomainCountsDifferenceUtil.calculateCopyNumberDifferences( gwcd_list, + protein_lists_per_species, + plus_minus_analysis_high_copy_base, + plus_minus_analysis_high_copy_target, + plus_minus_analysis_low_copy, + min_diff, + factor, + plain_out_dom, + html_out_dom, + html_out_dc, + domain_id_to_go_ids_map, + go_id_to_term_map, + all_domains_go_ids_out_dom, + passing_domains_go_ids_out_dom, + proteins_file_base ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getLocalizedMessage() ); + } + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote plus minus domain analysis results to \"" + + html_out_dom + "\"" ); + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote plus minus domain analysis results to \"" + + plain_out_dom + "\"" ); + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote plus minus domain analysis results to \"" + + html_out_dc + "\"" ); + ForesterUtil.programMessage( 
surfacing_old.PRG_NAME, + "Wrote plus minus domain analysis based passing GO ids to \"" + + passing_domains_go_ids_out_dom + "\"" ); + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote plus minus domain analysis based all GO ids to \"" + + all_domains_go_ids_out_dom + "\"" ); + } + + private static Phylogeny[] getIntrees( final File[] intree_files, + final int number_of_genomes, + final String[][] input_file_properties ) { + final Phylogeny[] intrees = new Phylogeny[ intree_files.length ]; + int i = 0; + for( final File intree_file : intree_files ) { + Phylogeny intree = null; + final String error = ForesterUtil.isReadableFile( intree_file ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "cannot read input tree file [" + intree_file + "]: " + + error ); + } + try { + final Phylogeny[] p_array = ParserBasedPhylogenyFactory.getInstance() + .create( intree_file, ForesterUtil.createParserDependingOnFileType( intree_file, true ) ); + if ( p_array.length < 1 ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "file [" + intree_file + + "] does not contain any phylogeny in phyloXML format" ); + } + else if ( p_array.length > 1 ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "file [" + intree_file + + "] contains more than one phylogeny in phyloXML format" ); + } + intree = p_array[ 0 ]; + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "failed to read input tree from file [" + intree_file + + "]: " + error ); + } + if ( ( intree == null ) || intree.isEmpty() ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "input tree [" + intree_file + "] is empty" ); + } + if ( !intree.isRooted() ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "input tree [" + intree_file + "] is not rooted" ); + } + if ( intree.getNumberOfExternalNodes() < number_of_genomes ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "number of external nodes [" + + 
intree.getNumberOfExternalNodes() + "] of input tree [" + intree_file + + "] is smaller than the number of genomes the be analyzed [" + number_of_genomes + "]" ); + } + final StringBuilder parent_names = new StringBuilder(); + final int nodes_lacking_name = SurfacingUtil.getNumberOfNodesLackingName( intree, parent_names ); + if ( nodes_lacking_name > 0 ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "input tree [" + intree_file + "] has " + + nodes_lacking_name + " node(s) lacking a name [parent names:" + parent_names + "]" ); + } + preparePhylogenyForParsimonyAnalyses( intree, input_file_properties ); + if ( !intree.isCompletelyBinary() ) { + ForesterUtil.printWarningMessage( surfacing_old.PRG_NAME, "input tree [" + intree_file + + "] is not completely binary" ); + } + intrees[ i++ ] = intree; + } + return intrees; + } + + private static List inferSpeciesTrees( final File outfile, final List distances_list ) { + final NeighborJoining nj = NeighborJoining.createInstance(); + final List phylogenies = nj.execute( distances_list ); + final PhylogenyWriter w = new PhylogenyWriter(); + try { + w.toNewHampshire( phylogenies, true, true, outfile, ";" ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( PRG_NAME, "failed to write to outfile [" + outfile + "]: " + e.getMessage() ); + } + return phylogenies; + } + + public static void main( final String args[] ) { + final long start_time = new Date().getTime(); + // final StringBuffer log = new StringBuffer(); + final StringBuilder html_desc = new StringBuilder(); + ForesterUtil.printProgramInformation( surfacing_old.PRG_NAME, + surfacing_old.PRG_VERSION, + surfacing_old.PRG_DATE, + surfacing_old.E_MAIL, + surfacing_old.WWW ); + final String nl = ForesterUtil.LINE_SEPARATOR; + html_desc.append( "" + nl ); + html_desc.append( "" + nl ); + html_desc.append( "" + nl ); + html_desc.append( "" + nl ); + html_desc.append( "" + nl ); + html_desc.append( "" + nl ); + CommandLineArguments cla = null; + try { + 
cla = new CommandLineArguments( args ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() ); + } + if ( cla.isOptionSet( surfacing_old.HELP_OPTION_1 ) || cla.isOptionSet( surfacing_old.HELP_OPTION_2 ) ) { + surfacing_old.printHelp(); + System.exit( 0 ); + } + if ( ( args.length < 1 ) ) { + surfacing_old.printHelp(); + System.exit( -1 ); + } + final List allowed_options = new ArrayList(); + allowed_options.add( surfacing_old.NOT_IGNORE_DUFS_OPTION ); + allowed_options.add( surfacing_old.MAX_E_VALUE_OPTION ); + allowed_options.add( surfacing_old.DETAILEDNESS_OPTION ); + allowed_options.add( surfacing_old.OUTPUT_FILE_OPTION ); + allowed_options.add( surfacing_old.DOMAIN_SIMILARITY_SORT_OPTION ); + allowed_options.add( surfacing_old.SPECIES_MATRIX_OPTION ); + allowed_options.add( surfacing_old.SCORING_OPTION ); + allowed_options.add( surfacing_old.MAX_ALLOWED_OVERLAP_OPTION ); + allowed_options.add( surfacing_old.NO_ENGULFING_OVERLAP_OPTION ); + allowed_options.add( surfacing_old.DOMAIN_COUNT_SORT_OPTION ); + allowed_options.add( surfacing_old.CUTOFF_SCORE_FILE_OPTION ); + allowed_options.add( surfacing_old.DOMAIN_SIMILARITY_SORT_BY_SPECIES_COUNT_FIRST_OPTION ); + allowed_options.add( surfacing_old.OUTPUT_DIR_OPTION ); + allowed_options.add( surfacing_old.IGNORE_COMBINATION_WITH_SAME_OPTION ); + allowed_options.add( surfacing_old.PFAM_TO_GO_FILE_USE_OPTION ); + allowed_options.add( surfacing_old.GO_OBO_FILE_USE_OPTION ); + allowed_options.add( surfacing_old.DOMAIN_SIMILARITY_PRINT_OPTION ); + allowed_options.add( surfacing_old.GO_NAMESPACE_LIMIT_OPTION ); + allowed_options.add( surfacing_old.PAIRWISE_DOMAIN_COMPARISONS_OPTION ); + allowed_options.add( surfacing_old.IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_OPTION ); + allowed_options.add( surfacing_old.DISPLAY_M_HISTOGRAMS_OPTION ); + allowed_options.add( surfacing_old.CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS ); + allowed_options.add( JACKNIFE_OPTION ); + 
allowed_options.add( JACKNIFE_RANDOM_SEED_OPTION ); + allowed_options.add( JACKNIFE_RATIO_OPTION ); + allowed_options.add( INPUT_SPECIES_TREE_OPTION ); + //allowed_options.add( INFER_SPECIES_TREES_OPTION ); + allowed_options.add( FILTER_POSITIVE_OPTION ); + allowed_options.add( FILTER_NEGATIVE_OPTION ); + allowed_options.add( INPUT_FILES_FROM_FILE_OPTION ); + allowed_options.add( RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION ); + allowed_options.add( FILTER_NEGATIVE_DOMAINS_OPTION ); + allowed_options.add( IGNORE_VIRAL_IDS ); + allowed_options.add( SEQ_EXTRACT_OPTION ); + allowed_options.add( SECONDARY_FEATURES_PARSIMONY_MAP_FILE ); + allowed_options.add( PLUS_MINUS_ANALYSIS_OPTION ); + allowed_options.add( DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS ); + allowed_options.add( OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS ); + allowed_options.add( CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS_AND_ADJACENCY ); + boolean ignore_dufs = surfacing_old.IGNORE_DUFS_DEFAULT; + boolean ignore_combination_with_same = surfacing_old.IGNORE_COMBINATION_WITH_SAME_DEFAULLT; + double e_value_max = surfacing_old.MAX_E_VALUE_DEFAULT; + int max_allowed_overlap = surfacing_old.MAX_ALLOWED_OVERLAP_DEFAULT; + final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); + if ( dissallowed_options.length() > 0 ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "unknown option(s): " + dissallowed_options ); + } + boolean output_binary_domain_combinationsfor_graph_analysis = false; + if ( cla.isOptionSet( DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS ) ) { + output_binary_domain_combinationsfor_graph_analysis = true; + } + if ( cla.isOptionSet( surfacing_old.MAX_E_VALUE_OPTION ) ) { + try { + e_value_max = cla.getOptionValueAsDouble( surfacing_old.MAX_E_VALUE_OPTION ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "no acceptable value for E-value maximum" ); + } + } + if ( cla.isOptionSet( 
surfacing_old.MAX_ALLOWED_OVERLAP_OPTION ) ) { + try { + max_allowed_overlap = cla.getOptionValueAsInt( surfacing_old.MAX_ALLOWED_OVERLAP_OPTION ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, + "no acceptable value for maximal allowed domain overlap" ); + } + } + boolean no_engulfing_overlaps = false; + if ( cla.isOptionSet( surfacing_old.NO_ENGULFING_OVERLAP_OPTION ) ) { + no_engulfing_overlaps = true; + } + boolean ignore_virus_like_ids = false; + if ( cla.isOptionSet( surfacing_old.IGNORE_VIRAL_IDS ) ) { + ignore_virus_like_ids = true; + } + if ( cla.isOptionSet( surfacing_old.NOT_IGNORE_DUFS_OPTION ) ) { + ignore_dufs = false; + } + if ( cla.isOptionSet( surfacing_old.IGNORE_COMBINATION_WITH_SAME_OPTION ) ) { + ignore_combination_with_same = true; + } + boolean ignore_domains_without_combs_in_all_spec = IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_DEFAULT; + if ( cla.isOptionSet( surfacing_old.IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_OPTION ) ) { + ignore_domains_without_combs_in_all_spec = true; + } + boolean ignore_species_specific_domains = IGNORE_DOMAINS_SPECIFIC_TO_ONE_SPECIES_OPTION_DEFAULT; + if ( cla.isOptionSet( surfacing_old.IGNORE_DOMAINS_SPECIFIC_TO_ONE_SPECIES_OPTION ) ) { + ignore_species_specific_domains = true; + } + File output_file = null; + if ( cla.isOptionSet( surfacing_old.OUTPUT_FILE_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_old.OUTPUT_FILE_OPTION ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, + "no value for domain combinations similarities output file: -" + + surfacing_old.OUTPUT_FILE_OPTION + "=" ); + } + output_file = new File( cla.getOptionValue( surfacing_old.OUTPUT_FILE_OPTION ) ); + SurfacingUtil.checkForOutputFileWriteability( output_file ); + } + File cutoff_scores_file = null; + Map individual_domain_score_cutoffs = null; + if ( cla.isOptionSet( surfacing_old.CUTOFF_SCORE_FILE_OPTION ) ) { + if ( !cla.isOptionValueSet( 
surfacing_old.CUTOFF_SCORE_FILE_OPTION ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "no value for individual domain score cutoffs file: -" + + surfacing_old.CUTOFF_SCORE_FILE_OPTION + "=" ); + } + cutoff_scores_file = new File( cla.getOptionValue( surfacing_old.CUTOFF_SCORE_FILE_OPTION ) ); + final String error = ForesterUtil.isReadableFile( cutoff_scores_file ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "cannot read individual domain score cutoffs file: " + + error ); + } + try { + final BasicTable scores_table = BasicTableParser.parse( cutoff_scores_file, " " ); + individual_domain_score_cutoffs = scores_table.getColumnsAsMap( 0, 1 ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, + "cannot read from individual domain score cutoffs file: " + e ); + } + } + BinaryDomainCombination.DomainCombinationType dc_type = BinaryDomainCombination.DomainCombinationType.BASIC; + if ( cla.isOptionSet( surfacing_old.CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS ) ) { + dc_type = BinaryDomainCombination.DomainCombinationType.DIRECTED; + } + if ( cla.isOptionSet( surfacing_old.CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS_AND_ADJACENCY ) ) { + dc_type = BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT; + } + File out_dir = null; + if ( cla.isOptionSet( surfacing_old.OUTPUT_DIR_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_old.OUTPUT_DIR_OPTION ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "no value for output directory: -" + + surfacing_old.OUTPUT_DIR_OPTION + "=" ); + } + out_dir = new File( cla.getOptionValue( surfacing_old.OUTPUT_DIR_OPTION ) ); + if ( out_dir.exists() && ( out_dir.listFiles().length > 0 ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "\"" + out_dir + "\" aready exists and is not empty" ); + } + if ( !out_dir.exists() ) { + final boolean success = out_dir.mkdir(); + if ( !success || !out_dir.exists() ) { + 
ForesterUtil.fatalError( surfacing_old.PRG_NAME, "failed to create \"" + out_dir + "\"" ); + } + } + if ( !out_dir.canWrite() ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "cannot write to \"" + out_dir + "\"" ); + } + } + File positive_filter_file = null; + File negative_filter_file = null; + File negative_domains_filter_file = null; + if ( cla.isOptionSet( surfacing_old.FILTER_NEGATIVE_OPTION ) + && cla.isOptionSet( surfacing_old.FILTER_POSITIVE_OPTION ) ) { + ForesterUtil + .fatalError( surfacing_old.PRG_NAME, "attempt to use both negative and positive protein filter" ); + } + if ( cla.isOptionSet( surfacing_old.FILTER_NEGATIVE_DOMAINS_OPTION ) + && ( cla.isOptionSet( surfacing_old.FILTER_NEGATIVE_OPTION ) || cla + .isOptionSet( surfacing_old.FILTER_POSITIVE_OPTION ) ) ) { + ForesterUtil + .fatalError( surfacing_old.PRG_NAME, + "attempt to use both negative or positive protein filter together wirh a negative domains filter" ); + } + if ( cla.isOptionSet( surfacing_old.FILTER_NEGATIVE_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_old.FILTER_NEGATIVE_OPTION ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "no value for negative filter: -" + + surfacing_old.FILTER_NEGATIVE_OPTION + "=" ); + } + negative_filter_file = new File( cla.getOptionValue( surfacing_old.FILTER_NEGATIVE_OPTION ) ); + final String msg = ForesterUtil.isReadableFile( negative_filter_file ); + if ( !ForesterUtil.isEmpty( msg ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "can not read from \"" + negative_filter_file + "\": " + + msg ); + } + } + else if ( cla.isOptionSet( surfacing_old.FILTER_POSITIVE_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_old.FILTER_POSITIVE_OPTION ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "no value for positive filter: -" + + surfacing_old.FILTER_POSITIVE_OPTION + "=" ); + } + positive_filter_file = new File( cla.getOptionValue( surfacing_old.FILTER_POSITIVE_OPTION ) ); + final String msg = 
ForesterUtil.isReadableFile( positive_filter_file ); + if ( !ForesterUtil.isEmpty( msg ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "can not read from \"" + positive_filter_file + "\": " + + msg ); + } + } + else if ( cla.isOptionSet( surfacing_old.FILTER_NEGATIVE_DOMAINS_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_old.FILTER_NEGATIVE_DOMAINS_OPTION ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "no value for negative domains filter: -" + + surfacing_old.FILTER_NEGATIVE_DOMAINS_OPTION + "=" ); + } + negative_domains_filter_file = new File( cla.getOptionValue( surfacing_old.FILTER_NEGATIVE_DOMAINS_OPTION ) ); + final String msg = ForesterUtil.isReadableFile( negative_domains_filter_file ); + if ( !ForesterUtil.isEmpty( msg ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "can not read from \"" + negative_domains_filter_file + + "\": " + msg ); + } + } + final List plus_minus_analysis_high_copy_base_species = new ArrayList(); + final List plus_minus_analysis_high_copy_target_species = new ArrayList(); + final List plus_minus_analysis_high_low_copy_species = new ArrayList(); + final List plus_minus_analysis_numbers = new ArrayList(); + processPlusMinusAnalysisOption( cla, + plus_minus_analysis_high_copy_base_species, + plus_minus_analysis_high_copy_target_species, + plus_minus_analysis_high_low_copy_species, + plus_minus_analysis_numbers ); + File input_files_file = null; + String[] input_file_names_from_file = null; + if ( cla.isOptionSet( surfacing_old.INPUT_FILES_FROM_FILE_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_old.INPUT_FILES_FROM_FILE_OPTION ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "no value for input files file: -" + + surfacing_old.INPUT_FILES_FROM_FILE_OPTION + "=" ); + } + input_files_file = new File( cla.getOptionValue( surfacing_old.INPUT_FILES_FROM_FILE_OPTION ) ); + final String msg = ForesterUtil.isReadableFile( input_files_file ); + if ( !ForesterUtil.isEmpty( msg ) ) { + 
ForesterUtil.fatalError( surfacing_old.PRG_NAME, "can not read from \"" + input_files_file + "\": " + + msg ); + } + try { + input_file_names_from_file = ForesterUtil.file2array( input_files_file ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "failed to read from \"" + input_files_file + "\": " + + e ); + } + } + if ( ( cla.getNumberOfNames() < 1 ) + && ( ( input_file_names_from_file == null ) || ( input_file_names_from_file.length < 1 ) ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, + "No hmmpfam output file indicated is input: use comand line directly or " + + surfacing_old.INPUT_FILES_FROM_FILE_OPTION + "=" ); + } + DomainSimilarity.DomainSimilarityScoring scoring = SCORING_DEFAULT; + if ( cla.isOptionSet( surfacing_old.SCORING_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_old.SCORING_OPTION ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, + "no value for scoring method for domain combinations similarity calculation: -" + + surfacing_old.SCORING_OPTION + "=<" + + surfacing_old.SCORING_DOMAIN_COUNT_BASED + "|" + + surfacing_old.SCORING_PROTEIN_COUNT_BASED + "|" + + surfacing_old.SCORING_COMBINATION_BASED + ">\"" ); + } + final String scoring_str = cla.getOptionValue( surfacing_old.SCORING_OPTION ); + if ( scoring_str.equals( surfacing_old.SCORING_DOMAIN_COUNT_BASED ) ) { + scoring = DomainSimilarity.DomainSimilarityScoring.DOMAINS; + } + else if ( scoring_str.equals( surfacing_old.SCORING_COMBINATION_BASED ) ) { + scoring = DomainSimilarity.DomainSimilarityScoring.COMBINATIONS; + } + else if ( scoring_str.equals( surfacing_old.SCORING_PROTEIN_COUNT_BASED ) ) { + scoring = DomainSimilarity.DomainSimilarityScoring.PROTEINS; + } + else { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "unknown value \"" + scoring_str + + "\" for scoring method for domain combinations similarity calculation: \"-" + + surfacing_old.SCORING_OPTION + "=<" + surfacing_old.SCORING_DOMAIN_COUNT_BASED + "|" + + 
surfacing_old.SCORING_PROTEIN_COUNT_BASED + "|" + surfacing_old.SCORING_COMBINATION_BASED + + ">\"" ); + } + } + boolean sort_by_species_count_first = false; + if ( cla.isOptionSet( surfacing_old.DOMAIN_SIMILARITY_SORT_BY_SPECIES_COUNT_FIRST_OPTION ) ) { + sort_by_species_count_first = true; + } + boolean species_matrix = false; + if ( cla.isOptionSet( surfacing_old.SPECIES_MATRIX_OPTION ) ) { + species_matrix = true; + } + boolean output_protein_lists_for_all_domains = false; + if ( cla.isOptionSet( surfacing_old.OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS ) ) { + output_protein_lists_for_all_domains = true; + } + Detailedness detailedness = DETAILEDNESS_DEFAULT; + if ( cla.isOptionSet( surfacing_old.DETAILEDNESS_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_old.DETAILEDNESS_OPTION ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "no value for -" + surfacing_old.DETAILEDNESS_OPTION + + "=<" + surfacing_old.DETAILEDNESS_BASIC + "|" + surfacing_old.DETAILEDNESS_LIST_IDS + "|" + + surfacing_old.DETAILEDNESS_PUNCTILIOUS + ">\"" ); + } + final String detness = cla.getOptionValue( surfacing_old.DETAILEDNESS_OPTION ).toLowerCase(); + if ( detness.equals( surfacing_old.DETAILEDNESS_BASIC ) ) { + detailedness = DomainSimilarityCalculator.Detailedness.BASIC; + } + else if ( detness.equals( surfacing_old.DETAILEDNESS_LIST_IDS ) ) { + detailedness = DomainSimilarityCalculator.Detailedness.LIST_COMBINING_DOMAIN_FOR_EACH_SPECIES; + } + else if ( detness.equals( surfacing_old.DETAILEDNESS_PUNCTILIOUS ) ) { + detailedness = DomainSimilarityCalculator.Detailedness.PUNCTILIOUS; + } + else { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "unknown value \"" + detness + + "\" for detailedness: \"-" + surfacing_old.DETAILEDNESS_OPTION + "=<" + + surfacing_old.DETAILEDNESS_BASIC + "|" + surfacing_old.DETAILEDNESS_LIST_IDS + "|" + + surfacing_old.DETAILEDNESS_PUNCTILIOUS + ">\"" ); + } + } + String automated_pairwise_comparison_suffix = null; + boolean perform_pwc = false; + 
boolean write_pwc_files = false; + if ( cla.isOptionSet( surfacing_old.PAIRWISE_DOMAIN_COMPARISONS_OPTION ) ) { + perform_pwc = true; + if ( !cla.isOptionValueSet( surfacing_old.PAIRWISE_DOMAIN_COMPARISONS_OPTION ) ) { + write_pwc_files = false; + } + else { + write_pwc_files = true; + automated_pairwise_comparison_suffix = "_" + + cla.getOptionValue( surfacing_old.PAIRWISE_DOMAIN_COMPARISONS_OPTION ); + } + } + String query_domain_ids = null; + if ( cla.isOptionSet( surfacing_old.SEQ_EXTRACT_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_old.SEQ_EXTRACT_OPTION ) ) { + ForesterUtil + .fatalError( surfacing_old.PRG_NAME, + "no domain ids given for sequences with given domains to be extracted : -" + + surfacing_old.SEQ_EXTRACT_OPTION + + "=" ); + } + query_domain_ids = cla.getOptionValue( surfacing_old.SEQ_EXTRACT_OPTION ); + } + DomainSimilarity.DomainSimilaritySortField domain_similarity_sort_field = DOMAIN_SORT_FILD_DEFAULT; + DomainSimilarity.DomainSimilaritySortField domain_similarity_sort_field_for_automated_pwc = DOMAIN_SORT_FILD_DEFAULT; + if ( cla.isOptionSet( surfacing_old.DOMAIN_SIMILARITY_SORT_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_old.DOMAIN_SIMILARITY_SORT_OPTION ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, + "no value for domain combinations similarities sorting: -" + + surfacing_old.DOMAIN_SIMILARITY_SORT_OPTION + "=<" + + surfacing_old.DOMAIN_SIMILARITY_SORT_ALPHA + "|" + + surfacing_old.DOMAIN_SIMILARITY_SORT_MAX + "|" + + surfacing_old.DOMAIN_SIMILARITY_SORT_MIN + "|" + + surfacing_old.DOMAIN_SIMILARITY_SORT_MEAN + "|" + + surfacing_old.DOMAIN_SIMILARITY_SORT_DIFF + "|" + + surfacing_old.DOMAIN_SIMILARITY_SORT_ABS_COUNTS_DIFF + "|" + + surfacing_old.DOMAIN_SIMILARITY_SORT_COUNTS_DIFF + "|" + + surfacing_old.DOMAIN_SIMILARITY_SORT_SPECIES_COUNT + "|" + + surfacing_old.DOMAIN_SIMILARITY_SORT_SD + ">\"" ); + } + final String sort_str = cla.getOptionValue( surfacing_old.DOMAIN_SIMILARITY_SORT_OPTION ).toLowerCase(); + if 
( sort_str.equals( surfacing_old.DOMAIN_SIMILARITY_SORT_ALPHA ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + } + else if ( sort_str.equals( surfacing_old.DOMAIN_SIMILARITY_SORT_MAX ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.MAX; + domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + } + else if ( sort_str.equals( surfacing_old.DOMAIN_SIMILARITY_SORT_MIN ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.MIN; + domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + } + else if ( sort_str.equals( surfacing_old.DOMAIN_SIMILARITY_SORT_MEAN ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.MEAN; + domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.MEAN; + } + else if ( sort_str.equals( surfacing_old.DOMAIN_SIMILARITY_SORT_SPECIES_COUNT ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.SPECIES_COUNT; + domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + } + else if ( sort_str.equals( surfacing_old.DOMAIN_SIMILARITY_SORT_SD ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.SD; + domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + } + else if ( sort_str.equals( surfacing_old.DOMAIN_SIMILARITY_SORT_DIFF ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.MAX_DIFFERENCE; + domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.MAX_DIFFERENCE; + } + else if ( sort_str.equals( surfacing_old.DOMAIN_SIMILARITY_SORT_ABS_COUNTS_DIFF ) ) { + 
domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.ABS_MAX_COUNTS_DIFFERENCE; + domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.ABS_MAX_COUNTS_DIFFERENCE; + } + else if ( sort_str.equals( surfacing_old.DOMAIN_SIMILARITY_SORT_COUNTS_DIFF ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.MAX_COUNTS_DIFFERENCE; + domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.MAX_COUNTS_DIFFERENCE; + } + else { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "unknown value \"" + sort_str + + "\" for domain combinations similarities sorting: \"-" + + surfacing_old.DOMAIN_SIMILARITY_SORT_OPTION + "=<" + + surfacing_old.DOMAIN_SIMILARITY_SORT_ALPHA + "|" + surfacing_old.DOMAIN_SIMILARITY_SORT_MAX + + "|" + surfacing_old.DOMAIN_SIMILARITY_SORT_MIN + "|" + + surfacing_old.DOMAIN_SIMILARITY_SORT_MEAN + "|" + surfacing_old.DOMAIN_SIMILARITY_SORT_DIFF + + "|" + surfacing_old.DOMAIN_SIMILARITY_SORT_ABS_COUNTS_DIFF + "|" + + surfacing_old.DOMAIN_SIMILARITY_SORT_COUNTS_DIFF + "|" + "|" + + surfacing_old.DOMAIN_SIMILARITY_SORT_SPECIES_COUNT + "|" + + surfacing_old.DOMAIN_SIMILARITY_SORT_SD + ">\"" ); + } + } + PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option = DOMAIN_SIMILARITY_PRINT_OPTION_DEFAULT; + if ( cla.isOptionSet( surfacing_old.DOMAIN_SIMILARITY_PRINT_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_old.DOMAIN_SIMILARITY_PRINT_OPTION ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "no value for print option: -" + + surfacing_old.DOMAIN_SIMILARITY_PRINT_OPTION_DETAILED_HTML + "|" + + surfacing_old.DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_HTML + "|" + + surfacing_old.DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_TAB_DELIMITED + ">\"" ); + } + final String sort = cla.getOptionValue( surfacing_old.DOMAIN_SIMILARITY_PRINT_OPTION ).toLowerCase(); + if ( sort.equals( surfacing_old.DOMAIN_SIMILARITY_PRINT_OPTION_DETAILED_HTML ) 
) { + domain_similarity_print_option = PrintableDomainSimilarity.PRINT_OPTION.HTML; + } + else if ( sort.equals( surfacing_old.DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_HTML ) ) { + // domain_similarity_print_option = + // DomainSimilarity.PRINT_OPTION.SIMPLE_HTML; + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "simple HTML output not implemented yet :(" ); + } + else if ( sort.equals( surfacing_old.DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_TAB_DELIMITED ) ) { + domain_similarity_print_option = PrintableDomainSimilarity.PRINT_OPTION.SIMPLE_TAB_DELIMITED; + } + else { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "unknown value \"" + sort + "\" for print option: -" + + surfacing_old.DOMAIN_SIMILARITY_PRINT_OPTION_DETAILED_HTML + "|" + + surfacing_old.DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_HTML + "|" + + surfacing_old.DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_TAB_DELIMITED + ">\"" ); + } + } + GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder dc_sort_order = DOMAINS_SORT_ORDER_DEFAULT; + if ( cla.isOptionSet( surfacing_old.DOMAIN_COUNT_SORT_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_old.DOMAIN_COUNT_SORT_OPTION ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "no value for sorting of domain counts: -" + + surfacing_old.DOMAIN_COUNT_SORT_OPTION + "=<" + surfacing_old.DOMAIN_COUNT_SORT_ALPHA + "|" + + surfacing_old.DOMAIN_COUNT_SORT_KEY_DOMAIN_COUNT + "|" + + surfacing_old.DOMAIN_COUNT_SORT_KEY_DOMAIN_PROTEINS_COUNT + "|" + + surfacing_old.DOMAIN_COUNT_SORT_COMBINATIONS_COUNT + ">\"" ); + } + final String sort = cla.getOptionValue( surfacing_old.DOMAIN_COUNT_SORT_OPTION ).toLowerCase(); + if ( sort.equals( surfacing_old.DOMAIN_COUNT_SORT_ALPHA ) ) { + dc_sort_order = GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder.ALPHABETICAL_KEY_ID; + } + else if ( sort.equals( surfacing_old.DOMAIN_COUNT_SORT_KEY_DOMAIN_COUNT ) ) { + dc_sort_order = GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder.KEY_DOMAIN_COUNT; + } + 
else if ( sort.equals( surfacing_old.DOMAIN_COUNT_SORT_KEY_DOMAIN_PROTEINS_COUNT ) ) { + dc_sort_order = GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder.KEY_DOMAIN_PROTEINS_COUNT; + } + else if ( sort.equals( surfacing_old.DOMAIN_COUNT_SORT_COMBINATIONS_COUNT ) ) { + dc_sort_order = GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder.COMBINATIONS_COUNT; + } + else { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "unknown value \"" + sort + + "\" for sorting of domain counts: \"-" + surfacing_old.DOMAIN_COUNT_SORT_OPTION + "=<" + + surfacing_old.DOMAIN_COUNT_SORT_ALPHA + "|" + + surfacing_old.DOMAIN_COUNT_SORT_KEY_DOMAIN_COUNT + "|" + + surfacing_old.DOMAIN_COUNT_SORT_KEY_DOMAIN_PROTEINS_COUNT + "|" + + surfacing_old.DOMAIN_COUNT_SORT_COMBINATIONS_COUNT + ">\"" ); + } + } + String[][] input_file_properties = null; + if ( input_file_names_from_file != null ) { + input_file_properties = surfacing_old.processInputFileNames( input_file_names_from_file ); + } + else { + input_file_properties = surfacing_old.processInputFileNames( cla.getNames() ); + } + final int number_of_genomes = input_file_properties.length; + if ( number_of_genomes < 2 ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "cannot analyze less than two files" ); + } + if ( ( number_of_genomes < 3 ) && perform_pwc ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "cannot use : -" + + surfacing_old.PAIRWISE_DOMAIN_COMPARISONS_OPTION + + "= to turn on pairwise analyses with less than three input files" ); + } + checkWriteabilityForPairwiseComparisons( domain_similarity_print_option, + input_file_properties, + automated_pairwise_comparison_suffix, + out_dir ); + for( int i = 0; i < number_of_genomes; i++ ) { + File dcc_outfile = new File( input_file_properties[ i ][ 0 ] + + surfacing_old.DOMAIN_COMBINITON_COUNTS_OUTPUTFILE_SUFFIX ); + if ( out_dir != null ) { + dcc_outfile = new File( out_dir + ForesterUtil.FILE_SEPARATOR + dcc_outfile ); + } + 
SurfacingUtil.checkForOutputFileWriteability( dcc_outfile ); + } + File pfam_to_go_file = null; + Map> domain_id_to_go_ids_map = null; + int domain_id_to_go_ids_count = 0; + if ( cla.isOptionSet( surfacing_old.PFAM_TO_GO_FILE_USE_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_old.PFAM_TO_GO_FILE_USE_OPTION ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "no value for Pfam to GO mapping file: -" + + surfacing_old.PFAM_TO_GO_FILE_USE_OPTION + "=" ); + } + pfam_to_go_file = new File( cla.getOptionValue( surfacing_old.PFAM_TO_GO_FILE_USE_OPTION ) ); + final String error = ForesterUtil.isReadableFile( pfam_to_go_file ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "cannot read Pfam to GO mapping file: " + error ); + } + try { + final PfamToGoParser parser = new PfamToGoParser( pfam_to_go_file ); + final List pfam_to_go_mappings = parser.parse(); + domain_id_to_go_ids_map = SurfacingUtil.createDomainIdToGoIdMap( pfam_to_go_mappings ); + if ( parser.getMappingCount() < domain_id_to_go_ids_map.size() ) { + ForesterUtil.unexpectedFatalError( surfacing_old.PRG_NAME, + "parser.getMappingCount() < domain_id_to_go_ids_map.size()" ); + } + domain_id_to_go_ids_count = parser.getMappingCount(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "cannot read from Pfam to GO mapping file: " + e ); + } + } + File go_obo_file = null; + List go_terms = null; + if ( cla.isOptionSet( surfacing_old.GO_OBO_FILE_USE_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_old.GO_OBO_FILE_USE_OPTION ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "no value for GO OBO file: -" + + surfacing_old.GO_OBO_FILE_USE_OPTION + "=" ); + } + if ( ( domain_id_to_go_ids_map == null ) || ( domain_id_to_go_ids_map.size() < 1 ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "cannot use GO OBO file (-" + + surfacing_old.GO_OBO_FILE_USE_OPTION + "=) without Pfam to GO mapping file (" + + 
surfacing_old.PFAM_TO_GO_FILE_USE_OPTION + "=)" ); + } + go_obo_file = new File( cla.getOptionValue( surfacing_old.GO_OBO_FILE_USE_OPTION ) ); + final String error = ForesterUtil.isReadableFile( go_obo_file ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "cannot read GO OBO file: " + error ); + } + try { + final OBOparser parser = new OBOparser( go_obo_file, OBOparser.ReturnType.BASIC_GO_TERM ); + go_terms = parser.parse(); + if ( parser.getGoTermCount() != go_terms.size() ) { + ForesterUtil.unexpectedFatalError( surfacing_old.PRG_NAME, + "parser.getGoTermCount() != go_terms.size()" ); + } + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "cannot read from GO OBO file: " + + e.getLocalizedMessage() ); + } + } + Map go_id_to_term_map = null; + if ( ( ( domain_id_to_go_ids_map != null ) && ( domain_id_to_go_ids_map.size() > 0 ) ) + && ( ( go_terms != null ) && ( go_terms.size() > 0 ) ) ) { + go_id_to_term_map = GoUtils.createGoIdToGoTermMap( go_terms ); + } + GoNameSpace go_namespace_limit = null; + if ( cla.isOptionSet( surfacing_old.GO_NAMESPACE_LIMIT_OPTION ) ) { + if ( ( go_id_to_term_map == null ) || go_id_to_term_map.isEmpty() ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "cannot use GO namespace limit (-" + + surfacing_old.GO_NAMESPACE_LIMIT_OPTION + "=) without Pfam to GO mapping file (" + + surfacing_old.PFAM_TO_GO_FILE_USE_OPTION + "=) and GO OBO file (-" + + surfacing_old.GO_OBO_FILE_USE_OPTION + "=)" ); + } + if ( !cla.isOptionValueSet( surfacing_old.GO_NAMESPACE_LIMIT_OPTION ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "no value for GO namespace limit: \"-" + + surfacing_old.GO_NAMESPACE_LIMIT_OPTION + "=<" + + surfacing_old.GO_NAMESPACE_LIMIT_OPTION_MOLECULAR_FUNCTION + "|" + + surfacing_old.GO_NAMESPACE_LIMIT_OPTION_BIOLOGICAL_PROCESS + "|" + + surfacing_old.GO_NAMESPACE_LIMIT_OPTION_CELLULAR_COMPONENT + ">\"" ); + } + final String 
go_namespace_limit_str = cla.getOptionValue( surfacing_old.GO_NAMESPACE_LIMIT_OPTION ) + .toLowerCase(); + if ( go_namespace_limit_str.equals( surfacing_old.GO_NAMESPACE_LIMIT_OPTION_MOLECULAR_FUNCTION ) ) { + go_namespace_limit = GoNameSpace.createMolecularFunction(); + } + else if ( go_namespace_limit_str.equals( surfacing_old.GO_NAMESPACE_LIMIT_OPTION_BIOLOGICAL_PROCESS ) ) { + go_namespace_limit = GoNameSpace.createBiologicalProcess(); + } + else if ( go_namespace_limit_str.equals( surfacing_old.GO_NAMESPACE_LIMIT_OPTION_CELLULAR_COMPONENT ) ) { + go_namespace_limit = GoNameSpace.createCellularComponent(); + } + else { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "unknown value \"" + go_namespace_limit_str + + "\" for GO namespace limit: \"-" + surfacing_old.GO_NAMESPACE_LIMIT_OPTION + "=<" + + surfacing_old.GO_NAMESPACE_LIMIT_OPTION_MOLECULAR_FUNCTION + "|" + + surfacing_old.GO_NAMESPACE_LIMIT_OPTION_BIOLOGICAL_PROCESS + "|" + + surfacing_old.GO_NAMESPACE_LIMIT_OPTION_CELLULAR_COMPONENT + ">\"" ); + } + } + if ( ( domain_similarity_sort_field == DomainSimilarity.DomainSimilaritySortField.MAX_COUNTS_DIFFERENCE ) + && ( number_of_genomes > 2 ) ) { + domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.ABS_MAX_COUNTS_DIFFERENCE; + } + boolean jacknifed_distances = false; + int jacknife_resamplings = JACKNIFE_NUMBER_OF_RESAMPLINGS_DEFAULT; + double jacknife_ratio = JACKNIFE_RATIO_DEFAULT; + long random_seed = JACKNIFE_RANDOM_SEED_DEFAULT; + if ( cla.isOptionSet( surfacing_old.JACKNIFE_OPTION ) ) { + if ( ( number_of_genomes < 3 ) || !perform_pwc ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "cannot use jacknife resampling analysis (-" + + surfacing_old.JACKNIFE_OPTION + "[=]) without pairwise analyses (" + + surfacing_old.PAIRWISE_DOMAIN_COMPARISONS_OPTION + + "=)" ); + } + jacknifed_distances = true; + if ( cla.isOptionHasAValue( surfacing_old.JACKNIFE_OPTION ) ) { + try { + jacknife_resamplings = cla.getOptionValueAsInt( 
surfacing_old.JACKNIFE_OPTION ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "illegal format for number of resamplings" ); + } + if ( jacknife_resamplings < 2 ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "attempt to use less than 2 resamplings" ); + } + } + if ( cla.isOptionSet( surfacing_old.JACKNIFE_RATIO_OPTION ) + && cla.isOptionHasAValue( surfacing_old.JACKNIFE_RATIO_OPTION ) ) { + try { + jacknife_ratio = cla.getOptionValueAsDouble( surfacing_old.JACKNIFE_RATIO_OPTION ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "illegal format for jacknife ratio" ); + } + if ( ( jacknife_ratio <= 0.0 ) || ( jacknife_ratio >= 1.0 ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, + "attempt to use illegal value for jacknife ratio: " + jacknife_ratio ); + } + } + if ( cla.isOptionSet( surfacing_old.JACKNIFE_RANDOM_SEED_OPTION ) + && cla.isOptionHasAValue( surfacing_old.JACKNIFE_RANDOM_SEED_OPTION ) ) { + try { + random_seed = cla.getOptionValueAsLong( surfacing_old.JACKNIFE_RANDOM_SEED_OPTION ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "illegal format for random generator seed" ); + } + } + } + // boolean infer_species_trees = false; + // if ( cla.isOptionSet( surfacing.INFER_SPECIES_TREES_OPTION ) ) { + // if ( ( output_file == null ) || ( number_of_genomes < 3 ) + // || ForesterUtil.isEmpty( automated_pairwise_comparison_suffix ) ) { + // ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot infer species trees (-" + // + surfacing.INFER_SPECIES_TREES_OPTION + " without pairwise analyses (" + // + surfacing.PAIRWISE_DOMAIN_COMPARISONS_OPTION + // + "=)" ); + // } + // infer_species_trees = true; + // } + File[] intree_files = null; + Phylogeny[] intrees = null; + if ( cla.isOptionSet( surfacing_old.INPUT_SPECIES_TREE_OPTION ) ) { + // TODO FIXME if jacknife.... 
maybe not + if ( number_of_genomes < 3 ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, + "cannot infer gains and losses on input species trees (-" + + surfacing_old.INPUT_SPECIES_TREE_OPTION + + " without pairwise analyses (" + + surfacing_old.PAIRWISE_DOMAIN_COMPARISONS_OPTION + + "=)" ); + } + if ( !cla.isOptionValueSet( surfacing_old.INPUT_SPECIES_TREE_OPTION ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "no value for input tree: -" + + surfacing_old.INPUT_SPECIES_TREE_OPTION + "=" ); + } + final String intrees_str = cla.getOptionValue( surfacing_old.INPUT_SPECIES_TREE_OPTION ); + if ( intrees_str.indexOf( "#" ) > 0 ) { + final String[] intrees_strs = intrees_str.split( "#" ); + intree_files = new File[ intrees_strs.length ]; + int i = 0; + for( final String s : intrees_strs ) { + intree_files[ i++ ] = new File( s.trim() ); + } + } + else { + intree_files = new File[ 1 ]; + intree_files[ 0 ] = new File( intrees_str ); + } + intrees = getIntrees( intree_files, number_of_genomes, input_file_properties ); + } + long random_number_seed_for_fitch_parsimony = 0l; + boolean radomize_fitch_parsimony = false; + if ( cla.isOptionSet( surfacing_old.RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing_old.RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "no value for random number seed: -" + + surfacing_old.RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION + "=" ); + } + try { + random_number_seed_for_fitch_parsimony = cla + .getOptionValueAsLong( RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() ); + } + radomize_fitch_parsimony = true; + } + SortedSet filter = null; + if ( ( positive_filter_file != null ) || ( negative_filter_file != null ) + || ( negative_domains_filter_file != null ) ) { + filter = new TreeSet(); + if ( positive_filter_file != null ) { + processFilter( positive_filter_file, 
filter ); + } + else if ( negative_filter_file != null ) { + processFilter( negative_filter_file, filter ); + } + else if ( negative_domains_filter_file != null ) { + processFilter( negative_domains_filter_file, filter ); + } + } + Map>[] domain_id_to_secondary_features_maps = null; + File[] secondary_features_map_files = null; + final File domain_lengths_analysis_outfile = new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file + + DOMAIN_LENGTHS_ANALYSIS_SUFFIX ); + if ( PERFORM_DOMAIN_LENGTH_ANALYSIS ) { + SurfacingUtil.checkForOutputFileWriteability( domain_lengths_analysis_outfile ); + } + if ( cla.isOptionSet( surfacing_old.SECONDARY_FEATURES_PARSIMONY_MAP_FILE ) ) { + if ( !cla.isOptionValueSet( surfacing_old.SECONDARY_FEATURES_PARSIMONY_MAP_FILE ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "no value for secondary features map file: -" + + surfacing_old.SECONDARY_FEATURES_PARSIMONY_MAP_FILE + "=" ); + } + final String[] secondary_features_map_files_strs = cla + .getOptionValue( surfacing_old.SECONDARY_FEATURES_PARSIMONY_MAP_FILE ).split( "#" ); + secondary_features_map_files = new File[ secondary_features_map_files_strs.length ]; + domain_id_to_secondary_features_maps = new Map[ secondary_features_map_files_strs.length ]; + int i = 0; + for( final String secondary_features_map_files_str : secondary_features_map_files_strs ) { + secondary_features_map_files[ i ] = new File( secondary_features_map_files_str ); + final String error = ForesterUtil.isReadableFile( secondary_features_map_files[ i ] ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "cannot read secondary features map file: " + + error ); + } + try { + domain_id_to_secondary_features_maps[ i ] = SurfacingUtil + .createDomainIdToSecondaryFeaturesMap( secondary_features_map_files[ i ] ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "cannot read secondary features map file: " + + e.getMessage() 
); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "problem with contents of features map file [" + + secondary_features_map_files[ i ] + "]: " + e.getMessage() ); + } + i++; + } + } + if ( out_dir == null ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "no output directory indicated (-" + + surfacing_old.OUTPUT_DIR_OPTION + "=)" ); + } + if ( output_file == null ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "no name for (main) output file indicated (-" + + surfacing_old.OUTPUT_FILE_OPTION + "=)" ); + } + if ( ( domain_id_to_go_ids_map == null ) || domain_id_to_go_ids_map.isEmpty() ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, + "no (acceptable) Pfam to GO id mapping file provided ('pfam2go file') (-" + + surfacing_old.PFAM_TO_GO_FILE_USE_OPTION + "=)" ); + } + if ( ( go_id_to_term_map == null ) || go_id_to_term_map.isEmpty() ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, + "no (acceptable) go id to term mapping file provided ('GO OBO file') (-" + + surfacing_old.GO_OBO_FILE_USE_OPTION + "=)" ); + } + boolean display_histograms = false; + if ( cla.isOptionSet( surfacing_old.DISPLAY_M_HISTOGRAMS_OPTION ) ) { + display_histograms = true; + } + System.out.println( "Output directory : " + out_dir ); + if ( input_file_names_from_file != null ) { + System.out.println( "Input files names from : " + input_files_file + " [" + + input_file_names_from_file.length + " input files]" ); + html_desc.append( "" + nl ); + } + if ( positive_filter_file != null ) { + final int filter_size = filter.size(); + System.out.println( "Positive protein filter : " + positive_filter_file + " [" + filter_size + + " domain ids]" ); + html_desc.append( "" + nl ); + } + if ( negative_filter_file != null ) { + final int filter_size = filter.size(); + System.out.println( "Negative protein filter : " + negative_filter_file + " [" + filter_size + + " domain ids]" ); + html_desc.append( "" + nl ); + } + if ( 
negative_domains_filter_file != null ) { + final int filter_size = filter.size(); + System.out.println( "Negative domain filter : " + negative_domains_filter_file + " [" + filter_size + + " domain ids]" ); + html_desc.append( "" + nl ); + } + if ( plus_minus_analysis_high_copy_base_species.size() > 0 ) { + String plus0 = ""; + for( final String s : plus_minus_analysis_high_copy_base_species ) { + plus0 += "+" + s + " "; + } + String plus1 = ""; + for( final String s : plus_minus_analysis_high_copy_target_species ) { + plus1 += "*" + s + " "; + } + String minus = ""; + for( final String s : plus_minus_analysis_high_low_copy_species ) { + minus += "-" + s + " "; + } + System.out.println( "Plus-minus analysis : " + plus1 + "&& " + plus0 + "&& " + minus ); + html_desc.append( "" + nl ); + } + if ( cutoff_scores_file != null ) { + System.out.println( "Cutoff scores file : " + cutoff_scores_file ); + html_desc.append( "" + nl ); + } + if ( e_value_max >= 0.0 ) { + System.out.println( "E-value maximum (inclusive) : " + e_value_max ); + html_desc.append( "" + nl ); + } + System.out.println( "Ignore DUFs : " + ignore_dufs ); + if ( ignore_virus_like_ids ) { + System.out.println( "Ignore virus like ids : " + ignore_virus_like_ids ); + html_desc.append( "" + nl ); + } + html_desc.append( "" + nl ); + if ( max_allowed_overlap != surfacing_old.MAX_ALLOWED_OVERLAP_DEFAULT ) { + System.out.println( "Max allowed domain overlap : " + max_allowed_overlap ); + html_desc.append( "" + nl ); + } + if ( no_engulfing_overlaps ) { + System.out.println( "Ignore engulfed domains : " + no_engulfing_overlaps ); + html_desc.append( "" + nl ); + } + System.out.println( "Ignore singlet domains : " + ignore_domains_without_combs_in_all_spec ); + html_desc + .append( "" + nl ); + System.out.println( "Ignore species specific doms: " + ignore_species_specific_domains ); + html_desc + .append( "" + nl ); + System.out.println( "Ignore combination with self: " + ignore_combination_with_same ); + 
html_desc.append( "" + nl ); + ; + System.out.println( "Consider directedness : " + + ( dc_type != BinaryDomainCombination.DomainCombinationType.BASIC ) ); + html_desc.append( "" + nl ); + if ( dc_type != BinaryDomainCombination.DomainCombinationType.BASIC ) { + System.out.println( "Consider adjacency : " + + ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT ) ); + html_desc.append( "" + + nl ); + } + System.out.print( "Domain counts sort order : " ); + switch ( dc_sort_order ) { + case ALPHABETICAL_KEY_ID: + System.out.println( "alphabetical" ); + break; + case KEY_DOMAIN_COUNT: + System.out.println( "domain count" ); + break; + case KEY_DOMAIN_PROTEINS_COUNT: + System.out.println( "domain proteins count" ); + break; + case COMBINATIONS_COUNT: + System.out.println( "domain combinations count" ); + break; + default: + ForesterUtil.unexpectedFatalError( surfacing_old.PRG_NAME, "unknown value for dc sort order" ); + } + if ( domain_id_to_go_ids_map != null ) { + System.out.println( "Pfam to GO mappings from : " + pfam_to_go_file + " [" + domain_id_to_go_ids_count + + " mappings]" ); + html_desc.append( "" + nl ); + } + if ( go_terms != null ) { + System.out.println( "GO terms from : " + go_obo_file + " [" + go_terms.size() + " terms]" ); + html_desc.append( "" + nl ); + } + if ( go_namespace_limit != null ) { + System.out.println( "Limit GO terms to : " + go_namespace_limit.toString() ); + html_desc.append( "" + nl ); + } + if ( perform_pwc ) { + System.out.println( "Suffix for PWC files : " + automated_pairwise_comparison_suffix ); + html_desc.append( "" + nl ); + } + if ( out_dir != null ) { + System.out.println( "Output directory : " + out_dir ); + } + if ( query_domain_ids != null ) { + System.out.println( "Query domains (ordered) : " + query_domain_ids ); + html_desc.append( "" + nl ); + } + System.out.println( "Write similarities to : " + output_file ); + System.out.print( " Scoring method : " ); + html_desc.append( "" + nl ); + 
break; + case DOMAINS: + System.out.println( "domain counts based" ); + html_desc.append( "domain counts based" + "" + nl ); + break; + case PROTEINS: + System.out.println( "domain proteins counts based" ); + html_desc.append( "domain proteins counts based" + "" + nl ); + break; + default: + ForesterUtil.unexpectedFatalError( surfacing_old.PRG_NAME, "unknown value for sorting for scoring" ); + } + System.out.print( " Sort by : " ); + html_desc.append( "" + nl ); + System.out.print( " Detailedness : " ); + switch ( detailedness ) { + case BASIC: + System.out.println( "basic" ); + break; + case LIST_COMBINING_DOMAIN_FOR_EACH_SPECIES: + System.out.println( "list combining domains for each species" ); + break; + case PUNCTILIOUS: + System.out.println( "punctilious" ); + break; + default: + ForesterUtil + .unexpectedFatalError( surfacing_old.PRG_NAME, "unknown value for sorting for detailedness" ); + } + System.out.print( " Print option : " ); + switch ( domain_similarity_print_option ) { + case HTML: + System.out.println( "HTML" ); + break; + case SIMPLE_TAB_DELIMITED: + System.out.println( "simple tab delimited" ); + break; + default: + ForesterUtil.unexpectedFatalError( surfacing_old.PRG_NAME, "unknown value for print option" ); + } + System.out.print( " Species matrix : " + species_matrix ); + System.out.println(); + if ( perform_pwc ) { + System.out.println( "Pairwise comparisons: " ); + html_desc.append( "" ); + System.out.print( " Sort by : " ); + html_desc.append( "" + nl ); + if ( jacknifed_distances ) { + html_desc.append( "" + nl ); + html_desc.append( "" + nl ); + html_desc.append( "" + nl ); + System.out.println( " Jacknife : " + jacknife_resamplings + " resamplings" ); + System.out.println( " Ratio : " + ForesterUtil.round( jacknife_ratio, 2 ) ); + System.out.println( " Random number seed : " + random_seed ); + } + // if ( infer_species_trees ) { + // html_desc.append( "" + nl ); + // System.out.println( " Infer species trees : true" ); + // } + if ( ( 
intrees != null ) && ( intrees.length > 0 ) ) { + for( final File intree_file : intree_files ) { + html_desc.append( "" + nl ); + System.out.println( " Intree for gain/loss pars.: " + intree_file ); + } + } + if ( radomize_fitch_parsimony ) { + html_desc.append( "" + nl ); + System.out.println( " Random number seed : " + random_number_seed_for_fitch_parsimony ); + } + if ( ( domain_id_to_secondary_features_maps != null ) && ( domain_id_to_secondary_features_maps.length > 0 ) ) { + for( int i = 0; i < secondary_features_map_files.length; i++ ) { + html_desc.append( "" + nl ); + System.out.println( "Secondary features map file : " + secondary_features_map_files[ i ] + + " [mappings for " + domain_id_to_secondary_features_maps[ i ].size() + " domain ids]" ); + if ( VERBOSE ) { + System.out.println(); + System.out.println( "Domain ids to secondary features map:" ); + for( final DomainId domain_id : domain_id_to_secondary_features_maps[ i ].keySet() ) { + System.out.print( domain_id.getId() ); + System.out.print( " => " ); + for( final String sec : domain_id_to_secondary_features_maps[ i ].get( domain_id ) ) { + System.out.print( sec ); + System.out.print( " " ); + } + System.out.println(); + } + } + } + } + } // if ( perform_pwc ) { + System.out.println(); + html_desc.append( "" + nl ); + System.out.println( "Command line : " + cla.getCommandLineArgsAsString() ); + BufferedWriter[] query_domains_writer_ary = null; + List[] query_domain_ids_array = null; + if ( query_domain_ids != null ) { + final String[] query_domain_ids_str_array = query_domain_ids.split( "#" ); + query_domain_ids_array = new ArrayList[ query_domain_ids_str_array.length ]; + query_domains_writer_ary = new BufferedWriter[ query_domain_ids_str_array.length ]; + for( int i = 0; i < query_domain_ids_str_array.length; i++ ) { + String query_domain_ids_str = query_domain_ids_str_array[ i ]; + final String[] query_domain_ids_str_ary = query_domain_ids_str.split( "~" ); + final List query = new ArrayList(); 
+ for( final String element : query_domain_ids_str_ary ) { + query.add( new DomainId( element ) ); + } + query_domain_ids_array[ i ] = query; + query_domain_ids_str = query_domain_ids_str.replace( '~', '_' ); + String protein_names_writer_str = query_domain_ids_str + surfacing_old.SEQ_EXTRACT_SUFFIX; + if ( out_dir != null ) { + protein_names_writer_str = out_dir + ForesterUtil.FILE_SEPARATOR + protein_names_writer_str; + } + try { + query_domains_writer_ary[ i ] = new BufferedWriter( new FileWriter( protein_names_writer_str ) ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "Could not open [" + protein_names_writer_str + + "]: " + e.getLocalizedMessage() ); + } + } + } + SortedMap> protein_lists_per_species = null; //This will only be created if neede. + boolean need_protein_lists_per_species = false; + if ( ( plus_minus_analysis_high_copy_base_species.size() > 0 ) || output_protein_lists_for_all_domains ) { + need_protein_lists_per_species = true; + } + if ( need_protein_lists_per_species ) { + protein_lists_per_species = new TreeMap>(); + } + final List gwcd_list = new ArrayList( number_of_genomes ); + final SortedSet all_domains_encountered = new TreeSet(); + final SortedSet all_bin_domain_combinations_encountered = new TreeSet(); + List all_bin_domain_combinations_gained_fitch = null; + List all_bin_domain_combinations_lost_fitch = null; + if ( ( intrees != null ) && ( intrees.length == 1 ) ) { + all_bin_domain_combinations_gained_fitch = new ArrayList(); + all_bin_domain_combinations_lost_fitch = new ArrayList(); + } + final DomainLengthsTable domain_lengths_table = new DomainLengthsTable(); + final File per_genome_domain_promiscuity_statistics_file = new File( out_dir + ForesterUtil.FILE_SEPARATOR + + output_file + D_PROMISCUITY_FILE_SUFFIX ); + BufferedWriter per_genome_domain_promiscuity_statistics_writer = null; + try { + per_genome_domain_promiscuity_statistics_writer = new BufferedWriter( new FileWriter( 
per_genome_domain_promiscuity_statistics_file ) ); + per_genome_domain_promiscuity_statistics_writer.write( "Species:\t" ); + per_genome_domain_promiscuity_statistics_writer.write( "Mean:\t" ); + per_genome_domain_promiscuity_statistics_writer.write( "SD:\t" ); + per_genome_domain_promiscuity_statistics_writer.write( "Median:\t" ); + per_genome_domain_promiscuity_statistics_writer.write( "Min:\t" ); + per_genome_domain_promiscuity_statistics_writer.write( "Max:\t" ); + per_genome_domain_promiscuity_statistics_writer.write( "N:\t" ); + per_genome_domain_promiscuity_statistics_writer.write( "Max Promiscuous Domains:" + + ForesterUtil.LINE_SEPARATOR ); + } + catch ( final IOException e2 ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, e2.getMessage() ); + } + for( int i = 0; i < number_of_genomes; ++i ) { + System.out.println(); + System.out.println( ( i + 1 ) + "/" + number_of_genomes ); + System.out.println( "Processing : " + input_file_properties[ i ][ 0 ] ); + HmmPfamOutputParser parser = null; + if ( ( positive_filter_file != null ) || ( negative_filter_file != null ) + || ( negative_domains_filter_file != null ) ) { + HmmPfamOutputParser.FilterType filter_type = HmmPfamOutputParser.FilterType.NONE; + if ( positive_filter_file != null ) { + filter_type = HmmPfamOutputParser.FilterType.POSITIVE_PROTEIN; + } + else if ( negative_filter_file != null ) { + filter_type = HmmPfamOutputParser.FilterType.NEGATIVE_PROTEIN; + } + else if ( negative_domains_filter_file != null ) { + filter_type = HmmPfamOutputParser.FilterType.NEGATIVE_DOMAIN; + } + parser = new HmmPfamOutputParser( new File( input_file_properties[ i ][ 0 ] ), + input_file_properties[ i ][ 1 ], + input_file_properties[ i ][ 2 ], + filter, + filter_type ); + } + else { + parser = new HmmPfamOutputParser( new File( input_file_properties[ i ][ 0 ] ), + input_file_properties[ i ][ 1 ], + input_file_properties[ i ][ 2 ] ); + } + if ( e_value_max >= 0.0 ) { + parser.setEValueMaximum( e_value_max ); + } + 
parser.setIgnoreDufs( ignore_dufs ); + parser.setIgnoreVirusLikeIds( ignore_virus_like_ids ); + parser.setIgnoreEngulfedDomains( no_engulfing_overlaps ); + if ( max_allowed_overlap != surfacing_old.MAX_ALLOWED_OVERLAP_DEFAULT ) { + parser.setMaxAllowedOverlap( max_allowed_overlap ); + } + parser.setReturnType( HmmPfamOutputParser.ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ); + if ( individual_domain_score_cutoffs != null ) { + parser.setIndividualDomainScoreCutoffs( individual_domain_score_cutoffs ); + } + parser.setAllowNonUniqueQuery( ALLOW_NON_UNIQUE_QUERY_IN_HMMPFAM_OUTPUT_DEFAULT ); + parser.setVerbose( VERBOSE_DEFAULT ); + List protein_list = null; + try { + protein_list = parser.parse(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() ); + } + catch ( final Exception e ) { + ForesterUtil.unexpectedFatalError( surfacing_old.PRG_NAME, e.getMessage(), e ); + } + if ( VERBOSE ) { + System.out.println( "Domains ignored due to negative domain filter: " ); + ForesterUtil.printCountingMap( parser.getDomainsIgnoredDueToNegativeDomainFilterCountsMap() ); + System.out.println( "Domains ignored due to virus like id: " ); + ForesterUtil.printCountingMap( parser.getDomainsIgnoredDueToVirusLikeIdCountsMap() ); + } + System.out.println( "Number of proteins encountered : " + parser.getProteinsEncountered() ); + System.out.println( "Number of proteins stored : " + protein_list.size() ); + System.out.println( "Domains encountered : " + parser.getDomainsEncountered() ); + System.out.println( "Domains stored : " + parser.getDomainsStored() ); + System.out.println( "Distinct domains stored : " + + parser.getDomainsStoredSet().size() ); + System.out.println( "Domains ignored due to individual score cutoffs: " + + parser.getDomainsIgnoredDueToIndividualScoreCutoff() ); + System.out.println( "Domains ignored due to E-value : " + + parser.getDomainsIgnoredDueToEval() ); + System.out.println( "Domains ignored due 
to DUF designation : " + + parser.getDomainsIgnoredDueToDuf() ); + if ( ignore_virus_like_ids ) { + System.out.println( "Domains ignored due virus like ids : " + + parser.getDomainsIgnoredDueToVirusLikeIds() ); + } + System.out.println( "Domains ignored due negative domain filter : " + + parser.getDomainsIgnoredDueToNegativeDomainFilter() ); + System.out.println( "Domains ignored due to overlap : " + + parser.getDomainsIgnoredDueToOverlap() ); + if ( negative_filter_file != null ) { + System.out.println( "Proteins ignored due to negative filter : " + + parser.getProteinsIgnoredDueToFilter() ); + } + if ( positive_filter_file != null ) { + System.out.println( "Proteins ignored due to positive filter : " + + parser.getProteinsIgnoredDueToFilter() ); + } + System.out.println( "Time for processing : " + parser.getTime() + "ms" ); + html_desc.append( "" + nl ); + // domain_partner_counts_array[ i ] = + // Methods.getDomainPartnerCounts( protein_domain_collections_array[ + // i ], + // false, input_file_properties[ i ][ 1 ] ); + gwcd_list.add( BasicGenomeWideCombinableDomains + .createInstance( protein_list, + ignore_combination_with_same, + new BasicSpecies( input_file_properties[ i ][ 1 ] ), + domain_id_to_go_ids_map, + dc_type ) ); + domain_lengths_table.addLengths( protein_list ); + if ( gwcd_list.get( i ).getSize() > 0 ) { + SurfacingUtil.writeDomainCombinationsCountsFile( input_file_properties, + out_dir, + per_genome_domain_promiscuity_statistics_writer, + gwcd_list.get( i ), + i, + dc_sort_order ); + if ( output_binary_domain_combinationsfor_graph_analysis ) { + SurfacingUtil.writeBinaryDomainCombinationsFileForGraphAnalysis( input_file_properties, + out_dir, + gwcd_list.get( i ), + i, + dc_sort_order ); + } + SurfacingUtil.addAllDomainIdsToSet( gwcd_list.get( i ), all_domains_encountered ); + SurfacingUtil.addAllBinaryDomainCombinationToSet( gwcd_list.get( i ), + all_bin_domain_combinations_encountered ); + } + if ( query_domains_writer_ary != null ) { + for( 
int j = 0; j < query_domain_ids_array.length; j++ ) { + try { + SurfacingUtil.extractProteinNames( protein_list, + query_domain_ids_array[ j ], + query_domains_writer_ary[ j ], + "\t" ); + query_domains_writer_ary[ j ].flush(); + } + catch ( final IOException e ) { + e.printStackTrace(); + } + } + } + if ( need_protein_lists_per_species ) { + protein_lists_per_species.put( new BasicSpecies( input_file_properties[ i ][ 1 ] ), protein_list ); + } + System.gc(); + } // for( int i = 0; i < number_of_hmmpfam_files_to_analyze; ++i ) { + try { + per_genome_domain_promiscuity_statistics_writer.flush(); + per_genome_domain_promiscuity_statistics_writer.close(); + } + catch ( final IOException e2 ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, e2.toString() ); + } + ForesterUtil.programMessage( PRG_NAME, "Wrote domain promiscuities to: " + + per_genome_domain_promiscuity_statistics_file ); + if ( query_domains_writer_ary != null ) { + for( int j = 0; j < query_domain_ids_array.length; j++ ) { + try { + query_domains_writer_ary[ j ].close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.toString() ); + } + } + } + if ( PERFORM_DOMAIN_LENGTH_ANALYSIS ) { + try { + SurfacingUtil.executeDomainLengthAnalysis( input_file_properties, + number_of_genomes, + domain_lengths_table, + domain_lengths_analysis_outfile ); + } + catch ( final IOException e1 ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, e1.toString() ); + } + System.out.println(); + ForesterUtil.programMessage( PRG_NAME, "Wrote domain length data to: " + domain_lengths_analysis_outfile ); + System.out.println(); + } + final long analysis_start_time = new Date().getTime(); + PairwiseDomainSimilarityCalculator pw_calc = null; + // double[] values_for_all_scores_histogram = null; + final DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( domain_similarity_sort_field, + sort_by_species_count_first, + number_of_genomes == 2 ); + switch ( scoring ) { 
+ case COMBINATIONS: + pw_calc = new CombinationsBasedPairwiseDomainSimilarityCalculator(); + break; + case DOMAINS: + pw_calc = new DomainCountsBasedPairwiseSimilarityCalculator(); + break; + case PROTEINS: + pw_calc = new ProteinCountsBasedPairwiseDomainSimilarityCalculator(); + break; + default: + ForesterUtil.unexpectedFatalError( surfacing_old.PRG_NAME, "unknown value for sorting for scoring" ); + } + DomainSimilarityCalculator.GoAnnotationOutput go_annotation_output = DomainSimilarityCalculator.GoAnnotationOutput.NONE; + if ( domain_id_to_go_ids_map != null ) { + go_annotation_output = DomainSimilarityCalculator.GoAnnotationOutput.ALL; + } + final SortedSet similarities = calc + .calculateSimilarities( pw_calc, + gwcd_list, + ignore_domains_without_combs_in_all_spec, + ignore_species_specific_domains ); + SurfacingUtil.decoratePrintableDomainSimilarities( similarities, + detailedness, + go_annotation_output, + go_id_to_term_map, + go_namespace_limit ); + DescriptiveStatistics pw_stats = null; + try { + String my_outfile = output_file.toString(); + if ( !my_outfile.endsWith( ".html" ) ) { + my_outfile += ".html"; + } + final Writer writer = new BufferedWriter( new FileWriter( out_dir == null ? my_outfile : out_dir + + ForesterUtil.FILE_SEPARATOR + my_outfile ) ); + List species_order = null; + if ( species_matrix ) { + species_order = new ArrayList(); + for( int i = 0; i < number_of_genomes; i++ ) { + species_order.add( new BasicSpecies( input_file_properties[ i ][ 1 ] ) ); + } + } + html_desc.append( "" + nl ); + html_desc.append( "" + nl ); + html_desc.append( "" + nl ); + html_desc.append( "
Produced by:" + surfacing_old.PRG_NAME + "
Version:" + surfacing_old.PRG_VERSION + "
Release Date:" + surfacing_old.PRG_DATE + "
Contact:" + surfacing_old.E_MAIL + "
WWW:" + surfacing_old.WWW + "
Input files names from:" + input_files_file + " [" + + input_file_names_from_file.length + " input files]
Positive protein filter:" + positive_filter_file + " [" + filter_size + + " domain ids]
Negative protein filter:" + negative_filter_file + " [" + filter_size + + " domain ids]
Negative domain filter:" + negative_domains_filter_file + " [" + + filter_size + " domain ids]
Plus-minus analysis:" + plus1 + "&& " + plus0 + "&& " + minus + + "
Cutoff scores file:" + cutoff_scores_file + "
E-value maximum (inclusive):" + e_value_max + "
Ignore virus, phage, transposition related ids:" + + ignore_virus_like_ids + "
Ignore DUFs:" + ignore_dufs + "
Max allowed domain overlap:" + max_allowed_overlap + "
Ignore (lower confidence) engulfed domains:" + no_engulfing_overlaps + + "
Ignore singlet domains for domain combination similarity analyses (not for parsimony analyses):" + + ignore_domains_without_combs_in_all_spec + "
Ignore species specific domains for domain combination similarity analyses (not for parsimony analyses):" + + ignore_species_specific_domains + "
Ignore combination with self for domain combination similarity analyses:" + + ignore_combination_with_same + "
Consider directedness of binary domain combinations:" + + ( dc_type != BinaryDomainCombination.DomainCombinationType.BASIC ) + "
Consider djacency of binary domain combinations:" + + ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT ) + "
Pfam to GO mappings from:" + pfam_to_go_file + " [" + + domain_id_to_go_ids_count + " mappings]" + "
GO terms from:" + go_obo_file + " [" + go_terms.size() + " terms]" + + "
Limit GO terms to" + go_namespace_limit + "
Suffix for PWC files" + automated_pairwise_comparison_suffix + + "
" + query_domain_ids + "
Scoring method:" ); + switch ( scoring ) { + case COMBINATIONS: + System.out.println( "domain combinations based" ); + html_desc.append( "domain combinations based" + "
Sort by:" ); + switch ( domain_similarity_sort_field ) { + case MIN: + System.out.print( "score minimum" ); + html_desc.append( "score minimum" ); + break; + case MAX: + System.out.print( "score maximum" ); + html_desc.append( "score maximum" ); + break; + case MEAN: + System.out.print( "score mean" ); + html_desc.append( "score mean" ); + break; + case SD: + System.out.print( "score standard deviation" ); + html_desc.append( "score standard deviation" ); + break; + case SPECIES_COUNT: + System.out.print( "species number" ); + html_desc.append( "species number" ); + break; + case DOMAIN_ID: + System.out.print( "alphabetical domain identifier" ); + html_desc.append( "alphabetical domain identifier" ); + break; + case MAX_DIFFERENCE: + System.out.print( "(maximal) difference" ); + html_desc.append( "(maximal) difference" ); + break; + case ABS_MAX_COUNTS_DIFFERENCE: + System.out.print( "absolute (maximal) counts difference" ); + html_desc.append( "absolute (maximal) counts difference" ); + break; + case MAX_COUNTS_DIFFERENCE: + System.out.print( "(maximal) counts difference" ); + html_desc.append( "(maximal) counts difference" ); + break; + default: + ForesterUtil + .unexpectedFatalError( surfacing_old.PRG_NAME, "unknown value for sorting for similarities" ); + } + if ( sort_by_species_count_first ) { + System.out.println( " (sort by species count first)" ); + html_desc.append( " (sort by species count first)" ); + } + else { + System.out.println(); + } + html_desc.append( "
Pairwise comparisons:
Sort by:" ); + switch ( domain_similarity_sort_field_for_automated_pwc ) { + case MEAN: + System.out.print( "score mean" ); + html_desc.append( "score mean" ); + break; + case DOMAIN_ID: + System.out.print( "alphabetical domain identifier" ); + html_desc.append( "alphabetical domain identifier" ); + break; + case MAX_DIFFERENCE: + System.out.print( "difference" ); + html_desc.append( "difference" ); + break; + case ABS_MAX_COUNTS_DIFFERENCE: + System.out.print( "absolute counts difference" ); + html_desc.append( "absolute counts difference" ); + break; + case MAX_COUNTS_DIFFERENCE: + System.out.print( "counts difference" ); + html_desc.append( "counts difference" ); + break; + default: + ForesterUtil.unexpectedFatalError( surfacing_old.PRG_NAME, + "unknown value for sorting for similarities" ); + } + System.out.println(); + html_desc.append( "
Jacknife:" + jacknife_resamplings + " resamplings
Jacknife ratio:" + ForesterUtil.round( jacknife_ratio, 2 ) + + "
Jacknife random number seed:" + random_seed + "
Infer species trees:true
Intree for gain/loss parsimony analysis:" + intree_file + + "
Random number seed for Fitch parsimony analysis:" + + random_number_seed_for_fitch_parsimony + "
Secondary features map file:" + + secondary_features_map_files[ i ] + "
Command line:" + cla.getCommandLineArgsAsString() + "
" + input_file_properties[ i ][ 0 ] + " [species: " + + input_file_properties[ i ][ 1 ] + "]" + ":domains analyzed: " + + parser.getDomainsStored() + "; domains ignored: [ind score cutoffs: " + + parser.getDomainsIgnoredDueToIndividualScoreCutoff() + "] [E-value cutoff: " + + parser.getDomainsIgnoredDueToEval() + "] [DUF: " + parser.getDomainsIgnoredDueToDuf() + + "] [virus like ids: " + parser.getDomainsIgnoredDueToVirusLikeIds() + + "] [negative domain filter: " + parser.getDomainsIgnoredDueToNegativeDomainFilter() + + "] [overlap: " + parser.getDomainsIgnoredDueToOverlap() + "]" ); + if ( negative_filter_file != null ) { + html_desc.append( "; proteins ignored due to negative filter: " + + parser.getProteinsIgnoredDueToFilter() ); + } + if ( positive_filter_file != null ) { + html_desc.append( "; proteins ignored due to positive filter: " + + parser.getProteinsIgnoredDueToFilter() ); + } + html_desc.append( "
Sum of all distinct binary combinations:" + + all_bin_domain_combinations_encountered.size() + "
Sum of all distinct domains:" + all_domains_encountered.size() + + "
Analysis date/time:" + + new java.text.SimpleDateFormat( "yyyy.MM.dd HH:mm:ss" ).format( new java.util.Date() ) + + "
" + nl ); + pw_stats = SurfacingUtil + .writeDomainSimilaritiesToFile( html_desc, + new StringBuilder( number_of_genomes + " genomes" ), + writer, + similarities, + number_of_genomes == 2, + species_order, + domain_similarity_print_option, + domain_similarity_sort_field, + scoring, + true ); + ForesterUtil.programMessage( surfacing_old.PRG_NAME, + "Wrote main output (includes domain similarities) to: \"" + + ( out_dir == null ? my_outfile : out_dir + + ForesterUtil.FILE_SEPARATOR + my_outfile ) + "\"" ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "Failed to write similarites to: \"" + output_file + + "\" [" + e.getMessage() + "]" ); + } + System.out.println(); + // values_for_all_scores_histogram = pw_stats.getDataAsDoubleArray(); + final Species[] species = new Species[ number_of_genomes ]; + for( int i = 0; i < number_of_genomes; ++i ) { + species[ i ] = new BasicSpecies( input_file_properties[ i ][ 1 ] ); + } + List inferred_trees = null; + if ( ( number_of_genomes > 2 ) && perform_pwc ) { + final PairwiseGenomeComparator pwgc = new PairwiseGenomeComparator(); + pwgc.performPairwiseComparisons( html_desc, + sort_by_species_count_first, + detailedness, + ignore_domains_without_combs_in_all_spec, + ignore_species_specific_domains, + domain_similarity_sort_field_for_automated_pwc, + domain_similarity_print_option, + scoring, + domain_id_to_go_ids_map, + go_id_to_term_map, + go_namespace_limit, + species, + number_of_genomes, + gwcd_list, + pw_calc, + automated_pairwise_comparison_suffix, + true, + surfacing_old.PAIRWISE_DOMAIN_COMPARISONS_PREFIX, + surfacing_old.PRG_NAME, + display_histograms, + out_dir, + write_pwc_files ); + String matrix_output_file = new String( output_file.toString() ); + if ( matrix_output_file.indexOf( '.' ) > 1 ) { + matrix_output_file = matrix_output_file.substring( 0, matrix_output_file.indexOf( '.' 
) ); + } + if ( out_dir != null ) { + matrix_output_file = out_dir + ForesterUtil.FILE_SEPARATOR + matrix_output_file; + output_file = new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file ); + } + SurfacingUtil.writeMatrixToFile( new File( matrix_output_file + + surfacing_old.MATRIX_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX ), pwgc + .getDomainDistanceScoresMeans() ); + SurfacingUtil.writeMatrixToFile( new File( matrix_output_file + + surfacing_old.MATRIX_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX ), pwgc + .getSharedBinaryCombinationsBasedDistances() ); + SurfacingUtil.writeMatrixToFile( new File( matrix_output_file + + surfacing_old.MATRIX_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX ), pwgc + .getSharedDomainsBasedDistances() ); + final Phylogeny nj_gd = SurfacingUtil.createNjTreeBasedOnMatrixToFile( new File( matrix_output_file + + surfacing_old.NJ_TREE_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX ), pwgc + .getDomainDistanceScoresMeans().get( 0 ) ); + final Phylogeny nj_bc = SurfacingUtil.createNjTreeBasedOnMatrixToFile( new File( matrix_output_file + + surfacing_old.NJ_TREE_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX ), pwgc + .getSharedBinaryCombinationsBasedDistances().get( 0 ) ); + final Phylogeny nj_d = SurfacingUtil.createNjTreeBasedOnMatrixToFile( new File( matrix_output_file + + surfacing_old.NJ_TREE_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX ), pwgc + .getSharedDomainsBasedDistances().get( 0 ) ); + inferred_trees = new ArrayList(); + inferred_trees.add( nj_gd ); + inferred_trees.add( nj_bc ); + inferred_trees.add( nj_d ); + // final List histogram_datas = pwgc.getHistogramDatas(); + // if ( infer_species_trees ) { + // inferred_trees = new ArrayList(); + // final List inferred_trees_bc = inferSpeciesTrees( new File( output_file + INFERRED_SBC_BASED_NJ_SPECIES_TREE_SUFFIX ), pwgc + // .getSharedBinaryCombinationsBasedDistances() ); + // final List inferred_trees_d = inferSpeciesTrees( new File( output_file + 
INFERRED_SD_BASED_NJ_SPECIES_TREE_SUFFIX ), pwgc + // .getSharedDomainsBasedDistances() ); + // inferred_trees.addAll( inferred_trees_bc ); + // inferred_trees.addAll( inferred_trees_d ); + // } + if ( jacknifed_distances ) { + pwgc.performPairwiseComparisonsJacknifed( species, + number_of_genomes, + gwcd_list, + true, + jacknife_resamplings, + jacknife_ratio, + random_seed ); + SurfacingUtil.writeMatrixToFile( new File( matrix_output_file + "_" + + ForesterUtil.round( jacknife_ratio, 2 ) + "_" + jacknife_resamplings + + surfacing_old.MATRIX_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX ), pwgc + .getSharedBinaryCombinationsBasedDistances() ); + SurfacingUtil.writeMatrixToFile( new File( matrix_output_file + "_" + + ForesterUtil.round( jacknife_ratio, 2 ) + "_" + jacknife_resamplings + + surfacing_old.MATRIX_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX ), pwgc + .getSharedDomainsBasedDistances() ); + // if ( infer_species_trees ) { + // inferSpeciesTrees( new File( output_file + "_" + jacknife_resamplings + // + INFERRED_SBC_BASED_NJ_SPECIES_TREE_SUFFIX ), pwgc + // .getSharedBinaryCombinationsBasedDistances() ); + // inferSpeciesTrees( new File( output_file + "_" + jacknife_resamplings + // + INFERRED_SD_BASED_NJ_SPECIES_TREE_SUFFIX ), pwgc.getSharedDomainsBasedDistances() ); + // } + } + if ( display_histograms ) { + // final List histogram_datas_all = new ArrayList(); + // histogram_datas_all.add( new HistogramData( "all", + // values_for_all_scores_histogram, + // null, + // 20 ) ); + // final HistogramsFrame hf_all = new HistogramsFrame( histogram_datas_all ); + // final HistogramsFrame hf = new HistogramsFrame( histogram_datas ); + // hf_all.setVisible( true ); + // hf.setVisible( true ); + } + } // if ( ( output_file != null ) && ( number_of_genomes > 2 ) && !isEmpty( automated_pairwise_comparison_suffix ) ) + if ( ( out_dir != null ) && ( !perform_pwc ) ) { + output_file = new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file ); + } + 
writePresentToNexus( output_file, positive_filter_file, filter, gwcd_list ); + if ( ( ( intrees != null ) && ( intrees.length > 0 ) ) && ( number_of_genomes > 2 ) ) { + final StringBuilder parameters_sb = createParametersAsString( ignore_dufs, + e_value_max, + max_allowed_overlap, + no_engulfing_overlaps, + cutoff_scores_file, + dc_type ); + String s = "_"; + if ( radomize_fitch_parsimony ) { + s += random_number_seed_for_fitch_parsimony + "_"; + } + int i = 0; + for( final Phylogeny intree : intrees ) { + final String outfile_name = ForesterUtil.removeSuffix( output_file.toString() ) + s + + ForesterUtil.removeSuffix( intree_files[ i ].toString() ); + final DomainParsimonyCalculator domain_parsimony = DomainParsimonyCalculator.createInstance( intree, + gwcd_list ); + SurfacingUtil.executeParsimonyAnalysis( random_number_seed_for_fitch_parsimony, + radomize_fitch_parsimony, + outfile_name, + domain_parsimony, + intree, + domain_id_to_go_ids_map, + go_id_to_term_map, + go_namespace_limit, + parameters_sb.toString(), + domain_id_to_secondary_features_maps, + positive_filter_file == null ? null : filter, + output_binary_domain_combinationsfor_graph_analysis, + all_bin_domain_combinations_gained_fitch, + all_bin_domain_combinations_lost_fitch, + dc_type ); + // Listing of all domain combinations gained is only done if only one input tree is used. 
+ if ( ( domain_id_to_secondary_features_maps != null ) + && ( domain_id_to_secondary_features_maps.length > 0 ) ) { + int j = 0; + for( final Map> domain_id_to_secondary_features_map : domain_id_to_secondary_features_maps ) { + final Map mapping_results_map = new TreeMap(); + final DomainParsimonyCalculator secondary_features_parsimony = DomainParsimonyCalculator + .createInstance( intree, gwcd_list, domain_id_to_secondary_features_map ); + SurfacingUtil + .executeParsimonyAnalysisForSecondaryFeatures( outfile_name + + "_" + + secondary_features_map_files[ j++ ], + secondary_features_parsimony, + intree, + parameters_sb.toString(), + mapping_results_map ); + if ( i == 0 ) { + System.out.println(); + System.out.println( "Mapping to secondary features:" ); + for( final Species spec : mapping_results_map.keySet() ) { + final MappingResults mapping_results = mapping_results_map.get( spec ); + final int total_domains = mapping_results.getSumOfFailures() + + mapping_results.getSumOfSuccesses(); + System.out.print( spec + ":" ); + System.out.print( " mapped domains = " + mapping_results.getSumOfSuccesses() ); + System.out.print( ", not mapped domains = " + mapping_results.getSumOfFailures() ); + if ( total_domains > 0 ) { + System.out.println( ", mapped ratio = " + + ( 100 * mapping_results.getSumOfSuccesses() / total_domains ) + "%" ); + } + else { + System.out.println( ", mapped ratio = n/a (total domains = 0 )" ); + } + } + } + } + } + i++; + } // for( final Phylogeny intree : intrees ) { + } + if ( plus_minus_analysis_high_copy_base_species.size() > 0 ) { + executePlusMinusAnalysis( output_file, + plus_minus_analysis_high_copy_base_species, + plus_minus_analysis_high_copy_target_species, + plus_minus_analysis_high_low_copy_species, + gwcd_list, + protein_lists_per_species, + domain_id_to_go_ids_map, + go_id_to_term_map, + plus_minus_analysis_numbers ); + } + if ( output_protein_lists_for_all_domains ) { + writeProteinListsForAllSpecies( out_dir, 
protein_lists_per_species, gwcd_list ); + } + // if ( ( intrees != null ) && ( intrees.length > 0 ) && ( inferred_trees != null ) && ( inferred_trees.size() > 0 ) ) { + // final StringBuilder parameters_sb = createParametersAsString( ignore_dufs, + // e_value_max, + // max_allowed_overlap, + // no_engulfing_overlaps, + // cutoff_scores_file ); + // String s = "_"; + // if ( radomize_fitch_parsimony ) { + // s += random_number_seed_for_fitch_parsimony + "_"; + // } + // int i = 0; + // for( final Phylogeny inferred_tree : inferred_trees ) { + // if ( !inferred_tree.isRooted() ) { + // intrees[ 0 ].getRoot().getName(); + // inferred_tree.r + // } + // final String outfile_name = ForesterUtil.removeSuffix( inferred_tree.getName() ) + s; + // final DomainParsimonyCalculator domain_parsimony = DomainParsimonyCalculator + // .createInstance( inferred_tree, gwcd_list ); + // SurfacingUtil.executeParsimonyAnalysis( random_number_seed_for_fitch_parsimony, + // radomize_fitch_parsimony, + // outfile_name, + // domain_parsimony, + // inferred_tree, + // domain_id_to_go_ids_map, + // go_id_to_term_map, + // go_namespace_limit, + // parameters_sb.toString() ); + // i++; + // } + // } + if ( all_bin_domain_combinations_gained_fitch != null ) { + try { + executeFitchGainsAnalysis( new File( output_file + + surfacing_old.OUTPUT_DOMAIN_COMBINATIONS_GAINED_MORE_THAN_ONCE_ANALYSIS_SUFFIX ), + all_bin_domain_combinations_gained_fitch, + all_domains_encountered.size(), + all_bin_domain_combinations_encountered, + true ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() ); + } + } + if ( all_bin_domain_combinations_lost_fitch != null ) { + try { + executeFitchGainsAnalysis( new File( output_file + + surfacing_old.OUTPUT_DOMAIN_COMBINATIONS_LOST_MORE_THAN_ONCE_ANALYSIS_SUFFIX ), + all_bin_domain_combinations_lost_fitch, + all_domains_encountered.size(), + all_bin_domain_combinations_encountered, + false ); + } + catch ( final IOException 
e ) { + ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() ); + } + } + final Runtime rt = java.lang.Runtime.getRuntime(); + final long free_memory = rt.freeMemory() / 1000000; + final long total_memory = rt.totalMemory() / 1000000; + System.out.println(); + System.out.println( "Time for analysis : " + ( new Date().getTime() - analysis_start_time ) + "ms" ); + System.out.println( "Total running time: " + ( new Date().getTime() - start_time ) + "ms " ); + System.out.println( "Free memory : " + free_memory + "MB, total memory: " + total_memory + "MB" ); + System.out.println(); + System.out.println( "If this application is useful to you, please cite:" ); + System.out.println( surfacing_old.WWW ); + System.out.println(); + ForesterUtil.programMessage( PRG_NAME, "OK" ); + System.out.println(); + } + + /** Readies intree for the parsimony analyses by reconciling it with the genome names held in element 1 of each row of input_file_properties (the value this file also uses as the species name). Reports a fatal error through ForesterUtil.fatalError when a name matches more than one node of the tree, then hands the collected names and the tree to PhylogenyMethods.deleteExternalNodesPositiveSelection, and finally reports a fatal error for every name for which intree.getNode throws an IllegalArgumentException. NOTE(review): deleteExternalNodesPositiveSelection presumably prunes, in place, every external node whose name is not in the array; confirm against PhylogenyMethods. Nothing is returned. @param intree the input tree to check and pass on for pruning @param input_file_properties one row per genome; only element 1 of each row is read here */ private static void preparePhylogenyForParsimonyAnalyses( final Phylogeny intree, + final String[][] input_file_properties ) { + final String[] genomes = new String[ input_file_properties.length ]; + /* Pass 1: reject a tree in which a genome name labels more than one node, and collect the names. */ for( int i = 0; i < input_file_properties.length; ++i ) { + if ( intree.getNodes( input_file_properties[ i ][ 1 ] ).size() > 1 ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "node named [" + input_file_properties[ i ][ 1 ] + + "] is not unique in input tree " + intree.getName() ); + } + genomes[ i ] = input_file_properties[ i ][ 1 ]; + } + /* NOTE(review): presumably keeps only the external nodes named in genomes; confirm. */ PhylogenyMethods.deleteExternalNodesPositiveSelection( genomes, intree ); + /* Pass 2: every genome name must still resolve through getNode; the lookup result is discarded and only the IllegalArgumentException matters. */ for( int i = 0; i < input_file_properties.length; ++i ) { + try { + intree.getNode( input_file_properties[ i ][ 1 ] ); + } + catch ( final IllegalArgumentException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "node named [" + input_file_properties[ i ][ 1 ] + + "] not present/not unique in input tree" ); + } + } + } + + // public static StringBuffer stringCombinableDomainsMapToStringBuffer( + // final SortedMap map ) { + // final StringBuffer sb = new StringBuffer(); + // for( final Iterator iter = map.keySet().iterator(); + //
iter.hasNext(); ) {
    // final Object key = iter.next();
    // sb.append( ForesterUtil.pad( new StringBuffer( key.toString() ), 18, ' ',
    // false ) );
    // final CombinableDomains domain_combination = map.get( key );
    // sb.append( ForesterUtil.pad( new StringBuffer( "" +
    // domain_combination.getNumberOfCombiningDomains() ), 8,
    // ' ', false ) );
    // sb.append( domain_combination.toStringBuffer() );
    // sb.append( ForesterUtil.getLineSeparator() );
    // }
    // return sb;
    // }
    /**
     * Prints the usage line, the list of command line options and an example
     * invocation to standard output.
     *
     * NOTE(review): in this copy of the source the value placeholders that
     * follow '=' in several option descriptions were empty (apparently lost
     * when markup was stripped). The placeholder names used below are inferred
     * from the option descriptions themselves -- confirm against the upstream
     * file.
     */
    private static void printHelp() {
        System.out.println();
        System.out.println( "Usage:" );
        System.out.println();
        // The package is "org.forester.application" (singular).
        System.out.println( "% java -Xms256m -Xmx512m -cp forester.jar org.forester.application."
                + surfacing_old.PRG_NAME + " [options] [external node name 1] [name 2] ... [name n]" );
        System.out.println();
        System.out.println( " Note: This software might need a significant amount of memory (heap space);" );
        System.out
                .println( " hence use \"-Xms128m -Xmx512m\" (or more) to prevent a \"java.lang.OutOfMemoryError\"." );
        System.out.println();
        System.out.println( " Options: " );
        System.out.println( surfacing_old.DETAILEDNESS_OPTION
                + ": level of detail for similarities output file (default:" + DETAILEDNESS_DEFAULT + ")" );
        System.out.println( surfacing_old.IGNORE_COMBINATION_WITH_SAME_OPTION
                + ": to ignore combinations with self (default: not to ignore)" );
        System.out
                .println( surfacing_old.IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_OPTION
                        + ": to ignore domains without combinations in any species (for similarity calc purposes, not for parsimony analyses) (default: not to ignore)" );
        System.out
                .println( surfacing_old.IGNORE_DOMAINS_SPECIFIC_TO_ONE_SPECIES_OPTION
                        + ": to ignore domains specific to one species (for similarity calc purposes, not for parsimony analyses) (default: not to ignore)" );
        System.out.println( surfacing_old.NOT_IGNORE_DUFS_OPTION
                + ": to _not_ ignore DUFs (domains with unknown function) (default: ignore DUFs)" );
        System.out
                .println( surfacing_old.IGNORE_VIRAL_IDS
                        + ": to ignore domains with ids containing 'vir', 'retro', 'transpos', 'phage', or starting with 'rv' or 'gag_'" );
        System.out.println( surfacing_old.DOMAIN_SIMILARITY_SORT_OPTION + ": sorting for similarities (default: "
                + DOMAIN_SORT_FILD_DEFAULT + ")" );
        System.out.println( surfacing_old.OUTPUT_FILE_OPTION + ": name for (main) output file (mandatory)" );
        System.out.println( surfacing_old.MAX_E_VALUE_OPTION + ": max (inclusive) E-value" );
        System.out.println( surfacing_old.MAX_ALLOWED_OVERLAP_OPTION + ": maximal allowed domain overlap" );
        System.out
                .println( surfacing_old.NO_ENGULFING_OVERLAP_OPTION + ": to ignore engulfed lower confidence domains" );
        System.out.println( surfacing_old.SPECIES_MATRIX_OPTION + ": species matrix" );
        System.out.println( surfacing_old.SCORING_OPTION + ": scoring (default:" + SCORING_DEFAULT + ")" );
        System.out.println( surfacing_old.DOMAIN_COUNT_SORT_OPTION + ": sorting for domain counts (default:"
                + DOMAINS_SORT_ORDER_DEFAULT + ")" );
        System.out.println( surfacing_old.DOMAIN_SIMILARITY_PRINT_OPTION
                + ": domain similarity print option (default:" + DOMAIN_SIMILARITY_PRINT_OPTION_DEFAULT + ")" );
        System.out.println( surfacing_old.CUTOFF_SCORE_FILE_OPTION + ": cutoff score file" );
        System.out.println( surfacing_old.DOMAIN_SIMILARITY_SORT_BY_SPECIES_COUNT_FIRST_OPTION
                + ": sort by species count first" );
        System.out.println( surfacing_old.OUTPUT_DIR_OPTION + ": output directory" );
        System.out.println( surfacing_old.PFAM_TO_GO_FILE_USE_OPTION + ": Pfam to GO mapping file" );
        System.out.println( surfacing_old.GO_OBO_FILE_USE_OPTION + ": GO terms file (OBO format)" );
        System.out.println( surfacing_old.GO_NAMESPACE_LIMIT_OPTION + ": limit GO term to one GO namespace" );
        System.out.println( surfacing_old.PAIRWISE_DOMAIN_COMPARISONS_OPTION
                + "[=<value>]: to perform pairwise comparison based analyses" );
        System.out.println( surfacing_old.INPUT_SPECIES_TREE_OPTION
                + ": species tree, to perform (Dollo, Fitch) parsimony analyses" );
        System.out.println( surfacing_old.DISPLAY_M_HISTOGRAMS_OPTION
                + ": to display multiple histograms (using fluorite)" );
        System.out
                .println( JACKNIFE_OPTION
                        + ": perform jacknife resampling for domain and binary domain combination based distance matrices [default resamplings: "
                        + JACKNIFE_NUMBER_OF_RESAMPLINGS_DEFAULT + "]" );
        System.out.println( JACKNIFE_RATIO_OPTION + ": ratio for jacknife resampling [default: "
                + JACKNIFE_RATIO_DEFAULT + "]" );
        System.out.println( JACKNIFE_RANDOM_SEED_OPTION
                + ": seed for random number generator for jacknife resampling [default: "
                + JACKNIFE_RANDOM_SEED_DEFAULT + "]" );
        // System.out.println( surfacing.INFER_SPECIES_TREES_OPTION
        // + ": to infer NJ species trees based on shared domains/binary domain combinations" );
        System.out
                .println( surfacing_old.INPUT_SPECIES_TREE_OPTION
                        + "=<species tree file(s)>: to infer domain/binary domain combination gains/losses on given species trees" );
        System.out.println( surfacing_old.FILTER_POSITIVE_OPTION
                + "=<file>: to filter out proteins not containing at least one domain listed in <file>" );
        System.out.println( surfacing_old.FILTER_NEGATIVE_OPTION
                + "=<file>: to filter out proteins containing at least one domain listed in <file>" );
        System.out.println( surfacing_old.FILTER_NEGATIVE_DOMAINS_OPTION
                + "=<file>: to filter out (ignore) domains listed in <file>" );
        System.out.println( surfacing_old.INPUT_FILES_FROM_FILE_OPTION
                + "=<file>: to read input files from <file>" );
        System.out
                .println( surfacing_old.RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION
                        + "=<seed>: seed for random number generator for Fitch Parsimony analysis (type: long, default: no randomization - given a choice, prefer absence)" );
        System.out.println( surfacing_old.CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS
                + ": to consider directedness in binary combinations: e.g. A-B != B-A" );
        System.out.println( surfacing_old.CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS_AND_ADJACENCY
                + ": to consider directedness and adjacency in binary combinations" );
        System.out
                .println( surfacing_old.SEQ_EXTRACT_OPTION
                        + "=<domain ids>: to extract sequence names of sequences containing matching domains and/or domain-sequences (order N to C) (domain separator: '~', domain sequences separator: '#', e.g. 'NACHT#BIR~CARD')" );
        System.out.println( surfacing_old.SECONDARY_FEATURES_PARSIMONY_MAP_FILE
                + "=<file>: to perform parsimony analysis on secondary features" );
        System.out.println( surfacing_old.PLUS_MINUS_ANALYSIS_OPTION
                + "=<file>: to perform presence/absence genome analysis" );
        System.out.println( surfacing_old.DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS
                + ": to output binary domain combinations for (downstream) graph analysis" );
        System.out.println( surfacing_old.OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS
                + ": to output all proteins per domain" );
        System.out.println();
        System.out.println();
        // A blank is required between the jar path and the main class name.
        System.out.println( "Example: java -Xms128m -Xmx512m -cp path/to/forester.jar "
                + "org.forester.application.surfacing -detail=punctilious -o=TEST.html -pwc=TEST"
                + " -cos=Pfam_ls_22_TC2 -p2g=pfam2go -obo=gene_ontology_edit.obo "
                + "-dc_sort=dom -ignore_with_self -no_singles -e=0.001 -mo=1 -no_eo "
                + "-ds_output=detailed_html -scoring=domains -sort=alpha -" + JACKNIFE_OPTION
                + "=50 human mouse brafl strpu" );
        System.out.println();
    }

    /**
     * Reads the entries of a filter file and adds one DomainId per entry to
     * the given filter set. If VERBOSE is set, the resulting filter is printed
     * to standard output.
     *
     * @param filter_file file whose entries are turned into domain ids
     * @param filter set receiving the domain ids (modified in place)
     */
    private static void processFilter( final File filter_file, final SortedSet<DomainId> filter ) {
        SortedSet<String> domain_names = null;
        try {
            domain_names = ForesterUtil.file2set( filter_file );
        }
        catch ( final IOException e ) {
            ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() );
        }
        if ( domain_names != null ) {
            for( final String domain_name : domain_names ) {
                filter.add( new DomainId( domain_name ) );
            }
        }
        if ( VERBOSE ) {
            System.out.println( "Filter:" );
            for( final DomainId domain_id : filter ) {
                System.out.println( domain_id.getId() );
            }
        }
    }

    /**
     * Turns each input name into a three element array: [0] the file name,
     * [1] the species id, [2] the search parameter. A name without the
     * separator is used as both file name and species id, combined with
     * DEFAULT_SEARCH_PARAMETER. Every file named in element [0] must be
     * readable, otherwise a fatal error is reported.
     *
     * @param names input names as given on the command line
     * @return one three element array per input name, in input order
     */
    private static String[][] processInputFileNames( final String[] names ) {
        final String separator = surfacing_old.SEPARATOR_FOR_INPUT_VALUES + "";
        final String[][] input_file_properties = new String[ names.length ][];
        for( int i = 0; i < names.length; ++i ) {
            final String name = names[ i ];
            if ( name.indexOf( SEPARATOR_FOR_INPUT_VALUES ) < 0 ) {
                input_file_properties[ i ] = new String[] { name, name, DEFAULT_SEARCH_PARAMETER };
            }
            else {
                // String.split() takes a regular expression; quote the separator so that
                // it is always matched literally.
                input_file_properties[ i ] = name.split( java.util.regex.Pattern.quote( separator ) );
                if ( input_file_properties[ i ].length != 3 ) {
                    ForesterUtil
                            .fatalError( surfacing_old.PRG_NAME,
                                         "properties for the input files (hmmpfam output) are expected "
                                                 + "to be in the following format \"<file>" + separator
                                                 + "<species>" + separator
                                                 + "<search parameter>\" (or just one word, which is both the filename and the species id), instead received \""
                                                 + name + "\"" );
                }
            }
            final String error = ForesterUtil.isReadableFile( new File( input_file_properties[ i ][ 0 ] ) );
            if ( !ForesterUtil.isEmpty( error ) ) {
                ForesterUtil.fatalError( surfacing_old.PRG_NAME, error );
            }
        }
        return input_file_properties;
    }

    /**
     * If the 'plus-minus' analysis option is set, checks that it names a
     * readable file and hands that file to processPlusMinusFile, which fills
     * the four lists. Does nothing if the option is not set.
     *
     * NOTE(review): the element types of the list parameters were not visible
     * in this copy of the source; String for the genome lists and Object for
     * 'numbers' are inferred from how the lists are filled -- confirm.
     *
     * @param cla parsed command line
     * @param high_copy_base receives the '+' genomes
     * @param high_copy_target receives the '*' genomes
     * @param low_copy receives the '-' genomes
     * @param numbers receives the minimal difference and the factor
     */
    private static void processPlusMinusAnalysisOption( final CommandLineArguments cla,
                                                        final List<String> high_copy_base,
                                                        final List<String> high_copy_target,
                                                        final List<String> low_copy,
                                                        final List<Object> numbers ) {
        if ( !cla.isOptionSet( surfacing_old.PLUS_MINUS_ANALYSIS_OPTION ) ) {
            return;
        }
        if ( !cla.isOptionValueSet( surfacing_old.PLUS_MINUS_ANALYSIS_OPTION ) ) {
            ForesterUtil.fatalError( surfacing_old.PRG_NAME, "no value for 'plus-minus' file: -"
                    + surfacing_old.PLUS_MINUS_ANALYSIS_OPTION + "=<file>" );
        }
        final File plus_minus_file = new File( cla.getOptionValue( surfacing_old.PLUS_MINUS_ANALYSIS_OPTION ) );
        final String msg = ForesterUtil.isReadableFile( plus_minus_file );
        if ( !ForesterUtil.isEmpty( msg ) ) {
            ForesterUtil.fatalError( surfacing_old.PRG_NAME, "can not read from \"" + plus_minus_file + "\": "
                    + msg );
        }
        processPlusMinusFile( plus_minus_file, high_copy_base, high_copy_target, low_copy, numbers );
    }

    // The first entry of 'numbers' is the minimal difference, the second is the factor.
+ private static void processPlusMinusFile( final File plus_minus_file, + final List high_copy_base, + final List high_copy_target, + final List low_copy, + final List numbers ) { + Set species_set = null; + int min_diff = PLUS_MINUS_ANALYSIS_MIN_DIFF_DEFAULT; + double factor = PLUS_MINUS_ANALYSIS_FACTOR_DEFAULT; + try { + species_set = ForesterUtil.file2set( plus_minus_file ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() ); + } + if ( species_set != null ) { + for( final String species : species_set ) { + final String species_trimmed = species.substring( 1 ); + if ( species.startsWith( "+" ) ) { + if ( low_copy.contains( species_trimmed ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, + "species/genome names can not appear with both '+' and '-' suffix, as appears the case for: \"" + + species_trimmed + "\"" ); + } + high_copy_base.add( species_trimmed ); + } + else if ( species.startsWith( "*" ) ) { + if ( low_copy.contains( species_trimmed ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, + "species/genome names can not appear with both '*' and '-' suffix, as appears the case for: \"" + + species_trimmed + "\"" ); + } + high_copy_target.add( species_trimmed ); + } + else if ( species.startsWith( "-" ) ) { + if ( high_copy_base.contains( species_trimmed ) || high_copy_target.contains( species_trimmed ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, + "species/genome names can not appear with both '+' or '*' and '-' suffix, as appears the case for: \"" + + species_trimmed + "\"" ); + } + low_copy.add( species_trimmed ); + } + else if ( species.startsWith( "$D" ) ) { + try { + min_diff = Integer.parseInt( species.substring( 3 ) ); + } + catch ( final NumberFormatException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, + "could not parse integer value for minimal difference from: \"" + + species.substring( 3 ) + "\"" ); + } + } + else if ( species.startsWith( "$F" ) ) { + try 
{ + factor = Double.parseDouble( species.substring( 3 ) ); + } + catch ( final NumberFormatException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, + "could not parse double value for factor from: \"" + + species.substring( 3 ) + "\"" ); + } + } + else if ( species.startsWith( "#" ) ) { + // Comment, ignore. + } + else { + ForesterUtil + .fatalError( surfacing_old.PRG_NAME, + "species/genome names in 'plus minus' file must begin with '*' (high copy target genome), '+' (high copy base genomes), '-' (low copy genomes), '$D=' minimal Difference (default is 1), '$F=' factor (default is 1.0), double), or '#' (ignore) suffix, encountered: \"" + + species + "\"" ); + } + numbers.add( new Integer( min_diff + "" ) ); + numbers.add( new Double( factor + "" ) ); + } + } + else { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, "'plus minus' file [" + plus_minus_file + + "] appears empty" ); + } + } + + private static void writePresentToNexus( final File output_file, + final File positive_filter_file, + final SortedSet filter, + final List gwcd_list ) { + try { + SurfacingUtil + .writeMatrixToFile( DomainParsimonyCalculator + .createMatrixOfDomainPresenceOrAbsence( gwcd_list, positive_filter_file == null ? 
null + : filter ), output_file + DOMAINS_PRESENT_NEXUS, Format.NEXUS_BINARY ); + SurfacingUtil.writeMatrixToFile( DomainParsimonyCalculator + .createMatrixOfBinaryDomainCombinationPresenceOrAbsence( gwcd_list ), output_file + + BDC_PRESENT_NEXUS, Format.NEXUS_BINARY ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getLocalizedMessage() ); + } + } + + private static void writeProteinListsForAllSpecies( final File output_dir, + final SortedMap> protein_lists_per_species, + final List gwcd_list ) { + final SortedSet all_domains = new TreeSet(); + for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { + all_domains.addAll( gwcd.getAllDomainIds() ); + } + for( final DomainId domain : all_domains ) { + final File out = new File( output_dir + ForesterUtil.FILE_SEPARATOR + domain + SEQ_EXTRACT_SUFFIX ); + SurfacingUtil.checkForOutputFileWriteability( out ); + try { + final Writer proteins_file_writer = new BufferedWriter( new FileWriter( out ) ); + SurfacingUtil.extractProteinNames( protein_lists_per_species, domain, proteins_file_writer, "\t" ); + proteins_file_writer.close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getLocalizedMessage() ); + } + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote proteins list to \"" + out + "\"" ); + } + } +} diff --git a/forester/java/src/org/forester/application/ta.java b/forester/java/src/org/forester/application/ta.java new file mode 100644 index 0000000..5817b84 --- /dev/null +++ b/forester/java/src/org/forester/application/ta.java @@ -0,0 +1,239 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. 
Zmasek
// Copyright (C) 2008-2009 Burnham Institute for Medical Research
// All rights reserved
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
//
// Contact: phylosoft @ gmail . com
// WWW: www.phylosoft.org/forester

package org.forester.application;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.forester.io.parsers.PhylogenyParser;
import org.forester.io.writers.PhylogenyWriter;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
import org.forester.phylogeny.factories.PhylogenyFactory;
import org.forester.sdi.GSDI;
import org.forester.sdi.SDI;
import org.forester.sdi.SDIse;
import org.forester.util.CommandLineArguments;
import org.forester.util.ForesterUtil;

/**
 * Command line front end which reads a gene tree and a species tree, runs
 * either the GSDI (default) or the SDIse (-b) algorithm on them and writes
 * the gene tree to a phyloXML file.
 */
public class ta {

    final static private String STRIP_OPTION             = "s";
    final static private String SDISE_OPTION             = "b";
    final static private String MOST_PARSIMONIOUS_OPTION = "m";
    final static private String HELP_OPTION_1            = "help";
    final static private String HELP_OPTION_2            = "h";
    final static private String DEFAULT_OUTFILE          = "sdi_out.xml";
    final static private String PRG_NAME                 = "sdi";
    final static private String PRG_VERSION              = "alpha 0.3";
    final static private String PRG_DATE                 = "2008.03.04";

    /**
     * Program entry point.
     *
     * @param args options followed by the gene tree file, the species tree
     *            file and, optionally, the output file
     */
    public static void main( final String args[] ) {
        ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE );
        CommandLineArguments cla = null;
        try {
            cla = new CommandLineArguments( args );
        }
        catch ( final Exception e ) {
            ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
        }
        if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) ) {
            System.out.println();
            print_help();
            System.exit( 0 );
        }
        else if ( ( args.length < 2 ) || ( cla.getNumberOfNames() < 2 ) || ( cla.getNumberOfNames() > 3 ) ) {
            System.out.println();
            System.out.println( "Wrong number of arguments." );
            System.out.println();
            print_help();
            System.exit( -1 );
        }
        final List<String> allowed_options = new ArrayList<String>();
        allowed_options.add( STRIP_OPTION );
        allowed_options.add( SDISE_OPTION );
        allowed_options.add( MOST_PARSIMONIOUS_OPTION );
        final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
        if ( dissallowed_options.length() > 0 ) {
            ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options );
        }
        // NOTE(review): 'strip' is read and reported below, but nothing in this method
        // applies it to the species tree -- confirm whether stripping is still intended.
        final boolean strip = cla.isOptionSet( STRIP_OPTION );
        final boolean use_sdise = cla.isOptionSet( SDISE_OPTION );
        boolean most_parsimonious_duplication_model = false;
        if ( cla.isOptionSet( MOST_PARSIMONIOUS_OPTION ) ) {
            if ( use_sdise ) {
                ForesterUtil.fatalError( PRG_NAME, "Can only use most parsimonious duplication mode with GSDI" );
            }
            most_parsimonious_duplication_model = true;
        }
        File gene_tree_file = null;
        File species_tree_file = null;
        File out_file = null;
        try {
            gene_tree_file = cla.getFile( 0 );
            species_tree_file = cla.getFile( 1 );
            if ( cla.getNumberOfNames() == 3 ) {
                out_file = cla.getFile( 2 );
            }
            else {
                out_file = new File( DEFAULT_OUTFILE );
            }
        }
        catch ( final IllegalArgumentException e ) {
            ForesterUtil.fatalError( PRG_NAME, "error in command line: " + e.getMessage() );
        }
        failIfProblem( ForesterUtil.isReadableFile( gene_tree_file ) );
        failIfProblem( ForesterUtil.isReadableFile( species_tree_file ) );
        failIfProblem( ForesterUtil.isWritableFile( out_file ) );
        final Phylogeny species_tree = readFirstPhylogeny( species_tree_file, "species tree" );
        final Phylogeny gene_tree = readFirstPhylogeny( gene_tree_file, "gene tree" );
        gene_tree.setRooted( true );
        species_tree.setRooted( true );
        if ( !gene_tree.isCompletelyBinary() ) {
            ForesterUtil.fatalError( PRG_NAME, "gene tree is not completely binary." );
        }
        if ( use_sdise && !species_tree.isCompletelyBinary() ) {
            ForesterUtil.fatalError( PRG_NAME, "species tree is not completely binary." );
        }
        // For timing.
        // gene_tree = Helper.createBalancedTree( 10 );
        // species_tree = Helper.createBalancedTree( 13 );
        // species_tree = Helper.createUnbalancedTree( 1024 );
        // gene_tree = Helper.createUnbalancedTree( 8192 );
        // species_tree = gene_tree.copyTree();
        // gene_tree = species_tree.copyTree();
        // Helper.numberSpeciesInOrder( species_tree );
        // Helper.numberSpeciesInOrder( gene_tree );
        // Helper.randomizeSpecies( 1, 8192, gene_tree );
        // Helper.intervalNumberSpecies( gene_tree, 4096 );
        // Helper.numberSpeciesInDescOrder( gene_tree );
        System.out.println();
        System.out.println( "Strip species tree: " + strip );
        SDI sdi = null;
        final long start_time = new Date().getTime();
        try {
            System.out.println();
            if ( use_sdise ) {
                System.out.println( "Using SDIse algorithm." );
                sdi = new SDIse( gene_tree, species_tree );
            }
            else {
                System.out.println( "Using GSDI algorithm." );
                System.out.println();
                System.out.println( "Use most parsimonious duplication model: "
                        + most_parsimonious_duplication_model );
                sdi = new GSDI( gene_tree, species_tree, most_parsimonious_duplication_model );
            }
        }
        catch ( final Exception e ) {
            ForesterUtil.unexpectedFatalError( PRG_NAME, e );
        }
        System.out.println();
        System.out.println( "Running time (excluding I/O): " + ( new Date().getTime() - start_time ) + "ms" );
        try {
            final PhylogenyWriter writer = new PhylogenyWriter();
            writer.toPhyloXML( out_file, gene_tree, 1 );
        }
        catch ( final IOException e ) {
            ForesterUtil.fatalError( PRG_NAME, "Failed to write to \"" + out_file + "\" [" + e.getMessage() + "]" );
        }
        System.out.println();
        System.out.println( "Successfully wrote resulting gene tree to: " + out_file );
        System.out.println();
        // if ( use_sdise ) {
        // computeMappingCostL();
        // System.out.println( "Mapping cost : " + computeMappingCostL() );
        // }
        // System.out.println( "Number of duplications : " + getDuplicationsSum() );
        if ( !use_sdise ) {
            final GSDI gsdi = ( GSDI ) sdi;
            if ( !most_parsimonious_duplication_model ) {
                System.out.println( "Number of potential duplications: "
                        + gsdi.getSpeciationOrDuplicationEventsSum() );
            }
            System.out.println( "Number speciations : " + gsdi.getSpeciationsSum() );
        }
        System.out.println();
    }

    /** Reports the given problem description as a fatal error, unless it is null or empty. */
    private static void failIfProblem( final String problem ) {
        if ( !ForesterUtil.isEmpty( problem ) ) {
            ForesterUtil.fatalError( PRG_NAME, problem );
        }
    }

    /**
     * Reads the first phylogeny from the given file; a read failure or a file
     * without any phylogeny is reported as a fatal error naming that file.
     *
     * @param file tree file to read
     * @param description what the file contains, used in error messages
     * @return the first phylogeny in the file
     */
    private static Phylogeny readFirstPhylogeny( final File file, final String description ) {
        try {
            final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
            final PhylogenyParser parser = ForesterUtil.createParserDependingOnFileType( file, true );
            final Phylogeny[] phylogenies = factory.create( file, parser );
            if ( ( phylogenies == null ) || ( phylogenies.length < 1 ) ) {
                ForesterUtil.fatalError( PRG_NAME, "Failed to read " + description + " from \"" + file
                        + "\" [no phylogeny found]" );
                return null;
            }
            return phylogenies[ 0 ];
        }
        catch ( final IOException e ) {
            ForesterUtil.fatalError( PRG_NAME, "Failed to read " + description + " from \"" + file + "\" ["
                    + e.getMessage() + "]" );
        }
        // Only reached if fatalError() returns.
        return null;
    }

    /** Prints usage, options and input file format notes to standard output. */
    private static void print_help() {
        System.out.println( "Usage: \"" + PRG_NAME
                + " [-options] <gene tree file> <species tree file> [outfile name]\"" );
        System.out.println();
        System.out.println( "Options:" );
        System.out.println( " -" + STRIP_OPTION + ": to strip the species tree prior to duplication inference" );
        System.out.println( " -" + SDISE_OPTION
                + ": to use SDIse algorithm instead of GSDI algorithm (for binary trees only, faster)" );
        System.out.println( " -" + MOST_PARSIMONIOUS_OPTION + ": use most parsimonious duplication model for GSDI: " );
        System.out.println( " assign nodes as speciations which would otherwise be assigned" );
        System.out.println( " as unknown because of polytomies in the species tree" );
        System.out.println();
        System.out.println( "Species tree file" );
        // No "-n" option is accepted by this program, so it is not mentioned here.
        System.out.println( " In NHX format, with species names in species name fields." );
        System.out.println();
        System.out.println( "Gene tree file" );
        System.out.println( " In NHX format, with species names in species name fields and sequence names" );
        System.out.println( " in sequence name fields." );
        System.out.println();
        System.out.println( "!! WARNING: GSDI algorithm is under development, please use SDIse (-b) instead !!" );
        System.out.println();
    }
}