import java.util.TreeMap;
import java.util.TreeSet;
-import org.forester.evoinference.distance.NeighborJoining;
import org.forester.evoinference.matrix.character.CharacterStateMatrix.Format;
-import org.forester.evoinference.matrix.distance.DistanceMatrix;
import org.forester.go.GoId;
import org.forester.go.GoNameSpace;
import org.forester.go.GoTerm;
import org.forester.io.parsers.HmmscanPerDomainTableParser;
import org.forester.io.parsers.HmmscanPerDomainTableParser.INDIVIDUAL_SCORE_CUTOFF;
import org.forester.io.parsers.util.ParserUtils;
-import org.forester.io.writers.PhylogenyWriter;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.PhylogenyMethods;
import org.forester.phylogeny.PhylogenyNode;
import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
+import org.forester.protein.BinaryDomainCombination;
+import org.forester.protein.Domain;
+import org.forester.protein.DomainId;
+import org.forester.protein.Protein;
+import org.forester.species.BasicSpecies;
+import org.forester.species.Species;
import org.forester.surfacing.BasicDomainSimilarityCalculator;
import org.forester.surfacing.BasicGenomeWideCombinableDomains;
-import org.forester.surfacing.BasicSpecies;
-import org.forester.surfacing.BinaryDomainCombination;
import org.forester.surfacing.CombinationsBasedPairwiseDomainSimilarityCalculator;
import org.forester.surfacing.DomainCountsBasedPairwiseSimilarityCalculator;
import org.forester.surfacing.DomainCountsDifferenceUtil;
-import org.forester.surfacing.DomainId;
import org.forester.surfacing.DomainLengthsTable;
import org.forester.surfacing.DomainParsimonyCalculator;
import org.forester.surfacing.DomainSimilarity;
import org.forester.surfacing.PairwiseGenomeComparator;
import org.forester.surfacing.PrintableDomainSimilarity;
import org.forester.surfacing.PrintableDomainSimilarity.PRINT_OPTION;
-import org.forester.surfacing.Protein;
import org.forester.surfacing.ProteinCountsBasedPairwiseDomainSimilarityCalculator;
-import org.forester.surfacing.Species;
import org.forester.surfacing.SurfacingUtil;
import org.forester.util.BasicDescriptiveStatistics;
import org.forester.util.BasicTable;
public class surfacing {
- private static final int MINIMAL_NUMBER_OF_SIMILARITIES_FOR_SPLITTING = 1000;
- public final static String DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS = "graph_analysis_out";
- public final static String DOMAIN_COMBINITONS_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS = "_dc.dot";
- public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_BC_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS = "_fitch_present_dc.dot";
- public final static String DOMAIN_COMBINITON_COUNTS_OUTPUTFILE_SUFFIX = ".dcc";
+ private static final int MINIMAL_NUMBER_OF_SIMILARITIES_FOR_SPLITTING = 1000;
+ public final static String DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS = "graph_analysis_out";
+ public final static String DOMAIN_COMBINITONS_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS = "_dc.dot";
+ public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_BC_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS = "_fitch_present_dc.dot";
+ public final static String DOMAIN_COMBINITON_COUNTS_OUTPUTFILE_SUFFIX = ".dcc";
// gain/loss:
- public final static String PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_DOMAINS = "_dollo_gl_d";
- public final static String PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_BINARY_COMBINATIONS = "_dollo_gl_dc";
- public final static String PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_DOMAINS = "_fitch_gl_d";
- public final static String PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_BINARY_COMBINATIONS = "_fitch_gl_dc";
+ public final static String PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_DOMAINS = "_dollo_gl_d";
+ public final static String PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_BINARY_COMBINATIONS = "_dollo_gl_dc";
+ public final static String PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_DOMAINS = "_fitch_gl_d";
+ public final static String PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_BINARY_COMBINATIONS = "_fitch_gl_dc";
// gain/loss counts:
- public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_DOMAINS = "_dollo_glc_d";
- public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_BINARY_COMBINATIONS = "_dollo_glc_dc";
- public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_DOMAINS = "_fitch_glc_d";
- public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_BINARY_COMBINATIONS = "_fitch_glc_dc";
+ public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_DOMAINS = "_dollo_glc_d";
+ public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_BINARY_COMBINATIONS = "_dollo_glc_dc";
+ public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_DOMAINS = "_fitch_glc_d";
+ public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_BINARY_COMBINATIONS = "_fitch_glc_dc";
// tables:
- public final static String PARSIMONY_OUTPUT_FITCH_GAINS_BC = "_fitch_gains_dc";
- public final static String PARSIMONY_OUTPUT_FITCH_GAINS_HTML_BC = "_fitch_gains_dc.html";
- public final static String PARSIMONY_OUTPUT_FITCH_LOSSES_BC = "_fitch_losses_dc";
- public final static String PARSIMONY_OUTPUT_FITCH_LOSSES_HTML_BC = "_fitch_losses_dc.html";
- public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_BC = "_fitch_present_dc";
- public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_HTML_BC = "_fitch_present_dc.html";
- public final static String PARSIMONY_OUTPUT_DOLLO_GAINS_D = "_dollo_gains_d";
- public final static String PARSIMONY_OUTPUT_DOLLO_GAINS_HTML_D = "_dollo_gains_d.html";
- public final static String PARSIMONY_OUTPUT_DOLLO_LOSSES_D = "_dollo_losses_d";
- public final static String PARSIMONY_OUTPUT_DOLLO_LOSSES_HTML_D = "_dollo_losses_d.html";
- public final static String PARSIMONY_OUTPUT_DOLLO_PRESENT_D = "_dollo_present_d";
- public final static String PARSIMONY_OUTPUT_DOLLO_PRESENT_HTML_D = "_dollo_present_d.html";
- public final static String DOMAINS_PRESENT_NEXUS = "_dom.nex";
- public final static String BDC_PRESENT_NEXUS = "_dc.nex";
+ public final static String PARSIMONY_OUTPUT_FITCH_GAINS_BC = "_fitch_gains_dc";
+ public final static String PARSIMONY_OUTPUT_FITCH_GAINS_HTML_BC = "_fitch_gains_dc.html";
+ public final static String PARSIMONY_OUTPUT_FITCH_LOSSES_BC = "_fitch_losses_dc";
+ public final static String PARSIMONY_OUTPUT_FITCH_LOSSES_HTML_BC = "_fitch_losses_dc.html";
+ public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_BC = "_fitch_present_dc";
+ public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_HTML_BC = "_fitch_present_dc.html";
+ public final static String PARSIMONY_OUTPUT_DOLLO_GAINS_D = "_dollo_gains_d";
+ public final static String PARSIMONY_OUTPUT_DOLLO_GAINS_HTML_D = "_dollo_gains_d.html";
+ public final static String PARSIMONY_OUTPUT_DOLLO_LOSSES_D = "_dollo_losses_d";
+ public final static String PARSIMONY_OUTPUT_DOLLO_LOSSES_HTML_D = "_dollo_losses_d.html";
+ public final static String PARSIMONY_OUTPUT_DOLLO_PRESENT_D = "_dollo_present_d";
+ public final static String PARSIMONY_OUTPUT_DOLLO_PRESENT_HTML_D = "_dollo_present_d.html";
+ public final static String DOMAINS_PRESENT_NEXUS = "_dom.nex";
+ public final static String BDC_PRESENT_NEXUS = "_dc.nex";
// ---
- public final static String PRG_NAME = "surfacing";
- public static final String DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO = "_d_dollo"
- + ForesterConstants.PHYLO_XML_SUFFIX;
- public static final String DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH = "_d_fitch"
- + ForesterConstants.PHYLO_XML_SUFFIX;
- public static final String BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO = "_dc_dollo"
- + ForesterConstants.PHYLO_XML_SUFFIX;
- public static final String BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH = "_dc_fitch"
- + ForesterConstants.PHYLO_XML_SUFFIX;
- public static final String NEXUS_EXTERNAL_DOMAINS = "_dom.nex";
- public static final String NEXUS_EXTERNAL_DOMAIN_COMBINATIONS = "_dc.nex";
- public static final String NEXUS_SECONDARY_FEATURES = "_secondary_features.nex";
- public static final String PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_SECONDARY_FEATURES = "_dollo_gl_secondary_features";
- public static final String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_SECONDARY_FEATURES = "_dollo_glc_secondary_features";
- public static final String PARSIMONY_OUTPUT_DOLLO_GAINS_SECONDARY_FEATURES = "_dollo_gains_secondary_features";
- public static final String PARSIMONY_OUTPUT_DOLLO_LOSSES_SECONDARY_FEATURES = "_dollo_losses_secondary_features";
- public static final String PARSIMONY_OUTPUT_DOLLO_PRESENT_SECONDARY_FEATURES = "_dollo_present_secondary_features";
- public static final String SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO = "_secondary_features_dollo"
- + ForesterConstants.PHYLO_XML_SUFFIX;
- public static final String PARSIMONY_OUTPUT_DOLLO_ALL_GOID_D_ALL_NAMESPACES = "_dollo_goid_d";
- public static final String PARSIMONY_OUTPUT_FITCH_ALL_GOID_BC_ALL_NAMESPACES = "_fitch_goid_dc";
- final static private String HELP_OPTION_1 = "help";
- final static private String HELP_OPTION_2 = "h";
- final static private String OUTPUT_DIR_OPTION = "out_dir";
- final static private String SCORING_OPTION = "scoring";
- private static final DomainSimilarityScoring SCORING_DEFAULT = DomainSimilarity.DomainSimilarityScoring.COMBINATIONS;
- final static private String SCORING_DOMAIN_COUNT_BASED = "domains";
- final static private String SCORING_PROTEIN_COUNT_BASED = "proteins";
- final static private String SCORING_COMBINATION_BASED = "combinations";
- final static private String DETAILEDNESS_OPTION = "detail";
- private final static Detailedness DETAILEDNESS_DEFAULT = DomainSimilarityCalculator.Detailedness.PUNCTILIOUS;
- final static private String SPECIES_MATRIX_OPTION = "smatrix";
- final static private String DETAILEDNESS_BASIC = "basic";
- final static private String DETAILEDNESS_LIST_IDS = "list_ids";
- final static private String DETAILEDNESS_PUNCTILIOUS = "punctilious";
- final static private String DOMAIN_SIMILARITY_SORT_OPTION = "sort";
- private static final DomainSimilaritySortField DOMAIN_SORT_FILD_DEFAULT = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID;
- final static private String DOMAIN_SIMILARITY_SORT_MIN = "min";
- final static private String DOMAIN_SIMILARITY_SORT_MAX = "max";
- final static private String DOMAIN_SIMILARITY_SORT_SD = "sd";
- final static private String DOMAIN_SIMILARITY_SORT_MEAN = "mean";
- final static private String DOMAIN_SIMILARITY_SORT_DIFF = "diff";
- final static private String DOMAIN_SIMILARITY_SORT_COUNTS_DIFF = "count_diff";
- final static private String DOMAIN_SIMILARITY_SORT_ABS_COUNTS_DIFF = "abs_count_diff";
- final static private String DOMAIN_SIMILARITY_SORT_SPECIES_COUNT = "species";
- final static private String DOMAIN_SIMILARITY_SORT_ALPHA = "alpha";
- final static private String DOMAIN_SIMILARITY_SORT_BY_SPECIES_COUNT_FIRST_OPTION = "species_first";
- final static private String DOMAIN_COUNT_SORT_OPTION = "dc_sort";
- private static final GenomeWideCombinableDomainsSortOrder DOMAINS_SORT_ORDER_DEFAULT = GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder.ALPHABETICAL_KEY_ID;
- final static private String DOMAIN_COUNT_SORT_ALPHA = "alpha";
- final static private String DOMAIN_COUNT_SORT_KEY_DOMAIN_COUNT = "dom";
- final static private String DOMAIN_COUNT_SORT_KEY_DOMAIN_PROTEINS_COUNT = "prot";
- final static private String DOMAIN_COUNT_SORT_COMBINATIONS_COUNT = "comb";
- final static private String CUTOFF_SCORE_FILE_OPTION = "cos";
- final static private String NOT_IGNORE_DUFS_OPTION = "dufs";
- final static private String MAX_E_VALUE_OPTION = "e";
- final static private String MAX_ALLOWED_OVERLAP_OPTION = "mo";
- final static private String NO_ENGULFING_OVERLAP_OPTION = "no_eo";
- final static private String IGNORE_COMBINATION_WITH_SAME_OPTION = "ignore_self_comb";
- final static private String PAIRWISE_DOMAIN_COMPARISONS_PREFIX = "pwc_";
- final static private String PAIRWISE_DOMAIN_COMPARISONS_OPTION = "pwc";
- final static private String OUTPUT_FILE_OPTION = "o";
- final static private String PFAM_TO_GO_FILE_USE_OPTION = "p2g";
- final static private String GO_OBO_FILE_USE_OPTION = "obo";
- final static private String GO_NAMESPACE_LIMIT_OPTION = "go_namespace";
- final static private String GO_NAMESPACE_LIMIT_OPTION_MOLECULAR_FUNCTION = "molecular_function";
- final static private String GO_NAMESPACE_LIMIT_OPTION_BIOLOGICAL_PROCESS = "biological_process";
- final static private String GO_NAMESPACE_LIMIT_OPTION_CELLULAR_COMPONENT = "cellular_component";
- final static private String SECONDARY_FEATURES_PARSIMONY_MAP_FILE = "secondary";
- final static private String DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_TAB_DELIMITED = "simple_tab";
- final static private String DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_HTML = "simple_html";
- final static private String DOMAIN_SIMILARITY_PRINT_OPTION_DETAILED_HTML = "detailed_html";
- final static private String DOMAIN_SIMILARITY_PRINT_OPTION = "ds_output";
- private static final PRINT_OPTION DOMAIN_SIMILARITY_PRINT_OPTION_DEFAULT = PrintableDomainSimilarity.PRINT_OPTION.HTML;
- final static private String IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_OPTION = "ignore_singlet_domains";
- final static private String IGNORE_VIRAL_IDS = "ignore_viral_ids";
- final static private boolean IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_DEFAULT = false;
- final static private String IGNORE_DOMAINS_SPECIFIC_TO_ONE_SPECIES_OPTION = "ignore_species_specific_domains";
- final static private boolean IGNORE_DOMAINS_SPECIFIC_TO_ONE_SPECIES_OPTION_DEFAULT = false;
- final static private String MATRIX_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX = "_mean_score.pwd";
- final static private String MATRIX_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX = "_domains.pwd";
- final static private String MATRIX_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX = "_bin_combinations.pwd";
- final static private String NJ_TREE_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX = "_mean_score_NJ"
- + ForesterConstants.PHYLO_XML_SUFFIX;
- final static private String NJ_TREE_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX = "_domains_NJ"
- + ForesterConstants.PHYLO_XML_SUFFIX;
- final static private String NJ_TREE_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX = "_bin_combinations_NJ"
- + ForesterConstants.PHYLO_XML_SUFFIX;
- final static private String JACKNIFE_OPTION = "jack";
- final static private String JACKNIFE_RANDOM_SEED_OPTION = "seed";
- final static private String JACKNIFE_RATIO_OPTION = "jack_ratio";
- private static final int JACKNIFE_NUMBER_OF_RESAMPLINGS_DEFAULT = 100;
- final static private long JACKNIFE_RANDOM_SEED_DEFAULT = 19;
- final static private double JACKNIFE_RATIO_DEFAULT = 0.5;
+ public final static String PRG_NAME = "surfacing";
+ public static final String DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO = "_d_dollo"
+ + ForesterConstants.PHYLO_XML_SUFFIX;
+ public static final String DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH = "_d_fitch"
+ + ForesterConstants.PHYLO_XML_SUFFIX;
+ public static final String BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO = "_dc_dollo"
+ + ForesterConstants.PHYLO_XML_SUFFIX;
+ public static final String BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH = "_dc_fitch"
+ + ForesterConstants.PHYLO_XML_SUFFIX;
+ public static final String NEXUS_EXTERNAL_DOMAINS = "_dom.nex";
+ public static final String NEXUS_EXTERNAL_DOMAIN_COMBINATIONS = "_dc.nex";
+ public static final String NEXUS_SECONDARY_FEATURES = "_secondary_features.nex";
+ public static final String PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_SECONDARY_FEATURES = "_dollo_gl_secondary_features";
+ public static final String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_SECONDARY_FEATURES = "_dollo_glc_secondary_features";
+ public static final String PARSIMONY_OUTPUT_DOLLO_GAINS_SECONDARY_FEATURES = "_dollo_gains_secondary_features";
+ public static final String PARSIMONY_OUTPUT_DOLLO_LOSSES_SECONDARY_FEATURES = "_dollo_losses_secondary_features";
+ public static final String PARSIMONY_OUTPUT_DOLLO_PRESENT_SECONDARY_FEATURES = "_dollo_present_secondary_features";
+ public static final String SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO = "_secondary_features_dollo"
+ + ForesterConstants.PHYLO_XML_SUFFIX;
+ public static final String PARSIMONY_OUTPUT_DOLLO_ALL_GOID_D_ALL_NAMESPACES = "_dollo_goid_d";
+ public static final String PARSIMONY_OUTPUT_FITCH_ALL_GOID_BC_ALL_NAMESPACES = "_fitch_goid_dc";
+ final static private String HELP_OPTION_1 = "help";
+ final static private String HELP_OPTION_2 = "h";
+ final static private String OUTPUT_DIR_OPTION = "out_dir";
+ final static private String SCORING_OPTION = "scoring";
+ private static final DomainSimilarityScoring SCORING_DEFAULT = DomainSimilarity.DomainSimilarityScoring.COMBINATIONS;
+ final static private String SCORING_DOMAIN_COUNT_BASED = "domains";
+ final static private String SCORING_PROTEIN_COUNT_BASED = "proteins";
+ final static private String SCORING_COMBINATION_BASED = "combinations";
+ final static private String DETAILEDNESS_OPTION = "detail";
+ private final static Detailedness DETAILEDNESS_DEFAULT = DomainSimilarityCalculator.Detailedness.PUNCTILIOUS;
+ final static private String SPECIES_MATRIX_OPTION = "smatrix";
+ final static private String DETAILEDNESS_BASIC = "basic";
+ final static private String DETAILEDNESS_LIST_IDS = "list_ids";
+ final static private String DETAILEDNESS_PUNCTILIOUS = "punctilious";
+ final static private String DOMAIN_SIMILARITY_SORT_OPTION = "sort";
+ private static final DomainSimilaritySortField DOMAIN_SORT_FILD_DEFAULT = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID;
+ final static private String DOMAIN_SIMILARITY_SORT_MIN = "min";
+ final static private String DOMAIN_SIMILARITY_SORT_MAX = "max";
+ final static private String DOMAIN_SIMILARITY_SORT_SD = "sd";
+ final static private String DOMAIN_SIMILARITY_SORT_MEAN = "mean";
+ final static private String DOMAIN_SIMILARITY_SORT_DIFF = "diff";
+ final static private String DOMAIN_SIMILARITY_SORT_COUNTS_DIFF = "count_diff";
+ final static private String DOMAIN_SIMILARITY_SORT_ABS_COUNTS_DIFF = "abs_count_diff";
+ final static private String DOMAIN_SIMILARITY_SORT_SPECIES_COUNT = "species";
+ final static private String DOMAIN_SIMILARITY_SORT_ALPHA = "alpha";
+ final static private String DOMAIN_SIMILARITY_SORT_BY_SPECIES_COUNT_FIRST_OPTION = "species_first";
+ final static private String DOMAIN_COUNT_SORT_OPTION = "dc_sort";
+ private static final GenomeWideCombinableDomainsSortOrder DOMAINS_SORT_ORDER_DEFAULT = GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder.ALPHABETICAL_KEY_ID;
+ final static private String DOMAIN_COUNT_SORT_ALPHA = "alpha";
+ final static private String DOMAIN_COUNT_SORT_KEY_DOMAIN_COUNT = "dom";
+ final static private String DOMAIN_COUNT_SORT_KEY_DOMAIN_PROTEINS_COUNT = "prot";
+ final static private String DOMAIN_COUNT_SORT_COMBINATIONS_COUNT = "comb";
+ final static private String CUTOFF_SCORE_FILE_OPTION = "cos";
+ final static private String NOT_IGNORE_DUFS_OPTION = "dufs";
+ final static private String MAX_E_VALUE_OPTION = "e";
+ final static private String MAX_ALLOWED_OVERLAP_OPTION = "mo";
+ final static private String NO_ENGULFING_OVERLAP_OPTION = "no_eo";
+ final static private String IGNORE_COMBINATION_WITH_SAME_OPTION = "ignore_self_comb";
+ final static private String PAIRWISE_DOMAIN_COMPARISONS_PREFIX = "pwc_";
+ final static private String PAIRWISE_DOMAIN_COMPARISONS_OPTION = "pwc";
+ final static private String OUTPUT_FILE_OPTION = "o";
+ final static private String PFAM_TO_GO_FILE_USE_OPTION = "p2g";
+ final static private String GO_OBO_FILE_USE_OPTION = "obo";
+ final static private String GO_NAMESPACE_LIMIT_OPTION = "go_namespace";
+ final static private String GO_NAMESPACE_LIMIT_OPTION_MOLECULAR_FUNCTION = "molecular_function";
+ final static private String GO_NAMESPACE_LIMIT_OPTION_BIOLOGICAL_PROCESS = "biological_process";
+ final static private String GO_NAMESPACE_LIMIT_OPTION_CELLULAR_COMPONENT = "cellular_component";
+ final static private String SECONDARY_FEATURES_PARSIMONY_MAP_FILE = "secondary";
+ final static private String DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_TAB_DELIMITED = "simple_tab";
+ final static private String DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_HTML = "simple_html";
+ final static private String DOMAIN_SIMILARITY_PRINT_OPTION_DETAILED_HTML = "detailed_html";
+ final static private String DOMAIN_SIMILARITY_PRINT_OPTION = "ds_output";
+ private static final PRINT_OPTION DOMAIN_SIMILARITY_PRINT_OPTION_DEFAULT = PrintableDomainSimilarity.PRINT_OPTION.HTML;
+ final static private String IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_OPTION = "ignore_singlet_domains";
+ final static private String IGNORE_VIRAL_IDS = "ignore_viral_ids";
+ final static private boolean IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_DEFAULT = false;
+ final static private String IGNORE_DOMAINS_SPECIFIC_TO_ONE_SPECIES_OPTION = "ignore_species_specific_domains";
+ final static private boolean IGNORE_DOMAINS_SPECIFIC_TO_ONE_SPECIES_OPTION_DEFAULT = false;
+ final static private String MATRIX_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX = "_mean_score.pwd";
+ final static private String MATRIX_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX = "_domains.pwd";
+ final static private String MATRIX_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX = "_bin_combinations.pwd";
+ final static private String NJ_TREE_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX = "_mean_score_NJ"
+ + ForesterConstants.PHYLO_XML_SUFFIX;
+ final static private String NJ_TREE_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX = "_domains_NJ"
+ + ForesterConstants.PHYLO_XML_SUFFIX;
+ final static private String NJ_TREE_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX = "_bin_combinations_NJ"
+ + ForesterConstants.PHYLO_XML_SUFFIX;
+ final static private String JACKNIFE_OPTION = "jack";
+ final static private String JACKNIFE_RANDOM_SEED_OPTION = "seed";
+ final static private String JACKNIFE_RATIO_OPTION = "jack_ratio";
+ private static final int JACKNIFE_NUMBER_OF_RESAMPLINGS_DEFAULT = 100;
+ final static private long JACKNIFE_RANDOM_SEED_DEFAULT = 19;
+ final static private double JACKNIFE_RATIO_DEFAULT = 0.5;
//final static private String INFER_SPECIES_TREES_OPTION = "species_tree_inference";
- final static private String INFERRED_SD_BASED_NJ_SPECIES_TREE_SUFFIX = "_sd_nj.nh";
- final static private String INFERRED_SBC_BASED_NJ_SPECIES_TREE_SUFFIX = "_sbc_nj.nh";
- final static private String FILTER_POSITIVE_OPTION = "pos_filter";
- final static private String FILTER_NEGATIVE_OPTION = "neg_filter";
- final static private String FILTER_NEGATIVE_DOMAINS_OPTION = "neg_dom_filter";
- final static private String INPUT_FILES_FROM_FILE_OPTION = "input";
- final static private String INPUT_SPECIES_TREE_OPTION = "species_tree";
- final static private String SEQ_EXTRACT_OPTION = "prot_extract";
- final static private char SEPARATOR_FOR_INPUT_VALUES = '#';
- final static private String PRG_VERSION = "2.210";
- final static private String PRG_DATE = "2011.12.08";
- final static private String E_MAIL = "czmasek@burnham.org";
- final static private String WWW = "www.phylosoft.org/forester/applications/surfacing";
- final static private boolean IGNORE_DUFS_DEFAULT = true;
- final static private boolean IGNORE_COMBINATION_WITH_SAME_DEFAULLT = false;
- final static private double MAX_E_VALUE_DEFAULT = -1;
- final static private int MAX_ALLOWED_OVERLAP_DEFAULT = -1;
- private static final String RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION = "random_seed";
- private static final String CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS = "consider_bdc_direction";
- private static final String CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS_AND_ADJACENCY = "consider_bdc_adj";
- private static final String SEQ_EXTRACT_SUFFIX = ".prot";
- private static final String PLUS_MINUS_ANALYSIS_OPTION = "plus_minus";
- private static final String PLUS_MINUS_DOM_SUFFIX = "_plus_minus_dom.txt";
- private static final String PLUS_MINUS_DOM_SUFFIX_HTML = "_plus_minus_dom.html";
- private static final String PLUS_MINUS_DC_SUFFIX_HTML = "_plus_minus_dc.html";
- private static final int PLUS_MINUS_ANALYSIS_MIN_DIFF_DEFAULT = 0;
- private static final double PLUS_MINUS_ANALYSIS_FACTOR_DEFAULT = 1.0;
- private static final String PLUS_MINUS_ALL_GO_IDS_DOM_SUFFIX = "_plus_minus_go_ids_all.txt";
- private static final String PLUS_MINUS_PASSING_GO_IDS_DOM_SUFFIX = "_plus_minus_go_ids_passing.txt";
- private static final String OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS = "all_prot";
- private static final boolean VERBOSE = false;
- private static final String OUTPUT_DOMAIN_COMBINATIONS_GAINED_MORE_THAN_ONCE_ANALYSIS_SUFFIX = "_fitch_dc_gains_counts";
- private static final String OUTPUT_DOMAIN_COMBINATIONS_LOST_MORE_THAN_ONCE_ANALYSIS_SUFFIX = "_fitch_dc_losses_counts";
- private static final String DOMAIN_LENGTHS_ANALYSIS_SUFFIX = "_domain_lengths_analysis";
- private static final boolean PERFORM_DOMAIN_LENGTH_ANALYSIS = true;
- public static final String ALL_PFAMS_ENCOUNTERED_SUFFIX = "_all_encountered_pfams";
- public static final String ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX = "_all_encountered_pfams_with_go_annotation";
- public static final String ENCOUNTERED_PFAMS_SUMMARY_SUFFIX = "_encountered_pfams_summary";
- public static final String ALL_PFAMS_GAINED_AS_DOMAINS_SUFFIX = "_all_pfams_gained_as_domains";
- public static final String ALL_PFAMS_LOST_AS_DOMAINS_SUFFIX = "_all_pfams_lost_as_domains";
- public static final String ALL_PFAMS_GAINED_AS_DC_SUFFIX = "_all_pfams_gained_as_dc";
- public static final String ALL_PFAMS_LOST_AS_DC_SUFFIX = "_all_pfams_lost_as_dc";
- public static final String BASE_DIRECTORY_PER_NODE_DOMAIN_GAIN_LOSS_FILES = "PER_NODE_EVENTS";
- public static final String BASE_DIRECTORY_PER_SUBTREE_DOMAIN_GAIN_LOSS_FILES = "PER_SUBTREE_EVENTS";
- public static final String D_PROMISCUITY_FILE_SUFFIX = "_domain_promiscuities";
- private static final String LOG_FILE_SUFFIX = "_log.txt";
- private static final String DATA_FILE_SUFFIX = "_domain_combination_data.txt";
- private static final String DATA_FILE_DESC = "#SPECIES\tPRTEIN_ID\tN_TERM_DOMAIN\tC_TERM_DOMAIN\tN_TERM_DOMAIN_PER_DOMAIN_E_VALUE\tC_TERM_DOMAIN_PER_DOMAIN_E_VALUE\tN_TERM_DOMAIN_COUNTS_PER_PROTEIN\tC_TERM_DOMAIN_COUNTS_PER_PROTEIN";
- private static final INDIVIDUAL_SCORE_CUTOFF INDIVIDUAL_SCORE_CUTOFF_DEFAULT = INDIVIDUAL_SCORE_CUTOFF.FULL_SEQUENCE;
- public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_counts.txt";
- public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists.txt";
- public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping.txt";
- public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_unique.txt";
+ final static private String FILTER_POSITIVE_OPTION = "pos_filter";
+ final static private String FILTER_NEGATIVE_OPTION = "neg_filter";
+ final static private String FILTER_NEGATIVE_DOMAINS_OPTION = "neg_dom_filter";
+ final static private String INPUT_FILES_FROM_FILE_OPTION = "input";
+ final static private String INPUT_SPECIES_TREE_OPTION = "species_tree";
+ final static private String SEQ_EXTRACT_OPTION = "prot_extract";
+ final static private char SEPARATOR_FOR_INPUT_VALUES = '#';
+ final static private String PRG_VERSION = "2.252";
+ final static private String PRG_DATE = "2012.08.01";
+ final static private String E_MAIL = "czmasek@burnham.org";
+ final static private String WWW = "www.phylosoft.org/forester/applications/surfacing";
+ final static private boolean IGNORE_DUFS_DEFAULT = true;
+ final static private boolean IGNORE_COMBINATION_WITH_SAME_DEFAULLT = false;
+ final static private double MAX_E_VALUE_DEFAULT = -1;
+ final static private int MAX_ALLOWED_OVERLAP_DEFAULT = -1;
+ private static final String RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION = "random_seed";
+ private static final String CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS = "consider_bdc_direction";
+ private static final String CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS_AND_ADJACENCY = "consider_bdc_adj";
+ private static final String SEQ_EXTRACT_SUFFIX = ".prot";
+ private static final String PLUS_MINUS_ANALYSIS_OPTION = "plus_minus";
+ private static final String PLUS_MINUS_DOM_SUFFIX = "_plus_minus_dom.txt";
+ private static final String PLUS_MINUS_DOM_SUFFIX_HTML = "_plus_minus_dom.html";
+ private static final String PLUS_MINUS_DC_SUFFIX_HTML = "_plus_minus_dc.html";
+ private static final int PLUS_MINUS_ANALYSIS_MIN_DIFF_DEFAULT = 0;
+ private static final double PLUS_MINUS_ANALYSIS_FACTOR_DEFAULT = 1.0;
+ private static final String PLUS_MINUS_ALL_GO_IDS_DOM_SUFFIX = "_plus_minus_go_ids_all.txt";
+ private static final String PLUS_MINUS_PASSING_GO_IDS_DOM_SUFFIX = "_plus_minus_go_ids_passing.txt";
+ private static final String OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS = "all_prot";
+ final static private String OUTPUT_LIST_OF_ALL_PROTEINS_PER_DOMAIN_E_VALUE_OPTION = "all_prot_e"; // value is read as a double when the "all_prot" option is also set
+ private static final boolean VERBOSE = false;
+ private static final String OUTPUT_DOMAIN_COMBINATIONS_GAINED_MORE_THAN_ONCE_ANALYSIS_SUFFIX = "_fitch_dc_gains_counts";
+ private static final String OUTPUT_DOMAIN_COMBINATIONS_LOST_MORE_THAN_ONCE_ANALYSIS_SUFFIX = "_fitch_dc_losses_counts";
+ private static final String DOMAIN_LENGTHS_ANALYSIS_SUFFIX = "_domain_lengths_analysis";
+ private static final boolean PERFORM_DOMAIN_LENGTH_ANALYSIS = true;
+ public static final String ALL_PFAMS_ENCOUNTERED_SUFFIX = "_all_encountered_pfams";
+ public static final String ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX = "_all_encountered_pfams_with_go_annotation";
+ public static final String ENCOUNTERED_PFAMS_SUMMARY_SUFFIX = "_encountered_pfams_summary";
+ public static final String ALL_PFAMS_GAINED_AS_DOMAINS_SUFFIX = "_all_pfams_gained_as_domains";
+ public static final String ALL_PFAMS_LOST_AS_DOMAINS_SUFFIX = "_all_pfams_lost_as_domains";
+ public static final String ALL_PFAMS_GAINED_AS_DC_SUFFIX = "_all_pfams_gained_as_dc";
+ public static final String ALL_PFAMS_LOST_AS_DC_SUFFIX = "_all_pfams_lost_as_dc";
+ public static final String BASE_DIRECTORY_PER_NODE_DOMAIN_GAIN_LOSS_FILES = "PER_NODE_EVENTS";
+ public static final String BASE_DIRECTORY_PER_SUBTREE_DOMAIN_GAIN_LOSS_FILES = "PER_SUBTREE_EVENTS";
+ public static final String D_PROMISCUITY_FILE_SUFFIX = "_domain_promiscuities";
+ private static final String LOG_FILE_SUFFIX = "_log.txt";
+ private static final String DATA_FILE_SUFFIX = "_domain_combination_data.txt";
+ private static final String DATA_FILE_DESC = "#SPECIES\tPRTEIN_ID\tN_TERM_DOMAIN\tC_TERM_DOMAIN\tN_TERM_DOMAIN_PER_DOMAIN_E_VALUE\tC_TERM_DOMAIN_PER_DOMAIN_E_VALUE\tN_TERM_DOMAIN_COUNTS_PER_PROTEIN\tC_TERM_DOMAIN_COUNTS_PER_PROTEIN";
+ private static final INDIVIDUAL_SCORE_CUTOFF INDIVIDUAL_SCORE_CUTOFF_DEFAULT = INDIVIDUAL_SCORE_CUTOFF.FULL_SEQUENCE; // TODO(review): confirm FULL_SEQUENCE is the intended default cutoff, or change it.
+ public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_counts.txt";
+ public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists.txt";
+ public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping.txt";
+ public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_unique.txt";
+ public static final String LIMIT_SPEC_FOR_PROT_EX = null; // e.g. "HUMAN"; null (the default) disables this feature.
+ public static final String BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH_MAPPED = "_dc_MAPPED_secondary_features_fitch"
+ + ForesterConstants.PHYLO_XML_SUFFIX;
+ public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_counts_MAPPED.txt";
+ public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_MAPPED.txt";
+ public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_MAPPED.txt";
+ public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_unique_MAPPED.txt";
+ private static final boolean PERFORM_DC_REGAIN_PROTEINS_STATS = true;
private static void checkWriteabilityForPairwiseComparisons( final PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option,
final String[][] input_file_properties,
* @param sum_of_all_domains_encountered
* @param all_bin_domain_combinations_encountered
* @param is_gains_analysis
+ * @param protein_length_stats_by_dc
* @throws IOException
*/
private static void executeFitchGainsAnalysis( final File output_file,
return intrees;
}
- private static List<Phylogeny> inferSpeciesTrees( final File outfile, final List<DistanceMatrix> distances_list ) {
- final NeighborJoining nj = NeighborJoining.createInstance();
- final List<Phylogeny> phylogenies = nj.execute( distances_list );
- final PhylogenyWriter w = new PhylogenyWriter();
- try {
- w.toNewHampshire( phylogenies, true, true, outfile, ";" );
- }
- catch ( final IOException e ) {
- ForesterUtil.fatalError( PRG_NAME, "failed to write to outfile [" + outfile + "]: " + e.getMessage() );
- }
- return phylogenies;
- }
-
private static void log( final String msg, final Writer w ) {
try {
w.write( msg );
allowed_options.add( FILTER_NEGATIVE_DOMAINS_OPTION );
allowed_options.add( IGNORE_VIRAL_IDS );
allowed_options.add( SEQ_EXTRACT_OPTION );
+ allowed_options.add( OUTPUT_LIST_OF_ALL_PROTEINS_PER_DOMAIN_E_VALUE_OPTION );
allowed_options.add( SECONDARY_FEATURES_PARSIMONY_MAP_FILE );
allowed_options.add( PLUS_MINUS_ANALYSIS_OPTION );
allowed_options.add( DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS );
species_matrix = true;
}
boolean output_protein_lists_for_all_domains = false;
+ double output_list_of_all_proteins_per_domain_e_value_max = -1;
if ( cla.isOptionSet( surfacing.OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS ) ) {
output_protein_lists_for_all_domains = true;
+ //
+ if ( cla.isOptionSet( surfacing.OUTPUT_LIST_OF_ALL_PROTEINS_PER_DOMAIN_E_VALUE_OPTION ) ) {
+ try {
+ output_list_of_all_proteins_per_domain_e_value_max = cla
+ .getOptionValueAsDouble( surfacing.OUTPUT_LIST_OF_ALL_PROTEINS_PER_DOMAIN_E_VALUE_OPTION );
+ }
+ catch ( final Exception e ) {
+ ForesterUtil.fatalError( surfacing.PRG_NAME, "no acceptable value for per domain E-value maximum" );
+ }
+ }
+ //
}
Detailedness detailedness = DETAILEDNESS_DEFAULT;
if ( cla.isOptionSet( surfacing.DETAILEDNESS_OPTION ) ) {
System.out.println( "E-value maximum (inclusive) : " + e_value_max );
html_desc.append( "<tr><td>E-value maximum (inclusive):</td><td>" + e_value_max + "</td></tr>" + nl );
}
+ if ( output_protein_lists_for_all_domains ) {
+ System.out.println( "Domain E-value max : " + output_list_of_all_proteins_per_domain_e_value_max );
+ html_desc.append( "<tr><td>Protein lists: E-value maximum per domain (inclusive):</td><td>"
+ + output_list_of_all_proteins_per_domain_e_value_max + "</td></tr>" + nl );
+ }
System.out.println( "Ignore DUFs : " + ignore_dufs );
if ( ignore_virus_like_ids ) {
System.out.println( "Ignore virus like ids : " + ignore_virus_like_ids );
System.out.println( "Ignore combination with self: " + ignore_combination_with_same );
html_desc.append( "<tr><td>Ignore combination with self for domain combination similarity analyses:</td><td>"
+ ignore_combination_with_same + "</td></tr>" + nl );
- ;
System.out.println( "Consider directedness : "
+ ( dc_type != BinaryDomainCombination.DomainCombinationType.BASIC ) );
html_desc.append( "<tr><td>Consider directedness of binary domain combinations:</td><td>"
catch ( final IOException e3 ) {
e3.printStackTrace();
}
+ Map<String, DescriptiveStatistics> protein_length_stats_by_dc = null;
+ Map<String, DescriptiveStatistics> domain_number_stats_by_dc = null;
+ final Map<String, DescriptiveStatistics> domain_length_stats_by_domain = new HashMap<String, DescriptiveStatistics>();
+ if ( PERFORM_DC_REGAIN_PROTEINS_STATS ) {
+ protein_length_stats_by_dc = new HashMap<String, DescriptiveStatistics>();
+ domain_number_stats_by_dc = new HashMap<String, DescriptiveStatistics>();
+ }
// Main loop:
for( int i = 0; i < number_of_genomes; ++i ) {
System.out.println();
dc_data_writer.write( SurfacingUtil.proteinToDomainCombinations( protein, count + "", "\t" )
.toString() );
++count;
+ for( final Domain d : protein.getProteinDomains() ) {
+ final String d_str = d.getDomainId().toString();
+ if ( !domain_length_stats_by_domain.containsKey( d_str ) ) {
+ domain_length_stats_by_domain.put( d_str, new BasicDescriptiveStatistics() );
+ }
+ domain_length_stats_by_domain.get( d_str ).addValue( d.getLength() );
+ }
}
}
catch ( final IOException e ) {
ignore_combination_with_same,
new BasicSpecies( input_file_properties[ i ][ 1 ] ),
domain_id_to_go_ids_map,
- dc_type ) );
+ dc_type,
+ protein_length_stats_by_dc,
+ domain_number_stats_by_dc ) );
domain_lengths_table.addLengths( protein_list );
if ( gwcd_list.get( i ).getSize() > 0 ) {
SurfacingUtil.writeDomainCombinationsCountsFile( input_file_properties,
SurfacingUtil.extractProteinNames( protein_list,
query_domain_ids_array[ j ],
query_domains_writer_ary[ j ],
- "\t" );
+ "\t",
+ LIMIT_SPEC_FOR_PROT_EX );
query_domains_writer_ary[ j ].flush();
}
catch ( final IOException e ) {
output_binary_domain_combinationsfor_graph_analysis,
all_bin_domain_combinations_gained_fitch,
all_bin_domain_combinations_lost_fitch,
- dc_type );
+ dc_type,
+ protein_length_stats_by_dc,
+ domain_number_stats_by_dc,
+ domain_length_stats_by_domain );
// Listing of all domain combinations gained is only done if only one input tree is used.
if ( ( domain_id_to_secondary_features_maps != null )
&& ( domain_id_to_secondary_features_maps.length > 0 ) ) {
plus_minus_analysis_numbers );
}
if ( output_protein_lists_for_all_domains ) {
- writeProteinListsForAllSpecies( out_dir, protein_lists_per_species, gwcd_list );
- }
- // if ( ( intrees != null ) && ( intrees.length > 0 ) && ( inferred_trees != null ) && ( inferred_trees.size() > 0 ) ) {
- // final StringBuilder parameters_sb = createParametersAsString( ignore_dufs,
- // e_value_max,
- // max_allowed_overlap,
- // no_engulfing_overlaps,
- // cutoff_scores_file );
- // String s = "_";
- // if ( radomize_fitch_parsimony ) {
- // s += random_number_seed_for_fitch_parsimony + "_";
- // }
- // int i = 0;
- // for( final Phylogeny inferred_tree : inferred_trees ) {
- // if ( !inferred_tree.isRooted() ) {
- // intrees[ 0 ].getRoot().getName();
- // inferred_tree.r
- // }
- // final String outfile_name = ForesterUtil.removeSuffix( inferred_tree.getName() ) + s;
- // final DomainParsimonyCalculator domain_parsimony = DomainParsimonyCalculator
- // .createInstance( inferred_tree, gwcd_list );
- // SurfacingUtil.executeParsimonyAnalysis( random_number_seed_for_fitch_parsimony,
- // radomize_fitch_parsimony,
- // outfile_name,
- // domain_parsimony,
- // inferred_tree,
- // domain_id_to_go_ids_map,
- // go_id_to_term_map,
- // go_namespace_limit,
- // parameters_sb.toString() );
- // i++;
- // }
- // }
+ writeProteinListsForAllSpecies( out_dir,
+ protein_lists_per_species,
+ gwcd_list,
+ output_list_of_all_proteins_per_domain_e_value_max );
+ }
if ( all_bin_domain_combinations_gained_fitch != null ) {
try {
executeFitchGainsAnalysis( new File( output_file
final Runtime rt = java.lang.Runtime.getRuntime();
final long free_memory = rt.freeMemory() / 1000000;
final long total_memory = rt.totalMemory() / 1000000;
- System.out.println();
- System.out.println( "Time for analysis : " + ( new Date().getTime() - analysis_start_time ) + "ms" );
- System.out.println( "Total running time: " + ( new Date().getTime() - start_time ) + "ms " );
- System.out.println( "Free memory : " + free_memory + "MB, total memory: " + total_memory + "MB" );
- System.out.println();
- System.out.println( "If this application is useful to you, please cite:" );
- System.out.println( surfacing.WWW );
- System.out.println();
+ ForesterUtil.programMessage( PRG_NAME, "Time for analysis : " + ( new Date().getTime() - analysis_start_time )
+ + "ms" );
+ ForesterUtil.programMessage( PRG_NAME, "Total running time: " + ( new Date().getTime() - start_time ) + "ms " );
+ ForesterUtil.programMessage( PRG_NAME, "Free memory : " + free_memory + "MB, total memory: "
+ + total_memory + "MB" );
+ ForesterUtil.programMessage( PRG_NAME, "If this application is useful to you, please cite:" );
+ ForesterUtil.programMessage( PRG_NAME, surfacing.WWW );
ForesterUtil.programMessage( PRG_NAME, "OK" );
System.out.println();
}
}
}
- // public static StringBuffer stringCombinableDomainsMapToStringBuffer(
- // final SortedMap<String, CombinableDomains> map ) {
- // final StringBuffer sb = new StringBuffer();
- // for( final Iterator<String> iter = map.keySet().iterator();
- // iter.hasNext(); ) {
- // final Object key = iter.next();
- // sb.append( ForesterUtil.pad( new StringBuffer( key.toString() ), 18, ' ',
- // false ) );
- // final CombinableDomains domain_combination = map.get( key );
- // sb.append( ForesterUtil.pad( new StringBuffer( "" +
- // domain_combination.getNumberOfCombiningDomains() ), 8,
- // ' ', false ) );
- // sb.append( domain_combination.toStringBuffer() );
- // sb.append( ForesterUtil.getLineSeparator() );
- // }
- // return sb;
- // }
private static void printHelp() {
System.out.println();
System.out.println( "Usage:" );
System.out.println( surfacing.DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS
+ ": to output binary domain combinations for (downstream) graph analysis" );
System.out.println( surfacing.OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS + ": to output all proteins per domain" );
+ System.out.println( surfacing.OUTPUT_LIST_OF_ALL_PROTEINS_PER_DOMAIN_E_VALUE_OPTION
+ + ": e value max per domain for output of all proteins per domain" );
System.out.println();
+ System.out.println( "Example 1: java -Xms128m -Xmx512m -cp path/to/forester.jar"
+ + " org.forester.application.surfacing p2g=pfam2go_2012_02_07.txt -dufs -cos=Pfam_260_NC1"
+ + " -no_eo -mo=0 -input=genomes_limited.txt -out_dir=out -o=o "
+ + " -species_tree=tol.xml -obo=gene_ontology_2012_02_07.obo -pos_filter=f.txt -all_prot" );
System.out.println();
- System.out.println( "Example: java -Xms128m -Xmx512m -cp path/to/forester.jar"
+ System.out.println( "Example 2: java -Xms128m -Xmx512m -cp path/to/forester.jar"
+ " org.forester.application.surfacing -detail=punctilious -o=TEST.html -pwc=TEST"
+ " -cos=Pfam_ls_22_TC2 -p2g=pfam2go -obo=gene_ontology_edit.obo "
+ "-dc_sort=dom -ignore_with_self -no_singles -e=0.001 -mo=1 -no_eo "
- + "-ds_output=detailed_html -scoring=domains -sort=alpha -" + JACKNIFE_OPTION
- + "=50 human mouse brafl strpu" );
+ + "-ds_output=detailed_html -scoring=domains -sort=alpha human mouse brafl strpu" );
System.out.println();
}
private static void writeProteinListsForAllSpecies( final File output_dir,
final SortedMap<Species, List<Protein>> protein_lists_per_species,
- final List<GenomeWideCombinableDomains> gwcd_list ) {
+ final List<GenomeWideCombinableDomains> gwcd_list,
+ final double domain_e_cutoff ) {
final SortedSet<DomainId> all_domains = new TreeSet<DomainId>();
for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
all_domains.addAll( gwcd.getAllDomainIds() );
SurfacingUtil.checkForOutputFileWriteability( out );
try {
final Writer proteins_file_writer = new BufferedWriter( new FileWriter( out ) );
- SurfacingUtil.extractProteinNames( protein_lists_per_species, domain, proteins_file_writer, "\t" );
+ SurfacingUtil.extractProteinNames( protein_lists_per_species,
+ domain,
+ proteins_file_writer,
+ "\t",
+ LIMIT_SPEC_FOR_PROT_EX,
+ domain_e_cutoff );
proteins_file_writer.close();
}
catch ( final IOException e ) {