From: cmzmasek@gmail.com Date: Wed, 6 Nov 2013 20:28:27 +0000 (+0000) Subject: inprogress X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=3d68fcd1fc4272b56546f78f3cbd437237af5cfa;p=jalview.git inprogress --- diff --git a/forester/java/src/org/forester/application/surfacing.java b/forester/java/src/org/forester/application/surfacing.java index 413f45e..302800d 100644 --- a/forester/java/src/org/forester/application/surfacing.java +++ b/forester/java/src/org/forester/application/surfacing.java @@ -63,9 +63,6 @@ import org.forester.surfacing.CombinationsBasedPairwiseDomainSimilarityCalculato import org.forester.surfacing.DomainCountsBasedPairwiseSimilarityCalculator; import org.forester.surfacing.DomainLengthsTable; import org.forester.surfacing.DomainParsimonyCalculator; -import org.forester.surfacing.DomainSimilarity; -import org.forester.surfacing.DomainSimilarity.DomainSimilarityScoring; -import org.forester.surfacing.DomainSimilarity.DomainSimilaritySortField; import org.forester.surfacing.DomainSimilarityCalculator; import org.forester.surfacing.DomainSimilarityCalculator.Detailedness; import org.forester.surfacing.GenomeWideCombinableDomains; @@ -74,6 +71,7 @@ import org.forester.surfacing.MappingResults; import org.forester.surfacing.PairwiseDomainSimilarityCalculator; import org.forester.surfacing.PairwiseGenomeComparator; import org.forester.surfacing.PrintableDomainSimilarity; +import org.forester.surfacing.PrintableDomainSimilarity.DomainSimilarityScoring; import org.forester.surfacing.PrintableDomainSimilarity.PRINT_OPTION; import org.forester.surfacing.ProteinCountsBasedPairwiseDomainSimilarityCalculator; import org.forester.surfacing.SurfacingUtil; @@ -87,188 +85,188 @@ import org.forester.util.ForesterUtil; public class surfacing { - private static final int MINIMAL_NUMBER_OF_SIMILARITIES_FOR_SPLITTING = 1000; - public final static String DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS = "graph_analysis_out"; - public final static String DOMAIN_COMBINITONS_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS = "_dc.dot"; - public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_BC_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS = "_fitch_present_dc.dot"; - public final static String DOMAIN_COMBINITON_COUNTS_OUTPUTFILE_SUFFIX = ".dcc"; + private static final int MINIMAL_NUMBER_OF_SIMILARITIES_FOR_SPLITTING = 1000; + public final static String DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS = "graph_analysis_out"; + public final static String DOMAIN_COMBINITONS_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS = "_dc.dot"; + public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_BC_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS = "_fitch_present_dc.dot"; + public final static String DOMAIN_COMBINITON_COUNTS_OUTPUTFILE_SUFFIX = ".dcc"; // gain/loss: - public final static String PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_DOMAINS = "_dollo_gl_d"; - public final static String PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_BINARY_COMBINATIONS = "_dollo_gl_dc"; - public final static String PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_DOMAINS = "_fitch_gl_d"; - public final static String PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_BINARY_COMBINATIONS = "_fitch_gl_dc"; + public final static String PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_DOMAINS = "_dollo_gl_d"; + public final static String PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_BINARY_COMBINATIONS = "_dollo_gl_dc"; + public final static String PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_DOMAINS = "_fitch_gl_d"; + public final static String PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_BINARY_COMBINATIONS = "_fitch_gl_dc"; // gain/loss counts: - public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_DOMAINS = "_dollo_glc_d"; - public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_BINARY_COMBINATIONS = "_dollo_glc_dc"; - public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_DOMAINS = "_fitch_glc_d"; - public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_BINARY_COMBINATIONS = "_fitch_glc_dc"; + public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_DOMAINS = "_dollo_glc_d"; + public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_BINARY_COMBINATIONS = "_dollo_glc_dc"; + public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_DOMAINS = "_fitch_glc_d"; + public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_BINARY_COMBINATIONS = "_fitch_glc_dc"; // tables: - public final static String PARSIMONY_OUTPUT_FITCH_GAINS_BC = "_fitch_gains_dc"; - public final static String PARSIMONY_OUTPUT_FITCH_GAINS_HTML_BC = "_fitch_gains_dc.html"; - public final static String PARSIMONY_OUTPUT_FITCH_LOSSES_BC = "_fitch_losses_dc"; - public final static String PARSIMONY_OUTPUT_FITCH_LOSSES_HTML_BC = "_fitch_losses_dc.html"; - public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_BC = "_fitch_present_dc"; - public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_HTML_BC = "_fitch_present_dc.html"; - public final static String PARSIMONY_OUTPUT_DOLLO_GAINS_D = "_dollo_gains_d"; - public final static String PARSIMONY_OUTPUT_DOLLO_GAINS_HTML_D = "_dollo_gains_d.html"; - public final static String PARSIMONY_OUTPUT_DOLLO_LOSSES_D = "_dollo_losses_d"; - public final static String PARSIMONY_OUTPUT_DOLLO_LOSSES_HTML_D = "_dollo_losses_d.html"; - public final static String PARSIMONY_OUTPUT_DOLLO_PRESENT_D = "_dollo_present_d"; - public final static String PARSIMONY_OUTPUT_DOLLO_PRESENT_HTML_D = "_dollo_present_d.html"; - public final static String DOMAINS_PRESENT_NEXUS = "_dom.nex"; - public final static String BDC_PRESENT_NEXUS = "_dc.nex"; + public final static String PARSIMONY_OUTPUT_FITCH_GAINS_BC = "_fitch_gains_dc"; + public final static String PARSIMONY_OUTPUT_FITCH_GAINS_HTML_BC = "_fitch_gains_dc.html"; + public final static String PARSIMONY_OUTPUT_FITCH_LOSSES_BC = "_fitch_losses_dc"; + public final static String PARSIMONY_OUTPUT_FITCH_LOSSES_HTML_BC = "_fitch_losses_dc.html"; + public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_BC = "_fitch_present_dc"; + public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_HTML_BC = "_fitch_present_dc.html"; + public final static String PARSIMONY_OUTPUT_DOLLO_GAINS_D = "_dollo_gains_d"; + public final static String PARSIMONY_OUTPUT_DOLLO_GAINS_HTML_D = "_dollo_gains_d.html"; + public final static String PARSIMONY_OUTPUT_DOLLO_LOSSES_D = "_dollo_losses_d"; + public final static String PARSIMONY_OUTPUT_DOLLO_LOSSES_HTML_D = "_dollo_losses_d.html"; + public final static String PARSIMONY_OUTPUT_DOLLO_PRESENT_D = "_dollo_present_d"; + public final static String PARSIMONY_OUTPUT_DOLLO_PRESENT_HTML_D = "_dollo_present_d.html"; + public final static String DOMAINS_PRESENT_NEXUS = "_dom.nex"; + public final static String BDC_PRESENT_NEXUS = "_dc.nex"; // --- - public final static String PRG_NAME = "surfacing"; - public static final String DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO = "_d_dollo" - + ForesterConstants.PHYLO_XML_SUFFIX; - public static final String DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH = "_d_fitch" - + ForesterConstants.PHYLO_XML_SUFFIX; - public static final String BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO = "_dc_dollo" - + ForesterConstants.PHYLO_XML_SUFFIX; - public static final String BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH = "_dc_fitch" - + ForesterConstants.PHYLO_XML_SUFFIX; - public static final String NEXUS_EXTERNAL_DOMAINS = "_dom.nex"; - public static final String NEXUS_EXTERNAL_DOMAIN_COMBINATIONS = "_dc.nex"; - public static final String NEXUS_SECONDARY_FEATURES = "_secondary_features.nex"; - public static final String PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_SECONDARY_FEATURES = "_dollo_gl_secondary_features"; - public static final String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_SECONDARY_FEATURES = "_dollo_glc_secondary_features"; - public static final String PARSIMONY_OUTPUT_DOLLO_GAINS_SECONDARY_FEATURES = "_dollo_gains_secondary_features"; - public static final String PARSIMONY_OUTPUT_DOLLO_LOSSES_SECONDARY_FEATURES = "_dollo_losses_secondary_features"; - public static final String PARSIMONY_OUTPUT_DOLLO_PRESENT_SECONDARY_FEATURES = "_dollo_present_secondary_features"; - public static final String SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO = "_secondary_features_dollo" - + ForesterConstants.PHYLO_XML_SUFFIX; - public static final String PARSIMONY_OUTPUT_DOLLO_ALL_GOID_D_ALL_NAMESPACES = "_dollo_goid_d"; - public static final String PARSIMONY_OUTPUT_FITCH_ALL_GOID_BC_ALL_NAMESPACES = "_fitch_goid_dc"; - final static private String HELP_OPTION_1 = "help"; - final static private String HELP_OPTION_2 = "h"; - final static private String OUTPUT_DIR_OPTION = "out_dir"; - final static private String SCORING_OPTION = "scoring"; - private static final DomainSimilarityScoring SCORING_DEFAULT = DomainSimilarity.DomainSimilarityScoring.COMBINATIONS; - final static private String SCORING_DOMAIN_COUNT_BASED = "domains"; - final static private String SCORING_PROTEIN_COUNT_BASED = "proteins"; - final static private String SCORING_COMBINATION_BASED = "combinations"; - final static private String DETAILEDNESS_OPTION = "detail"; - private final static Detailedness DETAILEDNESS_DEFAULT = DomainSimilarityCalculator.Detailedness.PUNCTILIOUS; - final static private String SPECIES_MATRIX_OPTION = "smatrix"; - final static private String DETAILEDNESS_BASIC = "basic"; - final static private String DETAILEDNESS_LIST_IDS = "list_ids"; - final static private String DETAILEDNESS_PUNCTILIOUS = "punctilious"; - final static private String DOMAIN_SIMILARITY_SORT_OPTION = "sort"; - private static final DomainSimilaritySortField DOMAIN_SORT_FILD_DEFAULT = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; - final static private String DOMAIN_SIMILARITY_SORT_MIN = "min"; - final static private String DOMAIN_SIMILARITY_SORT_MAX = "max"; - final static private String DOMAIN_SIMILARITY_SORT_SD = "sd"; - final static private String DOMAIN_SIMILARITY_SORT_MEAN = "mean"; - final static private String DOMAIN_SIMILARITY_SORT_DIFF = "diff"; - final static private String DOMAIN_SIMILARITY_SORT_COUNTS_DIFF = "count_diff"; - final static private String DOMAIN_SIMILARITY_SORT_ABS_COUNTS_DIFF = "abs_count_diff"; - final static private String DOMAIN_SIMILARITY_SORT_SPECIES_COUNT = "species"; - final static private String DOMAIN_SIMILARITY_SORT_ALPHA = "alpha"; - final static private String DOMAIN_SIMILARITY_SORT_BY_SPECIES_COUNT_FIRST_OPTION = "species_first"; - final static private String DOMAIN_COUNT_SORT_OPTION = "dc_sort"; - private static final GenomeWideCombinableDomainsSortOrder DOMAINS_SORT_ORDER_DEFAULT = GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder.ALPHABETICAL_KEY_ID; - final static private String DOMAIN_COUNT_SORT_ALPHA = "alpha"; - final static private String DOMAIN_COUNT_SORT_KEY_DOMAIN_COUNT = "dom"; - final static private String DOMAIN_COUNT_SORT_KEY_DOMAIN_PROTEINS_COUNT = "prot"; - final static private String DOMAIN_COUNT_SORT_COMBINATIONS_COUNT = "comb"; - final static private String CUTOFF_SCORE_FILE_OPTION = "cos"; - final static private String NOT_IGNORE_DUFS_OPTION = "dufs"; - final static private String MAX_E_VALUE_OPTION = "e"; - final static private String MAX_ALLOWED_OVERLAP_OPTION = "mo"; - final static private String NO_ENGULFING_OVERLAP_OPTION = "no_eo"; - final static private String IGNORE_COMBINATION_WITH_SAME_OPTION = "ignore_self_comb"; - final static private String PERFORM_DC_REGAIN_PROTEINS_STATS_OPTION = "dc_regain_stats"; - final static private String DA_ANALYSIS_OPTION = "DA_analyis"; - final static private String USE_LAST_IN_FITCH_OPTION = "last"; - public final static String PAIRWISE_DOMAIN_COMPARISONS_PREFIX = "pwc_"; - final static private String PAIRWISE_DOMAIN_COMPARISONS_OPTION = "pwc"; - final static private String OUTPUT_FILE_OPTION = "o"; - final static private String PFAM_TO_GO_FILE_USE_OPTION = "p2g"; - final static private String GO_OBO_FILE_USE_OPTION = "obo"; - final static private String GO_NAMESPACE_LIMIT_OPTION = "go_namespace"; - final static private String GO_NAMESPACE_LIMIT_OPTION_MOLECULAR_FUNCTION = "molecular_function"; - final static private String GO_NAMESPACE_LIMIT_OPTION_BIOLOGICAL_PROCESS = "biological_process"; - final static private String GO_NAMESPACE_LIMIT_OPTION_CELLULAR_COMPONENT = "cellular_component"; - final static private String SECONDARY_FEATURES_PARSIMONY_MAP_FILE = "secondary"; - final static private String DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_TAB_DELIMITED = "simple_tab"; - final static private String DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_HTML = "simple_html"; - final static private String DOMAIN_SIMILARITY_PRINT_OPTION_DETAILED_HTML = "detailed_html"; - final static private String DOMAIN_SIMILARITY_PRINT_OPTION = "ds_output"; - private static final PRINT_OPTION DOMAIN_SIMILARITY_PRINT_OPTION_DEFAULT = PrintableDomainSimilarity.PRINT_OPTION.HTML; - final static private String IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_OPTION = "ignore_singlet_domains"; - final static private String IGNORE_VIRAL_IDS = "ignore_viral_ids"; - final static private boolean IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_DEFAULT = false; - final static private String IGNORE_DOMAINS_SPECIFIC_TO_ONE_SPECIES_OPTION = "ignore_species_specific_domains"; - final static private boolean IGNORE_DOMAINS_SPECIFIC_TO_ONE_SPECIES_OPTION_DEFAULT = false; - final static private String MATRIX_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX = "_mean_score.pwd"; - final static private String MATRIX_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX = "_domains.pwd"; - final static private String MATRIX_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX = "_bin_combinations.pwd"; - final static private String NJ_TREE_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX = "_mean_score_NJ" - + ForesterConstants.PHYLO_XML_SUFFIX; - final static private String NJ_TREE_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX = "_domains_NJ" - + ForesterConstants.PHYLO_XML_SUFFIX; - final static private String NJ_TREE_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX = "_bin_combinations_NJ" - + ForesterConstants.PHYLO_XML_SUFFIX; - final static private String FILTER_POSITIVE_OPTION = "pos_filter"; - final static private String FILTER_NEGATIVE_OPTION = "neg_filter"; - final static private String FILTER_NEGATIVE_DOMAINS_OPTION = "neg_dom_filter"; - final static private String INPUT_GENOMES_FILE_OPTION = "genomes"; - final static private String INPUT_SPECIES_TREE_OPTION = "species_tree"; - final static private String SEQ_EXTRACT_OPTION = "prot_extract"; - final static private String PRG_VERSION = "2.304"; - final static private String PRG_DATE = "131024"; - final static private String E_MAIL = "czmasek@burnham.org"; - final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester/surfacing"; - final static private boolean IGNORE_DUFS_DEFAULT = true; - final static private boolean IGNORE_COMBINATION_WITH_SAME_DEFAULLT = false; - final static private double MAX_E_VALUE_DEFAULT = -1; - public final static int MAX_ALLOWED_OVERLAP_DEFAULT = -1; - private static final String RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION = "random_seed"; - private static final String CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS = "consider_bdc_direction"; - private static final String CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS_AND_ADJACENCY = "consider_bdc_adj"; - public static final String SEQ_EXTRACT_SUFFIX = ".prot"; - public static final String PLUS_MINUS_ANALYSIS_OPTION = "plus_minus"; - public static final String PLUS_MINUS_DOM_SUFFIX = "_plus_minus_dom.txt"; - public static final String PLUS_MINUS_DOM_SUFFIX_HTML = "_plus_minus_dom.html"; - public static final String PLUS_MINUS_DC_SUFFIX_HTML = "_plus_minus_dc.html"; - public static final int PLUS_MINUS_ANALYSIS_MIN_DIFF_DEFAULT = 0; - public static final double PLUS_MINUS_ANALYSIS_FACTOR_DEFAULT = 1.0; - public static final String PLUS_MINUS_ALL_GO_IDS_DOM_SUFFIX = "_plus_minus_go_ids_all.txt"; - public static final String PLUS_MINUS_PASSING_GO_IDS_DOM_SUFFIX = "_plus_minus_go_ids_passing.txt"; - private static final String OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS = "all_prot"; - final static private String OUTPUT_LIST_OF_ALL_PROTEINS_PER_DOMAIN_E_VALUE_OPTION = "all_prot_e"; - public static final boolean VERBOSE = false; - private static final String OUTPUT_DOMAIN_COMBINATIONS_GAINED_MORE_THAN_ONCE_ANALYSIS_SUFFIX = "_fitch_dc_gains_counts"; - private static final String OUTPUT_DOMAIN_COMBINATIONS_LOST_MORE_THAN_ONCE_ANALYSIS_SUFFIX = "_fitch_dc_losses_counts"; - private static final String DOMAIN_LENGTHS_ANALYSIS_SUFFIX = "_domain_lengths_analysis"; - private static final boolean PERFORM_DOMAIN_LENGTH_ANALYSIS = true; - public static final String ALL_PFAMS_ENCOUNTERED_SUFFIX = "_all_encountered_pfams"; - public static final String ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX = "_all_encountered_pfams_with_go_annotation"; - public static final String ENCOUNTERED_PFAMS_SUMMARY_SUFFIX = "_encountered_pfams_summary"; - public static final String ALL_PFAMS_GAINED_AS_DOMAINS_SUFFIX = "_all_pfams_gained_as_domains"; - public static final String ALL_PFAMS_LOST_AS_DOMAINS_SUFFIX = "_all_pfams_lost_as_domains"; - public static final String ALL_PFAMS_GAINED_AS_DC_SUFFIX = "_all_pfams_gained_as_dc"; - public static final String ALL_PFAMS_LOST_AS_DC_SUFFIX = "_all_pfams_lost_as_dc"; - public static final String BASE_DIRECTORY_PER_NODE_DOMAIN_GAIN_LOSS_FILES = "PER_NODE_EVENTS"; - public static final String BASE_DIRECTORY_PER_SUBTREE_DOMAIN_GAIN_LOSS_FILES = "PER_SUBTREE_EVENTS"; - public static final String D_PROMISCUITY_FILE_SUFFIX = "_domain_promiscuities"; - private static final String LOG_FILE_SUFFIX = "_log.txt"; - private static final String DATA_FILE_SUFFIX = "_domain_combination_data.txt"; - private static final String DATA_FILE_DESC = "#SPECIES\tPRTEIN_ID\tN_TERM_DOMAIN\tC_TERM_DOMAIN\tN_TERM_DOMAIN_PER_DOMAIN_E_VALUE\tC_TERM_DOMAIN_PER_DOMAIN_E_VALUE\tN_TERM_DOMAIN_COUNTS_PER_PROTEIN\tC_TERM_DOMAIN_COUNTS_PER_PROTEIN"; - private static final String WRITE_TO_NEXUS_OPTION = "nexus"; - private static final INDIVIDUAL_SCORE_CUTOFF INDIVIDUAL_SCORE_CUTOFF_DEFAULT = INDIVIDUAL_SCORE_CUTOFF.FULL_SEQUENCE; //TODO look at me! change? - public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_counts.txt"; - public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists.txt"; - public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping.txt"; - public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_unique.txt"; - public static final String LIMIT_SPEC_FOR_PROT_EX = null; // e.g. "HUMAN"; set to null for not using this feature (default). - public static final String BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH_MAPPED = "_dc_MAPPED_secondary_features_fitch" - + ForesterConstants.PHYLO_XML_SUFFIX; - public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_counts_MAPPED.txt"; - public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_MAPPED.txt"; - public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_MAPPED.txt"; - public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_unique_MAPPED.txt"; - private static final boolean CALC_SIMILARITY_SCORES = false; + public final static String PRG_NAME = "surfacing"; + public static final String DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO = "_d_dollo" + + ForesterConstants.PHYLO_XML_SUFFIX; + public static final String DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH = "_d_fitch" + + ForesterConstants.PHYLO_XML_SUFFIX; + public static final String BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO = "_dc_dollo" + + ForesterConstants.PHYLO_XML_SUFFIX; + public static final String BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH = "_dc_fitch" + + ForesterConstants.PHYLO_XML_SUFFIX; + public static final String NEXUS_EXTERNAL_DOMAINS = "_dom.nex"; + public static final String NEXUS_EXTERNAL_DOMAIN_COMBINATIONS = "_dc.nex"; + public static final String NEXUS_SECONDARY_FEATURES = "_secondary_features.nex"; + public static final String PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_SECONDARY_FEATURES = "_dollo_gl_secondary_features"; + public static final String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_SECONDARY_FEATURES = "_dollo_glc_secondary_features"; + public static final String PARSIMONY_OUTPUT_DOLLO_GAINS_SECONDARY_FEATURES = "_dollo_gains_secondary_features"; + public static final String PARSIMONY_OUTPUT_DOLLO_LOSSES_SECONDARY_FEATURES = "_dollo_losses_secondary_features"; + public static final String PARSIMONY_OUTPUT_DOLLO_PRESENT_SECONDARY_FEATURES = "_dollo_present_secondary_features"; + public static final String SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO = "_secondary_features_dollo" + + ForesterConstants.PHYLO_XML_SUFFIX; + public static final String PARSIMONY_OUTPUT_DOLLO_ALL_GOID_D_ALL_NAMESPACES = "_dollo_goid_d"; + public static final String PARSIMONY_OUTPUT_FITCH_ALL_GOID_BC_ALL_NAMESPACES = "_fitch_goid_dc"; + final static private String HELP_OPTION_1 = "help"; + final static private String HELP_OPTION_2 = "h"; + final static private String OUTPUT_DIR_OPTION = "out_dir"; + final static private String SCORING_OPTION = "scoring"; + private static final DomainSimilarityScoring SCORING_DEFAULT = PrintableDomainSimilarity.DomainSimilarityScoring.COMBINATIONS; + final static private String SCORING_DOMAIN_COUNT_BASED = "domains"; + final static private String SCORING_PROTEIN_COUNT_BASED = "proteins"; + final static private String SCORING_COMBINATION_BASED = "combinations"; + final static private String DETAILEDNESS_OPTION = "detail"; + private final static Detailedness DETAILEDNESS_DEFAULT = DomainSimilarityCalculator.Detailedness.PUNCTILIOUS; + final static private String SPECIES_MATRIX_OPTION = "smatrix"; + final static private String DETAILEDNESS_BASIC = "basic"; + final static private String DETAILEDNESS_LIST_IDS = "list_ids"; + final static private String DETAILEDNESS_PUNCTILIOUS = "punctilious"; + final static private String DOMAIN_SIMILARITY_SORT_OPTION = "sort"; + private static final PrintableDomainSimilarity.DomainSimilaritySortField DOMAIN_SORT_FILD_DEFAULT = PrintableDomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + final static private String DOMAIN_SIMILARITY_SORT_MIN = "min"; + final static private String DOMAIN_SIMILARITY_SORT_MAX = "max"; + final static private String DOMAIN_SIMILARITY_SORT_SD = "sd"; + final static private String DOMAIN_SIMILARITY_SORT_MEAN = "mean"; + final static private String DOMAIN_SIMILARITY_SORT_DIFF = "diff"; + final static private String DOMAIN_SIMILARITY_SORT_COUNTS_DIFF = "count_diff"; + final static private String DOMAIN_SIMILARITY_SORT_ABS_COUNTS_DIFF = "abs_count_diff"; + final static private String DOMAIN_SIMILARITY_SORT_SPECIES_COUNT = "species"; + final static private String DOMAIN_SIMILARITY_SORT_ALPHA = "alpha"; + final static private String DOMAIN_SIMILARITY_SORT_BY_SPECIES_COUNT_FIRST_OPTION = "species_first"; + final static private String DOMAIN_COUNT_SORT_OPTION = "dc_sort"; + private static final GenomeWideCombinableDomainsSortOrder DOMAINS_SORT_ORDER_DEFAULT = GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder.ALPHABETICAL_KEY_ID; + final static private String DOMAIN_COUNT_SORT_ALPHA = "alpha"; + final static private String DOMAIN_COUNT_SORT_KEY_DOMAIN_COUNT = "dom"; + final static private String DOMAIN_COUNT_SORT_KEY_DOMAIN_PROTEINS_COUNT = "prot"; + final static private String DOMAIN_COUNT_SORT_COMBINATIONS_COUNT = "comb"; + final static private String CUTOFF_SCORE_FILE_OPTION = "cos"; + final static private String NOT_IGNORE_DUFS_OPTION = "dufs"; + final static private String MAX_E_VALUE_OPTION = "e"; + final static private String MAX_ALLOWED_OVERLAP_OPTION = "mo"; + final static private String NO_ENGULFING_OVERLAP_OPTION = "no_eo"; + final static private String IGNORE_COMBINATION_WITH_SAME_OPTION = "ignore_self_comb"; + final static private String PERFORM_DC_REGAIN_PROTEINS_STATS_OPTION = "dc_regain_stats"; + final static private String DA_ANALYSIS_OPTION = "DA_analyis"; + final static private String USE_LAST_IN_FITCH_OPTION = "last"; + public final static String PAIRWISE_DOMAIN_COMPARISONS_PREFIX = "pwc_"; + final static private String PAIRWISE_DOMAIN_COMPARISONS_OPTION = "pwc"; + final static private String OUTPUT_FILE_OPTION = "o"; + final static private String PFAM_TO_GO_FILE_USE_OPTION = "p2g"; + final static private String GO_OBO_FILE_USE_OPTION = "obo"; + final static private String GO_NAMESPACE_LIMIT_OPTION = "go_namespace"; + final static private String GO_NAMESPACE_LIMIT_OPTION_MOLECULAR_FUNCTION = "molecular_function"; + final static private String GO_NAMESPACE_LIMIT_OPTION_BIOLOGICAL_PROCESS = "biological_process"; + final static private String GO_NAMESPACE_LIMIT_OPTION_CELLULAR_COMPONENT = "cellular_component"; + final static private String SECONDARY_FEATURES_PARSIMONY_MAP_FILE = "secondary"; + final static private String DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_TAB_DELIMITED = "simple_tab"; + final static private String DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_HTML = "simple_html"; + final static private String DOMAIN_SIMILARITY_PRINT_OPTION_DETAILED_HTML = "detailed_html"; + final static private String DOMAIN_SIMILARITY_PRINT_OPTION = "ds_output"; + private static final PRINT_OPTION DOMAIN_SIMILARITY_PRINT_OPTION_DEFAULT = PrintableDomainSimilarity.PRINT_OPTION.HTML; + final static private String IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_OPTION = "ignore_singlet_domains"; + final static private String IGNORE_VIRAL_IDS = "ignore_viral_ids"; + final static private boolean IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_DEFAULT = false; + final static private String IGNORE_DOMAINS_SPECIFIC_TO_ONE_SPECIES_OPTION = "ignore_species_specific_domains"; + final static private boolean IGNORE_DOMAINS_SPECIFIC_TO_ONE_SPECIES_OPTION_DEFAULT = false; + final static private String MATRIX_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX = "_mean_score.pwd"; + final static private String MATRIX_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX = "_domains.pwd"; + final static private String MATRIX_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX = "_bin_combinations.pwd"; + final static private String NJ_TREE_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX = "_mean_score_NJ" + + ForesterConstants.PHYLO_XML_SUFFIX; + final static private String NJ_TREE_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX = "_domains_NJ" + + ForesterConstants.PHYLO_XML_SUFFIX; + final static private String NJ_TREE_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX = "_bin_combinations_NJ" + + ForesterConstants.PHYLO_XML_SUFFIX; + final static private String FILTER_POSITIVE_OPTION = "pos_filter"; + final static private String FILTER_NEGATIVE_OPTION = "neg_filter"; + final static private String FILTER_NEGATIVE_DOMAINS_OPTION = "neg_dom_filter"; + final static private String INPUT_GENOMES_FILE_OPTION = "genomes"; + final static private String INPUT_SPECIES_TREE_OPTION = "species_tree"; + final static private String SEQ_EXTRACT_OPTION = "prot_extract"; + final static private String PRG_VERSION = "2.400"; + final static private String PRG_DATE = "131106"; + final static private String E_MAIL = "czmasek@burnham.org"; + final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester/surfacing"; + final static private boolean IGNORE_DUFS_DEFAULT = true; + final static private boolean IGNORE_COMBINATION_WITH_SAME_DEFAULLT = false; + final static private double MAX_E_VALUE_DEFAULT = -1; + public final static int MAX_ALLOWED_OVERLAP_DEFAULT = -1; + private static final String RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION = "random_seed"; + private static final String CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS = "consider_bdc_direction"; + private static final String CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS_AND_ADJACENCY = "consider_bdc_adj"; + public static final String SEQ_EXTRACT_SUFFIX = ".prot"; + public static final String PLUS_MINUS_ANALYSIS_OPTION = "plus_minus"; + public static final String PLUS_MINUS_DOM_SUFFIX = "_plus_minus_dom.txt"; + public static final String PLUS_MINUS_DOM_SUFFIX_HTML = "_plus_minus_dom.html"; + public static final String PLUS_MINUS_DC_SUFFIX_HTML = "_plus_minus_dc.html"; + public static final int PLUS_MINUS_ANALYSIS_MIN_DIFF_DEFAULT = 0; + public static final double PLUS_MINUS_ANALYSIS_FACTOR_DEFAULT = 1.0; + public static final String PLUS_MINUS_ALL_GO_IDS_DOM_SUFFIX = "_plus_minus_go_ids_all.txt"; + public static final String PLUS_MINUS_PASSING_GO_IDS_DOM_SUFFIX = "_plus_minus_go_ids_passing.txt"; + private static final String OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS = "all_prot"; + final static private String OUTPUT_LIST_OF_ALL_PROTEINS_PER_DOMAIN_E_VALUE_OPTION = "all_prot_e"; + public static final boolean VERBOSE = false; + private static final String OUTPUT_DOMAIN_COMBINATIONS_GAINED_MORE_THAN_ONCE_ANALYSIS_SUFFIX = "_fitch_dc_gains_counts"; + private static final String OUTPUT_DOMAIN_COMBINATIONS_LOST_MORE_THAN_ONCE_ANALYSIS_SUFFIX = "_fitch_dc_losses_counts"; + private static final String DOMAIN_LENGTHS_ANALYSIS_SUFFIX = "_domain_lengths_analysis"; + private static final boolean PERFORM_DOMAIN_LENGTH_ANALYSIS = true; + public static final String ALL_PFAMS_ENCOUNTERED_SUFFIX = "_all_encountered_pfams"; + public static final String ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX = "_all_encountered_pfams_with_go_annotation"; + public static final String ENCOUNTERED_PFAMS_SUMMARY_SUFFIX = "_encountered_pfams_summary"; + public static final String ALL_PFAMS_GAINED_AS_DOMAINS_SUFFIX = "_all_pfams_gained_as_domains"; + public static final String ALL_PFAMS_LOST_AS_DOMAINS_SUFFIX = "_all_pfams_lost_as_domains"; + public static final String ALL_PFAMS_GAINED_AS_DC_SUFFIX = "_all_pfams_gained_as_dc"; + public static final String ALL_PFAMS_LOST_AS_DC_SUFFIX = "_all_pfams_lost_as_dc"; + public static final String BASE_DIRECTORY_PER_NODE_DOMAIN_GAIN_LOSS_FILES = "PER_NODE_EVENTS"; + public static final String BASE_DIRECTORY_PER_SUBTREE_DOMAIN_GAIN_LOSS_FILES = "PER_SUBTREE_EVENTS"; + public static final String D_PROMISCUITY_FILE_SUFFIX = "_domain_promiscuities"; + private static final String LOG_FILE_SUFFIX = "_log.txt"; + private static final String DATA_FILE_SUFFIX = "_domain_combination_data.txt"; + private static final String DATA_FILE_DESC = "#SPECIES\tPRTEIN_ID\tN_TERM_DOMAIN\tC_TERM_DOMAIN\tN_TERM_DOMAIN_PER_DOMAIN_E_VALUE\tC_TERM_DOMAIN_PER_DOMAIN_E_VALUE\tN_TERM_DOMAIN_COUNTS_PER_PROTEIN\tC_TERM_DOMAIN_COUNTS_PER_PROTEIN"; + private static final String WRITE_TO_NEXUS_OPTION = "nexus"; + private static final INDIVIDUAL_SCORE_CUTOFF INDIVIDUAL_SCORE_CUTOFF_DEFAULT = INDIVIDUAL_SCORE_CUTOFF.FULL_SEQUENCE; //TODO look at me! change? + public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_counts.txt"; + public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists.txt"; + public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping.txt"; + public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_unique.txt"; + public static final String LIMIT_SPEC_FOR_PROT_EX = null; // e.g. "HUMAN"; set to null for not using this feature (default). + public static final String BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH_MAPPED = "_dc_MAPPED_secondary_features_fitch" + + ForesterConstants.PHYLO_XML_SUFFIX; + public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_counts_MAPPED.txt"; + public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_MAPPED.txt"; + public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_MAPPED.txt"; + public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_unique_MAPPED.txt"; + private static final boolean CALC_SIMILARITY_SCORES = false; public static void main( final String args[] ) { final long start_time = new Date().getTime(); @@ -544,7 +542,7 @@ public class surfacing { ForesterUtil.fatalError( surfacing.PRG_NAME, "no input genomes file given: " + surfacing.INPUT_GENOMES_FILE_OPTION + "=" ); } - DomainSimilarity.DomainSimilarityScoring scoring = SCORING_DEFAULT; + PrintableDomainSimilarity.DomainSimilarityScoring scoring = SCORING_DEFAULT; if ( cla.isOptionSet( surfacing.SCORING_OPTION ) ) { if ( !cla.isOptionValueSet( surfacing.SCORING_OPTION ) ) { ForesterUtil.fatalError( surfacing.PRG_NAME, @@ -556,13 +554,13 @@ public class surfacing { } final String scoring_str = cla.getOptionValue( surfacing.SCORING_OPTION ); if ( scoring_str.equals( surfacing.SCORING_DOMAIN_COUNT_BASED ) ) { - scoring = DomainSimilarity.DomainSimilarityScoring.DOMAINS; + scoring = PrintableDomainSimilarity.DomainSimilarityScoring.DOMAINS; } else if ( scoring_str.equals( surfacing.SCORING_COMBINATION_BASED ) ) { - scoring = DomainSimilarity.DomainSimilarityScoring.COMBINATIONS; + scoring = PrintableDomainSimilarity.DomainSimilarityScoring.COMBINATIONS; } else if ( scoring_str.equals( surfacing.SCORING_PROTEIN_COUNT_BASED ) ) { - scoring = DomainSimilarity.DomainSimilarityScoring.PROTEINS; + scoring = PrintableDomainSimilarity.DomainSimilarityScoring.PROTEINS; } else { ForesterUtil.fatalError( surfacing.PRG_NAME, "unknown value \"" + scoring_str @@ -641,8 +639,8 @@ public class surfacing { } query_domain_ids = cla.getOptionValue( surfacing.SEQ_EXTRACT_OPTION ); } - DomainSimilarity.DomainSimilaritySortField domain_similarity_sort_field = DOMAIN_SORT_FILD_DEFAULT; - DomainSimilarity.DomainSimilaritySortField domain_similarity_sort_field_for_automated_pwc = DOMAIN_SORT_FILD_DEFAULT; + PrintableDomainSimilarity.DomainSimilaritySortField domain_similarity_sort_field = DOMAIN_SORT_FILD_DEFAULT; + PrintableDomainSimilarity.DomainSimilaritySortField domain_similarity_sort_field_for_automated_pwc = DOMAIN_SORT_FILD_DEFAULT; if ( cla.isOptionSet( surfacing.DOMAIN_SIMILARITY_SORT_OPTION ) ) { if ( !cla.isOptionValueSet( surfacing.DOMAIN_SIMILARITY_SORT_OPTION ) ) { ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for domain combinations similarities sorting: -" @@ -656,40 +654,40 @@ public class surfacing { } final String sort_str = cla.getOptionValue( surfacing.DOMAIN_SIMILARITY_SORT_OPTION ).toLowerCase(); if ( sort_str.equals( surfacing.DOMAIN_SIMILARITY_SORT_ALPHA ) ) { - domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; - domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + domain_similarity_sort_field = PrintableDomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + domain_similarity_sort_field_for_automated_pwc = PrintableDomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; } else if ( sort_str.equals( surfacing.DOMAIN_SIMILARITY_SORT_MAX ) ) { - domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.MAX; - domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + domain_similarity_sort_field = PrintableDomainSimilarity.DomainSimilaritySortField.MAX; + domain_similarity_sort_field_for_automated_pwc = PrintableDomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; } else if ( sort_str.equals( surfacing.DOMAIN_SIMILARITY_SORT_MIN ) ) { - domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.MIN; - domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + domain_similarity_sort_field = PrintableDomainSimilarity.DomainSimilaritySortField.MIN; + domain_similarity_sort_field_for_automated_pwc = PrintableDomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; } else if ( sort_str.equals( surfacing.DOMAIN_SIMILARITY_SORT_MEAN ) ) { - domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.MEAN; - domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.MEAN; + domain_similarity_sort_field = PrintableDomainSimilarity.DomainSimilaritySortField.MEAN; + domain_similarity_sort_field_for_automated_pwc = PrintableDomainSimilarity.DomainSimilaritySortField.MEAN; } else if ( sort_str.equals( surfacing.DOMAIN_SIMILARITY_SORT_SPECIES_COUNT ) ) { - domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.SPECIES_COUNT; - domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + domain_similarity_sort_field = PrintableDomainSimilarity.DomainSimilaritySortField.SPECIES_COUNT; + domain_similarity_sort_field_for_automated_pwc = PrintableDomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; } else if ( sort_str.equals( surfacing.DOMAIN_SIMILARITY_SORT_SD ) ) { - domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.SD; - domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; + domain_similarity_sort_field = PrintableDomainSimilarity.DomainSimilaritySortField.SD; + domain_similarity_sort_field_for_automated_pwc = PrintableDomainSimilarity.DomainSimilaritySortField.DOMAIN_ID; } else if ( sort_str.equals( surfacing.DOMAIN_SIMILARITY_SORT_DIFF ) ) { - domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.MAX_DIFFERENCE; - domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.MAX_DIFFERENCE; + domain_similarity_sort_field = PrintableDomainSimilarity.DomainSimilaritySortField.MAX_DIFFERENCE; + domain_similarity_sort_field_for_automated_pwc = PrintableDomainSimilarity.DomainSimilaritySortField.MAX_DIFFERENCE; } else if ( sort_str.equals( surfacing.DOMAIN_SIMILARITY_SORT_ABS_COUNTS_DIFF ) ) { - domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.ABS_MAX_COUNTS_DIFFERENCE; - domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.ABS_MAX_COUNTS_DIFFERENCE; + domain_similarity_sort_field = PrintableDomainSimilarity.DomainSimilaritySortField.ABS_MAX_COUNTS_DIFFERENCE; + domain_similarity_sort_field_for_automated_pwc = PrintableDomainSimilarity.DomainSimilaritySortField.ABS_MAX_COUNTS_DIFFERENCE; } else if ( sort_str.equals( surfacing.DOMAIN_SIMILARITY_SORT_COUNTS_DIFF ) ) { - domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.MAX_COUNTS_DIFFERENCE; - domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.MAX_COUNTS_DIFFERENCE; + domain_similarity_sort_field = PrintableDomainSimilarity.DomainSimilaritySortField.MAX_COUNTS_DIFFERENCE; + domain_similarity_sort_field_for_automated_pwc = PrintableDomainSimilarity.DomainSimilaritySortField.MAX_COUNTS_DIFFERENCE; } else { ForesterUtil.fatalError( surfacing.PRG_NAME, "unknown value \"" + sort_str @@ -875,9 +873,9 @@ public class surfacing { + surfacing.GO_NAMESPACE_LIMIT_OPTION_CELLULAR_COMPONENT + ">\"" ); } } - if ( ( domain_similarity_sort_field == DomainSimilarity.DomainSimilaritySortField.MAX_COUNTS_DIFFERENCE ) + if ( ( domain_similarity_sort_field == PrintableDomainSimilarity.DomainSimilaritySortField.MAX_COUNTS_DIFFERENCE ) && ( number_of_genomes > 2 ) ) { - domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.ABS_MAX_COUNTS_DIFFERENCE; + domain_similarity_sort_field = PrintableDomainSimilarity.DomainSimilaritySortField.ABS_MAX_COUNTS_DIFFERENCE; } File[] intree_files = null; Phylogeny[] intrees = null; @@ -1778,7 +1776,7 @@ public class surfacing { if ( domain_id_to_go_ids_map != null ) { go_annotation_output = DomainSimilarityCalculator.GoAnnotationOutput.ALL; } - final SortedSet similarities = calc + final SortedSet similarities = calc .calculateSimilarities( pw_calc, gwcd_list, ignore_domains_without_combs_in_all_spec, diff --git a/forester/java/src/org/forester/surfacing/BasicBinaryDomainCombination.java b/forester/java/src/org/forester/surfacing/BasicBinaryDomainCombination.java index 3f799d1..518c32e 100644 --- a/forester/java/src/org/forester/surfacing/BasicBinaryDomainCombination.java +++ b/forester/java/src/org/forester/surfacing/BasicBinaryDomainCombination.java @@ -35,15 +35,6 @@ public class BasicBinaryDomainCombination implements BinaryDomainCombination { String _id1; String _str; - BasicBinaryDomainCombination() { - _id0 = null; - _id1 = null; - } - - private String getAsStr() { - return _id0 + SEPARATOR + _id1; - } - public BasicBinaryDomainCombination( final String id0, final String id1 ) { if ( ( id0 == null ) || ( id1 == null ) ) { throw new IllegalArgumentException( "attempt to create binary domain combination using null" ); @@ -58,6 +49,11 @@ public class BasicBinaryDomainCombination implements BinaryDomainCombination { } } + BasicBinaryDomainCombination() { + _id0 = null; + _id1 = null; + } + @Override public int compareTo( final BinaryDomainCombination binary_domain_combination ) { if ( binary_domain_combination.getClass() != this.getClass() ) { @@ -158,6 +154,10 @@ public class BasicBinaryDomainCombination implements BinaryDomainCombination { return getAsStr(); } + private String getAsStr() { + return _id0 + SEPARATOR + _id1; + } + public static BinaryDomainCombination createInstance( final String ids ) { if ( ids.indexOf( BinaryDomainCombination.SEPARATOR ) < 1 ) { throw new IllegalArgumentException( "Unexpected format for binary domain combination [" + ids + "]" ); diff --git a/forester/java/src/org/forester/surfacing/BasicCombinableDomains.java b/forester/java/src/org/forester/surfacing/BasicCombinableDomains.java index c04333c..2f48a64 100644 --- a/forester/java/src/org/forester/surfacing/BasicCombinableDomains.java +++ b/forester/java/src/org/forester/surfacing/BasicCombinableDomains.java @@ -40,11 +40,11 @@ import org.forester.util.ForesterUtil; public class BasicCombinableDomains implements CombinableDomains { + final private TreeMap _combining_domains; final private String _key_domain; private int _key_domain_count; - final private Species _species; - final private TreeMap _combining_domains; final private Set _proteins_with_key_domain; + final private Species _species; public BasicCombinableDomains( final String key_domain, final Species species ) { _key_domain = key_domain; @@ -117,10 +117,6 @@ public class BasicCombinableDomains implements CombinableDomains { return sb; } - protected TreeMap getCombiningDomains() { - return _combining_domains; - } - @Override public String getKeyDomain() { return _key_domain; @@ -132,6 +128,11 @@ public class BasicCombinableDomains implements CombinableDomains { } @Override + public Set getKeyDomainProteins() { + return _proteins_with_key_domain; + } + + @Override public int getKeyDomainProteinsCount() { return getKeyDomainProteins().size(); } @@ -190,8 +191,7 @@ public class BasicCombinableDomains implements CombinableDomains { return sb.toString(); } - @Override - public Set getKeyDomainProteins() { - return _proteins_with_key_domain; + protected TreeMap getCombiningDomains() { + return _combining_domains; } } diff --git a/forester/java/src/org/forester/surfacing/BasicDomainSimilarityCalculator.java b/forester/java/src/org/forester/surfacing/BasicDomainSimilarityCalculator.java index 81cef33..c4c5ab7 100644 --- a/forester/java/src/org/forester/surfacing/BasicDomainSimilarityCalculator.java +++ b/forester/java/src/org/forester/surfacing/BasicDomainSimilarityCalculator.java @@ -42,12 +42,12 @@ import org.forester.util.ForesterUtil; public class BasicDomainSimilarityCalculator implements DomainSimilarityCalculator { - final DomainSimilarity.DomainSimilaritySortField _sort; - private final boolean _sort_by_species_count_first; - private final boolean _treat_as_binary_comparison; - private final boolean _calc_similarity_score; + final PrintableDomainSimilarity.DomainSimilaritySortField _sort; + private final boolean _calc_similarity_score; + private final boolean _sort_by_species_count_first; + private final boolean _treat_as_binary_comparison; - public BasicDomainSimilarityCalculator( final DomainSimilarity.DomainSimilaritySortField sort, + public BasicDomainSimilarityCalculator( final PrintableDomainSimilarity.DomainSimilaritySortField sort, final boolean sort_by_species_count_first, final boolean treat_as_binary_comparison, final boolean calc_similarity_score ) { @@ -57,19 +57,15 @@ public class BasicDomainSimilarityCalculator implements DomainSimilarityCalculat _calc_similarity_score = calc_similarity_score; } - public boolean isCalcSimilarityScore() { - return _calc_similarity_score; - } - @Override - public SortedSet calculateSimilarities( final PairwiseDomainSimilarityCalculator pairwise_calculator, - final List cdc_list, - final boolean ignore_domains_without_combinations_in_any_genome, - final boolean ignore_domains_specific_to_one_genome ) { + public SortedSet calculateSimilarities( final PairwiseDomainSimilarityCalculator pairwise_calculator, + final List cdc_list, + final boolean ignore_domains_without_combinations_in_any_genome, + final boolean ignore_domains_specific_to_one_genome ) { if ( cdc_list.size() < 2 ) { throw new IllegalArgumentException( "attempt to calculate multiple combinable domains similarity for less than two combinale domains collections" ); } - final SortedSet similarities = new TreeSet(); + final SortedSet similarities = new TreeSet(); final SortedSet keys = new TreeSet(); for( final GenomeWideCombinableDomains cdc : cdc_list ) { keys.addAll( ( cdc ).getAllCombinableDomainsIds().keySet() ); @@ -102,7 +98,7 @@ public class BasicDomainSimilarityCalculator implements DomainSimilarityCalculat } if ( same_id_cd_list.size() > 0 ) { if ( !ignore_domains_specific_to_one_genome || ( same_id_cd_list.size() > 1 ) ) { - final DomainSimilarity s = calculateSimilarity( pairwise_calculator, same_id_cd_list ); + final PrintableDomainSimilarity s = calculateSimilarity( pairwise_calculator, same_id_cd_list ); if ( s != null ) { similarities.add( s ); } @@ -119,8 +115,12 @@ public class BasicDomainSimilarityCalculator implements DomainSimilarityCalculat return similarities; } - private DomainSimilarity calculateSimilarity( final PairwiseDomainSimilarityCalculator pairwise_calculator, - final List domains_list ) { + public boolean isCalcSimilarityScore() { + return _calc_similarity_score; + } + + private PrintableDomainSimilarity calculateSimilarity( final PairwiseDomainSimilarityCalculator pairwise_calculator, + final List domains_list ) { if ( domains_list.size() == 1 ) { final SortedMap species_data = new TreeMap(); species_data.put( domains_list.get( 0 ).getSpecies(), @@ -199,7 +199,7 @@ public class BasicDomainSimilarityCalculator implements DomainSimilarityCalculat max_difference = Math.abs( max_difference ); } } - DomainSimilarity similarity = null; + PrintableDomainSimilarity similarity = null; if ( !isCalcSimilarityScore() ) { similarity = new PrintableDomainSimilarity( domains_list.get( 0 ), max_difference_in_counts, diff --git a/forester/java/src/org/forester/surfacing/BasicGenomeWideCombinableDomains.java b/forester/java/src/org/forester/surfacing/BasicGenomeWideCombinableDomains.java index 3e59603..1d115b2 100644 --- a/forester/java/src/org/forester/surfacing/BasicGenomeWideCombinableDomains.java +++ b/forester/java/src/org/forester/surfacing/BasicGenomeWideCombinableDomains.java @@ -26,18 +26,18 @@ import org.forester.util.ForesterUtil; public class BasicGenomeWideCombinableDomains implements GenomeWideCombinableDomains { - private static final Comparator DESCENDING_KEY_DOMAIN_COUNT_ORDER = new Comparator() { + private static final Comparator DESCENDING_COMBINATIONS_COUNT_ORDER = new Comparator() { @Override public int compare( final CombinableDomains d1, final CombinableDomains d2 ) { - if ( d1.getKeyDomainCount() < d2 - .getKeyDomainCount() ) { + if ( d1.getNumberOfCombinableDomains() < d2 + .getNumberOfCombinableDomains() ) { return 1; } else if ( d1 - .getKeyDomainCount() > d2 - .getKeyDomainCount() ) { + .getNumberOfCombinableDomains() > d2 + .getNumberOfCombinableDomains() ) { return -1; } else { @@ -48,18 +48,18 @@ public class BasicGenomeWideCombinableDomains implements GenomeWideCombinableDom } } }; - private static final Comparator DESCENDING_KEY_DOMAIN_PROTEINS_COUNT_ORDER = new Comparator() { + private static final Comparator DESCENDING_KEY_DOMAIN_COUNT_ORDER = new Comparator() { @Override public int compare( final CombinableDomains d1, final CombinableDomains d2 ) { - if ( d1.getKeyDomainProteinsCount() < d2 - .getKeyDomainProteinsCount() ) { + if ( d1.getKeyDomainCount() < d2 + .getKeyDomainCount() ) { return 1; } else if ( d1 - .getKeyDomainProteinsCount() > d2 - .getKeyDomainProteinsCount() ) { + .getKeyDomainCount() > d2 + .getKeyDomainCount() ) { return -1; } else { @@ -70,18 +70,18 @@ public class BasicGenomeWideCombinableDomains implements GenomeWideCombinableDom } } }; - private static final Comparator DESCENDING_COMBINATIONS_COUNT_ORDER = new Comparator() { + private static final Comparator DESCENDING_KEY_DOMAIN_PROTEINS_COUNT_ORDER = new Comparator() { @Override public int compare( final CombinableDomains d1, final CombinableDomains d2 ) { - if ( d1.getNumberOfCombinableDomains() < d2 - .getNumberOfCombinableDomains() ) { + if ( d1.getKeyDomainProteinsCount() < d2 + .getKeyDomainProteinsCount() ) { return 1; } else if ( d1 - .getNumberOfCombinableDomains() > d2 - .getNumberOfCombinableDomains() ) { + .getKeyDomainProteinsCount() > d2 + .getKeyDomainProteinsCount() ) { return -1; } else { @@ -93,8 +93,8 @@ public class BasicGenomeWideCombinableDomains implements GenomeWideCombinableDom } }; final private SortedMap _combinable_domains_map; - final private Species _species; final private DomainCombinationType _dc_type; + final private Species _species; private BasicGenomeWideCombinableDomains( final Species species, final DomainCombinationType dc_type ) { _combinable_domains_map = new TreeMap(); @@ -102,10 +102,6 @@ public class BasicGenomeWideCombinableDomains implements GenomeWideCombinableDom _dc_type = dc_type; } - private void add( final String key, final CombinableDomains cdc ) { - _combinable_domains_map.put( key, cdc ); - } - @Override public boolean contains( final String key_id ) { return _combinable_domains_map.containsKey( key_id ); @@ -220,16 +216,8 @@ public class BasicGenomeWideCombinableDomains implements GenomeWideCombinableDom return sb; } - private static void countDomains( final Map domain_counts, - final Set saw_c, - final String id_i ) { - if ( domain_counts.containsKey( id_i ) ) { - domain_counts.put( id_i, 1 + domain_counts.get( ( id_i ) ) ); - } - else { - domain_counts.put( id_i, 1 ); - } - saw_c.add( id_i ); + private void add( final String key, final CombinableDomains cdc ) { + _combinable_domains_map.put( key, cdc ); } public static BasicGenomeWideCombinableDomains createInstance( final List protein_list, @@ -355,4 +343,16 @@ public class BasicGenomeWideCombinableDomains implements GenomeWideCombinableDom } return instance; } + + private static void countDomains( final Map domain_counts, + final Set saw_c, + final String id_i ) { + if ( domain_counts.containsKey( id_i ) ) { + domain_counts.put( id_i, 1 + domain_counts.get( ( id_i ) ) ); + } + else { + domain_counts.put( id_i, 1 ); + } + saw_c.add( id_i ); + } } diff --git a/forester/java/src/org/forester/surfacing/CombinableDomains.java b/forester/java/src/org/forester/surfacing/CombinableDomains.java index 028810e..28fa0e2 100644 --- a/forester/java/src/org/forester/surfacing/CombinableDomains.java +++ b/forester/java/src/org/forester/surfacing/CombinableDomains.java @@ -43,17 +43,6 @@ public interface CombinableDomains { public void addCombinableDomain( final String protein_domain ); /** - * - * This must return all domains in this set of combinable domains (i.e. - * the key domain and all domains which can combine with the key domain). - * - * @return all domains - */ - List getAllDomains(); - - List getCombinableDomains(); - - /** * Returns the combinable domain identifiers sorted in alphabetical manner: - * keys are the combinable domain identifiers - values are the counts of * proteins exhibiting a particular combination @@ -80,6 +69,8 @@ public interface CombinableDomains { */ public int getKeyDomainCount(); + public Set getKeyDomainProteins(); + /** * Returns how many proteins with the key domain are present in a given * species genome. @@ -88,8 +79,6 @@ public interface CombinableDomains { */ public int getKeyDomainProteinsCount(); - public Set getKeyDomainProteins(); - public int getNumberOfCombinableDomains(); public int getNumberOfProteinsExhibitingCombination( final String protein_domain ); @@ -103,6 +92,21 @@ public interface CombinableDomains { public boolean isCombinable( final String protein_domain ); + public List toBinaryDomainCombinations(); + + void addKeyDomainProtein( String protein ); + + /** + * + * This must return all domains in this set of combinable domains (i.e. + * the key domain and all domains which can combine with the key domain). + * + * @return all domains + */ + List getAllDomains(); + + List getCombinableDomains(); + /** * Sets how many times the key domain is present in a given species genome. * @@ -110,8 +114,4 @@ public interface CombinableDomains { * key domain count in species */ void setKeyDomainCount( final int key_domain_count ); - - public List toBinaryDomainCombinations(); - - void addKeyDomainProtein( String protein ); } \ No newline at end of file diff --git a/forester/java/src/org/forester/surfacing/CombinationsBasedPairwiseDomainSimilarity.java b/forester/java/src/org/forester/surfacing/CombinationsBasedPairwiseDomainSimilarity.java index f271afa..a98298d 100644 --- a/forester/java/src/org/forester/surfacing/CombinationsBasedPairwiseDomainSimilarity.java +++ b/forester/java/src/org/forester/surfacing/CombinationsBasedPairwiseDomainSimilarity.java @@ -29,9 +29,9 @@ package org.forester.surfacing; public class CombinationsBasedPairwiseDomainSimilarity implements PairwiseDomainSimilarity { - private final int _same_domains; - private final int _different_domains; private final int _difference_in_counts; + private final int _different_domains; + private final int _same_domains; private final double _score; public CombinationsBasedPairwiseDomainSimilarity( final int same_domains, diff --git a/forester/java/src/org/forester/surfacing/DomainArchitectureBasedGenomeSimilarityCalculator.java b/forester/java/src/org/forester/surfacing/DomainArchitectureBasedGenomeSimilarityCalculator.java index caf6cb0..a61ba30 100644 --- a/forester/java/src/org/forester/surfacing/DomainArchitectureBasedGenomeSimilarityCalculator.java +++ b/forester/java/src/org/forester/surfacing/DomainArchitectureBasedGenomeSimilarityCalculator.java @@ -35,18 +35,18 @@ public class DomainArchitectureBasedGenomeSimilarityCalculator { public static final double MAX_SIMILARITY_SCORE = 1.0; public static final double MIN_SIMILARITY_SCORE = 0.0; + private Set _all_binary_domain_combinations; + private Set _all_domains; + private boolean _allow_domains_to_be_ignored; + private Set _binary_domain_combinations_specific_to_0; + private Set _binary_domain_combinations_specific_to_1; final private GenomeWideCombinableDomains _combinable_domains_genome_0; final private GenomeWideCombinableDomains _combinable_domains_genome_1; private Set _domain_ids_to_ignore; - private boolean _allow_domains_to_be_ignored; - private Set _all_domains; - private Set _shared_domains; private Set _domains_specific_to_0; private Set _domains_specific_to_1; - private Set _all_binary_domain_combinations; private Set _shared_binary_domain_combinations; - private Set _binary_domain_combinations_specific_to_0; - private Set _binary_domain_combinations_specific_to_1; + private Set _shared_domains; public DomainArchitectureBasedGenomeSimilarityCalculator( final GenomeWideCombinableDomains combinable_domains_genome_0, final GenomeWideCombinableDomains combinable_domains_genome_1 ) { @@ -115,17 +115,6 @@ public class DomainArchitectureBasedGenomeSimilarityCalculator { setDomainIdsToIgnore( new HashSet() ); } - private void forceRecalculation() { - _all_domains = null; - _shared_domains = null; - _domains_specific_to_0 = null; - _domains_specific_to_1 = null; - _all_binary_domain_combinations = null; - _shared_binary_domain_combinations = null; - _binary_domain_combinations_specific_to_0 = null; - _binary_domain_combinations_specific_to_1 = null; - } - /** * Does not return binary combinations which contain one or two domains * to be ignored -- if ignoring is allowed. @@ -169,30 +158,6 @@ public class DomainArchitectureBasedGenomeSimilarityCalculator { return _all_domains; } - private Set getBinaryDomainCombinationsSpecificToGenome( final boolean specific_to_genome_0 ) { - final Set specific = new HashSet(); - final Set bc0 = getCombinableDomainsGenome0().toBinaryDomainCombinations(); - final Set bc1 = getCombinableDomainsGenome1().toBinaryDomainCombinations(); - if ( specific_to_genome_0 ) { - for( final BinaryDomainCombination binary_domain_combination0 : bc0 ) { - if ( !bc1.contains( binary_domain_combination0 ) ) { - specific.add( binary_domain_combination0 ); - } - } - } - else { - for( final BinaryDomainCombination binary_domain_combination1 : bc1 ) { - if ( !bc0.contains( binary_domain_combination1 ) ) { - specific.add( binary_domain_combination1 ); - } - } - } - if ( isAllowDomainsToBeIgnored() && !getDomainIdsToIgnore().isEmpty() ) { - return pruneBinaryCombinations( specific ); - } - return specific; - } - public Set getBinaryDomainCombinationsSpecificToGenome0() { if ( _binary_domain_combinations_specific_to_0 == null ) { _binary_domain_combinations_specific_to_0 = getBinaryDomainCombinationsSpecificToGenome( true ); @@ -207,42 +172,6 @@ public class DomainArchitectureBasedGenomeSimilarityCalculator { return _binary_domain_combinations_specific_to_1; } - private GenomeWideCombinableDomains getCombinableDomainsGenome0() { - return _combinable_domains_genome_0; - } - - private GenomeWideCombinableDomains getCombinableDomainsGenome1() { - return _combinable_domains_genome_1; - } - - private Set getDomainIdsToIgnore() { - return _domain_ids_to_ignore; - } - - private Set getDomainsSpecificToGenome( final boolean specific_to_genome_0 ) { - final Set specific = new HashSet(); - final Set d0 = getCombinableDomainsGenome0().getAllDomainIds(); - final Set d1 = getCombinableDomainsGenome1().getAllDomainIds(); - if ( specific_to_genome_0 ) { - for( final String domain0 : d0 ) { - if ( !d1.contains( domain0 ) ) { - specific.add( domain0 ); - } - } - } - else { - for( final String domain1 : d1 ) { - if ( !d0.contains( domain1 ) ) { - specific.add( domain1 ); - } - } - } - if ( isAllowDomainsToBeIgnored() && !getDomainIdsToIgnore().isEmpty() ) { - return pruneDomains( specific ); - } - return specific; - } - public Set getDomainsSpecificToGenome0() { if ( _domains_specific_to_0 == null ) { _domains_specific_to_0 = getDomainsSpecificToGenome( true ); @@ -293,6 +222,87 @@ public class DomainArchitectureBasedGenomeSimilarityCalculator { return _shared_domains; } + public void setAllowDomainsToBeIgnored( final boolean allow_domains_to_be_ignored ) { + forceRecalculation(); + _allow_domains_to_be_ignored = allow_domains_to_be_ignored; + } + + void setDomainIdsToIgnore( final Set domain_ids_to_ignore ) { + forceRecalculation(); + _domain_ids_to_ignore = domain_ids_to_ignore; + } + + private void forceRecalculation() { + _all_domains = null; + _shared_domains = null; + _domains_specific_to_0 = null; + _domains_specific_to_1 = null; + _all_binary_domain_combinations = null; + _shared_binary_domain_combinations = null; + _binary_domain_combinations_specific_to_0 = null; + _binary_domain_combinations_specific_to_1 = null; + } + + private Set getBinaryDomainCombinationsSpecificToGenome( final boolean specific_to_genome_0 ) { + final Set specific = new HashSet(); + final Set bc0 = getCombinableDomainsGenome0().toBinaryDomainCombinations(); + final Set bc1 = getCombinableDomainsGenome1().toBinaryDomainCombinations(); + if ( specific_to_genome_0 ) { + for( final BinaryDomainCombination binary_domain_combination0 : bc0 ) { + if ( !bc1.contains( binary_domain_combination0 ) ) { + specific.add( binary_domain_combination0 ); + } + } + } + else { + for( final BinaryDomainCombination binary_domain_combination1 : bc1 ) { + if ( !bc0.contains( binary_domain_combination1 ) ) { + specific.add( binary_domain_combination1 ); + } + } + } + if ( isAllowDomainsToBeIgnored() && !getDomainIdsToIgnore().isEmpty() ) { + return pruneBinaryCombinations( specific ); + } + return specific; + } + + private GenomeWideCombinableDomains getCombinableDomainsGenome0() { + return _combinable_domains_genome_0; + } + + private GenomeWideCombinableDomains getCombinableDomainsGenome1() { + return _combinable_domains_genome_1; + } + + private Set getDomainIdsToIgnore() { + return _domain_ids_to_ignore; + } + + private Set getDomainsSpecificToGenome( final boolean specific_to_genome_0 ) { + final Set specific = new HashSet(); + final Set d0 = getCombinableDomainsGenome0().getAllDomainIds(); + final Set d1 = getCombinableDomainsGenome1().getAllDomainIds(); + if ( specific_to_genome_0 ) { + for( final String domain0 : d0 ) { + if ( !d1.contains( domain0 ) ) { + specific.add( domain0 ); + } + } + } + else { + for( final String domain1 : d1 ) { + if ( !d0.contains( domain1 ) ) { + specific.add( domain1 ); + } + } + } + if ( isAllowDomainsToBeIgnored() && !getDomainIdsToIgnore().isEmpty() ) { + return pruneDomains( specific ); + } + return specific; + } + private void init() { deleteAllDomainIdsToIgnore(); setAllowDomainsToBeIgnored( false ); @@ -322,14 +332,4 @@ public class DomainArchitectureBasedGenomeSimilarityCalculator { } return pruned; } - - public void setAllowDomainsToBeIgnored( final boolean allow_domains_to_be_ignored ) { - forceRecalculation(); - _allow_domains_to_be_ignored = allow_domains_to_be_ignored; - } - - void setDomainIdsToIgnore( final Set domain_ids_to_ignore ) { - forceRecalculation(); - _domain_ids_to_ignore = domain_ids_to_ignore; - } } \ No newline at end of file diff --git a/forester/java/src/org/forester/surfacing/DomainCountsDifferenceUtil.java b/forester/java/src/org/forester/surfacing/DomainCountsDifferenceUtil.java index e172659..8f9a249 100644 --- a/forester/java/src/org/forester/surfacing/DomainCountsDifferenceUtil.java +++ b/forester/java/src/org/forester/surfacing/DomainCountsDifferenceUtil.java @@ -58,77 +58,11 @@ import org.forester.util.ForesterUtil; */ public final class DomainCountsDifferenceUtil { - private static final COPY_CALCULATION_MODE COPY_CALC_MODE_FOR_HIGH_COPY_TARGET_SPECIES = COPY_CALCULATION_MODE.MIN; private static final COPY_CALCULATION_MODE COPY_CALC_MODE_FOR_HIGH_COPY_BASE_SPECIES = COPY_CALCULATION_MODE.MIN; + private static final COPY_CALCULATION_MODE COPY_CALC_MODE_FOR_HIGH_COPY_TARGET_SPECIES = COPY_CALCULATION_MODE.MIN; private static final COPY_CALCULATION_MODE COPY_CALC_MODE_FOR_LOW_COPY_SPECIES = COPY_CALCULATION_MODE.MAX; private static final String PLUS_MINUS_PROTEINS_FILE_DOM_SUFFIX = ".prot"; - //FIXME really needs to be tested! - private static void addCounts( final SortedMap> copy_counts, - final BinaryDomainCombination dc, - final GenomeWideCombinableDomains genome, - final Set bdc ) { - if ( !copy_counts.containsKey( dc ) ) { - copy_counts.put( dc, new ArrayList() ); - } - if ( bdc.contains( dc ) - && ( ( ( BasicCombinableDomains ) genome.get( dc.getId0() ) ).getCombiningDomains().get( dc.getId1() ) != null ) ) { - final int count = ( ( BasicCombinableDomains ) genome.get( dc.getId0() ) ).getCombiningDomains() - .get( dc.getId1() ); - copy_counts.get( dc ).add( count ); - } - else { - copy_counts.get( dc ).add( 0 ); - } - } - - private static void addCounts( final SortedMap> copy_counts, - final String domain, - final GenomeWideCombinableDomains genome ) { - if ( !copy_counts.containsKey( domain ) ) { - copy_counts.put( domain, new ArrayList() ); - } - if ( genome.contains( domain ) ) { - copy_counts.get( domain ).add( genome.get( domain ).getKeyDomainProteinsCount() ); - } - else { - copy_counts.get( domain ).add( 0 ); - } - } - - private static StringBuilder addGoInformation( final String d, - final Map> domain_id_to_go_ids_map, - final Map go_id_to_term_map ) { - final StringBuilder sb = new StringBuilder(); - if ( ( domain_id_to_go_ids_map == null ) || domain_id_to_go_ids_map.isEmpty() - || !domain_id_to_go_ids_map.containsKey( d ) ) { - return sb; - } - final List go_ids = domain_id_to_go_ids_map.get( d ); - for( int i = 0; i < go_ids.size(); ++i ) { - final GoId go_id = go_ids.get( i ); - if ( go_id_to_term_map.containsKey( go_id ) ) { - appendGoTerm( sb, go_id_to_term_map.get( go_id ) ); - sb.append( "
" ); - } - else { - sb.append( "go id \"" + go_id + "\" not found [" + d + "]" ); - } - } - return sb; - } - - private static void appendGoTerm( final StringBuilder sb, final GoTerm go_term ) { - final GoId go_id = go_term.getGoId(); - sb.append( "" + go_id - + "" ); - sb.append( ":" ); - sb.append( go_term.getName() ); - sb.append( " [" ); - sb.append( go_term.getGoNameSpace().toShortString() ); - sb.append( "]" ); - } - public static void calculateCopyNumberDifferences( final List genomes, final SortedMap> protein_lists_per_species, final List high_copy_base_species, @@ -301,6 +235,72 @@ public final class DomainCountsDifferenceUtil { writeGoIdsToFile( passing_gos_writer, go_ids_of_passing_domains ); } + //FIXME really needs to be tested! + private static void addCounts( final SortedMap> copy_counts, + final BinaryDomainCombination dc, + final GenomeWideCombinableDomains genome, + final Set bdc ) { + if ( !copy_counts.containsKey( dc ) ) { + copy_counts.put( dc, new ArrayList() ); + } + if ( bdc.contains( dc ) + && ( ( ( BasicCombinableDomains ) genome.get( dc.getId0() ) ).getCombiningDomains().get( dc.getId1() ) != null ) ) { + final int count = ( ( BasicCombinableDomains ) genome.get( dc.getId0() ) ).getCombiningDomains() + .get( dc.getId1() ); + copy_counts.get( dc ).add( count ); + } + else { + copy_counts.get( dc ).add( 0 ); + } + } + + private static void addCounts( final SortedMap> copy_counts, + final String domain, + final GenomeWideCombinableDomains genome ) { + if ( !copy_counts.containsKey( domain ) ) { + copy_counts.put( domain, new ArrayList() ); + } + if ( genome.contains( domain ) ) { + copy_counts.get( domain ).add( genome.get( domain ).getKeyDomainProteinsCount() ); + } + else { + copy_counts.get( domain ).add( 0 ); + } + } + + private static StringBuilder addGoInformation( final String d, + final Map> domain_id_to_go_ids_map, + final Map go_id_to_term_map ) { + final StringBuilder sb = new StringBuilder(); + if ( ( domain_id_to_go_ids_map == null ) || domain_id_to_go_ids_map.isEmpty() + || !domain_id_to_go_ids_map.containsKey( d ) ) { + return sb; + } + final List go_ids = domain_id_to_go_ids_map.get( d ); + for( int i = 0; i < go_ids.size(); ++i ) { + final GoId go_id = go_ids.get( i ); + if ( go_id_to_term_map.containsKey( go_id ) ) { + appendGoTerm( sb, go_id_to_term_map.get( go_id ) ); + sb.append( "
" ); + } + else { + sb.append( "go id \"" + go_id + "\" not found [" + d + "]" ); + } + } + return sb; + } + + private static void appendGoTerm( final StringBuilder sb, final GoTerm go_term ) { + final GoId go_id = go_term.getGoId(); + sb.append( "" + go_id + + "" ); + sb.append( ":" ); + sb.append( go_term.getName() ); + sb.append( " [" ); + sb.append( go_term.getGoNameSpace().toShortString() ); + sb.append( "]" ); + } + private static void calculateDomainCountsBasedValue( final SortedMap copy_values, final SortedMap> copy_counts, final BinaryDomainCombination bdc, @@ -827,6 +827,6 @@ public final class DomainCountsDifferenceUtil { } public static enum COPY_CALCULATION_MODE { - MEAN, MEDIAN, MAX, MIN + MAX, MEAN, MEDIAN, MIN } } diff --git a/forester/java/src/org/forester/surfacing/DomainLengths.java b/forester/java/src/org/forester/surfacing/DomainLengths.java index 1018aea..5bb71cf 100644 --- a/forester/java/src/org/forester/surfacing/DomainLengths.java +++ b/forester/java/src/org/forester/surfacing/DomainLengths.java @@ -52,13 +52,6 @@ public class DomainLengths { getLengthStatistic( species ).addValue( domain_length ); } - private void addLengthStatistics( final Species species, final DescriptiveStatistics length_statistic ) { - if ( getLengthStatistics().containsKey( species ) ) { - throw new IllegalArgumentException( "length statistics for [" + species.getSpeciesId() + "] already added" ); - } - getLengthStatistics().put( species, length_statistic ); - } - /** * Returns descriptive statistics based on the arithmetic means * for each species. @@ -98,10 +91,6 @@ public class DomainLengths { return getLengthStatistics().get( species ); } - private SortedMap getLengthStatistics() { - return _length_statistics; - } - public List getLengthStatisticsList() { final List list = new ArrayList(); for( final DescriptiveStatistics stats : _length_statistics.values() ) { @@ -141,4 +130,15 @@ public class DomainLengths { public boolean isHasLengthStatistic( final Species species ) { return getLengthStatistics().containsKey( species ); } + + private void addLengthStatistics( final Species species, final DescriptiveStatistics length_statistic ) { + if ( getLengthStatistics().containsKey( species ) ) { + throw new IllegalArgumentException( "length statistics for [" + species.getSpeciesId() + "] already added" ); + } + getLengthStatistics().put( species, length_statistic ); + } + + private SortedMap getLengthStatistics() { + return _length_statistics; + } } diff --git a/forester/java/src/org/forester/surfacing/DomainLengthsTable.java b/forester/java/src/org/forester/surfacing/DomainLengthsTable.java index 44b6247..9ef11cc 100644 --- a/forester/java/src/org/forester/surfacing/DomainLengthsTable.java +++ b/forester/java/src/org/forester/surfacing/DomainLengthsTable.java @@ -50,21 +50,6 @@ public class DomainLengthsTable { _species = new ArrayList(); } - private void addDomainLengths( final DomainLengths domain_lengths ) { - if ( getDomainLengths().containsKey( domain_lengths.getDomainId() ) ) { - throw new IllegalArgumentException( "domain lengths for [" + domain_lengths.getDomainId() - + "] already added" ); - } - getDomainLengths().put( domain_lengths.getDomainId(), domain_lengths ); - } - - private void addLength( final String domain_id, final Species species, final int domain_length ) { - if ( !getDomainLengths().containsKey( domain_id ) ) { - addDomainLengths( new DomainLengths( domain_id ) ); - } - getDomainLengths().get( domain_id ).addLength( species, domain_length ); - } - public void addLengths( final List protein_list ) { for( final Protein protein : protein_list ) { final Species species = protein.getSpecies(); @@ -142,10 +127,6 @@ public class DomainLengthsTable { return sb; } - private SortedMap getDomainLengths() { - return _domain_lengths; - } - public DomainLengths getDomainLengths( final String domain_id ) { return getDomainLengths().get( domain_id ); } @@ -165,4 +146,23 @@ public class DomainLengthsTable { public List getSpecies() { return _species; } + + private void addDomainLengths( final DomainLengths domain_lengths ) { + if ( getDomainLengths().containsKey( domain_lengths.getDomainId() ) ) { + throw new IllegalArgumentException( "domain lengths for [" + domain_lengths.getDomainId() + + "] already added" ); + } + getDomainLengths().put( domain_lengths.getDomainId(), domain_lengths ); + } + + private void addLength( final String domain_id, final Species species, final int domain_length ) { + if ( !getDomainLengths().containsKey( domain_id ) ) { + addDomainLengths( new DomainLengths( domain_id ) ); + } + getDomainLengths().get( domain_id ).addLength( species, domain_length ); + } + + private SortedMap getDomainLengths() { + return _domain_lengths; + } } diff --git a/forester/java/src/org/forester/surfacing/DomainParsimonyCalculator.java b/forester/java/src/org/forester/surfacing/DomainParsimonyCalculator.java index 4de0b35..e667d39 100644 --- a/forester/java/src/org/forester/surfacing/DomainParsimonyCalculator.java +++ b/forester/java/src/org/forester/surfacing/DomainParsimonyCalculator.java @@ -55,16 +55,16 @@ import org.forester.util.ForesterUtil; public final class DomainParsimonyCalculator { private static final String TYPE_FORBINARY_CHARACTERS = "parsimony inferred"; - private CharacterStateMatrix _gain_loss_matrix; private CharacterStateMatrix _binary_internal_states_matrix; + private int _cost; + private Map> _domain_id_to_secondary_features_map; + private CharacterStateMatrix _gain_loss_matrix; private final List _gwcd_list; private final Phylogeny _phylogeny; - private int _total_losses; + private SortedSet _positive_filter; private int _total_gains; + private int _total_losses; private int _total_unchanged; - private int _cost; - private Map> _domain_id_to_secondary_features_map; - private SortedSet _positive_filter; private DomainParsimonyCalculator( final Phylogeny phylogeny ) { init(); @@ -87,70 +87,6 @@ public final class DomainParsimonyCalculator { setDomainIdToSecondaryFeaturesMap( domain_id_to_secondary_features_map ); } - int calculateNumberOfBinaryDomainCombination() { - if ( getGenomeWideCombinableDomainsList().isEmpty() ) { - throw new IllegalArgumentException( "genome wide combinable domains list is empty" ); - } - final Set all_binary_combinations = new HashSet(); - for( final GenomeWideCombinableDomains gwcd : getGenomeWideCombinableDomainsList() ) { - for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) { - all_binary_combinations.add( bc ); - } - } - return all_binary_combinations.size(); - } - - CharacterStateMatrix createMatrixOfBinaryDomainCombinationPresenceOrAbsence() { - return createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() ); - } - - CharacterStateMatrix createMatrixOfDomainPresenceOrAbsence() { - return createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList(), getPositiveFilter() ); - } - - CharacterStateMatrix createMatrixOfSecondaryFeaturePresenceOrAbsence( final Map mapping_results_map ) { - return createMatrixOfSecondaryFeaturePresenceOrAbsence( getGenomeWideCombinableDomainsList(), - getDomainIdToSecondaryFeaturesMap(), - mapping_results_map ); - } - - Phylogeny decoratePhylogenyWithDomains( final Phylogeny phylogeny ) { - for( final PhylogenyNodeIterator it = phylogeny.iteratorPostorder(); it.hasNext(); ) { - final PhylogenyNode node = it.next(); - final String node_identifier = node.getName(); - final BinaryCharacters bc = new BinaryCharacters( getUnitsOnNode( node_identifier ), - getUnitsGainedOnNode( node_identifier ), - getUnitsLostOnNode( node_identifier ), - TYPE_FORBINARY_CHARACTERS, - getSumOfPresentOnNode( node_identifier ), - getSumOfGainsOnNode( node_identifier ), - getSumOfLossesOnNode( node_identifier ) ); - node.getNodeData().setBinaryCharacters( bc ); - } - return phylogeny; - } - - private void executeDolloParsimony( final boolean on_domain_presence ) { - reset(); - final DolloParsimony dollo = DolloParsimony.createInstance(); - dollo.setReturnGainLossMatrix( true ); - dollo.setReturnInternalStates( true ); - CharacterStateMatrix states = null; - if ( on_domain_presence ) { - states = createMatrixOfDomainPresenceOrAbsence(); - } - else { - states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence(); - } - dollo.execute( getPhylogeny(), states ); - setGainLossMatrix( dollo.getGainLossMatrix() ); - setBinaryInternalStatesMatrix( dollo.getInternalStatesMatrix() ); - setCost( dollo.getCost() ); - setTotalGains( dollo.getTotalGains() ); - setTotalLosses( dollo.getTotalLosses() ); - setTotalUnchanged( dollo.getTotalUnchanged() ); - } - public void executeDolloParsimonyOnBinaryDomainCombintionPresence() { executeDolloParsimony( false ); } @@ -183,86 +119,18 @@ public final class DomainParsimonyCalculator { setTotalUnchanged( dollo.getTotalUnchanged() ); } - private void executeFitchParsimony( final boolean on_domain_presence, - final boolean use_last, - final boolean randomize, - final long random_number_seed ) { - reset(); - if ( use_last ) { - System.out.println( " Fitch parsimony: use_last = true" ); - } - final FitchParsimony fitch = new FitchParsimony(); - fitch.setRandomize( randomize ); - if ( randomize ) { - fitch.setRandomNumberSeed( random_number_seed ); - } - fitch.setUseLast( use_last ); - fitch.setReturnGainLossMatrix( true ); - fitch.setReturnInternalStates( true ); - CharacterStateMatrix states = null; - if ( on_domain_presence ) { - states = createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList() ); - } - else { - states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() ); - } - fitch.execute( getPhylogeny(), states, true ); - setGainLossMatrix( fitch.getGainLossMatrix() ); - setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() ); - setCost( fitch.getCost() ); - setTotalGains( fitch.getTotalGains() ); - setTotalLosses( fitch.getTotalLosses() ); - setTotalUnchanged( fitch.getTotalUnchanged() ); - } - - private void executeFitchParsimonyOnSecondaryFeatures( final boolean use_last, - final boolean randomize, - final long random_number_seed ) { - reset(); - if ( use_last ) { - System.out.println( " Fitch parsimony: use_last = true" ); - } - final FitchParsimony fitch = new FitchParsimony(); - fitch.setRandomize( randomize ); - if ( randomize ) { - fitch.setRandomNumberSeed( random_number_seed ); - } - fitch.setUseLast( use_last ); - fitch.setReturnGainLossMatrix( true ); - fitch.setReturnInternalStates( true ); - final Map> map = getDomainIdToSecondaryFeaturesMap(); - final Map newmap = new HashMap(); - final Iterator>> it = map.entrySet().iterator(); - while ( it.hasNext() ) { - final Map.Entry> pair = it.next(); - if ( pair.getValue().size() != 1 ) { - throw new IllegalArgumentException( pair.getKey() + " mapps to " + pair.getValue().size() + " items" ); - } - newmap.put( pair.getKey(), ( String ) pair.getValue().toArray()[ 0 ] ); - } - final CharacterStateMatrix states = createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList(), - newmap ); - fitch.execute( getPhylogeny(), states, true ); - setGainLossMatrix( fitch.getGainLossMatrix() ); - setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() ); - setCost( fitch.getCost() ); - setTotalGains( fitch.getTotalGains() ); - setTotalLosses( fitch.getTotalLosses() ); - setTotalUnchanged( fitch.getTotalUnchanged() ); - } - public void executeFitchParsimonyOnBinaryDomainCombintion( final boolean use_last ) { executeFitchParsimony( false, use_last, false, 0 ); } - public void executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( final boolean use_last ) { - executeFitchParsimonyOnSecondaryFeatures( use_last, false, 0 ); - } - public void executeFitchParsimonyOnBinaryDomainCombintion( final long random_number_seed ) { executeFitchParsimony( false, false, true, random_number_seed ); } + public void executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( final boolean use_last ) { + executeFitchParsimonyOnSecondaryFeatures( use_last, false, 0 ); + } + public void executeFitchParsimonyOnDomainPresence( final boolean use_last ) { executeFitchParsimony( true, use_last, false, 0 ); } @@ -348,10 +216,6 @@ public final class DomainParsimonyCalculator { return _cost; } - private Map> getDomainIdToSecondaryFeaturesMap() { - return _domain_id_to_secondary_features_map; - } - public CharacterStateMatrix getGainLossCountsMatrix() { final CharacterStateMatrix matrix = new BasicCharacterStateMatrix( getGainLossMatrix() .getNumberOfIdentifiers(), 3 ); @@ -384,10 +248,6 @@ public final class DomainParsimonyCalculator { return _gain_loss_matrix; } - private List getGenomeWideCombinableDomainsList() { - return _gwcd_list; - } - public CharacterStateMatrix getInternalStatesMatrix() { return _binary_internal_states_matrix; } @@ -409,14 +269,6 @@ public final class DomainParsimonyCalculator { return net; } - private Phylogeny getPhylogeny() { - return _phylogeny; - } - - private SortedSet getPositiveFilter() { - return _positive_filter; - } - public int getSumOfGainsOnNode( final String node_identifier ) { return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.GAIN ); } @@ -429,18 +281,6 @@ public final class DomainParsimonyCalculator { return getSumOfGainsOnNode( node_identifier ) + getSumOfUnchangedPresentOnNode( node_identifier ); } - int getSumOfUnchangedAbsentOnNode( final String node_identifier ) { - return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT ); - } - - int getSumOfUnchangedOnNode( final String node_identifier ) { - return getSumOfUnchangedPresentOnNode( node_identifier ) + getSumOfUnchangedAbsentOnNode( node_identifier ); - } - - int getSumOfUnchangedPresentOnNode( final String node_identifier ) { - return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT ); - } - public int getTotalGains() { return _total_gains; } @@ -467,59 +307,219 @@ public final class DomainParsimonyCalculator { return present; } - SortedSet getUnitsUnchangedAbsentOnNode( final String node_identifier ) { - return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT ); - } - - SortedSet getUnitsUnchangedPresentOnNode( final String node_identifier ) { - return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT ); - } - - private void init() { - setDomainIdToSecondaryFeaturesMap( null ); - setPositiveFilter( null ); - reset(); + int calculateNumberOfBinaryDomainCombination() { + if ( getGenomeWideCombinableDomainsList().isEmpty() ) { + throw new IllegalArgumentException( "genome wide combinable domains list is empty" ); + } + final Set all_binary_combinations = new HashSet(); + for( final GenomeWideCombinableDomains gwcd : getGenomeWideCombinableDomainsList() ) { + for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) { + all_binary_combinations.add( bc ); + } + } + return all_binary_combinations.size(); } - private void reset() { - setGainLossMatrix( null ); - setBinaryInternalStatesMatrix( null ); - setCost( -1 ); - setTotalGains( -1 ); - setTotalLosses( -1 ); - setTotalUnchanged( -1 ); + CharacterStateMatrix createMatrixOfBinaryDomainCombinationPresenceOrAbsence() { + return createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() ); } - private void setBinaryInternalStatesMatrix( final CharacterStateMatrix binary_states_matrix ) { - _binary_internal_states_matrix = binary_states_matrix; + CharacterStateMatrix createMatrixOfDomainPresenceOrAbsence() { + return createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList(), getPositiveFilter() ); } - private void setCost( final int cost ) { - _cost = cost; + CharacterStateMatrix createMatrixOfSecondaryFeaturePresenceOrAbsence( final Map mapping_results_map ) { + return createMatrixOfSecondaryFeaturePresenceOrAbsence( getGenomeWideCombinableDomainsList(), + getDomainIdToSecondaryFeaturesMap(), + mapping_results_map ); } - private void setDomainIdToSecondaryFeaturesMap( final Map> domain_id_to_secondary_features_map ) { - _domain_id_to_secondary_features_map = domain_id_to_secondary_features_map; + Phylogeny decoratePhylogenyWithDomains( final Phylogeny phylogeny ) { + for( final PhylogenyNodeIterator it = phylogeny.iteratorPostorder(); it.hasNext(); ) { + final PhylogenyNode node = it.next(); + final String node_identifier = node.getName(); + final BinaryCharacters bc = new BinaryCharacters( getUnitsOnNode( node_identifier ), + getUnitsGainedOnNode( node_identifier ), + getUnitsLostOnNode( node_identifier ), + TYPE_FORBINARY_CHARACTERS, + getSumOfPresentOnNode( node_identifier ), + getSumOfGainsOnNode( node_identifier ), + getSumOfLossesOnNode( node_identifier ) ); + node.getNodeData().setBinaryCharacters( bc ); + } + return phylogeny; } - private void setGainLossMatrix( final CharacterStateMatrix gain_loss_matrix ) { - _gain_loss_matrix = gain_loss_matrix; + int getSumOfUnchangedAbsentOnNode( final String node_identifier ) { + return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT ); } - private void setPositiveFilter( final SortedSet positive_filter ) { - _positive_filter = positive_filter; + int getSumOfUnchangedOnNode( final String node_identifier ) { + return getSumOfUnchangedPresentOnNode( node_identifier ) + getSumOfUnchangedAbsentOnNode( node_identifier ); } - private void setTotalGains( final int total_gains ) { - _total_gains = total_gains; + int getSumOfUnchangedPresentOnNode( final String node_identifier ) { + return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT ); } - private void setTotalLosses( final int total_losses ) { - _total_losses = total_losses; + SortedSet getUnitsUnchangedAbsentOnNode( final String node_identifier ) { + return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT ); } - private void setTotalUnchanged( final int total_unchanged ) { - _total_unchanged = total_unchanged; + SortedSet getUnitsUnchangedPresentOnNode( final String node_identifier ) { + return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT ); + } + + private void executeDolloParsimony( final boolean on_domain_presence ) { + reset(); + final DolloParsimony dollo = DolloParsimony.createInstance(); + dollo.setReturnGainLossMatrix( true ); + dollo.setReturnInternalStates( true ); + CharacterStateMatrix states = null; + if ( on_domain_presence ) { + states = createMatrixOfDomainPresenceOrAbsence(); + } + else { + states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence(); + } + dollo.execute( getPhylogeny(), states ); + setGainLossMatrix( dollo.getGainLossMatrix() ); + setBinaryInternalStatesMatrix( dollo.getInternalStatesMatrix() ); + setCost( dollo.getCost() ); + setTotalGains( dollo.getTotalGains() ); + setTotalLosses( dollo.getTotalLosses() ); + setTotalUnchanged( dollo.getTotalUnchanged() ); + } + + private void executeFitchParsimony( final boolean on_domain_presence, + final boolean use_last, + final boolean randomize, + final long random_number_seed ) { + reset(); + if ( use_last ) { + System.out.println( " Fitch parsimony: use_last = true" ); + } + final FitchParsimony fitch = new FitchParsimony(); + fitch.setRandomize( randomize ); + if ( randomize ) { + fitch.setRandomNumberSeed( random_number_seed ); + } + fitch.setUseLast( use_last ); + fitch.setReturnGainLossMatrix( true ); + fitch.setReturnInternalStates( true ); + CharacterStateMatrix states = null; + if ( on_domain_presence ) { + states = createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList() ); + } + else { + states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() ); + } + fitch.execute( getPhylogeny(), states, true ); + setGainLossMatrix( fitch.getGainLossMatrix() ); + setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() ); + setCost( fitch.getCost() ); + setTotalGains( fitch.getTotalGains() ); + setTotalLosses( fitch.getTotalLosses() ); + setTotalUnchanged( fitch.getTotalUnchanged() ); + } + + private void executeFitchParsimonyOnSecondaryFeatures( final boolean use_last, + final boolean randomize, + final long random_number_seed ) { + reset(); + if ( use_last ) { + System.out.println( " Fitch parsimony: use_last = true" ); + } + final FitchParsimony fitch = new FitchParsimony(); + fitch.setRandomize( randomize ); + if ( randomize ) { + fitch.setRandomNumberSeed( random_number_seed ); + } + fitch.setUseLast( use_last ); + fitch.setReturnGainLossMatrix( true ); + fitch.setReturnInternalStates( true ); + final Map> map = getDomainIdToSecondaryFeaturesMap(); + final Map newmap = new HashMap(); + final Iterator>> it = map.entrySet().iterator(); + while ( it.hasNext() ) { + final Map.Entry> pair = it.next(); + if ( pair.getValue().size() != 1 ) { + throw new IllegalArgumentException( pair.getKey() + " mapps to " + pair.getValue().size() + " items" ); + } + newmap.put( pair.getKey(), ( String ) pair.getValue().toArray()[ 0 ] ); + } + final CharacterStateMatrix states = createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList(), + newmap ); + fitch.execute( getPhylogeny(), states, true ); + setGainLossMatrix( fitch.getGainLossMatrix() ); + setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() ); + setCost( fitch.getCost() ); + setTotalGains( fitch.getTotalGains() ); + setTotalLosses( fitch.getTotalLosses() ); + setTotalUnchanged( fitch.getTotalUnchanged() ); + } + + private Map> getDomainIdToSecondaryFeaturesMap() { + return _domain_id_to_secondary_features_map; + } + + private List getGenomeWideCombinableDomainsList() { + return _gwcd_list; + } + + private Phylogeny getPhylogeny() { + return _phylogeny; + } + + private SortedSet getPositiveFilter() { + return _positive_filter; + } + + private void init() { + setDomainIdToSecondaryFeaturesMap( null ); + setPositiveFilter( null ); + reset(); + } + + private void reset() { + setGainLossMatrix( null ); + setBinaryInternalStatesMatrix( null ); + setCost( -1 ); + setTotalGains( -1 ); + setTotalLosses( -1 ); + setTotalUnchanged( -1 ); + } + + private void setBinaryInternalStatesMatrix( final CharacterStateMatrix binary_states_matrix ) { + _binary_internal_states_matrix = binary_states_matrix; + } + + private void setCost( final int cost ) { + _cost = cost; + } + + private void setDomainIdToSecondaryFeaturesMap( final Map> domain_id_to_secondary_features_map ) { + _domain_id_to_secondary_features_map = domain_id_to_secondary_features_map; + } + + private void setGainLossMatrix( final CharacterStateMatrix gain_loss_matrix ) { + _gain_loss_matrix = gain_loss_matrix; + } + + private void setPositiveFilter( final SortedSet positive_filter ) { + _positive_filter = positive_filter; + } + + private void setTotalGains( final int total_gains ) { + _total_gains = total_gains; + } + + private void setTotalLosses( final int total_losses ) { + _total_losses = total_losses; + } + + private void setTotalUnchanged( final int total_unchanged ) { + _total_unchanged = total_unchanged; } public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny ) { @@ -544,68 +544,50 @@ public final class DomainParsimonyCalculator { return new DomainParsimonyCalculator( phylogeny, gwcd_list, domain_id_to_secondary_features_map ); } - /** - * For folds instead of Pfam-domains, for example - * - * - * @param gwcd_list - * @return - */ - static CharacterStateMatrix createMatrixOfSecondaryFeaturePresenceOrAbsence( final List gwcd_list, - final Map> domain_id_to_second_features_map, - final Map mapping_results_map ) { + public static CharacterStateMatrix createMatrixOfBinaryDomainCombinationPresenceOrAbsence( final List gwcd_list ) { if ( gwcd_list.isEmpty() ) { throw new IllegalArgumentException( "genome wide combinable domains list is empty" ); } - if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) { - throw new IllegalArgumentException( "domain id to secondary features map is null or empty" ); - } final int number_of_identifiers = gwcd_list.size(); - final SortedSet all_secondary_features = new TreeSet(); + final SortedSet all_binary_combinations = new TreeSet(); + final Set[] binary_combinations_per_genome = new HashSet[ number_of_identifiers ]; + int identifier_index = 0; for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { - int mapped = 0; - int not_mapped = 0; - for( final String domain : gwcd.getAllDomainIds() ) { - if ( domain_id_to_second_features_map.containsKey( domain ) ) { - all_secondary_features.addAll( domain_id_to_second_features_map.get( domain ) ); - mapped++; - } - else { - not_mapped++; - } - } - if ( mapping_results_map != null ) { - final MappingResults mr = new MappingResults(); - mr.setDescription( gwcd.getSpecies().getSpeciesId() ); - mr.setSumOfSuccesses( mapped ); - mr.setSumOfFailures( not_mapped ); - mapping_results_map.put( gwcd.getSpecies(), mr ); + binary_combinations_per_genome[ identifier_index ] = new HashSet(); + for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) { + all_binary_combinations.add( bc ); + binary_combinations_per_genome[ identifier_index ].add( bc ); } + ++identifier_index; } - final int number_of_characters = all_secondary_features.size(); + final int number_of_characters = all_binary_combinations.size(); final CharacterStateMatrix matrix = new BasicCharacterStateMatrix( number_of_identifiers, number_of_characters ); int character_index = 0; - for( final String second_id : all_secondary_features ) { - matrix.setCharacter( character_index++, second_id ); + for( final BinaryDomainCombination bc : all_binary_combinations ) { + matrix.setCharacter( character_index++, bc.toString() ); } - int identifier_index = 0; + identifier_index = 0; final Set all_identifiers = new HashSet(); for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { final String species_id = gwcd.getSpecies().getSpeciesId(); if ( all_identifiers.contains( species_id ) ) { - throw new IllegalArgumentException( "species [" + species_id + "] is not unique" ); + throw new AssertionError( "species [" + species_id + "] is not unique" ); } all_identifiers.add( species_id ); matrix.setIdentifier( identifier_index, species_id ); - final Set all_second_per_gwcd = new HashSet(); - for( final String domain : gwcd.getAllDomainIds() ) { - if ( domain_id_to_second_features_map.containsKey( domain ) ) { - all_second_per_gwcd.addAll( domain_id_to_second_features_map.get( domain ) ); - } - } for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) { - if ( all_second_per_gwcd.contains( matrix.getCharacter( ci ) ) ) { + BinaryDomainCombination bc = null; + if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED_ADJACTANT ) { + bc = AdjactantDirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) ); + } + else if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED ) { + bc = DirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) ); + } + else { + bc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) ); + } + if ( binary_combinations_per_genome[ identifier_index ].contains( bc ) ) { matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT ); } else { @@ -617,65 +599,59 @@ public final class DomainParsimonyCalculator { return matrix; } - public static CharacterStateMatrix createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( final List gwcd_list, - final Map domain_id_to_second_features_map ) { + public static CharacterStateMatrix createMatrixOfDomainPresenceOrAbsence( final List gwcd_list, + final SortedSet positive_filter ) { if ( gwcd_list.isEmpty() ) { throw new IllegalArgumentException( "genome wide combinable domains list is empty" ); } - if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) { - throw new IllegalArgumentException( "domain id to secondary features map is null or empty" ); + if ( ( positive_filter != null ) && ( positive_filter.size() < 1 ) ) { + throw new IllegalArgumentException( "positive filter is empty" ); } final int number_of_identifiers = gwcd_list.size(); - final SortedSet all_binary_combinations_mapped = new TreeSet(); - final Set[] binary_combinations_per_genome_mapped = new HashSet[ number_of_identifiers ]; - int identifier_index = 0; - final SortedSet no_mappings = new TreeSet(); + final SortedSet all_domain_ids = new TreeSet(); for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { - binary_combinations_per_genome_mapped[ identifier_index ] = new HashSet(); - for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) { - final BinaryDomainCombination mapped_bc = mapBinaryDomainCombination( domain_id_to_second_features_map, - bc, - no_mappings ); - all_binary_combinations_mapped.add( mapped_bc ); - binary_combinations_per_genome_mapped[ identifier_index ].add( mapped_bc ); + for( final String domain : gwcd.getAllDomainIds() ) { + all_domain_ids.add( domain ); } - ++identifier_index; } - if ( !no_mappings.isEmpty() ) { - ForesterUtil.programMessage( surfacing.PRG_NAME, "No mappings for the following (" + no_mappings.size() - + "):" ); - for( final String id : no_mappings ) { - ForesterUtil.programMessage( surfacing.PRG_NAME, id ); + int number_of_characters = all_domain_ids.size(); + if ( positive_filter != null ) { + //number_of_characters = positive_filter.size(); -- bad if doms in filter but not in genomes + number_of_characters = 0; + for( final String id : all_domain_ids ) { + if ( positive_filter.contains( id ) ) { + number_of_characters++; + } } } - final int number_of_characters = all_binary_combinations_mapped.size(); final CharacterStateMatrix matrix = new BasicCharacterStateMatrix( number_of_identifiers, number_of_characters ); int character_index = 0; - for( final BinaryDomainCombination bc : all_binary_combinations_mapped ) { - matrix.setCharacter( character_index++, bc.toString() ); + for( final String id : all_domain_ids ) { + if ( positive_filter == null ) { + matrix.setCharacter( character_index++, id ); + } + else { + if ( positive_filter.contains( id ) ) { + matrix.setCharacter( character_index++, id ); + } + } } - identifier_index = 0; + int identifier_index = 0; final Set all_identifiers = new HashSet(); for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { final String species_id = gwcd.getSpecies().getSpeciesId(); if ( all_identifiers.contains( species_id ) ) { - throw new AssertionError( "species [" + species_id + "] is not unique" ); + throw new IllegalArgumentException( "species [" + species_id + "] is not unique" ); } all_identifiers.add( species_id ); matrix.setIdentifier( identifier_index, species_id ); for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) { - BinaryDomainCombination bc = null; - if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED_ADJACTANT ) { - bc = AdjactantDirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) ); - } - else if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED ) { - bc = DirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) ); - } - else { - bc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) ); + if ( ForesterUtil.isEmpty( matrix.getCharacter( ci ) ) ) { + throw new RuntimeException( "this should not have happened: problem with character #" + ci ); } - if ( binary_combinations_per_genome_mapped[ identifier_index ].contains( bc ) ) { + final String id = matrix.getCharacter( ci ); + if ( gwcd.contains( id ) ) { matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT ); } else { @@ -687,49 +663,42 @@ public final class DomainParsimonyCalculator { return matrix; } - private static BinaryDomainCombination mapBinaryDomainCombination( final Map domain_id_to_second_features_map, - final BinaryDomainCombination bc, - final SortedSet no_mappings ) { - String id0 = ""; - String id1 = ""; - if ( !domain_id_to_second_features_map.containsKey( bc.getId0() ) ) { - no_mappings.add( bc.getId0() ); - id0 = bc.getId0(); - } - else { - id0 = domain_id_to_second_features_map.get( bc.getId0() ); - } - if ( !domain_id_to_second_features_map.containsKey( bc.getId1() ) ) { - no_mappings.add( bc.getId1() ); - id1 = bc.getId1(); - } - else { - id1 = domain_id_to_second_features_map.get( bc.getId1() ); - } - return new BasicBinaryDomainCombination( id0, id1 ); - } - - public static CharacterStateMatrix createMatrixOfBinaryDomainCombinationPresenceOrAbsence( final List gwcd_list ) { + public static CharacterStateMatrix createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( final List gwcd_list, + final Map domain_id_to_second_features_map ) { if ( gwcd_list.isEmpty() ) { throw new IllegalArgumentException( "genome wide combinable domains list is empty" ); } + if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) { + throw new IllegalArgumentException( "domain id to secondary features map is null or empty" ); + } final int number_of_identifiers = gwcd_list.size(); - final SortedSet all_binary_combinations = new TreeSet(); - final Set[] binary_combinations_per_genome = new HashSet[ number_of_identifiers ]; + final SortedSet all_binary_combinations_mapped = new TreeSet(); + final Set[] binary_combinations_per_genome_mapped = new HashSet[ number_of_identifiers ]; int identifier_index = 0; + final SortedSet no_mappings = new TreeSet(); for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { - binary_combinations_per_genome[ identifier_index ] = new HashSet(); + binary_combinations_per_genome_mapped[ identifier_index ] = new HashSet(); for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) { - all_binary_combinations.add( bc ); - binary_combinations_per_genome[ identifier_index ].add( bc ); + final BinaryDomainCombination mapped_bc = mapBinaryDomainCombination( domain_id_to_second_features_map, + bc, + no_mappings ); + all_binary_combinations_mapped.add( mapped_bc ); + binary_combinations_per_genome_mapped[ identifier_index ].add( mapped_bc ); } ++identifier_index; } - final int number_of_characters = all_binary_combinations.size(); + if ( !no_mappings.isEmpty() ) { + ForesterUtil.programMessage( surfacing.PRG_NAME, "No mappings for the following (" + no_mappings.size() + + "):" ); + for( final String id : no_mappings ) { + ForesterUtil.programMessage( surfacing.PRG_NAME, id ); + } + } + final int number_of_characters = all_binary_combinations_mapped.size(); final CharacterStateMatrix matrix = new BasicCharacterStateMatrix( number_of_identifiers, number_of_characters ); int character_index = 0; - for( final BinaryDomainCombination bc : all_binary_combinations ) { + for( final BinaryDomainCombination bc : all_binary_combinations_mapped ) { matrix.setCharacter( character_index++, bc.toString() ); } identifier_index = 0; @@ -752,7 +721,7 @@ public final class DomainParsimonyCalculator { else { bc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) ); } - if ( binary_combinations_per_genome[ identifier_index ].contains( bc ) ) { + if ( binary_combinations_per_genome_mapped[ identifier_index ].contains( bc ) ) { matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT ); } else { @@ -768,43 +737,50 @@ public final class DomainParsimonyCalculator { return createMatrixOfDomainPresenceOrAbsence( gwcd_list, null ); } - public static CharacterStateMatrix createMatrixOfDomainPresenceOrAbsence( final List gwcd_list, - final SortedSet positive_filter ) { + /** + * For folds instead of Pfam-domains, for example + * + * + * @param gwcd_list + * @return + */ + static CharacterStateMatrix createMatrixOfSecondaryFeaturePresenceOrAbsence( final List gwcd_list, + final Map> domain_id_to_second_features_map, + final Map mapping_results_map ) { if ( gwcd_list.isEmpty() ) { throw new IllegalArgumentException( "genome wide combinable domains list is empty" ); } - if ( ( positive_filter != null ) && ( positive_filter.size() < 1 ) ) { - throw new IllegalArgumentException( "positive filter is empty" ); + if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) { + throw new IllegalArgumentException( "domain id to secondary features map is null or empty" ); } final int number_of_identifiers = gwcd_list.size(); - final SortedSet all_domain_ids = new TreeSet(); + final SortedSet all_secondary_features = new TreeSet(); for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { + int mapped = 0; + int not_mapped = 0; for( final String domain : gwcd.getAllDomainIds() ) { - all_domain_ids.add( domain ); - } - } - int number_of_characters = all_domain_ids.size(); - if ( positive_filter != null ) { - //number_of_characters = positive_filter.size(); -- bad if doms in filter but not in genomes - number_of_characters = 0; - for( final String id : all_domain_ids ) { - if ( positive_filter.contains( id ) ) { - number_of_characters++; + if ( domain_id_to_second_features_map.containsKey( domain ) ) { + all_secondary_features.addAll( domain_id_to_second_features_map.get( domain ) ); + mapped++; + } + else { + not_mapped++; } } + if ( mapping_results_map != null ) { + final MappingResults mr = new MappingResults(); + mr.setDescription( gwcd.getSpecies().getSpeciesId() ); + mr.setSumOfSuccesses( mapped ); + mr.setSumOfFailures( not_mapped ); + mapping_results_map.put( gwcd.getSpecies(), mr ); + } } + final int number_of_characters = all_secondary_features.size(); final CharacterStateMatrix matrix = new BasicCharacterStateMatrix( number_of_identifiers, number_of_characters ); int character_index = 0; - for( final String id : all_domain_ids ) { - if ( positive_filter == null ) { - matrix.setCharacter( character_index++, id ); - } - else { - if ( positive_filter.contains( id ) ) { - matrix.setCharacter( character_index++, id ); - } - } + for( final String second_id : all_secondary_features ) { + matrix.setCharacter( character_index++, second_id ); } int identifier_index = 0; final Set all_identifiers = new HashSet(); @@ -815,12 +791,14 @@ public final class DomainParsimonyCalculator { } all_identifiers.add( species_id ); matrix.setIdentifier( identifier_index, species_id ); - for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) { - if ( ForesterUtil.isEmpty( matrix.getCharacter( ci ) ) ) { - throw new RuntimeException( "this should not have happened: problem with character #" + ci ); + final Set all_second_per_gwcd = new HashSet(); + for( final String domain : gwcd.getAllDomainIds() ) { + if ( domain_id_to_second_features_map.containsKey( domain ) ) { + all_second_per_gwcd.addAll( domain_id_to_second_features_map.get( domain ) ); } - final String id = matrix.getCharacter( ci ); - if ( gwcd.contains( id ) ) { + } + for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) { + if ( all_second_per_gwcd.contains( matrix.getCharacter( ci ) ) ) { matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT ); } else { @@ -879,4 +857,26 @@ public final class DomainParsimonyCalculator { } return d; } + + private static BinaryDomainCombination mapBinaryDomainCombination( final Map domain_id_to_second_features_map, + final BinaryDomainCombination bc, + final SortedSet no_mappings ) { + String id0 = ""; + String id1 = ""; + if ( !domain_id_to_second_features_map.containsKey( bc.getId0() ) ) { + no_mappings.add( bc.getId0() ); + id0 = bc.getId0(); + } + else { + id0 = domain_id_to_second_features_map.get( bc.getId0() ); + } + if ( !domain_id_to_second_features_map.containsKey( bc.getId1() ) ) { + no_mappings.add( bc.getId1() ); + id1 = bc.getId1(); + } + else { + id1 = domain_id_to_second_features_map.get( bc.getId1() ); + } + return new BasicBinaryDomainCombination( id0, id1 ); + } } diff --git a/forester/java/src/org/forester/surfacing/DomainSimilarity.java b/forester/java/src/org/forester/surfacing/DomainSimilarity.java deleted file mode 100644 index 5a0735e..0000000 --- a/forester/java/src/org/forester/surfacing/DomainSimilarity.java +++ /dev/null @@ -1,108 +0,0 @@ -// $Id: -// -// FORESTER -- software libraries and applications -// for evolutionary biology research and applications. -// -// Copyright (C) 2008-2009 Christian M. Zmasek -// Copyright (C) 2008-2009 Burnham Institute for Medical Research -// All rights reserved -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA -// -// Contact: phylosoft @ gmail . com -// WWW: https://sites.google.com/site/cmzmasek/home/software/forester - -package org.forester.surfacing; - -import java.util.Map; -import java.util.SortedMap; -import java.util.SortedSet; - -import org.forester.phylogeny.Phylogeny; -import org.forester.species.Species; -import org.forester.surfacing.PrintableDomainSimilarity.PRINT_OPTION; - -/* - * This is to represent a measure of similarity between two or more domains from - * different genomes. - */ -public interface DomainSimilarity extends Comparable { - - static public enum DomainSimilarityScoring { - DOMAINS, PROTEINS, COMBINATIONS; - } - - public static enum DomainSimilaritySortField { - MIN, MAX, SD, MEAN, ABS_MAX_COUNTS_DIFFERENCE, MAX_COUNTS_DIFFERENCE, MAX_DIFFERENCE, SPECIES_COUNT, DOMAIN_ID, - } - - public SortedSet getCombinableDomainIds( final Species species_of_combinable_domain ); - - public String getDomainId(); - - /** - * For pairwise similarities, this should return the "difference"; for example the difference in counts - * for copy number based features (the same as getMaximalDifferenceInCounts(), or the number - * of actually different domain combinations. - * For pairwise similarities, this should return the difference, - * while for comparisons of more than two domains, this should return the maximal difference - * - * - * - * @return - */ - public int getMaximalDifference(); - - /** - * For pairwise similarities, this should return the difference in counts, - * while for comparisons of more than two domains, this should return the maximal difference - * in counts - * - * - * @return the (maximal) difference in counts - */ - public int getMaximalDifferenceInCounts(); - - public double getMaximalSimilarityScore(); - - public double getMeanSimilarityScore(); - - public double getMinimalSimilarityScore(); - - /** - * This should return the number of pairwise distances used to calculate - * this similarity score - * - * @return the number of pairwise distances - */ - public int getN(); - - public SortedSet getSpecies(); - - /** - * This should return a map, which maps species names to - * SpeciesSpecificDomainSimilariyData - * - * - * @return SortedMap - */ - public SortedMap getSpeciesData(); - - public double getStandardDeviationOfSimilarityScore(); - - public StringBuffer toStringBuffer( PRINT_OPTION print_option, - Map tax_code_to_id_map, - Phylogeny phy ); -} diff --git a/forester/java/src/org/forester/surfacing/DomainSimilarityCalculator.java b/forester/java/src/org/forester/surfacing/DomainSimilarityCalculator.java index fa0ea62..ad72c45 100644 --- a/forester/java/src/org/forester/surfacing/DomainSimilarityCalculator.java +++ b/forester/java/src/org/forester/surfacing/DomainSimilarityCalculator.java @@ -32,16 +32,16 @@ import java.util.SortedSet; public interface DomainSimilarityCalculator { - public SortedSet calculateSimilarities( final PairwiseDomainSimilarityCalculator pairwise_calculator, - final List cdc_list, - final boolean ignore_domains_without_combinations_in_any_genome, - final boolean ignore_domains_specific_to_one_genome );; + public SortedSet calculateSimilarities( final PairwiseDomainSimilarityCalculator pairwise_calculator, + final List cdc_list, + final boolean ignore_domains_without_combinations_in_any_genome, + final boolean ignore_domains_specific_to_one_genome );; public static enum Detailedness { BASIC, LIST_COMBINING_DOMAIN_FOR_EACH_SPECIES, PUNCTILIOUS } public static enum GoAnnotationOutput { - NONE, ALL + ALL, NONE } } diff --git a/forester/java/src/org/forester/surfacing/GenomeWideCombinableDomains.java b/forester/java/src/org/forester/surfacing/GenomeWideCombinableDomains.java index 4dbe718..810e409 100644 --- a/forester/java/src/org/forester/surfacing/GenomeWideCombinableDomains.java +++ b/forester/java/src/org/forester/surfacing/GenomeWideCombinableDomains.java @@ -52,8 +52,6 @@ public interface GenomeWideCombinableDomains { public DomainCombinationType getDomainCombinationType(); - SortedSet getMostPromiscuosDomain(); - /** * This should return a statistic for per domain * promiscuity in a genome. @@ -75,7 +73,9 @@ public interface GenomeWideCombinableDomains { public StringBuilder toStringBuilder( GenomeWideCombinableDomainsSortOrder order ); + SortedSet getMostPromiscuosDomain(); + public static enum GenomeWideCombinableDomainsSortOrder { - ALPHABETICAL_KEY_ID, KEY_DOMAIN_PROTEINS_COUNT, KEY_DOMAIN_COUNT, COMBINATIONS_COUNT + ALPHABETICAL_KEY_ID, COMBINATIONS_COUNT, KEY_DOMAIN_COUNT, KEY_DOMAIN_PROTEINS_COUNT } } diff --git a/forester/java/src/org/forester/surfacing/MappingResults.java b/forester/java/src/org/forester/surfacing/MappingResults.java index 69a6a31..2589ed3 100644 --- a/forester/java/src/org/forester/surfacing/MappingResults.java +++ b/forester/java/src/org/forester/surfacing/MappingResults.java @@ -29,8 +29,8 @@ package org.forester.surfacing; public class MappingResults { private String _description; - private int _sum_of_successes; private int _sum_of_failures; + private int _sum_of_successes; public String getDescription() { return _description; diff --git a/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java b/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java index 4a78a13..9c6e1b5 100644 --- a/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java +++ b/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java @@ -52,8 +52,8 @@ import org.forester.util.ForesterUtil; public class PairwiseGenomeComparator { private List _domain_distance_scores_means; - private List _shared_domains_based_distances; private List _shared_binary_combinations_based_distances; + private List _shared_domains_based_distances; public PairwiseGenomeComparator() { init(); @@ -71,20 +71,14 @@ public class PairwiseGenomeComparator { return _shared_domains_based_distances; } - private void init() { - _domain_distance_scores_means = new ArrayList(); - _shared_domains_based_distances = new ArrayList(); - _shared_binary_combinations_based_distances = new ArrayList(); - } - public void performPairwiseComparisons( final StringBuilder html_desc, final boolean sort_by_species_count_first, final Detailedness detailedness, final boolean ignore_domains_without_combs_in_all_spec, final boolean ignore_domains_specific_to_one_species, - final DomainSimilarity.DomainSimilaritySortField domain_similarity_sort_field, + final PrintableDomainSimilarity.DomainSimilaritySortField domain_similarity_sort_field, final PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option, - final DomainSimilarity.DomainSimilarityScoring scoring, + final PrintableDomainSimilarity.DomainSimilarityScoring scoring, final Map> domain_id_to_go_ids_map, final Map go_id_to_term_map, final GoNameSpace go_namespace_limit, @@ -146,7 +140,7 @@ public class PairwiseGenomeComparator { sort_by_species_count_first, true, calc_similarity_scores ); - final SortedSet similarities = calc + final SortedSet similarities = calc .calculateSimilarities( pw_calc, genome_pair, ignore_domains_without_combs_in_all_spec, @@ -299,6 +293,12 @@ public class PairwiseGenomeComparator { } } + private void init() { + _domain_distance_scores_means = new ArrayList(); + _shared_domains_based_distances = new ArrayList(); + _shared_binary_combinations_based_distances = new ArrayList(); + } + static private String[] getAllUniqueDomainIdAsArray( final List list_of_genome_wide_combinable_domains ) { String[] all_domain_ids_array; final SortedSet all_domain_ids = new TreeSet(); diff --git a/forester/java/src/org/forester/surfacing/PrintableDomainSimilarity.java b/forester/java/src/org/forester/surfacing/PrintableDomainSimilarity.java index 6247714..7fe04aa 100644 --- a/forester/java/src/org/forester/surfacing/PrintableDomainSimilarity.java +++ b/forester/java/src/org/forester/surfacing/PrintableDomainSimilarity.java @@ -44,11 +44,12 @@ import org.forester.species.Species; import org.forester.surfacing.DomainSimilarityCalculator.Detailedness; import org.forester.util.ForesterUtil; -public class PrintableDomainSimilarity implements DomainSimilarity { +public class PrintableDomainSimilarity implements Comparable { - final public static String SPECIES_SEPARATOR = " "; - final private static int EQUAL = 0; - final private static String NO_SPECIES = " "; + final public static String SPECIES_SEPARATOR = " "; + final private static int EQUAL = 0; + final private static String NO_SPECIES = " "; + private static final boolean OUTPUT_TAXCODES_PER_DOMAIN = false; final private CombinableDomains _combinable_domains; private DomainSimilarityCalculator.Detailedness _detailedness; final private double _max; @@ -158,8 +159,7 @@ public class PrintableDomainSimilarity implements DomainSimilarity { } } - @Override - public int compareTo( final DomainSimilarity domain_similarity ) { + public int compareTo( final PrintableDomainSimilarity domain_similarity ) { if ( this == domain_similarity ) { return EQUAL; } @@ -173,7 +173,6 @@ public class PrintableDomainSimilarity implements DomainSimilarity { return compareByDomainId( domain_similarity ); } - @Override public SortedSet getCombinableDomainIds( final Species species_of_combinable_domain ) { final SortedSet sorted_ids = new TreeSet(); if ( getSpeciesData().containsKey( species_of_combinable_domain ) ) { @@ -185,42 +184,56 @@ public class PrintableDomainSimilarity implements DomainSimilarity { return sorted_ids; } - @Override public String getDomainId() { return getCombinableDomains().getKeyDomain(); } - @Override + /** + * For pairwise similarities, this should return the "difference"; for example the difference in counts + * for copy number based features (the same as getMaximalDifferenceInCounts(), or the number + * of actually different domain combinations. + * For pairwise similarities, this should return the difference, + * while for comparisons of more than two domains, this should return the maximal difference + * + */ public int getMaximalDifference() { return _max_difference; } - @Override + /** + * For pairwise similarities, this should return the difference in counts, + * while for comparisons of more than two domains, this should return the maximal difference + * in counts + * + * + * @return the (maximal) difference in counts + */ public int getMaximalDifferenceInCounts() { return _max_difference_in_counts; } - @Override public double getMaximalSimilarityScore() { return _max; } - @Override public double getMeanSimilarityScore() { return _mean; } - @Override public double getMinimalSimilarityScore() { return _min; } - @Override + /** + * This should return the number of pairwise distances used to calculate + * this similarity score + * + * @return the number of pairwise distances + */ public int getN() { return _n; } - @Override public SortedSet getSpecies() { final SortedSet species = new TreeSet(); for( final Species s : getSpeciesData().keySet() ) { @@ -233,12 +246,17 @@ public class PrintableDomainSimilarity implements DomainSimilarity { return _species_order; } - @Override + /** + * This should return a map, which maps species names to + * SpeciesSpecificDomainSimilariyData + * + * + * @return SortedMap + */ public SortedMap getSpeciesData() { return _species_data; } - @Override public double getStandardDeviationOfSimilarityScore() { return _sd; } @@ -254,7 +272,6 @@ public class PrintableDomainSimilarity implements DomainSimilarity { _species_order = species_order; } - @Override public StringBuffer toStringBuffer( final PrintableDomainSimilarity.PRINT_OPTION print_option, final Map tax_code_to_id_map, final Phylogeny phy ) { @@ -262,7 +279,7 @@ public class PrintableDomainSimilarity implements DomainSimilarity { case SIMPLE_TAB_DELIMITED: return toStringBufferSimpleTabDelimited(); case HTML: - return toStringBufferDetailedHTML( tax_code_to_id_map, phy ); + return toStringBufferDetailedHTML( tax_code_to_id_map, phy, OUTPUT_TAXCODES_PER_DOMAIN ); default: throw new AssertionError( "Unknown print option: " + print_option ); } @@ -274,14 +291,17 @@ public class PrintableDomainSimilarity implements DomainSimilarity { final Map tax_code_to_id_map, final Phylogeny phy ) { if ( html ) { + sb.append( "" ); + sb.append( "" ); addTaxWithLink( sb, species.getSpeciesId(), tax_code_to_id_map, phy ); + sb.append( "" ); } else { sb.append( species.getSpeciesId() ); } if ( getDetaildness() != DomainSimilarityCalculator.Detailedness.BASIC ) { if ( html ) { - sb.append( ":" ); + //sb.append( ":" ); } else { sb.append( "\t" ); @@ -289,7 +309,8 @@ public class PrintableDomainSimilarity implements DomainSimilarity { sb.append( getSpeciesData().get( species ).toStringBuffer( getDetaildness(), html ) ); } if ( html ) { - sb.append( "
" ); + //sb.append( "
" ); + sb.append( "" ); } else { sb.append( "\n\t" ); @@ -332,7 +353,7 @@ public class PrintableDomainSimilarity implements DomainSimilarity { sb.append( "" ); } - private int compareByDomainId( final DomainSimilarity other ) { + private int compareByDomainId( final PrintableDomainSimilarity other ) { return getDomainId().compareToIgnoreCase( other.getDomainId() ); } @@ -357,7 +378,7 @@ public class PrintableDomainSimilarity implements DomainSimilarity { } for( final Map.Entry> e : m.entrySet() ) { sb.append( "" + e.getKey() + "" ); - sb.append( ": " ); + sb.append( " " ); sb.append( "" ); for( final String tax : e.getValue() ) { final String hex = SurfacingUtil.obtainHexColorStringDependingOnTaxonomyGroup( tax, null ); @@ -379,8 +400,35 @@ public class PrintableDomainSimilarity implements DomainSimilarity { return sb; } + private StringBuffer getSpeciesDataInAlphabeticalOrder( final boolean html, + final Map tax_code_to_id_map, + final Phylogeny phy ) { + final StringBuffer sb = new StringBuffer(); + sb.append( "" ); + for( final Species species : getSpeciesData().keySet() ) { + addSpeciesSpecificDomainData( sb, species, html, tax_code_to_id_map, phy ); + } + sb.append( "
" ); + return sb; + } + + private StringBuffer getSpeciesDataInCustomOrder( final boolean html, + final Map tax_code_to_id_map, + final Phylogeny phy ) { + final StringBuffer sb = new StringBuffer(); + for( final Species order_species : getSpeciesCustomOrder() ) { + if ( getSpeciesData().keySet().contains( order_species ) ) { + addSpeciesSpecificDomainData( sb, order_species, html, tax_code_to_id_map, phy ); + } + else { + sb.append( PrintableDomainSimilarity.NO_SPECIES ); + sb.append( PrintableDomainSimilarity.SPECIES_SEPARATOR ); + } + } + return sb; + } + private StringBuffer getTaxonomyGroupDistribution( final Phylogeny tol ) { - //TODO work on me final SortedMap> domain_to_species_set_map = new TreeMap>(); for( final Species species : getSpeciesData().keySet() ) { for( final String combable_dom : getCombinableDomainIds( species ) ) { @@ -394,8 +442,6 @@ public class PrintableDomainSimilarity implements DomainSimilarity { sb.append( "" ); for( final Map.Entry> domain_to_species_set : domain_to_species_set_map.entrySet() ) { final Map counts = new HashMap(); - // final ValueComparator bvc = new ValueComparator( counts ); - // final SortedMap sorted_counts = new TreeMap( bvc ); for( final String tax_code : domain_to_species_set.getValue() ) { final String group = SurfacingUtil.obtainTaxonomyGroup( tax_code, tol ); if ( !ForesterUtil.isEmpty( group ) ) { @@ -424,14 +470,12 @@ public class PrintableDomainSimilarity implements DomainSimilarity { } counts_to_groups.get( c ).add( group_to_counts.getKey() ); } - // sorted_counts.putAll( counts ); sb.append( "" ); sb.append( "" ); - // sb.append( "" ); boolean first = true; for( final Entry> count_to_groups : counts_to_groups.entrySet() ) { if ( first ) { @@ -445,7 +489,7 @@ public class PrintableDomainSimilarity implements DomainSimilarity { sb.append( "" ); sb.append( "" ); } - // sb.append( "" ); sb.append( ForesterUtil.getLineSeparator() ); } sb.append( "
" ); sb.append( "" + domain_to_species_set.getKey() + "" ); - sb.append( ": " ); + sb.append( " " ); sb.append( "" ); final SortedSet groups = count_to_groups.getValue(); sb.append( count_to_groups.getKey() ); - sb.append( ":" ); + sb.append( " " ); for( final String group : groups ) { final Color color = ForesterUtil.obtainColorDependingOnTaxonomyGroup( group ); if ( color == null ) { @@ -465,83 +509,9 @@ public class PrintableDomainSimilarity implements DomainSimilarity { sb.append( "
" ); - // i am just a template and need to be modified for "printout" TODO - // for( final Map.Entry> e : m.entrySet() ) { - // sb.append( "" + e.getKey() + "" ); - // sb.append( ": " ); - // sb.append( "" ); - // for( final String tax : e.getValue() ) { - // final String hex = SurfacingUtil.obtainHexColorStringDependingOnTaxonomyGroup( tax, null ); - // if ( !ForesterUtil.isEmpty( hex ) ) { - // sb.append( "" ); - // sb.append( tax ); - // sb.append( "" ); - // } - // else { - // sb.append( tax ); - // } - // sb.append( " " ); - // } - // sb.append( "" ); - // sb.append( "
\n" ); - // } - return sb; - } - - /* - public class Testing { - - public static void main(String[] args) { - - HashMap map = new HashMap(); - ValueComparator bvc = new ValueComparator(map); - TreeMap sorted_map = new TreeMap(bvc); - - map.put("A",99.5); - map.put("B",67.4); - map.put("C",67.4); - map.put("D",67.3); - - System.out.println("unsorted map: "+map); - - sorted_map.putAll(map); - - System.out.println("results: "+sorted_map); - } - } - - - - */ - private StringBuffer getSpeciesDataInAlphabeticalOrder( final boolean html, - final Map tax_code_to_id_map, - final Phylogeny phy ) { - final StringBuffer sb = new StringBuffer(); - for( final Species species : getSpeciesData().keySet() ) { - addSpeciesSpecificDomainData( sb, species, html, tax_code_to_id_map, phy ); - } - return sb; - } - - private StringBuffer getSpeciesDataInCustomOrder( final boolean html, - final Map tax_code_to_id_map, - final Phylogeny phy ) { - final StringBuffer sb = new StringBuffer(); - for( final Species order_species : getSpeciesCustomOrder() ) { - if ( getSpeciesData().keySet().contains( order_species ) ) { - addSpeciesSpecificDomainData( sb, order_species, html, tax_code_to_id_map, phy ); - } - else { - sb.append( PrintableDomainSimilarity.NO_SPECIES ); - sb.append( PrintableDomainSimilarity.SPECIES_SEPARATOR ); - } - } return sb; } @@ -553,7 +523,9 @@ public class PrintableDomainSimilarity implements DomainSimilarity { return _treat_as_binary_comparison; } - private StringBuffer toStringBufferDetailedHTML( final Map tax_code_to_id_map, final Phylogeny phy ) { + private StringBuffer toStringBufferDetailedHTML( final Map tax_code_to_id_map, + final Phylogeny phy, + final boolean output_tax_codes_per_domain ) { final StringBuffer sb = new StringBuffer(); sb.append( "" ); sb.append( "" ); @@ -609,14 +581,18 @@ public class PrintableDomainSimilarity implements DomainSimilarity { if ( ( getSpeciesCustomOrder() == null ) || getSpeciesCustomOrder().isEmpty() ) { sb.append( "" ); sb.append( getSpeciesDataInAlphabeticalOrder( true, tax_code_to_id_map, phy ) ); - sb.append( getDomainDataInAlphabeticalOrder() ); + if ( output_tax_codes_per_domain ) { + sb.append( getDomainDataInAlphabeticalOrder() ); + } sb.append( getTaxonomyGroupDistribution( phy ) ); sb.append( "" ); } else { sb.append( "" ); sb.append( getSpeciesDataInCustomOrder( true, tax_code_to_id_map, phy ) ); - sb.append( getDomainDataInAlphabeticalOrder() ); + if ( output_tax_codes_per_domain ) { + sb.append( getDomainDataInAlphabeticalOrder() ); + } sb.append( getTaxonomyGroupDistribution( phy ) ); sb.append( "" ); } @@ -633,6 +609,14 @@ public class PrintableDomainSimilarity implements DomainSimilarity { return sb; } + static public enum DomainSimilarityScoring { + COMBINATIONS, DOMAINS, PROTEINS; + } + + public static enum DomainSimilaritySortField { + ABS_MAX_COUNTS_DIFFERENCE, DOMAIN_ID, MAX, MAX_COUNTS_DIFFERENCE, MAX_DIFFERENCE, MEAN, MIN, SD, SPECIES_COUNT, + } + public static enum PRINT_OPTION { HTML, SIMPLE_TAB_DELIMITED; } diff --git a/forester/java/src/org/forester/surfacing/PrintableSpeciesSpecificDcData.java b/forester/java/src/org/forester/surfacing/PrintableSpeciesSpecificDcData.java index 92de951..be2fcc7 100644 --- a/forester/java/src/org/forester/surfacing/PrintableSpeciesSpecificDcData.java +++ b/forester/java/src/org/forester/surfacing/PrintableSpeciesSpecificDcData.java @@ -98,25 +98,29 @@ class PrintableSpeciesSpecificDcData implements SpeciesSpecificDcData { final StringBuffer sb = new StringBuffer(); if ( detailedness == DomainSimilarityCalculator.Detailedness.PUNCTILIOUS ) { if ( html ) { - sb.append( " " ); + //sb.append( " " ); + sb.append( "" ); } sb.append( getKeyDomainDomainsCount() ); if ( html ) { - sb.append( ", " ); + //sb.append( ", " ); + sb.append( "" ); } else { sb.append( "\t" ); } sb.append( getKeyDomainProteinsCount() ); if ( html ) { - sb.append( ", " ); + // sb.append( ", " ); + sb.append( "" ); } else { sb.append( "\t" ); } sb.append( getCombinableDomainsCount() ); - if ( html && !getCombinableDomainIdToCountsMap().isEmpty() ) { - sb.append( ":" ); + if ( html /*&& !getCombinableDomainIdToCountsMap().isEmpty()*/) { + // sb.append( ":" ); + sb.append( "" ); } } if ( html ) { @@ -148,6 +152,7 @@ class PrintableSpeciesSpecificDcData implements SpeciesSpecificDcData { sb.append( link ); } sb.append( "]" ); + sb.append( "" ); } return sb; } diff --git a/forester/java/src/org/forester/surfacing/SpeciesSpecificDcData.java b/forester/java/src/org/forester/surfacing/SpeciesSpecificDcData.java index eb75a9b..be08c39 100644 --- a/forester/java/src/org/forester/surfacing/SpeciesSpecificDcData.java +++ b/forester/java/src/org/forester/surfacing/SpeciesSpecificDcData.java @@ -45,11 +45,11 @@ interface SpeciesSpecificDcData { */ public SortedMap getCombinableDomainIdToCountsMap(); + public SortedSet getKeyDomainProteins(); + public int getNumberOfProteinsExhibitingCombinationWith( final String domain_id ); public StringBuffer toStringBuffer( final DomainSimilarityCalculator.Detailedness detailedness, boolean html ); - public SortedSet getKeyDomainProteins(); - void addKeyDomainProtein( String protein ); } diff --git a/forester/java/src/org/forester/surfacing/SurfacingConstants.java b/forester/java/src/org/forester/surfacing/SurfacingConstants.java index aa588a0..697c84e 100644 --- a/forester/java/src/org/forester/surfacing/SurfacingConstants.java +++ b/forester/java/src/org/forester/surfacing/SurfacingConstants.java @@ -39,7 +39,7 @@ public class SurfacingConstants { public static final String NONE = "[none]"; public static final String PFAM_FAMILY_ID_LINK = "http://pfam.janelia.org/family/"; public static final String UNIPROT_TAXONOMY_ID_LINK = "http://www.uniprot.org/taxonomy/"; + static final boolean PRINT_MORE_DOM_SIMILARITY_INFO = false; static final boolean SECONDARY_FEATURES_ARE_SCOP = true; static final String SECONDARY_FEATURES_SCOP_LINK = "http://scop.mrc-lmb.cam.ac.uk/scop/search.cgi?key="; - static final boolean PRINT_MORE_DOM_SIMILARITY_INFO = false; } diff --git a/forester/java/src/org/forester/surfacing/SurfacingUtil.java b/forester/java/src/org/forester/surfacing/SurfacingUtil.java index 73cee01..c4f98d6 100644 --- a/forester/java/src/org/forester/surfacing/SurfacingUtil.java +++ b/forester/java/src/org/forester/surfacing/SurfacingUtil.java @@ -143,9 +143,9 @@ public final class SurfacingUtil { } } - public static DescriptiveStatistics calculateDescriptiveStatisticsForMeanValues( final Set similarities ) { + public static DescriptiveStatistics calculateDescriptiveStatisticsForMeanValues( final Set similarities ) { final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); - for( final DomainSimilarity similarity : similarities ) { + for( final PrintableDomainSimilarity similarity : similarities ) { stats.addValue( similarity.getMeanSimilarityScore() ); } return stats; @@ -375,11 +375,11 @@ public final class SurfacingUtil { return m; } - public static void decoratePrintableDomainSimilarities( final SortedSet domain_similarities, + public static void decoratePrintableDomainSimilarities( final SortedSet domain_similarities, final Detailedness detailedness ) { - for( final DomainSimilarity domain_similarity : domain_similarities ) { + for( final PrintableDomainSimilarity domain_similarity : domain_similarities ) { if ( domain_similarity instanceof PrintableDomainSimilarity ) { - final PrintableDomainSimilarity printable_domain_similarity = ( PrintableDomainSimilarity ) domain_similarity; + final PrintableDomainSimilarity printable_domain_similarity = domain_similarity; printable_domain_similarity.setDetailedness( detailedness ); } } @@ -2218,11 +2218,11 @@ public final class SurfacingUtil { final Writer simple_tab_writer, final Writer single_writer, Map split_writers, - final SortedSet similarities, + final SortedSet similarities, final boolean treat_as_binary, final List species_order, final PrintableDomainSimilarity.PRINT_OPTION print_option, - final DomainSimilarity.DomainSimilarityScoring scoring, + final PrintableDomainSimilarity.DomainSimilarityScoring scoring, final boolean verbose, final Map tax_code_to_id_map, final Phylogeny phy, @@ -2262,9 +2262,9 @@ public final class SurfacingUtil { break; } // - for( final DomainSimilarity similarity : similarities ) { + for( final PrintableDomainSimilarity similarity : similarities ) { if ( ( species_order != null ) && !species_order.isEmpty() ) { - ( ( PrintableDomainSimilarity ) similarity ).setSpeciesOrder( species_order ); + ( similarity ).setSpeciesOrder( species_order ); } if ( single_writer != null ) { if ( !ForesterUtil.isEmpty( pos_filter_doms ) && pos_filter_doms.contains( similarity.getDomainId() ) ) { @@ -2347,9 +2347,9 @@ public final class SurfacingUtil { w.write( SurfacingConstants.NL ); } // - for( final DomainSimilarity similarity : similarities ) { + for( final PrintableDomainSimilarity similarity : similarities ) { if ( ( species_order != null ) && !species_order.isEmpty() ) { - ( ( PrintableDomainSimilarity ) similarity ).setSpeciesOrder( species_order ); + ( similarity ).setSpeciesOrder( species_order ); } if ( simple_tab_writer != null ) { simple_tab_writer.write( similarity.toStringBuffer( PRINT_OPTION.SIMPLE_TAB_DELIMITED, diff --git a/forester/java/src/org/forester/surfacing/TestSurfacing.java b/forester/java/src/org/forester/surfacing/TestSurfacing.java index 168b6a4..dabbb06 100644 --- a/forester/java/src/org/forester/surfacing/TestSurfacing.java +++ b/forester/java/src/org/forester/surfacing/TestSurfacing.java @@ -313,17 +313,17 @@ public class TestSurfacing { cdc_list.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_nemve, true, new BasicSpecies( "nemve" ) ) ); - final DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID, + final DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( PrintableDomainSimilarity.DomainSimilaritySortField.DOMAIN_ID, false, false, true ); - final SortedSet sims = calc + final SortedSet sims = calc .calculateSimilarities( new CombinationsBasedPairwiseDomainSimilarityCalculator(), cdc_list, true, true ); - final Iterator sims_it = sims.iterator(); - final DomainSimilarity sa = sims_it.next(); + final Iterator sims_it = sims.iterator(); + final PrintableDomainSimilarity sa = sims_it.next(); if ( !sa.getDomainId().equals( "A" ) ) { return false; } @@ -364,7 +364,7 @@ public class TestSurfacing { if ( sa.getMaximalDifferenceInCounts() != 3 ) { return false; } - final DomainSimilarity sb = sims_it.next(); + final PrintableDomainSimilarity sb = sims_it.next(); if ( !sb.getDomainId().equals( "B" ) ) { return false; } @@ -398,7 +398,7 @@ public class TestSurfacing { if ( sb.getMaximalDifferenceInCounts() != 2 ) { return false; } - final DomainSimilarity sc = sims_it.next(); + final PrintableDomainSimilarity sc = sims_it.next(); if ( !sc.getDomainId().equals( "C" ) ) { return false; } @@ -464,17 +464,17 @@ public class TestSurfacing { cdc_list2.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_nemve, false, new BasicSpecies( "nemve" ) ) ); - final DomainSimilarityCalculator calc2 = new BasicDomainSimilarityCalculator( DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID, + final DomainSimilarityCalculator calc2 = new BasicDomainSimilarityCalculator( PrintableDomainSimilarity.DomainSimilaritySortField.DOMAIN_ID, false, false, true ); - final SortedSet sims2 = calc2 + final SortedSet sims2 = calc2 .calculateSimilarities( new CombinationsBasedPairwiseDomainSimilarityCalculator(), cdc_list2, false, true ); - final Iterator sims_it2 = sims2.iterator(); - final DomainSimilarity sa2 = sims_it2.next(); + final Iterator sims_it2 = sims2.iterator(); + final PrintableDomainSimilarity sa2 = sims_it2.next(); if ( !sa2.getDomainId().equals( "A" ) ) { return false; } @@ -552,17 +552,17 @@ public class TestSurfacing { cdc_list3.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_nemve, true, new BasicSpecies( "nemve" ) ) ); - final DomainSimilarityCalculator calc3 = new BasicDomainSimilarityCalculator( DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID, + final DomainSimilarityCalculator calc3 = new BasicDomainSimilarityCalculator( PrintableDomainSimilarity.DomainSimilaritySortField.DOMAIN_ID, false, false, true ); - final SortedSet sims3 = calc3 + final SortedSet sims3 = calc3 .calculateSimilarities( new CombinationsBasedPairwiseDomainSimilarityCalculator(), cdc_list3, false, true ); - final Iterator sims_it3 = sims3.iterator(); - final DomainSimilarity sa3 = sims_it3.next(); + final Iterator sims_it3 = sims3.iterator(); + final PrintableDomainSimilarity sa3 = sims_it3.next(); if ( !sa3.getDomainId().equals( "A" ) ) { return false; } @@ -596,17 +596,17 @@ public class TestSurfacing { cdc_list4.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_nemve, false, new BasicSpecies( "nemve" ) ) ); - final DomainSimilarityCalculator calc4 = new BasicDomainSimilarityCalculator( DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID, + final DomainSimilarityCalculator calc4 = new BasicDomainSimilarityCalculator( PrintableDomainSimilarity.DomainSimilaritySortField.DOMAIN_ID, true, false, true ); - final SortedSet sims4 = calc4 + final SortedSet sims4 = calc4 .calculateSimilarities( new CombinationsBasedPairwiseDomainSimilarityCalculator(), cdc_list4, false, true ); - final Iterator sims_it4 = sims4.iterator(); - final DomainSimilarity sa4 = sims_it4.next(); + final Iterator sims_it4 = sims4.iterator(); + final PrintableDomainSimilarity sa4 = sims_it4.next(); if ( !sa4.getDomainId().equals( "A" ) ) { return false; } @@ -629,10 +629,10 @@ public class TestSurfacing { if ( ssdsd4.getNumberOfProteinsExhibitingCombinationWith( "X" ) != 3 ) { return false; } - final SortedSet sims4_d = calc4 + final SortedSet sims4_d = calc4 .calculateSimilarities( new DomainCountsBasedPairwiseSimilarityCalculator(), cdc_list4, false, true ); - final Iterator sims_it4_d = sims4_d.iterator(); - final DomainSimilarity sa4_d = sims_it4_d.next(); + final Iterator sims_it4_d = sims4_d.iterator(); + final PrintableDomainSimilarity sa4_d = sims_it4_d.next(); if ( !sa4_d.getDomainId().equals( "A" ) ) { return false; } @@ -653,13 +653,13 @@ public class TestSurfacing { if ( sa4_d.getN() != 6 ) { return false; } - final SortedSet sims4_p = calc4 + final SortedSet sims4_p = calc4 .calculateSimilarities( new ProteinCountsBasedPairwiseDomainSimilarityCalculator(), cdc_list4, false, true ); - final Iterator sims_it4_p = sims4_p.iterator(); - final DomainSimilarity sa4_p = sims_it4_p.next(); + final Iterator sims_it4_p = sims4_p.iterator(); + final PrintableDomainSimilarity sa4_p = sims_it4_p.next(); if ( !sa4_p.getDomainId().equals( "A" ) ) { return false; } @@ -708,10 +708,10 @@ public class TestSurfacing { cdc_list5.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_nemve, true, new BasicSpecies( "nemve" ) ) ); - final SortedSet sims5_d = calc4 + final SortedSet sims5_d = calc4 .calculateSimilarities( new DomainCountsBasedPairwiseSimilarityCalculator(), cdc_list5, false, true ); - final Iterator sims_it5_d = sims5_d.iterator(); - final DomainSimilarity sa5_d = sims_it5_d.next(); + final Iterator sims_it5_d = sims5_d.iterator(); + final PrintableDomainSimilarity sa5_d = sims_it5_d.next(); if ( sa5_d.getSpecies().size() != 4 ) { return false; } @@ -779,13 +779,13 @@ public class TestSurfacing { if ( sa5_d.getMaximalDifferenceInCounts() != 11 ) { return false; } - final SortedSet sims5_p = calc4 + final SortedSet sims5_p = calc4 .calculateSimilarities( new ProteinCountsBasedPairwiseDomainSimilarityCalculator(), cdc_list5, false, true ); - final Iterator sims_it5_p = sims5_p.iterator(); - final DomainSimilarity sa5_p = sims_it5_p.next(); + final Iterator sims_it5_p = sims5_p.iterator(); + final PrintableDomainSimilarity sa5_p = sims_it5_p.next(); if ( !sa5_p.getDomainId().equals( "A" ) ) { return false; } @@ -843,10 +843,10 @@ public class TestSurfacing { cdc_list6.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_nemve, false, new BasicSpecies( "nemve" ) ) ); - final SortedSet sims6_d = calc4 + final SortedSet sims6_d = calc4 .calculateSimilarities( new DomainCountsBasedPairwiseSimilarityCalculator(), cdc_list6, false, true ); - final Iterator sims_it6_d = sims6_d.iterator(); - final DomainSimilarity sa6_d = sims_it6_d.next(); + final Iterator sims_it6_d = sims6_d.iterator(); + final PrintableDomainSimilarity sa6_d = sims_it6_d.next(); if ( sa6_d.getSpecies().size() != 4 ) { return false; } @@ -914,13 +914,13 @@ public class TestSurfacing { if ( sa6_d.getMaximalDifferenceInCounts() != 11 ) { return false; } - final SortedSet sims6_p = calc4 + final SortedSet sims6_p = calc4 .calculateSimilarities( new ProteinCountsBasedPairwiseDomainSimilarityCalculator(), cdc_list6, false, true ); - final Iterator sims_it6_p = sims6_p.iterator(); - final DomainSimilarity sa6_p = sims_it6_p.next(); + final Iterator sims_it6_p = sims6_p.iterator(); + final PrintableDomainSimilarity sa6_p = sims_it6_p.next(); if ( !sa6_p.getDomainId().equals( "A" ) ) { return false; } @@ -1028,17 +1028,17 @@ public class TestSurfacing { cdc_list.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_nemve, true, new BasicSpecies( "nemve" ) ) ); - final DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID, + final DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( PrintableDomainSimilarity.DomainSimilaritySortField.DOMAIN_ID, false, false, true ); - final SortedSet sims = calc + final SortedSet sims = calc .calculateSimilarities( new CombinationsBasedPairwiseDomainSimilarityCalculator(), cdc_list, true, false ); - final Iterator sims_it = sims.iterator(); - final DomainSimilarity sa = sims_it.next(); + final Iterator sims_it = sims.iterator(); + final PrintableDomainSimilarity sa = sims_it.next(); if ( !sa.getDomainId().equals( "A" ) ) { return false; } @@ -1069,7 +1069,7 @@ public class TestSurfacing { if ( sa.getMaximalDifferenceInCounts() != 0 ) { return false; } - final DomainSimilarity sb = sims_it.next(); + final PrintableDomainSimilarity sb = sims_it.next(); if ( !sb.getDomainId().equals( "B" ) ) { return false; } @@ -1079,13 +1079,13 @@ public class TestSurfacing { if ( !sb.getSpecies().contains( new BasicSpecies( "rabbit" ) ) ) { return false; } - final SortedSet sims2 = calc + final SortedSet sims2 = calc .calculateSimilarities( new CombinationsBasedPairwiseDomainSimilarityCalculator(), cdc_list, true, true ); - final Iterator sims_it2 = sims2.iterator(); - final DomainSimilarity sa2 = sims_it2.next(); + final Iterator sims_it2 = sims2.iterator(); + final PrintableDomainSimilarity sa2 = sims_it2.next(); if ( !sa2.getDomainId().equals( "D" ) ) { return false; } @@ -1137,11 +1137,11 @@ public class TestSurfacing { cdc_list.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_nemve, true, new BasicSpecies( "nemve" ) ) ); - final DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID, + final DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( PrintableDomainSimilarity.DomainSimilaritySortField.DOMAIN_ID, false, false, true ); - final SortedSet sims = calc + final SortedSet sims = calc .calculateSimilarities( new CombinationsBasedPairwiseDomainSimilarityCalculator(), cdc_list, false, @@ -1149,8 +1149,8 @@ public class TestSurfacing { if ( sims.size() != 1 ) { return false; } - final Iterator sims_it = sims.iterator(); - final DomainSimilarity sa = sims_it.next(); + final Iterator sims_it = sims.iterator(); + final PrintableDomainSimilarity sa = sims_it.next(); if ( !sa.getDomainId().equals( "A" ) ) { return false; } @@ -1169,7 +1169,7 @@ public class TestSurfacing { if ( !sa.getSpecies().contains( new BasicSpecies( "rabbit" ) ) ) { return false; } - final SortedSet sims_ns = calc + final SortedSet sims_ns = calc .calculateSimilarities( new CombinationsBasedPairwiseDomainSimilarityCalculator(), cdc_list, true, @@ -1211,7 +1211,7 @@ public class TestSurfacing { cdc_list2.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_nemve2, true, new BasicSpecies( "nemve" ) ) ); - final SortedSet sims2 = calc + final SortedSet sims2 = calc .calculateSimilarities( new CombinationsBasedPairwiseDomainSimilarityCalculator(), cdc_list2, true, diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index 7391763..92cd3d8 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -127,7 +127,7 @@ import org.forester.ws.wabi.TxSearch.TAX_RANK; @SuppressWarnings( "unused") public final class Test { - private final static boolean PERFORM_DB_TESTS = true; + private final static boolean PERFORM_DB_TESTS = false; private final static double ZERO_DIFF = 1.0E-9; private final static String PATH_TO_TEST_DATA = System.getProperty( "user.dir" ) + ForesterUtil.getFileSeparator() + "test_data" @@ -501,7 +501,7 @@ public final class Test { failed++; } } - System.exit( 0 ); + /////////////////////System.exit( 0 ); System.out.print( "UniProtKB id extraction: " ); if ( Test.testExtractUniProtKbProteinSeqIdentifier() ) { System.out.println( "OK." );