import org.forester.surfacing.GenomeWideCombinableDomains;
import org.forester.surfacing.GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder;
import org.forester.surfacing.MappingResults;
+import org.forester.surfacing.MinimalDomainomeCalculator;
import org.forester.surfacing.PairwiseDomainSimilarityCalculator;
import org.forester.surfacing.PairwiseGenomeComparator;
import org.forester.surfacing.ProteinCountsBasedPairwiseDomainSimilarityCalculator;
// Numeric threshold on a count of similarities; where it is applied is not
// visible in this excerpt (presumably when deciding to split output -- TODO confirm).
private static final int MINIMAL_NUMBER_OF_SIMILARITIES_FOR_SPLITTING = 1000;
// Command-line option name; main() registers it as an allowed option and, when
// it is set, turns on the binary domain combinations output for graph analysis.
// NOTE(review): "COMBINITONS" is misspelled, but these identifiers are public,
// so renaming them could break code outside this file.
public final static String DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS = "graph_analysis_out";
// Command-line option name; main() registers it as an allowed option and, when
// it is set, turns on the domain combinations counts output.
+ public final static String DOMAIN_COMBINITONS_COUNTS_OUTPUT_OPTION = "dcc";
// Output file name suffixes for the graph-analysis (".dot") and counts (".dcc") files.
public final static String DOMAIN_COMBINITONS_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS = "_dc.dot";
public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_BC_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS = "_fitch_present_dc.dot";
public final static String DOMAIN_COMBINITON_COUNTS_OUTPUTFILE_SUFFIX = ".dcc";
// ---
// Program name; main() passes it as the first argument of every
// ForesterUtil.fatalError / unexpectedFatalError call.
public final static String PRG_NAME = "surfacing";
// Suffixes for parsimony tree output files. Each one is a short tag
// concatenated with ForesterConstants.PHYLO_XML_SUFFIX.
public static final String DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO = "_d_dollo"
- + ForesterConstants.PHYLO_XML_SUFFIX;
+ + ForesterConstants.PHYLO_XML_SUFFIX;
public static final String DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH = "_d_fitch"
- + ForesterConstants.PHYLO_XML_SUFFIX;
+ + ForesterConstants.PHYLO_XML_SUFFIX;
public static final String BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO = "_dc_dollo"
- + ForesterConstants.PHYLO_XML_SUFFIX;
+ + ForesterConstants.PHYLO_XML_SUFFIX;
public static final String BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH = "_dc_fitch"
- + ForesterConstants.PHYLO_XML_SUFFIX;
+ + ForesterConstants.PHYLO_XML_SUFFIX;
// Suffixes for Nexus (".nex") output files.
public static final String NEXUS_EXTERNAL_DOMAINS = "_dom.nex";
public static final String NEXUS_EXTERNAL_DOMAIN_COMBINATIONS = "_dc.nex";
public static final String NEXUS_SECONDARY_FEATURES = "_secondary_features.nex";
// Suffixes for secondary-features parsimony output (Dollo).
public static final String PARSIMONY_OUTPUT_DOLLO_LOSSES_SECONDARY_FEATURES = "_dollo_losses_secondary_features";
public static final String PARSIMONY_OUTPUT_DOLLO_PRESENT_SECONDARY_FEATURES = "_dollo_present_secondary_features";
public static final String SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO = "_secondary_features_dollo"
- + ForesterConstants.PHYLO_XML_SUFFIX;
+ + ForesterConstants.PHYLO_XML_SUFFIX;
// Suffixes for GO-id based parsimony output (presumably "d" = domains and
// "dc" = domain combinations -- TODO confirm against the writers of these files).
public static final String PARSIMONY_OUTPUT_DOLLO_ALL_GOID_D_ALL_NAMESPACES = "_dollo_goid_d";
public static final String PARSIMONY_OUTPUT_FITCH_ALL_GOID_BC_ALL_NAMESPACES = "_fitch_goid_dc";
// Command-line option names and option values. The strings are what the user
// types, so changing any literal changes the program's command-line interface.
final static private String HELP_OPTION_1 = "help";
// One of the accepted values for the domain-count sort option (it is listed in
// that option's usage message in main()).
final static private String DOMAIN_COUNT_SORT_COMBINATIONS_COUNT = "comb";
// Option naming the file with individual domain score cutoffs; main() checks
// that the file is readable and parses it as a table.
final static private String CUTOFF_SCORE_FILE_OPTION = "cos";
final static private String NOT_IGNORE_DUFS_OPTION = "dufs";
// The single E-value option ("e") is replaced by three separate cutoffs, each
// read in main() with getOptionValueAsDouble. The two E-value maxima both start
// from MAX_E_VALUE_DEFAULT; the ratio cutoff starts at -1.
// NOTE(review): "fs" / "i" presumably mean full-sequence / independent E-value,
// and "rel env length" presumably relative envelope length -- TODO confirm.
- final static private String MAX_E_VALUE_OPTION = "e";
+ final static private String MAX_FS_E_VALUE_OPTION = "fs_e";
+ final static private String MAX_I_E_VALUE_OPTION = "ie";
+ final static private String MIN_REL_ENV_LENGTH_RATIO_OPTION = "mrel";
final static private String MAX_ALLOWED_OVERLAP_OPTION = "mo";
final static private String NO_ENGULFING_OVERLAP_OPTION = "no_eo";
// When set, main() sets ignore_combination_with_same to true.
final static private String IGNORE_COMBINATION_WITH_SAME_OPTION = "ignore_self_comb";
// When set, main() sets perform_dc_regain_proteins_stats to true.
final static private String PERFORM_DC_REGAIN_PROTEINS_STATS_OPTION = "dc_regain_stats";
// The option string changes from "DA_analyis" to all lower case.
// NOTE(review): "analyis" is misspelled ("analysis") in both versions; since
// this is the literal users must type, fixing it would be a CLI change.
- final static private String DA_ANALYSIS_OPTION = "DA_analyis";
+ final static private String DA_ANALYSIS_OPTION = "da_analyis";
final static private String USE_LAST_IN_FITCH_OPTION = "last";
// Pairwise domain comparisons: file name prefix, option name, and output suffixes.
public final static String PAIRWISE_DOMAIN_COMPARISONS_PREFIX = "pwc_";
final static private String PAIRWISE_DOMAIN_COMPARISONS_OPTION = "pwc";
final static private String MATRIX_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX = "_domains.pwd";
final static private String MATRIX_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX = "_bin_combinations.pwd";
// NJ tree output suffixes; each is a tag concatenated with
// ForesterConstants.PHYLO_XML_SUFFIX.
final static private String NJ_TREE_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX = "_mean_score_NJ"
- + ForesterConstants.PHYLO_XML_SUFFIX;
+ + ForesterConstants.PHYLO_XML_SUFFIX;
final static private String NJ_TREE_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX = "_domains_NJ"
- + ForesterConstants.PHYLO_XML_SUFFIX;
+ + ForesterConstants.PHYLO_XML_SUFFIX;
final static private String NJ_TREE_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX = "_bin_combinations_NJ"
- + ForesterConstants.PHYLO_XML_SUFFIX;
+ + ForesterConstants.PHYLO_XML_SUFFIX;
// Filter options. main() aborts if the positive and negative protein filters
// are both set, or if the negative domains filter is combined with either.
final static private String FILTER_POSITIVE_OPTION = "pos_filter";
final static private String FILTER_NEGATIVE_OPTION = "neg_filter";
final static private String FILTER_NEGATIVE_DOMAINS_OPTION = "neg_dom_filter";
// Required inputs: main() aborts when the genomes file option is not set or
// when the species tree option has no value.
final static private String INPUT_GENOMES_FILE_OPTION = "genomes";
final static private String INPUT_SPECIES_TREE_OPTION = "species_tree";
final static private String SEQ_EXTRACT_OPTION = "prot_extract";
// Program metadata: version, date, contact e-mail and web page. The version,
// date and e-mail are updated here; how they are displayed is not visible in
// this excerpt. PRG_DATE looks like a yymmdd stamp -- TODO confirm.
- final static private String PRG_VERSION = "2.400";
- final static private String PRG_DATE = "131106";
- final static private String E_MAIL = "czmasek@burnham.org";
+ final static private String PRG_VERSION = "2.503";
+ final static private String PRG_DATE = "170518";
+ final static private String E_MAIL = "phyloxml@gmail.com";
final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester/surfacing";
// Defaults that main() uses to initialise ignore_dufs and
// ignore_combination_with_same before the command line is evaluated.
// NOTE(review): "DEFAULLT" is misspelled in the second identifier.
final static private boolean IGNORE_DUFS_DEFAULT = true;
final static private boolean IGNORE_COMBINATION_WITH_SAME_DEFAULLT = false;
// Output file name suffixes for gain/loss counts and domain length analysis.
private static final String OUTPUT_DOMAIN_COMBINATIONS_GAINED_MORE_THAN_ONCE_ANALYSIS_SUFFIX = "_fitch_dc_gains_counts";
private static final String OUTPUT_DOMAIN_COMBINATIONS_LOST_MORE_THAN_ONCE_ANALYSIS_SUFFIX = "_fitch_dc_losses_counts";
private static final String DOMAIN_LENGTHS_ANALYSIS_SUFFIX = "_domain_lengths_analysis";
// The compile-time switch (always true) is replaced by a command-line option:
// main() starts with domain_length_analysis = false and sets it to true only
// when "dla" is given.
- private static final boolean PERFORM_DOMAIN_LENGTH_ANALYSIS = true;
+ private static final String PERFORM_DOMAIN_LENGTH_ANALYSIS_OPTION = "dla";
// Output file name suffixes for the lists/summary of encountered Pfams.
public static final String ALL_PFAMS_ENCOUNTERED_SUFFIX = "_all_encountered_pfams";
public static final String ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX = "_all_encountered_pfams_with_go_annotation";
public static final String ENCOUNTERED_PFAMS_SUMMARY_SUFFIX = "_encountered_pfams_summary";
// Suffix and tab-separated header line for the domain combination data file.
// NOTE(review): the header contains "PRTEIN_ID" (sic); it is emitted text, so
// correcting it would change the file format seen by downstream readers.
private static final String DATA_FILE_SUFFIX = "_domain_combination_data.txt";
private static final String DATA_FILE_DESC = "#SPECIES\tPRTEIN_ID\tN_TERM_DOMAIN\tC_TERM_DOMAIN\tN_TERM_DOMAIN_PER_DOMAIN_E_VALUE\tC_TERM_DOMAIN_PER_DOMAIN_E_VALUE\tN_TERM_DOMAIN_COUNTS_PER_PROTEIN\tC_TERM_DOMAIN_COUNTS_PER_PROTEIN";
// Command-line option: when set, main() sets write_to_nexus to true.
private static final String WRITE_TO_NEXUS_OPTION = "nexus";
// Command-line option: when set, main() sets perform_dc_fich to true
// (presumably enabling Fitch parsimony on domain combinations -- TODO confirm).
+ private static final String PERFORM_DC_FITCH = "dc_pars";
private static final INDIVIDUAL_SCORE_CUTOFF INDIVIDUAL_SCORE_CUTOFF_DEFAULT = INDIVIDUAL_SCORE_CUTOFF.FULL_SEQUENCE; //TODO look at me! change?
// Output file name suffixes for independent domain-combination gains (Fitch parsimony).
public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_counts.txt";
public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists.txt";
public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_unique.txt";
public static final String LIMIT_SPEC_FOR_PROT_EX = null; // e.g. "HUMAN"; set to null for not using this feature (default).
// "_MAPPED" variants of the suffixes above (used for the mapped secondary
// features output, judging by the names -- TODO confirm).
public static final String BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH_MAPPED = "_dc_MAPPED_secondary_features_fitch"
- + ForesterConstants.PHYLO_XML_SUFFIX;
+ + ForesterConstants.PHYLO_XML_SUFFIX;
public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_counts_MAPPED.txt";
public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_MAPPED.txt";
public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_MAPPED.txt";
public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_unique_MAPPED.txt";
// Compile-time switch, currently off; where it is consulted is not visible here.
private static final boolean CALC_SIMILARITY_SCORES = false;
// Separator string for "DA" output (presumably domain architectures -- TODO confirm).
+ private static final String SEPARATOR_FOR_DA = "--";
+ @SuppressWarnings( "unchecked")
public static void main( final String args[] ) {
final long start_time = new Date().getTime();
// final StringBuffer log = new StringBuffer();
}
final List<String> allowed_options = new ArrayList<String>();
allowed_options.add( surfacing.NOT_IGNORE_DUFS_OPTION );
- allowed_options.add( surfacing.MAX_E_VALUE_OPTION );
+ allowed_options.add( surfacing.MAX_FS_E_VALUE_OPTION );
+ allowed_options.add( surfacing.MAX_I_E_VALUE_OPTION );
+ allowed_options.add( surfacing.MIN_REL_ENV_LENGTH_RATIO_OPTION );
allowed_options.add( surfacing.DETAILEDNESS_OPTION );
allowed_options.add( surfacing.OUTPUT_FILE_OPTION );
allowed_options.add( surfacing.DOMAIN_SIMILARITY_SORT_OPTION );
allowed_options.add( SECONDARY_FEATURES_PARSIMONY_MAP_FILE );
allowed_options.add( PLUS_MINUS_ANALYSIS_OPTION );
allowed_options.add( DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS );
+ allowed_options.add( DOMAIN_COMBINITONS_COUNTS_OUTPUT_OPTION );
allowed_options.add( OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS );
allowed_options.add( CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS_AND_ADJACENCY );
allowed_options.add( WRITE_TO_NEXUS_OPTION );
allowed_options.add( PERFORM_DC_REGAIN_PROTEINS_STATS_OPTION );
allowed_options.add( DA_ANALYSIS_OPTION );
allowed_options.add( USE_LAST_IN_FITCH_OPTION );
+ allowed_options.add( PERFORM_DC_FITCH );
+ allowed_options.add( PERFORM_DOMAIN_LENGTH_ANALYSIS_OPTION );
boolean ignore_dufs = surfacing.IGNORE_DUFS_DEFAULT;
boolean ignore_combination_with_same = surfacing.IGNORE_COMBINATION_WITH_SAME_DEFAULLT;
- double e_value_max = surfacing.MAX_E_VALUE_DEFAULT;
+ double fs_e_value_max = surfacing.MAX_E_VALUE_DEFAULT;
+ double ie_value_max = surfacing.MAX_E_VALUE_DEFAULT;
+ double rel_env_length_ratio_cutoff = -1;
int max_allowed_overlap = surfacing.MAX_ALLOWED_OVERLAP_DEFAULT;
final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
if ( dissallowed_options.length() > 0 ) {
if ( cla.isOptionSet( WRITE_TO_NEXUS_OPTION ) ) {
write_to_nexus = true;
}
+ boolean perform_dc_fich = false;
+ if ( cla.isOptionSet( PERFORM_DC_FITCH ) ) {
+ perform_dc_fich = true;
+ }
boolean perform_dc_regain_proteins_stats = false;
if ( cla.isOptionSet( PERFORM_DC_REGAIN_PROTEINS_STATS_OPTION ) ) {
perform_dc_regain_proteins_stats = true;
if ( cla.isOptionSet( DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS ) ) {
output_binary_domain_combinationsfor_graph_analysis = true;
}
- if ( cla.isOptionSet( surfacing.MAX_E_VALUE_OPTION ) ) {
+ boolean output_binary_domain_combinationsfor_counts = false;
+ if ( cla.isOptionSet( DOMAIN_COMBINITONS_COUNTS_OUTPUT_OPTION ) ) {
+ output_binary_domain_combinationsfor_counts = true;
+ }
+ if ( cla.isOptionSet( surfacing.MAX_FS_E_VALUE_OPTION ) ) {
try {
- e_value_max = cla.getOptionValueAsDouble( surfacing.MAX_E_VALUE_OPTION );
+ fs_e_value_max = cla.getOptionValueAsDouble( surfacing.MAX_FS_E_VALUE_OPTION );
+ }
+ catch ( final Exception e ) {
+ ForesterUtil.fatalError( surfacing.PRG_NAME, "no acceptable value for E-value maximum" );
+ }
+ }
+ if ( cla.isOptionSet( surfacing.MIN_REL_ENV_LENGTH_RATIO_OPTION ) ) {
+ try {
+ rel_env_length_ratio_cutoff = cla.getOptionValueAsDouble( surfacing.MIN_REL_ENV_LENGTH_RATIO_OPTION );
+ }
+ catch ( final Exception e ) {
+ ForesterUtil.fatalError( surfacing.PRG_NAME, "no acceptable value for min rel env length ratio" );
+ }
+ }
+ if ( cla.isOptionSet( surfacing.MAX_I_E_VALUE_OPTION ) ) {
+ try {
+ ie_value_max = cla.getOptionValueAsDouble( surfacing.MAX_I_E_VALUE_OPTION );
}
catch ( final Exception e ) {
ForesterUtil.fatalError( surfacing.PRG_NAME, "no acceptable value for E-value maximum" );
if ( cla.isOptionSet( surfacing.IGNORE_COMBINATION_WITH_SAME_OPTION ) ) {
ignore_combination_with_same = true;
}
+ boolean domain_length_analysis = false;
+ if ( cla.isOptionSet( surfacing.PERFORM_DOMAIN_LENGTH_ANALYSIS_OPTION ) ) {
+ domain_length_analysis = true;
+ }
boolean ignore_domains_without_combs_in_all_spec = IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_DEFAULT;
if ( cla.isOptionSet( surfacing.IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_OPTION ) ) {
ignore_domains_without_combs_in_all_spec = true;
ignore_species_specific_domains = true;
}
if ( !cla.isOptionValueSet( surfacing.INPUT_SPECIES_TREE_OPTION ) ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "no input species tree file given: "
- + surfacing.INPUT_SPECIES_TREE_OPTION + "=<file>" );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "no input species tree file given: " + surfacing.INPUT_SPECIES_TREE_OPTION
+ + "=<file>" );
}
File output_file = null;
if ( cla.isOptionSet( surfacing.OUTPUT_FILE_OPTION ) ) {
Map<String, Double> individual_score_cutoffs = null;
if ( cla.isOptionSet( surfacing.CUTOFF_SCORE_FILE_OPTION ) ) {
if ( !cla.isOptionValueSet( surfacing.CUTOFF_SCORE_FILE_OPTION ) ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for individual domain score cutoffs file: -"
- + surfacing.CUTOFF_SCORE_FILE_OPTION + "=<file>" );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "no value for individual domain score cutoffs file: -"
+ + surfacing.CUTOFF_SCORE_FILE_OPTION + "=<file>" );
}
cutoff_scores_file = new File( cla.getOptionValue( surfacing.CUTOFF_SCORE_FILE_OPTION ) );
final String error = ForesterUtil.isReadableFile( cutoff_scores_file );
if ( !ForesterUtil.isEmpty( error ) ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read individual domain score cutoffs file: "
- + error );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "cannot read individual domain score cutoffs file: " + error );
}
try {
final BasicTable<String> scores_table = BasicTableParser.parse( cutoff_scores_file, ' ' );
File out_dir = null;
if ( cla.isOptionSet( surfacing.OUTPUT_DIR_OPTION ) ) {
if ( !cla.isOptionValueSet( surfacing.OUTPUT_DIR_OPTION ) ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for output directory: -"
- + surfacing.OUTPUT_DIR_OPTION + "=<dir>" );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "no value for output directory: -" + surfacing.OUTPUT_DIR_OPTION + "=<dir>" );
}
out_dir = new File( cla.getOptionValue( surfacing.OUTPUT_DIR_OPTION ) );
if ( out_dir.exists() && ( out_dir.listFiles().length > 0 ) ) {
File positive_filter_file = null;
File negative_filter_file = null;
File negative_domains_filter_file = null;
- if ( cla.isOptionSet( surfacing.FILTER_NEGATIVE_OPTION ) && cla.isOptionSet( surfacing.FILTER_POSITIVE_OPTION ) ) {
+ if ( cla.isOptionSet( surfacing.FILTER_NEGATIVE_OPTION )
+ && cla.isOptionSet( surfacing.FILTER_POSITIVE_OPTION ) ) {
ForesterUtil.fatalError( surfacing.PRG_NAME, "attempt to use both negative and positive protein filter" );
}
if ( cla.isOptionSet( surfacing.FILTER_NEGATIVE_DOMAINS_OPTION )
- && ( cla.isOptionSet( surfacing.FILTER_NEGATIVE_OPTION ) || cla
- .isOptionSet( surfacing.FILTER_POSITIVE_OPTION ) ) ) {
+ && ( cla.isOptionSet( surfacing.FILTER_NEGATIVE_OPTION )
+ || cla.isOptionSet( surfacing.FILTER_POSITIVE_OPTION ) ) ) {
ForesterUtil
.fatalError( surfacing.PRG_NAME,
"attempt to use both negative or positive protein filter together wirh a negative domains filter" );
}
if ( cla.isOptionSet( surfacing.FILTER_NEGATIVE_OPTION ) ) {
if ( !cla.isOptionValueSet( surfacing.FILTER_NEGATIVE_OPTION ) ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for negative filter: -"
- + surfacing.FILTER_NEGATIVE_OPTION + "=<file>" );
+ ForesterUtil
+ .fatalError( surfacing.PRG_NAME,
+ "no value for negative filter: -" + surfacing.FILTER_NEGATIVE_OPTION + "=<file>" );
}
negative_filter_file = new File( cla.getOptionValue( surfacing.FILTER_NEGATIVE_OPTION ) );
final String msg = ForesterUtil.isReadableFile( negative_filter_file );
if ( !ForesterUtil.isEmpty( msg ) ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "can not read from \"" + negative_filter_file + "\": "
- + msg );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "can not read from \"" + negative_filter_file + "\": " + msg );
}
}
else if ( cla.isOptionSet( surfacing.FILTER_POSITIVE_OPTION ) ) {
if ( !cla.isOptionValueSet( surfacing.FILTER_POSITIVE_OPTION ) ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for positive filter: -"
- + surfacing.FILTER_POSITIVE_OPTION + "=<file>" );
+ ForesterUtil
+ .fatalError( surfacing.PRG_NAME,
+ "no value for positive filter: -" + surfacing.FILTER_POSITIVE_OPTION + "=<file>" );
}
positive_filter_file = new File( cla.getOptionValue( surfacing.FILTER_POSITIVE_OPTION ) );
final String msg = ForesterUtil.isReadableFile( positive_filter_file );
if ( !ForesterUtil.isEmpty( msg ) ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "can not read from \"" + positive_filter_file + "\": "
- + msg );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "can not read from \"" + positive_filter_file + "\": " + msg );
}
}
else if ( cla.isOptionSet( surfacing.FILTER_NEGATIVE_DOMAINS_OPTION ) ) {
if ( !cla.isOptionValueSet( surfacing.FILTER_NEGATIVE_DOMAINS_OPTION ) ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for negative domains filter: -"
- + surfacing.FILTER_NEGATIVE_DOMAINS_OPTION + "=<file>" );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "no value for negative domains filter: -"
+ + surfacing.FILTER_NEGATIVE_DOMAINS_OPTION + "=<file>" );
}
negative_domains_filter_file = new File( cla.getOptionValue( surfacing.FILTER_NEGATIVE_DOMAINS_OPTION ) );
final String msg = ForesterUtil.isReadableFile( negative_domains_filter_file );
if ( !ForesterUtil.isEmpty( msg ) ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "can not read from \"" + negative_domains_filter_file
- + "\": " + msg );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "can not read from \"" + negative_domains_filter_file + "\": " + msg );
}
}
final List<String> plus_minus_analysis_high_copy_base_species = new ArrayList<String>();
File input_genomes_file = null;
if ( cla.isOptionSet( surfacing.INPUT_GENOMES_FILE_OPTION ) ) {
if ( !cla.isOptionValueSet( surfacing.INPUT_GENOMES_FILE_OPTION ) ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for input genomes file: -"
- + surfacing.INPUT_GENOMES_FILE_OPTION + "=<file>" );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "no value for input genomes file: -" + surfacing.INPUT_GENOMES_FILE_OPTION
+ + "=<file>" );
}
input_genomes_file = new File( cla.getOptionValue( surfacing.INPUT_GENOMES_FILE_OPTION ) );
final String msg = ForesterUtil.isReadableFile( input_genomes_file );
if ( !ForesterUtil.isEmpty( msg ) ) {
- ForesterUtil
- .fatalError( surfacing.PRG_NAME, "can not read from \"" + input_genomes_file + "\": " + msg );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "can not read from \"" + input_genomes_file + "\": " + msg );
}
}
else {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "no input genomes file given: "
- + surfacing.INPUT_GENOMES_FILE_OPTION + "=<file>" );
+ ForesterUtil
+ .fatalError( surfacing.PRG_NAME,
+ "no input genomes file given: " + surfacing.INPUT_GENOMES_FILE_OPTION + "=<file>" );
}
DomainSimilarity.DomainSimilarityScoring scoring = SCORING_DEFAULT;
if ( cla.isOptionSet( surfacing.SCORING_OPTION ) ) {
if ( !cla.isOptionValueSet( surfacing.SCORING_OPTION ) ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME,
- "no value for scoring method for domain combinations similarity calculation: -"
- + surfacing.SCORING_OPTION + "=<"
- + surfacing.SCORING_DOMAIN_COUNT_BASED + "|"
- + surfacing.SCORING_PROTEIN_COUNT_BASED + "|"
- + surfacing.SCORING_COMBINATION_BASED + ">\"" );
+ ForesterUtil
+ .fatalError( surfacing.PRG_NAME,
+ "no value for scoring method for domain combinations similarity calculation: -"
+ + surfacing.SCORING_OPTION + "=<" + surfacing.SCORING_DOMAIN_COUNT_BASED
+ + "|" + surfacing.SCORING_PROTEIN_COUNT_BASED + "|"
+ + surfacing.SCORING_COMBINATION_BASED + ">\"" );
}
final String scoring_str = cla.getOptionValue( surfacing.SCORING_OPTION );
if ( scoring_str.equals( surfacing.SCORING_DOMAIN_COUNT_BASED ) ) {
scoring = DomainSimilarity.DomainSimilarityScoring.PROTEINS;
}
else {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "unknown value \"" + scoring_str
- + "\" for scoring method for domain combinations similarity calculation: \"-"
- + surfacing.SCORING_OPTION + "=<" + surfacing.SCORING_DOMAIN_COUNT_BASED + "|"
- + surfacing.SCORING_PROTEIN_COUNT_BASED + "|" + surfacing.SCORING_COMBINATION_BASED + ">\"" );
+ ForesterUtil
+ .fatalError( surfacing.PRG_NAME,
+ "unknown value \"" + scoring_str
+ + "\" for scoring method for domain combinations similarity calculation: \"-"
+ + surfacing.SCORING_OPTION + "=<" + surfacing.SCORING_DOMAIN_COUNT_BASED
+ + "|" + surfacing.SCORING_PROTEIN_COUNT_BASED + "|"
+ + surfacing.SCORING_COMBINATION_BASED + ">\"" );
}
}
boolean sort_by_species_count_first = false;
Detailedness detailedness = DETAILEDNESS_DEFAULT;
if ( cla.isOptionSet( surfacing.DETAILEDNESS_OPTION ) ) {
if ( !cla.isOptionValueSet( surfacing.DETAILEDNESS_OPTION ) ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for -" + surfacing.DETAILEDNESS_OPTION + "=<"
- + surfacing.DETAILEDNESS_BASIC + "|" + surfacing.DETAILEDNESS_LIST_IDS + "|"
- + surfacing.DETAILEDNESS_PUNCTILIOUS + ">\"" );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "no value for -" + surfacing.DETAILEDNESS_OPTION + "=<"
+ + surfacing.DETAILEDNESS_BASIC + "|" + surfacing.DETAILEDNESS_LIST_IDS
+ + "|" + surfacing.DETAILEDNESS_PUNCTILIOUS + ">\"" );
}
final String detness = cla.getOptionValue( surfacing.DETAILEDNESS_OPTION ).toLowerCase();
if ( detness.equals( surfacing.DETAILEDNESS_BASIC ) ) {
detailedness = DomainSimilarityCalculator.Detailedness.PUNCTILIOUS;
}
else {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "unknown value \"" + detness + "\" for detailedness: \"-"
- + surfacing.DETAILEDNESS_OPTION + "=<" + surfacing.DETAILEDNESS_BASIC + "|"
- + surfacing.DETAILEDNESS_LIST_IDS + "|" + surfacing.DETAILEDNESS_PUNCTILIOUS + ">\"" );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "unknown value \"" + detness + "\" for detailedness: \"-"
+ + surfacing.DETAILEDNESS_OPTION + "=<" + surfacing.DETAILEDNESS_BASIC
+ + "|" + surfacing.DETAILEDNESS_LIST_IDS + "|"
+ + surfacing.DETAILEDNESS_PUNCTILIOUS + ">\"" );
}
}
String automated_pairwise_comparison_suffix = null;
DomainSimilarity.DomainSimilaritySortField domain_similarity_sort_field_for_automated_pwc = DOMAIN_SORT_FILD_DEFAULT;
if ( cla.isOptionSet( surfacing.DOMAIN_SIMILARITY_SORT_OPTION ) ) {
if ( !cla.isOptionValueSet( surfacing.DOMAIN_SIMILARITY_SORT_OPTION ) ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for domain combinations similarities sorting: -"
- + surfacing.DOMAIN_SIMILARITY_SORT_OPTION + "=<" + surfacing.DOMAIN_SIMILARITY_SORT_ALPHA + "|"
- + surfacing.DOMAIN_SIMILARITY_SORT_MAX + "|" + surfacing.DOMAIN_SIMILARITY_SORT_MIN + "|"
- + surfacing.DOMAIN_SIMILARITY_SORT_MEAN + "|" + surfacing.DOMAIN_SIMILARITY_SORT_DIFF + "|"
- + surfacing.DOMAIN_SIMILARITY_SORT_ABS_COUNTS_DIFF + "|"
- + surfacing.DOMAIN_SIMILARITY_SORT_COUNTS_DIFF + "|"
- + surfacing.DOMAIN_SIMILARITY_SORT_SPECIES_COUNT + "|" + surfacing.DOMAIN_SIMILARITY_SORT_SD
- + ">\"" );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "no value for domain combinations similarities sorting: -"
+ + surfacing.DOMAIN_SIMILARITY_SORT_OPTION + "=<"
+ + surfacing.DOMAIN_SIMILARITY_SORT_ALPHA + "|"
+ + surfacing.DOMAIN_SIMILARITY_SORT_MAX + "|"
+ + surfacing.DOMAIN_SIMILARITY_SORT_MIN + "|"
+ + surfacing.DOMAIN_SIMILARITY_SORT_MEAN + "|"
+ + surfacing.DOMAIN_SIMILARITY_SORT_DIFF + "|"
+ + surfacing.DOMAIN_SIMILARITY_SORT_ABS_COUNTS_DIFF + "|"
+ + surfacing.DOMAIN_SIMILARITY_SORT_COUNTS_DIFF + "|"
+ + surfacing.DOMAIN_SIMILARITY_SORT_SPECIES_COUNT + "|"
+ + surfacing.DOMAIN_SIMILARITY_SORT_SD + ">\"" );
}
final String sort_str = cla.getOptionValue( surfacing.DOMAIN_SIMILARITY_SORT_OPTION ).toLowerCase();
if ( sort_str.equals( surfacing.DOMAIN_SIMILARITY_SORT_ALPHA ) ) {
domain_similarity_sort_field_for_automated_pwc = DomainSimilarity.DomainSimilaritySortField.MAX_COUNTS_DIFFERENCE;
}
else {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "unknown value \"" + sort_str
- + "\" for domain combinations similarities sorting: \"-"
- + surfacing.DOMAIN_SIMILARITY_SORT_OPTION + "=<" + surfacing.DOMAIN_SIMILARITY_SORT_ALPHA + "|"
- + surfacing.DOMAIN_SIMILARITY_SORT_MAX + "|" + surfacing.DOMAIN_SIMILARITY_SORT_MIN + "|"
- + surfacing.DOMAIN_SIMILARITY_SORT_MEAN + "|" + surfacing.DOMAIN_SIMILARITY_SORT_DIFF + "|"
- + surfacing.DOMAIN_SIMILARITY_SORT_ABS_COUNTS_DIFF + "|"
- + surfacing.DOMAIN_SIMILARITY_SORT_COUNTS_DIFF + "|" + "|"
- + surfacing.DOMAIN_SIMILARITY_SORT_SPECIES_COUNT + "|" + surfacing.DOMAIN_SIMILARITY_SORT_SD
- + ">\"" );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "unknown value \"" + sort_str
+ + "\" for domain combinations similarities sorting: \"-"
+ + surfacing.DOMAIN_SIMILARITY_SORT_OPTION + "=<"
+ + surfacing.DOMAIN_SIMILARITY_SORT_ALPHA + "|"
+ + surfacing.DOMAIN_SIMILARITY_SORT_MAX + "|"
+ + surfacing.DOMAIN_SIMILARITY_SORT_MIN + "|"
+ + surfacing.DOMAIN_SIMILARITY_SORT_MEAN + "|"
+ + surfacing.DOMAIN_SIMILARITY_SORT_DIFF + "|"
+ + surfacing.DOMAIN_SIMILARITY_SORT_ABS_COUNTS_DIFF + "|"
+ + surfacing.DOMAIN_SIMILARITY_SORT_COUNTS_DIFF + "|" + "|"
+ + surfacing.DOMAIN_SIMILARITY_SORT_SPECIES_COUNT + "|"
+ + surfacing.DOMAIN_SIMILARITY_SORT_SD + ">\"" );
}
}
DomainSimilarity.PRINT_OPTION domain_similarity_print_option = DOMAIN_SIMILARITY_PRINT_OPTION_DEFAULT;
if ( cla.isOptionSet( surfacing.DOMAIN_SIMILARITY_PRINT_OPTION ) ) {
if ( !cla.isOptionValueSet( surfacing.DOMAIN_SIMILARITY_PRINT_OPTION ) ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for print option: -"
- + surfacing.DOMAIN_SIMILARITY_PRINT_OPTION_DETAILED_HTML + "|"
- + surfacing.DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_HTML + "|"
- + surfacing.DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_TAB_DELIMITED + ">\"" );
+ ForesterUtil
+ .fatalError( surfacing.PRG_NAME,
+ "no value for print option: -"
+ + surfacing.DOMAIN_SIMILARITY_PRINT_OPTION_DETAILED_HTML + "|"
+ + surfacing.DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_HTML + "|"
+ + surfacing.DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_TAB_DELIMITED + ">\"" );
}
final String sort = cla.getOptionValue( surfacing.DOMAIN_SIMILARITY_PRINT_OPTION ).toLowerCase();
if ( sort.equals( surfacing.DOMAIN_SIMILARITY_PRINT_OPTION_DETAILED_HTML ) ) {
domain_similarity_print_option = DomainSimilarity.PRINT_OPTION.SIMPLE_TAB_DELIMITED;
}
else {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "unknown value \"" + sort + "\" for print option: -"
- + surfacing.DOMAIN_SIMILARITY_PRINT_OPTION_DETAILED_HTML + "|"
- + surfacing.DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_HTML + "|"
- + surfacing.DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_TAB_DELIMITED + ">\"" );
+ ForesterUtil
+ .fatalError( surfacing.PRG_NAME,
+ "unknown value \"" + sort + "\" for print option: -"
+ + surfacing.DOMAIN_SIMILARITY_PRINT_OPTION_DETAILED_HTML + "|"
+ + surfacing.DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_HTML + "|"
+ + surfacing.DOMAIN_SIMILARITY_PRINT_OPTION_SIMPLE_TAB_DELIMITED + ">\"" );
}
}
GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder dc_sort_order = DOMAINS_SORT_ORDER_DEFAULT;
if ( cla.isOptionSet( surfacing.DOMAIN_COUNT_SORT_OPTION ) ) {
if ( !cla.isOptionValueSet( surfacing.DOMAIN_COUNT_SORT_OPTION ) ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for sorting of domain counts: -"
- + surfacing.DOMAIN_COUNT_SORT_OPTION + "=<" + surfacing.DOMAIN_COUNT_SORT_ALPHA + "|"
- + surfacing.DOMAIN_COUNT_SORT_KEY_DOMAIN_COUNT + "|"
- + surfacing.DOMAIN_COUNT_SORT_KEY_DOMAIN_PROTEINS_COUNT + "|"
- + surfacing.DOMAIN_COUNT_SORT_COMBINATIONS_COUNT + ">\"" );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "no value for sorting of domain counts: -" + surfacing.DOMAIN_COUNT_SORT_OPTION
+ + "=<" + surfacing.DOMAIN_COUNT_SORT_ALPHA + "|"
+ + surfacing.DOMAIN_COUNT_SORT_KEY_DOMAIN_COUNT + "|"
+ + surfacing.DOMAIN_COUNT_SORT_KEY_DOMAIN_PROTEINS_COUNT + "|"
+ + surfacing.DOMAIN_COUNT_SORT_COMBINATIONS_COUNT + ">\"" );
}
final String sort = cla.getOptionValue( surfacing.DOMAIN_COUNT_SORT_OPTION ).toLowerCase();
if ( sort.equals( surfacing.DOMAIN_COUNT_SORT_ALPHA ) ) {
dc_sort_order = GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder.COMBINATIONS_COUNT;
}
else {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "unknown value \"" + sort
- + "\" for sorting of domain counts: \"-" + surfacing.DOMAIN_COUNT_SORT_OPTION + "=<"
- + surfacing.DOMAIN_COUNT_SORT_ALPHA + "|" + surfacing.DOMAIN_COUNT_SORT_KEY_DOMAIN_COUNT + "|"
- + surfacing.DOMAIN_COUNT_SORT_KEY_DOMAIN_PROTEINS_COUNT + "|"
- + surfacing.DOMAIN_COUNT_SORT_COMBINATIONS_COUNT + ">\"" );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "unknown value \"" + sort + "\" for sorting of domain counts: \"-"
+ + surfacing.DOMAIN_COUNT_SORT_OPTION + "=<"
+ + surfacing.DOMAIN_COUNT_SORT_ALPHA + "|"
+ + surfacing.DOMAIN_COUNT_SORT_KEY_DOMAIN_COUNT + "|"
+ + surfacing.DOMAIN_COUNT_SORT_KEY_DOMAIN_PROTEINS_COUNT + "|"
+ + surfacing.DOMAIN_COUNT_SORT_COMBINATIONS_COUNT + ">\"" );
}
}
final String[][] input_file_properties = SurfacingUtil.processInputGenomesFile( input_genomes_file );
ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot analyze less than two files" );
}
if ( ( number_of_genomes < 3 ) && perform_pwc ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot use : -"
- + surfacing.PAIRWISE_DOMAIN_COMPARISONS_OPTION
- + "=<suffix> to turn on pairwise analyses with less than three input files" );
+ ForesterUtil
+ .fatalError( surfacing.PRG_NAME,
+ "cannot use : -" + surfacing.PAIRWISE_DOMAIN_COMPARISONS_OPTION
+ + "=<suffix> to turn on pairwise analyses with less than three input files" );
}
SurfacingUtil.checkWriteabilityForPairwiseComparisons( domain_similarity_print_option,
input_file_properties,
}
SurfacingUtil.checkForOutputFileWriteability( dcc_outfile );
}
- File pfam_to_go_file = null;
- Map<String, List<GoId>> domain_id_to_go_ids_map = null;
- int domain_id_to_go_ids_count = 0;
+ File pfam_to_go_file = new File( "pfam2go.txt" );
if ( cla.isOptionSet( surfacing.PFAM_TO_GO_FILE_USE_OPTION ) ) {
if ( !cla.isOptionValueSet( surfacing.PFAM_TO_GO_FILE_USE_OPTION ) ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for Pfam to GO mapping file: -"
- + surfacing.PFAM_TO_GO_FILE_USE_OPTION + "=<file>" );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "no value for Pfam to GO mapping file: -"
+ + surfacing.PFAM_TO_GO_FILE_USE_OPTION + "=<file>" );
}
pfam_to_go_file = new File( cla.getOptionValue( surfacing.PFAM_TO_GO_FILE_USE_OPTION ) );
- final String error = ForesterUtil.isReadableFile( pfam_to_go_file );
- if ( !ForesterUtil.isEmpty( error ) ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read Pfam to GO mapping file: " + error );
- }
- try {
- final PfamToGoParser parser = new PfamToGoParser( pfam_to_go_file );
- final List<PfamToGoMapping> pfam_to_go_mappings = parser.parse();
- domain_id_to_go_ids_map = SurfacingUtil.createDomainIdToGoIdMap( pfam_to_go_mappings );
- if ( parser.getMappingCount() < domain_id_to_go_ids_map.size() ) {
- ForesterUtil.unexpectedFatalError( surfacing.PRG_NAME,
- "parser.getMappingCount() < domain_id_to_go_ids_map.size()" );
- }
- domain_id_to_go_ids_count = parser.getMappingCount();
- }
- catch ( final IOException e ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read from Pfam to GO mapping file: " + e );
+ }
+ final String error1 = ForesterUtil.isReadableFile( pfam_to_go_file );
+ if ( !ForesterUtil.isEmpty( error1 ) ) {
+ ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read Pfam to GO mapping file: " + error1 );
+ }
+ Map<String, List<GoId>> domain_id_to_go_ids_map = null;
+ int domain_id_to_go_ids_count = 0;
+ try {
+ final PfamToGoParser parser = new PfamToGoParser( pfam_to_go_file );
+ final List<PfamToGoMapping> pfam_to_go_mappings = parser.parse();
+ domain_id_to_go_ids_map = SurfacingUtil.createDomainIdToGoIdMap( pfam_to_go_mappings );
+ if ( parser.getMappingCount() < domain_id_to_go_ids_map.size() ) {
+ ForesterUtil.unexpectedFatalError( surfacing.PRG_NAME,
+ "parser.getMappingCount() < domain_id_to_go_ids_map.size()" );
}
+ domain_id_to_go_ids_count = parser.getMappingCount();
}
- File go_obo_file = null;
- List<GoTerm> go_terms = null;
+ catch ( final IOException e ) {
+ ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read from Pfam to GO mapping file: " + e );
+ }
+ File go_obo_file = new File( "go.obo" );
if ( cla.isOptionSet( surfacing.GO_OBO_FILE_USE_OPTION ) ) {
if ( !cla.isOptionValueSet( surfacing.GO_OBO_FILE_USE_OPTION ) ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for GO OBO file: -"
- + surfacing.GO_OBO_FILE_USE_OPTION + "=<file>" );
- }
- if ( ( domain_id_to_go_ids_map == null ) || ( domain_id_to_go_ids_map.size() < 1 ) ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot use GO OBO file (-"
- + surfacing.GO_OBO_FILE_USE_OPTION + "=<file>) without Pfam to GO mapping file ("
- + surfacing.PFAM_TO_GO_FILE_USE_OPTION + "=<file>)" );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "no value for GO OBO file: -" + surfacing.GO_OBO_FILE_USE_OPTION + "=<file>" );
}
go_obo_file = new File( cla.getOptionValue( surfacing.GO_OBO_FILE_USE_OPTION ) );
- final String error = ForesterUtil.isReadableFile( go_obo_file );
- if ( !ForesterUtil.isEmpty( error ) ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read GO OBO file: " + error );
- }
- try {
- final OBOparser parser = new OBOparser( go_obo_file, OBOparser.ReturnType.BASIC_GO_TERM );
- go_terms = parser.parse();
- if ( parser.getGoTermCount() != go_terms.size() ) {
- ForesterUtil
- .unexpectedFatalError( surfacing.PRG_NAME, "parser.getGoTermCount() != go_terms.size()" );
- }
- }
- catch ( final IOException e ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read from GO OBO file: " + e );
+ }
+ final String error2 = ForesterUtil.isReadableFile( go_obo_file );
+ if ( !ForesterUtil.isEmpty( error2 ) ) {
+ ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read GO OBO file: " + error2 );
+ }
+ List<GoTerm> go_terms = null;
+ try {
+ final OBOparser parser = new OBOparser( go_obo_file, OBOparser.ReturnType.BASIC_GO_TERM );
+ go_terms = parser.parse();
+ if ( parser.getGoTermCount() != go_terms.size() ) {
+ ForesterUtil.unexpectedFatalError( surfacing.PRG_NAME, "parser.getGoTermCount() != go_terms.size()" );
}
}
+ catch ( final IOException e ) {
+ ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read from GO OBO file: " + e );
+ }
Map<GoId, GoTerm> go_id_to_term_map = null;
if ( ( ( domain_id_to_go_ids_map != null ) && ( domain_id_to_go_ids_map.size() > 0 ) )
&& ( ( go_terms != null ) && ( go_terms.size() > 0 ) ) ) {
GoNameSpace go_namespace_limit = null;
if ( cla.isOptionSet( surfacing.GO_NAMESPACE_LIMIT_OPTION ) ) {
if ( ( go_id_to_term_map == null ) || go_id_to_term_map.isEmpty() ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot use GO namespace limit (-"
- + surfacing.GO_NAMESPACE_LIMIT_OPTION + "=<namespace>) without Pfam to GO mapping file ("
- + surfacing.PFAM_TO_GO_FILE_USE_OPTION + "=<file>) and GO OBO file (-"
- + surfacing.GO_OBO_FILE_USE_OPTION + "=<file>)" );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "cannot use GO namespace limit (-" + surfacing.GO_NAMESPACE_LIMIT_OPTION
+ + "=<namespace>) without Pfam to GO mapping file ("
+ + surfacing.PFAM_TO_GO_FILE_USE_OPTION + "=<file>) and GO OBO file (-"
+ + surfacing.GO_OBO_FILE_USE_OPTION + "=<file>)" );
}
if ( !cla.isOptionValueSet( surfacing.GO_NAMESPACE_LIMIT_OPTION ) ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for GO namespace limit: \"-"
- + surfacing.GO_NAMESPACE_LIMIT_OPTION + "=<"
- + surfacing.GO_NAMESPACE_LIMIT_OPTION_MOLECULAR_FUNCTION + "|"
- + surfacing.GO_NAMESPACE_LIMIT_OPTION_BIOLOGICAL_PROCESS + "|"
- + surfacing.GO_NAMESPACE_LIMIT_OPTION_CELLULAR_COMPONENT + ">\"" );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "no value for GO namespace limit: \"-" + surfacing.GO_NAMESPACE_LIMIT_OPTION
+ + "=<" + surfacing.GO_NAMESPACE_LIMIT_OPTION_MOLECULAR_FUNCTION + "|"
+ + surfacing.GO_NAMESPACE_LIMIT_OPTION_BIOLOGICAL_PROCESS + "|"
+ + surfacing.GO_NAMESPACE_LIMIT_OPTION_CELLULAR_COMPONENT + ">\"" );
}
final String go_namespace_limit_str = cla.getOptionValue( surfacing.GO_NAMESPACE_LIMIT_OPTION )
.toLowerCase();
go_namespace_limit = GoNameSpace.createCellularComponent();
}
else {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "unknown value \"" + go_namespace_limit_str
- + "\" for GO namespace limit: \"-" + surfacing.GO_NAMESPACE_LIMIT_OPTION + "=<"
- + surfacing.GO_NAMESPACE_LIMIT_OPTION_MOLECULAR_FUNCTION + "|"
- + surfacing.GO_NAMESPACE_LIMIT_OPTION_BIOLOGICAL_PROCESS + "|"
- + surfacing.GO_NAMESPACE_LIMIT_OPTION_CELLULAR_COMPONENT + ">\"" );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "unknown value \"" + go_namespace_limit_str + "\" for GO namespace limit: \"-"
+ + surfacing.GO_NAMESPACE_LIMIT_OPTION + "=<"
+ + surfacing.GO_NAMESPACE_LIMIT_OPTION_MOLECULAR_FUNCTION + "|"
+ + surfacing.GO_NAMESPACE_LIMIT_OPTION_BIOLOGICAL_PROCESS + "|"
+ + surfacing.GO_NAMESPACE_LIMIT_OPTION_CELLULAR_COMPONENT + ">\"" );
}
}
if ( ( domain_similarity_sort_field == DomainSimilarity.DomainSimilaritySortField.MAX_COUNTS_DIFFERENCE )
Phylogeny[] intrees = null;
if ( cla.isOptionSet( surfacing.INPUT_SPECIES_TREE_OPTION ) ) {
if ( number_of_genomes < 3 ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot infer gains and losses on input species trees (-"
- + surfacing.INPUT_SPECIES_TREE_OPTION + " without pairwise analyses ("
- + surfacing.PAIRWISE_DOMAIN_COMPARISONS_OPTION
- + "=<suffix for pairwise comparison output files>)" );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "cannot infer gains and losses on input species trees (-"
+ + surfacing.INPUT_SPECIES_TREE_OPTION + " without pairwise analyses ("
+ + surfacing.PAIRWISE_DOMAIN_COMPARISONS_OPTION
+ + "=<suffix for pairwise comparison output files>)" );
}
if ( !cla.isOptionValueSet( surfacing.INPUT_SPECIES_TREE_OPTION ) ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for input tree: -"
- + surfacing.INPUT_SPECIES_TREE_OPTION + "=<tree file in phyloXML format>" );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "no value for input tree: -" + surfacing.INPUT_SPECIES_TREE_OPTION
+ + "=<tree file in phyloXML format>" );
}
final String intrees_str = cla.getOptionValue( surfacing.INPUT_SPECIES_TREE_OPTION );
if ( intrees_str.indexOf( "#" ) > 0 ) {
intree_files = new File[ 1 ];
intree_files[ 0 ] = new File( intrees_str );
}
- intrees = SurfacingUtil.obtainAndPreProcessIntrees( intree_files, number_of_genomes, input_file_properties );
+ intrees = SurfacingUtil.obtainAndPreProcessIntrees( intree_files,
+ number_of_genomes,
+ input_file_properties );
}
final Phylogeny intree_0_orig = SurfacingUtil.obtainFirstIntree( intree_files[ 0 ] );
long random_number_seed_for_fitch_parsimony = 0l;
boolean radomize_fitch_parsimony = false;
if ( cla.isOptionSet( surfacing.RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION ) ) {
if ( !cla.isOptionValueSet( surfacing.RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION ) ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for random number seed: -"
- + surfacing.RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION + "=<seed>" );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "no value for random number seed: -"
+ + surfacing.RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION + "=<seed>" );
}
try {
random_number_seed_for_fitch_parsimony = cla
File[] secondary_features_map_files = null;
final File domain_lengths_analysis_outfile = new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file
+ DOMAIN_LENGTHS_ANALYSIS_SUFFIX );
- if ( PERFORM_DOMAIN_LENGTH_ANALYSIS ) {
+ if ( domain_length_analysis ) {
SurfacingUtil.checkForOutputFileWriteability( domain_lengths_analysis_outfile );
}
if ( cla.isOptionSet( surfacing.SECONDARY_FEATURES_PARSIMONY_MAP_FILE ) ) {
if ( !cla.isOptionValueSet( surfacing.SECONDARY_FEATURES_PARSIMONY_MAP_FILE ) ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for secondary features map file: -"
- + surfacing.SECONDARY_FEATURES_PARSIMONY_MAP_FILE + "=<file>" );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "no value for secondary features map file: -"
+ + surfacing.SECONDARY_FEATURES_PARSIMONY_MAP_FILE + "=<file>" );
}
final String[] secondary_features_map_files_strs = cla
.getOptionValue( surfacing.SECONDARY_FEATURES_PARSIMONY_MAP_FILE ).split( "#" );
"cannot read secondary features map file: " + e.getMessage() );
}
catch ( final Exception e ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "problem with contents of features map file ["
- + secondary_features_map_files[ i ] + "]: " + e.getMessage() );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "problem with contents of features map file ["
+ + secondary_features_map_files[ i ] + "]: " + e.getMessage() );
}
i++;
}
}
if ( out_dir == null ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "no output directory indicated (-"
- + surfacing.OUTPUT_DIR_OPTION + "=<dir>)" );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "no output directory indicated (-" + surfacing.OUTPUT_DIR_OPTION + "=<dir>)" );
}
if ( output_file == null ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "no name for (main) output file indicated (-"
- + surfacing.OUTPUT_FILE_OPTION + "=<file>)" );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "no name for (main) output file indicated (-" + surfacing.OUTPUT_FILE_OPTION
+ + "=<file>)" );
}
if ( ( domain_id_to_go_ids_map == null ) || domain_id_to_go_ids_map.isEmpty() ) {
ForesterUtil.fatalError( surfacing.PRG_NAME,
System.out.println( "Cutoff scores file : " + cutoff_scores_file );
html_desc.append( "<tr><td>Cutoff scores file:</td><td>" + cutoff_scores_file + "</td></tr>" + nl );
}
- if ( e_value_max >= 0.0 ) {
- System.out.println( "E-value maximum (inclusive) : " + e_value_max );
- html_desc.append( "<tr><td>E-value maximum (inclusive):</td><td>" + e_value_max + "</td></tr>" + nl );
+ if ( ie_value_max >= 0.0 ) {
+ System.out.println( "iE-value maximum (incl) : " + ie_value_max );
+ html_desc.append( "<tr><td>iE-value maximum (inclusive):</td><td>" + ie_value_max + "</td></tr>" + nl );
+ }
+ if ( rel_env_length_ratio_cutoff > 0.0 ) {
+ System.out.println( "Rel env length ratio min : " + rel_env_length_ratio_cutoff );
+ html_desc.append( "<tr><td>Relative hmm envelope length ratio min (inclusive):</td><td>"
+ + rel_env_length_ratio_cutoff + "</td></tr>" + nl );
+ }
+ if ( fs_e_value_max >= 0.0 ) {
+ System.out.println( "FS E-value maximum (incl) : " + fs_e_value_max );
+ html_desc.append( "<tr><td>FS E-value maximum (inclusive):</td><td>" + fs_e_value_max + "</td></tr>" + nl );
}
if ( output_protein_lists_for_all_domains ) {
System.out.println( "Domain E-value max : " + output_list_of_all_proteins_per_domain_e_value_max );
System.out.println( "Ignore DUFs : " + ignore_dufs );
if ( ignore_virus_like_ids ) {
System.out.println( "Ignore virus like ids : " + ignore_virus_like_ids );
- html_desc.append( "<tr><td>Ignore virus, phage, transposition related ids:</td><td>"
- + ignore_virus_like_ids + "</td></tr>" + nl );
+ html_desc.append( "<tr><td>Ignore virus, phage, transposition related ids:</td><td>" + ignore_virus_like_ids
+ + "</td></tr>" + nl );
}
html_desc.append( "<tr><td>Ignore DUFs:</td><td>" + ignore_dufs + "</td></tr>" + nl );
if ( max_allowed_overlap != surfacing.MAX_ALLOWED_OVERLAP_DEFAULT ) {
System.out.println( "Max allowed domain overlap : " + max_allowed_overlap );
- html_desc.append( "<tr><td>Max allowed domain overlap:</td><td>" + max_allowed_overlap + "</td></tr>" + nl );
+ html_desc
+ .append( "<tr><td>Max allowed domain overlap:</td><td>" + max_allowed_overlap + "</td></tr>" + nl );
}
if ( no_engulfing_overlaps ) {
System.out.println( "Ignore engulfed domains : " + no_engulfing_overlaps );
+ ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT ) + "</td></tr>"
+ nl );
}
- System.out.println( "Use last in Fitch parimony : " + use_last_in_fitch_parsimony );
- html_desc.append( "<tr><td>Use last in Fitch parimon:</td><td>" + use_last_in_fitch_parsimony + "</td></tr>"
- + nl );
+ System.out.println( "Fitch parsimony of DCs : " + perform_dc_fich );
+ html_desc.append( "<tr><td>Fitch parsimony of DCs:</td><td>" + perform_dc_fich + "</td></tr>" + nl );
+ if ( perform_dc_fich ) {
+ System.out.println( "Use last in Fitch parsimony : " + use_last_in_fitch_parsimony );
+ html_desc.append( "<tr><td>Use last in Fitch parsimony:</td><td>" + use_last_in_fitch_parsimony
+ + "</td></tr>" + nl );
+ }
System.out.println( "Write to Nexus files : " + write_to_nexus );
html_desc.append( "<tr><td>Write to Nexus files:</td><td>" + write_to_nexus + "</td></tr>" + nl );
- System.out.println( "DC regain prot stats : " + perform_dc_regain_proteins_stats );
- html_desc.append( "<tr><td>DC regain prot stats:</td><td>" + perform_dc_regain_proteins_stats + "</td></tr>"
- + nl );
+ if ( perform_dc_fich ) {
+ System.out.println( "DC regain prot stats : " + perform_dc_regain_proteins_stats );
+ html_desc.append( "<tr><td>DC regain prot stats:</td><td>" + perform_dc_regain_proteins_stats + "</td></tr>"
+ + nl );
+ }
System.out.println( "DA analysis : " + da_analysis );
html_desc.append( "<tr><td>DA analysis :</td><td>" + da_analysis + "</td></tr>" + nl );
System.out.print( "Domain counts sort order : " );
html_desc.append( "counts difference" );
break;
default:
- ForesterUtil
- .unexpectedFatalError( surfacing.PRG_NAME, "unknown value for sorting for similarities" );
+ ForesterUtil.unexpectedFatalError( surfacing.PRG_NAME,
+ "unknown value for sorting for similarities" );
}
System.out.println();
html_desc.append( "</td></tr>" + nl );
+ random_number_seed_for_fitch_parsimony + "</td></tr>" + nl );
System.out.println( " Random number seed : " + random_number_seed_for_fitch_parsimony );
}
- if ( ( domain_id_to_secondary_features_maps != null ) && ( domain_id_to_secondary_features_maps.length > 0 ) ) {
+ if ( ( domain_id_to_secondary_features_maps != null )
+ && ( domain_id_to_secondary_features_maps.length > 0 ) ) {
for( int i = 0; i < secondary_features_map_files.length; i++ ) {
html_desc.append( "<tr><td>Secondary features map file:</td><td>"
+ secondary_features_map_files[ i ] + "</td></tr>" + nl );
}
} // if ( perform_pwc ) {
System.out.println();
- html_desc.append( "<tr><td>Command line:</td><td>\n" + cla.getCommandLineArgsAsString() + "\n</td></tr>" + nl );
+ html_desc.append( "<tr><td>Command line:</td><td>" + nl + nl + cla.getCommandLineArgsAsString() + nl + nl
+ + "</td></tr>" + nl );
System.out.println( "Command line : " + cla.getCommandLineArgsAsString() );
BufferedWriter[] query_domains_writer_ary = null;
List<String>[] query_domain_ids_array = null;
query_domains_writer_ary[ i ] = new BufferedWriter( new FileWriter( protein_names_writer_str ) );
}
catch ( final IOException e ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "Could not open [" + protein_names_writer_str + "]: "
- + e.getLocalizedMessage() );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "Could not open [" + protein_names_writer_str + "]: "
+ + e.getLocalizedMessage() );
}
}
}
- SortedMap<Species, List<Protein>> protein_lists_per_species = null; //This will only be created if neede.
+ SortedMap<Species, List<Protein>> protein_lists_per_species = null; //This will only be created if needed.
boolean need_protein_lists_per_species = false;
- if ( ( plus_minus_analysis_high_copy_base_species.size() > 0 ) || output_protein_lists_for_all_domains ) {
+ if ( ( plus_minus_analysis_high_copy_base_species.size() > 0 ) || output_protein_lists_for_all_domains
+ || true ) { //TODO
need_protein_lists_per_species = true;
}
if ( need_protein_lists_per_species ) {
all_bin_domain_combinations_gained_fitch = new ArrayList<BinaryDomainCombination>();
all_bin_domain_combinations_lost_fitch = new ArrayList<BinaryDomainCombination>();
}
- DomainLengthsTable domain_lengths_table = new DomainLengthsTable();
final File per_genome_domain_promiscuity_statistics_file = new File( out_dir + ForesterUtil.FILE_SEPARATOR
+ output_file + D_PROMISCUITY_FILE_SUFFIX );
BufferedWriter per_genome_domain_promiscuity_statistics_writer = null;
per_genome_domain_promiscuity_statistics_writer.write( "Min:\t" );
per_genome_domain_promiscuity_statistics_writer.write( "Max:\t" );
per_genome_domain_promiscuity_statistics_writer.write( "N:\t" );
- per_genome_domain_promiscuity_statistics_writer.write( "Max Promiscuous Domains:"
- + ForesterUtil.LINE_SEPARATOR );
+ per_genome_domain_promiscuity_statistics_writer
+ .write( "Max Promiscuous Domains:" + ForesterUtil.LINE_SEPARATOR );
}
catch ( final IOException e2 ) {
ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getMessage() );
catch ( final IOException e2 ) {
ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getMessage() );
}
- final DescriptiveStatistics protein_coverage_stats = new BasicDescriptiveStatistics();
- final DescriptiveStatistics all_genomes_domains_per_potein_stats = new BasicDescriptiveStatistics();
+ DescriptiveStatistics protein_coverage_stats = new BasicDescriptiveStatistics();
+ DescriptiveStatistics all_genomes_domains_per_potein_stats = new BasicDescriptiveStatistics();
final SortedMap<Integer, Integer> all_genomes_domains_per_potein_histo = new TreeMap<Integer, Integer>();
final SortedSet<String> domains_which_are_always_single = new TreeSet<String>();
final SortedSet<String> domains_which_are_sometimes_single_sometimes_not = new TreeSet<String>();
protein_length_stats_by_dc = new HashMap<String, DescriptiveStatistics>();
domain_number_stats_by_dc = new HashMap<String, DescriptiveStatistics>();
}
+ DomainLengthsTable domain_lengths_table = null;
+ if ( domain_length_analysis ) {
+ domain_lengths_table = new DomainLengthsTable();
+ }
// Main loop:
final SortedMap<String, Set<String>> distinct_domain_architecutures_per_genome = new TreeMap<String, Set<String>>();
final SortedMap<String, Integer> distinct_domain_architecuture_counts = new TreeMap<String, Integer>();
ind_score_cutoff,
true );
}
- if ( e_value_max >= 0.0 ) {
- parser.setEValueMaximum( e_value_max );
+ if ( fs_e_value_max >= 0.0 ) {
+ parser.setFsEValueMaximum( fs_e_value_max );
+ }
+ if ( ie_value_max >= 0.0 ) {
+ parser.setIEValueMaximum( ie_value_max );
+ }
+ if ( rel_env_length_ratio_cutoff > 0.0 ) {
+ parser.setRelEnvLengthRatioCutoff( rel_env_length_ratio_cutoff );
}
parser.setIgnoreDufs( ignore_dufs );
parser.setIgnoreVirusLikeIds( ignore_virus_like_ids );
System.out.println( "Coverage : "
+ ForesterUtil.roundToInt( 100.0 * coverage ) + "%" );
SurfacingUtil.log( "Coverage : "
- + ForesterUtil.roundToInt( 100.0 * coverage ) + "%",
- log_writer );
+ + ForesterUtil.roundToInt( 100.0 * coverage ) + "%", log_writer );
System.out.println( "Domains encountered : " + parser.getDomainsEncountered() );
SurfacingUtil.log( "Domains encountered : " + parser.getDomainsEncountered(),
log_writer );
log_writer );
System.out.println( "Distinct domains stored : "
+ parser.getDomainsStoredSet().size() );
- SurfacingUtil.log( "Distinct domains stored : "
- + parser.getDomainsStoredSet().size(), log_writer );
+ SurfacingUtil
+ .log( "Distinct domains stored : " + parser.getDomainsStoredSet().size(),
+ log_writer );
System.out.println( "Domains ignored due to individual score cutoffs: "
+ parser.getDomainsIgnoredDueToIndividualScoreCutoff() );
SurfacingUtil.log( "Domains ignored due to individual score cutoffs: "
- + parser.getDomainsIgnoredDueToIndividualScoreCutoff(),
- log_writer );
- System.out.println( "Domains ignored due to E-value : "
- + parser.getDomainsIgnoredDueToEval() );
- SurfacingUtil.log( "Domains ignored due to E-value : "
- + parser.getDomainsIgnoredDueToEval(),
- log_writer );
- System.out.println( "Domains ignored due to DUF designation : "
- + parser.getDomainsIgnoredDueToDuf() );
+ + parser.getDomainsIgnoredDueToIndividualScoreCutoff(), log_writer );
+ System.out.println( "Domains ignored due to FS E-value : "
+ + parser.getDomainsIgnoredDueToFsEval() );
+ SurfacingUtil
+ .log( "Domains ignored due to FS E-value : " + parser.getDomainsIgnoredDueToFsEval(),
+ log_writer );
+ System.out.println( "Domains ignored due to iE-value : "
+ + parser.getDomainsIgnoredDueToIEval() );
SurfacingUtil
- .log( "Domains ignored due to DUF designation : " + parser.getDomainsIgnoredDueToDuf(),
+ .log( "Domains ignored due to iE-value : " + parser.getDomainsIgnoredDueToIEval(),
log_writer );
+ System.out.println( "Domains ignored due to rel env length ratio : "
+ + parser.getDomainsIgnoredDueToRelEnvLengthRatioCutoff() );
+ SurfacingUtil.log( "Domains ignored due to rel env length ratio : "
+ + parser.getDomainsIgnoredDueToRelEnvLengthRatioCutoff(), log_writer );
+ System.out.println( "Domains ignored due to DUF designation : "
+ + parser.getDomainsIgnoredDueToDuf() );
+ SurfacingUtil.log( "Domains ignored due to DUF designation : " + parser.getDomainsIgnoredDueToDuf(),
+ log_writer );
if ( ignore_virus_like_ids ) {
System.out.println( "Domains ignored due virus like ids : "
+ parser.getDomainsIgnoredDueToVirusLikeIds() );
SurfacingUtil.log( "Domains ignored due virus like ids : "
- + parser.getDomainsIgnoredDueToVirusLikeIds(),
- log_writer );
+ + parser.getDomainsIgnoredDueToVirusLikeIds(), log_writer );
}
System.out.println( "Domains ignored due negative domain filter : "
+ parser.getDomainsIgnoredDueToNegativeDomainFilter() );
SurfacingUtil.log( "Domains ignored due negative domain filter : "
- + parser.getDomainsIgnoredDueToNegativeDomainFilter(),
- log_writer );
+ + parser.getDomainsIgnoredDueToNegativeDomainFilter(), log_writer );
System.out.println( "Domains ignored due to overlap : "
+ parser.getDomainsIgnoredDueToOverlap() );
- SurfacingUtil.log( "Domains ignored due to overlap : "
- + parser.getDomainsIgnoredDueToOverlap(),
- log_writer );
+ SurfacingUtil
+ .log( "Domains ignored due to overlap : " + parser.getDomainsIgnoredDueToOverlap(),
+ log_writer );
if ( negative_filter_file != null ) {
System.out.println( "Proteins ignored due to negative filter : "
+ parser.getProteinsIgnoredDueToFilter() );
SurfacingUtil.log( "Proteins ignored due to negative filter : "
- + parser.getProteinsIgnoredDueToFilter(),
- log_writer );
+ + parser.getProteinsIgnoredDueToFilter(), log_writer );
}
if ( positive_filter_file != null ) {
System.out.println( "Proteins ignored due to positive filter : "
+ parser.getProteinsIgnoredDueToFilter() );
SurfacingUtil.log( "Proteins ignored due to positive filter : "
- + parser.getProteinsIgnoredDueToFilter(),
- log_writer );
+ + parser.getProteinsIgnoredDueToFilter(), log_writer );
}
if ( da_analysis ) {
System.out.println( "Distinct domain architectures stored : " + distinct_das );
try {
int count = 0;
for( final Protein protein : protein_list ) {
- dc_data_writer.write( SurfacingUtil.proteinToDomainCombinations( protein, count + "", "\t" )
- .toString() );
+ dc_data_writer
+ .write( SurfacingUtil.proteinToDomainCombinations( protein, count + "", "\t" ).toString() );
++count;
for( final Domain d : protein.getProteinDomains() ) {
final String d_str = d.getDomainId().toString();
domains_which_are_sometimes_single_sometimes_not,
domains_which_never_single,
domains_per_potein_stats_writer );
- domain_lengths_table.addLengths( protein_list );
+ if ( domain_length_analysis ) {
+ domain_lengths_table.addLengths( protein_list );
+ }
if ( !da_analysis ) {
- gwcd_list.add( BasicGenomeWideCombinableDomains
- .createInstance( protein_list,
- ignore_combination_with_same,
- new BasicSpecies( input_file_properties[ i ][ 1 ] ),
- domain_id_to_go_ids_map,
- dc_type,
- protein_length_stats_by_dc,
- domain_number_stats_by_dc ) );
+ gwcd_list.add( BasicGenomeWideCombinableDomains.createInstance( protein_list,
+ ignore_combination_with_same,
+ new BasicSpecies( input_file_properties[ i ][ 1 ] ),
+ domain_id_to_go_ids_map,
+ dc_type,
+ protein_length_stats_by_dc,
+ domain_number_stats_by_dc ) );
if ( gwcd_list.get( i ).getSize() > 0 ) {
- SurfacingUtil.writeDomainCombinationsCountsFile( input_file_properties,
- out_dir,
- per_genome_domain_promiscuity_statistics_writer,
- gwcd_list.get( i ),
- i,
- dc_sort_order );
+ if ( output_binary_domain_combinationsfor_counts ) {
+ SurfacingUtil.writeDomainCombinationsCountsFile( input_file_properties,
+ out_dir,
+ per_genome_domain_promiscuity_statistics_writer,
+ gwcd_list.get( i ),
+ i,
+ dc_sort_order );
+ }
if ( output_binary_domain_combinationsfor_graph_analysis ) {
SurfacingUtil.writeBinaryDomainCombinationsFileForGraphAnalysis( input_file_properties,
out_dir,
}
System.gc();
} // for( int i = 0; i < number_of_genomes; ++i ) {
- ForesterUtil.programMessage( PRG_NAME, "Wrote domain promiscuities to: "
- + per_genome_domain_promiscuity_statistics_file );
- //
+ ForesterUtil
+ .programMessage( PRG_NAME,
+ "Wrote domain promiscuities to: " + per_genome_domain_promiscuity_statistics_file );
+ final int LEVEL = 0;
+ try {
+ MinimalDomainomeCalculator.calc( false,
+ intrees[ 0 ],
+ LEVEL,
+ protein_lists_per_species,
+ SEPARATOR_FOR_DA,
+ -1,
+ out_dir.toString() + "/" + output_file,
+ true );
+ }
+ catch ( IOException e ) {
+ ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
+ }
+ try {
+ MinimalDomainomeCalculator.calc( true,
+ intrees[ 0 ],
+ LEVEL,
+ protein_lists_per_species,
+ SEPARATOR_FOR_DA,
+ -1,
+ out_dir.toString() + "/" + output_file,
+ true );
+ }
+ catch ( IOException e ) {
+ ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
+ }
if ( da_analysis ) {
SurfacingUtil.performDomainArchitectureAnalysis( distinct_domain_architecutures_per_genome,
distinct_domain_architecuture_counts,
domains_per_potein_stats_writer.write( "\t" );
domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.arithmeticMean() + "" );
domains_per_potein_stats_writer.write( "\t" );
- domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.sampleStandardDeviation() + "" );
- domains_per_potein_stats_writer.write( "\t" );
- domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.median() + "" );
+ domains_per_potein_stats_writer
+ .write( all_genomes_domains_per_potein_stats.sampleStandardDeviation() + "" );
domains_per_potein_stats_writer.write( "\t" );
+ if ( all_genomes_domains_per_potein_stats.getN() <= 300 ) {
+ domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.median() + "" );
+ domains_per_potein_stats_writer.write( "\t" );
+ }
domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.getN() + "" );
domains_per_potein_stats_writer.write( "\t" );
domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.getMin() + "" );
domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.getMax() + "" );
domains_per_potein_stats_writer.write( "\n" );
domains_per_potein_stats_writer.close();
+ all_genomes_domains_per_potein_stats = null;
SurfacingUtil.printOutPercentageOfMultidomainProteins( all_genomes_domains_per_potein_histo, log_writer );
- ForesterUtil.map2file( new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file
- + "_all_genomes_domains_per_potein_histo.txt" ), all_genomes_domains_per_potein_histo, "\t", "\n" );
- ForesterUtil.collection2file( new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file
- + "_domains_always_single_.txt" ), domains_which_are_always_single, "\n" );
- ForesterUtil.collection2file( new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file
- + "_domains_single_or_combined.txt" ), domains_which_are_sometimes_single_sometimes_not, "\n" );
- ForesterUtil.collection2file( new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file
- + "_domains_always_combined.txt" ), domains_which_never_single, "\n" );
+ ForesterUtil.map2file(
+ new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file
+ + "_all_genomes_domains_per_potein_histo.txt" ),
+ all_genomes_domains_per_potein_histo,
+ "\t",
+ "\n" );
+ ForesterUtil.collection2file(
+ new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file
+ + "_domains_always_single_.txt" ),
+ domains_which_are_always_single,
+ "\n" );
+ ForesterUtil.collection2file(
+ new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file
+ + "_domains_single_or_combined.txt" ),
+ domains_which_are_sometimes_single_sometimes_not,
+ "\n" );
+ ForesterUtil.collection2file(
+ new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file
+ + "_domains_always_combined.txt" ),
+ domains_which_never_single,
+ "\n" );
ForesterUtil.programMessage( PRG_NAME,
"Average of proteins with a least one domain assigned: "
+ ( 100 * protein_coverage_stats.arithmeticMean() ) + "% (+/-"
+ ( 100 * protein_coverage_stats.sampleStandardDeviation() ) + "%)" );
- ForesterUtil.programMessage( PRG_NAME, "Range of proteins with a least one domain assigned: "
- + ( 100 * protein_coverage_stats.getMin() ) + "%-" + ( 100 * protein_coverage_stats.getMax() )
- + "%" );
- SurfacingUtil.log( "Average of prot with a least one dom assigned : "
+ ForesterUtil.programMessage( PRG_NAME,
+ "Range of proteins with a least one domain assigned: "
+ + ( 100 * protein_coverage_stats.getMin() ) + "%-"
+ + ( 100 * protein_coverage_stats.getMax() ) + "%" );
+ SurfacingUtil.log(
+ "Average of prot with a least one dom assigned : "
+ ( 100 * protein_coverage_stats.arithmeticMean() ) + "% (+/-"
+ ( 100 * protein_coverage_stats.sampleStandardDeviation() ) + "%)",
log_writer );
SurfacingUtil.log( "Range of prot with a least one dom assigned : "
- + ( 100 * protein_coverage_stats.getMin() ) + "%-"
- + ( 100 * protein_coverage_stats.getMax() ) + "%",
- log_writer );
+ + ( 100 * protein_coverage_stats.getMin() ) + "%-" + ( 100 * protein_coverage_stats.getMax() )
+ + "%", log_writer );
+ protein_coverage_stats = null;
}
catch ( final IOException e2 ) {
ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getLocalizedMessage() );
catch ( final IOException e2 ) {
ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getLocalizedMessage() );
}
- if ( PERFORM_DOMAIN_LENGTH_ANALYSIS ) {
+ if ( domain_length_analysis ) {
try {
SurfacingUtil.executeDomainLengthAnalysis( input_file_properties,
number_of_genomes,
final DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( domain_similarity_sort_field,
sort_by_species_count_first,
number_of_genomes == 2,
- CALC_SIMILARITY_SCORES );
+ CALC_SIMILARITY_SCORES,
+ true );
switch ( scoring ) {
case COMBINATIONS:
pw_calc = new CombinationsBasedPairwiseDomainSimilarityCalculator();
if ( domain_id_to_go_ids_map != null ) {
go_annotation_output = DomainSimilarityCalculator.GoAnnotationOutput.ALL;
}
- final SortedSet<DomainSimilarity> similarities = calc
- .calculateSimilarities( pw_calc,
- gwcd_list,
- ignore_domains_without_combs_in_all_spec,
- ignore_species_specific_domains );
+ final SortedSet<DomainSimilarity> similarities = calc.calculateSimilarities( pw_calc,
+ gwcd_list,
+ ignore_domains_without_combs_in_all_spec,
+ ignore_species_specific_domains );
SurfacingUtil.decoratePrintableDomainSimilarities( similarities, detailedness );
final Map<String, Integer> tax_code_to_id_map = SurfacingUtil.createTaxCodeToIdMap( intrees[ 0 ] );
try {
+ "</td></tr>" + nl );
html_desc.append( "</table>" + nl );
final Writer simple_tab_writer = new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR
- + my_outfile + ".tsv" ) );
+ + my_outfile.replaceFirst( ".html", ".tsv" ) ) );
SurfacingUtil.writeDomainSimilaritiesToFile( html_desc,
new StringBuilder( number_of_genomes + " genomes" ),
simple_tab_writer,
intree_0_orig,
positive_filter_file != null ? filter : null );
simple_tab_writer.close();
- ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote main output (includes domain similarities) to: \""
- + ( out_dir == null ? my_outfile : out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile ) + "\"" );
+ ForesterUtil.programMessage( surfacing.PRG_NAME,
+ "Wrote main output (includes domain similarities) to: \""
+ + ( out_dir == null ? my_outfile
+ : out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile )
+ + "\"" );
}
catch ( final IOException e ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "Failed to write similarites to: \"" + output_file + "\" ["
- + e.getMessage() + "]" );
+ ForesterUtil
+ .fatalError( surfacing.PRG_NAME,
+ "Failed to write similarites to: \"" + output_file + "\" [" + e.getMessage() + "]" );
}
System.out.println();
final Species[] species = new Species[ number_of_genomes ];
matrix_output_file = out_dir + ForesterUtil.FILE_SEPARATOR + matrix_output_file;
output_file = new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file );
}
- SurfacingUtil.writeMatrixToFile( new File( matrix_output_file
- + surfacing.MATRIX_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX ), pwgc.getDomainDistanceScoresMeans() );
- SurfacingUtil
- .writeMatrixToFile( new File( matrix_output_file
- + surfacing.MATRIX_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX ),
- pwgc.getSharedBinaryCombinationsBasedDistances() );
- SurfacingUtil.writeMatrixToFile( new File( matrix_output_file
+ SurfacingUtil.writeMatrixToFile(
+ new File( matrix_output_file
+ + surfacing.MATRIX_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX ),
+ pwgc.getDomainDistanceScoresMeans() );
+ SurfacingUtil.writeMatrixToFile(
+ new File( matrix_output_file
+ + surfacing.MATRIX_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX ),
+ pwgc.getSharedBinaryCombinationsBasedDistances() );
+ SurfacingUtil.writeMatrixToFile(
+ new File( matrix_output_file
+ surfacing.MATRIX_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX ),
pwgc.getSharedDomainsBasedDistances() );
final Phylogeny nj_gd = SurfacingUtil.createNjTreeBasedOnMatrixToFile( new File( matrix_output_file
- + surfacing.NJ_TREE_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX ), pwgc.getDomainDistanceScoresMeans()
- .get( 0 ) );
- final Phylogeny nj_bc = SurfacingUtil.createNjTreeBasedOnMatrixToFile( new File( matrix_output_file
- + surfacing.NJ_TREE_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX ), pwgc
- .getSharedBinaryCombinationsBasedDistances().get( 0 ) );
- final Phylogeny nj_d = SurfacingUtil.createNjTreeBasedOnMatrixToFile( new File( matrix_output_file
- + surfacing.NJ_TREE_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX ), pwgc
- .getSharedDomainsBasedDistances().get( 0 ) );
+ + surfacing.NJ_TREE_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX ),
+ pwgc.getDomainDistanceScoresMeans()
+ .get( 0 ) );
+ final Phylogeny nj_bc = SurfacingUtil.createNjTreeBasedOnMatrixToFile(
+ new File( matrix_output_file
+ + surfacing.NJ_TREE_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX ),
+ pwgc.getSharedBinaryCombinationsBasedDistances()
+ .get( 0 ) );
+ final Phylogeny nj_d = SurfacingUtil.createNjTreeBasedOnMatrixToFile(
+ new File( matrix_output_file
+ + surfacing.NJ_TREE_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX ),
+ pwgc.getSharedDomainsBasedDistances()
+ .get( 0 ) );
inferred_trees = new ArrayList<Phylogeny>();
inferred_trees.add( nj_gd );
inferred_trees.add( nj_bc );
}
if ( ( ( intrees != null ) && ( intrees.length > 0 ) ) && ( number_of_genomes > 2 ) ) {
final StringBuilder parameters_sb = SurfacingUtil.createParametersAsString( ignore_dufs,
- e_value_max,
+ ie_value_max,
+ fs_e_value_max,
max_allowed_overlap,
no_engulfing_overlaps,
cutoff_scores_file,
for( final Phylogeny intree : intrees ) {
final String outfile_name = ForesterUtil.removeSuffix( output_file.toString() ) + s
+ ForesterUtil.removeSuffix( intree_files[ i ].toString() );
- final DomainParsimonyCalculator domain_parsimony = DomainParsimonyCalculator.createInstance( intree,
- gwcd_list );
+ final DomainParsimonyCalculator domain_parsimony = DomainParsimonyCalculator
+ .createInstance( intree, gwcd_list );
SurfacingUtil.executeParsimonyAnalysis( random_number_seed_for_fitch_parsimony,
radomize_fitch_parsimony,
outfile_name,
domain_length_stats_by_domain,
tax_code_to_id_map,
write_to_nexus,
- use_last_in_fitch_parsimony );
- // Listing of all domain combinations gained is only done if only one input tree is used.
+ use_last_in_fitch_parsimony,
+ perform_dc_fich );
+ // Listing of all domain combinations gained is only done if only one input tree is used.
if ( ( domain_id_to_secondary_features_maps != null )
&& ( domain_id_to_secondary_features_maps.length > 0 ) ) {
int j = 0;
final Map<Species, MappingResults> mapping_results_map = new TreeMap<Species, MappingResults>();
final DomainParsimonyCalculator secondary_features_parsimony = DomainParsimonyCalculator
.createInstance( intree, gwcd_list, domain_id_to_secondary_features_map );
- SurfacingUtil
- .executeParsimonyAnalysisForSecondaryFeatures( outfile_name
- + "_"
- + secondary_features_map_files[ j++ ],
- secondary_features_parsimony,
- intree,
- parameters_sb.toString(),
- mapping_results_map,
- use_last_in_fitch_parsimony );
+ SurfacingUtil.executeParsimonyAnalysisForSecondaryFeatures( outfile_name + "_"
+ + secondary_features_map_files[ j++ ],
+ secondary_features_parsimony,
+ intree,
+ parameters_sb.toString(),
+ mapping_results_map,
+ use_last_in_fitch_parsimony );
if ( i == 0 ) {
System.out.println();
System.out.println( "Mapping to secondary features:" );
SurfacingUtil.writeProteinListsForAllSpecies( out_dir,
protein_lists_per_species,
gwcd_list,
- output_list_of_all_proteins_per_domain_e_value_max );
+ output_list_of_all_proteins_per_domain_e_value_max,
+ positive_filter_file != null ? filter : null );
}
gwcd_list = null;
if ( all_bin_domain_combinations_gained_fitch != null ) {
try {
- SurfacingUtil
- .executeFitchGainsAnalysis( new File( output_file
- + surfacing.OUTPUT_DOMAIN_COMBINATIONS_GAINED_MORE_THAN_ONCE_ANALYSIS_SUFFIX ),
- all_bin_domain_combinations_gained_fitch,
- all_domains_encountered.size(),
- all_bin_domain_combinations_encountered,
- true );
+ SurfacingUtil.executeFitchGainsAnalysis(
+ new File( output_file
+ + surfacing.OUTPUT_DOMAIN_COMBINATIONS_GAINED_MORE_THAN_ONCE_ANALYSIS_SUFFIX ),
+ all_bin_domain_combinations_gained_fitch,
+ all_domains_encountered.size(),
+ all_bin_domain_combinations_encountered,
+ true );
}
catch ( final IOException e ) {
ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() );
}
if ( all_bin_domain_combinations_lost_fitch != null ) {
try {
- SurfacingUtil
- .executeFitchGainsAnalysis( new File( output_file
- + surfacing.OUTPUT_DOMAIN_COMBINATIONS_LOST_MORE_THAN_ONCE_ANALYSIS_SUFFIX ),
- all_bin_domain_combinations_lost_fitch,
- all_domains_encountered.size(),
- all_bin_domain_combinations_encountered,
- false );
+ SurfacingUtil.executeFitchGainsAnalysis(
+ new File( output_file
+ + surfacing.OUTPUT_DOMAIN_COMBINATIONS_LOST_MORE_THAN_ONCE_ANALYSIS_SUFFIX ),
+ all_bin_domain_combinations_lost_fitch,
+ all_domains_encountered.size(),
+ all_bin_domain_combinations_encountered,
+ false );
}
catch ( final IOException e ) {
ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() );
final Runtime rt = java.lang.Runtime.getRuntime();
final long free_memory = rt.freeMemory() / 1000000;
final long total_memory = rt.totalMemory() / 1000000;
- ForesterUtil.programMessage( PRG_NAME, "Time for analysis : " + ( new Date().getTime() - analysis_start_time )
- + "ms" );
+ ForesterUtil.programMessage( PRG_NAME,
+ "Time for analysis : " + ( new Date().getTime() - analysis_start_time ) + "ms" );
ForesterUtil.programMessage( PRG_NAME, "Total running time: " + ( new Date().getTime() - start_time ) + "ms " );
- ForesterUtil.programMessage( PRG_NAME, "Free memory : " + free_memory + "MB, total memory: "
- + total_memory + "MB" );
+ ForesterUtil
+ .programMessage( PRG_NAME,
+ "Free memory : " + free_memory + "MB, total memory: " + total_memory + "MB" );
ForesterUtil.programMessage( PRG_NAME, "If this application is useful to you, please cite:" );
ForesterUtil.programMessage( PRG_NAME, surfacing.WWW );
+ ForesterUtil
+ .programMessage( PRG_NAME,
+ "[next step for phylogenomic analysis pipeline (example, in \"DAS\" dir): % mse.rb .prot . FL_seqs DA_seqs path/to/genome_locations.txt]" );
ForesterUtil.programMessage( PRG_NAME, "OK" );
System.out.println();
}
+ DETAILEDNESS_DEFAULT + ")" );
System.out.println( surfacing.IGNORE_COMBINATION_WITH_SAME_OPTION
+ ": to ignore combinations with self (default: not to ignore)" );
- System.out
- .println( surfacing.IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_OPTION
- + ": to ignore domains without combinations in any species (for similarity calc purposes, not for parsimony analyses) (default: not to ignore)" );
- System.out
- .println( surfacing.IGNORE_DOMAINS_SPECIFIC_TO_ONE_SPECIES_OPTION
- + ": to ignore domains specific to one species (for similarity calc purposes, not for parsimony analyses) (default: not to ignore)" );
+ System.out.println( surfacing.IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_OPTION
+ + ": to ignore domains without combinations in any species (for similarity calc purposes, not for parsimony analyses) (default: not to ignore)" );
+ System.out.println( surfacing.IGNORE_DOMAINS_SPECIFIC_TO_ONE_SPECIES_OPTION
+ + ": to ignore domains specific to one species (for similarity calc purposes, not for parsimony analyses) (default: not to ignore)" );
System.out.println( surfacing.NOT_IGNORE_DUFS_OPTION
+ ": to _not_ ignore DUFs (domains with unknown function) (default: ignore DUFs)" );
- System.out
- .println( surfacing.IGNORE_VIRAL_IDS
- + ": to ignore domains with ids containing 'vir', 'retro', 'transpos', 'phage', or starting with 'rv' or 'gag_'" );
+ System.out.println( surfacing.IGNORE_VIRAL_IDS
+ + ": to ignore domains with ids containing 'vir', 'retro', 'transpos', 'phage', or starting with 'rv' or 'gag_'" );
System.out.println( surfacing.DOMAIN_SIMILARITY_SORT_OPTION + ": sorting for similarities (default: "
+ DOMAIN_SORT_FILD_DEFAULT + ")" );
System.out.println( surfacing.OUTPUT_FILE_OPTION + ": name for (main) output file (mandatory)" );
- System.out.println( surfacing.MAX_E_VALUE_OPTION + ": max (inclusive) E-value" );
+ System.out.println( surfacing.MAX_I_E_VALUE_OPTION + ": max (inclusive) iE-value" );
+ System.out.println( surfacing.MAX_FS_E_VALUE_OPTION + ": max (inclusive) FS E-value" );
+ System.out.println( surfacing.MIN_REL_ENV_LENGTH_RATIO_OPTION
+ + ": min (inclusive) relative envelope length ratio" );
System.out.println( surfacing.MAX_ALLOWED_OVERLAP_OPTION + ": maximal allowed domain overlap" );
System.out.println( surfacing.NO_ENGULFING_OVERLAP_OPTION + ": to ignore engulfed lower confidence domains" );
System.out.println( surfacing.SPECIES_MATRIX_OPTION + ": species matrix" );
+ "[=<suffix for pairwise comparison output files>]: to perform pairwise comparison based analyses" );
System.out.println( surfacing.INPUT_SPECIES_TREE_OPTION
+ ": species tree, to perform (Dollo, Fitch) parismony analyses" );
- System.out
- .println( surfacing.INPUT_SPECIES_TREE_OPTION
- + "=<treefiles in phyloXML format, separated by #>: to infer domain/binary domain combination gains/losses on given species trees" );
+ System.out.println( surfacing.INPUT_SPECIES_TREE_OPTION
+ + "=<treefiles in phyloXML format, separated by #>: to infer domain/binary domain combination gains/losses on given species trees" );
System.out.println( surfacing.FILTER_POSITIVE_OPTION
+ "=<file>: to filter out proteins not containing at least one domain listed in <file>" );
System.out.println( surfacing.FILTER_NEGATIVE_OPTION
System.out.println( surfacing.FILTER_NEGATIVE_DOMAINS_OPTION
+ "=<file>: to filter out (ignore) domains listed in <file>" );
System.out.println( surfacing.INPUT_GENOMES_FILE_OPTION + "=<file>: to read input files from <file>" );
- System.out
- .println( surfacing.RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION
- + "=<seed>: seed for random number generator for Fitch Parsimony analysis (type: long, default: no randomization - given a choice, prefer absence" );
+ System.out.println( surfacing.RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION
+ + "=<seed>: seed for random number generator for Fitch Parsimony analysis (type: long, default: no randomization - given a choice, prefer absence" );
System.out.println( surfacing.CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS
+ ": to consider directedness in binary combinations: e.g. A-B != B-A" );
System.out.println( surfacing.CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS_AND_ADJACENCY
+ ": to consider directedness and adjacency in binary combinations" );
- System.out
- .println( surfacing.SEQ_EXTRACT_OPTION
- + "=<domain ids (Pfam names)>: to extract sequence names of sequences containing matching domains and/or domain-sequences (order N to C) (domain separator: '~', domain sequences speparator: '#', e.g. 'NACHT#BIR~CARD')" );
+ System.out.println( surfacing.SEQ_EXTRACT_OPTION
+ + "=<domain ids (Pfam names)>: to extract sequence names of sequences containing matching domains and/or domain-sequences (order N to C) (domain separator: '~', domain sequences speparator: '#', e.g. 'NACHT#BIR~CARD')" );
System.out.println( surfacing.SECONDARY_FEATURES_PARSIMONY_MAP_FILE
+ "=<file>: to perfom parsimony analysis on secondary features" );
System.out.println( surfacing.PLUS_MINUS_ANALYSIS_OPTION + "=<file>: to presence/absence genome analysis" );
+ System.out.println( surfacing.DOMAIN_COMBINITONS_COUNTS_OUTPUT_OPTION
+ + ": to output binary domain counts (as individual files)" );
System.out.println( surfacing.DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS
+ ": to output binary domain combinations for (downstream) graph analysis" );
System.out.println( surfacing.OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS + ": to output all proteins per domain" );
+ ": e value max per domain for output of all proteins per domain" );
System.out.println( surfacing.USE_LAST_IN_FITCH_OPTION + ": to use last in Fitch parsimony" );
System.out.println( surfacing.WRITE_TO_NEXUS_OPTION + ": to output in Nexus format" );
+ System.out.println( PERFORM_DC_FITCH + ": to perform DC Fitch parsimony" );
System.out.println( PERFORM_DC_REGAIN_PROTEINS_STATS_OPTION + ": to perform DC regain protein statistics" );
- System.out.println( DA_ANALYSIS_OPTION + ": to do DA analysis" );
+ System.out.println( DA_ANALYSIS_OPTION + ": to perform DA analysis" );
+ System.out.println( PERFORM_DOMAIN_LENGTH_ANALYSIS_OPTION + ": to perform domain length analysis" );
System.out.println();
- System.out.println( "Example 1: java -Xms128m -Xmx512m -cp path/to/forester.jar"
- + " org.forester.application.surfacing p2g=pfam2go_2012_02_07.txt -dufs -cos=Pfam_260_NC1"
- + " -no_eo -mo=0 -genomes=eukaryotes.txt -out_dir=out -o=o "
- + " -species_tree=tol.xml -obo=gene_ontology_2012_02_07.obo -pos_filter=f.txt -all_prot" );
System.out.println();
- System.out.println( "Example 2: java -Xms128m -Xmx512m -cp path/to/forester.jar"
- + " org.forester.application.surfacing -detail=punctilious -o=TEST.html -pwc=TEST"
- + " -cos=Pfam_ls_22_TC2 -p2g=pfam2go -obo=gene_ontology_edit.obo "
- + "-dc_sort=dom -ignore_with_self -no_singles -e=0.001 -mo=1 -no_eo -genomes=eukaryotes.txt "
- + "-ds_output=detailed_html -scoring=domains -sort=alpha " );
+ System.out
+ .println( "Example 1: surfacing -p2g=pfam2go.txt -obo=go.obo -species_tree=tol_156.xml -no_eo -ie=0.01 -dufs -genomes=genomes_all.txt -pos_filter=tf_1.txt -out_dir=_tf1 -o=tf1" );
+ System.out.println();
+ System.out
+ .println( "Example 2: surfacing -p2g=pfam2go.txt -obo=go.obo -species_tree=tol_156.xml -last -ignore_viral_ids -no_eo -ie=0.1 -dufs -genomes=genomes_all.txt -pos_filter=tf_1.txt -all_prot -all_prot_e=0.1 -out_dir=_tf1_e01_ape01 -o=tf1_e01_ape01" );
+ System.out.println();
+ System.out
+ .println( "Example 3: surfacing -species_tree=master_tree.xml -no_eo -ie=1e-6 -mrel=0.5 -mo=10 -dufs -genomes=genomes.txt -out_dir=a605 -o=a605" );
+ System.out.println();
+ System.out
+ .println( "[next step for phylogenomic analysis pipeline (example, in \"DAS\" dir): % mse.rb .prot . FL_seqs DA_seqs path/to/genome_locations.txt]" );
System.out.println();
}
}