X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fapplication%2Fsurfacing.java;h=fc98702421f0aebc64683317e0dbb2e1267c9f38;hb=7d5ee914b2beb6ec20aa906326113bba36551a4d;hp=f15ea369d6d715f20ef9fbb485b0cc1b64613f47;hpb=47608e88e2f88eb5729d4eceb47af7e402495b27;p=jalview.git diff --git a/forester/java/src/org/forester/application/surfacing.java b/forester/java/src/org/forester/application/surfacing.java index f15ea36..fc98702 100644 --- a/forester/java/src/org/forester/application/surfacing.java +++ b/forester/java/src/org/forester/application/surfacing.java @@ -54,6 +54,7 @@ import org.forester.go.PfamToGoMapping; import org.forester.go.PfamToGoParser; import org.forester.io.parsers.HmmscanPerDomainTableParser; import org.forester.io.parsers.HmmscanPerDomainTableParser.INDIVIDUAL_SCORE_CUTOFF; +import org.forester.io.parsers.phyloxml.PhyloXmlUtil; import org.forester.io.parsers.util.ParserUtils; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyMethods; @@ -62,7 +63,6 @@ import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.protein.BinaryDomainCombination; import org.forester.protein.Domain; -import org.forester.protein.DomainId; import org.forester.protein.Protein; import org.forester.species.BasicSpecies; import org.forester.species.Species; @@ -187,6 +187,9 @@ public class surfacing { final static private String MAX_ALLOWED_OVERLAP_OPTION = "mo"; final static private String NO_ENGULFING_OVERLAP_OPTION = "no_eo"; final static private String IGNORE_COMBINATION_WITH_SAME_OPTION = "ignore_self_comb"; + final static private String PERFORM_DC_REGAIN_PROTEINS_STATS_OPTION = "dc_regain_stats"; + final static private String DA_ANALYSIS_OPTION = "DA_analyis"; + final static private String USE_LAST_IN_FITCH_OPTION = "last"; final static private String PAIRWISE_DOMAIN_COMPARISONS_PREFIX = "pwc_"; final static private String PAIRWISE_DOMAIN_COMPARISONS_OPTION = "pwc"; final static private String OUTPUT_FILE_OPTION = "o"; @@ -216,20 +219,14 @@ public class surfacing { + ForesterConstants.PHYLO_XML_SUFFIX; final static private String NJ_TREE_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX = "_bin_combinations_NJ" + ForesterConstants.PHYLO_XML_SUFFIX; - final static private String JACKNIFE_OPTION = "jack"; - final static private String JACKNIFE_RANDOM_SEED_OPTION = "seed"; - final static private String JACKNIFE_RATIO_OPTION = "jack_ratio"; - private static final int JACKNIFE_NUMBER_OF_RESAMPLINGS_DEFAULT = 100; - final static private long JACKNIFE_RANDOM_SEED_DEFAULT = 19; - final static private double JACKNIFE_RATIO_DEFAULT = 0.5; final static private String FILTER_POSITIVE_OPTION = "pos_filter"; final static private String FILTER_NEGATIVE_OPTION = "neg_filter"; final static private String FILTER_NEGATIVE_DOMAINS_OPTION = "neg_dom_filter"; final static private String INPUT_GENOMES_FILE_OPTION = "genomes"; final static private String INPUT_SPECIES_TREE_OPTION = "species_tree"; final static private String SEQ_EXTRACT_OPTION = "prot_extract"; - final static private String PRG_VERSION = "2.260"; - final static private String PRG_DATE = "130721"; + final static private String PRG_VERSION = "2.280"; + final static private String PRG_DATE = "130701"; final static private String E_MAIL = "czmasek@burnham.org"; final static private String WWW = "www.phylosoft.org/forester/applications/surfacing"; final static private boolean 
IGNORE_DUFS_DEFAULT = true; @@ -268,6 +265,7 @@ public class surfacing { private static final String LOG_FILE_SUFFIX = "_log.txt"; private static final String DATA_FILE_SUFFIX = "_domain_combination_data.txt"; private static final String DATA_FILE_DESC = "#SPECIES\tPRTEIN_ID\tN_TERM_DOMAIN\tC_TERM_DOMAIN\tN_TERM_DOMAIN_PER_DOMAIN_E_VALUE\tC_TERM_DOMAIN_PER_DOMAIN_E_VALUE\tN_TERM_DOMAIN_COUNTS_PER_PROTEIN\tC_TERM_DOMAIN_COUNTS_PER_PROTEIN"; + private static final String WRITE_TO_NEXUS_OPTION = "nexus"; private static final INDIVIDUAL_SCORE_CUTOFF INDIVIDUAL_SCORE_CUTOFF_DEFAULT = INDIVIDUAL_SCORE_CUTOFF.FULL_SEQUENCE; //TODO look at me! change? public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_counts.txt"; public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists.txt"; @@ -280,8 +278,6 @@ public class surfacing { public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_MAPPED.txt"; public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_MAPPED.txt"; public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_unique_MAPPED.txt"; - private static final boolean PERFORM_DC_REGAIN_PROTEINS_STATS = true; - private static final boolean DA_ANALYSIS = false; private static void checkWriteabilityForPairwiseComparisons( final PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option, final String[][] input_file_properties, @@ -367,8 +363,8 @@ public class surfacing { final Writer out = ForesterUtil.createBufferedWriter( output_file ); final SortedMap bdc_to_counts = ForesterUtil .listToSortedCountsMap( all_bin_domain_combinations_changed ); - final SortedSet all_domains_in_combination_changed_more_than_once = new TreeSet(); - final SortedSet all_domains_in_combination_changed_only_once = new TreeSet(); + final SortedSet all_domains_in_combination_changed_more_than_once = new TreeSet(); + final SortedSet all_domains_in_combination_changed_only_once = new TreeSet(); int above_one = 0; int one = 0; for( final Object bdc_object : bdc_to_counts.keySet() ) { @@ -436,7 +432,7 @@ public class surfacing { final List plus_minus_analysis_low_copy, final List gwcd_list, final SortedMap> protein_lists_per_species, - final Map> domain_id_to_go_ids_map, + final Map> domain_id_to_go_ids_map, final Map go_id_to_term_map, final List plus_minus_analysis_numbers ) { final Set all_spec = new HashSet(); @@ -604,9 +600,9 @@ public class surfacing { allowed_options.add( surfacing.PAIRWISE_DOMAIN_COMPARISONS_OPTION ); allowed_options.add( surfacing.IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_OPTION ); allowed_options.add( surfacing.CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS ); - allowed_options.add( JACKNIFE_OPTION ); - allowed_options.add( JACKNIFE_RANDOM_SEED_OPTION ); - allowed_options.add( JACKNIFE_RATIO_OPTION ); + //allowed_options.add( JACKNIFE_OPTION ); + // allowed_options.add( JACKNIFE_RANDOM_SEED_OPTION ); + // allowed_options.add( JACKNIFE_RATIO_OPTION ); allowed_options.add( INPUT_SPECIES_TREE_OPTION ); allowed_options.add( FILTER_POSITIVE_OPTION ); allowed_options.add( FILTER_NEGATIVE_OPTION ); @@ -621,6 +617,10 @@ public class surfacing { allowed_options.add( DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS ); allowed_options.add( 
OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS ); allowed_options.add( CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS_AND_ADJACENCY ); + allowed_options.add( WRITE_TO_NEXUS_OPTION ); + allowed_options.add( PERFORM_DC_REGAIN_PROTEINS_STATS_OPTION ); + allowed_options.add( DA_ANALYSIS_OPTION ); + allowed_options.add( USE_LAST_IN_FITCH_OPTION ); boolean ignore_dufs = surfacing.IGNORE_DUFS_DEFAULT; boolean ignore_combination_with_same = surfacing.IGNORE_COMBINATION_WITH_SAME_DEFAULLT; double e_value_max = surfacing.MAX_E_VALUE_DEFAULT; @@ -629,6 +629,22 @@ public class surfacing { if ( dissallowed_options.length() > 0 ) { ForesterUtil.fatalError( surfacing.PRG_NAME, "unknown option(s): " + dissallowed_options ); } + boolean use_last_in_fitch_parsimony = false; + if ( cla.isOptionSet( USE_LAST_IN_FITCH_OPTION ) ) { + use_last_in_fitch_parsimony = true; + } + boolean write_to_nexus = false; + if ( cla.isOptionSet( WRITE_TO_NEXUS_OPTION ) ) { + write_to_nexus = true; + } + boolean perform_dc_regain_proteins_stats = false; + if ( cla.isOptionSet( PERFORM_DC_REGAIN_PROTEINS_STATS_OPTION ) ) { + perform_dc_regain_proteins_stats = true; + } + boolean da_analysis = false; + if ( cla.isOptionSet( DA_ANALYSIS_OPTION ) ) { + da_analysis = true; + } boolean output_binary_domain_combinationsfor_graph_analysis = false; if ( cla.isOptionSet( DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS ) ) { output_binary_domain_combinationsfor_graph_analysis = true; @@ -1019,12 +1035,6 @@ public class surfacing { } } final String[][] input_file_properties = processInputGenomesFile( input_genomes_file ); - for( final String[] input_file_propertie : input_file_properties ) { - for( final String element : input_file_propertie ) { - System.out.print( element + " " ); - } - System.out.println(); - } final int number_of_genomes = input_file_properties.length; if ( number_of_genomes < 2 ) { ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot analyze less than two files" ); @@ -1047,7 +1057,7 @@ public class surfacing { SurfacingUtil.checkForOutputFileWriteability( dcc_outfile ); } File pfam_to_go_file = null; - Map> domain_id_to_go_ids_map = null; + Map> domain_id_to_go_ids_map = null; int domain_id_to_go_ids_count = 0; if ( cla.isOptionSet( surfacing.PFAM_TO_GO_FILE_USE_OPTION ) ) { if ( !cla.isOptionValueSet( surfacing.PFAM_TO_GO_FILE_USE_OPTION ) ) { @@ -1145,56 +1155,9 @@ public class surfacing { && ( number_of_genomes > 2 ) ) { domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.ABS_MAX_COUNTS_DIFFERENCE; } - boolean jacknifed_distances = false; - int jacknife_resamplings = JACKNIFE_NUMBER_OF_RESAMPLINGS_DEFAULT; - double jacknife_ratio = JACKNIFE_RATIO_DEFAULT; - long random_seed = JACKNIFE_RANDOM_SEED_DEFAULT; - if ( cla.isOptionSet( surfacing.JACKNIFE_OPTION ) ) { - if ( ( number_of_genomes < 3 ) || !perform_pwc ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot use jacknife resampling analysis (-" - + surfacing.JACKNIFE_OPTION + "[=]) without pairwise analyses (" - + surfacing.PAIRWISE_DOMAIN_COMPARISONS_OPTION - + "=)" ); - } - jacknifed_distances = true; - if ( cla.isOptionHasAValue( surfacing.JACKNIFE_OPTION ) ) { - try { - jacknife_resamplings = cla.getOptionValueAsInt( surfacing.JACKNIFE_OPTION ); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "illegal format for number of resamplings" ); - } - if ( jacknife_resamplings < 2 ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "attempt to use less than 2 resamplings" ); - } - } - if ( cla.isOptionSet( 
surfacing.JACKNIFE_RATIO_OPTION ) - && cla.isOptionHasAValue( surfacing.JACKNIFE_RATIO_OPTION ) ) { - try { - jacknife_ratio = cla.getOptionValueAsDouble( surfacing.JACKNIFE_RATIO_OPTION ); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "illegal format for jacknife ratio" ); - } - if ( ( jacknife_ratio <= 0.0 ) || ( jacknife_ratio >= 1.0 ) ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "attempt to use illegal value for jacknife ratio: " - + jacknife_ratio ); - } - } - if ( cla.isOptionSet( surfacing.JACKNIFE_RANDOM_SEED_OPTION ) - && cla.isOptionHasAValue( surfacing.JACKNIFE_RANDOM_SEED_OPTION ) ) { - try { - random_seed = cla.getOptionValueAsLong( surfacing.JACKNIFE_RANDOM_SEED_OPTION ); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "illegal format for random generator seed" ); - } - } - } File[] intree_files = null; Phylogeny[] intrees = null; if ( cla.isOptionSet( surfacing.INPUT_SPECIES_TREE_OPTION ) ) { - // TODO FIXME if jacknife.... maybe not if ( number_of_genomes < 3 ) { ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot infer gains and losses on input species trees (-" + surfacing.INPUT_SPECIES_TREE_OPTION + " without pairwise analyses (" @@ -1236,10 +1199,10 @@ public class surfacing { } radomize_fitch_parsimony = true; } - SortedSet filter = null; + SortedSet filter = null; if ( ( positive_filter_file != null ) || ( negative_filter_file != null ) || ( negative_domains_filter_file != null ) ) { - filter = new TreeSet(); + filter = new TreeSet(); if ( positive_filter_file != null ) { processFilter( positive_filter_file, filter ); } @@ -1250,7 +1213,7 @@ public class surfacing { processFilter( negative_domains_filter_file, filter ); } } - Map>[] domain_id_to_secondary_features_maps = null; + Map>[] domain_id_to_secondary_features_maps = null; File[] secondary_features_map_files = null; final File domain_lengths_analysis_outfile = new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file + DOMAIN_LENGTHS_ANALYSIS_SUFFIX ); @@ -1398,19 +1361,34 @@ public class surfacing { + ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT ) + "" + nl ); } + System.out.println( "Use last in Fitch parsimony : " + use_last_in_fitch_parsimony ); + html_desc.append( "Use last in Fitch parsimony:" + use_last_in_fitch_parsimony + "" + + nl ); + System.out.println( "Write to Nexus files : " + write_to_nexus ); + html_desc.append( "Write to Nexus files:" + write_to_nexus + "" + nl ); + System.out.println( "DC regain prot stats : " + perform_dc_regain_proteins_stats ); + html_desc.append( "DC regain prot stats:" + perform_dc_regain_proteins_stats + "" + + nl ); + System.out.println( "DA analysis : " + da_analysis ); + html_desc.append( "DA analysis :" + da_analysis + "" + nl ); System.out.print( "Domain counts sort order : " ); + html_desc.append( "Domain counts sort order:" ); switch ( dc_sort_order ) { case ALPHABETICAL_KEY_ID: System.out.println( "alphabetical" ); + html_desc.append( "alphabetical" + "" + nl ); break; case KEY_DOMAIN_COUNT: System.out.println( "domain count" ); + html_desc.append( "domain count" + "" + nl ); break; case KEY_DOMAIN_PROTEINS_COUNT: System.out.println( "domain proteins count" ); + html_desc.append( "domain proteins count" + "" + nl ); break; case COMBINATIONS_COUNT: System.out.println( "domain combinations count" ); + html_desc.append( "domain combinations count" + "" + nl ); break; default: ForesterUtil.unexpectedFatalError( surfacing.PRG_NAME, "unknown value for
dc sort order" ); @@ -1574,15 +1552,6 @@ public class surfacing { } System.out.println(); html_desc.append( "" + nl ); - if ( jacknifed_distances ) { - html_desc.append( "Jacknife:" + jacknife_resamplings + " resamplings" + nl ); - html_desc.append( "Jacknife ratio:" + ForesterUtil.round( jacknife_ratio, 2 ) - + "" + nl ); - html_desc.append( "Jacknife random number seed:" + random_seed + "" + nl ); - System.out.println( " Jacknife : " + jacknife_resamplings + " resamplings" ); - System.out.println( " Ratio : " + ForesterUtil.round( jacknife_ratio, 2 ) ); - System.out.println( " Random number seed : " + random_seed ); - } if ( ( intrees != null ) && ( intrees.length > 0 ) ) { for( final File intree_file : intree_files ) { html_desc.append( "Intree for gain/loss parsimony analysis:" + intree_file @@ -1604,8 +1573,8 @@ public class surfacing { if ( VERBOSE ) { System.out.println(); System.out.println( "Domain ids to secondary features map:" ); - for( final DomainId domain_id : domain_id_to_secondary_features_maps[ i ].keySet() ) { - System.out.print( domain_id.getId() ); + for( final String domain_id : domain_id_to_secondary_features_maps[ i ].keySet() ) { + System.out.print( domain_id ); System.out.print( " => " ); for( final String sec : domain_id_to_secondary_features_maps[ i ].get( domain_id ) ) { System.out.print( sec ); @@ -1621,7 +1590,7 @@ public class surfacing { html_desc.append( "Command line:\n" + cla.getCommandLineArgsAsString() + "\n" + nl ); System.out.println( "Command line : " + cla.getCommandLineArgsAsString() ); BufferedWriter[] query_domains_writer_ary = null; - List[] query_domain_ids_array = null; + List[] query_domain_ids_array = null; if ( query_domain_ids != null ) { final String[] query_domain_ids_str_array = query_domain_ids.split( "#" ); query_domain_ids_array = new ArrayList[ query_domain_ids_str_array.length ]; @@ -1629,9 +1598,9 @@ public class surfacing { for( int i = 0; i < query_domain_ids_str_array.length; i++ ) { String query_domain_ids_str = query_domain_ids_str_array[ i ]; final String[] query_domain_ids_str_ary = query_domain_ids_str.split( "~" ); - final List query = new ArrayList(); + final List query = new ArrayList(); for( final String element : query_domain_ids_str_ary ) { - query.add( new DomainId( element ) ); + query.add( element ); } query_domain_ids_array[ i ] = query; query_domain_ids_str = query_domain_ids_str.replace( '~', '_' ); @@ -1656,8 +1625,8 @@ public class surfacing { if ( need_protein_lists_per_species ) { protein_lists_per_species = new TreeMap>(); } - final List gwcd_list = new ArrayList( number_of_genomes ); - final SortedSet all_domains_encountered = new TreeSet(); + List gwcd_list = new ArrayList( number_of_genomes ); + final SortedSet all_domains_encountered = new TreeSet(); final SortedSet all_bin_domain_combinations_encountered = new TreeSet(); List all_bin_domain_combinations_gained_fitch = null; List all_bin_domain_combinations_lost_fitch = null; @@ -1732,7 +1701,7 @@ public class surfacing { Map protein_length_stats_by_dc = null; Map domain_number_stats_by_dc = null; final Map domain_length_stats_by_domain = new HashMap(); - if ( PERFORM_DC_REGAIN_PROTEINS_STATS ) { + if ( perform_dc_regain_proteins_stats ) { protein_length_stats_by_dc = new HashMap(); domain_number_stats_by_dc = new HashMap(); } @@ -1809,7 +1778,7 @@ public class surfacing { final double coverage = ( double ) protein_list.size() / parser.getProteinsEncountered(); protein_coverage_stats.addValue( coverage ); int distinct_das = -1; - if ( DA_ANALYSIS ) { 
+ if ( da_analysis ) { final String genome = input_file_properties[ i ][ 0 ]; distinct_das = SurfacingUtil.storeDomainArchitectures( genome, distinct_domain_architecutures_per_genome, @@ -1869,7 +1838,7 @@ public class surfacing { log( "Proteins ignored due to positive filter : " + parser.getProteinsIgnoredDueToFilter(), log_writer ); } - if ( DA_ANALYSIS ) { + if ( da_analysis ) { System.out.println( "Distinct domain architectures stored : " + distinct_das ); log( "Distinct domain architectures stored : " + distinct_das, log_writer ); } @@ -1919,7 +1888,7 @@ public class surfacing { domains_which_never_single, domains_per_potein_stats_writer ); domain_lengths_table.addLengths( protein_list ); - if ( !DA_ANALYSIS ) { + if ( !da_analysis ) { gwcd_list.add( BasicGenomeWideCombinableDomains .createInstance( protein_list, ignore_combination_with_same, @@ -1976,7 +1945,7 @@ public class surfacing { ForesterUtil.programMessage( PRG_NAME, "Wrote domain promiscuities to: " + per_genome_domain_promiscuity_statistics_file ); // - if ( DA_ANALYSIS ) { + if ( da_analysis ) { SurfacingUtil.performDomainArchitectureAnalysis( distinct_domain_architecutures_per_genome, distinct_domain_architecuture_counts, 10, @@ -2093,7 +2062,7 @@ public class surfacing { go_annotation_output, go_id_to_term_map, go_namespace_limit ); - DescriptiveStatistics pw_stats = null; + final Map tax_code_to_id_map = SurfacingUtil.createTaxCodeToIdMap( intrees[ 0 ] ); try { String my_outfile = output_file.toString(); Map split_writers = null; @@ -2124,7 +2093,7 @@ public class surfacing { + new java.text.SimpleDateFormat( "yyyy.MM.dd HH:mm:ss" ).format( new java.util.Date() ) + "" + nl ); html_desc.append( "" + nl ); - pw_stats = SurfacingUtil + final DescriptiveStatistics pw_stats = SurfacingUtil .writeDomainSimilaritiesToFile( html_desc, new StringBuilder( number_of_genomes + " genomes" ), writer, @@ -2135,7 +2104,8 @@ public class surfacing { domain_similarity_print_option, domain_similarity_sort_field, scoring, - true ); + true, + tax_code_to_id_map ); ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote main output (includes domain similarities) to: \"" + ( out_dir == null ? my_outfile : out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile ) + "\"" ); } @@ -2144,7 +2114,6 @@ public class surfacing { + e.getMessage() + "]" ); } System.out.println(); - // values_for_all_scores_histogram = pw_stats.getDataAsDoubleArray(); final Species[] species = new Species[ number_of_genomes ]; for( int i = 0; i < number_of_genomes; ++i ) { species[ i ] = new BasicSpecies( input_file_properties[ i ][ 1 ] ); @@ -2172,7 +2141,8 @@ public class surfacing { surfacing.PAIRWISE_DOMAIN_COMPARISONS_PREFIX, surfacing.PRG_NAME, out_dir, - write_pwc_files ); + write_pwc_files, + tax_code_to_id_map ); String matrix_output_file = new String( output_file.toString() ); if ( matrix_output_file.indexOf( '.' ) > 1 ) { matrix_output_file = matrix_output_file.substring( 0, matrix_output_file.indexOf( '.' 
) ); @@ -2203,40 +2173,13 @@ public class surfacing { inferred_trees.add( nj_gd ); inferred_trees.add( nj_bc ); inferred_trees.add( nj_d ); - if ( jacknifed_distances ) { - pwgc.performPairwiseComparisonsJacknifed( species, - number_of_genomes, - gwcd_list, - true, - jacknife_resamplings, - jacknife_ratio, - random_seed ); - SurfacingUtil - .writeMatrixToFile( new File( matrix_output_file - + "_" - + ForesterUtil.round( jacknife_ratio, 2 ) - + "_" - + jacknife_resamplings - + surfacing.MATRIX_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX ), - pwgc.getSharedBinaryCombinationsBasedDistances() ); - SurfacingUtil - .writeMatrixToFile( new File( matrix_output_file + "_" + ForesterUtil.round( jacknife_ratio, 2 ) - + "_" + jacknife_resamplings - + surfacing.MATRIX_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX ), - pwgc.getSharedDomainsBasedDistances() ); - // if ( infer_species_trees ) { - // inferSpeciesTrees( new File( output_file + "_" + jacknife_resamplings - // + INFERRED_SBC_BASED_NJ_SPECIES_TREE_SUFFIX ), pwgc - // .getSharedBinaryCombinationsBasedDistances() ); - // inferSpeciesTrees( new File( output_file + "_" + jacknife_resamplings - // + INFERRED_SD_BASED_NJ_SPECIES_TREE_SUFFIX ), pwgc.getSharedDomainsBasedDistances() ); - // } - } } // if ( ( output_file != null ) && ( number_of_genomes > 2 ) && !isEmpty( automated_pairwise_comparison_suffix ) ) if ( ( out_dir != null ) && ( !perform_pwc ) ) { output_file = new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file ); } - writePresentToNexus( output_file, positive_filter_file, filter, gwcd_list ); + if ( write_to_nexus ) { + writePresentToNexus( output_file, positive_filter_file, filter, gwcd_list ); + } if ( ( ( intrees != null ) && ( intrees.length > 0 ) ) && ( number_of_genomes > 2 ) ) { final StringBuilder parameters_sb = createParametersAsString( ignore_dufs, e_value_max, @@ -2271,12 +2214,15 @@ public class surfacing { dc_type, protein_length_stats_by_dc, domain_number_stats_by_dc, - domain_length_stats_by_domain ); + domain_length_stats_by_domain, + tax_code_to_id_map, + write_to_nexus, + use_last_in_fitch_parsimony ); // Listing of all domain combinations gained is only done if only one input tree is used. 
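The changes in this and the surrounding hunks replace org.forester.protein.DomainId with plain String keys throughout (the import is removed near the top of the diff, and the filters, query domain lists and GO mappings switch to String), while the plain-text rendering drops the generic type parameters from the affected declarations. Below is a minimal, self-contained sketch of how such String-keyed collections read with the type parameters written out; the class name, the Pfam accession and the use of String in place of GoId are illustrative assumptions, not part of the commit.

    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    import java.util.SortedSet;
    import java.util.TreeSet;

    // Sketch of the String-keyed collections used after this change; the generic
    // type parameters are spelled out here because the rendering above omits them.
    public class DomainIdAsStringSketch {

        public static void main( final String[] args ) {
            // Positive/negative domain filters become sorted sets of plain domain id strings.
            final SortedSet<String> filter = new TreeSet<String>();
            filter.add( "PF00069" ); // hypothetical Pfam accession, for illustration only
            // Pfam-to-GO style lookups are keyed by the same plain strings
            // (GoId is simplified to String in this sketch).
            final Map<String, List<String>> domain_id_to_go_ids_map = new HashMap<String, List<String>>();
            domain_id_to_go_ids_map.put( "PF00069", new ArrayList<String>() );
            System.out.println( filter.contains( "PF00069" ) );
        }
    }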
if ( ( domain_id_to_secondary_features_maps != null ) && ( domain_id_to_secondary_features_maps.length > 0 ) ) { int j = 0; - for( final Map> domain_id_to_secondary_features_map : domain_id_to_secondary_features_maps ) { + for( final Map> domain_id_to_secondary_features_map : domain_id_to_secondary_features_maps ) { final Map mapping_results_map = new TreeMap(); final DomainParsimonyCalculator secondary_features_parsimony = DomainParsimonyCalculator .createInstance( intree, gwcd_list, domain_id_to_secondary_features_map ); @@ -2287,7 +2233,8 @@ public class surfacing { secondary_features_parsimony, intree, parameters_sb.toString(), - mapping_results_map ); + mapping_results_map, + use_last_in_fitch_parsimony ); if ( i == 0 ) { System.out.println(); System.out.println( "Mapping to secondary features:" ); @@ -2329,6 +2276,7 @@ public class surfacing { gwcd_list, output_list_of_all_proteins_per_domain_e_value_max ); } + gwcd_list = null; if ( all_bin_domain_combinations_gained_fitch != null ) { try { executeFitchGainsAnalysis( new File( output_file @@ -2544,17 +2492,6 @@ public class surfacing { System.out.println( surfacing.INPUT_SPECIES_TREE_OPTION + ": species tree, to perform (Dollo, Fitch) parismony analyses" ); System.out - .println( JACKNIFE_OPTION - + ": perform jacknife resampling for domain and binary domain combination based distance matrices [default resamplings: " - + JACKNIFE_NUMBER_OF_RESAMPLINGS_DEFAULT + "]" ); - System.out.println( JACKNIFE_RATIO_OPTION + ": ratio for jacknife resampling [default: " - + JACKNIFE_RATIO_DEFAULT + "]" ); - System.out.println( JACKNIFE_RANDOM_SEED_OPTION - + ": seed for random number generator for jacknife resampling [default: " - + JACKNIFE_RANDOM_SEED_DEFAULT + "]" ); - // System.out.println( surfacing.INFER_SPECIES_TREES_OPTION - // + ": to infer NJ species trees based on shared domains/binary domain combinations" ); - System.out .println( surfacing.INPUT_SPECIES_TREE_OPTION + "=: to infer domain/binary domain combination gains/losses on given species trees" ); System.out.println( surfacing.FILTER_POSITIVE_OPTION @@ -2582,6 +2519,10 @@ public class surfacing { System.out.println( surfacing.OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS + ": to output all proteins per domain" ); System.out.println( surfacing.OUTPUT_LIST_OF_ALL_PROTEINS_PER_DOMAIN_E_VALUE_OPTION + ": e value max per domain for output of all proteins per domain" ); + System.out.println( surfacing.USE_LAST_IN_FITCH_OPTION + ": to use last in Fitch parsimony" ); + System.out.println( surfacing.WRITE_TO_NEXUS_OPTION + ": to output in Nexus format" ); + System.out.println( PERFORM_DC_REGAIN_PROTEINS_STATS_OPTION + ": to perform DC regain protein statistics" ); + System.out.println( DA_ANALYSIS_OPTION + ": to do DA analysis" ); System.out.println(); System.out.println( "Example 1: java -Xms128m -Xmx512m -cp path/to/forester.jar" + " org.forester.application.surfacing p2g=pfam2go_2012_02_07.txt -dufs -cos=Pfam_260_NC1" @@ -2596,7 +2537,7 @@ public class surfacing { System.out.println(); } - private static void processFilter( final File filter_file, final SortedSet filter ) { + private static void processFilter( final File filter_file, final SortedSet filter ) { SortedSet filter_str = null; try { filter_str = ForesterUtil.file2set( filter_file ); @@ -2606,13 +2547,13 @@ public class surfacing { } if ( filter_str != null ) { for( final String string : filter_str ) { - filter.add( new DomainId( string ) ); + filter.add( string ); } } if ( VERBOSE ) { System.out.println( "Filter:" ); - for( final 
DomainId domainId : filter ) { - System.out.println( domainId.getId() ); + for( final String domainId : filter ) { + System.out.println( domainId ); } } } @@ -2627,7 +2568,23 @@ public class surfacing { "genomes files is to be in the following format \" \": " + e.getLocalizedMessage() ); } + final Set specs = new HashSet(); + final Set paths = new HashSet(); for( int i = 0; i < input_file_properties.length; ++i ) { + if ( !PhyloXmlUtil.TAXOMONY_CODE_PATTERN.matcher( input_file_properties[ i ][ 1 ] ).matches() ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "illegal format for species code: " + + input_file_properties[ i ][ 1 ] ); + } + if ( specs.contains( input_file_properties[ i ][ 1 ] ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "species code " + input_file_properties[ i ][ 1 ] + + " is not unique" ); + } + specs.add( input_file_properties[ i ][ 1 ] ); + if ( paths.contains( input_file_properties[ i ][ 0 ] ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "path " + input_file_properties[ i ][ 0 ] + + " is not unique" ); + } + paths.add( input_file_properties[ i ][ 0 ] ); final String error = ForesterUtil.isReadableFile( new File( input_file_properties[ i ][ 0 ] ) ); if ( !ForesterUtil.isEmpty( error ) ) { ForesterUtil.fatalError( surfacing.PRG_NAME, error ); @@ -2736,7 +2693,7 @@ public class surfacing { private static void writePresentToNexus( final File output_file, final File positive_filter_file, - final SortedSet filter, + final SortedSet filter, final List gwcd_list ) { try { SurfacingUtil @@ -2756,11 +2713,11 @@ public class surfacing { final SortedMap> protein_lists_per_species, final List gwcd_list, final double domain_e_cutoff ) { - final SortedSet all_domains = new TreeSet(); + final SortedSet all_domains = new TreeSet(); for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { all_domains.addAll( gwcd.getAllDomainIds() ); } - for( final DomainId domain : all_domains ) { + for( final String domain : all_domains ) { final File out = new File( output_dir + ForesterUtil.FILE_SEPARATOR + domain + SEQ_EXTRACT_SUFFIX ); SurfacingUtil.checkForOutputFileWriteability( out ); try {