X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fapplication%2Fsurfacing.java;h=10a615ff967c5c40e4f1358849b3a4e5467e4349;hb=249882688f35000b8cedfff3b4da1845d749d18e;hp=f2f0c9e5d4ea7b36a1fcc8e2ae255d8c5485995d;hpb=cbc5c71b164a57b8ad6c988d015057c7f0972478;p=jalview.git diff --git a/forester/java/src/org/forester/application/surfacing.java b/forester/java/src/org/forester/application/surfacing.java index f2f0c9e..10a615f 100644 --- a/forester/java/src/org/forester/application/surfacing.java +++ b/forester/java/src/org/forester/application/surfacing.java @@ -63,7 +63,6 @@ import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.protein.BinaryDomainCombination; import org.forester.protein.Domain; -import org.forester.protein.DomainId; import org.forester.protein.Protein; import org.forester.species.BasicSpecies; import org.forester.species.Species; @@ -188,6 +187,9 @@ public class surfacing { final static private String MAX_ALLOWED_OVERLAP_OPTION = "mo"; final static private String NO_ENGULFING_OVERLAP_OPTION = "no_eo"; final static private String IGNORE_COMBINATION_WITH_SAME_OPTION = "ignore_self_comb"; + final static private String PERFORM_DC_REGAIN_PROTEINS_STATS_OPTION = "dc_regain_stats"; + final static private String DA_ANALYSIS_OPTION = "DA_analyis"; + final static private String USE_LAST_IN_FITCH_OPTION = "last"; final static private String PAIRWISE_DOMAIN_COMPARISONS_PREFIX = "pwc_"; final static private String PAIRWISE_DOMAIN_COMPARISONS_OPTION = "pwc"; final static private String OUTPUT_FILE_OPTION = "o"; @@ -217,22 +219,16 @@ public class surfacing { + ForesterConstants.PHYLO_XML_SUFFIX; final static private String NJ_TREE_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX = "_bin_combinations_NJ" + ForesterConstants.PHYLO_XML_SUFFIX; - final static private String JACKNIFE_OPTION = "jack"; - final static private String JACKNIFE_RANDOM_SEED_OPTION = "seed"; - final static private String JACKNIFE_RATIO_OPTION = "jack_ratio"; - private static final int JACKNIFE_NUMBER_OF_RESAMPLINGS_DEFAULT = 100; - final static private long JACKNIFE_RANDOM_SEED_DEFAULT = 19; - final static private double JACKNIFE_RATIO_DEFAULT = 0.5; final static private String FILTER_POSITIVE_OPTION = "pos_filter"; final static private String FILTER_NEGATIVE_OPTION = "neg_filter"; final static private String FILTER_NEGATIVE_DOMAINS_OPTION = "neg_dom_filter"; final static private String INPUT_GENOMES_FILE_OPTION = "genomes"; final static private String INPUT_SPECIES_TREE_OPTION = "species_tree"; final static private String SEQ_EXTRACT_OPTION = "prot_extract"; - final static private String PRG_VERSION = "2.260"; - final static private String PRG_DATE = "130721"; + final static private String PRG_VERSION = "2.280"; + final static private String PRG_DATE = "130701"; final static private String E_MAIL = "czmasek@burnham.org"; - final static private String WWW = "www.phylosoft.org/forester/applications/surfacing"; + final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester/surfacing"; final static private boolean IGNORE_DUFS_DEFAULT = true; final static private boolean IGNORE_COMBINATION_WITH_SAME_DEFAULLT = false; final static private double MAX_E_VALUE_DEFAULT = -1; @@ -269,6 +265,7 @@ public class surfacing { private static final String LOG_FILE_SUFFIX = "_log.txt"; private static final String DATA_FILE_SUFFIX = "_domain_combination_data.txt"; private static final String DATA_FILE_DESC = "#SPECIES\tPRTEIN_ID\tN_TERM_DOMAIN\tC_TERM_DOMAIN\tN_TERM_DOMAIN_PER_DOMAIN_E_VALUE\tC_TERM_DOMAIN_PER_DOMAIN_E_VALUE\tN_TERM_DOMAIN_COUNTS_PER_PROTEIN\tC_TERM_DOMAIN_COUNTS_PER_PROTEIN"; + private static final String WRITE_TO_NEXUS_OPTION = "nexus"; private static final INDIVIDUAL_SCORE_CUTOFF INDIVIDUAL_SCORE_CUTOFF_DEFAULT = INDIVIDUAL_SCORE_CUTOFF.FULL_SEQUENCE; //TODO look at me! change? public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_counts.txt"; public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists.txt"; @@ -281,8 +278,6 @@ public class surfacing { public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_MAPPED.txt"; public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_MAPPED.txt"; public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_unique_MAPPED.txt"; - private static final boolean PERFORM_DC_REGAIN_PROTEINS_STATS = true; - private static final boolean DA_ANALYSIS = false; private static void checkWriteabilityForPairwiseComparisons( final PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option, final String[][] input_file_properties, @@ -368,8 +363,8 @@ public class surfacing { final Writer out = ForesterUtil.createBufferedWriter( output_file ); final SortedMap bdc_to_counts = ForesterUtil .listToSortedCountsMap( all_bin_domain_combinations_changed ); - final SortedSet all_domains_in_combination_changed_more_than_once = new TreeSet(); - final SortedSet all_domains_in_combination_changed_only_once = new TreeSet(); + final SortedSet all_domains_in_combination_changed_more_than_once = new TreeSet(); + final SortedSet all_domains_in_combination_changed_only_once = new TreeSet(); int above_one = 0; int one = 0; for( final Object bdc_object : bdc_to_counts.keySet() ) { @@ -437,7 +432,7 @@ public class surfacing { final List plus_minus_analysis_low_copy, final List gwcd_list, final SortedMap> protein_lists_per_species, - final Map> domain_id_to_go_ids_map, + final Map> domain_id_to_go_ids_map, final Map go_id_to_term_map, final List plus_minus_analysis_numbers ) { final Set all_spec = new HashSet(); @@ -605,9 +600,9 @@ public class surfacing { allowed_options.add( surfacing.PAIRWISE_DOMAIN_COMPARISONS_OPTION ); allowed_options.add( surfacing.IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_OPTION ); allowed_options.add( surfacing.CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS ); - allowed_options.add( JACKNIFE_OPTION ); - allowed_options.add( JACKNIFE_RANDOM_SEED_OPTION ); - allowed_options.add( JACKNIFE_RATIO_OPTION ); + //allowed_options.add( JACKNIFE_OPTION ); + // allowed_options.add( JACKNIFE_RANDOM_SEED_OPTION ); + // allowed_options.add( JACKNIFE_RATIO_OPTION ); allowed_options.add( INPUT_SPECIES_TREE_OPTION ); allowed_options.add( FILTER_POSITIVE_OPTION ); allowed_options.add( FILTER_NEGATIVE_OPTION ); @@ -622,6 +617,10 @@ public class surfacing { allowed_options.add( DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS ); allowed_options.add( OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS ); allowed_options.add( CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS_AND_ADJACENCY ); + allowed_options.add( WRITE_TO_NEXUS_OPTION ); + allowed_options.add( PERFORM_DC_REGAIN_PROTEINS_STATS_OPTION ); + allowed_options.add( DA_ANALYSIS_OPTION ); + allowed_options.add( USE_LAST_IN_FITCH_OPTION ); boolean ignore_dufs = surfacing.IGNORE_DUFS_DEFAULT; boolean ignore_combination_with_same = surfacing.IGNORE_COMBINATION_WITH_SAME_DEFAULLT; double e_value_max = surfacing.MAX_E_VALUE_DEFAULT; @@ -630,6 +629,22 @@ public class surfacing { if ( dissallowed_options.length() > 0 ) { ForesterUtil.fatalError( surfacing.PRG_NAME, "unknown option(s): " + dissallowed_options ); } + boolean use_last_in_fitch_parsimony = false; + if ( cla.isOptionSet( USE_LAST_IN_FITCH_OPTION ) ) { + use_last_in_fitch_parsimony = true; + } + boolean write_to_nexus = false; + if ( cla.isOptionSet( WRITE_TO_NEXUS_OPTION ) ) { + write_to_nexus = true; + } + boolean perform_dc_regain_proteins_stats = false; + if ( cla.isOptionSet( PERFORM_DC_REGAIN_PROTEINS_STATS_OPTION ) ) { + perform_dc_regain_proteins_stats = true; + } + boolean da_analysis = false; + if ( cla.isOptionSet( DA_ANALYSIS_OPTION ) ) { + da_analysis = true; + } boolean output_binary_domain_combinationsfor_graph_analysis = false; if ( cla.isOptionSet( DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS ) ) { output_binary_domain_combinationsfor_graph_analysis = true; @@ -1020,12 +1035,6 @@ public class surfacing { } } final String[][] input_file_properties = processInputGenomesFile( input_genomes_file ); - // for( final String[] input_file_propertie : input_file_properties ) { - // for( final String element : input_file_propertie ) { - // System.out.print( element + " " ); - // } - // System.out.println(); - // } final int number_of_genomes = input_file_properties.length; if ( number_of_genomes < 2 ) { ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot analyze less than two files" ); @@ -1048,7 +1057,7 @@ public class surfacing { SurfacingUtil.checkForOutputFileWriteability( dcc_outfile ); } File pfam_to_go_file = null; - Map> domain_id_to_go_ids_map = null; + Map> domain_id_to_go_ids_map = null; int domain_id_to_go_ids_count = 0; if ( cla.isOptionSet( surfacing.PFAM_TO_GO_FILE_USE_OPTION ) ) { if ( !cla.isOptionValueSet( surfacing.PFAM_TO_GO_FILE_USE_OPTION ) ) { @@ -1146,56 +1155,9 @@ public class surfacing { && ( number_of_genomes > 2 ) ) { domain_similarity_sort_field = DomainSimilarity.DomainSimilaritySortField.ABS_MAX_COUNTS_DIFFERENCE; } - boolean jacknifed_distances = false; - int jacknife_resamplings = JACKNIFE_NUMBER_OF_RESAMPLINGS_DEFAULT; - double jacknife_ratio = JACKNIFE_RATIO_DEFAULT; - long random_seed = JACKNIFE_RANDOM_SEED_DEFAULT; - if ( cla.isOptionSet( surfacing.JACKNIFE_OPTION ) ) { - if ( ( number_of_genomes < 3 ) || !perform_pwc ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot use jacknife resampling analysis (-" - + surfacing.JACKNIFE_OPTION + "[=]) without pairwise analyses (" - + surfacing.PAIRWISE_DOMAIN_COMPARISONS_OPTION - + "=)" ); - } - jacknifed_distances = true; - if ( cla.isOptionHasAValue( surfacing.JACKNIFE_OPTION ) ) { - try { - jacknife_resamplings = cla.getOptionValueAsInt( surfacing.JACKNIFE_OPTION ); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "illegal format for number of resamplings" ); - } - if ( jacknife_resamplings < 2 ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "attempt to use less than 2 resamplings" ); - } - } - if ( cla.isOptionSet( surfacing.JACKNIFE_RATIO_OPTION ) - && cla.isOptionHasAValue( surfacing.JACKNIFE_RATIO_OPTION ) ) { - try { - jacknife_ratio = cla.getOptionValueAsDouble( surfacing.JACKNIFE_RATIO_OPTION ); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "illegal format for jacknife ratio" ); - } - if ( ( jacknife_ratio <= 0.0 ) || ( jacknife_ratio >= 1.0 ) ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "attempt to use illegal value for jacknife ratio: " - + jacknife_ratio ); - } - } - if ( cla.isOptionSet( surfacing.JACKNIFE_RANDOM_SEED_OPTION ) - && cla.isOptionHasAValue( surfacing.JACKNIFE_RANDOM_SEED_OPTION ) ) { - try { - random_seed = cla.getOptionValueAsLong( surfacing.JACKNIFE_RANDOM_SEED_OPTION ); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "illegal format for random generator seed" ); - } - } - } File[] intree_files = null; Phylogeny[] intrees = null; if ( cla.isOptionSet( surfacing.INPUT_SPECIES_TREE_OPTION ) ) { - // TODO FIXME if jacknife.... maybe not if ( number_of_genomes < 3 ) { ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot infer gains and losses on input species trees (-" + surfacing.INPUT_SPECIES_TREE_OPTION + " without pairwise analyses (" @@ -1237,10 +1199,10 @@ public class surfacing { } radomize_fitch_parsimony = true; } - SortedSet filter = null; + SortedSet filter = null; if ( ( positive_filter_file != null ) || ( negative_filter_file != null ) || ( negative_domains_filter_file != null ) ) { - filter = new TreeSet(); + filter = new TreeSet(); if ( positive_filter_file != null ) { processFilter( positive_filter_file, filter ); } @@ -1251,7 +1213,7 @@ public class surfacing { processFilter( negative_domains_filter_file, filter ); } } - Map>[] domain_id_to_secondary_features_maps = null; + Map>[] domain_id_to_secondary_features_maps = null; File[] secondary_features_map_files = null; final File domain_lengths_analysis_outfile = new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file + DOMAIN_LENGTHS_ANALYSIS_SUFFIX ); @@ -1399,19 +1361,34 @@ public class surfacing { + ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT ) + "" + nl ); } + System.out.println( "Use last in Fitch parimony : " + use_last_in_fitch_parsimony ); + html_desc.append( "Use last in Fitch parimon:" + use_last_in_fitch_parsimony + "" + + nl ); + System.out.println( "Write to Nexus files : " + write_to_nexus ); + html_desc.append( "Write to Nexus files:" + write_to_nexus + "" + nl ); + System.out.println( "DC regain prot stats : " + perform_dc_regain_proteins_stats ); + html_desc.append( "DC regain prot stats:" + perform_dc_regain_proteins_stats + "" + + nl ); + System.out.println( "DA analysis : " + da_analysis ); + html_desc.append( "DA analysis :" + da_analysis + "" + nl ); System.out.print( "Domain counts sort order : " ); + html_desc.append( "Domain counts sort order:" ); switch ( dc_sort_order ) { case ALPHABETICAL_KEY_ID: System.out.println( "alphabetical" ); + html_desc.append( "alphabetical" + "" + nl ); break; case KEY_DOMAIN_COUNT: System.out.println( "domain count" ); + html_desc.append( "domain count" + "" + nl ); break; case KEY_DOMAIN_PROTEINS_COUNT: System.out.println( "domain proteins count" ); + html_desc.append( "domain proteins count" + "" + nl ); break; case COMBINATIONS_COUNT: System.out.println( "domain combinations count" ); + html_desc.append( "domain combinations count" + "" + nl ); break; default: ForesterUtil.unexpectedFatalError( surfacing.PRG_NAME, "unknown value for dc sort order" ); @@ -1575,15 +1552,6 @@ public class surfacing { } System.out.println(); html_desc.append( "" + nl ); - if ( jacknifed_distances ) { - html_desc.append( "Jacknife:" + jacknife_resamplings + " resamplings" + nl ); - html_desc.append( "Jacknife ratio:" + ForesterUtil.round( jacknife_ratio, 2 ) - + "" + nl ); - html_desc.append( "Jacknife random number seed:" + random_seed + "" + nl ); - System.out.println( " Jacknife : " + jacknife_resamplings + " resamplings" ); - System.out.println( " Ratio : " + ForesterUtil.round( jacknife_ratio, 2 ) ); - System.out.println( " Random number seed : " + random_seed ); - } if ( ( intrees != null ) && ( intrees.length > 0 ) ) { for( final File intree_file : intree_files ) { html_desc.append( "Intree for gain/loss parsimony analysis:" + intree_file @@ -1605,8 +1573,8 @@ public class surfacing { if ( VERBOSE ) { System.out.println(); System.out.println( "Domain ids to secondary features map:" ); - for( final DomainId domain_id : domain_id_to_secondary_features_maps[ i ].keySet() ) { - System.out.print( domain_id.getId() ); + for( final String domain_id : domain_id_to_secondary_features_maps[ i ].keySet() ) { + System.out.print( domain_id ); System.out.print( " => " ); for( final String sec : domain_id_to_secondary_features_maps[ i ].get( domain_id ) ) { System.out.print( sec ); @@ -1622,7 +1590,7 @@ public class surfacing { html_desc.append( "Command line:\n" + cla.getCommandLineArgsAsString() + "\n" + nl ); System.out.println( "Command line : " + cla.getCommandLineArgsAsString() ); BufferedWriter[] query_domains_writer_ary = null; - List[] query_domain_ids_array = null; + List[] query_domain_ids_array = null; if ( query_domain_ids != null ) { final String[] query_domain_ids_str_array = query_domain_ids.split( "#" ); query_domain_ids_array = new ArrayList[ query_domain_ids_str_array.length ]; @@ -1630,9 +1598,9 @@ public class surfacing { for( int i = 0; i < query_domain_ids_str_array.length; i++ ) { String query_domain_ids_str = query_domain_ids_str_array[ i ]; final String[] query_domain_ids_str_ary = query_domain_ids_str.split( "~" ); - final List query = new ArrayList(); + final List query = new ArrayList(); for( final String element : query_domain_ids_str_ary ) { - query.add( new DomainId( element ) ); + query.add( element ); } query_domain_ids_array[ i ] = query; query_domain_ids_str = query_domain_ids_str.replace( '~', '_' ); @@ -1657,8 +1625,8 @@ public class surfacing { if ( need_protein_lists_per_species ) { protein_lists_per_species = new TreeMap>(); } - final List gwcd_list = new ArrayList( number_of_genomes ); - final SortedSet all_domains_encountered = new TreeSet(); + List gwcd_list = new ArrayList( number_of_genomes ); + final SortedSet all_domains_encountered = new TreeSet(); final SortedSet all_bin_domain_combinations_encountered = new TreeSet(); List all_bin_domain_combinations_gained_fitch = null; List all_bin_domain_combinations_lost_fitch = null; @@ -1733,7 +1701,7 @@ public class surfacing { Map protein_length_stats_by_dc = null; Map domain_number_stats_by_dc = null; final Map domain_length_stats_by_domain = new HashMap(); - if ( PERFORM_DC_REGAIN_PROTEINS_STATS ) { + if ( perform_dc_regain_proteins_stats ) { protein_length_stats_by_dc = new HashMap(); domain_number_stats_by_dc = new HashMap(); } @@ -1810,7 +1778,7 @@ public class surfacing { final double coverage = ( double ) protein_list.size() / parser.getProteinsEncountered(); protein_coverage_stats.addValue( coverage ); int distinct_das = -1; - if ( DA_ANALYSIS ) { + if ( da_analysis ) { final String genome = input_file_properties[ i ][ 0 ]; distinct_das = SurfacingUtil.storeDomainArchitectures( genome, distinct_domain_architecutures_per_genome, @@ -1870,20 +1838,19 @@ public class surfacing { log( "Proteins ignored due to positive filter : " + parser.getProteinsIgnoredDueToFilter(), log_writer ); } - if ( DA_ANALYSIS ) { + if ( da_analysis ) { System.out.println( "Distinct domain architectures stored : " + distinct_das ); log( "Distinct domain architectures stored : " + distinct_das, log_writer ); } System.out.println( "Time for processing : " + parser.getTime() + "ms" ); log( "", log_writer ); - html_desc.append( "" + input_file_properties[ i ][ 0 ] + " [species: " - + input_file_properties[ i ][ 1 ] + "]" + ":domains analyzed: " - + parser.getDomainsStored() + "; domains ignored: [ind score cutoffs: " + html_desc.append( "" + input_file_properties[ i ][ 0 ] + ":doms analyzed: " + + parser.getDomainsStored() + "; doms ignored: [ind score cutoffs: " + parser.getDomainsIgnoredDueToIndividualScoreCutoff() + "] [E-value cutoff: " + parser.getDomainsIgnoredDueToEval() + "] [DUF: " + parser.getDomainsIgnoredDueToDuf() - + "] [virus like ids: " + parser.getDomainsIgnoredDueToVirusLikeIds() - + "] [negative domain filter: " + parser.getDomainsIgnoredDueToNegativeDomainFilter() - + "] [overlap: " + parser.getDomainsIgnoredDueToOverlap() + "]" ); + + "] [virus like ids: " + parser.getDomainsIgnoredDueToVirusLikeIds() + "] [negative dom filter: " + + parser.getDomainsIgnoredDueToNegativeDomainFilter() + "] [overlap: " + + parser.getDomainsIgnoredDueToOverlap() + "]" ); if ( negative_filter_file != null ) { html_desc.append( "; proteins ignored due to negative filter: " + parser.getProteinsIgnoredDueToFilter() ); @@ -1920,7 +1887,7 @@ public class surfacing { domains_which_never_single, domains_per_potein_stats_writer ); domain_lengths_table.addLengths( protein_list ); - if ( !DA_ANALYSIS ) { + if ( !da_analysis ) { gwcd_list.add( BasicGenomeWideCombinableDomains .createInstance( protein_list, ignore_combination_with_same, @@ -1977,7 +1944,7 @@ public class surfacing { ForesterUtil.programMessage( PRG_NAME, "Wrote domain promiscuities to: " + per_genome_domain_promiscuity_statistics_file ); // - if ( DA_ANALYSIS ) { + if ( da_analysis ) { SurfacingUtil.performDomainArchitectureAnalysis( distinct_domain_architecutures_per_genome, distinct_domain_architecuture_counts, 10, @@ -2089,12 +2056,8 @@ public class surfacing { gwcd_list, ignore_domains_without_combs_in_all_spec, ignore_species_specific_domains ); - SurfacingUtil.decoratePrintableDomainSimilarities( similarities, - detailedness, - go_annotation_output, - go_id_to_term_map, - go_namespace_limit ); - DescriptiveStatistics pw_stats = null; + SurfacingUtil.decoratePrintableDomainSimilarities( similarities, detailedness ); + final Map tax_code_to_id_map = SurfacingUtil.createTaxCodeToIdMap( intrees[ 0 ] ); try { String my_outfile = output_file.toString(); Map split_writers = null; @@ -2125,7 +2088,7 @@ public class surfacing { + new java.text.SimpleDateFormat( "yyyy.MM.dd HH:mm:ss" ).format( new java.util.Date() ) + "" + nl ); html_desc.append( "" + nl ); - pw_stats = SurfacingUtil + final DescriptiveStatistics pw_stats = SurfacingUtil .writeDomainSimilaritiesToFile( html_desc, new StringBuilder( number_of_genomes + " genomes" ), writer, @@ -2136,7 +2099,8 @@ public class surfacing { domain_similarity_print_option, domain_similarity_sort_field, scoring, - true ); + true, + tax_code_to_id_map ); ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote main output (includes domain similarities) to: \"" + ( out_dir == null ? my_outfile : out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile ) + "\"" ); } @@ -2145,7 +2109,6 @@ public class surfacing { + e.getMessage() + "]" ); } System.out.println(); - // values_for_all_scores_histogram = pw_stats.getDataAsDoubleArray(); final Species[] species = new Species[ number_of_genomes ]; for( int i = 0; i < number_of_genomes; ++i ) { species[ i ] = new BasicSpecies( input_file_properties[ i ][ 1 ] ); @@ -2173,7 +2136,8 @@ public class surfacing { surfacing.PAIRWISE_DOMAIN_COMPARISONS_PREFIX, surfacing.PRG_NAME, out_dir, - write_pwc_files ); + write_pwc_files, + tax_code_to_id_map ); String matrix_output_file = new String( output_file.toString() ); if ( matrix_output_file.indexOf( '.' ) > 1 ) { matrix_output_file = matrix_output_file.substring( 0, matrix_output_file.indexOf( '.' ) ); @@ -2204,40 +2168,13 @@ public class surfacing { inferred_trees.add( nj_gd ); inferred_trees.add( nj_bc ); inferred_trees.add( nj_d ); - if ( jacknifed_distances ) { - pwgc.performPairwiseComparisonsJacknifed( species, - number_of_genomes, - gwcd_list, - true, - jacknife_resamplings, - jacknife_ratio, - random_seed ); - SurfacingUtil - .writeMatrixToFile( new File( matrix_output_file - + "_" - + ForesterUtil.round( jacknife_ratio, 2 ) - + "_" - + jacknife_resamplings - + surfacing.MATRIX_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX ), - pwgc.getSharedBinaryCombinationsBasedDistances() ); - SurfacingUtil - .writeMatrixToFile( new File( matrix_output_file + "_" + ForesterUtil.round( jacknife_ratio, 2 ) - + "_" + jacknife_resamplings - + surfacing.MATRIX_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX ), - pwgc.getSharedDomainsBasedDistances() ); - // if ( infer_species_trees ) { - // inferSpeciesTrees( new File( output_file + "_" + jacknife_resamplings - // + INFERRED_SBC_BASED_NJ_SPECIES_TREE_SUFFIX ), pwgc - // .getSharedBinaryCombinationsBasedDistances() ); - // inferSpeciesTrees( new File( output_file + "_" + jacknife_resamplings - // + INFERRED_SD_BASED_NJ_SPECIES_TREE_SUFFIX ), pwgc.getSharedDomainsBasedDistances() ); - // } - } } // if ( ( output_file != null ) && ( number_of_genomes > 2 ) && !isEmpty( automated_pairwise_comparison_suffix ) ) if ( ( out_dir != null ) && ( !perform_pwc ) ) { output_file = new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file ); } - writePresentToNexus( output_file, positive_filter_file, filter, gwcd_list ); + if ( write_to_nexus ) { + writePresentToNexus( output_file, positive_filter_file, filter, gwcd_list ); + } if ( ( ( intrees != null ) && ( intrees.length > 0 ) ) && ( number_of_genomes > 2 ) ) { final StringBuilder parameters_sb = createParametersAsString( ignore_dufs, e_value_max, @@ -2272,12 +2209,15 @@ public class surfacing { dc_type, protein_length_stats_by_dc, domain_number_stats_by_dc, - domain_length_stats_by_domain ); + domain_length_stats_by_domain, + tax_code_to_id_map, + write_to_nexus, + use_last_in_fitch_parsimony ); // Listing of all domain combinations gained is only done if only one input tree is used. if ( ( domain_id_to_secondary_features_maps != null ) && ( domain_id_to_secondary_features_maps.length > 0 ) ) { int j = 0; - for( final Map> domain_id_to_secondary_features_map : domain_id_to_secondary_features_maps ) { + for( final Map> domain_id_to_secondary_features_map : domain_id_to_secondary_features_maps ) { final Map mapping_results_map = new TreeMap(); final DomainParsimonyCalculator secondary_features_parsimony = DomainParsimonyCalculator .createInstance( intree, gwcd_list, domain_id_to_secondary_features_map ); @@ -2288,7 +2228,8 @@ public class surfacing { secondary_features_parsimony, intree, parameters_sb.toString(), - mapping_results_map ); + mapping_results_map, + use_last_in_fitch_parsimony ); if ( i == 0 ) { System.out.println(); System.out.println( "Mapping to secondary features:" ); @@ -2330,6 +2271,7 @@ public class surfacing { gwcd_list, output_list_of_all_proteins_per_domain_e_value_max ); } + gwcd_list = null; if ( all_bin_domain_combinations_gained_fitch != null ) { try { executeFitchGainsAnalysis( new File( output_file @@ -2545,17 +2487,6 @@ public class surfacing { System.out.println( surfacing.INPUT_SPECIES_TREE_OPTION + ": species tree, to perform (Dollo, Fitch) parismony analyses" ); System.out - .println( JACKNIFE_OPTION - + ": perform jacknife resampling for domain and binary domain combination based distance matrices [default resamplings: " - + JACKNIFE_NUMBER_OF_RESAMPLINGS_DEFAULT + "]" ); - System.out.println( JACKNIFE_RATIO_OPTION + ": ratio for jacknife resampling [default: " - + JACKNIFE_RATIO_DEFAULT + "]" ); - System.out.println( JACKNIFE_RANDOM_SEED_OPTION - + ": seed for random number generator for jacknife resampling [default: " - + JACKNIFE_RANDOM_SEED_DEFAULT + "]" ); - // System.out.println( surfacing.INFER_SPECIES_TREES_OPTION - // + ": to infer NJ species trees based on shared domains/binary domain combinations" ); - System.out .println( surfacing.INPUT_SPECIES_TREE_OPTION + "=: to infer domain/binary domain combination gains/losses on given species trees" ); System.out.println( surfacing.FILTER_POSITIVE_OPTION @@ -2583,6 +2514,10 @@ public class surfacing { System.out.println( surfacing.OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS + ": to output all proteins per domain" ); System.out.println( surfacing.OUTPUT_LIST_OF_ALL_PROTEINS_PER_DOMAIN_E_VALUE_OPTION + ": e value max per domain for output of all proteins per domain" ); + System.out.println( surfacing.USE_LAST_IN_FITCH_OPTION + ": to use last in Fitch parsimony" ); + System.out.println( surfacing.WRITE_TO_NEXUS_OPTION + ": to output in Nexus format" ); + System.out.println( PERFORM_DC_REGAIN_PROTEINS_STATS_OPTION + ": to perform DC regain protein statistics" ); + System.out.println( DA_ANALYSIS_OPTION + ": to do DA analysis" ); System.out.println(); System.out.println( "Example 1: java -Xms128m -Xmx512m -cp path/to/forester.jar" + " org.forester.application.surfacing p2g=pfam2go_2012_02_07.txt -dufs -cos=Pfam_260_NC1" @@ -2597,7 +2532,7 @@ public class surfacing { System.out.println(); } - private static void processFilter( final File filter_file, final SortedSet filter ) { + private static void processFilter( final File filter_file, final SortedSet filter ) { SortedSet filter_str = null; try { filter_str = ForesterUtil.file2set( filter_file ); @@ -2607,13 +2542,13 @@ public class surfacing { } if ( filter_str != null ) { for( final String string : filter_str ) { - filter.add( new DomainId( string ) ); + filter.add( string ); } } if ( VERBOSE ) { System.out.println( "Filter:" ); - for( final DomainId domainId : filter ) { - System.out.println( domainId.getId() ); + for( final String domainId : filter ) { + System.out.println( domainId ); } } } @@ -2753,7 +2688,7 @@ public class surfacing { private static void writePresentToNexus( final File output_file, final File positive_filter_file, - final SortedSet filter, + final SortedSet filter, final List gwcd_list ) { try { SurfacingUtil @@ -2773,11 +2708,11 @@ public class surfacing { final SortedMap> protein_lists_per_species, final List gwcd_list, final double domain_e_cutoff ) { - final SortedSet all_domains = new TreeSet(); + final SortedSet all_domains = new TreeSet(); for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { all_domains.addAll( gwcd.getAllDomainIds() ); } - for( final DomainId domain : all_domains ) { + for( final String domain : all_domains ) { final File out = new File( output_dir + ForesterUtil.FILE_SEPARATOR + domain + SEQ_EXTRACT_SUFFIX ); SurfacingUtil.checkForOutputFileWriteability( out ); try {