X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fapplication%2Fsurfacing.java;h=89ce71cf657f01e2468767cee00bdc861a99537f;hb=90e29ddfe15f6c44f3e3ba419a67c3e267e279bd;hp=40fc471b85ff70d29317f84967aeaa713bfd00f3;hpb=ed149d1b26c50c0673b5491fc639cdcb6afdde2f;p=jalview.git diff --git a/forester/java/src/org/forester/application/surfacing.java b/forester/java/src/org/forester/application/surfacing.java index 40fc471..89ce71c 100644 --- a/forester/java/src/org/forester/application/surfacing.java +++ b/forester/java/src/org/forester/application/surfacing.java @@ -87,6 +87,7 @@ public class surfacing { private static final int MINIMAL_NUMBER_OF_SIMILARITIES_FOR_SPLITTING = 1000; public final static String DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS = "graph_analysis_out"; + public final static String DOMAIN_COMBINITONS_COUNTS_OUTPUT_OPTION = "dcc"; public final static String DOMAIN_COMBINITONS_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS = "_dc.dot"; public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_BC_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS = "_fitch_present_dc.dot"; public final static String DOMAIN_COMBINITON_COUNTS_OUTPUTFILE_SUFFIX = ".dcc"; @@ -177,7 +178,7 @@ public class surfacing { final static private String NO_ENGULFING_OVERLAP_OPTION = "no_eo"; final static private String IGNORE_COMBINATION_WITH_SAME_OPTION = "ignore_self_comb"; final static private String PERFORM_DC_REGAIN_PROTEINS_STATS_OPTION = "dc_regain_stats"; - final static private String DA_ANALYSIS_OPTION = "DA_analyis"; + final static private String DA_ANALYSIS_OPTION = "da_analyis"; final static private String USE_LAST_IN_FITCH_OPTION = "last"; public final static String PAIRWISE_DOMAIN_COMPARISONS_PREFIX = "pwc_"; final static private String PAIRWISE_DOMAIN_COMPARISONS_OPTION = "pwc"; @@ -214,8 +215,8 @@ public class surfacing { final static private String INPUT_GENOMES_FILE_OPTION = "genomes"; final static private String INPUT_SPECIES_TREE_OPTION = "species_tree"; final static private String SEQ_EXTRACT_OPTION = "prot_extract"; - final static private String PRG_VERSION = "2.402"; - final static private String PRG_DATE = "131126"; + final static private String PRG_VERSION = "2.404"; + final static private String PRG_DATE = "140319"; final static private String E_MAIL = "czmasek@burnham.org"; final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester/surfacing"; final static private boolean IGNORE_DUFS_DEFAULT = true; @@ -240,7 +241,7 @@ public class surfacing { private static final String OUTPUT_DOMAIN_COMBINATIONS_GAINED_MORE_THAN_ONCE_ANALYSIS_SUFFIX = "_fitch_dc_gains_counts"; private static final String OUTPUT_DOMAIN_COMBINATIONS_LOST_MORE_THAN_ONCE_ANALYSIS_SUFFIX = "_fitch_dc_losses_counts"; private static final String DOMAIN_LENGTHS_ANALYSIS_SUFFIX = "_domain_lengths_analysis"; - private static final boolean PERFORM_DOMAIN_LENGTH_ANALYSIS = true; + private static final String PERFORM_DOMAIN_LENGTH_ANALYSIS_OPTION = "dla"; public static final String ALL_PFAMS_ENCOUNTERED_SUFFIX = "_all_encountered_pfams"; public static final String ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX = "_all_encountered_pfams_with_go_annotation"; public static final String ENCOUNTERED_PFAMS_SUMMARY_SUFFIX = "_encountered_pfams_summary"; @@ -336,6 +337,7 @@ public class surfacing { allowed_options.add( SECONDARY_FEATURES_PARSIMONY_MAP_FILE ); allowed_options.add( PLUS_MINUS_ANALYSIS_OPTION ); allowed_options.add( DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS ); + allowed_options.add( DOMAIN_COMBINITONS_COUNTS_OUTPUT_OPTION ); allowed_options.add( OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS ); allowed_options.add( CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS_AND_ADJACENCY ); allowed_options.add( WRITE_TO_NEXUS_OPTION ); @@ -343,6 +345,7 @@ public class surfacing { allowed_options.add( DA_ANALYSIS_OPTION ); allowed_options.add( USE_LAST_IN_FITCH_OPTION ); allowed_options.add( PERFORM_DC_FITCH ); + allowed_options.add( PERFORM_DOMAIN_LENGTH_ANALYSIS_OPTION ); boolean ignore_dufs = surfacing.IGNORE_DUFS_DEFAULT; boolean ignore_combination_with_same = surfacing.IGNORE_COMBINATION_WITH_SAME_DEFAULLT; double fs_e_value_max = surfacing.MAX_E_VALUE_DEFAULT; @@ -376,6 +379,10 @@ public class surfacing { if ( cla.isOptionSet( DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS ) ) { output_binary_domain_combinationsfor_graph_analysis = true; } + boolean output_binary_domain_combinationsfor_counts = false; + if ( cla.isOptionSet( DOMAIN_COMBINITONS_COUNTS_OUTPUT_OPTION ) ) { + output_binary_domain_combinationsfor_counts = true; + } if ( cla.isOptionSet( surfacing.MAX_FS_E_VALUE_OPTION ) ) { try { fs_e_value_max = cla.getOptionValueAsDouble( surfacing.MAX_FS_E_VALUE_OPTION ); @@ -414,6 +421,10 @@ public class surfacing { if ( cla.isOptionSet( surfacing.IGNORE_COMBINATION_WITH_SAME_OPTION ) ) { ignore_combination_with_same = true; } + boolean domain_length_analysis = false; + if ( cla.isOptionSet( surfacing.PERFORM_DOMAIN_LENGTH_ANALYSIS_OPTION ) ) { + domain_length_analysis = true; + } boolean ignore_domains_without_combs_in_all_spec = IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_DEFAULT; if ( cla.isOptionSet( surfacing.IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_OPTION ) ) { ignore_domains_without_combs_in_all_spec = true; @@ -957,7 +968,7 @@ public class surfacing { File[] secondary_features_map_files = null; final File domain_lengths_analysis_outfile = new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file + DOMAIN_LENGTHS_ANALYSIS_SUFFIX ); - if ( PERFORM_DOMAIN_LENGTH_ANALYSIS ) { + if ( domain_length_analysis ) { SurfacingUtil.checkForOutputFileWriteability( domain_lengths_analysis_outfile ); } if ( cla.isOptionSet( surfacing.SECONDARY_FEATURES_PARSIMONY_MAP_FILE ) ) { @@ -1385,7 +1396,6 @@ public class surfacing { all_bin_domain_combinations_gained_fitch = new ArrayList(); all_bin_domain_combinations_lost_fitch = new ArrayList(); } - DomainLengthsTable domain_lengths_table = new DomainLengthsTable(); final File per_genome_domain_promiscuity_statistics_file = new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file + D_PROMISCUITY_FILE_SUFFIX ); BufferedWriter per_genome_domain_promiscuity_statistics_writer = null; @@ -1421,8 +1431,8 @@ public class surfacing { catch ( final IOException e2 ) { ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getMessage() ); } - final DescriptiveStatistics protein_coverage_stats = new BasicDescriptiveStatistics(); - final DescriptiveStatistics all_genomes_domains_per_potein_stats = new BasicDescriptiveStatistics(); + DescriptiveStatistics protein_coverage_stats = new BasicDescriptiveStatistics(); + DescriptiveStatistics all_genomes_domains_per_potein_stats = new BasicDescriptiveStatistics(); final SortedMap all_genomes_domains_per_potein_histo = new TreeMap(); final SortedSet domains_which_are_always_single = new TreeSet(); final SortedSet domains_which_are_sometimes_single_sometimes_not = new TreeSet(); @@ -1456,6 +1466,10 @@ public class surfacing { protein_length_stats_by_dc = new HashMap(); domain_number_stats_by_dc = new HashMap(); } + DomainLengthsTable domain_lengths_table = null; + if ( domain_length_analysis ) { + domain_lengths_table = new DomainLengthsTable(); + } // Main loop: final SortedMap> distinct_domain_architecutures_per_genome = new TreeMap>(); final SortedMap distinct_domain_architecuture_counts = new TreeMap(); @@ -1642,7 +1656,9 @@ public class surfacing { domains_which_are_sometimes_single_sometimes_not, domains_which_never_single, domains_per_potein_stats_writer ); - domain_lengths_table.addLengths( protein_list ); + if ( domain_length_analysis ) { + domain_lengths_table.addLengths( protein_list ); + } if ( !da_analysis ) { gwcd_list.add( BasicGenomeWideCombinableDomains .createInstance( protein_list, @@ -1653,12 +1669,15 @@ public class surfacing { protein_length_stats_by_dc, domain_number_stats_by_dc ) ); if ( gwcd_list.get( i ).getSize() > 0 ) { - SurfacingUtil.writeDomainCombinationsCountsFile( input_file_properties, - out_dir, - per_genome_domain_promiscuity_statistics_writer, - gwcd_list.get( i ), - i, - dc_sort_order ); + if ( output_binary_domain_combinationsfor_counts ) { + SurfacingUtil + .writeDomainCombinationsCountsFile( input_file_properties, + out_dir, + per_genome_domain_promiscuity_statistics_writer, + gwcd_list.get( i ), + i, + dc_sort_order ); + } if ( output_binary_domain_combinationsfor_graph_analysis ) { SurfacingUtil.writeBinaryDomainCombinationsFileForGraphAnalysis( input_file_properties, out_dir, @@ -1719,8 +1738,10 @@ public class surfacing { domains_per_potein_stats_writer.write( "\t" ); domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.sampleStandardDeviation() + "" ); domains_per_potein_stats_writer.write( "\t" ); - domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.median() + "" ); - domains_per_potein_stats_writer.write( "\t" ); + if ( all_genomes_domains_per_potein_stats.getN() <= 300 ) { + domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.median() + "" ); + domains_per_potein_stats_writer.write( "\t" ); + } domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.getN() + "" ); domains_per_potein_stats_writer.write( "\t" ); domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.getMin() + "" ); @@ -1728,6 +1749,7 @@ public class surfacing { domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.getMax() + "" ); domains_per_potein_stats_writer.write( "\n" ); domains_per_potein_stats_writer.close(); + all_genomes_domains_per_potein_stats = null; SurfacingUtil.printOutPercentageOfMultidomainProteins( all_genomes_domains_per_potein_histo, log_writer ); ForesterUtil.map2file( new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file + "_all_genomes_domains_per_potein_histo.txt" ), all_genomes_domains_per_potein_histo, "\t", "\n" ); @@ -1752,6 +1774,7 @@ public class surfacing { + ( 100 * protein_coverage_stats.getMin() ) + "%-" + ( 100 * protein_coverage_stats.getMax() ) + "%", log_writer ); + protein_coverage_stats = null; } catch ( final IOException e2 ) { ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getLocalizedMessage() ); @@ -1774,7 +1797,7 @@ public class surfacing { catch ( final IOException e2 ) { ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getLocalizedMessage() ); } - if ( PERFORM_DOMAIN_LENGTH_ANALYSIS ) { + if ( domain_length_analysis ) { try { SurfacingUtil.executeDomainLengthAnalysis( input_file_properties, number_of_genomes, @@ -1794,7 +1817,8 @@ public class surfacing { final DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( domain_similarity_sort_field, sort_by_species_count_first, number_of_genomes == 2, - CALC_SIMILARITY_SCORES ); + CALC_SIMILARITY_SCORES, + true ); switch ( scoring ) { case COMBINATIONS: pw_calc = new CombinationsBasedPairwiseDomainSimilarityCalculator(); @@ -2038,7 +2062,8 @@ public class surfacing { SurfacingUtil.writeProteinListsForAllSpecies( out_dir, protein_lists_per_species, gwcd_list, - output_list_of_all_proteins_per_domain_e_value_max ); + output_list_of_all_proteins_per_domain_e_value_max, + positive_filter_file != null ? filter : null ); } gwcd_list = null; if ( all_bin_domain_combinations_gained_fitch != null ) { @@ -2157,6 +2182,8 @@ public class surfacing { System.out.println( surfacing.SECONDARY_FEATURES_PARSIMONY_MAP_FILE + "=: to perfom parsimony analysis on secondary features" ); System.out.println( surfacing.PLUS_MINUS_ANALYSIS_OPTION + "=: to presence/absence genome analysis" ); + System.out.println( surfacing.DOMAIN_COMBINITONS_COUNTS_OUTPUT_OPTION + + ": to output binary domain counts (as individual files)" ); System.out.println( surfacing.DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS + ": to output binary domain combinations for (downstream) graph analysis" ); System.out.println( surfacing.OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS + ": to output all proteins per domain" ); @@ -2166,18 +2193,12 @@ public class surfacing { System.out.println( surfacing.WRITE_TO_NEXUS_OPTION + ": to output in Nexus format" ); System.out.println( PERFORM_DC_FITCH + ": to perform DC Fitch parsimony" ); System.out.println( PERFORM_DC_REGAIN_PROTEINS_STATS_OPTION + ": to perform DC regain protein statistics" ); - System.out.println( DA_ANALYSIS_OPTION + ": to do DA analysis" ); + System.out.println( DA_ANALYSIS_OPTION + ": to perform DA analysis" ); + System.out.println( PERFORM_DOMAIN_LENGTH_ANALYSIS_OPTION + ": to perform domain length analysis" ); System.out.println(); - System.out.println( "Example 1: java -Xms128m -Xmx512m -cp path/to/forester.jar" - + " org.forester.application.surfacing p2g=pfam2go_2012_02_07.txt -dufs -cos=Pfam_260_NC1" - + " -no_eo -mo=0 -genomes=eukaryotes.txt -out_dir=out -o=o " - + " -species_tree=tol.xml -obo=gene_ontology_2012_02_07.obo -pos_filter=f.txt -all_prot" ); System.out.println(); - System.out.println( "Example 2: java -Xms128m -Xmx512m -cp path/to/forester.jar" - + " org.forester.application.surfacing -detail=punctilious -o=TEST.html -pwc=TEST" - + " -cos=Pfam_ls_22_TC2 -p2g=pfam2go -obo=gene_ontology_edit.obo " - + "-dc_sort=dom -ignore_with_self -no_singles -ie=0.001 -mo=1 -no_eo -genomes=eukaryotes.txt " - + "-ds_output=detailed_html -scoring=domains -sort=alpha " ); + System.out + .println( "Example: surfacing -p2g=pfam2go_130621.txt -obo=gene_onotology_130621.obo -species_tree=tol_156.xml -last -detail=punctilious -ignore_viral_ids -no_eo -ie=0.1 -dufs -genomes=genomes_all.txt -pos_filter=tf_1.txt -all_prot -all_prot_e=0.1 -out_dir=_tf1_e01_ape01 -o=tf1_e01_ape01" ); System.out.println(); } }