X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fapplication%2Fsurfacing.java;h=99e946cfab2541c9f18ae453abdf5bb938658ec3;hb=e0914db6b0fa3516bc77186eec4d36dd9a753a24;hp=fc9567dbc81ce0217ffe369d93fa7946a2d4f6e3;hpb=73c5b11c1a00e539afb246e345ebf3f042dded78;p=jalview.git diff --git a/forester/java/src/org/forester/application/surfacing.java b/forester/java/src/org/forester/application/surfacing.java index fc9567d..99e946c 100644 --- a/forester/java/src/org/forester/application/surfacing.java +++ b/forester/java/src/org/forester/application/surfacing.java @@ -44,9 +44,7 @@ import java.util.SortedSet; import java.util.TreeMap; import java.util.TreeSet; -import org.forester.evoinference.distance.NeighborJoining; import org.forester.evoinference.matrix.character.CharacterStateMatrix.Format; -import org.forester.evoinference.matrix.distance.DistanceMatrix; import org.forester.go.GoId; import org.forester.go.GoNameSpace; import org.forester.go.GoTerm; @@ -57,13 +55,13 @@ import org.forester.go.PfamToGoParser; import org.forester.io.parsers.HmmscanPerDomainTableParser; import org.forester.io.parsers.HmmscanPerDomainTableParser.INDIVIDUAL_SCORE_CUTOFF; import org.forester.io.parsers.util.ParserUtils; -import org.forester.io.writers.PhylogenyWriter; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyMethods; import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.protein.BinaryDomainCombination; +import org.forester.protein.Domain; import org.forester.protein.DomainId; import org.forester.protein.Protein; import org.forester.species.BasicSpecies; @@ -225,8 +223,6 @@ public class surfacing { final static private long JACKNIFE_RANDOM_SEED_DEFAULT = 19; final static private double JACKNIFE_RATIO_DEFAULT = 0.5; //final static private String INFER_SPECIES_TREES_OPTION = "species_tree_inference"; - final static private String INFERRED_SD_BASED_NJ_SPECIES_TREE_SUFFIX = "_sd_nj.nh"; - final static private String INFERRED_SBC_BASED_NJ_SPECIES_TREE_SUFFIX = "_sbc_nj.nh"; final static private String FILTER_POSITIVE_OPTION = "pos_filter"; final static private String FILTER_NEGATIVE_OPTION = "neg_filter"; final static private String FILTER_NEGATIVE_DOMAINS_OPTION = "neg_dom_filter"; @@ -234,8 +230,8 @@ public class surfacing { final static private String INPUT_SPECIES_TREE_OPTION = "species_tree"; final static private String SEQ_EXTRACT_OPTION = "prot_extract"; final static private char SEPARATOR_FOR_INPUT_VALUES = '#'; - final static private String PRG_VERSION = "2.230"; - final static private String PRG_DATE = "2012.04.22"; + final static private String PRG_VERSION = "2.252"; + final static private String PRG_DATE = "2012.08.01"; final static private String E_MAIL = "czmasek@burnham.org"; final static private String WWW = "www.phylosoft.org/forester/applications/surfacing"; final static private boolean IGNORE_DUFS_DEFAULT = true; @@ -255,6 +251,7 @@ public class surfacing { private static final String PLUS_MINUS_ALL_GO_IDS_DOM_SUFFIX = "_plus_minus_go_ids_all.txt"; private static final String PLUS_MINUS_PASSING_GO_IDS_DOM_SUFFIX = "_plus_minus_go_ids_passing.txt"; private static final String OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS = "all_prot"; + final static private String OUTPUT_LIST_OF_ALL_PROTEINS_PER_DOMAIN_E_VALUE_OPTION = "all_prot_e"; private static final boolean VERBOSE = false; private static final String OUTPUT_DOMAIN_COMBINATIONS_GAINED_MORE_THAN_ONCE_ANALYSIS_SUFFIX = "_fitch_dc_gains_counts"; private static final String OUTPUT_DOMAIN_COMBINATIONS_LOST_MORE_THAN_ONCE_ANALYSIS_SUFFIX = "_fitch_dc_losses_counts"; @@ -273,7 +270,7 @@ public class surfacing { private static final String LOG_FILE_SUFFIX = "_log.txt"; private static final String DATA_FILE_SUFFIX = "_domain_combination_data.txt"; private static final String DATA_FILE_DESC = "#SPECIES\tPRTEIN_ID\tN_TERM_DOMAIN\tC_TERM_DOMAIN\tN_TERM_DOMAIN_PER_DOMAIN_E_VALUE\tC_TERM_DOMAIN_PER_DOMAIN_E_VALUE\tN_TERM_DOMAIN_COUNTS_PER_PROTEIN\tC_TERM_DOMAIN_COUNTS_PER_PROTEIN"; - private static final INDIVIDUAL_SCORE_CUTOFF INDIVIDUAL_SCORE_CUTOFF_DEFAULT = INDIVIDUAL_SCORE_CUTOFF.FULL_SEQUENCE; + private static final INDIVIDUAL_SCORE_CUTOFF INDIVIDUAL_SCORE_CUTOFF_DEFAULT = INDIVIDUAL_SCORE_CUTOFF.FULL_SEQUENCE; //TODO look at me! change? public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_counts.txt"; public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists.txt"; public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping.txt"; @@ -285,6 +282,7 @@ public class surfacing { public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_MAPPED.txt"; public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_MAPPED.txt"; public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_unique_MAPPED.txt"; + private static final boolean PERFORM_DC_REGAIN_PROTEINS_STATS = true; private static void checkWriteabilityForPairwiseComparisons( final PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option, final String[][] input_file_properties, @@ -358,6 +356,7 @@ public class surfacing { * @param sum_of_all_domains_encountered * @param all_bin_domain_combinations_encountered * @param is_gains_analysis + * @param protein_length_stats_by_dc * @throws IOException */ private static void executeFitchGainsAnalysis( final File output_file, @@ -543,19 +542,6 @@ public class surfacing { return intrees; } - private static List inferSpeciesTrees( final File outfile, final List distances_list ) { - final NeighborJoining nj = NeighborJoining.createInstance(); - final List phylogenies = nj.execute( distances_list ); - final PhylogenyWriter w = new PhylogenyWriter(); - try { - w.toNewHampshire( phylogenies, true, true, outfile, ";" ); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( PRG_NAME, "failed to write to outfile [" + outfile + "]: " + e.getMessage() ); - } - return phylogenies; - } - private static void log( final String msg, final Writer w ) { try { w.write( msg ); @@ -631,6 +617,7 @@ public class surfacing { allowed_options.add( FILTER_NEGATIVE_DOMAINS_OPTION ); allowed_options.add( IGNORE_VIRAL_IDS ); allowed_options.add( SEQ_EXTRACT_OPTION ); + allowed_options.add( OUTPUT_LIST_OF_ALL_PROTEINS_PER_DOMAIN_E_VALUE_OPTION ); allowed_options.add( SECONDARY_FEATURES_PARSIMONY_MAP_FILE ); allowed_options.add( PLUS_MINUS_ANALYSIS_OPTION ); allowed_options.add( DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS ); @@ -863,8 +850,20 @@ public class surfacing { species_matrix = true; } boolean output_protein_lists_for_all_domains = false; + double output_list_of_all_proteins_per_domain_e_value_max = -1; if ( cla.isOptionSet( surfacing.OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS ) ) { output_protein_lists_for_all_domains = true; + // + if ( cla.isOptionSet( surfacing.OUTPUT_LIST_OF_ALL_PROTEINS_PER_DOMAIN_E_VALUE_OPTION ) ) { + try { + output_list_of_all_proteins_per_domain_e_value_max = cla + .getOptionValueAsDouble( surfacing.OUTPUT_LIST_OF_ALL_PROTEINS_PER_DOMAIN_E_VALUE_OPTION ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no acceptable value for per domain E-value maximum" ); + } + } + // } Detailedness detailedness = DETAILEDNESS_DEFAULT; if ( cla.isOptionSet( surfacing.DETAILEDNESS_OPTION ) ) { @@ -1385,6 +1384,11 @@ public class surfacing { System.out.println( "E-value maximum (inclusive) : " + e_value_max ); html_desc.append( "E-value maximum (inclusive):" + e_value_max + "" + nl ); } + if ( output_protein_lists_for_all_domains ) { + System.out.println( "Domain E-value max : " + output_list_of_all_proteins_per_domain_e_value_max ); + html_desc.append( "Protein lists: E-value maximum per domain (inclusive):" + + output_list_of_all_proteins_per_domain_e_value_max + "" + nl ); + } System.out.println( "Ignore DUFs : " + ignore_dufs ); if ( ignore_virus_like_ids ) { System.out.println( "Ignore virus like ids : " + ignore_virus_like_ids ); @@ -1412,7 +1416,6 @@ public class surfacing { System.out.println( "Ignore combination with self: " + ignore_combination_with_same ); html_desc.append( "Ignore combination with self for domain combination similarity analyses:" + ignore_combination_with_same + "" + nl ); - ; System.out.println( "Consider directedness : " + ( dc_type != BinaryDomainCombination.DomainCombinationType.BASIC ) ); html_desc.append( "Consider directedness of binary domain combinations:" @@ -1759,6 +1762,13 @@ public class surfacing { catch ( final IOException e3 ) { e3.printStackTrace(); } + Map protein_length_stats_by_dc = null; + Map domain_number_stats_by_dc = null; + final Map domain_length_stats_by_domain = new HashMap(); + if ( PERFORM_DC_REGAIN_PROTEINS_STATS ) { + protein_length_stats_by_dc = new HashMap(); + domain_number_stats_by_dc = new HashMap(); + } // Main loop: for( int i = 0; i < number_of_genomes; ++i ) { System.out.println(); @@ -1909,6 +1919,13 @@ public class surfacing { dc_data_writer.write( SurfacingUtil.proteinToDomainCombinations( protein, count + "", "\t" ) .toString() ); ++count; + for( final Domain d : protein.getProteinDomains() ) { + final String d_str = d.getDomainId().toString(); + if ( !domain_length_stats_by_domain.containsKey( d_str ) ) { + domain_length_stats_by_domain.put( d_str, new BasicDescriptiveStatistics() ); + } + domain_length_stats_by_domain.get( d_str ).addValue( d.getLength() ); + } } } catch ( final IOException e ) { @@ -1927,7 +1944,9 @@ public class surfacing { ignore_combination_with_same, new BasicSpecies( input_file_properties[ i ][ 1 ] ), domain_id_to_go_ids_map, - dc_type ) ); + dc_type, + protein_length_stats_by_dc, + domain_number_stats_by_dc ) ); domain_lengths_table.addLengths( protein_list ); if ( gwcd_list.get( i ).getSize() > 0 ) { SurfacingUtil.writeDomainCombinationsCountsFile( input_file_properties, @@ -2255,7 +2274,10 @@ public class surfacing { output_binary_domain_combinationsfor_graph_analysis, all_bin_domain_combinations_gained_fitch, all_bin_domain_combinations_lost_fitch, - dc_type ); + dc_type, + protein_length_stats_by_dc, + domain_number_stats_by_dc, + domain_length_stats_by_domain ); // Listing of all domain combinations gained is only done if only one input tree is used. if ( ( domain_id_to_secondary_features_maps != null ) && ( domain_id_to_secondary_features_maps.length > 0 ) ) { @@ -2308,39 +2330,11 @@ public class surfacing { plus_minus_analysis_numbers ); } if ( output_protein_lists_for_all_domains ) { - writeProteinListsForAllSpecies( out_dir, protein_lists_per_species, gwcd_list ); - } - // if ( ( intrees != null ) && ( intrees.length > 0 ) && ( inferred_trees != null ) && ( inferred_trees.size() > 0 ) ) { - // final StringBuilder parameters_sb = createParametersAsString( ignore_dufs, - // e_value_max, - // max_allowed_overlap, - // no_engulfing_overlaps, - // cutoff_scores_file ); - // String s = "_"; - // if ( radomize_fitch_parsimony ) { - // s += random_number_seed_for_fitch_parsimony + "_"; - // } - // int i = 0; - // for( final Phylogeny inferred_tree : inferred_trees ) { - // if ( !inferred_tree.isRooted() ) { - // intrees[ 0 ].getRoot().getName(); - // inferred_tree.r - // } - // final String outfile_name = ForesterUtil.removeSuffix( inferred_tree.getName() ) + s; - // final DomainParsimonyCalculator domain_parsimony = DomainParsimonyCalculator - // .createInstance( inferred_tree, gwcd_list ); - // SurfacingUtil.executeParsimonyAnalysis( random_number_seed_for_fitch_parsimony, - // radomize_fitch_parsimony, - // outfile_name, - // domain_parsimony, - // inferred_tree, - // domain_id_to_go_ids_map, - // go_id_to_term_map, - // go_namespace_limit, - // parameters_sb.toString() ); - // i++; - // } - // } + writeProteinListsForAllSpecies( out_dir, + protein_lists_per_species, + gwcd_list, + output_list_of_all_proteins_per_domain_e_value_max ); + } if ( all_bin_domain_combinations_gained_fitch != null ) { try { executeFitchGainsAnalysis( new File( output_file @@ -2505,23 +2499,6 @@ public class surfacing { } } - // public static StringBuffer stringCombinableDomainsMapToStringBuffer( - // final SortedMap map ) { - // final StringBuffer sb = new StringBuffer(); - // for( final Iterator iter = map.keySet().iterator(); - // iter.hasNext(); ) { - // final Object key = iter.next(); - // sb.append( ForesterUtil.pad( new StringBuffer( key.toString() ), 18, ' ', - // false ) ); - // final CombinableDomains domain_combination = map.get( key ); - // sb.append( ForesterUtil.pad( new StringBuffer( "" + - // domain_combination.getNumberOfCombiningDomains() ), 8, - // ' ', false ) ); - // sb.append( domain_combination.toStringBuffer() ); - // sb.append( ForesterUtil.getLineSeparator() ); - // } - // return sb; - // } private static void printHelp() { System.out.println(); System.out.println( "Usage:" ); @@ -2609,14 +2586,19 @@ public class surfacing { System.out.println( surfacing.DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS + ": to output binary domain combinations for (downstream) graph analysis" ); System.out.println( surfacing.OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS + ": to output all proteins per domain" ); + System.out.println( surfacing.OUTPUT_LIST_OF_ALL_PROTEINS_PER_DOMAIN_E_VALUE_OPTION + + ": e value max per domain for output of all proteins per domain" ); System.out.println(); + System.out.println( "Example 1: java -Xms128m -Xmx512m -cp path/to/forester.jar" + + " org.forester.application.surfacing p2g=pfam2go_2012_02_07.txt -dufs -cos=Pfam_260_NC1" + + " -no_eo -mo=0 -input=genomes_limited.txt -out_dir=out -o=o " + + " -species_tree=tol.xml -obo=gene_ontology_2012_02_07.obo -pos_filter=f.txt -all_prot" ); System.out.println(); - System.out.println( "Example: java -Xms128m -Xmx512m -cp path/to/forester.jar" + System.out.println( "Example 2: java -Xms128m -Xmx512m -cp path/to/forester.jar" + " org.forester.application.surfacing -detail=punctilious -o=TEST.html -pwc=TEST" + " -cos=Pfam_ls_22_TC2 -p2g=pfam2go -obo=gene_ontology_edit.obo " + "-dc_sort=dom -ignore_with_self -no_singles -e=0.001 -mo=1 -no_eo " - + "-ds_output=detailed_html -scoring=domains -sort=alpha -" + JACKNIFE_OPTION - + "=50 human mouse brafl strpu" ); + + "-ds_output=detailed_html -scoring=domains -sort=alpha human mouse brafl strpu" ); System.out.println(); } @@ -2785,7 +2767,8 @@ public class surfacing { private static void writeProteinListsForAllSpecies( final File output_dir, final SortedMap> protein_lists_per_species, - final List gwcd_list ) { + final List gwcd_list, + final double domain_e_cutoff ) { final SortedSet all_domains = new TreeSet(); for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { all_domains.addAll( gwcd.getAllDomainIds() ); @@ -2799,7 +2782,8 @@ public class surfacing { domain, proteins_file_writer, "\t", - LIMIT_SPEC_FOR_PROT_EX ); + LIMIT_SPEC_FOR_PROT_EX, + domain_e_cutoff ); proteins_file_writer.close(); } catch ( final IOException e ) {