X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fapplication%2Fsurfacing.java;h=99e946cfab2541c9f18ae453abdf5bb938658ec3;hb=e0914db6b0fa3516bc77186eec4d36dd9a753a24;hp=692b6998502de64bfe7bd7ff7403158271388762;hpb=e174d62534f7fc6f3de133d523a402a87735b27f;p=jalview.git diff --git a/forester/java/src/org/forester/application/surfacing.java b/forester/java/src/org/forester/application/surfacing.java index 692b699..99e946c 100644 --- a/forester/java/src/org/forester/application/surfacing.java +++ b/forester/java/src/org/forester/application/surfacing.java @@ -44,9 +44,7 @@ import java.util.SortedSet; import java.util.TreeMap; import java.util.TreeSet; -import org.forester.evoinference.distance.NeighborJoining; import org.forester.evoinference.matrix.character.CharacterStateMatrix.Format; -import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix; import org.forester.go.GoId; import org.forester.go.GoNameSpace; import org.forester.go.GoTerm; @@ -57,7 +55,6 @@ import org.forester.go.PfamToGoParser; import org.forester.io.parsers.HmmscanPerDomainTableParser; import org.forester.io.parsers.HmmscanPerDomainTableParser.INDIVIDUAL_SCORE_CUTOFF; import org.forester.io.parsers.util.ParserUtils; -import org.forester.io.writers.PhylogenyWriter; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyMethods; import org.forester.phylogeny.PhylogenyNode; @@ -226,8 +223,6 @@ public class surfacing { final static private long JACKNIFE_RANDOM_SEED_DEFAULT = 19; final static private double JACKNIFE_RATIO_DEFAULT = 0.5; //final static private String INFER_SPECIES_TREES_OPTION = "species_tree_inference"; - final static private String INFERRED_SD_BASED_NJ_SPECIES_TREE_SUFFIX = "_sd_nj.nh"; - final static private String INFERRED_SBC_BASED_NJ_SPECIES_TREE_SUFFIX = "_sbc_nj.nh"; final static private String FILTER_POSITIVE_OPTION = "pos_filter"; final static private String FILTER_NEGATIVE_OPTION = "neg_filter"; final static private String FILTER_NEGATIVE_DOMAINS_OPTION = "neg_dom_filter"; @@ -235,8 +230,8 @@ public class surfacing { final static private String INPUT_SPECIES_TREE_OPTION = "species_tree"; final static private String SEQ_EXTRACT_OPTION = "prot_extract"; final static private char SEPARATOR_FOR_INPUT_VALUES = '#'; - final static private String PRG_VERSION = "2.250"; - final static private String PRG_DATE = "2012.05.07"; + final static private String PRG_VERSION = "2.252"; + final static private String PRG_DATE = "2012.08.01"; final static private String E_MAIL = "czmasek@burnham.org"; final static private String WWW = "www.phylosoft.org/forester/applications/surfacing"; final static private boolean IGNORE_DUFS_DEFAULT = true; @@ -256,6 +251,7 @@ public class surfacing { private static final String PLUS_MINUS_ALL_GO_IDS_DOM_SUFFIX = "_plus_minus_go_ids_all.txt"; private static final String PLUS_MINUS_PASSING_GO_IDS_DOM_SUFFIX = "_plus_minus_go_ids_passing.txt"; private static final String OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS = "all_prot"; + final static private String OUTPUT_LIST_OF_ALL_PROTEINS_PER_DOMAIN_E_VALUE_OPTION = "all_prot_e"; private static final boolean VERBOSE = false; private static final String OUTPUT_DOMAIN_COMBINATIONS_GAINED_MORE_THAN_ONCE_ANALYSIS_SUFFIX = "_fitch_dc_gains_counts"; private static final String OUTPUT_DOMAIN_COMBINATIONS_LOST_MORE_THAN_ONCE_ANALYSIS_SUFFIX = "_fitch_dc_losses_counts"; @@ -274,7 +270,7 @@ public class surfacing { private static final String LOG_FILE_SUFFIX = "_log.txt"; private static final String DATA_FILE_SUFFIX = "_domain_combination_data.txt"; private static final String DATA_FILE_DESC = "#SPECIES\tPRTEIN_ID\tN_TERM_DOMAIN\tC_TERM_DOMAIN\tN_TERM_DOMAIN_PER_DOMAIN_E_VALUE\tC_TERM_DOMAIN_PER_DOMAIN_E_VALUE\tN_TERM_DOMAIN_COUNTS_PER_PROTEIN\tC_TERM_DOMAIN_COUNTS_PER_PROTEIN"; - private static final INDIVIDUAL_SCORE_CUTOFF INDIVIDUAL_SCORE_CUTOFF_DEFAULT = INDIVIDUAL_SCORE_CUTOFF.FULL_SEQUENCE; + private static final INDIVIDUAL_SCORE_CUTOFF INDIVIDUAL_SCORE_CUTOFF_DEFAULT = INDIVIDUAL_SCORE_CUTOFF.FULL_SEQUENCE; //TODO look at me! change? public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_counts.txt"; public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists.txt"; public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping.txt"; @@ -546,20 +542,6 @@ public class surfacing { return intrees; } - private static List inferSpeciesTrees( final File outfile, - final List distances_list ) { - final NeighborJoining nj = NeighborJoining.createInstance(); - final List phylogenies = nj.execute( distances_list ); - final PhylogenyWriter w = new PhylogenyWriter(); - try { - w.toNewHampshire( phylogenies, true, true, outfile, ";" ); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( PRG_NAME, "failed to write to outfile [" + outfile + "]: " + e.getMessage() ); - } - return phylogenies; - } - private static void log( final String msg, final Writer w ) { try { w.write( msg ); @@ -635,6 +617,7 @@ public class surfacing { allowed_options.add( FILTER_NEGATIVE_DOMAINS_OPTION ); allowed_options.add( IGNORE_VIRAL_IDS ); allowed_options.add( SEQ_EXTRACT_OPTION ); + allowed_options.add( OUTPUT_LIST_OF_ALL_PROTEINS_PER_DOMAIN_E_VALUE_OPTION ); allowed_options.add( SECONDARY_FEATURES_PARSIMONY_MAP_FILE ); allowed_options.add( PLUS_MINUS_ANALYSIS_OPTION ); allowed_options.add( DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS ); @@ -867,8 +850,20 @@ public class surfacing { species_matrix = true; } boolean output_protein_lists_for_all_domains = false; + double output_list_of_all_proteins_per_domain_e_value_max = -1; if ( cla.isOptionSet( surfacing.OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS ) ) { output_protein_lists_for_all_domains = true; + // + if ( cla.isOptionSet( surfacing.OUTPUT_LIST_OF_ALL_PROTEINS_PER_DOMAIN_E_VALUE_OPTION ) ) { + try { + output_list_of_all_proteins_per_domain_e_value_max = cla + .getOptionValueAsDouble( surfacing.OUTPUT_LIST_OF_ALL_PROTEINS_PER_DOMAIN_E_VALUE_OPTION ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no acceptable value for per domain E-value maximum" ); + } + } + // } Detailedness detailedness = DETAILEDNESS_DEFAULT; if ( cla.isOptionSet( surfacing.DETAILEDNESS_OPTION ) ) { @@ -1389,6 +1384,11 @@ public class surfacing { System.out.println( "E-value maximum (inclusive) : " + e_value_max ); html_desc.append( "E-value maximum (inclusive):" + e_value_max + "" + nl ); } + if ( output_protein_lists_for_all_domains ) { + System.out.println( "Domain E-value max : " + output_list_of_all_proteins_per_domain_e_value_max ); + html_desc.append( "Protein lists: E-value maximum per domain (inclusive):" + + output_list_of_all_proteins_per_domain_e_value_max + "" + nl ); + } System.out.println( "Ignore DUFs : " + ignore_dufs ); if ( ignore_virus_like_ids ) { System.out.println( "Ignore virus like ids : " + ignore_virus_like_ids ); @@ -2330,39 +2330,11 @@ public class surfacing { plus_minus_analysis_numbers ); } if ( output_protein_lists_for_all_domains ) { - writeProteinListsForAllSpecies( out_dir, protein_lists_per_species, gwcd_list ); - } - // if ( ( intrees != null ) && ( intrees.length > 0 ) && ( inferred_trees != null ) && ( inferred_trees.size() > 0 ) ) { - // final StringBuilder parameters_sb = createParametersAsString( ignore_dufs, - // e_value_max, - // max_allowed_overlap, - // no_engulfing_overlaps, - // cutoff_scores_file ); - // String s = "_"; - // if ( radomize_fitch_parsimony ) { - // s += random_number_seed_for_fitch_parsimony + "_"; - // } - // int i = 0; - // for( final Phylogeny inferred_tree : inferred_trees ) { - // if ( !inferred_tree.isRooted() ) { - // intrees[ 0 ].getRoot().getName(); - // inferred_tree.r - // } - // final String outfile_name = ForesterUtil.removeSuffix( inferred_tree.getName() ) + s; - // final DomainParsimonyCalculator domain_parsimony = DomainParsimonyCalculator - // .createInstance( inferred_tree, gwcd_list ); - // SurfacingUtil.executeParsimonyAnalysis( random_number_seed_for_fitch_parsimony, - // radomize_fitch_parsimony, - // outfile_name, - // domain_parsimony, - // inferred_tree, - // domain_id_to_go_ids_map, - // go_id_to_term_map, - // go_namespace_limit, - // parameters_sb.toString() ); - // i++; - // } - // } + writeProteinListsForAllSpecies( out_dir, + protein_lists_per_species, + gwcd_list, + output_list_of_all_proteins_per_domain_e_value_max ); + } if ( all_bin_domain_combinations_gained_fitch != null ) { try { executeFitchGainsAnalysis( new File( output_file @@ -2527,23 +2499,6 @@ public class surfacing { } } - // public static StringBuffer stringCombinableDomainsMapToStringBuffer( - // final SortedMap map ) { - // final StringBuffer sb = new StringBuffer(); - // for( final Iterator iter = map.keySet().iterator(); - // iter.hasNext(); ) { - // final Object key = iter.next(); - // sb.append( ForesterUtil.pad( new StringBuffer( key.toString() ), 18, ' ', - // false ) ); - // final CombinableDomains domain_combination = map.get( key ); - // sb.append( ForesterUtil.pad( new StringBuffer( "" + - // domain_combination.getNumberOfCombiningDomains() ), 8, - // ' ', false ) ); - // sb.append( domain_combination.toStringBuffer() ); - // sb.append( ForesterUtil.getLineSeparator() ); - // } - // return sb; - // } private static void printHelp() { System.out.println(); System.out.println( "Usage:" ); @@ -2631,6 +2586,8 @@ public class surfacing { System.out.println( surfacing.DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS + ": to output binary domain combinations for (downstream) graph analysis" ); System.out.println( surfacing.OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS + ": to output all proteins per domain" ); + System.out.println( surfacing.OUTPUT_LIST_OF_ALL_PROTEINS_PER_DOMAIN_E_VALUE_OPTION + + ": e value max per domain for output of all proteins per domain" ); System.out.println(); System.out.println( "Example 1: java -Xms128m -Xmx512m -cp path/to/forester.jar" + " org.forester.application.surfacing p2g=pfam2go_2012_02_07.txt -dufs -cos=Pfam_260_NC1" @@ -2810,7 +2767,8 @@ public class surfacing { private static void writeProteinListsForAllSpecies( final File output_dir, final SortedMap> protein_lists_per_species, - final List gwcd_list ) { + final List gwcd_list, + final double domain_e_cutoff ) { final SortedSet all_domains = new TreeSet(); for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { all_domains.addAll( gwcd.getAllDomainIds() ); @@ -2824,7 +2782,8 @@ public class surfacing { domain, proteins_file_writer, "\t", - LIMIT_SPEC_FOR_PROT_EX ); + LIMIT_SPEC_FOR_PROT_EX, + domain_e_cutoff ); proteins_file_writer.close(); } catch ( final IOException e ) {