From e367379ed038c8922bf7d898ba64d432c8ad3aec Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Sat, 18 Jun 2011 00:23:50 +0000 Subject: [PATCH] in progress --- .../java/src/org/forester/application/pfam_go.java | 2 +- .../src/org/forester/application/surfacing.java | 53 +- .../forester/application/surfacing_hmmpfam.java | 1 - .../surfacing/PairwiseGenomeComparator.java | 14 - .../src/org/forester/surfacing/SurfacingUtil.java | 543 ++++++-------------- 5 files changed, 161 insertions(+), 452 deletions(-) diff --git a/forester/java/src/org/forester/application/pfam_go.java b/forester/java/src/org/forester/application/pfam_go.java index 194def2..d63ca5f 100644 --- a/forester/java/src/org/forester/application/pfam_go.java +++ b/forester/java/src/org/forester/application/pfam_go.java @@ -42,7 +42,7 @@ public class pfam_go { final static private String HELP_OPTION_1 = "help"; final static private String HELP_OPTION_2 = "h"; - final static private String PRG_NAME = "pfam2go"; + final static private String PRG_NAME = "pfam_go"; final static private String PRG_VERSION = "1.00"; final static private String PRG_DATE = "2010.02.02"; final static private String E_MAIL = "czmasek@burnham.org"; diff --git a/forester/java/src/org/forester/application/surfacing.java b/forester/java/src/org/forester/application/surfacing.java index ec2ddd9..9d2c987 100644 --- a/forester/java/src/org/forester/application/surfacing.java +++ b/forester/java/src/org/forester/application/surfacing.java @@ -117,12 +117,10 @@ public class surfacing { public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_BC = "_fitch_present_dc"; public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_HTML_BC = "_fitch_present_dc.html"; public final static String PARSIMONY_OUTPUT_DOLLO_GAINS_D = "_dollo_gains_d"; - public final static String PARSIMONY_OUTPUT_DOLLO_GAINS_GOID_D = "_dollo_gains_goid_d"; public final static String PARSIMONY_OUTPUT_DOLLO_GAINS_HTML_D = "_dollo_gains_d.html"; public final static String PARSIMONY_OUTPUT_DOLLO_LOSSES_D = "_dollo_losses_d"; public final static String PARSIMONY_OUTPUT_DOLLO_LOSSES_HTML_D = "_dollo_losses_d.html"; public final static String PARSIMONY_OUTPUT_DOLLO_PRESENT_D = "_dollo_present_d"; - public final static String PARSIMONY_OUTPUT_DOLLO_PRESENT_GOID_D = "_dollo_present_goid_d"; public final static String PARSIMONY_OUTPUT_DOLLO_PRESENT_HTML_D = "_dollo_present_d.html"; public final static String DOMAINS_PRESENT_NEXUS = "_dom.nex"; public final static String BDC_PRESENT_NEXUS = "_dc.nex"; @@ -146,13 +144,7 @@ public class surfacing { public static final String PARSIMONY_OUTPUT_DOLLO_PRESENT_SECONDARY_FEATURES = "_dollo_present_secondary_features"; public static final String SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO = "_secondary_features_dollo" + ForesterConstants.PHYLO_XML_SUFFIX; - public static final String PARSIMONY_OUTPUT_DOLLO_ALL_GOID_D_BIOLOGICAL_PROCESS = "_dollo_biol_proc_goid_d"; - public static final String PARSIMONY_OUTPUT_DOLLO_ALL_GOID_D_CELLULAR_COMPONENT = "_dollo_cell_comp_goid_d"; - public static final String PARSIMONY_OUTPUT_DOLLO_ALL_GOID_D_MOLECULAR_FUNCTION = "_dollo_mol_funct_goid_d"; public static final String PARSIMONY_OUTPUT_DOLLO_ALL_GOID_D_ALL_NAMESPACES = "_dollo_goid_d"; - public static final String PARSIMONY_OUTPUT_FITCH_ALL_GOID_BC_BIOLOGICAL_PROCESS = "_fitch_biol_proc_goid_dc"; - public static final String PARSIMONY_OUTPUT_FITCH_ALL_GOID_BC_CELLULAR_COMPONENT = "_fitch_cell_comp_goid_dc"; - public static final String PARSIMONY_OUTPUT_FITCH_ALL_GOID_BC_MOLECULAR_FUNCTION = "_fitch_mol_funct_goid_dc"; public static final String PARSIMONY_OUTPUT_FITCH_ALL_GOID_BC_ALL_NAMESPACES = "_fitch_goid_dc"; final static private String HELP_OPTION_1 = "help"; final static private String HELP_OPTION_2 = "h"; @@ -221,8 +213,6 @@ public class surfacing { + ForesterConstants.PHYLO_XML_SUFFIX; final static private String NJ_TREE_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX = "_bin_combinations_NJ" + ForesterConstants.PHYLO_XML_SUFFIX; - final static private String DISPLAY_M_HISTOGRAMS_OPTION = "mhisto"; - // final static private boolean DISPLAY_M_HISTOGRAMS_OPTION_DEFAULT = false; final static private String JACKNIFE_OPTION = "jack"; final static private String JACKNIFE_RANDOM_SEED_OPTION = "seed"; final static private String JACKNIFE_RATIO_OPTION = "jack_ratio"; @@ -239,16 +229,14 @@ public class surfacing { final static private String INPUT_SPECIES_TREE_OPTION = "species_tree"; final static private String SEQ_EXTRACT_OPTION = "prot_extract"; final static private char SEPARATOR_FOR_INPUT_VALUES = '#'; - final static private String PRG_VERSION = "2.003"; - final static private String PRG_DATE = "2010.12.03"; + final static private String PRG_VERSION = "2.100"; + final static private String PRG_DATE = "2011.06.17"; final static private String E_MAIL = "czmasek@burnham.org"; final static private String WWW = "www.phylosoft.org/forester/applications/surfacing"; final static private boolean IGNORE_DUFS_DEFAULT = true; final static private boolean IGNORE_COMBINATION_WITH_SAME_DEFAULLT = false; final static private double MAX_E_VALUE_DEFAULT = -1; final static private int MAX_ALLOWED_OVERLAP_DEFAULT = -1; - final static private String DEFAULT_SEARCH_PARAMETER = "ls"; - final private static boolean VERBOSE_DEFAULT = true; private static final String RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION = "random_seed"; private static final String CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS = "consider_bdc_direction"; private static final String CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS_AND_ADJACENCY = "consider_bdc_adj"; @@ -283,12 +271,8 @@ public class surfacing { private static final INDIVIDUAL_SCORE_CUTOFF INDIVIDUAL_SCORE_CUTOFF_DEFAULT = INDIVIDUAL_SCORE_CUTOFF.FULL_SEQUENCE; public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_counts.txt"; public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists.txt"; + public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping.txt"; - // final String error = ForesterUtil.isReadableFile( new File( - // input_file_properties[ i ][ 0 ] ) ); - // if ( !ForesterUtil.isEmpty( error ) ) { - // ForesterUtil.fatalError( surfacing.PRG_NAME, error ); - // } private static void checkWriteabilityForPairwiseComparisons( final PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option, final String[][] input_file_properties, final String automated_pairwise_comparison_suffix, @@ -353,7 +337,7 @@ public class surfacing { } /** - * Warning: This sideeffects 'all_bin_domain_combinations_encountered'! + * Warning: This side-effects 'all_bin_domain_combinations_encountered'! * * * @param output_file @@ -621,7 +605,6 @@ public class surfacing { allowed_options.add( surfacing.GO_NAMESPACE_LIMIT_OPTION ); allowed_options.add( surfacing.PAIRWISE_DOMAIN_COMPARISONS_OPTION ); allowed_options.add( surfacing.IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_OPTION ); - allowed_options.add( surfacing.DISPLAY_M_HISTOGRAMS_OPTION ); allowed_options.add( surfacing.CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS ); allowed_options.add( JACKNIFE_OPTION ); allowed_options.add( JACKNIFE_RANDOM_SEED_OPTION ); @@ -1336,10 +1319,6 @@ public class surfacing { "no (acceptable) go id to term mapping file provided ('GO OBO file') (-" + surfacing.GO_OBO_FILE_USE_OPTION + "=)" ); } - boolean display_histograms = false; - if ( cla.isOptionSet( surfacing.DISPLAY_M_HISTOGRAMS_OPTION ) ) { - display_histograms = true; - } System.out.println( "Output directory : " + out_dir ); if ( input_file_names_from_file != null ) { System.out.println( "Input files names from : " + input_files_file + " [" @@ -2074,7 +2053,6 @@ public class surfacing { true, surfacing.PAIRWISE_DOMAIN_COMPARISONS_PREFIX, surfacing.PRG_NAME, - display_histograms, out_dir, write_pwc_files ); String matrix_output_file = new String( output_file.toString() ); @@ -2107,16 +2085,6 @@ public class surfacing { inferred_trees.add( nj_gd ); inferred_trees.add( nj_bc ); inferred_trees.add( nj_d ); - // final List histogram_datas = pwgc.getHistogramDatas(); - // if ( infer_species_trees ) { - // inferred_trees = new ArrayList(); - // final List inferred_trees_bc = inferSpeciesTrees( new File( output_file + INFERRED_SBC_BASED_NJ_SPECIES_TREE_SUFFIX ), pwgc - // .getSharedBinaryCombinationsBasedDistances() ); - // final List inferred_trees_d = inferSpeciesTrees( new File( output_file + INFERRED_SD_BASED_NJ_SPECIES_TREE_SUFFIX ), pwgc - // .getSharedDomainsBasedDistances() ); - // inferred_trees.addAll( inferred_trees_bc ); - // inferred_trees.addAll( inferred_trees_d ); - // } if ( jacknifed_distances ) { pwgc.performPairwiseComparisonsJacknifed( species, number_of_genomes, @@ -2146,17 +2114,6 @@ public class surfacing { // + INFERRED_SD_BASED_NJ_SPECIES_TREE_SUFFIX ), pwgc.getSharedDomainsBasedDistances() ); // } } - if ( display_histograms ) { - // final List histogram_datas_all = new ArrayList(); - // histogram_datas_all.add( new HistogramData( "all", - // values_for_all_scores_histogram, - // null, - // 20 ) ); - // final HistogramsFrame hf_all = new HistogramsFrame( histogram_datas_all ); - // final HistogramsFrame hf = new HistogramsFrame( histogram_datas ); - // hf_all.setVisible( true ); - // hf.setVisible( true ); - } } // if ( ( output_file != null ) && ( number_of_genomes > 2 ) && !isEmpty( automated_pairwise_comparison_suffix ) ) if ( ( out_dir != null ) && ( !perform_pwc ) ) { output_file = new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file ); @@ -2433,8 +2390,6 @@ public class surfacing { System.out.println( surfacing.INPUT_SPECIES_TREE_OPTION + ": species tree, to perform (Dollo, Fitch) parismony analyses" ); System.out - .println( surfacing.DISPLAY_M_HISTOGRAMS_OPTION + ": to display multiple histograms (using fluorite)" ); - System.out .println( JACKNIFE_OPTION + ": perform jacknife resampling for domain and binary domain combination based distance matrices [default resamplings: " + JACKNIFE_NUMBER_OF_RESAMPLINGS_DEFAULT + "]" ); diff --git a/forester/java/src/org/forester/application/surfacing_hmmpfam.java b/forester/java/src/org/forester/application/surfacing_hmmpfam.java index eeeb5bb..8a1a107 100644 --- a/forester/java/src/org/forester/application/surfacing_hmmpfam.java +++ b/forester/java/src/org/forester/application/surfacing_hmmpfam.java @@ -2017,7 +2017,6 @@ public class surfacing_hmmpfam { true, surfacing_hmmpfam.PAIRWISE_DOMAIN_COMPARISONS_PREFIX, surfacing_hmmpfam.PRG_NAME, - display_histograms, out_dir, write_pwc_files ); String matrix_output_file = new String( output_file.toString() ); diff --git a/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java b/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java index 789574a..d9ad2aa 100644 --- a/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java +++ b/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java @@ -99,7 +99,6 @@ public class PairwiseGenomeComparator { final boolean verbose, final String automated_pairwise_comparison_prefix, final String command_line_prg_name, - final boolean display_histograms, final File out_dir, final boolean write_pairwise_comparisons ) { init(); @@ -164,7 +163,6 @@ public class PairwiseGenomeComparator { list_of_genome_wide_combinable_domains .get( j ) ); genome_similarity_calculator.setAllowDomainsToBeIgnored( false ); - // TODO make histos for these 5 values double dissimilarity_score_mean; if ( stats.getN() < 1 ) { // No domains in common @@ -229,8 +227,6 @@ public class PairwiseGenomeComparator { + pairwise_similarities_output_file_str + "\" [" + e.getMessage() + "]" ); } } - // pairwise_matrix.setValue( i, j, cdc_list.get( cdc_list.size() - // - 1 ) ); if ( pw_stats != null ) { if ( pw_stats.getMin() >= pw_stats.getMax() ) { ForesterUtil @@ -239,16 +235,6 @@ public class PairwiseGenomeComparator { + pw_stats.getMax() + "], possibly indicating that a genome is compared to itself" ); } - if ( display_histograms && ( pw_stats.getMin() < pw_stats.getMax() ) ) { - //final double[] values = pw_stats.getDataAsDoubleArray(); - // List data_items = new - // ArrayList( values.length ); - // for( int n = 0; n < values.length; i++ ) { - // data_items.add( new BasicHistogramDataItem( "", values[ n ] ) - // ); - // } - //~ _histogram_datas.add( new HistogramData( species_i + "-" + species_j, values, null, 20 ) ); - } } } } diff --git a/forester/java/src/org/forester/surfacing/SurfacingUtil.java b/forester/java/src/org/forester/surfacing/SurfacingUtil.java index 6e58f58..0e200dc 100644 --- a/forester/java/src/org/forester/surfacing/SurfacingUtil.java +++ b/forester/java/src/org/forester/surfacing/SurfacingUtil.java @@ -40,6 +40,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.PriorityQueue; import java.util.Set; import java.util.SortedMap; import java.util.SortedSet; @@ -59,7 +60,6 @@ import org.forester.evoinference.matrix.distance.DistanceMatrix; import org.forester.go.GoId; import org.forester.go.GoNameSpace; import org.forester.go.GoTerm; -import org.forester.go.GoUtils; import org.forester.go.PfamToGoMapping; import org.forester.io.parsers.nexus.NexusConstants; import org.forester.io.writers.PhylogenyWriter; @@ -161,6 +161,69 @@ public final class SurfacingUtil { return stats; } + private static void calculateIndependentDomainCombinationGains( final Phylogeny local_phylogeny_l, + final String outfilename_for_counts, + final String outfilename_for_dc, + final String outfilename_for_dc_for_go_mapping ) { + try { + final BufferedWriter out_counts = new BufferedWriter( new FileWriter( outfilename_for_counts ) ); + final BufferedWriter out_dc = new BufferedWriter( new FileWriter( outfilename_for_dc ) ); + final BufferedWriter out_dc_for_go_mapping = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping ) ); + final SortedMap dc_gain_counts = new TreeMap(); + for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorPostorder(); it.hasNext(); ) { + final PhylogenyNode n = it.next(); + final Set gained_dc = n.getNodeData().getBinaryCharacters().getGainedCharacters(); + for( final String dc : gained_dc ) { + if ( dc_gain_counts.containsKey( dc ) ) { + dc_gain_counts.put( dc, dc_gain_counts.get( dc ) + 1 ); + } + else { + dc_gain_counts.put( dc, 1 ); + } + } + } + final SortedMap histogram = new TreeMap(); + final SortedMap domain_lists = new TreeMap(); + final SortedMap> domain_lists_go = new TreeMap>(); + final Set dcs = dc_gain_counts.keySet(); + for( final String dc : dcs ) { + final int count = dc_gain_counts.get( dc ); + if ( histogram.containsKey( count ) ) { + histogram.put( count, histogram.get( count ) + 1 ); + domain_lists.put( count, domain_lists.get( count ).append( ", " + dc ) ); + domain_lists_go.get( count ).add( dc ); + } + else { + histogram.put( count, 1 ); + domain_lists.put( count, new StringBuilder( dc ) ); + final PriorityQueue q = new PriorityQueue(); + q.add( dc ); + domain_lists_go.put( count, q ); + } + } + final Set histogram_keys = histogram.keySet(); + for( final Integer histogram_key : histogram_keys ) { + final int count = histogram.get( histogram_key ); + final StringBuilder dc = domain_lists.get( histogram_key ); + out_counts.write( histogram_key + "\t" + count + ForesterUtil.LINE_SEPARATOR ); + out_dc.write( histogram_key + "\t" + dc + ForesterUtil.LINE_SEPARATOR ); + } + out_counts.close(); + out_dc.close(); + out_dc_for_go_mapping.close(); + } + catch ( final IOException e ) { + ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e ); + } + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch counts to [" + + outfilename_for_counts + "]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch lists to [" + + outfilename_for_dc + "]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, + "Wrote independent domain combination gains fitch lists to (for GO mapping) [" + + outfilename_for_dc_for_go_mapping + "]" ); + } + public static int calculateOverlap( final Domain domain, final List covered_positions ) { int overlap_count = 0; for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) { @@ -637,62 +700,11 @@ public final class SurfacingUtil { + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH ); calculateIndependentDomainCombinationGains( local_phylogeny_l, outfile_name + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX, outfile_name - + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX ); + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX, outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX ); } } - private static void calculateIndependentDomainCombinationGains( final Phylogeny local_phylogeny_l, - final String outfilename_for_counts, - final String outfilename_for_dc ) { - try { - final BufferedWriter out_counts = new BufferedWriter( new FileWriter( outfilename_for_counts ) ); - final BufferedWriter out_dc = new BufferedWriter( new FileWriter( outfilename_for_dc ) ); - final SortedMap dc_gain_counts = new TreeMap(); - for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorPostorder(); it.hasNext(); ) { - final PhylogenyNode n = it.next(); - final Set gained_dc = n.getNodeData().getBinaryCharacters().getGainedCharacters(); - for( final String dc : gained_dc ) { - if ( dc_gain_counts.containsKey( dc ) ) { - dc_gain_counts.put( dc, dc_gain_counts.get( dc ) + 1 ); - } - else { - dc_gain_counts.put( dc, 1 ); - } - } - } - final SortedMap histogram = new TreeMap(); - final SortedMap domain_lists = new TreeMap(); - final Set dcs = dc_gain_counts.keySet(); - for( final String dc : dcs ) { - final int count = dc_gain_counts.get( dc ); - if ( histogram.containsKey( count ) ) { - histogram.put( count, histogram.get( count ) + 1 ); - domain_lists.put( count, domain_lists.get( count ).append( ", " + dc ) ); - } - else { - histogram.put( count, 1 ); - domain_lists.put( count, new StringBuilder( dc ) ); - } - } - final Set histogram_keys = histogram.keySet(); - for( final Integer histogram_key : histogram_keys ) { - final int count = histogram.get( histogram_key ); - final StringBuilder dc = domain_lists.get( histogram_key ); - out_counts.write( histogram_key + "\t" + count + ForesterUtil.LINE_SEPARATOR ); - out_dc.write( histogram_key + "\t" + dc + ForesterUtil.LINE_SEPARATOR ); - } - out_counts.close(); - out_dc.close(); - } - catch ( final IOException e ) { - ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e ); - } - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch counts to [" - + outfilename_for_counts + "]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch lists to [" - + outfilename_for_dc + "]" ); - } - public static void executeParsimonyAnalysisForSecondaryFeatures( final String outfile_name, final DomainParsimonyCalculator secondary_features_parsimony, final Phylogeny phylogeny, @@ -907,6 +919,95 @@ public final class SurfacingUtil { p.setRooted( true ); } + /* + * species | protein id | n-terminal domain | c-terminal domain | n-terminal domain per domain E-value | c-terminal domain per domain E-value + * + * + */ + static public StringBuffer proteinToDomainCombinations( final Protein protein, + final String protein_id, + final String separator ) { + final StringBuffer sb = new StringBuffer(); + if ( protein.getSpecies() == null ) { + throw new IllegalArgumentException( "species must not be null" ); + } + if ( ForesterUtil.isEmpty( protein.getSpecies().getSpeciesId() ) ) { + throw new IllegalArgumentException( "species id must not be empty" ); + } + final List domains = protein.getProteinDomains(); + if ( domains.size() > 1 ) { + final Map counts = new HashMap(); + for( final Domain domain : domains ) { + final String id = domain.getDomainId().getId(); + if ( counts.containsKey( id ) ) { + counts.put( id, counts.get( id ) + 1 ); + } + else { + counts.put( id, 1 ); + } + } + final Set dcs = new HashSet(); + for( int i = 1; i < domains.size(); ++i ) { + for( int j = 0; j < i; ++j ) { + Domain domain_n = domains.get( i ); + Domain domain_c = domains.get( j ); + if ( domain_n.getFrom() > domain_c.getFrom() ) { + domain_n = domains.get( j ); + domain_c = domains.get( i ); + } + final String dc = domain_n.getDomainId().getId() + domain_c.getDomainId().getId(); + if ( !dcs.contains( dc ) ) { + dcs.add( dc ); + sb.append( protein.getSpecies() ); + sb.append( separator ); + sb.append( protein_id ); + sb.append( separator ); + sb.append( domain_n.getDomainId().getId() ); + sb.append( separator ); + sb.append( domain_c.getDomainId().getId() ); + sb.append( separator ); + sb.append( domain_n.getPerDomainEvalue() ); + sb.append( separator ); + sb.append( domain_c.getPerDomainEvalue() ); + sb.append( separator ); + sb.append( counts.get( domain_n.getDomainId().getId() ) ); + sb.append( separator ); + sb.append( counts.get( domain_c.getDomainId().getId() ) ); + sb.append( ForesterUtil.LINE_SEPARATOR ); + } + } + } + } + else if ( domains.size() == 1 ) { + sb.append( protein.getSpecies() ); + sb.append( separator ); + sb.append( protein_id ); + sb.append( separator ); + sb.append( domains.get( 0 ).getDomainId().getId() ); + sb.append( separator ); + sb.append( separator ); + sb.append( domains.get( 0 ).getPerDomainEvalue() ); + sb.append( separator ); + sb.append( separator ); + sb.append( 1 ); + sb.append( separator ); + sb.append( ForesterUtil.LINE_SEPARATOR ); + } + else { + sb.append( protein.getSpecies() ); + sb.append( separator ); + sb.append( protein_id ); + sb.append( separator ); + sb.append( separator ); + sb.append( separator ); + sb.append( separator ); + sb.append( separator ); + sb.append( separator ); + sb.append( ForesterUtil.LINE_SEPARATOR ); + } + return sb; + } + /** * * Example regarding engulfment: ------------0.1 ----------0.2 --0.3 => @@ -956,7 +1057,7 @@ public final class SurfacingUtil { return pruned_protein; } - static List sortDomainsWithAscendingConfidenceValues( final Protein protein ) { + public static List sortDomainsWithAscendingConfidenceValues( final Protein protein ) { final List domains = new ArrayList(); for( final Domain d : protein.getProteinDomains() ) { domains.add( d ); @@ -1169,95 +1270,6 @@ public final class SurfacingUtil { + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile_dot + "\"" ); } - /* - * species | protein id | n-terminal domain | c-terminal domain | n-terminal domain per domain E-value | c-terminal domain per domain E-value - * - * - */ - static public StringBuffer proteinToDomainCombinations( final Protein protein, - final String protein_id, - final String separator ) { - final StringBuffer sb = new StringBuffer(); - if ( protein.getSpecies() == null ) { - throw new IllegalArgumentException( "species must not be null" ); - } - if ( ForesterUtil.isEmpty( protein.getSpecies().getSpeciesId() ) ) { - throw new IllegalArgumentException( "species id must not be empty" ); - } - final List domains = protein.getProteinDomains(); - if ( domains.size() > 1 ) { - final Map counts = new HashMap(); - for( final Domain domain : domains ) { - final String id = domain.getDomainId().getId(); - if ( counts.containsKey( id ) ) { - counts.put( id, counts.get( id ) + 1 ); - } - else { - counts.put( id, 1 ); - } - } - final Set dcs = new HashSet(); - for( int i = 1; i < domains.size(); ++i ) { - for( int j = 0; j < i; ++j ) { - Domain domain_n = domains.get( i ); - Domain domain_c = domains.get( j ); - if ( domain_n.getFrom() > domain_c.getFrom() ) { - domain_n = domains.get( j ); - domain_c = domains.get( i ); - } - final String dc = domain_n.getDomainId().getId() + domain_c.getDomainId().getId(); - if ( !dcs.contains( dc ) ) { - dcs.add( dc ); - sb.append( protein.getSpecies() ); - sb.append( separator ); - sb.append( protein_id ); - sb.append( separator ); - sb.append( domain_n.getDomainId().getId() ); - sb.append( separator ); - sb.append( domain_c.getDomainId().getId() ); - sb.append( separator ); - sb.append( domain_n.getPerDomainEvalue() ); - sb.append( separator ); - sb.append( domain_c.getPerDomainEvalue() ); - sb.append( separator ); - sb.append( counts.get( domain_n.getDomainId().getId() ) ); - sb.append( separator ); - sb.append( counts.get( domain_c.getDomainId().getId() ) ); - sb.append( ForesterUtil.LINE_SEPARATOR ); - } - } - } - } - else if ( domains.size() == 1 ) { - sb.append( protein.getSpecies() ); - sb.append( separator ); - sb.append( protein_id ); - sb.append( separator ); - sb.append( domains.get( 0 ).getDomainId().getId() ); - sb.append( separator ); - sb.append( separator ); - sb.append( domains.get( 0 ).getPerDomainEvalue() ); - sb.append( separator ); - sb.append( separator ); - sb.append( 1 ); - sb.append( separator ); - sb.append( ForesterUtil.LINE_SEPARATOR ); - } - else { - sb.append( protein.getSpecies() ); - sb.append( separator ); - sb.append( protein_id ); - sb.append( separator ); - sb.append( separator ); - sb.append( separator ); - sb.append( separator ); - sb.append( separator ); - sb.append( separator ); - sb.append( ForesterUtil.LINE_SEPARATOR ); - } - return sb; - } - public static void writeBinaryStatesMatrixAsListToFile( final CharacterStateMatrix matrix, final CharacterStateMatrix.GainLossStates state, final String filename, @@ -1862,111 +1874,6 @@ public final class SurfacingUtil { } } - private static void writeDomainDataORIG( final Map> domain_id_to_go_ids_map, - final Map go_id_to_term_map, - final GoNameSpace go_namespace_limit, - final Writer out, - final String domain_0, - final String domain_1, - final String prefix_for_html, - final String character_separator_for_non_html_output, - final Map>[] domain_id_to_secondary_features_maps, - final Set all_go_ids ) throws IOException { - boolean any_go_annotation_present = false; - boolean first_has_no_go = false; - int domain_count = 2; // To distinguish between domains and binary domain combinations. - if ( ForesterUtil.isEmpty( domain_1 ) ) { - domain_count = 1; - } - // The following has a difficult to understand logic. - for( int d = 0; d < domain_count; ++d ) { - List go_ids = null; - boolean go_annotation_present = false; - if ( d == 0 ) { - final DomainId domain_id = new DomainId( domain_0 ); - if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) { - go_annotation_present = true; - any_go_annotation_present = true; - go_ids = domain_id_to_go_ids_map.get( domain_id ); - } - else { - first_has_no_go = true; - } - } - else { - final DomainId domain_id = new DomainId( domain_1 ); - if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) { - go_annotation_present = true; - any_go_annotation_present = true; - go_ids = domain_id_to_go_ids_map.get( domain_id ); - } - } - if ( go_annotation_present ) { - boolean first = ( ( d == 0 ) || ( ( d == 1 ) && first_has_no_go ) ); - for( final GoId go_id : go_ids ) { - out.write( "" ); - if ( first ) { - first = false; - writeDomainIdsToHtml( out, - domain_0, - domain_1, - prefix_for_html, - domain_id_to_secondary_features_maps ); - } - else { - out.write( "" ); - } - if ( !go_id_to_term_map.containsKey( go_id ) ) { - throw new IllegalArgumentException( "GO-id [" + go_id + "] not found in GO-id to GO-term map" ); - } - final GoTerm go_term = go_id_to_term_map.get( go_id ); - if ( ( go_namespace_limit == null ) || go_namespace_limit.equals( go_term.getGoNameSpace() ) ) { - final String top = GoUtils.getPenultimateGoTerm( go_term, go_id_to_term_map ).getName(); - final String go_id_str = go_id.getId(); - out.write( "" ); - out.write( "" + go_id_str + "" ); - out.write( "" ); - out.write( go_term.getName() ); - if ( domain_count == 2 ) { - out.write( " (" + d + ")" ); - } - out.write( "" ); - out.write( top ); - out.write( "" ); - out.write( "[" ); - out.write( go_term.getGoNameSpace().toShortString() ); - out.write( "]" ); - out.write( "" ); - if ( all_go_ids != null ) { - all_go_ids.add( go_id ); - } - } - else { - out.write( "" ); - out.write( "" ); - out.write( "" ); - out.write( "" ); - out.write( "" ); - } - out.write( "" ); - out.write( SurfacingConstants.NL ); - } - } - } // for( int d = 0; d < domain_count; ++d ) - if ( !any_go_annotation_present ) { - out.write( "" ); - writeDomainIdsToHtml( out, domain_0, domain_1, prefix_for_html, domain_id_to_secondary_features_maps ); - out.write( "" ); - out.write( "" ); - out.write( "" ); - out.write( "" ); - out.write( "" ); - out.write( "" ); - out.write( SurfacingConstants.NL ); - } - } - private static void writeDomainIdsToHtml( final Writer out, final String domain_0, final String domain_1, @@ -1979,91 +1886,6 @@ public final class SurfacingUtil { out.write( " " ); } out.write( "" + domain_0 + "" ); - //if ( ForesterUtil.isEmpty( domain_1 ) ) { - // out.write( " [gs]" ); - //} - // if ( !ForesterUtil.isEmpty( domain_1 ) ) { - // out.write( "=" ); - // out.write( "" + domain_1 + "" ); - //} - // else if ( ( domain_id_to_secondary_features_maps != null ) - // && ( domain_id_to_secondary_features_maps.length > 0 ) ) { - // out.write( " [" ); - // boolean first = true; - // for( final Map> domain_id_to_secondary_features_map : domain_id_to_secondary_features_maps ) { - // final Set sec_features = domain_id_to_secondary_features_map.get( new DomainId( domain_0 ) ); - // if ( ( sec_features != null ) && ( sec_features.size() > 0 ) ) { - // for( final String sec_feature : sec_features ) { - // if ( first ) { - // first = false; - // } - // else { - // out.write( ", " ); - // } - // if ( SurfacingConstants.SECONDARY_FEATURES_ARE_SCOP - // && ( SurfacingConstants.SECONDARY_FEATURES_SCOP_LINK != null ) ) { - // out.write( "" + sec_feature + "" ); - // } - // else { - // out.write( sec_feature ); - // } - // } - // } - // } - // out.write( "]" ); - // } - out.write( "" ); - } - - private static void writeDomainIdsToHtmlORIG( final Writer out, - final String domain_0, - final String domain_1, - final String prefix_for_detailed_html, - final Map>[] domain_id_to_secondary_features_maps ) - throws IOException { - out.write( "" ); - if ( !ForesterUtil.isEmpty( prefix_for_detailed_html ) ) { - out.write( prefix_for_detailed_html ); - out.write( " " ); - } - out.write( "" + domain_0 + "" ); - if ( ForesterUtil.isEmpty( domain_1 ) ) { - out.write( " [gs]" ); - } - if ( !ForesterUtil.isEmpty( domain_1 ) ) { - out.write( "=" ); - out.write( "" + domain_1 + "" ); - } - else if ( ( domain_id_to_secondary_features_maps != null ) - && ( domain_id_to_secondary_features_maps.length > 0 ) ) { - out.write( " [" ); - boolean first = true; - for( final Map> domain_id_to_secondary_features_map : domain_id_to_secondary_features_maps ) { - final Set sec_features = domain_id_to_secondary_features_map.get( new DomainId( domain_0 ) ); - if ( ( sec_features != null ) && ( sec_features.size() > 0 ) ) { - for( final String sec_feature : sec_features ) { - if ( first ) { - first = false; - } - else { - out.write( ", " ); - } - if ( SurfacingConstants.SECONDARY_FEATURES_ARE_SCOP - && ( SurfacingConstants.SECONDARY_FEATURES_SCOP_LINK != null ) ) { - out.write( "" + sec_feature + "" ); - } - else { - out.write( sec_feature ); - } - } - } - } - out.write( "]" ); - } out.write( "" ); } @@ -2391,52 +2213,6 @@ public final class SurfacingUtil { } } - public static void writeTaxonomyLinksORIG( final Writer writer, final String species ) throws IOException { - if ( ( species.length() > 1 ) && ( species.indexOf( '_' ) < 1 ) ) { - final Matcher matcher = PATTERN_SP_STYLE_TAXONOMY.matcher( species ); - writer.write( " [" ); - if ( matcher.matches() ) { - writer.write( "uniprot" ); - } - else { - writer.write( "eol" ); - writer.write( "|" ); - writer.write( "tol" ); - writer.write( "|" ); - writer.write( "wikipedia" ); - writer.write( "|" ); - writer.write( "gs" ); - } - writer.write( "]" ); - } - } - - private static void writeToNexus( final String outfile_name, final CharacterStateMatrix matrix ) { - if ( !( matrix instanceof BasicCharacterStateMatrix ) ) { - throw new IllegalArgumentException( "can only write matrices of type [" + BasicCharacterStateMatrix.class - + "] to nexus" ); - } - final BasicCharacterStateMatrix my_matrix = ( org.forester.evoinference.matrix.character.BasicCharacterStateMatrix ) matrix; - try { - final BufferedWriter w = new BufferedWriter( new FileWriter( outfile_name ) ); - w.write( NexusConstants.NEXUS ); - w.write( ForesterUtil.LINE_SEPARATOR ); - my_matrix.writeNexusTaxaBlock( w ); - my_matrix.writeNexusBinaryChractersBlock( w ); - w.flush(); - w.close(); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote Nexus file: \"" + outfile_name + "\"" ); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); - } - } - private static void writeToNexus( final String outfile_name, final CharacterStateMatrix matrix, final Phylogeny phylogeny ) { @@ -2463,13 +2239,6 @@ public final class SurfacingUtil { } } - private static void writeToNexus( final String outfile_name, final DomainParsimonyCalculator domain_parsimony ) { - writeToNexus( outfile_name + surfacing.NEXUS_EXTERNAL_DOMAINS, - domain_parsimony.createMatrixOfDomainPresenceOrAbsence() ); - writeToNexus( outfile_name + surfacing.NEXUS_EXTERNAL_DOMAIN_COMBINATIONS, - domain_parsimony.createMatrixOfBinaryDomainCombinationPresenceOrAbsence() ); - } - private static void writeToNexus( final String outfile_name, final DomainParsimonyCalculator domain_parsimony, final Phylogeny phylogeny ) { -- 1.7.10.2