3 // FORESTER -- software libraries and applications
4 // for evolutionary biology research and applications.
6 // Copyright (C) 2008-2009 Christian M. Zmasek
7 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 // Contact: phylosoft @ gmail . com
25 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
27 package org.forester.surfacing;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.forester.application.surfacing;
import org.forester.evoinference.distance.NeighborJoining;
import org.forester.evoinference.matrix.character.BasicCharacterStateMatrix;
import org.forester.evoinference.matrix.character.CharacterStateMatrix;
import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates;
import org.forester.evoinference.matrix.character.CharacterStateMatrix.Format;
import org.forester.evoinference.matrix.character.CharacterStateMatrix.GainLossStates;
import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
import org.forester.evoinference.matrix.distance.DistanceMatrix;
import org.forester.go.GoId;
import org.forester.go.GoNameSpace;
import org.forester.go.GoTerm;
import org.forester.go.PfamToGoMapping;
import org.forester.io.parsers.nexus.NexusConstants;
import org.forester.io.writers.PhylogenyWriter;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.PhylogenyMethods;
import org.forester.phylogeny.PhylogenyNode;
import org.forester.phylogeny.PhylogenyNode.NH_CONVERSION_SUPPORT_VALUE_STYLE;
import org.forester.phylogeny.data.BinaryCharacters;
import org.forester.phylogeny.data.Confidence;
import org.forester.phylogeny.data.Taxonomy;
import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
import org.forester.protein.BasicDomain;
import org.forester.protein.BasicProtein;
import org.forester.protein.BinaryDomainCombination;
import org.forester.protein.Domain;
import org.forester.protein.Protein;
import org.forester.species.Species;
import org.forester.surfacing.DomainSimilarityCalculator.Detailedness;
import org.forester.surfacing.GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder;
import org.forester.util.AsciiHistogram;
import org.forester.util.BasicDescriptiveStatistics;
import org.forester.util.BasicTable;
import org.forester.util.BasicTableParser;
import org.forester.util.DescriptiveStatistics;
import org.forester.util.ForesterUtil;
93 public final class SurfacingUtil {
95 private final static NumberFormat FORMATTER_3 = new DecimalFormat( "0.000" );
96 private static final Comparator<Domain> ASCENDING_CONFIDENCE_VALUE_ORDER = new Comparator<Domain>() {
99 public int compare( final Domain d1,
101 if ( d1.getPerSequenceEvalue() < d2
102 .getPerSequenceEvalue() ) {
106 .getPerSequenceEvalue() > d2
107 .getPerSequenceEvalue() ) {
111 return d1.compareTo( d2 );
115 public final static Pattern PATTERN_SP_STYLE_TAXONOMY = Pattern.compile( "^[A-Z0-9]{3,5}$" );
117 private SurfacingUtil() {
118 // Hidden constructor.
121 public static void addAllBinaryDomainCombinationToSet( final GenomeWideCombinableDomains genome,
122 final SortedSet<BinaryDomainCombination> binary_domain_combinations ) {
123 final SortedMap<String, CombinableDomains> all_cd = genome.getAllCombinableDomainsIds();
124 for( final String domain_id : all_cd.keySet() ) {
125 binary_domain_combinations.addAll( all_cd.get( domain_id ).toBinaryDomainCombinations() );
129 public static void addAllDomainIdsToSet( final GenomeWideCombinableDomains genome,
130 final SortedSet<String> domain_ids ) {
131 final SortedSet<String> domains = genome.getAllDomainIds();
132 for( final String domain : domains ) {
133 domain_ids.add( domain );
137 public static void addHtmlHead( final Writer w, final String title ) throws IOException {
138 w.write( SurfacingConstants.NL );
140 w.write( "<title>" );
142 w.write( "</title>" );
143 w.write( SurfacingConstants.NL );
144 w.write( "<style>" );
145 w.write( SurfacingConstants.NL );
146 w.write( "a:visited { color : #6633FF; text-decoration : none; }" );
147 w.write( SurfacingConstants.NL );
148 w.write( "a:link { color : #6633FF; text-decoration : none; }" );
149 w.write( SurfacingConstants.NL );
150 w.write( "a:active { color : #99FF00; text-decoration : none; }" );
151 w.write( SurfacingConstants.NL );
152 w.write( "a:hover { color : #FFFFFF; background-color : #99FF00; text-decoration : none; }" );
153 w.write( SurfacingConstants.NL );
154 w.write( "td { text-align: left; vertical-align: top; font-family: Verdana, Arial, Helvetica; font-size: 8pt}" );
155 w.write( SurfacingConstants.NL );
156 w.write( "h1 { color : #0000FF; font-family: Verdana, Arial, Helvetica; font-size: 18pt; font-weight: bold }" );
157 w.write( SurfacingConstants.NL );
158 w.write( "h2 { color : #0000FF; font-family: Verdana, Arial, Helvetica; font-size: 16pt; font-weight: bold }" );
159 w.write( SurfacingConstants.NL );
160 w.write( "</style>" );
161 w.write( SurfacingConstants.NL );
162 w.write( "</head>" );
163 w.write( SurfacingConstants.NL );
166 public static DescriptiveStatistics calculateDescriptiveStatisticsForMeanValues( final Set<DomainSimilarity> similarities ) {
167 final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
168 for( final DomainSimilarity similarity : similarities ) {
169 stats.addValue( similarity.getMeanSimilarityScore() );
174 public static int calculateOverlap( final Domain domain, final List<Boolean> covered_positions ) {
175 int overlap_count = 0;
176 for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) {
177 if ( ( i < covered_positions.size() ) && ( covered_positions.get( i ) == true ) ) {
181 return overlap_count;
184 public static void checkForOutputFileWriteability( final File outfile ) {
185 final String error = ForesterUtil.isWritableFile( outfile );
186 if ( !ForesterUtil.isEmpty( error ) ) {
187 ForesterUtil.fatalError( surfacing.PRG_NAME, error );
191 public static void collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( final CharacterStateMatrix<CharacterStateMatrix.GainLossStates> matrix,
192 final BinaryDomainCombination.DomainCombinationType dc_type,
193 final List<BinaryDomainCombination> all_binary_domains_combination_gained,
194 final boolean get_gains ) {
195 final SortedSet<String> sorted_ids = new TreeSet<String>();
196 for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) {
197 sorted_ids.add( matrix.getIdentifier( i ) );
199 for( final String id : sorted_ids ) {
200 for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) {
201 if ( ( get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) )
202 || ( !get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.LOSS ) ) ) {
203 if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT ) {
204 all_binary_domains_combination_gained.add( AdjactantDirectedBinaryDomainCombination
205 .createInstance( matrix.getCharacter( c ) ) );
207 else if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED ) {
208 all_binary_domains_combination_gained.add( DirectedBinaryDomainCombination
209 .createInstance( matrix.getCharacter( c ) ) );
212 all_binary_domains_combination_gained.add( BasicBinaryDomainCombination.createInstance( matrix
213 .getCharacter( c ) ) );
220 public static Map<String, List<GoId>> createDomainIdToGoIdMap( final List<PfamToGoMapping> pfam_to_go_mappings ) {
221 final Map<String, List<GoId>> domain_id_to_go_ids_map = new HashMap<String, List<GoId>>( pfam_to_go_mappings.size() );
222 for( final PfamToGoMapping pfam_to_go : pfam_to_go_mappings ) {
223 if ( !domain_id_to_go_ids_map.containsKey( pfam_to_go.getKey() ) ) {
224 domain_id_to_go_ids_map.put( pfam_to_go.getKey(), new ArrayList<GoId>() );
226 domain_id_to_go_ids_map.get( pfam_to_go.getKey() ).add( pfam_to_go.getValue() );
228 return domain_id_to_go_ids_map;
231 public static Map<String, Set<String>> createDomainIdToSecondaryFeaturesMap( final File secondary_features_map_file )
233 final BasicTable<String> primary_table = BasicTableParser.parse( secondary_features_map_file, '\t' );
234 final Map<String, Set<String>> map = new TreeMap<String, Set<String>>();
235 for( int r = 0; r < primary_table.getNumberOfRows(); ++r ) {
236 final String domain_id = primary_table.getValue( 0, r );
237 if ( !map.containsKey( domain_id ) ) {
238 map.put( domain_id, new HashSet<String>() );
240 map.get( domain_id ).add( primary_table.getValue( 1, r ) );
245 public static Phylogeny createNjTreeBasedOnMatrixToFile( final File nj_tree_outfile, final DistanceMatrix distance ) {
246 checkForOutputFileWriteability( nj_tree_outfile );
247 final NeighborJoining nj = NeighborJoining.createInstance();
248 final Phylogeny phylogeny = nj.execute( ( BasicSymmetricalDistanceMatrix ) distance );
249 phylogeny.setName( nj_tree_outfile.getName() );
250 writePhylogenyToFile( phylogeny, nj_tree_outfile.toString() );
254 public static Map<String, Integer> createTaxCodeToIdMap( final Phylogeny phy ) {
255 final Map<String, Integer> m = new HashMap<String, Integer>();
256 for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) {
257 final PhylogenyNode n = iter.next();
258 if ( n.getNodeData().isHasTaxonomy() ) {
259 final Taxonomy t = n.getNodeData().getTaxonomy();
260 final String c = t.getTaxonomyCode();
261 if ( !ForesterUtil.isEmpty( c ) ) {
262 if ( n.getNodeData().getTaxonomy() == null ) {
263 ForesterUtil.fatalError( surfacing.PRG_NAME, "no taxonomy id for node " + n );
265 final String id = n.getNodeData().getTaxonomy().getIdentifier().getValue();
266 if ( ForesterUtil.isEmpty( id ) ) {
267 ForesterUtil.fatalError( surfacing.PRG_NAME, "no taxonomy id for node " + n );
269 if ( m.containsKey( c ) ) {
270 ForesterUtil.fatalError( surfacing.PRG_NAME, "taxonomy code " + c + " is not unique" );
272 final int iid = Integer.valueOf( id );
273 if ( m.containsValue( iid ) ) {
274 ForesterUtil.fatalError( surfacing.PRG_NAME, "taxonomy id " + iid + " is not unique" );
280 ForesterUtil.fatalError( surfacing.PRG_NAME, "no taxonomy for node " + n );
286 public static void decoratePrintableDomainSimilarities( final SortedSet<DomainSimilarity> domain_similarities,
287 final Detailedness detailedness ) {
288 for( final DomainSimilarity domain_similarity : domain_similarities ) {
289 if ( domain_similarity instanceof PrintableDomainSimilarity ) {
290 final PrintableDomainSimilarity printable_domain_similarity = ( PrintableDomainSimilarity ) domain_similarity;
291 printable_domain_similarity.setDetailedness( detailedness );
296 public static void doit( final List<Protein> proteins,
297 final List<String> query_domain_ids_nc_order,
299 final String separator,
300 final String limit_to_species,
301 final Map<String, List<Integer>> average_protein_lengths_by_dc ) throws IOException {
302 for( final Protein protein : proteins ) {
303 if ( ForesterUtil.isEmpty( limit_to_species )
304 || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) {
305 if ( protein.contains( query_domain_ids_nc_order, true ) ) {
306 out.write( protein.getSpecies().getSpeciesId() );
307 out.write( separator );
308 out.write( protein.getProteinId().getId() );
309 out.write( separator );
311 final Set<String> visited_domain_ids = new HashSet<String>();
312 boolean first = true;
313 for( final Domain domain : protein.getProteinDomains() ) {
314 if ( !visited_domain_ids.contains( domain.getDomainId() ) ) {
315 visited_domain_ids.add( domain.getDomainId() );
322 out.write( domain.getDomainId() );
324 out.write( "" + domain.getTotalCount() );
329 out.write( separator );
330 if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription()
331 .equals( SurfacingConstants.NONE ) ) ) {
332 out.write( protein.getDescription() );
334 out.write( separator );
335 if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession()
336 .equals( SurfacingConstants.NONE ) ) ) {
337 out.write( protein.getAccession() );
339 out.write( SurfacingConstants.NL );
346 public static void domainsPerProteinsStatistics( final String genome,
347 final List<Protein> protein_list,
348 final DescriptiveStatistics all_genomes_domains_per_potein_stats,
349 final SortedMap<Integer, Integer> all_genomes_domains_per_potein_histo,
350 final SortedSet<String> domains_which_are_always_single,
351 final SortedSet<String> domains_which_are_sometimes_single_sometimes_not,
352 final SortedSet<String> domains_which_never_single,
353 final Writer writer ) {
354 final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
355 for( final Protein protein : protein_list ) {
356 final int domains = protein.getNumberOfProteinDomains();
357 //System.out.println( domains );
358 stats.addValue( domains );
359 all_genomes_domains_per_potein_stats.addValue( domains );
360 if ( !all_genomes_domains_per_potein_histo.containsKey( domains ) ) {
361 all_genomes_domains_per_potein_histo.put( domains, 1 );
364 all_genomes_domains_per_potein_histo.put( domains,
365 1 + all_genomes_domains_per_potein_histo.get( domains ) );
367 if ( domains == 1 ) {
368 final String domain = protein.getProteinDomain( 0 ).getDomainId();
369 if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) {
370 if ( domains_which_never_single.contains( domain ) ) {
371 domains_which_never_single.remove( domain );
372 domains_which_are_sometimes_single_sometimes_not.add( domain );
375 domains_which_are_always_single.add( domain );
379 else if ( domains > 1 ) {
380 for( final Domain d : protein.getProteinDomains() ) {
381 final String domain = d.getDomainId();
382 // System.out.println( domain );
383 if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) {
384 if ( domains_which_are_always_single.contains( domain ) ) {
385 domains_which_are_always_single.remove( domain );
386 domains_which_are_sometimes_single_sometimes_not.add( domain );
389 domains_which_never_single.add( domain );
396 writer.write( genome );
397 writer.write( "\t" );
398 if ( stats.getN() >= 1 ) {
399 writer.write( stats.arithmeticMean() + "" );
400 writer.write( "\t" );
401 if ( stats.getN() >= 2 ) {
402 writer.write( stats.sampleStandardDeviation() + "" );
407 writer.write( "\t" );
408 writer.write( stats.median() + "" );
409 writer.write( "\t" );
410 writer.write( stats.getN() + "" );
411 writer.write( "\t" );
412 writer.write( stats.getMin() + "" );
413 writer.write( "\t" );
414 writer.write( stats.getMax() + "" );
417 writer.write( "\t" );
418 writer.write( "\t" );
419 writer.write( "\t" );
421 writer.write( "\t" );
422 writer.write( "\t" );
424 writer.write( "\n" );
426 catch ( final IOException e ) {
431 public static void executeDomainLengthAnalysis( final String[][] input_file_properties,
432 final int number_of_genomes,
433 final DomainLengthsTable domain_lengths_table,
434 final File outfile ) throws IOException {
435 final DecimalFormat df = new DecimalFormat( "#.00" );
436 checkForOutputFileWriteability( outfile );
437 final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
438 out.write( "MEAN BASED STATISTICS PER SPECIES" );
439 out.write( ForesterUtil.LINE_SEPARATOR );
440 out.write( domain_lengths_table.createMeanBasedStatisticsPerSpeciesTable().toString() );
441 out.write( ForesterUtil.LINE_SEPARATOR );
442 out.write( ForesterUtil.LINE_SEPARATOR );
443 final List<DomainLengths> domain_lengths_list = domain_lengths_table.getDomainLengthsList();
444 out.write( "OUTLIER SPECIES PER DOMAIN (Z>=1.5)" );
445 out.write( ForesterUtil.LINE_SEPARATOR );
446 for( final DomainLengths domain_lengths : domain_lengths_list ) {
447 final List<Species> species_list = domain_lengths.getMeanBasedOutlierSpecies( 1.5 );
448 if ( species_list.size() > 0 ) {
449 out.write( domain_lengths.getDomainId() + "\t" );
450 for( final Species species : species_list ) {
451 out.write( species + "\t" );
453 out.write( ForesterUtil.LINE_SEPARATOR );
454 // DescriptiveStatistics stats_for_domain = domain_lengths
455 // .calculateMeanBasedStatistics();
456 //AsciiHistogram histo = new AsciiHistogram( stats_for_domain );
457 //System.out.println( histo.toStringBuffer( 40, '=', 60, 4 ).toString() );
460 out.write( ForesterUtil.LINE_SEPARATOR );
461 out.write( ForesterUtil.LINE_SEPARATOR );
462 out.write( "OUTLIER SPECIES (Z 1.0)" );
463 out.write( ForesterUtil.LINE_SEPARATOR );
464 final DescriptiveStatistics stats_for_all_species = domain_lengths_table
465 .calculateMeanBasedStatisticsForAllSpecies();
466 out.write( stats_for_all_species.asSummary() );
467 out.write( ForesterUtil.LINE_SEPARATOR );
468 final AsciiHistogram histo = new AsciiHistogram( stats_for_all_species );
469 out.write( histo.toStringBuffer( 40, '=', 60, 4 ).toString() );
470 out.write( ForesterUtil.LINE_SEPARATOR );
471 final double population_sd = stats_for_all_species.sampleStandardDeviation();
472 final double population_mean = stats_for_all_species.arithmeticMean();
473 for( final Species species : domain_lengths_table.getSpecies() ) {
474 final double x = domain_lengths_table.calculateMeanBasedStatisticsForSpecies( species ).arithmeticMean();
475 final double z = ( x - population_mean ) / population_sd;
476 out.write( species + "\t" + z );
477 out.write( ForesterUtil.LINE_SEPARATOR );
479 out.write( ForesterUtil.LINE_SEPARATOR );
480 for( final Species species : domain_lengths_table.getSpecies() ) {
481 final DescriptiveStatistics stats_for_species = domain_lengths_table
482 .calculateMeanBasedStatisticsForSpecies( species );
483 final double x = stats_for_species.arithmeticMean();
484 final double z = ( x - population_mean ) / population_sd;
485 if ( ( z <= -1.0 ) || ( z >= 1.0 ) ) {
486 out.write( species + "\t" + df.format( z ) + "\t" + stats_for_species.asSummary() );
487 out.write( ForesterUtil.LINE_SEPARATOR );
491 // final List<HistogramData> histogram_datas = new ArrayList<HistogramData>();
492 // for( int i = 0; i < number_of_genomes; ++i ) {
493 // final Species species = new BasicSpecies( input_file_properties[ i ][ 0 ] );
495 // .add( new HistogramData( species.toString(), domain_lengths_table
496 // .calculateMeanBasedStatisticsForSpecies( species )
497 // .getDataAsDoubleArray(), 5, 600, null, 60 ) );
499 // final HistogramsFrame hf = new HistogramsFrame( histogram_datas );
500 // hf.setVisible( true );
506 * @param all_binary_domains_combination_lost_fitch
507 * @param use_last_in_fitch_parsimony
508 * @param consider_directedness_and_adjacency_for_bin_combinations
509 * @param all_binary_domains_combination_gained if null ignored, otherwise this is to list all binary domain combinations
510 * which were gained under unweighted (Fitch) parsimony.
512 public static void executeParsimonyAnalysis( final long random_number_seed_for_fitch_parsimony,
513 final boolean radomize_fitch_parsimony,
514 final String outfile_name,
515 final DomainParsimonyCalculator domain_parsimony,
516 final Phylogeny phylogeny,
517 final Map<String, List<GoId>> domain_id_to_go_ids_map,
518 final Map<GoId, GoTerm> go_id_to_term_map,
519 final GoNameSpace go_namespace_limit,
520 final String parameters_str,
521 final Map<String, Set<String>>[] domain_id_to_secondary_features_maps,
522 final SortedSet<String> positive_filter,
523 final boolean output_binary_domain_combinations_for_graphs,
524 final List<BinaryDomainCombination> all_binary_domains_combination_gained_fitch,
525 final List<BinaryDomainCombination> all_binary_domains_combination_lost_fitch,
526 final BinaryDomainCombination.DomainCombinationType dc_type,
527 final Map<String, DescriptiveStatistics> protein_length_stats_by_dc,
528 final Map<String, DescriptiveStatistics> domain_number_stats_by_dc,
529 final Map<String, DescriptiveStatistics> domain_length_stats_by_domain,
530 final Map<String, Integer> tax_code_to_id_map,
531 final boolean write_to_nexus,
532 final boolean use_last_in_fitch_parsimony ) {
533 final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR;
534 final String date_time = ForesterUtil.getCurrentDateTime();
535 final SortedSet<String> all_pfams_encountered = new TreeSet<String>();
536 final SortedSet<String> all_pfams_gained_as_domains = new TreeSet<String>();
537 final SortedSet<String> all_pfams_lost_as_domains = new TreeSet<String>();
538 final SortedSet<String> all_pfams_gained_as_dom_combinations = new TreeSet<String>();
539 final SortedSet<String> all_pfams_lost_as_dom_combinations = new TreeSet<String>();
540 if ( write_to_nexus ) {
541 writeToNexus( outfile_name, domain_parsimony, phylogeny );
545 Phylogeny local_phylogeny_l = phylogeny.copy();
546 if ( ( positive_filter != null ) && ( positive_filter.size() > 0 ) ) {
547 domain_parsimony.executeDolloParsimonyOnDomainPresence( positive_filter );
550 domain_parsimony.executeDolloParsimonyOnDomainPresence();
552 SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossMatrix(), outfile_name
553 + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_DOMAINS, Format.FORESTER );
554 SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossCountsMatrix(), outfile_name
555 + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_DOMAINS, Format.FORESTER );
556 SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
557 CharacterStateMatrix.GainLossStates.GAIN,
558 outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_D,
560 ForesterUtil.LINE_SEPARATOR,
562 SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
563 CharacterStateMatrix.GainLossStates.LOSS,
564 outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_LOSSES_D,
566 ForesterUtil.LINE_SEPARATOR,
568 SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), null, outfile_name
569 + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_D, sep, ForesterUtil.LINE_SEPARATOR, null );
571 writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
575 domain_parsimony.getGainLossMatrix(),
576 CharacterStateMatrix.GainLossStates.GAIN,
577 outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_HTML_D,
579 ForesterUtil.LINE_SEPARATOR,
580 "Dollo Parsimony | Gains | Domains",
582 domain_id_to_secondary_features_maps,
583 all_pfams_encountered,
584 all_pfams_gained_as_domains,
586 tax_code_to_id_map );
587 writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
591 domain_parsimony.getGainLossMatrix(),
592 CharacterStateMatrix.GainLossStates.LOSS,
593 outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_LOSSES_HTML_D,
595 ForesterUtil.LINE_SEPARATOR,
596 "Dollo Parsimony | Losses | Domains",
598 domain_id_to_secondary_features_maps,
599 all_pfams_encountered,
600 all_pfams_lost_as_domains,
602 tax_code_to_id_map );
603 // writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
604 // go_id_to_term_map,
605 // go_namespace_limit,
607 // domain_parsimony.getGainLossMatrix(),
609 // outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_HTML_D,
611 // ForesterUtil.LINE_SEPARATOR,
612 // "Dollo Parsimony | Present | Domains",
614 // domain_id_to_secondary_features_maps,
615 // all_pfams_encountered,
617 // "_dollo_present_d",
618 // tax_code_to_id_map );
619 preparePhylogeny( local_phylogeny_l,
622 "Dollo parsimony on domain presence/absence",
623 "dollo_on_domains_" + outfile_name,
625 SurfacingUtil.writePhylogenyToFile( local_phylogeny_l, outfile_name
626 + surfacing.DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO );
628 writeAllDomainsChangedOnAllSubtrees( local_phylogeny_l, true, outfile_name, "_dollo_all_gains_d" );
629 writeAllDomainsChangedOnAllSubtrees( local_phylogeny_l, false, outfile_name, "_dollo_all_losses_d" );
631 catch ( final IOException e ) {
633 ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
635 if ( domain_parsimony.calculateNumberOfBinaryDomainCombination() > 0 ) {
636 // FITCH DOMAIN COMBINATIONS
637 // -------------------------
638 local_phylogeny_l = phylogeny.copy();
639 String randomization = "no";
640 if ( radomize_fitch_parsimony ) {
641 domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( random_number_seed_for_fitch_parsimony );
642 randomization = "yes, seed = " + random_number_seed_for_fitch_parsimony;
645 domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( use_last_in_fitch_parsimony );
647 SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossMatrix(), outfile_name
648 + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER );
649 SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossCountsMatrix(), outfile_name
650 + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER );
652 .writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
653 CharacterStateMatrix.GainLossStates.GAIN,
654 outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_GAINS_BC,
656 ForesterUtil.LINE_SEPARATOR,
658 SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
659 CharacterStateMatrix.GainLossStates.LOSS,
661 + surfacing.PARSIMONY_OUTPUT_FITCH_LOSSES_BC,
663 ForesterUtil.LINE_SEPARATOR,
665 SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), null, outfile_name
666 + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_BC, sep, ForesterUtil.LINE_SEPARATOR, null );
667 if ( all_binary_domains_combination_gained_fitch != null ) {
668 collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
670 all_binary_domains_combination_gained_fitch,
673 if ( all_binary_domains_combination_lost_fitch != null ) {
674 collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
676 all_binary_domains_combination_lost_fitch,
679 if ( output_binary_domain_combinations_for_graphs ) {
681 .writeBinaryStatesMatrixAsListToFileForBinaryCombinationsForGraphAnalysis( domain_parsimony
682 .getGainLossMatrix(),
685 + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_BC_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS,
687 ForesterUtil.LINE_SEPARATOR,
688 BinaryDomainCombination.OutputFormat.DOT );
691 writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
695 domain_parsimony.getGainLossMatrix(),
696 CharacterStateMatrix.GainLossStates.GAIN,
697 outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_GAINS_HTML_BC,
699 ForesterUtil.LINE_SEPARATOR,
700 "Fitch Parsimony | Gains | Domain Combinations",
703 all_pfams_encountered,
704 all_pfams_gained_as_dom_combinations,
706 tax_code_to_id_map );
707 writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
711 domain_parsimony.getGainLossMatrix(),
712 CharacterStateMatrix.GainLossStates.LOSS,
713 outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_LOSSES_HTML_BC,
715 ForesterUtil.LINE_SEPARATOR,
716 "Fitch Parsimony | Losses | Domain Combinations",
719 all_pfams_encountered,
720 all_pfams_lost_as_dom_combinations,
722 tax_code_to_id_map );
723 // writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
724 // go_id_to_term_map,
725 // go_namespace_limit,
727 // domain_parsimony.getGainLossMatrix(),
729 // outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_HTML_BC,
731 // ForesterUtil.LINE_SEPARATOR,
732 // "Fitch Parsimony | Present | Domain Combinations",
735 // all_pfams_encountered,
737 // "_fitch_present_dc",
738 // tax_code_to_id_map );
739 writeAllEncounteredPfamsToFile( domain_id_to_go_ids_map,
742 all_pfams_encountered );
743 writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_GAINED_AS_DOMAINS_SUFFIX, all_pfams_gained_as_domains );
744 writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_LOST_AS_DOMAINS_SUFFIX, all_pfams_lost_as_domains );
745 writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_GAINED_AS_DC_SUFFIX,
746 all_pfams_gained_as_dom_combinations );
747 writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_LOST_AS_DC_SUFFIX, all_pfams_lost_as_dom_combinations );
748 preparePhylogeny( local_phylogeny_l,
751 "Fitch parsimony on binary domain combination presence/absence randomization: "
753 "fitch_on_binary_domain_combinations_" + outfile_name,
755 SurfacingUtil.writePhylogenyToFile( local_phylogeny_l, outfile_name
756 + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH );
757 calculateIndependentDomainCombinationGains( local_phylogeny_l,
759 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX,
761 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX,
763 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX,
765 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX,
766 outfile_name + "_indep_dc_gains_fitch_lca_ranks.txt",
767 outfile_name + "_indep_dc_gains_fitch_lca_taxonomies.txt",
768 outfile_name + "_indep_dc_gains_fitch_protein_statistics.txt",
769 protein_length_stats_by_dc,
770 domain_number_stats_by_dc,
771 domain_length_stats_by_domain );
// Runs parsimony analyses on secondary-feature data over copies of the given phylogeny:
// first Dollo parsimony on secondary-feature presence/absence, then Fitch parsimony on
// secondary binary domain combinations. Writes gain/loss matrices, gain/loss/present
// character lists, decorated trees, and independent-DC-gain summaries to files whose
// names are derived from outfile_name.
// NOTE(review): this excerpt has elided lines (gaps in the embedded numbering), so some
// arguments and statements are not visible here.
775 public static void executeParsimonyAnalysisForSecondaryFeatures( final String outfile_name,
776 final DomainParsimonyCalculator secondary_features_parsimony,
777 final Phylogeny phylogeny,
778 final String parameters_str,
779 final Map<Species, MappingResults> mapping_results_map,
780 final boolean use_last_in_fitch_parsimony ) {
781 final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR;
782 final String date_time = ForesterUtil.getCurrentDateTime();
783 System.out.println();
// Write the presence/absence matrix in NEXUS format before running parsimony.
784 writeToNexus( outfile_name + surfacing.NEXUS_SECONDARY_FEATURES,
785 secondary_features_parsimony.createMatrixOfSecondaryFeaturePresenceOrAbsence( null ),
// DOLLO on secondary features: work on a copy so the caller's phylogeny is untouched.
787 Phylogeny local_phylogeny_copy = phylogeny.copy();
788 secondary_features_parsimony.executeDolloParsimonyOnSecondaryFeatures( mapping_results_map );
789 SurfacingUtil.writeMatrixToFile( secondary_features_parsimony.getGainLossMatrix(), outfile_name
790 + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER );
791 SurfacingUtil.writeMatrixToFile( secondary_features_parsimony.getGainLossCountsMatrix(), outfile_name
792 + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER );
// Dump per-state (gain / loss / present) character lists from the Dollo gain-loss matrix.
794 .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(),
795 CharacterStateMatrix.GainLossStates.GAIN,
797 + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_SECONDARY_FEATURES,
799 ForesterUtil.LINE_SEPARATOR,
802 .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(),
803 CharacterStateMatrix.GainLossStates.LOSS,
805 + surfacing.PARSIMONY_OUTPUT_DOLLO_LOSSES_SECONDARY_FEATURES,
807 ForesterUtil.LINE_SEPARATOR,
810 .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(),
813 + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_SECONDARY_FEATURES,
815 ForesterUtil.LINE_SEPARATOR,
// Decorate the copy with the Dollo results and write it out.
817 preparePhylogeny( local_phylogeny_copy,
818 secondary_features_parsimony,
820 "Dollo parsimony on secondary feature presence/absence",
821 "dollo_on_secondary_features_" + outfile_name,
823 SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name
824 + surfacing.SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO );
825 // FITCH DOMAIN COMBINATIONS
826 // -------------------------
827 local_phylogeny_copy = phylogeny.copy();
828 final String randomization = "no";
829 secondary_features_parsimony
830 .executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( use_last_in_fitch_parsimony );
831 preparePhylogeny( local_phylogeny_copy,
832 secondary_features_parsimony,
834 "Fitch parsimony on secondary binary domain combination presence/absence randomization: "
836 "fitch_on_binary_domain_combinations_" + outfile_name,
838 SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name
839 + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH_MAPPED );
// Summarize independently gained domain combinations on the Fitch-decorated tree.
// Trailing nulls: no protein-length / domain-number / domain-length statistics collected here.
840 calculateIndependentDomainCombinationGains( local_phylogeny_copy, outfile_name
841 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_MAPPED_OUTPUT_SUFFIX, outfile_name
842 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX, outfile_name
843 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX, outfile_name
844 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX, outfile_name
845 + "_MAPPED_indep_dc_gains_fitch_lca_ranks.txt", outfile_name
846 + "_MAPPED_indep_dc_gains_fitch_lca_taxonomies.txt", null, null, null, null );
// Writes one line per protein that contains the given domains in the given N-to-C order
// (protein.contains(..., true)): species id, protein id, each distinct domain id with its
// total count, then description and accession when present (skipping "NONE").
// An empty limit_to_species means "all species"; otherwise species ids are compared
// case-insensitively.
// NOTE(review): the parameter at elided line 851 is presumably the Writer 'out' used
// below — confirm against the full source.
849 public static void extractProteinNames( final List<Protein> proteins,
850 final List<String> query_domain_ids_nc_order,
852 final String separator,
853 final String limit_to_species ) throws IOException {
854 for( final Protein protein : proteins ) {
855 if ( ForesterUtil.isEmpty( limit_to_species )
856 || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) {
857 if ( protein.contains( query_domain_ids_nc_order, true ) ) {
858 out.write( protein.getSpecies().getSpeciesId() );
859 out.write( separator );
860 out.write( protein.getProteinId().getId() );
861 out.write( separator );
// Emit each domain id only once, even if the protein carries multiple copies.
863 final Set<String> visited_domain_ids = new HashSet<String>();
864 boolean first = true;
865 for( final Domain domain : protein.getProteinDomains() ) {
866 if ( !visited_domain_ids.contains( domain.getDomainId() ) ) {
867 visited_domain_ids.add( domain.getDomainId() );
874 out.write( domain.getDomainId() );
// getTotalCount(): copies of this domain in the protein; "" + forces String concat.
876 out.write( "" + domain.getTotalCount() );
881 out.write( separator );
// Description/accession are optional; the sentinel "NONE" is treated as absent.
882 if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription()
883 .equals( SurfacingConstants.NONE ) ) ) {
884 out.write( protein.getDescription() );
886 out.write( separator );
887 if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession()
888 .equals( SurfacingConstants.NONE ) ) ) {
889 out.write( protein.getAccession() );
891 out.write( SurfacingConstants.NL );
// For every species, writes one line per protein that carries the given single domain id
// (optionally filtered by per-domain E-value cutoff; a negative cutoff disables the
// filter): species id, protein id, domain id, the from-to coordinates of each passing
// copy, then all passing domains of the protein sorted by position with id:from-to:E-value,
// and finally description/accession when present. Gaps between consecutive copies are
// printed to stdout.
// NOTE(review): the Writer parameter (elided line 900) and the declaration of 'prev_to'
// (elided around line 918) are not visible in this excerpt — confirm against full source.
898 public static void extractProteinNames( final SortedMap<Species, List<Protein>> protein_lists_per_species,
899 final String domain_id,
901 final String separator,
902 final String limit_to_species,
903 final double domain_e_cutoff ) throws IOException {
904 System.out.println( "Per domain E-value: " + domain_e_cutoff );
905 for( final Species species : protein_lists_per_species.keySet() ) {
906 System.out.println( species + ":" );
907 for( final Protein protein : protein_lists_per_species.get( species ) ) {
908 if ( ForesterUtil.isEmpty( limit_to_species )
909 || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) {
910 final List<Domain> domains = protein.getProteinDomains( domain_id );
911 if ( domains.size() > 0 ) {
912 out.write( protein.getSpecies().getSpeciesId() );
913 out.write( separator );
914 out.write( protein.getProteinId().getId() );
915 out.write( separator );
916 out.write( domain_id.toString() );
917 out.write( separator );
// First pass: positions of each copy of the queried domain passing the cutoff.
919 for( final Domain domain : domains ) {
920 if ( ( domain_e_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= domain_e_cutoff ) ) {
922 out.write( domain.getFrom() + "-" + domain.getTo() );
// prev_to >= 0 means a previous copy was seen; print the inter-copy gap length.
923 if ( prev_to >= 0 ) {
924 final int l = domain.getFrom() - prev_to;
925 System.out.println( l );
927 prev_to = domain.getTo();
931 out.write( separator );
// Second pass: collect ALL domains of the protein passing the cutoff...
932 final List<Domain> domain_list = new ArrayList<Domain>();
933 for( final Domain domain : protein.getProteinDomains() ) {
934 if ( ( domain_e_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= domain_e_cutoff ) ) {
935 domain_list.add( domain );
938 final Domain domain_ary[] = new Domain[ domain_list.size() ];
939 for( int i = 0; i < domain_list.size(); ++i ) {
940 domain_ary[ i ] = domain_list.get( i );
// ...and emit them in positional order (DomainComparator( true )).
942 Arrays.sort( domain_ary, new DomainComparator( true ) );
944 boolean first = true;
945 for( final Domain domain : domain_ary ) {
952 out.write( domain.getDomainId().toString() );
953 out.write( ":" + domain.getFrom() + "-" + domain.getTo() );
954 out.write( ":" + domain.getPerDomainEvalue() );
// Description/accession are optional; the sentinel "NONE" is treated as absent.
957 if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription()
958 .equals( SurfacingConstants.NONE ) ) ) {
959 out.write( protein.getDescription() );
961 out.write( separator );
962 if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession()
963 .equals( SurfacingConstants.NONE ) ) ) {
964 out.write( protein.getAccession() );
966 out.write( SurfacingConstants.NL );
// Collects the union of all domain ids across the given genome-wide combinable-domain
// sets, returned sorted (TreeSet natural String order).
974 public static SortedSet<String> getAllDomainIds( final List<GenomeWideCombinableDomains> gwcd_list ) {
975 final SortedSet<String> all_domains_ids = new TreeSet<String>();
976 for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
977 final Set<String> all_domains = gwcd.getAllDomainIds();
978 // for( final Domain domain : all_domains ) {
979 all_domains_ids.addAll( all_domains );
982 return all_domains_ids;
// Counts occurrences of each domain id over all domains of all given proteins,
// returning a sorted map of domain id -> count.
// NOTE(review): the else-branch (first occurrence -> put(id, 1)) and the return are in
// elided lines 993-999 of the original — confirm against full source.
985 public static SortedMap<String, Integer> getDomainCounts( final List<Protein> protein_domain_collections ) {
986 final SortedMap<String, Integer> map = new TreeMap<String, Integer>();
987 for( final Protein protein_domain_collection : protein_domain_collections ) {
988 for( final Object name : protein_domain_collection.getProteinDomains() ) {
// Raw-typed iteration; each element is expected to be a BasicDomain.
989 final BasicDomain protein_domain = ( BasicDomain ) name;
990 final String id = protein_domain.getDomainId();
991 if ( map.containsKey( id ) ) {
992 map.put( id, map.get( id ) + 1 );
// Counts nodes in the phylogeny that have no usable label: empty node name AND no
// taxonomy scientific name AND no taxonomy common name. For each such node the parent's
// name (when a parent exists) is appended to 'names' for diagnostics, and the node's
// external descendants are printed to stdout.
// NOTE(review): the counter declaration and return statement are in elided lines —
// confirm against full source.
1002 public static int getNumberOfNodesLackingName( final Phylogeny p, final StringBuilder names ) {
1003 final PhylogenyNodeIterator it = p.iteratorPostorder();
1005 while ( it.hasNext() ) {
1006 final PhylogenyNode n = it.next();
1007 if ( ForesterUtil.isEmpty( n.getName() )
1008 && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy()
1009 .getScientificName() ) )
1010 && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy()
1011 .getCommonName() ) ) ) {
1012 if ( n.getParent() != null ) {
1013 names.append( " " );
1014 names.append( n.getParent().getName() );
// Raw List of external descendant nodes; printed once per element (prints the
// whole list each iteration — NOTE(review): possibly intended to print 'object').
1016 final List l = n.getAllExternalDescendants();
1017 for( final Object object : l ) {
1018 System.out.println( l.toString() );
1027 * Returns true if Domain domain falls in an uninterrupted stretch of
1028 * covered positions.
1031 * @param covered_positions
// Engulfed = every position from..to (inclusive) is already marked covered; any
// position beyond the covered list or marked false means "not engulfed".
1034 public static boolean isEngulfed( final Domain domain, final List<Boolean> covered_positions ) {
1035 for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) {
1036 if ( ( i >= covered_positions.size() ) || ( covered_positions.get( i ) != true ) ) {
// Writes two files: (1) da_counts_outfile with each domain architecture whose count is
// >= min_count plus its count (tab-separated), and (2) unique_da_outfile listing, for
// each such architecture, every genome that contains it. Fatal-errors via ForesterUtil
// on I/O failure.
1043 public static void performDomainArchitectureAnalysis( final SortedMap<String, Set<String>> domain_architecutures,
1044 final SortedMap<String, Integer> domain_architecuture_counts,
1045 final int min_count,
1046 final File da_counts_outfile,
1047 final File unique_da_outfile ) {
1048 checkForOutputFileWriteability( da_counts_outfile );
1049 checkForOutputFileWriteability( unique_da_outfile );
1051 final BufferedWriter da_counts_out = new BufferedWriter( new FileWriter( da_counts_outfile ) );
1052 final BufferedWriter unique_da_out = new BufferedWriter( new FileWriter( unique_da_outfile ) );
1053 final Iterator<Entry<String, Integer>> it = domain_architecuture_counts.entrySet().iterator();
1054 while ( it.hasNext() ) {
1055 final Map.Entry<String, Integer> e = it.next();
1056 final String da = e.getKey();
1057 final int count = e.getValue();
1058 if ( count >= min_count ) {
1059 da_counts_out.write( da );
1060 da_counts_out.write( "\t" );
1061 da_counts_out.write( String.valueOf( count ) );
1062 da_counts_out.write( ForesterUtil.LINE_SEPARATOR );
// For each qualifying architecture, scan all genomes and record which contain it.
// NOTE(review): nested full scan makes this O(#architectures * #genomes).
1065 final Iterator<Entry<String, Set<String>>> it2 = domain_architecutures.entrySet().iterator();
1066 while ( it2.hasNext() ) {
1067 final Map.Entry<String, Set<String>> e2 = it2.next();
1068 final String genome = e2.getKey();
1069 final Set<String> das = e2.getValue();
1070 if ( das.contains( da ) ) {
1071 unique_da_out.write( genome );
1072 unique_da_out.write( "\t" );
1073 unique_da_out.write( da );
1074 unique_da_out.write( ForesterUtil.LINE_SEPARATOR );
1079 unique_da_out.close();
1080 da_counts_out.close();
1082 catch ( final IOException e ) {
1083 ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
// NOTE(review): message text "Wrote distance matrices" looks copy-pasted — these files
// contain DA counts / unique DAs, not distance matrices. Consider correcting the string.
1085 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + da_counts_outfile + "\"" );
1086 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + unique_da_outfile + "\"" );
// Decorates the phylogeny with the parsimony results and stamps it with a description
// (method, date, cost, gains/losses/unchanged counts, parameters), a parsimony-cost
// confidence value, and fixed rooting (rooted, not re-rootable).
// NOTE(review): a parameter at elided line 1094 (likely the tree name) is not visible
// in this excerpt.
1090 public static void preparePhylogeny( final Phylogeny p,
1091 final DomainParsimonyCalculator domain_parsimony,
1092 final String date_time,
1093 final String method,
1095 final String parameters_str ) {
1096 domain_parsimony.decoratePhylogenyWithDomains( p );
1097 final StringBuilder desc = new StringBuilder();
1098 desc.append( "[Method: " + method + "] [Date: " + date_time + "] " );
1099 desc.append( "[Cost: " + domain_parsimony.getCost() + "] " );
1100 desc.append( "[Gains: " + domain_parsimony.getTotalGains() + "] " );
1101 desc.append( "[Losses: " + domain_parsimony.getTotalLosses() + "] " );
1102 desc.append( "[Unchanged: " + domain_parsimony.getTotalUnchanged() + "] " );
1103 desc.append( "[Parameters: " + parameters_str + "]" );
1105 p.setDescription( desc.toString() );
// Use the parsimony cost as the tree-level confidence value.
1106 p.setConfidence( new Confidence( domain_parsimony.getCost(), "parsimony" ) );
1107 p.setRerootable( false );
1108 p.setRooted( true );
// Output row format (separator-delimited):
1112 * species | protein id | n-terminal domain | c-terminal domain | n-terminal domain per domain E-value | c-terminal domain per domain E-value
// Builds one row per distinct ordered (N-terminal, C-terminal) domain pair of the
// protein; proteins with a single domain or no domains still produce one row with the
// corresponding columns left empty. Duplicate pairs are emitted only once.
1116 static public StringBuffer proteinToDomainCombinations( final Protein protein,
1117 final String protein_id,
1118 final String separator ) {
1119 final StringBuffer sb = new StringBuffer();
1120 if ( protein.getSpecies() == null ) {
1121 throw new IllegalArgumentException( "species must not be null" );
1123 if ( ForesterUtil.isEmpty( protein.getSpecies().getSpeciesId() ) ) {
1124 throw new IllegalArgumentException( "species id must not be empty" );
1126 final List<Domain> domains = protein.getProteinDomains();
1127 if ( domains.size() > 1 ) {
// Pre-count copies of each domain id (written in the last two columns of each row).
1128 final Map<String, Integer> counts = new HashMap<String, Integer>();
1129 for( final Domain domain : domains ) {
1130 final String id = domain.getDomainId();
1131 if ( counts.containsKey( id ) ) {
1132 counts.put( id, counts.get( id ) + 1 );
1135 counts.put( id, 1 );
// 'dcs' de-duplicates combination keys so each N/C pair is emitted once.
1138 final Set<String> dcs = new HashSet<String>();
1139 for( int i = 1; i < domains.size(); ++i ) {
1140 for( int j = 0; j < i; ++j ) {
1141 Domain domain_n = domains.get( i );
1142 Domain domain_c = domains.get( j );
// Order the pair by sequence position: domain_n must start before domain_c.
1143 if ( domain_n.getFrom() > domain_c.getFrom() ) {
1144 domain_n = domains.get( j );
1145 domain_c = domains.get( i );
1147 final String dc = domain_n.getDomainId() + domain_c.getDomainId();
1148 if ( !dcs.contains( dc ) ) {
1150 sb.append( protein.getSpecies() );
1151 sb.append( separator );
1152 sb.append( protein_id );
1153 sb.append( separator );
1154 sb.append( domain_n.getDomainId() );
1155 sb.append( separator );
1156 sb.append( domain_c.getDomainId() );
1157 sb.append( separator );
1158 sb.append( domain_n.getPerDomainEvalue() );
1159 sb.append( separator );
1160 sb.append( domain_c.getPerDomainEvalue() );
1161 sb.append( separator );
1162 sb.append( counts.get( domain_n.getDomainId() ) );
1163 sb.append( separator );
1164 sb.append( counts.get( domain_c.getDomainId() ) );
1165 sb.append( ForesterUtil.LINE_SEPARATOR );
// Single-domain protein: fill the N-terminal columns only, leave C-terminal ones empty.
1170 else if ( domains.size() == 1 ) {
1171 sb.append( protein.getSpecies() );
1172 sb.append( separator );
1173 sb.append( protein_id );
1174 sb.append( separator );
1175 sb.append( domains.get( 0 ).getDomainId() );
1176 sb.append( separator );
1177 sb.append( separator );
1178 sb.append( domains.get( 0 ).getPerDomainEvalue() );
1179 sb.append( separator );
1180 sb.append( separator );
1182 sb.append( separator );
1183 sb.append( ForesterUtil.LINE_SEPARATOR );
// No domains at all: emit a row with every domain column empty.
1186 sb.append( protein.getSpecies() );
1187 sb.append( separator );
1188 sb.append( protein_id );
1189 sb.append( separator );
1190 sb.append( separator );
1191 sb.append( separator );
1192 sb.append( separator );
1193 sb.append( separator );
1194 sb.append( separator );
1195 sb.append( ForesterUtil.LINE_SEPARATOR );
1202 * Example regarding engulfment: ------------0.1 ----------0.2 --0.3 =>
1203 * domain with 0.3 is ignored
1205 * -----------0.1 ----------0.2 --0.3 => domain with 0.3 is ignored
1208 * ------------0.1 ----------0.3 --0.2 => domains with 0.3 and 0.2 are _not_
1211 * @param max_allowed_overlap
1212 *            maximal allowed overlap (inclusive) to be still considered not
1213 *            overlapping (zero or negative value to allow any overlap)
1214 * @param remove_engulfed_domains
1215 *            to remove domains which are completely engulfed by coverage of
1216 *            domains with better support
// Returns a new BasicProtein containing only the domains that survive the overlap /
// engulfment filter; domains are considered in ascending confidence (E-value) order so
// better-supported domains claim positions first.
1220 public static Protein removeOverlappingDomains( final int max_allowed_overlap,
1221 final boolean remove_engulfed_domains,
1222 final Protein protein ) {
1223 final Protein pruned_protein = new BasicProtein( protein.getProteinId().getId(), protein.getSpecies()
1224 .getSpeciesId(), protein.getLength() );
1225 final List<Domain> sorted = SurfacingUtil.sortDomainsWithAscendingConfidenceValues( protein );
// covered_positions[i] == true means sequence position i is claimed by an accepted domain.
1226 final List<Boolean> covered_positions = new ArrayList<Boolean>();
1227 for( final Domain domain : sorted ) {
1228 if ( ( ( max_allowed_overlap < 0 ) || ( SurfacingUtil.calculateOverlap( domain, covered_positions ) <= max_allowed_overlap ) )
1229 && ( !remove_engulfed_domains || !isEngulfed( domain, covered_positions ) ) ) {
// Accepted: extend the coverage list up to the domain start with 'false'...
1230 final int covered_positions_size = covered_positions.size();
1231 for( int i = covered_positions_size; i < domain.getFrom(); ++i ) {
1232 covered_positions.add( false );
// ...then mark (or append) 'true' for every position the domain spans.
1234 final int new_covered_positions_size = covered_positions.size();
1235 for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) {
1236 if ( i < new_covered_positions_size ) {
1237 covered_positions.set( i, true );
1240 covered_positions.add( true );
1243 pruned_protein.addProteinDomain( domain );
1246 return pruned_protein;
// Returns the protein's domains in a new list sorted by ascending confidence value
// (ASCENDING_CONFIDENCE_VALUE_ORDER); the protein itself is not modified.
// NOTE(review): the add-to-list and return statements are in elided lines — confirm
// against full source.
1249 public static List<Domain> sortDomainsWithAscendingConfidenceValues( final Protein protein ) {
1250 final List<Domain> domains = new ArrayList<Domain>();
1251 for( final Domain d : protein.getProteinDomains() ) {
1254 Collections.sort( domains, SurfacingUtil.ASCENDING_CONFIDENCE_VALUE_ORDER );
// Records the genome's set of distinct domain-architecture strings (format "~"-joined,
// per BasicProtein.toDomainArchitectureString("~", 3, "=")) into domain_architecutures,
// and maintains global per-architecture counts in distinct_domain_architecuture_counts.
// Returns an int (per signature) — the return statement is in elided lines.
1258 public static int storeDomainArchitectures( final String genome,
1259 final SortedMap<String, Set<String>> domain_architecutures,
1260 final List<Protein> protein_list,
1261 final Map<String, Integer> distinct_domain_architecuture_counts ) {
1262 final Set<String> da = new HashSet<String>();
1263 domain_architecutures.put( genome, da );
1264 for( final Protein protein : protein_list ) {
1265 final String da_str = ( ( BasicProtein ) protein ).toDomainArchitectureString( "~", 3, "=" );
// Count each architecture only once per genome (first sighting within this genome).
1266 if ( !da.contains( da_str ) ) {
1267 if ( !distinct_domain_architecuture_counts.containsKey( da_str ) ) {
1268 distinct_domain_architecuture_counts.put( da_str, 1 );
1271 distinct_domain_architecuture_counts.put( da_str,
1272 distinct_domain_architecuture_counts.get( da_str ) + 1 );
// For every internal node of the phylogeny, writes a file (named after the node plus
// suffix_for_filename, under a per-subtree base directory) listing all domains gained
// (get_gains == true) or lost (get_gains == false) anywhere in that node's subtree.
// Nodes whose subtree has no such changes produce no file.
1280 public static void writeAllDomainsChangedOnAllSubtrees( final Phylogeny p,
1281 final boolean get_gains,
1282 final String outdir,
1283 final String suffix_for_filename ) throws IOException {
1284 CharacterStateMatrix.GainLossStates state = CharacterStateMatrix.GainLossStates.GAIN;
1286 state = CharacterStateMatrix.GainLossStates.LOSS;
1288 final File base_dir = createBaseDirForPerNodeDomainFiles( surfacing.BASE_DIRECTORY_PER_SUBTREE_DOMAIN_GAIN_LOSS_FILES,
1292 for( final PhylogenyNodeIterator it = p.iteratorPostorder(); it.hasNext(); ) {
1293 final PhylogenyNode node = it.next();
1294 if ( !node.isExternal() ) {
1295 final SortedSet<String> domains = collectAllDomainsChangedOnSubtree( node, get_gains );
1296 if ( domains.size() > 0 ) {
1297 final Writer writer = ForesterUtil.createBufferedWriter( base_dir + ForesterUtil.FILE_SEPARATOR
1298 + node.getName() + suffix_for_filename );
1299 for( final String domain : domains ) {
1300 writer.write( domain );
1301 writer.write( ForesterUtil.LINE_SEPARATOR );
// Writes all binary domain combinations of one genome as DOT graph statements to
// <input-name><suffix>, placed in output_dir when given. 'i' (parameter at an elided
// line, presumably an index into input_file_properties) selects the genome.
1309 public static void writeBinaryDomainCombinationsFileForGraphAnalysis( final String[][] input_file_properties,
1310 final File output_dir,
1311 final GenomeWideCombinableDomains gwcd,
1313 final GenomeWideCombinableDomainsSortOrder dc_sort_order ) {
1314 File dc_outfile_dot = new File( input_file_properties[ i ][ 1 ]
1315 + surfacing.DOMAIN_COMBINITONS_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS );
1316 if ( output_dir != null ) {
1317 dc_outfile_dot = new File( output_dir + ForesterUtil.FILE_SEPARATOR + dc_outfile_dot );
1319 checkForOutputFileWriteability( dc_outfile_dot );
1320 final SortedSet<BinaryDomainCombination> binary_combinations = createSetOfAllBinaryDomainCombinationsPerGenome( gwcd );
1322 final BufferedWriter out_dot = new BufferedWriter( new FileWriter( dc_outfile_dot ) );
// One DOT-language edge description per binary combination.
1323 for( final BinaryDomainCombination bdc : binary_combinations ) {
1324 out_dot.write( bdc.toGraphDescribingLanguage( BinaryDomainCombination.OutputFormat.DOT, null, null )
1326 out_dot.write( SurfacingConstants.NL );
1330 catch ( final IOException e ) {
1331 ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1333 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote binary domain combination for \""
1334 + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", "
1335 + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile_dot + "\"" );
// Writes, per identifier (sorted), the characters of the gain-loss matrix that match
// 'state' (or, when state == null, that are GAIN or UNCHANGED_PRESENT — i.e. present).
// Optional per-character descriptions are appended when available. Fatal-errors on I/O
// failure.
1338 public static void writeBinaryStatesMatrixAsListToFile( final CharacterStateMatrix<CharacterStateMatrix.GainLossStates> matrix,
1339 final CharacterStateMatrix.GainLossStates state,
1340 final String filename,
1341 final String indentifier_characters_separator,
1342 final String character_separator,
1343 final Map<String, String> descriptions ) {
1344 final File outfile = new File( filename );
1345 checkForOutputFileWriteability( outfile );
1346 final SortedSet<String> sorted_ids = new TreeSet<String>();
1347 for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) {
1348 sorted_ids.add( matrix.getIdentifier( i ) );
1351 final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
1352 for( final String id : sorted_ids ) {
1353 out.write( indentifier_characters_separator );
1354 out.write( "#" + id );
1355 out.write( indentifier_characters_separator );
1356 for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) {
1358 // using null to indicate either UNCHANGED_PRESENT or GAIN.
1359 if ( ( matrix.getState( id, c ) == state )
1360 || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) || ( matrix
1361 .getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) ) ) ) {
1362 out.write( matrix.getCharacter( c ) );
1363 if ( ( descriptions != null ) && !descriptions.isEmpty()
1364 && descriptions.containsKey( matrix.getCharacter( c ) ) ) {
1366 out.write( descriptions.get( matrix.getCharacter( c ) ) );
1368 out.write( character_separator );
1375 catch ( final IOException e ) {
1376 ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1378 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters list: \"" + filename + "\"" );
// Variant of writeBinaryStatesMatrixAsListToFile for binary domain combinations: each
// matching character is parsed into a BinaryDomainCombination and written in the given
// graph-describing output format (e.g. DOT) instead of as raw text.
1381 public static void writeBinaryStatesMatrixAsListToFileForBinaryCombinationsForGraphAnalysis( final CharacterStateMatrix<CharacterStateMatrix.GainLossStates> matrix,
1382 final CharacterStateMatrix.GainLossStates state,
1383 final String filename,
1384 final String indentifier_characters_separator,
1385 final String character_separator,
1386 final BinaryDomainCombination.OutputFormat bc_output_format ) {
1387 final File outfile = new File( filename );
1388 checkForOutputFileWriteability( outfile );
1389 final SortedSet<String> sorted_ids = new TreeSet<String>();
1390 for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) {
1391 sorted_ids.add( matrix.getIdentifier( i ) );
1394 final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
1395 for( final String id : sorted_ids ) {
1396 out.write( indentifier_characters_separator );
1397 out.write( "#" + id );
1398 out.write( indentifier_characters_separator );
1399 for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) {
1401 // using null to indicate either UNCHANGED_PRESENT or GAIN.
1402 if ( ( matrix.getState( id, c ) == state )
1403 || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) || ( matrix
1404 .getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) ) ) ) {
1405 BinaryDomainCombination bdc = null;
1407 bdc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( c ) );
// A malformed combination string is fatal — abort with the parse error.
1409 catch ( final Exception e ) {
1410 ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
1412 out.write( bdc.toGraphDescribingLanguage( bc_output_format, null, null ).toString() );
1413 out.write( character_separator );
1420 catch ( final IOException e ) {
1421 ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1423 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters list: \"" + filename + "\"" );
// Writes a detailed HTML report of the gain-loss matrix: a linked index of identifiers,
// then per identifier a table of matching characters (Pfam domains or domain
// combinations) with their GO annotations, plus one plain-text gain/loss file per tree
// node (deleted again when empty). Also accumulates every encountered Pfam into
// all_pfams_encountered / pfams_gained_or_lost.
1426 public static void writeBinaryStatesMatrixToList( final Map<String, List<GoId>> domain_id_to_go_ids_map,
1427 final Map<GoId, GoTerm> go_id_to_term_map,
1428 final GoNameSpace go_namespace_limit,
1429 final boolean domain_combinations,
1430 final CharacterStateMatrix<CharacterStateMatrix.GainLossStates> matrix,
1431 final CharacterStateMatrix.GainLossStates state,
1432 final String filename,
1433 final String indentifier_characters_separator,
1434 final String character_separator,
1435 final String title_for_html,
1436 final String prefix_for_html,
1437 final Map<String, Set<String>>[] domain_id_to_secondary_features_maps,
1438 final SortedSet<String> all_pfams_encountered,
1439 final SortedSet<String> pfams_gained_or_lost,
1440 final String suffix_for_per_node_events_file,
1441 final Map<String, Integer> tax_code_to_id_map ) {
// Validate that the GO maps needed for detailed HTML output are usable.
1442 if ( ( go_namespace_limit != null ) && ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) {
1443 throw new IllegalArgumentException( "attempt to use GO namespace limit without a GO-id to term map" );
1445 else if ( ( ( domain_id_to_go_ids_map == null ) || ( domain_id_to_go_ids_map.size() < 1 ) ) ) {
1446 throw new IllegalArgumentException( "attempt to output detailed HTML without a Pfam to GO map" );
1448 else if ( ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) {
1449 throw new IllegalArgumentException( "attempt to output detailed HTML without a GO-id to term map" );
1451 final File outfile = new File( filename );
1452 checkForOutputFileWriteability( outfile );
1453 final SortedSet<String> sorted_ids = new TreeSet<String>();
1454 for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) {
1455 sorted_ids.add( matrix.getIdentifier( i ) );
1458 final Writer out = new BufferedWriter( new FileWriter( outfile ) );
1459 final File per_node_go_mapped_domain_gain_loss_files_base_dir = createBaseDirForPerNodeDomainFiles( surfacing.BASE_DIRECTORY_PER_NODE_DOMAIN_GAIN_LOSS_FILES,
1460 domain_combinations,
1463 Writer per_node_go_mapped_domain_gain_loss_outfile_writer = null;
1464 File per_node_go_mapped_domain_gain_loss_outfile = null;
1465 int per_node_counter = 0;
// HTML preamble and page title.
1466 out.write( "<html>" );
1467 out.write( SurfacingConstants.NL );
1468 addHtmlHead( out, title_for_html );
1469 out.write( SurfacingConstants.NL );
1470 out.write( "<body>" );
1471 out.write( SurfacingConstants.NL );
1472 out.write( "<h1>" );
1473 out.write( SurfacingConstants.NL );
1474 out.write( title_for_html );
1475 out.write( SurfacingConstants.NL );
1476 out.write( "</h1>" );
1477 out.write( SurfacingConstants.NL );
1478 out.write( "<table>" );
1479 out.write( SurfacingConstants.NL );
// Index table: one anchor link per identifier (SP-style taxonomy ids are treated
// specially — the matched branch is in elided lines).
1480 for( final String id : sorted_ids ) {
1481 final Matcher matcher = PATTERN_SP_STYLE_TAXONOMY.matcher( id );
1482 if ( matcher.matches() ) {
1485 out.write( "<tr>" );
1486 out.write( "<td>" );
1487 out.write( "<a href=\"#" + id + "\">" + id + "</a>" );
1488 out.write( "</td>" );
1489 out.write( "</tr>" );
1490 out.write( SurfacingConstants.NL );
1492 out.write( "</table>" );
1493 out.write( SurfacingConstants.NL );
// Main loop: one section (anchor, header, table of characters) per identifier.
1494 for( final String id : sorted_ids ) {
1495 final Matcher matcher = PATTERN_SP_STYLE_TAXONOMY.matcher( id );
1496 if ( matcher.matches() ) {
1499 out.write( SurfacingConstants.NL );
1500 out.write( "<h2>" );
1501 out.write( "<a name=\"" + id + "\">" + id + "</a>" );
1502 writeTaxonomyLinks( out, id, tax_code_to_id_map );
1503 out.write( "</h2>" );
1504 out.write( SurfacingConstants.NL );
1505 out.write( "<table>" );
1506 out.write( SurfacingConstants.NL );
1507 out.write( "<tr>" );
1508 out.write( "<td><b>" );
1509 out.write( "Pfam domain(s)" );
1510 out.write( "</b></td><td><b>" );
1511 out.write( "GO term acc" );
1512 out.write( "</b></td><td><b>" );
1513 out.write( "GO term" );
1514 out.write( "</b></td><td><b>" );
1515 out.write( "GO namespace" );
1516 out.write( "</b></td>" );
1517 out.write( "</tr>" );
1518 out.write( SurfacingConstants.NL );
1519 out.write( "</tr>" );
1520 out.write( SurfacingConstants.NL );
1521 per_node_counter = 0;
// Open a per-node plain-text gain/loss file for this identifier (if any characters).
1522 if ( matrix.getNumberOfCharacters() > 0 ) {
1523 per_node_go_mapped_domain_gain_loss_outfile = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
1524 + ForesterUtil.FILE_SEPARATOR + id + suffix_for_per_node_events_file );
1525 SurfacingUtil.checkForOutputFileWriteability( per_node_go_mapped_domain_gain_loss_outfile );
1526 per_node_go_mapped_domain_gain_loss_outfile_writer = ForesterUtil
1527 .createBufferedWriter( per_node_go_mapped_domain_gain_loss_outfile );
1530 per_node_go_mapped_domain_gain_loss_outfile = null;
1531 per_node_go_mapped_domain_gain_loss_outfile_writer = null;
1533 for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) {
1535 // using null to indicate either UNCHANGED_PRESENT or GAIN.
1536 if ( ( matrix.getState( id, c ) == state )
1537 || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) || ( matrix
1538 .getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) ) ) ) {
1539 final String character = matrix.getCharacter( c );
1540 String domain_0 = "";
1541 String domain_1 = "";
// A character containing the combination separator is a two-domain combination.
1542 if ( character.indexOf( BinaryDomainCombination.SEPARATOR ) > 0 ) {
1543 final String[] s = character.split( BinaryDomainCombination.SEPARATOR );
1544 if ( s.length != 2 ) {
1545 throw new AssertionError( "this should not have happened: unexpected format for domain combination: ["
1546 + character + "]" );
1552 domain_0 = character;
// Emit the GO-annotated HTML row(s) for this character (some args elided).
1554 writeDomainData( domain_id_to_go_ids_map,
1561 character_separator,
1562 domain_id_to_secondary_features_maps,
1564 all_pfams_encountered.add( domain_0 );
1565 if ( pfams_gained_or_lost != null ) {
1566 pfams_gained_or_lost.add( domain_0 );
1568 if ( !ForesterUtil.isEmpty( domain_1 ) ) {
1569 all_pfams_encountered.add( domain_1 );
1570 if ( pfams_gained_or_lost != null ) {
1571 pfams_gained_or_lost.add( domain_1 );
1574 if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) {
1575 writeDomainsToIndividualFilePerTreeNode( per_node_go_mapped_domain_gain_loss_outfile_writer,
// Close the per-node file; delete it again if nothing was written for this node.
1582 if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) {
1583 per_node_go_mapped_domain_gain_loss_outfile_writer.close();
1584 if ( per_node_counter < 1 ) {
1585 per_node_go_mapped_domain_gain_loss_outfile.delete();
1587 per_node_counter = 0;
1589 out.write( "</table>" );
1590 out.write( SurfacingConstants.NL );
1591 out.write( "<hr>" );
1592 out.write( SurfacingConstants.NL );
1593 } // for( final String id : sorted_ids ) {
1594 out.write( "</body>" );
1595 out.write( SurfacingConstants.NL );
1596 out.write( "</html>" );
1597 out.write( SurfacingConstants.NL );
1601 catch ( final IOException e ) {
1602 ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1604 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters detailed HTML list: \"" + filename + "\"" );
// Writes one genome's domain combination counts (as produced by gwcd.toStringBuilder)
// to <input-name><suffix> (in output_dir when given), and appends one tab-separated row
// of per-genome domain promiscuity statistics (mean, sd or "n/a" when N < 2, median,
// min, max, N, most promiscuous domain(s)) to the shared statistics writer.
// NOTE(review): the genome index parameter 'i' is at an elided line (1611).
1607 public static void writeDomainCombinationsCountsFile( final String[][] input_file_properties,
1608 final File output_dir,
1609 final Writer per_genome_domain_promiscuity_statistics_writer,
1610 final GenomeWideCombinableDomains gwcd,
1612 final GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder dc_sort_order ) {
1613 File dc_outfile = new File( input_file_properties[ i ][ 1 ]
1614 + surfacing.DOMAIN_COMBINITON_COUNTS_OUTPUTFILE_SUFFIX );
1615 if ( output_dir != null ) {
1616 dc_outfile = new File( output_dir + ForesterUtil.FILE_SEPARATOR + dc_outfile );
1618 checkForOutputFileWriteability( dc_outfile );
1620 final BufferedWriter out = new BufferedWriter( new FileWriter( dc_outfile ) );
1621 out.write( gwcd.toStringBuilder( dc_sort_order ).toString() );
1624 catch ( final IOException e ) {
1625 ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1627 final DescriptiveStatistics stats = gwcd.getPerGenomeDomainPromiscuityStatistics();
1629 per_genome_domain_promiscuity_statistics_writer.write( input_file_properties[ i ][ 1 ] + "\t" );
1630 per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats.arithmeticMean() ) + "\t" );
// Sample standard deviation is undefined for fewer than two observations.
1631 if ( stats.getN() < 2 ) {
1632 per_genome_domain_promiscuity_statistics_writer.write( "n/a" + "\t" );
1635 per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats
1636 .sampleStandardDeviation() ) + "\t" );
1638 per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats.median() ) + "\t" );
1639 per_genome_domain_promiscuity_statistics_writer.write( ( int ) stats.getMin() + "\t" );
1640 per_genome_domain_promiscuity_statistics_writer.write( ( int ) stats.getMax() + "\t" );
1641 per_genome_domain_promiscuity_statistics_writer.write( stats.getN() + "\t" );
1642 final SortedSet<String> mpds = gwcd.getMostPromiscuosDomain();
1643 for( final String mpd : mpds ) {
1644 per_genome_domain_promiscuity_statistics_writer.write( mpd + " " );
1646 per_genome_domain_promiscuity_statistics_writer.write( ForesterUtil.LINE_SEPARATOR );
1648 catch ( final IOException e ) {
1649 ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
// Message format depends on whether a third input-file property (e.g. a label) exists.
1651 if ( input_file_properties[ i ].length == 3 ) {
1652 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \""
1653 + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", "
1654 + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile + "\"" );
1657 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \""
1658 + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ") to: \""
1659 + dc_outfile + "\"" );
// Writes the given sorted domain similarities either to a single writer or to a set of
// per-initial-character "split" writers (keyed by the first character of the domain id,
// with '0' as the fallback bucket). Depending on print_option this emits either
// simple tab-delimited text or a full HTML report (head, descriptive-statistics table,
// optional ASCII histogram, per-domain anchor index, then one row per similarity).
// Returns the DescriptiveStatistics accumulated over the sort_field values of all
// similarities, so callers can reuse the summary numbers.
// NOTE(review): this listing is an extract with lines elided (several case labels,
// breaks, else branches and a try block are not visible here) — the comments below
// describe only what the visible lines establish.
1663 public static DescriptiveStatistics writeDomainSimilaritiesToFile( final StringBuilder html_desc,
1664 final StringBuilder html_title,
1665 final Writer single_writer,
1666 Map<Character, Writer> split_writers,
1667 final SortedSet<DomainSimilarity> similarities,
1668 final boolean treat_as_binary,
1669 final List<Species> species_order,
1670 final PrintableDomainSimilarity.PRINT_OPTION print_option,
1671 final DomainSimilarity.DomainSimilaritySortField sort_field,
1672 final DomainSimilarity.DomainSimilarityScoring scoring,
1673 final boolean verbose,
1674 final Map<String, Integer> tax_code_to_id_map )
1675 throws IOException {
1676 final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
1677 String histogram_title = null;
// Pick a human-readable histogram title matching the field the similarities are sorted by.
1678 switch ( sort_field ) {
1679 case ABS_MAX_COUNTS_DIFFERENCE:
1680 if ( treat_as_binary ) {
1681 histogram_title = "absolute counts difference:";
1684 histogram_title = "absolute (maximal) counts difference:";
1687 case MAX_COUNTS_DIFFERENCE:
1688 if ( treat_as_binary ) {
1689 histogram_title = "counts difference:";
1692 histogram_title = "(maximal) counts difference:";
1696 histogram_title = "score mean:";
1699 histogram_title = "score minimum:";
1702 histogram_title = "score maximum:";
1704 case MAX_DIFFERENCE:
1705 if ( treat_as_binary ) {
1706 histogram_title = "difference:";
1709 histogram_title = "(maximal) difference:";
1713 histogram_title = "score mean:";
1716 histogram_title = "score standard deviation:";
1719 histogram_title = "species number:";
1722 throw new AssertionError( "Unknown sort field: " + sort_field );
// Accumulate the per-similarity value of the chosen sort field into stats.
1724 for( final DomainSimilarity similarity : similarities ) {
1725 switch ( sort_field ) {
1726 case ABS_MAX_COUNTS_DIFFERENCE:
1727 stats.addValue( Math.abs( similarity.getMaximalDifferenceInCounts() ) );
1729 case MAX_COUNTS_DIFFERENCE:
1730 stats.addValue( similarity.getMaximalDifferenceInCounts() );
1733 stats.addValue( similarity.getMeanSimilarityScore() );
1736 stats.addValue( similarity.getMinimalSimilarityScore() );
1739 stats.addValue( similarity.getMaximalSimilarityScore() );
1741 case MAX_DIFFERENCE:
1742 stats.addValue( similarity.getMaximalDifference() );
1745 stats.addValue( similarity.getMeanSimilarityScore() );
1748 stats.addValue( similarity.getStandardDeviationOfSimilarityScore() );
1751 stats.addValue( similarity.getSpecies().size() );
1754 throw new AssertionError( "Unknown sort field: " + sort_field );
// Build an ASCII histogram only when the values actually spread (min < max);
// histogram construction failures are tolerated (caught below) and leave histo null.
1757 AsciiHistogram histo = null;
1759 if ( stats.getMin() < stats.getMax() ) {
1760 histo = new AsciiHistogram( stats, histogram_title );
1763 catch ( Exception e ) {
// If only a single writer was supplied, wrap it as the sole "split" writer under key '_'
// so the rest of the method can treat both modes uniformly.
1766 if ( ( single_writer != null ) && ( ( split_writers == null ) || split_writers.isEmpty() ) ) {
1767 split_writers = new HashMap<Character, Writer>();
1768 split_writers.put( '_', single_writer );
1770 switch ( print_option ) {
1771 case SIMPLE_TAB_DELIMITED:
// HTML mode: emit head, description, optional histogram, and a summary statistics table
// to every split writer.
1774 for( final Character key : split_writers.keySet() ) {
1775 final Writer w = split_writers.get( key );
1776 w.write( "<html>" );
1777 w.write( SurfacingConstants.NL );
1779 addHtmlHead( w, "DCs (" + html_title + ") " + key.toString().toUpperCase() );
1782 addHtmlHead( w, "DCs (" + html_title + ")" );
1784 w.write( SurfacingConstants.NL );
1785 w.write( "<body>" );
1786 w.write( SurfacingConstants.NL );
1787 w.write( html_desc.toString() );
1788 w.write( SurfacingConstants.NL );
1791 w.write( SurfacingConstants.NL );
1792 w.write( "<tt><pre>" );
1793 w.write( SurfacingConstants.NL );
1794 if ( histo != null ) {
1795 w.write( histo.toStringBuffer( 20, '|', 40, 5 ).toString() );
1796 w.write( SurfacingConstants.NL );
1798 w.write( "</pre></tt>" );
1799 w.write( SurfacingConstants.NL );
1800 w.write( "<table>" );
1801 w.write( SurfacingConstants.NL );
1802 w.write( "<tr><td>N: </td><td>" + stats.getN() + "</td></tr>" );
1803 w.write( SurfacingConstants.NL );
1804 w.write( "<tr><td>Min: </td><td>" + stats.getMin() + "</td></tr>" );
1805 w.write( SurfacingConstants.NL );
1806 w.write( "<tr><td>Max: </td><td>" + stats.getMax() + "</td></tr>" );
1807 w.write( SurfacingConstants.NL );
1808 w.write( "<tr><td>Mean: </td><td>" + stats.arithmeticMean() + "</td></tr>" );
1809 w.write( SurfacingConstants.NL );
// Sample standard deviation needs at least two observations.
1810 if ( stats.getN() > 1 ) {
1811 w.write( "<tr><td>SD: </td><td>" + stats.sampleStandardDeviation() + "</td></tr>" );
1814 w.write( "<tr><td>SD: </td><td>n/a</td></tr>" );
1816 w.write( SurfacingConstants.NL );
1817 w.write( "<tr><td>Median: </td><td>" + stats.median() + "</td></tr>" );
1818 w.write( SurfacingConstants.NL );
1819 w.write( "</table>" );
1820 w.write( SurfacingConstants.NL );
1822 w.write( SurfacingConstants.NL );
1824 w.write( SurfacingConstants.NL );
1826 w.write( SurfacingConstants.NL );
1827 w.write( "<table>" );
1828 w.write( SurfacingConstants.NL );
1832 for( final Writer w : split_writers.values() ) {
1833 w.write( SurfacingConstants.NL );
// First pass over the similarities: write an anchor-link index entry per domain id.
// In split mode the target writer is chosen by the lowercased first character of the
// domain id, with bucket '0' as fallback for ids without a dedicated writer.
1836 for( final DomainSimilarity similarity : similarities ) {
1837 if ( ( species_order != null ) && !species_order.isEmpty() ) {
1838 ( ( PrintableDomainSimilarity ) similarity ).setSpeciesOrder( species_order );
1840 if ( single_writer != null ) {
1841 single_writer.write( "<a href=\"#" + similarity.getDomainId() + "\">" + similarity.getDomainId()
1843 single_writer.write( SurfacingConstants.NL );
1846 Writer local_writer = split_writers.get( ( similarity.getDomainId().charAt( 0 ) + "" ).toLowerCase()
1848 if ( local_writer == null ) {
1849 local_writer = split_writers.get( '0' );
1851 local_writer.write( "<a href=\"#" + similarity.getDomainId() + "\">" + similarity.getDomainId()
1853 local_writer.write( SurfacingConstants.NL );
1856 // w.write( "<hr>" );
1857 // w.write( SurfacingConstants.NL );
// Second pass: write the actual formatted similarity rows, routed the same way.
1859 for( final DomainSimilarity similarity : similarities ) {
1860 if ( ( species_order != null ) && !species_order.isEmpty() ) {
1861 ( ( PrintableDomainSimilarity ) similarity ).setSpeciesOrder( species_order );
1863 if ( single_writer != null ) {
1864 single_writer.write( similarity.toStringBuffer( print_option, tax_code_to_id_map ).toString() );
1865 single_writer.write( SurfacingConstants.NL );
1868 Writer local_writer = split_writers.get( ( similarity.getDomainId().charAt( 0 ) + "" ).toLowerCase()
1870 if ( local_writer == null ) {
1871 local_writer = split_writers.get( '0' );
1873 local_writer.write( similarity.toStringBuffer( print_option, tax_code_to_id_map ).toString() );
1874 local_writer.write( SurfacingConstants.NL );
// Close the HTML document on every split writer (table/font/body/html closing tags).
1877 switch ( print_option ) {
1879 for( final Writer w : split_writers.values() ) {
1880 w.write( SurfacingConstants.NL );
1881 w.write( "</table>" );
1882 w.write( SurfacingConstants.NL );
1883 w.write( "</font>" );
1884 w.write( SurfacingConstants.NL );
1885 w.write( "</body>" );
1886 w.write( SurfacingConstants.NL );
1887 w.write( "</html>" );
1888 w.write( SurfacingConstants.NL );
1892 for( final Writer w : split_writers.values() ) {
// Writes a character-state matrix to the named file in the given format.
// Verifies writeability first; any IOException is fatal (program exits via
// ForesterUtil.fatalError), otherwise a success message is printed.
1898 public static void writeMatrixToFile( final CharacterStateMatrix<?> matrix,
1899 final String filename,
1900 final Format format ) {
1901 final File outfile = new File( filename );
1902 checkForOutputFileWriteability( outfile );
1904 final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
// Delegates the actual serialization to the matrix itself.
1905 matrix.toWriter( out, format );
1909 catch ( final IOException e ) {
1910 ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1912 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote matrix: \"" + filename + "\"" );
// Writes a list of distance matrices, concatenated in PHYLIP format and separated by
// line separators, into a single output file. IOException is fatal; a success message
// is printed on completion.
1915 public static void writeMatrixToFile( final File matrix_outfile, final List<DistanceMatrix> matrices ) {
1916 checkForOutputFileWriteability( matrix_outfile );
1918 final BufferedWriter out = new BufferedWriter( new FileWriter( matrix_outfile ) );
1919 for( final DistanceMatrix distance_matrix : matrices ) {
1920 out.write( distance_matrix.toStringBuffer( DistanceMatrix.Format.PHYLIP ).toString() );
1921 out.write( ForesterUtil.LINE_SEPARATOR );
1926 catch ( final IOException e ) {
1927 ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1929 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + matrix_outfile + "\"" );
// Writes a phylogeny to the named file as phyloXML. Unlike the matrix writers above,
// an IOException here is only a warning (not fatal) — the program continues.
1932 public static void writePhylogenyToFile( final Phylogeny phylogeny, final String filename ) {
1933 final PhylogenyWriter writer = new PhylogenyWriter();
1935 writer.toPhyloXML( new File( filename ), phylogeny, 1 );
1937 catch ( final IOException e ) {
1938 ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "failed to write phylogeny to \"" + filename + "\": "
1941 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote phylogeny to \"" + filename + "\"" );
// Writes a bracketed set of HTML taxonomy lookup links (" [...]") for the given species
// code. Skipped for degenerate codes: length must be > 1 and any '_' must not be at
// position 0 (indexOf('_') < 1 allows codes with no underscore, where indexOf is -1,
// or — NOTE(review) — an underscore at index 0 would also pass; presumably the intent
// is to reject multi-part names, verify against callers).
// If a taxonomy-code-to-id mapping is available the UniProt link uses the numeric id;
// EOL, Google Scholar and Google web-search links use the raw species string.
1944 public static void writeTaxonomyLinks( final Writer writer,
1945 final String species,
1946 final Map<String, Integer> tax_code_to_id_map ) throws IOException {
1947 if ( ( species.length() > 1 ) && ( species.indexOf( '_' ) < 1 ) ) {
1948 writer.write( " [" );
1949 if ( ( tax_code_to_id_map != null ) && tax_code_to_id_map.containsKey( species ) ) {
1950 writer.write( "<a href=\"" + SurfacingConstants.UNIPROT_TAXONOMY_ID_LINK
1951 + tax_code_to_id_map.get( species ) + "\" target=\"taxonomy_window\">uniprot</a>" );
1954 writer.write( "<a href=\"" + SurfacingConstants.EOL_LINK + species
1955 + "\" target=\"taxonomy_window\">eol</a>" );
1956 writer.write( "|" );
1957 writer.write( "<a href=\"" + SurfacingConstants.GOOGLE_SCHOLAR_SEARCH + species
1958 + "\" target=\"taxonomy_window\">scholar</a>" );
1959 writer.write( "|" );
1960 writer.write( "<a href=\"" + SurfacingConstants.GOOGLE_WEB_SEARCH_LINK + species
1961 + "\" target=\"taxonomy_window\">google</a>" );
1963 writer.write( "]" );
// Increments the occurrence count for key s in the given map (count map / multiset
// helper). The visible branch handles an existing key; the branch that inserts the
// initial count for a new key is elided from this extract.
1967 private final static void addToCountMap( final Map<String, Integer> map, final String s ) {
1968 if ( map.containsKey( s ) ) {
1969 map.put( s, map.get( s ) + 1 );
// Analyzes independent (parallel/repeated) gains of binary domain combinations (DCs)
// on a Fitch-parsimony-annotated phylogeny and writes several report files:
//  - outfilename_for_counts: histogram of "gained N times" -> number of DCs
//  - outfilename_for_dc: the DC lists per gain count
//  - outfilename_for_dc_for_go_mapping(_unique): individual domains per gain count,
//    sorted (non-unique via PriorityQueue, unique via SortedSet), for GO mapping
//  - outfilename_for_rank_counts / _ancestor_species_counts: taxonomy rank and
//    ancestor species of the LCAs of all external-node pairs carrying a multiply
//    gained DC
//  - outfilename_for_protein_stats (optional): domain-length, protein-length and
//    domain-count statistics, split by gain count and by gained-once vs. gained-
//    multiple-times
// NOTE(review): this listing is an extract with lines elided (try blocks, breaks,
// else branches, some closes); comments describe only what the visible lines show.
1976 private static void calculateIndependentDomainCombinationGains( final Phylogeny local_phylogeny_l,
1977 final String outfilename_for_counts,
1978 final String outfilename_for_dc,
1979 final String outfilename_for_dc_for_go_mapping,
1980 final String outfilename_for_dc_for_go_mapping_unique,
1981 final String outfilename_for_rank_counts,
1982 final String outfilename_for_ancestor_species_counts,
1983 final String outfilename_for_protein_stats,
1984 final Map<String, DescriptiveStatistics> protein_length_stats_by_dc,
1985 final Map<String, DescriptiveStatistics> domain_number_stats_by_dc,
1986 final Map<String, DescriptiveStatistics> domain_length_stats_by_domain ) {
1989 // if ( protein_length_stats_by_dc != null ) {
1990 // for( final Entry<?, DescriptiveStatistics> entry : protein_length_stats_by_dc.entrySet() ) {
1991 // System.out.print( entry.getKey().toString() );
1992 // System.out.print( ": " );
1993 // double[] a = entry.getValue().getDataAsDoubleArray();
1994 // for( int i = 0; i < a.length; i++ ) {
1995 // System.out.print( a[ i ] + " " );
1997 // System.out.println();
2000 // if ( domain_number_stats_by_dc != null ) {
2001 // for( final Entry<?, DescriptiveStatistics> entry : domain_number_stats_by_dc.entrySet() ) {
2002 // System.out.print( entry.getKey().toString() );
2003 // System.out.print( ": " );
2004 // double[] a = entry.getValue().getDataAsDoubleArray();
2005 // for( int i = 0; i < a.length; i++ ) {
2006 // System.out.print( a[ i ] + " " );
2008 // System.out.println();
2012 final BufferedWriter out_counts = new BufferedWriter( new FileWriter( outfilename_for_counts ) );
2013 final BufferedWriter out_dc = new BufferedWriter( new FileWriter( outfilename_for_dc ) );
2014 final BufferedWriter out_dc_for_go_mapping = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping ) );
2015 final BufferedWriter out_dc_for_go_mapping_unique = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping_unique ) );
// Count, over all nodes of the phylogeny, how many times each DC was gained
// (i.e. how many independent gain events per DC).
2016 final SortedMap<String, Integer> dc_gain_counts = new TreeMap<String, Integer>();
2017 for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorPostorder(); it.hasNext(); ) {
2018 final PhylogenyNode n = it.next();
2019 final Set<String> gained_dc = n.getNodeData().getBinaryCharacters().getGainedCharacters();
2020 for( final String dc : gained_dc ) {
2021 if ( dc_gain_counts.containsKey( dc ) ) {
2022 dc_gain_counts.put( dc, dc_gain_counts.get( dc ) + 1 );
2025 dc_gain_counts.put( dc, 1 );
// Invert the per-DC gain counts into a histogram (gain count -> #DCs) plus, per gain
// count, the DC list, the (sorted, possibly duplicated) individual domains, the unique
// individual domains, and optional per-count statistics aggregates.
2029 final SortedMap<Integer, Integer> histogram = new TreeMap<Integer, Integer>();
2030 final SortedMap<Integer, StringBuilder> domain_lists = new TreeMap<Integer, StringBuilder>();
2031 final SortedMap<Integer, DescriptiveStatistics> dc_reapp_counts_to_protein_length_stats = new TreeMap<Integer, DescriptiveStatistics>();
2032 final SortedMap<Integer, DescriptiveStatistics> dc_reapp_counts_to_domain_number_stats = new TreeMap<Integer, DescriptiveStatistics>();
2033 final SortedMap<Integer, DescriptiveStatistics> dc_reapp_counts_to_domain_lengths_stats = new TreeMap<Integer, DescriptiveStatistics>();
2034 final SortedMap<Integer, PriorityQueue<String>> domain_lists_go = new TreeMap<Integer, PriorityQueue<String>>();
2035 final SortedMap<Integer, SortedSet<String>> domain_lists_go_unique = new TreeMap<Integer, SortedSet<String>>();
2036 final Set<String> dcs = dc_gain_counts.keySet();
2037 final SortedSet<String> more_than_once = new TreeSet<String>();
// Separate aggregates for DCs gained exactly once vs. gained multiple times.
2038 DescriptiveStatistics gained_once_lengths_stats = new BasicDescriptiveStatistics();
2039 DescriptiveStatistics gained_once_domain_count_stats = new BasicDescriptiveStatistics();
2040 DescriptiveStatistics gained_multiple_times_lengths_stats = new BasicDescriptiveStatistics();
2041 final DescriptiveStatistics gained_multiple_times_domain_count_stats = new BasicDescriptiveStatistics();
// Domain lengths are summed manually (sum + count) instead of via DescriptiveStatistics.
2042 long gained_multiple_times_domain_length_sum = 0;
2043 long gained_once_domain_length_sum = 0;
2044 long gained_multiple_times_domain_length_count = 0;
2045 long gained_once_domain_length_count = 0;
2046 for( final String dc : dcs ) {
2047 final int count = dc_gain_counts.get( dc );
2048 if ( histogram.containsKey( count ) ) {
2049 histogram.put( count, histogram.get( count ) + 1 );
2050 domain_lists.get( count ).append( ", " + dc );
2051 domain_lists_go.get( count ).addAll( splitDomainCombination( dc ) );
2052 domain_lists_go_unique.get( count ).addAll( splitDomainCombination( dc ) );
2055 histogram.put( count, 1 );
2056 domain_lists.put( count, new StringBuilder( dc ) );
2057 final PriorityQueue<String> q = new PriorityQueue<String>();
2058 q.addAll( splitDomainCombination( dc ) );
2059 domain_lists_go.put( count, q );
2060 final SortedSet<String> set = new TreeSet<String>();
2061 set.addAll( splitDomainCombination( dc ) );
2062 domain_lists_go_unique.put( count, set );
// Per-gain-count statistics use the MEAN of each DC's underlying stats as one sample.
2064 if ( protein_length_stats_by_dc != null ) {
2065 if ( !dc_reapp_counts_to_protein_length_stats.containsKey( count ) ) {
2066 dc_reapp_counts_to_protein_length_stats.put( count, new BasicDescriptiveStatistics() );
2068 dc_reapp_counts_to_protein_length_stats.get( count ).addValue( protein_length_stats_by_dc.get( dc )
2069 .arithmeticMean() );
2071 if ( domain_number_stats_by_dc != null ) {
2072 if ( !dc_reapp_counts_to_domain_number_stats.containsKey( count ) ) {
2073 dc_reapp_counts_to_domain_number_stats.put( count, new BasicDescriptiveStatistics() );
2075 dc_reapp_counts_to_domain_number_stats.get( count ).addValue( domain_number_stats_by_dc.get( dc )
2076 .arithmeticMean() );
2078 if ( domain_length_stats_by_domain != null ) {
2079 if ( !dc_reapp_counts_to_domain_lengths_stats.containsKey( count ) ) {
2080 dc_reapp_counts_to_domain_lengths_stats.put( count, new BasicDescriptiveStatistics() );
// A DC is serialized as "domainA=domainB"; both member domains contribute.
2082 final String[] ds = dc.split( "=" );
2083 dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain
2084 .get( ds[ 0 ] ).arithmeticMean() );
2085 dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain
2086 .get( ds[ 1 ] ).arithmeticMean() );
// Gained-multiple-times branch: record the DC and fold in its raw data points.
2089 more_than_once.add( dc );
2090 if ( protein_length_stats_by_dc != null ) {
2091 final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc );
2092 for( final double element : s.getData() ) {
2093 gained_multiple_times_lengths_stats.addValue( element );
2096 if ( domain_number_stats_by_dc != null ) {
2097 final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc );
2098 for( final double element : s.getData() ) {
2099 gained_multiple_times_domain_count_stats.addValue( element );
2102 if ( domain_length_stats_by_domain != null ) {
2103 final String[] ds = dc.split( "=" );
2104 final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] );
2105 final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] );
2106 for( final double element : s0.getData() ) {
2107 gained_multiple_times_domain_length_sum += element;
2108 ++gained_multiple_times_domain_length_count;
2110 for( final double element : s1.getData() ) {
2111 gained_multiple_times_domain_length_sum += element;
2112 ++gained_multiple_times_domain_length_count;
// Gained-once branch: same aggregation into the "once" accumulators.
2117 if ( protein_length_stats_by_dc != null ) {
2118 final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc );
2119 for( final double element : s.getData() ) {
2120 gained_once_lengths_stats.addValue( element );
2123 if ( domain_number_stats_by_dc != null ) {
2124 final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc );
2125 for( final double element : s.getData() ) {
2126 gained_once_domain_count_stats.addValue( element );
2129 if ( domain_length_stats_by_domain != null ) {
2130 final String[] ds = dc.split( "=" );
2131 final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] );
2132 final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] );
2133 for( final double element : s0.getData() ) {
2134 gained_once_domain_length_sum += element;
2135 ++gained_once_domain_length_count;
2137 for( final double element : s1.getData() ) {
2138 gained_once_domain_length_sum += element;
2139 ++gained_once_domain_length_count;
// Emit the histogram and the per-gain-count DC/domain lists.
2144 final Set<Integer> histogram_keys = histogram.keySet();
2145 for( final Integer histogram_key : histogram_keys ) {
2146 final int count = histogram.get( histogram_key );
2147 final StringBuilder dc = domain_lists.get( histogram_key );
2148 out_counts.write( histogram_key + "\t" + count + ForesterUtil.LINE_SEPARATOR );
2149 out_dc.write( histogram_key + "\t" + dc + ForesterUtil.LINE_SEPARATOR );
2150 out_dc_for_go_mapping.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR );
// PriorityQueue iteration order is unspecified, so sort a snapshot before writing.
2151 final Object[] sorted = domain_lists_go.get( histogram_key ).toArray();
2152 Arrays.sort( sorted );
2153 for( final Object domain : sorted ) {
2154 out_dc_for_go_mapping.write( domain + ForesterUtil.LINE_SEPARATOR );
2156 out_dc_for_go_mapping_unique.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR );
2157 for( final String domain : domain_lists_go_unique.get( histogram_key ) ) {
2158 out_dc_for_go_mapping_unique.write( domain + ForesterUtil.LINE_SEPARATOR );
2163 out_dc_for_go_mapping.close();
2164 out_dc_for_go_mapping_unique.close();
// For every multiply gained DC, collect all external nodes that gained it and count the
// taxonomy rank and ancestor species of the LCA of every pair of those nodes.
// NOTE(review): this pairwise LCA loop is O(#DCs * nodes^2); acceptable for the sizes
// this tool targets, but worth confirming for large phylogenies.
2165 final SortedMap<String, Integer> lca_rank_counts = new TreeMap<String, Integer>();
2166 final SortedMap<String, Integer> lca_ancestor_species_counts = new TreeMap<String, Integer>();
2167 for( final String dc : more_than_once ) {
2168 final List<PhylogenyNode> nodes = new ArrayList<PhylogenyNode>();
2169 for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorExternalForward(); it.hasNext(); ) {
2170 final PhylogenyNode n = it.next();
2171 if ( n.getNodeData().getBinaryCharacters().getGainedCharacters().contains( dc ) ) {
2175 for( int i = 0; i < ( nodes.size() - 1 ); ++i ) {
2176 for( int j = i + 1; j < nodes.size(); ++j ) {
2177 final PhylogenyNode lca = PhylogenyMethods.calculateLCA( nodes.get( i ), nodes.get( j ) );
2178 String rank = "unknown";
2179 if ( lca.getNodeData().isHasTaxonomy()
2180 && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getRank() ) ) {
2181 rank = lca.getNodeData().getTaxonomy().getRank();
2183 addToCountMap( lca_rank_counts, rank );
// Prefer scientific name, then common name, then the bare node name for the LCA label.
2185 if ( lca.getNodeData().isHasTaxonomy()
2186 && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getScientificName() ) ) {
2187 lca_species = lca.getNodeData().getTaxonomy().getScientificName();
2189 else if ( lca.getNodeData().isHasTaxonomy()
2190 && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getCommonName() ) ) {
2191 lca_species = lca.getNodeData().getTaxonomy().getCommonName();
2194 lca_species = lca.getName();
2196 addToCountMap( lca_ancestor_species_counts, lca_species );
2200 final BufferedWriter out_for_rank_counts = new BufferedWriter( new FileWriter( outfilename_for_rank_counts ) );
2201 final BufferedWriter out_for_ancestor_species_counts = new BufferedWriter( new FileWriter( outfilename_for_ancestor_species_counts ) );
2202 ForesterUtil.map2writer( out_for_rank_counts, lca_rank_counts, "\t", ForesterUtil.LINE_SEPARATOR );
2203 ForesterUtil.map2writer( out_for_ancestor_species_counts,
2204 lca_ancestor_species_counts,
2206 ForesterUtil.LINE_SEPARATOR );
2207 out_for_rank_counts.close();
2208 out_for_ancestor_species_counts.close();
// Optional protein-statistics report, written only when a filename was given and at
// least one statistics map was supplied.
2209 if ( !ForesterUtil.isEmpty( outfilename_for_protein_stats )
2210 && ( ( domain_length_stats_by_domain != null ) || ( protein_length_stats_by_dc != null ) || ( domain_number_stats_by_dc != null ) ) ) {
2211 final BufferedWriter w = new BufferedWriter( new FileWriter( outfilename_for_protein_stats ) );
2212 w.write( "Domain Lengths: " );
2214 if ( domain_length_stats_by_domain != null ) {
2215 for( final Entry<Integer, DescriptiveStatistics> entry : dc_reapp_counts_to_domain_lengths_stats
2217 w.write( entry.getKey().toString() );
2218 w.write( "\t" + entry.getValue().arithmeticMean() );
2219 w.write( "\t" + entry.getValue().median() );
2226 w.write( "Protein Lengths: " );
2228 if ( protein_length_stats_by_dc != null ) {
2229 for( final Entry<Integer, DescriptiveStatistics> entry : dc_reapp_counts_to_protein_length_stats
2231 w.write( entry.getKey().toString() );
2232 w.write( "\t" + entry.getValue().arithmeticMean() );
2233 w.write( "\t" + entry.getValue().median() );
2240 w.write( "Number of domains: " );
2242 if ( domain_number_stats_by_dc != null ) {
2243 for( final Entry<Integer, DescriptiveStatistics> entry : dc_reapp_counts_to_domain_number_stats
2245 w.write( entry.getKey().toString() );
2246 w.write( "\t" + entry.getValue().arithmeticMean() );
2247 w.write( "\t" + entry.getValue().median() );
2254 w.write( "Gained once, domain lengths:" );
2256 w.write( "N: " + gained_once_domain_length_count );
2258 w.write( "Avg: " + ( ( double ) gained_once_domain_length_sum / gained_once_domain_length_count ) );
2261 w.write( "Gained multiple times, domain lengths:" );
2263 w.write( "N: " + gained_multiple_times_domain_length_count );
2266 + ( ( double ) gained_multiple_times_domain_length_sum / gained_multiple_times_domain_length_count ) );
2271 w.write( "Gained once, protein lengths:" );
2273 w.write( gained_once_lengths_stats.toString() );
// Large accumulators are nulled after use, presumably to release memory early.
2274 gained_once_lengths_stats = null;
2277 w.write( "Gained once, domain counts:" );
2279 w.write( gained_once_domain_count_stats.toString() );
2280 gained_once_domain_count_stats = null;
2283 w.write( "Gained multiple times, protein lengths:" );
2285 w.write( gained_multiple_times_lengths_stats.toString() );
2286 gained_multiple_times_lengths_stats = null;
2289 w.write( "Gained multiple times, domain counts:" );
2291 w.write( gained_multiple_times_domain_count_stats.toString() );
// Write failures here are only warnings, not fatal.
2296 catch ( final IOException e ) {
2297 ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e );
2299 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch counts to ["
2300 + outfilename_for_counts + "]" );
2301 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch lists to ["
2302 + outfilename_for_dc + "]" );
2303 ForesterUtil.programMessage( surfacing.PRG_NAME,
2304 "Wrote independent domain combination gains fitch lists to (for GO mapping) ["
2305 + outfilename_for_dc_for_go_mapping + "]" );
2306 ForesterUtil.programMessage( surfacing.PRG_NAME,
2307 "Wrote independent domain combination gains fitch lists to (for GO mapping, unique) ["
2308 + outfilename_for_dc_for_go_mapping_unique + "]" );
// Collects, over all descendants of subtree_root, either the gained (get_gains == true)
// or the lost binary characters (domains/DCs) into one sorted set.
2311 private static SortedSet<String> collectAllDomainsChangedOnSubtree( final PhylogenyNode subtree_root,
2312 final boolean get_gains ) {
2313 final SortedSet<String> domains = new TreeSet<String>();
2314 for( final PhylogenyNode descendant : PhylogenyMethods.getAllDescendants( subtree_root ) ) {
2315 final BinaryCharacters chars = descendant.getNodeData().getBinaryCharacters();
2317 domains.addAll( chars.getGainedCharacters() );
2320 domains.addAll( chars.getLostCharacters() );
// Creates (if necessary) and returns the nested output directory for per-node
// domain gain/loss files, building the path
//   <parent of outfile>/<base_dir>/(DC|DOMAINS)/(GAINS|LOSSES|PRESENT)
// from whether domain combinations are requested and which gain/loss state is wanted.
// Each level is mkdir'd only when it does not already exist.
// NOTE(review): mkdir() return values are ignored; a failed create would surface later
// when writing into the directory.
2326 private static File createBaseDirForPerNodeDomainFiles( final String base_dir,
2327 final boolean domain_combinations,
2328 final CharacterStateMatrix.GainLossStates state,
2329 final String outfile ) {
2330 File per_node_go_mapped_domain_gain_loss_files_base_dir = new File( new File( outfile ).getParent()
2331 + ForesterUtil.FILE_SEPARATOR + base_dir );
2332 if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) {
2333 per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir();
2335 if ( domain_combinations ) {
2336 per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
2337 + ForesterUtil.FILE_SEPARATOR + "DC" );
2340 per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
2341 + ForesterUtil.FILE_SEPARATOR + "DOMAINS" );
2343 if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) {
2344 per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir();
2346 if ( state == GainLossStates.GAIN ) {
2347 per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
2348 + ForesterUtil.FILE_SEPARATOR + "GAINS" );
2350 else if ( state == GainLossStates.LOSS ) {
2351 per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
2352 + ForesterUtil.FILE_SEPARATOR + "LOSSES" );
2355 per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
2356 + ForesterUtil.FILE_SEPARATOR + "PRESENT" );
2358 if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) {
2359 per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir();
2361 return per_node_go_mapped_domain_gain_loss_files_base_dir;
// Flattens a genome's combinable domains into the sorted set of all binary domain
// combinations they generate (duplicates collapse via the set).
2364 private static SortedSet<BinaryDomainCombination> createSetOfAllBinaryDomainCombinationsPerGenome( final GenomeWideCombinableDomains gwcd ) {
2365 final SortedMap<String, CombinableDomains> cds = gwcd.getAllCombinableDomainsIds();
2366 final SortedSet<BinaryDomainCombination> binary_combinations = new TreeSet<BinaryDomainCombination>();
2367 for( final String domain_id : cds.keySet() ) {
2368 final CombinableDomains cd = cds.get( domain_id );
2369 binary_combinations.addAll( cd.toBinaryDomainCombinations() );
2371 return binary_combinations;
// Splits a stringified binary domain combination of the form "domainA=domainB" into its
// two member domain ids. A malformed input (not exactly two '='-separated parts) is
// reported as an error via ForesterUtil.printErrorMessage.
2374 private static List<String> splitDomainCombination( final String dc ) {
2375 final String[] s = dc.split( "=" );
2376 if ( s.length != 2 ) {
2377 ForesterUtil.printErrorMessage( surfacing.PRG_NAME, "Stringyfied domain combination has illegal format: "
2381 final List<String> l = new ArrayList<String>( 2 );
2387 private static void writeAllEncounteredPfamsToFile( final Map<String, List<GoId>> domain_id_to_go_ids_map,
2388 final Map<GoId, GoTerm> go_id_to_term_map,
2389 final String outfile_name,
2390 final SortedSet<String> all_pfams_encountered ) {
2391 final File all_pfams_encountered_file = new File( outfile_name + surfacing.ALL_PFAMS_ENCOUNTERED_SUFFIX );
2392 final File all_pfams_encountered_with_go_annotation_file = new File( outfile_name
2393 + surfacing.ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX );
2394 final File encountered_pfams_summary_file = new File( outfile_name + surfacing.ENCOUNTERED_PFAMS_SUMMARY_SUFFIX );
2395 int biological_process_counter = 0;
2396 int cellular_component_counter = 0;
2397 int molecular_function_counter = 0;
2398 int pfams_with_mappings_counter = 0;
2399 int pfams_without_mappings_counter = 0;
2400 int pfams_without_mappings_to_bp_or_mf_counter = 0;
2401 int pfams_with_mappings_to_bp_or_mf_counter = 0;
2403 final Writer all_pfams_encountered_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_file ) );
2404 final Writer all_pfams_encountered_with_go_annotation_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_with_go_annotation_file ) );
2405 final Writer summary_writer = new BufferedWriter( new FileWriter( encountered_pfams_summary_file ) );
2406 summary_writer.write( "# Pfam to GO mapping summary" );
2407 summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2408 summary_writer.write( "# Actual summary is at the end of this file." );
2409 summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2410 summary_writer.write( "# Encountered Pfams without a GO mapping:" );
2411 summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2412 for( final String pfam : all_pfams_encountered ) {
2413 all_pfams_encountered_writer.write( pfam );
2414 all_pfams_encountered_writer.write( ForesterUtil.LINE_SEPARATOR );
2415 final String domain_id = new String( pfam );
2416 if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) {
2417 ++pfams_with_mappings_counter;
2418 all_pfams_encountered_with_go_annotation_writer.write( pfam );
2419 all_pfams_encountered_with_go_annotation_writer.write( ForesterUtil.LINE_SEPARATOR );
2420 final List<GoId> go_ids = domain_id_to_go_ids_map.get( domain_id );
2421 boolean maps_to_bp = false;
2422 boolean maps_to_cc = false;
2423 boolean maps_to_mf = false;
2424 for( final GoId go_id : go_ids ) {
2425 final GoTerm go_term = go_id_to_term_map.get( go_id );
2426 if ( go_term.getGoNameSpace().isBiologicalProcess() ) {
2429 else if ( go_term.getGoNameSpace().isCellularComponent() ) {
2432 else if ( go_term.getGoNameSpace().isMolecularFunction() ) {
2437 ++biological_process_counter;
2440 ++cellular_component_counter;
2443 ++molecular_function_counter;
2445 if ( maps_to_bp || maps_to_mf ) {
2446 ++pfams_with_mappings_to_bp_or_mf_counter;
2449 ++pfams_without_mappings_to_bp_or_mf_counter;
2453 ++pfams_without_mappings_to_bp_or_mf_counter;
2454 ++pfams_without_mappings_counter;
2455 summary_writer.write( pfam );
2456 summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2459 all_pfams_encountered_writer.close();
2460 all_pfams_encountered_with_go_annotation_writer.close();
2461 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + all_pfams_encountered.size()
2462 + "] encountered Pfams to: \"" + all_pfams_encountered_file + "\"" );
2463 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + pfams_with_mappings_counter
2464 + "] encountered Pfams with GO mappings to: \"" + all_pfams_encountered_with_go_annotation_file
2466 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote summary (including all ["
2467 + pfams_without_mappings_counter + "] encountered Pfams without GO mappings) to: \""
2468 + encountered_pfams_summary_file + "\"" );
2469 ForesterUtil.programMessage( surfacing.PRG_NAME, "Sum of Pfams encountered : "
2470 + all_pfams_encountered.size() );
2471 ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without a mapping : "
2472 + pfams_without_mappings_counter + " ["
2473 + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
2474 ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without mapping to proc. or func. : "
2475 + pfams_without_mappings_to_bp_or_mf_counter + " ["
2476 + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
2477 ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping : "
2478 + pfams_with_mappings_counter + " ["
2479 + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
2480 ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping to proc. or func. : "
2481 + pfams_with_mappings_to_bp_or_mf_counter + " ["
2482 + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
2483 ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to biological process: "
2484 + biological_process_counter + " ["
2485 + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" );
2486 ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to molecular function: "
2487 + molecular_function_counter + " ["
2488 + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" );
2489 ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to cellular component: "
2490 + cellular_component_counter + " ["
2491 + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" );
2492 summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2493 summary_writer.write( "# Sum of Pfams encountered : " + all_pfams_encountered.size() );
2494 summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2495 summary_writer.write( "# Pfams without a mapping : " + pfams_without_mappings_counter
2496 + " [" + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
2497 summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2498 summary_writer.write( "# Pfams without mapping to proc. or func. : "
2499 + pfams_without_mappings_to_bp_or_mf_counter + " ["
2500 + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
2501 summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2502 summary_writer.write( "# Pfams with a mapping : " + pfams_with_mappings_counter + " ["
2503 + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
2504 summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2505 summary_writer.write( "# Pfams with a mapping to proc. or func. : "
2506 + pfams_with_mappings_to_bp_or_mf_counter + " ["
2507 + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
2508 summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2509 summary_writer.write( "# Pfams with mapping to biological process: " + biological_process_counter + " ["
2510 + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" );
2511 summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2512 summary_writer.write( "# Pfams with mapping to molecular function: " + molecular_function_counter + " ["
2513 + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" );
2514 summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2515 summary_writer.write( "# Pfams with mapping to cellular component: " + cellular_component_counter + " ["
2516 + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" );
2517 summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2518 summary_writer.close();
2520 catch ( final IOException e ) {
2521 ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e );
2525 private static void writeDomainData( final Map<String, List<GoId>> domain_id_to_go_ids_map,
2526 final Map<GoId, GoTerm> go_id_to_term_map,
2527 final GoNameSpace go_namespace_limit,
2529 final String domain_0,
2530 final String domain_1,
2531 final String prefix_for_html,
2532 final String character_separator_for_non_html_output,
2533 final Map<String, Set<String>>[] domain_id_to_secondary_features_maps,
2534 final Set<GoId> all_go_ids ) throws IOException {
2535 boolean any_go_annotation_present = false;
2536 boolean first_has_no_go = false;
2537 int domain_count = 2; // To distinguish between domains and binary domain combinations.
2538 if ( ForesterUtil.isEmpty( domain_1 ) ) {
2541 // The following has a difficult to understand logic.
2542 for( int d = 0; d < domain_count; ++d ) {
2543 List<GoId> go_ids = null;
2544 boolean go_annotation_present = false;
2546 if ( domain_id_to_go_ids_map.containsKey( domain_0 ) ) {
2547 go_annotation_present = true;
2548 any_go_annotation_present = true;
2549 go_ids = domain_id_to_go_ids_map.get( domain_0 );
2552 first_has_no_go = true;
2556 if ( domain_id_to_go_ids_map.containsKey( domain_1 ) ) {
2557 go_annotation_present = true;
2558 any_go_annotation_present = true;
2559 go_ids = domain_id_to_go_ids_map.get( domain_1 );
2562 if ( go_annotation_present ) {
2563 boolean first = ( ( d == 0 ) || ( ( d == 1 ) && first_has_no_go ) );
2564 for( final GoId go_id : go_ids ) {
2565 out.write( "<tr>" );
2568 writeDomainIdsToHtml( out,
2572 domain_id_to_secondary_features_maps );
2575 out.write( "<td></td>" );
2577 if ( !go_id_to_term_map.containsKey( go_id ) ) {
2578 throw new IllegalArgumentException( "GO-id [" + go_id + "] not found in GO-id to GO-term map" );
2580 final GoTerm go_term = go_id_to_term_map.get( go_id );
2581 if ( ( go_namespace_limit == null ) || go_namespace_limit.equals( go_term.getGoNameSpace() ) ) {
2582 // final String top = GoUtils.getPenultimateGoTerm( go_term, go_id_to_term_map ).getName();
2583 final String go_id_str = go_id.getId();
2584 out.write( "<td>" );
2585 out.write( "<a href=\"" + SurfacingConstants.AMIGO_LINK + go_id_str
2586 + "\" target=\"amigo_window\">" + go_id_str + "</a>" );
2587 out.write( "</td><td>" );
2588 out.write( go_term.getName() );
2589 if ( domain_count == 2 ) {
2590 out.write( " (" + d + ")" );
2592 out.write( "</td><td>" );
2593 // out.write( top );
2594 // out.write( "</td><td>" );
2596 out.write( go_term.getGoNameSpace().toShortString() );
2598 out.write( "</td>" );
2599 if ( all_go_ids != null ) {
2600 all_go_ids.add( go_id );
2604 out.write( "<td>" );
2605 out.write( "</td><td>" );
2606 out.write( "</td><td>" );
2607 out.write( "</td><td>" );
2608 out.write( "</td>" );
2610 out.write( "</tr>" );
2611 out.write( SurfacingConstants.NL );
2614 } // for( int d = 0; d < domain_count; ++d )
2615 if ( !any_go_annotation_present ) {
2616 out.write( "<tr>" );
2617 writeDomainIdsToHtml( out, domain_0, domain_1, prefix_for_html, domain_id_to_secondary_features_maps );
2618 out.write( "<td>" );
2619 out.write( "</td><td>" );
2620 out.write( "</td><td>" );
2621 out.write( "</td><td>" );
2622 out.write( "</td>" );
2623 out.write( "</tr>" );
2624 out.write( SurfacingConstants.NL );
2628 private static void writeDomainIdsToHtml( final Writer out,
2629 final String domain_0,
2630 final String domain_1,
2631 final String prefix_for_detailed_html,
2632 final Map<String, Set<String>>[] domain_id_to_secondary_features_maps )
2633 throws IOException {
2634 out.write( "<td>" );
2635 if ( !ForesterUtil.isEmpty( prefix_for_detailed_html ) ) {
2636 out.write( prefix_for_detailed_html );
2639 out.write( "<a href=\"" + SurfacingConstants.PFAM_FAMILY_ID_LINK + domain_0 + "\">" + domain_0 + "</a>" );
2640 out.write( "</td>" );
2643 private static void writeDomainsToIndividualFilePerTreeNode( final Writer individual_files_writer,
2644 final String domain_0,
2645 final String domain_1 ) throws IOException {
2646 individual_files_writer.write( domain_0 );
2647 individual_files_writer.write( ForesterUtil.LINE_SEPARATOR );
2648 if ( !ForesterUtil.isEmpty( domain_1 ) ) {
2649 individual_files_writer.write( domain_1 );
2650 individual_files_writer.write( ForesterUtil.LINE_SEPARATOR );
2654 private static void writePfamsToFile( final String outfile_name, final SortedSet<String> pfams ) {
2656 final Writer writer = new BufferedWriter( new FileWriter( new File( outfile_name ) ) );
2657 for( final String pfam : pfams ) {
2658 writer.write( pfam );
2659 writer.write( ForesterUtil.LINE_SEPARATOR );
2662 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote " + pfams.size() + " pfams to [" + outfile_name
2665 catch ( final IOException e ) {
2666 ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e );
2670 private static void writeToNexus( final String outfile_name,
2671 final CharacterStateMatrix<BinaryStates> matrix,
2672 final Phylogeny phylogeny ) {
2673 if ( !( matrix instanceof BasicCharacterStateMatrix ) ) {
2674 throw new IllegalArgumentException( "can only write matrices of type [" + BasicCharacterStateMatrix.class
2677 final BasicCharacterStateMatrix<BinaryStates> my_matrix = ( org.forester.evoinference.matrix.character.BasicCharacterStateMatrix<BinaryStates> ) matrix;
2678 final List<Phylogeny> phylogenies = new ArrayList<Phylogeny>( 1 );
2679 phylogenies.add( phylogeny );
2681 final BufferedWriter w = new BufferedWriter( new FileWriter( outfile_name ) );
2682 w.write( NexusConstants.NEXUS );
2683 w.write( ForesterUtil.LINE_SEPARATOR );
2684 my_matrix.writeNexusTaxaBlock( w );
2685 my_matrix.writeNexusBinaryChractersBlock( w );
2686 PhylogenyWriter.writeNexusTreesBlock( w, phylogenies, NH_CONVERSION_SUPPORT_VALUE_STYLE.NONE );
2689 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote Nexus file: \"" + outfile_name + "\"" );
2691 catch ( final IOException e ) {
2692 ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
2696 private static void writeToNexus( final String outfile_name,
2697 final DomainParsimonyCalculator domain_parsimony,
2698 final Phylogeny phylogeny ) {
2699 writeToNexus( outfile_name + surfacing.NEXUS_EXTERNAL_DOMAINS,
2700 domain_parsimony.createMatrixOfDomainPresenceOrAbsence(),
2702 writeToNexus( outfile_name + surfacing.NEXUS_EXTERNAL_DOMAIN_COMBINATIONS,
2703 domain_parsimony.createMatrixOfBinaryDomainCombinationPresenceOrAbsence(),
2707 final static class DomainComparator implements Comparator<Domain> {
2709 final private boolean _ascending;
2711 public DomainComparator( final boolean ascending ) {
2712 _ascending = ascending;
2716 public final int compare( final Domain d0, final Domain d1 ) {
2717 if ( d0.getFrom() < d1.getFrom() ) {
2718 return _ascending ? -1 : 1;
2720 else if ( d0.getFrom() > d1.getFrom() ) {
2721 return _ascending ? 1 : -1;