3 // FORESTER -- software libraries and applications
4 // for evolutionary biology research and applications.
6 // Copyright (C) 2008-2009 Christian M. Zmasek
7 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 // Contact: phylosoft @ gmail . com
25 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
27 package org.forester.surfacing;
29 import java.io.BufferedWriter;
31 import java.io.FileWriter;
32 import java.io.IOException;
33 import java.io.Writer;
34 import java.text.DecimalFormat;
35 import java.text.NumberFormat;
36 import java.util.ArrayList;
37 import java.util.Arrays;
38 import java.util.Collections;
39 import java.util.Comparator;
40 import java.util.HashMap;
41 import java.util.HashSet;
42 import java.util.Iterator;
43 import java.util.List;
45 import java.util.Map.Entry;
46 import java.util.PriorityQueue;
48 import java.util.SortedMap;
49 import java.util.SortedSet;
50 import java.util.TreeMap;
51 import java.util.TreeSet;
52 import java.util.regex.Matcher;
53 import java.util.regex.Pattern;
55 import org.forester.application.surfacing;
56 import org.forester.evoinference.distance.NeighborJoining;
57 import org.forester.evoinference.matrix.character.BasicCharacterStateMatrix;
58 import org.forester.evoinference.matrix.character.CharacterStateMatrix;
59 import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates;
60 import org.forester.evoinference.matrix.character.CharacterStateMatrix.Format;
61 import org.forester.evoinference.matrix.character.CharacterStateMatrix.GainLossStates;
62 import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
63 import org.forester.evoinference.matrix.distance.DistanceMatrix;
64 import org.forester.go.GoId;
65 import org.forester.go.GoNameSpace;
66 import org.forester.go.GoTerm;
67 import org.forester.go.PfamToGoMapping;
68 import org.forester.io.parsers.nexus.NexusConstants;
69 import org.forester.io.writers.PhylogenyWriter;
70 import org.forester.phylogeny.Phylogeny;
71 import org.forester.phylogeny.PhylogenyMethods;
72 import org.forester.phylogeny.PhylogenyNode;
73 import org.forester.phylogeny.PhylogenyNode.NH_CONVERSION_SUPPORT_VALUE_STYLE;
74 import org.forester.phylogeny.data.BinaryCharacters;
75 import org.forester.phylogeny.data.Confidence;
76 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
77 import org.forester.protein.BasicDomain;
78 import org.forester.protein.BasicProtein;
79 import org.forester.protein.BinaryDomainCombination;
80 import org.forester.protein.Domain;
81 import org.forester.protein.DomainId;
82 import org.forester.protein.Protein;
83 import org.forester.species.Species;
84 import org.forester.surfacing.DomainSimilarityCalculator.Detailedness;
85 import org.forester.surfacing.DomainSimilarityCalculator.GoAnnotationOutput;
86 import org.forester.surfacing.GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder;
87 import org.forester.util.AsciiHistogram;
88 import org.forester.util.BasicDescriptiveStatistics;
89 import org.forester.util.BasicTable;
90 import org.forester.util.BasicTableParser;
91 import org.forester.util.DescriptiveStatistics;
92 import org.forester.util.ForesterUtil;
94 public final class SurfacingUtil {
96 private final static NumberFormat FORMATTER = new DecimalFormat( "0.0E0" );
97 private final static NumberFormat FORMATTER_3 = new DecimalFormat( "0.000" );
98 private static final Comparator<Domain> ASCENDING_CONFIDENCE_VALUE_ORDER = new Comparator<Domain>() {
101 public int compare( final Domain d1,
103 if ( d1.getPerSequenceEvalue() < d2
104 .getPerSequenceEvalue() ) {
108 .getPerSequenceEvalue() > d2
109 .getPerSequenceEvalue() ) {
113 return d1.compareTo( d2 );
117 public final static Pattern PATTERN_SP_STYLE_TAXONOMY = Pattern.compile( "^[A-Z0-9]{3,5}$" );
118 private static final boolean USE_LAST = true;
120 private SurfacingUtil() {
121 // Hidden constructor.
124 public static void performDomainArchitectureAnalysis( final SortedMap<String, Set<String>> domain_architecutures,
125 final SortedMap<String, Integer> domain_architecuture_counts,
127 final File da_counts_outfile,
128 final File unique_da_outfile ) {
129 checkForOutputFileWriteability( da_counts_outfile );
130 checkForOutputFileWriteability( unique_da_outfile );
132 final BufferedWriter da_counts_out = new BufferedWriter( new FileWriter( da_counts_outfile ) );
133 final BufferedWriter unique_da_out = new BufferedWriter( new FileWriter( unique_da_outfile ) );
134 final Iterator<Entry<String, Integer>> it = domain_architecuture_counts.entrySet().iterator();
135 while ( it.hasNext() ) {
136 final Map.Entry<String, Integer> e = it.next();
137 final String da = e.getKey();
138 final int count = e.getValue();
139 if ( count >= min_count ) {
140 da_counts_out.write( da );
141 da_counts_out.write( "\t" );
142 da_counts_out.write( String.valueOf( count ) );
143 da_counts_out.write( ForesterUtil.LINE_SEPARATOR );
146 final Iterator<Entry<String, Set<String>>> it2 = domain_architecutures.entrySet().iterator();
147 while ( it2.hasNext() ) {
148 final Map.Entry<String, Set<String>> e2 = it2.next();
149 final String genome = e2.getKey();
150 final Set<String> das = e2.getValue();
151 if ( das.contains( da ) ) {
152 unique_da_out.write( genome );
153 unique_da_out.write( "\t" );
154 unique_da_out.write( da );
155 unique_da_out.write( ForesterUtil.LINE_SEPARATOR );
160 unique_da_out.close();
161 da_counts_out.close();
163 catch ( final IOException e ) {
164 ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
166 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + da_counts_outfile + "\"" );
167 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + unique_da_outfile + "\"" );
171 public static int storeDomainArchitectures( final String genome,
172 final SortedMap<String, Set<String>> domain_architecutures,
173 final List<Protein> protein_list,
174 final Map<String, Integer> distinct_domain_architecuture_counts ) {
175 final Set<String> da = new HashSet<String>();
176 domain_architecutures.put( genome, da );
177 for( final Protein protein : protein_list ) {
178 final String da_str = ( ( BasicProtein ) protein ).toDomainArchitectureString( "~", 3, "=" );
179 if ( !da.contains( da_str ) ) {
180 if ( !distinct_domain_architecuture_counts.containsKey( da_str ) ) {
181 distinct_domain_architecuture_counts.put( da_str, 1 );
184 distinct_domain_architecuture_counts.put( da_str,
185 distinct_domain_architecuture_counts.get( da_str ) + 1 );
193 public static void addAllBinaryDomainCombinationToSet( final GenomeWideCombinableDomains genome,
194 final SortedSet<BinaryDomainCombination> binary_domain_combinations ) {
195 final SortedMap<DomainId, CombinableDomains> all_cd = genome.getAllCombinableDomainsIds();
196 for( final DomainId domain_id : all_cd.keySet() ) {
197 binary_domain_combinations.addAll( all_cd.get( domain_id ).toBinaryDomainCombinations() );
201 public static void addAllDomainIdsToSet( final GenomeWideCombinableDomains genome,
202 final SortedSet<DomainId> domain_ids ) {
203 final SortedSet<DomainId> domains = genome.getAllDomainIds();
204 for( final DomainId domain : domains ) {
205 domain_ids.add( domain );
209 public static void addHtmlHead( final Writer w, final String title ) throws IOException {
210 w.write( SurfacingConstants.NL );
212 w.write( "<title>" );
214 w.write( "</title>" );
215 w.write( SurfacingConstants.NL );
216 w.write( "<style>" );
217 w.write( SurfacingConstants.NL );
218 w.write( "a:visited { color : #6633FF; text-decoration : none; }" );
219 w.write( SurfacingConstants.NL );
220 w.write( "a:link { color : #6633FF; text-decoration : none; }" );
221 w.write( SurfacingConstants.NL );
222 w.write( "a:active { color : #99FF00; text-decoration : none; }" );
223 w.write( SurfacingConstants.NL );
224 w.write( "a:hover { color : #FFFFFF; background-color : #99FF00; text-decoration : none; }" );
225 w.write( SurfacingConstants.NL );
226 w.write( "td { text-align: left; vertical-align: top; font-family: Verdana, Arial, Helvetica; font-size: 8pt}" );
227 w.write( SurfacingConstants.NL );
228 w.write( "h1 { color : #0000FF; font-family: Verdana, Arial, Helvetica; font-size: 18pt; font-weight: bold }" );
229 w.write( SurfacingConstants.NL );
230 w.write( "h2 { color : #0000FF; font-family: Verdana, Arial, Helvetica; font-size: 16pt; font-weight: bold }" );
231 w.write( SurfacingConstants.NL );
232 w.write( "</style>" );
233 w.write( SurfacingConstants.NL );
234 w.write( "</head>" );
235 w.write( SurfacingConstants.NL );
238 public static DescriptiveStatistics calculateDescriptiveStatisticsForMeanValues( final Set<DomainSimilarity> similarities ) {
239 final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
240 for( final DomainSimilarity similarity : similarities ) {
241 stats.addValue( similarity.getMeanSimilarityScore() );
246 private static void calculateIndependentDomainCombinationGains( final Phylogeny local_phylogeny_l,
247 final String outfilename_for_counts,
248 final String outfilename_for_dc,
249 final String outfilename_for_dc_for_go_mapping,
250 final String outfilename_for_dc_for_go_mapping_unique,
251 final String outfilename_for_rank_counts,
252 final String outfilename_for_ancestor_species_counts,
253 final String outfilename_for_protein_stats,
254 final Map<String, DescriptiveStatistics> protein_length_stats_by_dc,
255 final Map<String, DescriptiveStatistics> domain_number_stats_by_dc,
256 final Map<String, DescriptiveStatistics> domain_length_stats_by_domain ) {
259 // if ( protein_length_stats_by_dc != null ) {
260 // for( final Entry<?, DescriptiveStatistics> entry : protein_length_stats_by_dc.entrySet() ) {
261 // System.out.print( entry.getKey().toString() );
262 // System.out.print( ": " );
263 // double[] a = entry.getValue().getDataAsDoubleArray();
264 // for( int i = 0; i < a.length; i++ ) {
265 // System.out.print( a[ i ] + " " );
267 // System.out.println();
270 // if ( domain_number_stats_by_dc != null ) {
271 // for( final Entry<?, DescriptiveStatistics> entry : domain_number_stats_by_dc.entrySet() ) {
272 // System.out.print( entry.getKey().toString() );
273 // System.out.print( ": " );
274 // double[] a = entry.getValue().getDataAsDoubleArray();
275 // for( int i = 0; i < a.length; i++ ) {
276 // System.out.print( a[ i ] + " " );
278 // System.out.println();
282 final BufferedWriter out_counts = new BufferedWriter( new FileWriter( outfilename_for_counts ) );
283 final BufferedWriter out_dc = new BufferedWriter( new FileWriter( outfilename_for_dc ) );
284 final BufferedWriter out_dc_for_go_mapping = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping ) );
285 final BufferedWriter out_dc_for_go_mapping_unique = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping_unique ) );
286 final SortedMap<String, Integer> dc_gain_counts = new TreeMap<String, Integer>();
287 for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorPostorder(); it.hasNext(); ) {
288 final PhylogenyNode n = it.next();
289 final Set<String> gained_dc = n.getNodeData().getBinaryCharacters().getGainedCharacters();
290 for( final String dc : gained_dc ) {
291 if ( dc_gain_counts.containsKey( dc ) ) {
292 dc_gain_counts.put( dc, dc_gain_counts.get( dc ) + 1 );
295 dc_gain_counts.put( dc, 1 );
299 final SortedMap<Integer, Integer> histogram = new TreeMap<Integer, Integer>();
300 final SortedMap<Integer, StringBuilder> domain_lists = new TreeMap<Integer, StringBuilder>();
301 final SortedMap<Integer, DescriptiveStatistics> dc_reapp_counts_to_protein_length_stats = new TreeMap<Integer, DescriptiveStatistics>();
302 final SortedMap<Integer, DescriptiveStatistics> dc_reapp_counts_to_domain_number_stats = new TreeMap<Integer, DescriptiveStatistics>();
303 final SortedMap<Integer, DescriptiveStatistics> dc_reapp_counts_to_domain_lengths_stats = new TreeMap<Integer, DescriptiveStatistics>();
304 final SortedMap<Integer, PriorityQueue<String>> domain_lists_go = new TreeMap<Integer, PriorityQueue<String>>();
305 final SortedMap<Integer, SortedSet<String>> domain_lists_go_unique = new TreeMap<Integer, SortedSet<String>>();
306 final Set<String> dcs = dc_gain_counts.keySet();
307 final SortedSet<String> more_than_once = new TreeSet<String>();
308 final DescriptiveStatistics gained_once_lengths_stats = new BasicDescriptiveStatistics();
309 final DescriptiveStatistics gained_once_domain_count_stats = new BasicDescriptiveStatistics();
310 final DescriptiveStatistics gained_multiple_times_lengths_stats = new BasicDescriptiveStatistics();
311 final DescriptiveStatistics gained_multiple_times_domain_count_stats = new BasicDescriptiveStatistics();
312 long gained_multiple_times_domain_length_sum = 0;
313 long gained_once_domain_length_sum = 0;
314 long gained_multiple_times_domain_length_count = 0;
315 long gained_once_domain_length_count = 0;
316 for( final String dc : dcs ) {
317 final int count = dc_gain_counts.get( dc );
318 if ( histogram.containsKey( count ) ) {
319 histogram.put( count, histogram.get( count ) + 1 );
320 domain_lists.get( count ).append( ", " + dc );
321 domain_lists_go.get( count ).addAll( splitDomainCombination( dc ) );
322 domain_lists_go_unique.get( count ).addAll( splitDomainCombination( dc ) );
325 histogram.put( count, 1 );
326 domain_lists.put( count, new StringBuilder( dc ) );
327 final PriorityQueue<String> q = new PriorityQueue<String>();
328 q.addAll( splitDomainCombination( dc ) );
329 domain_lists_go.put( count, q );
330 final SortedSet<String> set = new TreeSet<String>();
331 set.addAll( splitDomainCombination( dc ) );
332 domain_lists_go_unique.put( count, set );
334 if ( protein_length_stats_by_dc != null ) {
335 if ( !dc_reapp_counts_to_protein_length_stats.containsKey( count ) ) {
336 dc_reapp_counts_to_protein_length_stats.put( count, new BasicDescriptiveStatistics() );
338 dc_reapp_counts_to_protein_length_stats.get( count ).addValue( protein_length_stats_by_dc.get( dc )
341 if ( domain_number_stats_by_dc != null ) {
342 if ( !dc_reapp_counts_to_domain_number_stats.containsKey( count ) ) {
343 dc_reapp_counts_to_domain_number_stats.put( count, new BasicDescriptiveStatistics() );
345 dc_reapp_counts_to_domain_number_stats.get( count ).addValue( domain_number_stats_by_dc.get( dc )
348 if ( domain_length_stats_by_domain != null ) {
349 if ( !dc_reapp_counts_to_domain_lengths_stats.containsKey( count ) ) {
350 dc_reapp_counts_to_domain_lengths_stats.put( count, new BasicDescriptiveStatistics() );
352 final String[] ds = dc.split( "=" );
353 dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain
354 .get( ds[ 0 ] ).arithmeticMean() );
355 dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain
356 .get( ds[ 1 ] ).arithmeticMean() );
359 more_than_once.add( dc );
360 if ( protein_length_stats_by_dc != null ) {
361 final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc );
362 for( final double element : s.getData() ) {
363 gained_multiple_times_lengths_stats.addValue( element );
366 if ( domain_number_stats_by_dc != null ) {
367 final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc );
368 for( final double element : s.getData() ) {
369 gained_multiple_times_domain_count_stats.addValue( element );
372 if ( domain_length_stats_by_domain != null ) {
373 final String[] ds = dc.split( "=" );
374 final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] );
375 final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] );
376 for( final double element : s0.getData() ) {
377 gained_multiple_times_domain_length_sum += element;
378 ++gained_multiple_times_domain_length_count;
380 for( final double element : s1.getData() ) {
381 gained_multiple_times_domain_length_sum += element;
382 ++gained_multiple_times_domain_length_count;
387 if ( protein_length_stats_by_dc != null ) {
388 final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc );
389 for( final double element : s.getData() ) {
390 gained_once_lengths_stats.addValue( element );
393 if ( domain_number_stats_by_dc != null ) {
394 final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc );
395 for( final double element : s.getData() ) {
396 gained_once_domain_count_stats.addValue( element );
399 if ( domain_length_stats_by_domain != null ) {
400 final String[] ds = dc.split( "=" );
401 final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] );
402 final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] );
403 for( final double element : s0.getData() ) {
404 gained_once_domain_length_sum += element;
405 ++gained_once_domain_length_count;
407 for( final double element : s1.getData() ) {
408 gained_once_domain_length_sum += element;
409 ++gained_once_domain_length_count;
414 final Set<Integer> histogram_keys = histogram.keySet();
415 for( final Integer histogram_key : histogram_keys ) {
416 final int count = histogram.get( histogram_key );
417 final StringBuilder dc = domain_lists.get( histogram_key );
418 out_counts.write( histogram_key + "\t" + count + ForesterUtil.LINE_SEPARATOR );
419 out_dc.write( histogram_key + "\t" + dc + ForesterUtil.LINE_SEPARATOR );
420 out_dc_for_go_mapping.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR );
421 final Object[] sorted = domain_lists_go.get( histogram_key ).toArray();
422 Arrays.sort( sorted );
423 for( final Object domain : sorted ) {
424 out_dc_for_go_mapping.write( domain + ForesterUtil.LINE_SEPARATOR );
426 out_dc_for_go_mapping_unique.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR );
427 for( final String domain : domain_lists_go_unique.get( histogram_key ) ) {
428 out_dc_for_go_mapping_unique.write( domain + ForesterUtil.LINE_SEPARATOR );
433 out_dc_for_go_mapping.close();
434 out_dc_for_go_mapping_unique.close();
435 final SortedMap<String, Integer> lca_rank_counts = new TreeMap<String, Integer>();
436 final SortedMap<String, Integer> lca_ancestor_species_counts = new TreeMap<String, Integer>();
437 for( final String dc : more_than_once ) {
438 final List<PhylogenyNode> nodes = new ArrayList<PhylogenyNode>();
439 for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorExternalForward(); it.hasNext(); ) {
440 final PhylogenyNode n = it.next();
441 if ( n.getNodeData().getBinaryCharacters().getGainedCharacters().contains( dc ) ) {
445 for( int i = 0; i < ( nodes.size() - 1 ); ++i ) {
446 for( int j = i + 1; j < nodes.size(); ++j ) {
447 final PhylogenyNode lca = PhylogenyMethods.calculateLCA( nodes.get( i ), nodes.get( j ) );
448 String rank = "unknown";
449 if ( lca.getNodeData().isHasTaxonomy()
450 && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getRank() ) ) {
451 rank = lca.getNodeData().getTaxonomy().getRank();
453 addToCountMap( lca_rank_counts, rank );
455 if ( lca.getNodeData().isHasTaxonomy()
456 && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getScientificName() ) ) {
457 lca_species = lca.getNodeData().getTaxonomy().getScientificName();
459 else if ( lca.getNodeData().isHasTaxonomy()
460 && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getCommonName() ) ) {
461 lca_species = lca.getNodeData().getTaxonomy().getCommonName();
464 lca_species = lca.getName();
466 addToCountMap( lca_ancestor_species_counts, lca_species );
470 final BufferedWriter out_for_rank_counts = new BufferedWriter( new FileWriter( outfilename_for_rank_counts ) );
471 final BufferedWriter out_for_ancestor_species_counts = new BufferedWriter( new FileWriter( outfilename_for_ancestor_species_counts ) );
472 ForesterUtil.map2writer( out_for_rank_counts, lca_rank_counts, "\t", ForesterUtil.LINE_SEPARATOR );
473 ForesterUtil.map2writer( out_for_ancestor_species_counts,
474 lca_ancestor_species_counts,
476 ForesterUtil.LINE_SEPARATOR );
477 out_for_rank_counts.close();
478 out_for_ancestor_species_counts.close();
479 if ( !ForesterUtil.isEmpty( outfilename_for_protein_stats )
480 && ( ( domain_length_stats_by_domain != null ) || ( protein_length_stats_by_dc != null ) || ( domain_number_stats_by_dc != null ) ) ) {
481 final BufferedWriter w = new BufferedWriter( new FileWriter( outfilename_for_protein_stats ) );
482 w.write( "Domain Lengths: " );
484 if ( domain_length_stats_by_domain != null ) {
485 for( final Entry<Integer, DescriptiveStatistics> entry : dc_reapp_counts_to_domain_lengths_stats
487 w.write( entry.getKey().toString() );
488 w.write( "\t" + entry.getValue().arithmeticMean() );
489 w.write( "\t" + entry.getValue().median() );
496 w.write( "Protein Lengths: " );
498 if ( protein_length_stats_by_dc != null ) {
499 for( final Entry<Integer, DescriptiveStatistics> entry : dc_reapp_counts_to_protein_length_stats
501 w.write( entry.getKey().toString() );
502 w.write( "\t" + entry.getValue().arithmeticMean() );
503 w.write( "\t" + entry.getValue().median() );
510 w.write( "Number of domains: " );
512 if ( domain_number_stats_by_dc != null ) {
513 for( final Entry<Integer, DescriptiveStatistics> entry : dc_reapp_counts_to_domain_number_stats
515 w.write( entry.getKey().toString() );
516 w.write( "\t" + entry.getValue().arithmeticMean() );
517 w.write( "\t" + entry.getValue().median() );
524 w.write( "Gained once, domain lengths:" );
526 w.write( "N: " + gained_once_domain_length_count );
528 w.write( "Avg: " + ( ( double ) gained_once_domain_length_sum / gained_once_domain_length_count ) );
531 w.write( "Gained multiple times, domain lengths:" );
533 w.write( "N: " + gained_multiple_times_domain_length_count );
536 + ( ( double ) gained_multiple_times_domain_length_sum / gained_multiple_times_domain_length_count ) );
541 w.write( "Gained once, protein lengths:" );
543 w.write( gained_once_lengths_stats.toString() );
546 w.write( "Gained once, domain counts:" );
548 w.write( gained_once_domain_count_stats.toString() );
551 w.write( "Gained multiple times, protein lengths:" );
553 w.write( gained_multiple_times_lengths_stats.toString() );
556 w.write( "Gained multiple times, domain counts:" );
558 w.write( gained_multiple_times_domain_count_stats.toString() );
563 catch ( final IOException e ) {
564 ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e );
566 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch counts to ["
567 + outfilename_for_counts + "]" );
568 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch lists to ["
569 + outfilename_for_dc + "]" );
570 ForesterUtil.programMessage( surfacing.PRG_NAME,
571 "Wrote independent domain combination gains fitch lists to (for GO mapping) ["
572 + outfilename_for_dc_for_go_mapping + "]" );
573 ForesterUtil.programMessage( surfacing.PRG_NAME,
574 "Wrote independent domain combination gains fitch lists to (for GO mapping, unique) ["
575 + outfilename_for_dc_for_go_mapping_unique + "]" );
578 private final static void addToCountMap( final Map<String, Integer> map, final String s ) {
579 if ( map.containsKey( s ) ) {
580 map.put( s, map.get( s ) + 1 );
587 public static int calculateOverlap( final Domain domain, final List<Boolean> covered_positions ) {
588 int overlap_count = 0;
589 for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) {
590 if ( ( i < covered_positions.size() ) && ( covered_positions.get( i ) == true ) ) {
594 return overlap_count;
597 public static void checkForOutputFileWriteability( final File outfile ) {
598 final String error = ForesterUtil.isWritableFile( outfile );
599 if ( !ForesterUtil.isEmpty( error ) ) {
600 ForesterUtil.fatalError( surfacing.PRG_NAME, error );
604 private static SortedSet<String> collectAllDomainsChangedOnSubtree( final PhylogenyNode subtree_root,
605 final boolean get_gains ) {
606 final SortedSet<String> domains = new TreeSet<String>();
607 for( final PhylogenyNode descendant : PhylogenyMethods.getAllDescendants( subtree_root ) ) {
608 final BinaryCharacters chars = descendant.getNodeData().getBinaryCharacters();
610 domains.addAll( chars.getGainedCharacters() );
613 domains.addAll( chars.getLostCharacters() );
619 public static void collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( final CharacterStateMatrix<CharacterStateMatrix.GainLossStates> matrix,
620 final BinaryDomainCombination.DomainCombinationType dc_type,
621 final List<BinaryDomainCombination> all_binary_domains_combination_gained,
622 final boolean get_gains ) {
623 final SortedSet<String> sorted_ids = new TreeSet<String>();
624 for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) {
625 sorted_ids.add( matrix.getIdentifier( i ) );
627 for( final String id : sorted_ids ) {
628 for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) {
629 if ( ( get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) )
630 || ( !get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.LOSS ) ) ) {
631 if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT ) {
632 all_binary_domains_combination_gained.add( AdjactantDirectedBinaryDomainCombination
633 .createInstance( matrix.getCharacter( c ) ) );
635 else if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED ) {
636 all_binary_domains_combination_gained.add( DirectedBinaryDomainCombination
637 .createInstance( matrix.getCharacter( c ) ) );
640 all_binary_domains_combination_gained.add( BasicBinaryDomainCombination.createInstance( matrix
641 .getCharacter( c ) ) );
648 private static File createBaseDirForPerNodeDomainFiles( final String base_dir,
649 final boolean domain_combinations,
650 final CharacterStateMatrix.GainLossStates state,
651 final String outfile ) {
652 File per_node_go_mapped_domain_gain_loss_files_base_dir = new File( new File( outfile ).getParent()
653 + ForesterUtil.FILE_SEPARATOR + base_dir );
654 if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) {
655 per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir();
657 if ( domain_combinations ) {
658 per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
659 + ForesterUtil.FILE_SEPARATOR + "DC" );
662 per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
663 + ForesterUtil.FILE_SEPARATOR + "DOMAINS" );
665 if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) {
666 per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir();
668 if ( state == GainLossStates.GAIN ) {
669 per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
670 + ForesterUtil.FILE_SEPARATOR + "GAINS" );
672 else if ( state == GainLossStates.LOSS ) {
673 per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
674 + ForesterUtil.FILE_SEPARATOR + "LOSSES" );
677 per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
678 + ForesterUtil.FILE_SEPARATOR + "PRESENT" );
680 if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) {
681 per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir();
683 return per_node_go_mapped_domain_gain_loss_files_base_dir;
686 public static Map<DomainId, List<GoId>> createDomainIdToGoIdMap( final List<PfamToGoMapping> pfam_to_go_mappings ) {
687 final Map<DomainId, List<GoId>> domain_id_to_go_ids_map = new HashMap<DomainId, List<GoId>>( pfam_to_go_mappings
689 for( final PfamToGoMapping pfam_to_go : pfam_to_go_mappings ) {
690 if ( !domain_id_to_go_ids_map.containsKey( pfam_to_go.getKey() ) ) {
691 domain_id_to_go_ids_map.put( pfam_to_go.getKey(), new ArrayList<GoId>() );
693 domain_id_to_go_ids_map.get( pfam_to_go.getKey() ).add( pfam_to_go.getValue() );
695 return domain_id_to_go_ids_map;
698 public static Map<DomainId, Set<String>> createDomainIdToSecondaryFeaturesMap( final File secondary_features_map_file )
700 final BasicTable<String> primary_table = BasicTableParser.parse( secondary_features_map_file, '\t' );
701 final Map<DomainId, Set<String>> map = new TreeMap<DomainId, Set<String>>();
702 for( int r = 0; r < primary_table.getNumberOfRows(); ++r ) {
703 final DomainId domain_id = new DomainId( primary_table.getValue( 0, r ) );
704 if ( !map.containsKey( domain_id ) ) {
705 map.put( domain_id, new HashSet<String>() );
707 map.get( domain_id ).add( primary_table.getValue( 1, r ) );
712 public static Phylogeny createNjTreeBasedOnMatrixToFile( final File nj_tree_outfile, final DistanceMatrix distance ) {
713 checkForOutputFileWriteability( nj_tree_outfile );
714 final NeighborJoining nj = NeighborJoining.createInstance();
715 final Phylogeny phylogeny = nj.execute( ( BasicSymmetricalDistanceMatrix ) distance );
716 phylogeny.setName( nj_tree_outfile.getName() );
717 writePhylogenyToFile( phylogeny, nj_tree_outfile.toString() );
721 private static SortedSet<BinaryDomainCombination> createSetOfAllBinaryDomainCombinationsPerGenome( final GenomeWideCombinableDomains gwcd ) {
722 final SortedMap<DomainId, CombinableDomains> cds = gwcd.getAllCombinableDomainsIds();
723 final SortedSet<BinaryDomainCombination> binary_combinations = new TreeSet<BinaryDomainCombination>();
724 for( final DomainId domain_id : cds.keySet() ) {
725 final CombinableDomains cd = cds.get( domain_id );
726 binary_combinations.addAll( cd.toBinaryDomainCombinations() );
728 return binary_combinations;
731 public static void decoratePrintableDomainSimilarities( final SortedSet<DomainSimilarity> domain_similarities,
732 final Detailedness detailedness,
733 final GoAnnotationOutput go_annotation_output,
734 final Map<GoId, GoTerm> go_id_to_term_map,
735 final GoNameSpace go_namespace_limit ) {
736 if ( ( go_namespace_limit != null ) && ( ( go_id_to_term_map == null ) || go_id_to_term_map.isEmpty() ) ) {
737 throw new IllegalArgumentException( "attempt to use a GO namespace limit without a GO id to term map" );
739 for( final DomainSimilarity domain_similarity : domain_similarities ) {
740 if ( domain_similarity instanceof PrintableDomainSimilarity ) {
741 final PrintableDomainSimilarity printable_domain_similarity = ( PrintableDomainSimilarity ) domain_similarity;
742 printable_domain_similarity.setDetailedness( detailedness );
743 printable_domain_similarity.setGoAnnotationOutput( go_annotation_output );
744 printable_domain_similarity.setGoIdToTermMap( go_id_to_term_map );
745 printable_domain_similarity.setGoNamespaceLimit( go_namespace_limit );
750 public static void executeDomainLengthAnalysis( final String[][] input_file_properties,
751 final int number_of_genomes,
752 final DomainLengthsTable domain_lengths_table,
753 final File outfile ) throws IOException {
754 final DecimalFormat df = new DecimalFormat( "#.00" );
755 checkForOutputFileWriteability( outfile );
756 final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
757 out.write( "MEAN BASED STATISTICS PER SPECIES" );
758 out.write( ForesterUtil.LINE_SEPARATOR );
759 out.write( domain_lengths_table.createMeanBasedStatisticsPerSpeciesTable().toString() );
760 out.write( ForesterUtil.LINE_SEPARATOR );
761 out.write( ForesterUtil.LINE_SEPARATOR );
762 final List<DomainLengths> domain_lengths_list = domain_lengths_table.getDomainLengthsList();
763 out.write( "OUTLIER SPECIES PER DOMAIN (Z>=1.5)" );
764 out.write( ForesterUtil.LINE_SEPARATOR );
765 for( final DomainLengths domain_lengths : domain_lengths_list ) {
766 final List<Species> species_list = domain_lengths.getMeanBasedOutlierSpecies( 1.5 );
767 if ( species_list.size() > 0 ) {
768 out.write( domain_lengths.getDomainId() + "\t" );
769 for( final Species species : species_list ) {
770 out.write( species + "\t" );
772 out.write( ForesterUtil.LINE_SEPARATOR );
773 // DescriptiveStatistics stats_for_domain = domain_lengths
774 // .calculateMeanBasedStatistics();
775 //AsciiHistogram histo = new AsciiHistogram( stats_for_domain );
776 //System.out.println( histo.toStringBuffer( 40, '=', 60, 4 ).toString() );
779 out.write( ForesterUtil.LINE_SEPARATOR );
780 out.write( ForesterUtil.LINE_SEPARATOR );
781 out.write( "OUTLIER SPECIES (Z 1.0)" );
782 out.write( ForesterUtil.LINE_SEPARATOR );
783 final DescriptiveStatistics stats_for_all_species = domain_lengths_table
784 .calculateMeanBasedStatisticsForAllSpecies();
785 out.write( stats_for_all_species.asSummary() );
786 out.write( ForesterUtil.LINE_SEPARATOR );
787 final AsciiHistogram histo = new AsciiHistogram( stats_for_all_species );
788 out.write( histo.toStringBuffer( 40, '=', 60, 4 ).toString() );
789 out.write( ForesterUtil.LINE_SEPARATOR );
790 final double population_sd = stats_for_all_species.sampleStandardDeviation();
791 final double population_mean = stats_for_all_species.arithmeticMean();
792 for( final Species species : domain_lengths_table.getSpecies() ) {
793 final double x = domain_lengths_table.calculateMeanBasedStatisticsForSpecies( species ).arithmeticMean();
794 final double z = ( x - population_mean ) / population_sd;
795 out.write( species + "\t" + z );
796 out.write( ForesterUtil.LINE_SEPARATOR );
798 out.write( ForesterUtil.LINE_SEPARATOR );
799 for( final Species species : domain_lengths_table.getSpecies() ) {
800 final DescriptiveStatistics stats_for_species = domain_lengths_table
801 .calculateMeanBasedStatisticsForSpecies( species );
802 final double x = stats_for_species.arithmeticMean();
803 final double z = ( x - population_mean ) / population_sd;
804 if ( ( z <= -1.0 ) || ( z >= 1.0 ) ) {
805 out.write( species + "\t" + df.format( z ) + "\t" + stats_for_species.asSummary() );
806 out.write( ForesterUtil.LINE_SEPARATOR );
810 // final List<HistogramData> histogram_datas = new ArrayList<HistogramData>();
811 // for( int i = 0; i < number_of_genomes; ++i ) {
812 // final Species species = new BasicSpecies( input_file_properties[ i ][ 0 ] );
814 // .add( new HistogramData( species.toString(), domain_lengths_table
815 // .calculateMeanBasedStatisticsForSpecies( species )
816 // .getDataAsDoubleArray(), 5, 600, null, 60 ) );
818 // final HistogramsFrame hf = new HistogramsFrame( histogram_datas );
819 // hf.setVisible( true );
825 * @param all_binary_domains_combination_lost_fitch
826 * @param consider_directedness_and_adjacency_for_bin_combinations
827 * @param all_binary_domains_combination_gained if null ignored, otherwise this is to list all binary domain combinations
828 * which were gained under unweighted (Fitch) parsimony.
830 public static void executeParsimonyAnalysis( final long random_number_seed_for_fitch_parsimony,
831 final boolean radomize_fitch_parsimony,
832 final String outfile_name,
833 final DomainParsimonyCalculator domain_parsimony,
834 final Phylogeny phylogeny,
835 final Map<DomainId, List<GoId>> domain_id_to_go_ids_map,
836 final Map<GoId, GoTerm> go_id_to_term_map,
837 final GoNameSpace go_namespace_limit,
838 final String parameters_str,
839 final Map<DomainId, Set<String>>[] domain_id_to_secondary_features_maps,
840 final SortedSet<DomainId> positive_filter,
841 final boolean output_binary_domain_combinations_for_graphs,
842 final List<BinaryDomainCombination> all_binary_domains_combination_gained_fitch,
843 final List<BinaryDomainCombination> all_binary_domains_combination_lost_fitch,
844 final BinaryDomainCombination.DomainCombinationType dc_type,
845 final Map<String, DescriptiveStatistics> protein_length_stats_by_dc,
846 final Map<String, DescriptiveStatistics> domain_number_stats_by_dc,
847 final Map<String, DescriptiveStatistics> domain_length_stats_by_domain ) {
848 final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR;
849 final String date_time = ForesterUtil.getCurrentDateTime();
850 final SortedSet<String> all_pfams_encountered = new TreeSet<String>();
851 final SortedSet<String> all_pfams_gained_as_domains = new TreeSet<String>();
852 final SortedSet<String> all_pfams_lost_as_domains = new TreeSet<String>();
853 final SortedSet<String> all_pfams_gained_as_dom_combinations = new TreeSet<String>();
854 final SortedSet<String> all_pfams_lost_as_dom_combinations = new TreeSet<String>();
855 writeToNexus( outfile_name, domain_parsimony, phylogeny );
858 Phylogeny local_phylogeny_l = phylogeny.copy();
859 if ( ( positive_filter != null ) && ( positive_filter.size() > 0 ) ) {
860 domain_parsimony.executeDolloParsimonyOnDomainPresence( positive_filter );
863 domain_parsimony.executeDolloParsimonyOnDomainPresence();
865 SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossMatrix(), outfile_name
866 + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_DOMAINS, Format.FORESTER );
867 SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossCountsMatrix(), outfile_name
868 + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_DOMAINS, Format.FORESTER );
869 SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
870 CharacterStateMatrix.GainLossStates.GAIN,
871 outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_D,
873 ForesterUtil.LINE_SEPARATOR,
875 SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
876 CharacterStateMatrix.GainLossStates.LOSS,
877 outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_LOSSES_D,
879 ForesterUtil.LINE_SEPARATOR,
881 SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), null, outfile_name
882 + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_D, sep, ForesterUtil.LINE_SEPARATOR, null );
884 writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
888 domain_parsimony.getGainLossMatrix(),
889 CharacterStateMatrix.GainLossStates.GAIN,
890 outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_HTML_D,
892 ForesterUtil.LINE_SEPARATOR,
893 "Dollo Parsimony | Gains | Domains",
895 domain_id_to_secondary_features_maps,
896 all_pfams_encountered,
897 all_pfams_gained_as_domains,
899 writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
903 domain_parsimony.getGainLossMatrix(),
904 CharacterStateMatrix.GainLossStates.LOSS,
905 outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_LOSSES_HTML_D,
907 ForesterUtil.LINE_SEPARATOR,
908 "Dollo Parsimony | Losses | Domains",
910 domain_id_to_secondary_features_maps,
911 all_pfams_encountered,
912 all_pfams_lost_as_domains,
914 writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
918 domain_parsimony.getGainLossMatrix(),
920 outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_HTML_D,
922 ForesterUtil.LINE_SEPARATOR,
923 "Dollo Parsimony | Present | Domains",
925 domain_id_to_secondary_features_maps,
926 all_pfams_encountered,
928 "_dollo_present_d" );
929 preparePhylogeny( local_phylogeny_l,
932 "Dollo parsimony on domain presence/absence",
933 "dollo_on_domains_" + outfile_name,
935 SurfacingUtil.writePhylogenyToFile( local_phylogeny_l, outfile_name
936 + surfacing.DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO );
938 writeAllDomainsChangedOnAllSubtrees( local_phylogeny_l, true, outfile_name, "_dollo_all_gains_d" );
939 writeAllDomainsChangedOnAllSubtrees( local_phylogeny_l, false, outfile_name, "_dollo_all_losses_d" );
941 catch ( final IOException e ) {
943 ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
945 if ( domain_parsimony.calculateNumberOfBinaryDomainCombination() > 0 ) {
946 // FITCH DOMAIN COMBINATIONS
947 // -------------------------
948 local_phylogeny_l = phylogeny.copy();
949 String randomization = "no";
950 if ( radomize_fitch_parsimony ) {
951 domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( random_number_seed_for_fitch_parsimony );
952 randomization = "yes, seed = " + random_number_seed_for_fitch_parsimony;
955 domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( USE_LAST );
957 SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossMatrix(), outfile_name
958 + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER );
959 SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossCountsMatrix(), outfile_name
960 + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER );
962 .writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
963 CharacterStateMatrix.GainLossStates.GAIN,
964 outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_GAINS_BC,
966 ForesterUtil.LINE_SEPARATOR,
968 SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
969 CharacterStateMatrix.GainLossStates.LOSS,
971 + surfacing.PARSIMONY_OUTPUT_FITCH_LOSSES_BC,
973 ForesterUtil.LINE_SEPARATOR,
975 SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), null, outfile_name
976 + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_BC, sep, ForesterUtil.LINE_SEPARATOR, null );
977 if ( all_binary_domains_combination_gained_fitch != null ) {
978 collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
980 all_binary_domains_combination_gained_fitch,
983 if ( all_binary_domains_combination_lost_fitch != null ) {
984 collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
986 all_binary_domains_combination_lost_fitch,
989 if ( output_binary_domain_combinations_for_graphs ) {
991 .writeBinaryStatesMatrixAsListToFileForBinaryCombinationsForGraphAnalysis( domain_parsimony
992 .getGainLossMatrix(),
995 + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_BC_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS,
997 ForesterUtil.LINE_SEPARATOR,
998 BinaryDomainCombination.OutputFormat.DOT );
1001 writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
1005 domain_parsimony.getGainLossMatrix(),
1006 CharacterStateMatrix.GainLossStates.GAIN,
1007 outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_GAINS_HTML_BC,
1009 ForesterUtil.LINE_SEPARATOR,
1010 "Fitch Parsimony | Gains | Domain Combinations",
1013 all_pfams_encountered,
1014 all_pfams_gained_as_dom_combinations,
1015 "_fitch_gains_dc" );
1016 writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
1020 domain_parsimony.getGainLossMatrix(),
1021 CharacterStateMatrix.GainLossStates.LOSS,
1022 outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_LOSSES_HTML_BC,
1024 ForesterUtil.LINE_SEPARATOR,
1025 "Fitch Parsimony | Losses | Domain Combinations",
1028 all_pfams_encountered,
1029 all_pfams_lost_as_dom_combinations,
1030 "_fitch_losses_dc" );
1031 writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
1035 domain_parsimony.getGainLossMatrix(),
1037 outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_HTML_BC,
1039 ForesterUtil.LINE_SEPARATOR,
1040 "Fitch Parsimony | Present | Domain Combinations",
1043 all_pfams_encountered,
1045 "_fitch_present_dc" );
1046 writeAllEncounteredPfamsToFile( domain_id_to_go_ids_map,
1049 all_pfams_encountered );
1050 writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_GAINED_AS_DOMAINS_SUFFIX, all_pfams_gained_as_domains );
1051 writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_LOST_AS_DOMAINS_SUFFIX, all_pfams_lost_as_domains );
1052 writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_GAINED_AS_DC_SUFFIX,
1053 all_pfams_gained_as_dom_combinations );
1054 writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_LOST_AS_DC_SUFFIX, all_pfams_lost_as_dom_combinations );
1055 preparePhylogeny( local_phylogeny_l,
1058 "Fitch parsimony on binary domain combination presence/absence randomization: "
1060 "fitch_on_binary_domain_combinations_" + outfile_name,
1062 SurfacingUtil.writePhylogenyToFile( local_phylogeny_l, outfile_name
1063 + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH );
1064 calculateIndependentDomainCombinationGains( local_phylogeny_l,
1066 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX,
1068 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX,
1070 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX,
1072 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX,
1073 outfile_name + "_indep_dc_gains_fitch_lca_ranks.txt",
1074 outfile_name + "_indep_dc_gains_fitch_lca_taxonomies.txt",
1075 outfile_name + "_indep_dc_gains_fitch_protein_statistics.txt",
1076 protein_length_stats_by_dc,
1077 domain_number_stats_by_dc,
1078 domain_length_stats_by_domain );
1082 public static void executeParsimonyAnalysisForSecondaryFeatures( final String outfile_name,
1083 final DomainParsimonyCalculator secondary_features_parsimony,
1084 final Phylogeny phylogeny,
1085 final String parameters_str,
1086 final Map<Species, MappingResults> mapping_results_map ) {
1087 final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR;
1088 final String date_time = ForesterUtil.getCurrentDateTime();
1089 System.out.println();
1090 writeToNexus( outfile_name + surfacing.NEXUS_SECONDARY_FEATURES,
1091 secondary_features_parsimony.createMatrixOfSecondaryFeaturePresenceOrAbsence( null ),
1093 Phylogeny local_phylogeny_copy = phylogeny.copy();
1094 secondary_features_parsimony.executeDolloParsimonyOnSecondaryFeatures( mapping_results_map );
1095 SurfacingUtil.writeMatrixToFile( secondary_features_parsimony.getGainLossMatrix(), outfile_name
1096 + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER );
1097 SurfacingUtil.writeMatrixToFile( secondary_features_parsimony.getGainLossCountsMatrix(), outfile_name
1098 + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER );
1100 .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(),
1101 CharacterStateMatrix.GainLossStates.GAIN,
1103 + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_SECONDARY_FEATURES,
1105 ForesterUtil.LINE_SEPARATOR,
1108 .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(),
1109 CharacterStateMatrix.GainLossStates.LOSS,
1111 + surfacing.PARSIMONY_OUTPUT_DOLLO_LOSSES_SECONDARY_FEATURES,
1113 ForesterUtil.LINE_SEPARATOR,
1116 .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(),
1119 + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_SECONDARY_FEATURES,
1121 ForesterUtil.LINE_SEPARATOR,
1123 preparePhylogeny( local_phylogeny_copy,
1124 secondary_features_parsimony,
1126 "Dollo parsimony on secondary feature presence/absence",
1127 "dollo_on_secondary_features_" + outfile_name,
1129 SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name
1130 + surfacing.SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO );
1131 // FITCH DOMAIN COMBINATIONS
1132 // -------------------------
1133 local_phylogeny_copy = phylogeny.copy();
1134 final String randomization = "no";
1135 secondary_features_parsimony.executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( USE_LAST );
1136 preparePhylogeny( local_phylogeny_copy,
1137 secondary_features_parsimony,
1139 "Fitch parsimony on secondary binary domain combination presence/absence randomization: "
1141 "fitch_on_binary_domain_combinations_" + outfile_name,
1143 SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name
1144 + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH_MAPPED );
1145 calculateIndependentDomainCombinationGains( local_phylogeny_copy, outfile_name
1146 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_MAPPED_OUTPUT_SUFFIX, outfile_name
1147 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX, outfile_name
1148 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX, outfile_name
1149 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX, outfile_name
1150 + "_MAPPED_indep_dc_gains_fitch_lca_ranks.txt", outfile_name
1151 + "_MAPPED_indep_dc_gains_fitch_lca_taxonomies.txt", null, null, null, null );
1154 public static void doit( final List<Protein> proteins,
1155 final List<DomainId> query_domain_ids_nc_order,
1157 final String separator,
1158 final String limit_to_species,
1159 final Map<String, List<Integer>> average_protein_lengths_by_dc ) throws IOException {
1160 for( final Protein protein : proteins ) {
1161 if ( ForesterUtil.isEmpty( limit_to_species )
1162 || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) {
1163 if ( protein.contains( query_domain_ids_nc_order, true ) ) {
1164 out.write( protein.getSpecies().getSpeciesId() );
1165 out.write( separator );
1166 out.write( protein.getProteinId().getId() );
1167 out.write( separator );
1169 final Set<DomainId> visited_domain_ids = new HashSet<DomainId>();
1170 boolean first = true;
1171 for( final Domain domain : protein.getProteinDomains() ) {
1172 if ( !visited_domain_ids.contains( domain.getDomainId() ) ) {
1173 visited_domain_ids.add( domain.getDomainId() );
1180 out.write( domain.getDomainId().getId() );
1182 out.write( "" + domain.getTotalCount() );
1187 out.write( separator );
1188 if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription()
1189 .equals( SurfacingConstants.NONE ) ) ) {
1190 out.write( protein.getDescription() );
1192 out.write( separator );
1193 if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession()
1194 .equals( SurfacingConstants.NONE ) ) ) {
1195 out.write( protein.getAccession() );
1197 out.write( SurfacingConstants.NL );
1204 public static void extractProteinNames( final List<Protein> proteins,
1205 final List<DomainId> query_domain_ids_nc_order,
1207 final String separator,
1208 final String limit_to_species ) throws IOException {
1209 for( final Protein protein : proteins ) {
1210 if ( ForesterUtil.isEmpty( limit_to_species )
1211 || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) {
1212 if ( protein.contains( query_domain_ids_nc_order, true ) ) {
1213 out.write( protein.getSpecies().getSpeciesId() );
1214 out.write( separator );
1215 out.write( protein.getProteinId().getId() );
1216 out.write( separator );
1218 final Set<DomainId> visited_domain_ids = new HashSet<DomainId>();
1219 boolean first = true;
1220 for( final Domain domain : protein.getProteinDomains() ) {
1221 if ( !visited_domain_ids.contains( domain.getDomainId() ) ) {
1222 visited_domain_ids.add( domain.getDomainId() );
1229 out.write( domain.getDomainId().getId() );
1231 out.write( "" + domain.getTotalCount() );
1236 out.write( separator );
1237 if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription()
1238 .equals( SurfacingConstants.NONE ) ) ) {
1239 out.write( protein.getDescription() );
1241 out.write( separator );
1242 if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession()
1243 .equals( SurfacingConstants.NONE ) ) ) {
1244 out.write( protein.getAccession() );
1246 out.write( SurfacingConstants.NL );
1253 public static void extractProteinNames( final SortedMap<Species, List<Protein>> protein_lists_per_species,
1254 final DomainId domain_id,
1256 final String separator,
1257 final String limit_to_species,
1258 final double domain_e_cutoff ) throws IOException {
1259 System.out.println( "Per domain E-value: " + domain_e_cutoff );
1260 for( final Species species : protein_lists_per_species.keySet() ) {
1261 System.out.println( species + ":" );
1262 for( final Protein protein : protein_lists_per_species.get( species ) ) {
1263 if ( ForesterUtil.isEmpty( limit_to_species )
1264 || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) {
1265 final List<Domain> domains = protein.getProteinDomains( domain_id );
1266 if ( domains.size() > 0 ) {
1267 out.write( protein.getSpecies().getSpeciesId() );
1268 out.write( separator );
1269 out.write( protein.getProteinId().getId() );
1270 out.write( separator );
1271 out.write( domain_id.toString() );
1272 out.write( separator );
1274 for( final Domain domain : domains ) {
1275 if ( ( domain_e_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= domain_e_cutoff ) ) {
1277 out.write( domain.getFrom() + "-" + domain.getTo() );
1278 if ( prev_to >= 0 ) {
1279 final int l = domain.getFrom() - prev_to;
1280 System.out.println( l );
1282 prev_to = domain.getTo();
1286 out.write( separator );
1287 final List<Domain> domain_list = new ArrayList<Domain>();
1288 for( final Domain domain : protein.getProteinDomains() ) {
1289 if ( ( domain_e_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= domain_e_cutoff ) ) {
1290 domain_list.add( domain );
1293 final Domain domain_ary[] = new Domain[ domain_list.size() ];
1294 for( int i = 0; i < domain_list.size(); ++i ) {
1295 domain_ary[ i ] = domain_list.get( i );
1297 Arrays.sort( domain_ary, new DomainComparator( true ) );
1299 boolean first = true;
1300 for( final Domain domain : domain_ary ) {
1307 out.write( domain.getDomainId().toString() );
1308 out.write( ":" + domain.getFrom() + "-" + domain.getTo() );
1309 out.write( ":" + domain.getPerDomainEvalue() );
1312 if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription()
1313 .equals( SurfacingConstants.NONE ) ) ) {
1314 out.write( protein.getDescription() );
1316 out.write( separator );
1317 if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession()
1318 .equals( SurfacingConstants.NONE ) ) ) {
1319 out.write( protein.getAccession() );
1321 out.write( SurfacingConstants.NL );
1329 public static SortedSet<DomainId> getAllDomainIds( final List<GenomeWideCombinableDomains> gwcd_list ) {
1330 final SortedSet<DomainId> all_domains_ids = new TreeSet<DomainId>();
1331 for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
1332 final Set<DomainId> all_domains = gwcd.getAllDomainIds();
1333 // for( final Domain domain : all_domains ) {
1334 all_domains_ids.addAll( all_domains );
1337 return all_domains_ids;
1340 public static SortedMap<String, Integer> getDomainCounts( final List<Protein> protein_domain_collections ) {
1341 final SortedMap<String, Integer> map = new TreeMap<String, Integer>();
1342 for( final Protein protein_domain_collection : protein_domain_collections ) {
1343 for( final Object name : protein_domain_collection.getProteinDomains() ) {
1344 final BasicDomain protein_domain = ( BasicDomain ) name;
1345 final String id = protein_domain.getDomainId().getId();
1346 if ( map.containsKey( id ) ) {
1347 map.put( id, map.get( id ) + 1 );
1357 public static int getNumberOfNodesLackingName( final Phylogeny p, final StringBuilder names ) {
1358 final PhylogenyNodeIterator it = p.iteratorPostorder();
1360 while ( it.hasNext() ) {
1361 final PhylogenyNode n = it.next();
1362 if ( ForesterUtil.isEmpty( n.getName() )
1363 && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy()
1364 .getScientificName() ) )
1365 && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy()
1366 .getCommonName() ) ) ) {
1367 if ( n.getParent() != null ) {
1368 names.append( " " );
1369 names.append( n.getParent().getName() );
1371 final List l = n.getAllExternalDescendants();
1372 for( final Object object : l ) {
1373 System.out.println( l.toString() );
1382 * Returns true is Domain domain falls in an uninterrupted stretch of
1383 * covered positions.
1386 * @param covered_positions
1389 public static boolean isEngulfed( final Domain domain, final List<Boolean> covered_positions ) {
1390 for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) {
1391 if ( ( i >= covered_positions.size() ) || ( covered_positions.get( i ) != true ) ) {
1398 public static void preparePhylogeny( final Phylogeny p,
1399 final DomainParsimonyCalculator domain_parsimony,
1400 final String date_time,
1401 final String method,
1403 final String parameters_str ) {
1404 domain_parsimony.decoratePhylogenyWithDomains( p );
1405 final StringBuilder desc = new StringBuilder();
1406 desc.append( "[Method: " + method + "] [Date: " + date_time + "] " );
1407 desc.append( "[Cost: " + domain_parsimony.getCost() + "] " );
1408 desc.append( "[Gains: " + domain_parsimony.getTotalGains() + "] " );
1409 desc.append( "[Losses: " + domain_parsimony.getTotalLosses() + "] " );
1410 desc.append( "[Unchanged: " + domain_parsimony.getTotalUnchanged() + "] " );
1411 desc.append( "[Parameters: " + parameters_str + "]" );
1413 p.setDescription( desc.toString() );
1414 p.setConfidence( new Confidence( domain_parsimony.getCost(), "parsimony" ) );
1415 p.setRerootable( false );
1416 p.setRooted( true );
1420 * species | protein id | n-terminal domain | c-terminal domain | n-terminal domain per domain E-value | c-terminal domain per domain E-value
1424 static public StringBuffer proteinToDomainCombinations( final Protein protein,
1425 final String protein_id,
1426 final String separator ) {
1427 final StringBuffer sb = new StringBuffer();
1428 if ( protein.getSpecies() == null ) {
1429 throw new IllegalArgumentException( "species must not be null" );
1431 if ( ForesterUtil.isEmpty( protein.getSpecies().getSpeciesId() ) ) {
1432 throw new IllegalArgumentException( "species id must not be empty" );
1434 final List<Domain> domains = protein.getProteinDomains();
1435 if ( domains.size() > 1 ) {
1436 final Map<String, Integer> counts = new HashMap<String, Integer>();
1437 for( final Domain domain : domains ) {
1438 final String id = domain.getDomainId().getId();
1439 if ( counts.containsKey( id ) ) {
1440 counts.put( id, counts.get( id ) + 1 );
1443 counts.put( id, 1 );
1446 final Set<String> dcs = new HashSet<String>();
1447 for( int i = 1; i < domains.size(); ++i ) {
1448 for( int j = 0; j < i; ++j ) {
1449 Domain domain_n = domains.get( i );
1450 Domain domain_c = domains.get( j );
1451 if ( domain_n.getFrom() > domain_c.getFrom() ) {
1452 domain_n = domains.get( j );
1453 domain_c = domains.get( i );
1455 final String dc = domain_n.getDomainId().getId() + domain_c.getDomainId().getId();
1456 if ( !dcs.contains( dc ) ) {
1458 sb.append( protein.getSpecies() );
1459 sb.append( separator );
1460 sb.append( protein_id );
1461 sb.append( separator );
1462 sb.append( domain_n.getDomainId().getId() );
1463 sb.append( separator );
1464 sb.append( domain_c.getDomainId().getId() );
1465 sb.append( separator );
1466 sb.append( domain_n.getPerDomainEvalue() );
1467 sb.append( separator );
1468 sb.append( domain_c.getPerDomainEvalue() );
1469 sb.append( separator );
1470 sb.append( counts.get( domain_n.getDomainId().getId() ) );
1471 sb.append( separator );
1472 sb.append( counts.get( domain_c.getDomainId().getId() ) );
1473 sb.append( ForesterUtil.LINE_SEPARATOR );
1478 else if ( domains.size() == 1 ) {
1479 sb.append( protein.getSpecies() );
1480 sb.append( separator );
1481 sb.append( protein_id );
1482 sb.append( separator );
1483 sb.append( domains.get( 0 ).getDomainId().getId() );
1484 sb.append( separator );
1485 sb.append( separator );
1486 sb.append( domains.get( 0 ).getPerDomainEvalue() );
1487 sb.append( separator );
1488 sb.append( separator );
1490 sb.append( separator );
1491 sb.append( ForesterUtil.LINE_SEPARATOR );
1494 sb.append( protein.getSpecies() );
1495 sb.append( separator );
1496 sb.append( protein_id );
1497 sb.append( separator );
1498 sb.append( separator );
1499 sb.append( separator );
1500 sb.append( separator );
1501 sb.append( separator );
1502 sb.append( separator );
1503 sb.append( ForesterUtil.LINE_SEPARATOR );
1510 * Example regarding engulfment: ------------0.1 ----------0.2 --0.3 =>
1511 * domain with 0.3 is ignored
1513 * -----------0.1 ----------0.2 --0.3 => domain with 0.3 is ignored
1516 * ------------0.1 ----------0.3 --0.2 => domains with 0.3 and 0.2 are _not_
1519 * @param max_allowed_overlap
1520 * maximal allowed overlap (inclusive) to be still considered not
1521 * overlapping (zero or negative value to allow any overlap)
1522 * @param remove_engulfed_domains
1523 * to remove domains which are completely engulfed by coverage of
1524 * domains with better support
1528 public static Protein removeOverlappingDomains( final int max_allowed_overlap,
1529 final boolean remove_engulfed_domains,
1530 final Protein protein ) {
1531 final Protein pruned_protein = new BasicProtein( protein.getProteinId().getId(), protein.getSpecies()
1532 .getSpeciesId(), protein.getLength() );
1533 final List<Domain> sorted = SurfacingUtil.sortDomainsWithAscendingConfidenceValues( protein );
1534 final List<Boolean> covered_positions = new ArrayList<Boolean>();
1535 for( final Domain domain : sorted ) {
1536 if ( ( ( max_allowed_overlap < 0 ) || ( SurfacingUtil.calculateOverlap( domain, covered_positions ) <= max_allowed_overlap ) )
1537 && ( !remove_engulfed_domains || !isEngulfed( domain, covered_positions ) ) ) {
1538 final int covered_positions_size = covered_positions.size();
1539 for( int i = covered_positions_size; i < domain.getFrom(); ++i ) {
1540 covered_positions.add( false );
1542 final int new_covered_positions_size = covered_positions.size();
1543 for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) {
1544 if ( i < new_covered_positions_size ) {
1545 covered_positions.set( i, true );
1548 covered_positions.add( true );
1551 pruned_protein.addProteinDomain( domain );
1554 return pruned_protein;
1557 public static List<Domain> sortDomainsWithAscendingConfidenceValues( final Protein protein ) {
1558 final List<Domain> domains = new ArrayList<Domain>();
1559 for( final Domain d : protein.getProteinDomains() ) {
1562 Collections.sort( domains, SurfacingUtil.ASCENDING_CONFIDENCE_VALUE_ORDER );
1566 private static List<String> splitDomainCombination( final String dc ) {
1567 final String[] s = dc.split( "=" );
1568 if ( s.length != 2 ) {
1569 ForesterUtil.printErrorMessage( surfacing.PRG_NAME, "Stringyfied domain combination has illegal format: "
1573 final List<String> l = new ArrayList<String>( 2 );
1579 public static void writeAllDomainsChangedOnAllSubtrees( final Phylogeny p,
1580 final boolean get_gains,
1581 final String outdir,
1582 final String suffix_for_filename ) throws IOException {
1583 CharacterStateMatrix.GainLossStates state = CharacterStateMatrix.GainLossStates.GAIN;
1585 state = CharacterStateMatrix.GainLossStates.LOSS;
1587 final File base_dir = createBaseDirForPerNodeDomainFiles( surfacing.BASE_DIRECTORY_PER_SUBTREE_DOMAIN_GAIN_LOSS_FILES,
1591 for( final PhylogenyNodeIterator it = p.iteratorPostorder(); it.hasNext(); ) {
1592 final PhylogenyNode node = it.next();
1593 if ( !node.isExternal() ) {
1594 final SortedSet<String> domains = collectAllDomainsChangedOnSubtree( node, get_gains );
1595 if ( domains.size() > 0 ) {
1596 final Writer writer = ForesterUtil.createBufferedWriter( base_dir + ForesterUtil.FILE_SEPARATOR
1597 + node.getName() + suffix_for_filename );
1598 for( final String domain : domains ) {
1599 writer.write( domain );
1600 writer.write( ForesterUtil.LINE_SEPARATOR );
1608 private static void writeAllEncounteredPfamsToFile( final Map<DomainId, List<GoId>> domain_id_to_go_ids_map,
1609 final Map<GoId, GoTerm> go_id_to_term_map,
1610 final String outfile_name,
1611 final SortedSet<String> all_pfams_encountered ) {
1612 final File all_pfams_encountered_file = new File( outfile_name + surfacing.ALL_PFAMS_ENCOUNTERED_SUFFIX );
1613 final File all_pfams_encountered_with_go_annotation_file = new File( outfile_name
1614 + surfacing.ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX );
1615 final File encountered_pfams_summary_file = new File( outfile_name + surfacing.ENCOUNTERED_PFAMS_SUMMARY_SUFFIX );
1616 int biological_process_counter = 0;
1617 int cellular_component_counter = 0;
1618 int molecular_function_counter = 0;
1619 int pfams_with_mappings_counter = 0;
1620 int pfams_without_mappings_counter = 0;
1621 int pfams_without_mappings_to_bp_or_mf_counter = 0;
1622 int pfams_with_mappings_to_bp_or_mf_counter = 0;
1624 final Writer all_pfams_encountered_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_file ) );
1625 final Writer all_pfams_encountered_with_go_annotation_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_with_go_annotation_file ) );
1626 final Writer summary_writer = new BufferedWriter( new FileWriter( encountered_pfams_summary_file ) );
1627 summary_writer.write( "# Pfam to GO mapping summary" );
1628 summary_writer.write( ForesterUtil.LINE_SEPARATOR );
1629 summary_writer.write( "# Actual summary is at the end of this file." );
1630 summary_writer.write( ForesterUtil.LINE_SEPARATOR );
1631 summary_writer.write( "# Encountered Pfams without a GO mapping:" );
1632 summary_writer.write( ForesterUtil.LINE_SEPARATOR );
1633 for( final String pfam : all_pfams_encountered ) {
1634 all_pfams_encountered_writer.write( pfam );
1635 all_pfams_encountered_writer.write( ForesterUtil.LINE_SEPARATOR );
1636 final DomainId domain_id = new DomainId( pfam );
1637 if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) {
1638 ++pfams_with_mappings_counter;
1639 all_pfams_encountered_with_go_annotation_writer.write( pfam );
1640 all_pfams_encountered_with_go_annotation_writer.write( ForesterUtil.LINE_SEPARATOR );
1641 final List<GoId> go_ids = domain_id_to_go_ids_map.get( domain_id );
1642 boolean maps_to_bp = false;
1643 boolean maps_to_cc = false;
1644 boolean maps_to_mf = false;
1645 for( final GoId go_id : go_ids ) {
1646 final GoTerm go_term = go_id_to_term_map.get( go_id );
1647 if ( go_term.getGoNameSpace().isBiologicalProcess() ) {
1650 else if ( go_term.getGoNameSpace().isCellularComponent() ) {
1653 else if ( go_term.getGoNameSpace().isMolecularFunction() ) {
1658 ++biological_process_counter;
1661 ++cellular_component_counter;
1664 ++molecular_function_counter;
1666 if ( maps_to_bp || maps_to_mf ) {
1667 ++pfams_with_mappings_to_bp_or_mf_counter;
1670 ++pfams_without_mappings_to_bp_or_mf_counter;
1674 ++pfams_without_mappings_to_bp_or_mf_counter;
1675 ++pfams_without_mappings_counter;
1676 summary_writer.write( pfam );
1677 summary_writer.write( ForesterUtil.LINE_SEPARATOR );
1680 all_pfams_encountered_writer.close();
1681 all_pfams_encountered_with_go_annotation_writer.close();
1682 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + all_pfams_encountered.size()
1683 + "] encountered Pfams to: \"" + all_pfams_encountered_file + "\"" );
1684 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + pfams_with_mappings_counter
1685 + "] encountered Pfams with GO mappings to: \"" + all_pfams_encountered_with_go_annotation_file
1687 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote summary (including all ["
1688 + pfams_without_mappings_counter + "] encountered Pfams without GO mappings) to: \""
1689 + encountered_pfams_summary_file + "\"" );
1690 ForesterUtil.programMessage( surfacing.PRG_NAME, "Sum of Pfams encountered : "
1691 + all_pfams_encountered.size() );
1692 ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without a mapping : "
1693 + pfams_without_mappings_counter + " ["
1694 + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
1695 ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without mapping to proc. or func. : "
1696 + pfams_without_mappings_to_bp_or_mf_counter + " ["
1697 + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
1698 ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping : "
1699 + pfams_with_mappings_counter + " ["
1700 + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
1701 ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping to proc. or func. : "
1702 + pfams_with_mappings_to_bp_or_mf_counter + " ["
1703 + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
1704 ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to biological process: "
1705 + biological_process_counter + " ["
1706 + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" );
1707 ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to molecular function: "
1708 + molecular_function_counter + " ["
1709 + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" );
1710 ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to cellular component: "
1711 + cellular_component_counter + " ["
1712 + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" );
1713 summary_writer.write( ForesterUtil.LINE_SEPARATOR );
1714 summary_writer.write( "# Sum of Pfams encountered : " + all_pfams_encountered.size() );
1715 summary_writer.write( ForesterUtil.LINE_SEPARATOR );
1716 summary_writer.write( "# Pfams without a mapping : " + pfams_without_mappings_counter
1717 + " [" + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
1718 summary_writer.write( ForesterUtil.LINE_SEPARATOR );
1719 summary_writer.write( "# Pfams without mapping to proc. or func. : "
1720 + pfams_without_mappings_to_bp_or_mf_counter + " ["
1721 + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
1722 summary_writer.write( ForesterUtil.LINE_SEPARATOR );
1723 summary_writer.write( "# Pfams with a mapping : " + pfams_with_mappings_counter + " ["
1724 + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
1725 summary_writer.write( ForesterUtil.LINE_SEPARATOR );
1726 summary_writer.write( "# Pfams with a mapping to proc. or func. : "
1727 + pfams_with_mappings_to_bp_or_mf_counter + " ["
1728 + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
1729 summary_writer.write( ForesterUtil.LINE_SEPARATOR );
1730 summary_writer.write( "# Pfams with mapping to biological process: " + biological_process_counter + " ["
1731 + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" );
1732 summary_writer.write( ForesterUtil.LINE_SEPARATOR );
1733 summary_writer.write( "# Pfams with mapping to molecular function: " + molecular_function_counter + " ["
1734 + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" );
1735 summary_writer.write( ForesterUtil.LINE_SEPARATOR );
1736 summary_writer.write( "# Pfams with mapping to cellular component: " + cellular_component_counter + " ["
1737 + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" );
1738 summary_writer.write( ForesterUtil.LINE_SEPARATOR );
1739 summary_writer.close();
1741 catch ( final IOException e ) {
1742 ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e );
1746 public static void writeBinaryDomainCombinationsFileForGraphAnalysis( final String[][] input_file_properties,
1747 final File output_dir,
1748 final GenomeWideCombinableDomains gwcd,
1750 final GenomeWideCombinableDomainsSortOrder dc_sort_order ) {
1751 File dc_outfile_dot = new File( input_file_properties[ i ][ 0 ]
1752 + surfacing.DOMAIN_COMBINITONS_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS );
1753 if ( output_dir != null ) {
1754 dc_outfile_dot = new File( output_dir + ForesterUtil.FILE_SEPARATOR + dc_outfile_dot );
1756 checkForOutputFileWriteability( dc_outfile_dot );
1757 final SortedSet<BinaryDomainCombination> binary_combinations = createSetOfAllBinaryDomainCombinationsPerGenome( gwcd );
1759 final BufferedWriter out_dot = new BufferedWriter( new FileWriter( dc_outfile_dot ) );
1760 for( final BinaryDomainCombination bdc : binary_combinations ) {
1761 out_dot.write( bdc.toGraphDescribingLanguage( BinaryDomainCombination.OutputFormat.DOT, null, null )
1763 out_dot.write( SurfacingConstants.NL );
1767 catch ( final IOException e ) {
1768 ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1770 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote binary domain combination for \""
1771 + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", "
1772 + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile_dot + "\"" );
1775 public static void writeBinaryStatesMatrixAsListToFile( final CharacterStateMatrix<CharacterStateMatrix.GainLossStates> matrix,
1776 final CharacterStateMatrix.GainLossStates state,
1777 final String filename,
1778 final String indentifier_characters_separator,
1779 final String character_separator,
1780 final Map<String, String> descriptions ) {
1781 final File outfile = new File( filename );
1782 checkForOutputFileWriteability( outfile );
1783 final SortedSet<String> sorted_ids = new TreeSet<String>();
1784 for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) {
1785 sorted_ids.add( matrix.getIdentifier( i ) );
1788 final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
1789 for( final String id : sorted_ids ) {
1790 out.write( indentifier_characters_separator );
1791 out.write( "#" + id );
1792 out.write( indentifier_characters_separator );
1793 for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) {
1795 // using null to indicate either UNCHANGED_PRESENT or GAIN.
1796 if ( ( matrix.getState( id, c ) == state )
1797 || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) || ( matrix
1798 .getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) ) ) ) {
1799 out.write( matrix.getCharacter( c ) );
1800 if ( ( descriptions != null ) && !descriptions.isEmpty()
1801 && descriptions.containsKey( matrix.getCharacter( c ) ) ) {
1803 out.write( descriptions.get( matrix.getCharacter( c ) ) );
1805 out.write( character_separator );
1812 catch ( final IOException e ) {
1813 ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1815 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters list: \"" + filename + "\"" );
1818 public static void writeBinaryStatesMatrixAsListToFileForBinaryCombinationsForGraphAnalysis( final CharacterStateMatrix<CharacterStateMatrix.GainLossStates> matrix,
1819 final CharacterStateMatrix.GainLossStates state,
1820 final String filename,
1821 final String indentifier_characters_separator,
1822 final String character_separator,
1823 final BinaryDomainCombination.OutputFormat bc_output_format ) {
1824 final File outfile = new File( filename );
1825 checkForOutputFileWriteability( outfile );
1826 final SortedSet<String> sorted_ids = new TreeSet<String>();
1827 for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) {
1828 sorted_ids.add( matrix.getIdentifier( i ) );
1831 final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
1832 for( final String id : sorted_ids ) {
1833 out.write( indentifier_characters_separator );
1834 out.write( "#" + id );
1835 out.write( indentifier_characters_separator );
1836 for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) {
1838 // using null to indicate either UNCHANGED_PRESENT or GAIN.
1839 if ( ( matrix.getState( id, c ) == state )
1840 || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) || ( matrix
1841 .getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) ) ) ) {
1842 BinaryDomainCombination bdc = null;
1844 bdc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( c ) );
1846 catch ( final Exception e ) {
1847 ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
1849 out.write( bdc.toGraphDescribingLanguage( bc_output_format, null, null ).toString() );
1850 out.write( character_separator );
1857 catch ( final IOException e ) {
1858 ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1860 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters list: \"" + filename + "\"" );
1863 public static void writeBinaryStatesMatrixToList( final Map<DomainId, List<GoId>> domain_id_to_go_ids_map,
1864 final Map<GoId, GoTerm> go_id_to_term_map,
1865 final GoNameSpace go_namespace_limit,
1866 final boolean domain_combinations,
1867 final CharacterStateMatrix<CharacterStateMatrix.GainLossStates> matrix,
1868 final CharacterStateMatrix.GainLossStates state,
1869 final String filename,
1870 final String indentifier_characters_separator,
1871 final String character_separator,
1872 final String title_for_html,
1873 final String prefix_for_html,
1874 final Map<DomainId, Set<String>>[] domain_id_to_secondary_features_maps,
1875 final SortedSet<String> all_pfams_encountered,
1876 final SortedSet<String> pfams_gained_or_lost,
1877 final String suffix_for_per_node_events_file ) {
1878 if ( ( go_namespace_limit != null ) && ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) {
1879 throw new IllegalArgumentException( "attempt to use GO namespace limit without a GO-id to term map" );
1881 else if ( ( ( domain_id_to_go_ids_map == null ) || ( domain_id_to_go_ids_map.size() < 1 ) ) ) {
1882 throw new IllegalArgumentException( "attempt to output detailed HTML without a Pfam to GO map" );
1884 else if ( ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) {
1885 throw new IllegalArgumentException( "attempt to output detailed HTML without a GO-id to term map" );
1887 final File outfile = new File( filename );
1888 checkForOutputFileWriteability( outfile );
1889 final SortedSet<String> sorted_ids = new TreeSet<String>();
1890 for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) {
1891 sorted_ids.add( matrix.getIdentifier( i ) );
1894 final Writer out = new BufferedWriter( new FileWriter( outfile ) );
1895 final File per_node_go_mapped_domain_gain_loss_files_base_dir = createBaseDirForPerNodeDomainFiles( surfacing.BASE_DIRECTORY_PER_NODE_DOMAIN_GAIN_LOSS_FILES,
1896 domain_combinations,
1899 Writer per_node_go_mapped_domain_gain_loss_outfile_writer = null;
1900 File per_node_go_mapped_domain_gain_loss_outfile = null;
1901 int per_node_counter = 0;
1902 out.write( "<html>" );
1903 out.write( SurfacingConstants.NL );
1904 addHtmlHead( out, title_for_html );
1905 out.write( SurfacingConstants.NL );
1906 out.write( "<body>" );
1907 out.write( SurfacingConstants.NL );
1908 out.write( "<h1>" );
1909 out.write( SurfacingConstants.NL );
1910 out.write( title_for_html );
1911 out.write( SurfacingConstants.NL );
1912 out.write( "</h1>" );
1913 out.write( SurfacingConstants.NL );
1914 out.write( "<table>" );
1915 out.write( SurfacingConstants.NL );
1916 for( final String id : sorted_ids ) {
1917 final Matcher matcher = PATTERN_SP_STYLE_TAXONOMY.matcher( id );
1918 if ( matcher.matches() ) {
1921 out.write( "<tr>" );
1922 out.write( "<td>" );
1923 out.write( "<a href=\"#" + id + "\">" + id + "</a>" );
1924 out.write( "</td>" );
1925 out.write( "</tr>" );
1926 out.write( SurfacingConstants.NL );
1928 out.write( "</table>" );
1929 out.write( SurfacingConstants.NL );
1930 for( final String id : sorted_ids ) {
1931 final Matcher matcher = PATTERN_SP_STYLE_TAXONOMY.matcher( id );
1932 if ( matcher.matches() ) {
1935 out.write( SurfacingConstants.NL );
1936 out.write( "<h2>" );
1937 out.write( "<a name=\"" + id + "\">" + id + "</a>" );
1938 writeTaxonomyLinks( out, id );
1939 out.write( "</h2>" );
1940 out.write( SurfacingConstants.NL );
1941 out.write( "<table>" );
1942 out.write( SurfacingConstants.NL );
1943 out.write( "<tr>" );
1944 out.write( "<td><b>" );
1945 out.write( "Pfam domain(s)" );
1946 out.write( "</b></td><td><b>" );
1947 out.write( "GO term acc" );
1948 out.write( "</b></td><td><b>" );
1949 out.write( "GO term" );
1950 out.write( "</b></td><td><b>" );
1951 out.write( "GO namespace" );
1952 out.write( "</b></td>" );
1953 out.write( "</tr>" );
1954 out.write( SurfacingConstants.NL );
1955 out.write( "</tr>" );
1956 out.write( SurfacingConstants.NL );
1957 per_node_counter = 0;
1958 if ( matrix.getNumberOfCharacters() > 0 ) {
1959 per_node_go_mapped_domain_gain_loss_outfile = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
1960 + ForesterUtil.FILE_SEPARATOR + id + suffix_for_per_node_events_file );
1961 SurfacingUtil.checkForOutputFileWriteability( per_node_go_mapped_domain_gain_loss_outfile );
1962 per_node_go_mapped_domain_gain_loss_outfile_writer = ForesterUtil
1963 .createBufferedWriter( per_node_go_mapped_domain_gain_loss_outfile );
1966 per_node_go_mapped_domain_gain_loss_outfile = null;
1967 per_node_go_mapped_domain_gain_loss_outfile_writer = null;
1969 for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) {
1971 // using null to indicate either UNCHANGED_PRESENT or GAIN.
1972 if ( ( matrix.getState( id, c ) == state )
1973 || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) || ( matrix
1974 .getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) ) ) ) {
1975 final String character = matrix.getCharacter( c );
1976 String domain_0 = "";
1977 String domain_1 = "";
1978 if ( character.indexOf( BinaryDomainCombination.SEPARATOR ) > 0 ) {
1979 final String[] s = character.split( BinaryDomainCombination.SEPARATOR );
1980 if ( s.length != 2 ) {
1981 throw new AssertionError( "this should not have happened: unexpected format for domain combination: ["
1982 + character + "]" );
1988 domain_0 = character;
1990 writeDomainData( domain_id_to_go_ids_map,
1997 character_separator,
1998 domain_id_to_secondary_features_maps,
2000 all_pfams_encountered.add( domain_0 );
2001 if ( pfams_gained_or_lost != null ) {
2002 pfams_gained_or_lost.add( domain_0 );
2004 if ( !ForesterUtil.isEmpty( domain_1 ) ) {
2005 all_pfams_encountered.add( domain_1 );
2006 if ( pfams_gained_or_lost != null ) {
2007 pfams_gained_or_lost.add( domain_1 );
2010 if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) {
2011 writeDomainsToIndividualFilePerTreeNode( per_node_go_mapped_domain_gain_loss_outfile_writer,
2018 if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) {
2019 per_node_go_mapped_domain_gain_loss_outfile_writer.close();
2020 if ( per_node_counter < 1 ) {
2021 per_node_go_mapped_domain_gain_loss_outfile.delete();
2023 per_node_counter = 0;
2025 out.write( "</table>" );
2026 out.write( SurfacingConstants.NL );
2027 out.write( "<hr>" );
2028 out.write( SurfacingConstants.NL );
2029 } // for( final String id : sorted_ids ) {
2030 out.write( "</body>" );
2031 out.write( SurfacingConstants.NL );
2032 out.write( "</html>" );
2033 out.write( SurfacingConstants.NL );
2037 catch ( final IOException e ) {
2038 ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
2040 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters detailed HTML list: \"" + filename + "\"" );
2043 public static void writeDomainCombinationsCountsFile( final String[][] input_file_properties,
2044 final File output_dir,
2045 final Writer per_genome_domain_promiscuity_statistics_writer,
2046 final GenomeWideCombinableDomains gwcd,
2048 final GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder dc_sort_order ) {
2049 File dc_outfile = new File( input_file_properties[ i ][ 0 ]
2050 + surfacing.DOMAIN_COMBINITON_COUNTS_OUTPUTFILE_SUFFIX );
2051 if ( output_dir != null ) {
2052 dc_outfile = new File( output_dir + ForesterUtil.FILE_SEPARATOR + dc_outfile );
2054 checkForOutputFileWriteability( dc_outfile );
2056 final BufferedWriter out = new BufferedWriter( new FileWriter( dc_outfile ) );
2057 out.write( gwcd.toStringBuilder( dc_sort_order ).toString() );
2060 catch ( final IOException e ) {
2061 ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
2063 final DescriptiveStatistics stats = gwcd.getPerGenomeDomainPromiscuityStatistics();
2065 per_genome_domain_promiscuity_statistics_writer.write( input_file_properties[ i ][ 0 ] + "\t" );
2066 per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats.arithmeticMean() ) + "\t" );
2067 if ( stats.getN() < 2 ) {
2068 per_genome_domain_promiscuity_statistics_writer.write( "n/a" + "\t" );
2071 per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats
2072 .sampleStandardDeviation() ) + "\t" );
2074 per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats.median() ) + "\t" );
2075 per_genome_domain_promiscuity_statistics_writer.write( ( int ) stats.getMin() + "\t" );
2076 per_genome_domain_promiscuity_statistics_writer.write( ( int ) stats.getMax() + "\t" );
2077 per_genome_domain_promiscuity_statistics_writer.write( stats.getN() + "\t" );
2078 final SortedSet<DomainId> mpds = gwcd.getMostPromiscuosDomain();
2079 for( final DomainId mpd : mpds ) {
2080 per_genome_domain_promiscuity_statistics_writer.write( mpd.getId() + " " );
2082 per_genome_domain_promiscuity_statistics_writer.write( ForesterUtil.LINE_SEPARATOR );
2084 catch ( final IOException e ) {
2085 ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
2087 if ( input_file_properties[ i ].length == 3 ) {
2088 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \""
2089 + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", "
2090 + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile + "\"" );
2093 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \""
2094 + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ") to: \""
2095 + dc_outfile + "\"" );
2099 private static void writeDomainData( final Map<DomainId, List<GoId>> domain_id_to_go_ids_map,
2100 final Map<GoId, GoTerm> go_id_to_term_map,
2101 final GoNameSpace go_namespace_limit,
2103 final String domain_0,
2104 final String domain_1,
2105 final String prefix_for_html,
2106 final String character_separator_for_non_html_output,
2107 final Map<DomainId, Set<String>>[] domain_id_to_secondary_features_maps,
2108 final Set<GoId> all_go_ids ) throws IOException {
2109 boolean any_go_annotation_present = false;
2110 boolean first_has_no_go = false;
2111 int domain_count = 2; // To distinguish between domains and binary domain combinations.
2112 if ( ForesterUtil.isEmpty( domain_1 ) ) {
2115 // The following has a difficult to understand logic.
2116 for( int d = 0; d < domain_count; ++d ) {
2117 List<GoId> go_ids = null;
2118 boolean go_annotation_present = false;
2120 final DomainId domain_id = new DomainId( domain_0 );
2121 if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) {
2122 go_annotation_present = true;
2123 any_go_annotation_present = true;
2124 go_ids = domain_id_to_go_ids_map.get( domain_id );
2127 first_has_no_go = true;
2131 final DomainId domain_id = new DomainId( domain_1 );
2132 if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) {
2133 go_annotation_present = true;
2134 any_go_annotation_present = true;
2135 go_ids = domain_id_to_go_ids_map.get( domain_id );
2138 if ( go_annotation_present ) {
2139 boolean first = ( ( d == 0 ) || ( ( d == 1 ) && first_has_no_go ) );
2140 for( final GoId go_id : go_ids ) {
2141 out.write( "<tr>" );
2144 writeDomainIdsToHtml( out,
2148 domain_id_to_secondary_features_maps );
2151 out.write( "<td></td>" );
2153 if ( !go_id_to_term_map.containsKey( go_id ) ) {
2154 throw new IllegalArgumentException( "GO-id [" + go_id + "] not found in GO-id to GO-term map" );
2156 final GoTerm go_term = go_id_to_term_map.get( go_id );
2157 if ( ( go_namespace_limit == null ) || go_namespace_limit.equals( go_term.getGoNameSpace() ) ) {
2158 // final String top = GoUtils.getPenultimateGoTerm( go_term, go_id_to_term_map ).getName();
2159 final String go_id_str = go_id.getId();
2160 out.write( "<td>" );
2161 out.write( "<a href=\"" + SurfacingConstants.AMIGO_LINK + go_id_str
2162 + "\" target=\"amigo_window\">" + go_id_str + "</a>" );
2163 out.write( "</td><td>" );
2164 out.write( go_term.getName() );
2165 if ( domain_count == 2 ) {
2166 out.write( " (" + d + ")" );
2168 out.write( "</td><td>" );
2169 // out.write( top );
2170 // out.write( "</td><td>" );
2172 out.write( go_term.getGoNameSpace().toShortString() );
2174 out.write( "</td>" );
2175 if ( all_go_ids != null ) {
2176 all_go_ids.add( go_id );
2180 out.write( "<td>" );
2181 out.write( "</td><td>" );
2182 out.write( "</td><td>" );
2183 out.write( "</td><td>" );
2184 out.write( "</td>" );
2186 out.write( "</tr>" );
2187 out.write( SurfacingConstants.NL );
2190 } // for( int d = 0; d < domain_count; ++d )
2191 if ( !any_go_annotation_present ) {
2192 out.write( "<tr>" );
2193 writeDomainIdsToHtml( out, domain_0, domain_1, prefix_for_html, domain_id_to_secondary_features_maps );
2194 out.write( "<td>" );
2195 out.write( "</td><td>" );
2196 out.write( "</td><td>" );
2197 out.write( "</td><td>" );
2198 out.write( "</td>" );
2199 out.write( "</tr>" );
2200 out.write( SurfacingConstants.NL );
2204 private static void writeDomainIdsToHtml( final Writer out,
2205 final String domain_0,
2206 final String domain_1,
2207 final String prefix_for_detailed_html,
2208 final Map<DomainId, Set<String>>[] domain_id_to_secondary_features_maps )
2209 throws IOException {
2210 out.write( "<td>" );
2211 if ( !ForesterUtil.isEmpty( prefix_for_detailed_html ) ) {
2212 out.write( prefix_for_detailed_html );
2215 out.write( "<a href=\"" + SurfacingConstants.PFAM_FAMILY_ID_LINK + domain_0 + "\">" + domain_0 + "</a>" );
2216 out.write( "</td>" );
2219 public static DescriptiveStatistics writeDomainSimilaritiesToFile( final StringBuilder html_desc,
2220 final StringBuilder html_title,
2221 final Writer single_writer,
2222 Map<Character, Writer> split_writers,
2223 final SortedSet<DomainSimilarity> similarities,
2224 final boolean treat_as_binary,
2225 final List<Species> species_order,
2226 final PrintableDomainSimilarity.PRINT_OPTION print_option,
2227 final DomainSimilarity.DomainSimilaritySortField sort_field,
2228 final DomainSimilarity.DomainSimilarityScoring scoring,
2229 final boolean verbose ) throws IOException {
2230 final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
2231 String histogram_title = null;
2232 switch ( sort_field ) {
2233 case ABS_MAX_COUNTS_DIFFERENCE:
2234 if ( treat_as_binary ) {
2235 histogram_title = "absolute counts difference:";
2238 histogram_title = "absolute (maximal) counts difference:";
2241 case MAX_COUNTS_DIFFERENCE:
2242 if ( treat_as_binary ) {
2243 histogram_title = "counts difference:";
2246 histogram_title = "(maximal) counts difference:";
2250 histogram_title = "score mean:";
2253 histogram_title = "score minimum:";
2256 histogram_title = "score maximum:";
2258 case MAX_DIFFERENCE:
2259 if ( treat_as_binary ) {
2260 histogram_title = "difference:";
2263 histogram_title = "(maximal) difference:";
2267 histogram_title = "score mean:";
2270 histogram_title = "score standard deviation:";
2273 histogram_title = "species number:";
2276 throw new AssertionError( "Unknown sort field: " + sort_field );
2278 for( final DomainSimilarity similarity : similarities ) {
2279 switch ( sort_field ) {
2280 case ABS_MAX_COUNTS_DIFFERENCE:
2281 stats.addValue( Math.abs( similarity.getMaximalDifferenceInCounts() ) );
2283 case MAX_COUNTS_DIFFERENCE:
2284 stats.addValue( similarity.getMaximalDifferenceInCounts() );
2287 stats.addValue( similarity.getMeanSimilarityScore() );
2290 stats.addValue( similarity.getMinimalSimilarityScore() );
2293 stats.addValue( similarity.getMaximalSimilarityScore() );
2295 case MAX_DIFFERENCE:
2296 stats.addValue( similarity.getMaximalDifference() );
2299 stats.addValue( similarity.getMeanSimilarityScore() );
2302 stats.addValue( similarity.getStandardDeviationOfSimilarityScore() );
2305 stats.addValue( similarity.getSpecies().size() );
2308 throw new AssertionError( "Unknown sort field: " + sort_field );
2312 // final HistogramData[] hists = new HistogramData[ 1 ];
2315 // List<HistogramDataItem> data_items = new
2316 // ArrayList<HistogramDataItem>();
2317 // double[] values = stats.getDataAsDoubleArray();
2318 // for( int i = 0; i < values.length; i++ ) {
2319 // HistogramDataItem data_item = new BasicHistogramDataItem( "", values[
2321 // data_items.add( data_item );
2325 // HistogramData hd0 = new HistogramData( "name",
2333 // hists[ 0 ] = hd0;
2335 // final HistogramsFrame hf = new HistogramsFrame( hists );
2336 // hf.setVisible( true );
2338 AsciiHistogram histo = null;
2339 if ( stats.getMin() < stats.getMin() ) {
2340 histo = new AsciiHistogram( stats, histogram_title );
2343 if ( histo != null ) {
2344 System.out.println( histo.toStringBuffer( 20, '|', 40, 5 ) );
2346 System.out.println();
2347 System.out.println( "N : " + stats.getN() );
2348 System.out.println( "Min : " + stats.getMin() );
2349 System.out.println( "Max : " + stats.getMax() );
2350 System.out.println( "Mean : " + stats.arithmeticMean() );
2351 if ( stats.getN() > 1 ) {
2352 System.out.println( "SD : " + stats.sampleStandardDeviation() );
2355 System.out.println( "SD : n/a" );
2357 System.out.println( "Median : " + stats.median() );
2358 if ( stats.getN() > 1 ) {
2359 System.out.println( "Pearsonian skewness : " + stats.pearsonianSkewness() );
2362 System.out.println( "Pearsonian skewness : n/a" );
2365 if ( ( single_writer != null ) && ( ( split_writers == null ) || split_writers.isEmpty() ) ) {
2366 split_writers = new HashMap<Character, Writer>();
2367 split_writers.put( '_', single_writer );
2369 switch ( print_option ) {
2370 case SIMPLE_TAB_DELIMITED:
2373 for( final Character key : split_writers.keySet() ) {
2374 final Writer w = split_writers.get( key );
2375 w.write( "<html>" );
2376 w.write( SurfacingConstants.NL );
2378 addHtmlHead( w, "DCs (" + html_title + ") " + key.toString().toUpperCase() );
2381 addHtmlHead( w, "DCs (" + html_title + ")" );
2383 w.write( SurfacingConstants.NL );
2384 w.write( "<body>" );
2385 w.write( SurfacingConstants.NL );
2386 w.write( html_desc.toString() );
2387 w.write( SurfacingConstants.NL );
2390 w.write( SurfacingConstants.NL );
2391 w.write( "<tt><pre>" );
2392 w.write( SurfacingConstants.NL );
2393 if ( histo != null ) {
2394 w.write( histo.toStringBuffer( 20, '|', 40, 5 ).toString() );
2395 w.write( SurfacingConstants.NL );
2397 w.write( "</pre></tt>" );
2398 w.write( SurfacingConstants.NL );
2399 w.write( "<table>" );
2400 w.write( SurfacingConstants.NL );
2401 w.write( "<tr><td>N: </td><td>" + stats.getN() + "</td></tr>" );
2402 w.write( SurfacingConstants.NL );
2403 w.write( "<tr><td>Min: </td><td>" + stats.getMin() + "</td></tr>" );
2404 w.write( SurfacingConstants.NL );
2405 w.write( "<tr><td>Max: </td><td>" + stats.getMax() + "</td></tr>" );
2406 w.write( SurfacingConstants.NL );
2407 w.write( "<tr><td>Mean: </td><td>" + stats.arithmeticMean() + "</td></tr>" );
2408 w.write( SurfacingConstants.NL );
2409 if ( stats.getN() > 1 ) {
2410 w.write( "<tr><td>SD: </td><td>" + stats.sampleStandardDeviation() + "</td></tr>" );
2413 w.write( "<tr><td>SD: </td><td>n/a</td></tr>" );
2415 w.write( SurfacingConstants.NL );
2416 w.write( "<tr><td>Median: </td><td>" + stats.median() + "</td></tr>" );
2417 w.write( SurfacingConstants.NL );
2418 if ( stats.getN() > 1 ) {
2419 w.write( "<tr><td>Pearsonian skewness: </td><td>" + stats.pearsonianSkewness() + "</td></tr>" );
2422 w.write( "<tr><td>Pearsonian skewness: </td><td>n/a</td></tr>" );
2424 w.write( SurfacingConstants.NL );
2425 w.write( "</table>" );
2426 w.write( SurfacingConstants.NL );
2428 w.write( SurfacingConstants.NL );
2430 w.write( SurfacingConstants.NL );
2432 w.write( SurfacingConstants.NL );
2433 w.write( "<table>" );
2434 w.write( SurfacingConstants.NL );
2438 for( final Writer w : split_writers.values() ) {
2439 w.write( SurfacingConstants.NL );
2441 for( final DomainSimilarity similarity : similarities ) {
2442 if ( ( species_order != null ) && !species_order.isEmpty() ) {
2443 ( ( PrintableDomainSimilarity ) similarity ).setSpeciesOrder( species_order );
2445 if ( single_writer != null ) {
2446 single_writer.write( similarity.toStringBuffer( print_option ).toString() );
2449 Writer local_writer = split_writers.get( ( similarity.getDomainId().getId().charAt( 0 ) + "" )
2450 .toLowerCase().charAt( 0 ) );
2451 if ( local_writer == null ) {
2452 local_writer = split_writers.get( '0' );
2454 local_writer.write( similarity.toStringBuffer( print_option ).toString() );
2456 for( final Writer w : split_writers.values() ) {
2457 w.write( SurfacingConstants.NL );
2460 switch ( print_option ) {
2462 for( final Writer w : split_writers.values() ) {
2463 w.write( SurfacingConstants.NL );
2464 w.write( "</table>" );
2465 w.write( SurfacingConstants.NL );
2466 w.write( "</font>" );
2467 w.write( SurfacingConstants.NL );
2468 w.write( "</body>" );
2469 w.write( SurfacingConstants.NL );
2470 w.write( "</html>" );
2471 w.write( SurfacingConstants.NL );
2475 for( final Writer w : split_writers.values() ) {
2481 private static void writeDomainsToIndividualFilePerTreeNode( final Writer individual_files_writer,
2482 final String domain_0,
2483 final String domain_1 ) throws IOException {
2484 individual_files_writer.write( domain_0 );
2485 individual_files_writer.write( ForesterUtil.LINE_SEPARATOR );
2486 if ( !ForesterUtil.isEmpty( domain_1 ) ) {
2487 individual_files_writer.write( domain_1 );
2488 individual_files_writer.write( ForesterUtil.LINE_SEPARATOR );
2492 public static void writeMatrixToFile( final CharacterStateMatrix<?> matrix,
2493 final String filename,
2494 final Format format ) {
2495 final File outfile = new File( filename );
2496 checkForOutputFileWriteability( outfile );
2498 final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
2499 matrix.toWriter( out, format );
2503 catch ( final IOException e ) {
2504 ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
2506 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote matrix: \"" + filename + "\"" );
2509 public static void writeMatrixToFile( final File matrix_outfile, final List<DistanceMatrix> matrices ) {
2510 checkForOutputFileWriteability( matrix_outfile );
2512 final BufferedWriter out = new BufferedWriter( new FileWriter( matrix_outfile ) );
2513 for( final DistanceMatrix distance_matrix : matrices ) {
2514 out.write( distance_matrix.toStringBuffer( DistanceMatrix.Format.PHYLIP ).toString() );
2515 out.write( ForesterUtil.LINE_SEPARATOR );
2520 catch ( final IOException e ) {
2521 ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
2523 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + matrix_outfile + "\"" );
2526 private static void writePfamsToFile( final String outfile_name, final SortedSet<String> pfams ) {
2528 final Writer writer = new BufferedWriter( new FileWriter( new File( outfile_name ) ) );
2529 for( final String pfam : pfams ) {
2530 writer.write( pfam );
2531 writer.write( ForesterUtil.LINE_SEPARATOR );
2534 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote " + pfams.size() + " pfams to [" + outfile_name
2537 catch ( final IOException e ) {
2538 ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e );
2542 public static void writePhylogenyToFile( final Phylogeny phylogeny, final String filename ) {
2543 final PhylogenyWriter writer = new PhylogenyWriter();
2545 writer.toPhyloXML( new File( filename ), phylogeny, 1 );
2547 catch ( final IOException e ) {
2548 ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "failed to write phylogeny to \"" + filename + "\": "
2551 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote phylogeny to \"" + filename + "\"" );
2554 public static void writeTaxonomyLinks( final Writer writer, final String species ) throws IOException {
2555 if ( ( species.length() > 1 ) && ( species.indexOf( '_' ) < 1 ) ) {
2556 final Matcher matcher = PATTERN_SP_STYLE_TAXONOMY.matcher( species );
2557 writer.write( " [" );
2558 if ( matcher.matches() ) {
2559 writer.write( "<a href=\"" + SurfacingConstants.UNIPROT_LINK + species
2560 + "\" target=\"taxonomy_window\">uniprot</a>" );
2563 writer.write( "<a href=\"" + SurfacingConstants.EOL_LINK + species
2564 + "\" target=\"taxonomy_window\">eol</a>" );
2565 writer.write( "|" );
2566 writer.write( "<a href=\"" + SurfacingConstants.TOL_LINK + species
2567 + "\" target=\"taxonomy_window\">tol</a>" );
2569 writer.write( "]" );
2573 private static void writeToNexus( final String outfile_name,
2574 final CharacterStateMatrix<BinaryStates> matrix,
2575 final Phylogeny phylogeny ) {
2576 if ( !( matrix instanceof BasicCharacterStateMatrix ) ) {
2577 throw new IllegalArgumentException( "can only write matrices of type [" + BasicCharacterStateMatrix.class
2580 final BasicCharacterStateMatrix<BinaryStates> my_matrix = ( org.forester.evoinference.matrix.character.BasicCharacterStateMatrix<BinaryStates> ) matrix;
2581 final List<Phylogeny> phylogenies = new ArrayList<Phylogeny>( 1 );
2582 phylogenies.add( phylogeny );
2584 final BufferedWriter w = new BufferedWriter( new FileWriter( outfile_name ) );
2585 w.write( NexusConstants.NEXUS );
2586 w.write( ForesterUtil.LINE_SEPARATOR );
2587 my_matrix.writeNexusTaxaBlock( w );
2588 my_matrix.writeNexusBinaryChractersBlock( w );
2589 PhylogenyWriter.writeNexusTreesBlock( w, phylogenies, NH_CONVERSION_SUPPORT_VALUE_STYLE.NONE );
2592 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote Nexus file: \"" + outfile_name + "\"" );
2594 catch ( final IOException e ) {
2595 ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
2599 private static void writeToNexus( final String outfile_name,
2600 final DomainParsimonyCalculator domain_parsimony,
2601 final Phylogeny phylogeny ) {
2602 writeToNexus( outfile_name + surfacing.NEXUS_EXTERNAL_DOMAINS,
2603 domain_parsimony.createMatrixOfDomainPresenceOrAbsence(),
2605 writeToNexus( outfile_name + surfacing.NEXUS_EXTERNAL_DOMAIN_COMBINATIONS,
2606 domain_parsimony.createMatrixOfBinaryDomainCombinationPresenceOrAbsence(),
2610 public static void domainsPerProteinsStatistics( final String genome,
2611 final List<Protein> protein_list,
2612 final DescriptiveStatistics all_genomes_domains_per_potein_stats,
2613 final SortedMap<Integer, Integer> all_genomes_domains_per_potein_histo,
2614 final SortedSet<String> domains_which_are_always_single,
2615 final SortedSet<String> domains_which_are_sometimes_single_sometimes_not,
2616 final SortedSet<String> domains_which_never_single,
2617 final Writer writer ) {
2618 final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
2619 for( final Protein protein : protein_list ) {
2620 final int domains = protein.getNumberOfProteinDomains();
2621 //System.out.println( domains );
2622 stats.addValue( domains );
2623 all_genomes_domains_per_potein_stats.addValue( domains );
2624 if ( !all_genomes_domains_per_potein_histo.containsKey( domains ) ) {
2625 all_genomes_domains_per_potein_histo.put( domains, 1 );
2628 all_genomes_domains_per_potein_histo.put( domains,
2629 1 + all_genomes_domains_per_potein_histo.get( domains ) );
2631 if ( domains == 1 ) {
2632 final String domain = protein.getProteinDomain( 0 ).getDomainId().getId();
2633 if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) {
2634 if ( domains_which_never_single.contains( domain ) ) {
2635 domains_which_never_single.remove( domain );
2636 domains_which_are_sometimes_single_sometimes_not.add( domain );
2639 domains_which_are_always_single.add( domain );
2643 else if ( domains > 1 ) {
2644 for( final Domain d : protein.getProteinDomains() ) {
2645 final String domain = d.getDomainId().getId();
2646 // System.out.println( domain );
2647 if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) {
2648 if ( domains_which_are_always_single.contains( domain ) ) {
2649 domains_which_are_always_single.remove( domain );
2650 domains_which_are_sometimes_single_sometimes_not.add( domain );
2653 domains_which_never_single.add( domain );
2660 writer.write( genome );
2661 writer.write( "\t" );
2662 if ( stats.getN() >= 1 ) {
2663 writer.write( stats.arithmeticMean() + "" );
2664 writer.write( "\t" );
2665 if ( stats.getN() >= 2 ) {
2666 writer.write( stats.sampleStandardDeviation() + "" );
2671 writer.write( "\t" );
2672 writer.write( stats.median() + "" );
2673 writer.write( "\t" );
2674 writer.write( stats.getN() + "" );
2675 writer.write( "\t" );
2676 writer.write( stats.getMin() + "" );
2677 writer.write( "\t" );
2678 writer.write( stats.getMax() + "" );
2681 writer.write( "\t" );
2682 writer.write( "\t" );
2683 writer.write( "\t" );
2684 writer.write( "0" );
2685 writer.write( "\t" );
2686 writer.write( "\t" );
2688 writer.write( "\n" );
2690 catch ( final IOException e ) {
2691 e.printStackTrace();
2695 final static class DomainComparator implements Comparator<Domain> {
2697 final private boolean _ascending;
2699 public DomainComparator( final boolean ascending ) {
2700 _ascending = ascending;
2704 public final int compare( final Domain d0, final Domain d1 ) {
2705 if ( d0.getFrom() < d1.getFrom() ) {
2706 return _ascending ? -1 : 1;
2708 else if ( d0.getFrom() > d1.getFrom() ) {
2709 return _ascending ? 1 : -1;