inprogress
[jalview.git] / forester / java / src / org / forester / surfacing / SurfacingUtil.java
1 // $Id:
2 //
3 // FORESTER -- software libraries and applications
4 // for evolutionary biology research and applications.
5 //
6 // Copyright (C) 2008-2009 Christian M. Zmasek
7 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
8 // All rights reserved
9 //
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
14 //
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 //
24 // Contact: phylosoft @ gmail . com
25 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26
27 package org.forester.surfacing;
28
29 import java.awt.Color;
30 import java.io.BufferedWriter;
31 import java.io.File;
32 import java.io.FileWriter;
33 import java.io.IOException;
34 import java.io.Writer;
35 import java.text.DecimalFormat;
36 import java.text.NumberFormat;
37 import java.util.ArrayList;
38 import java.util.Arrays;
39 import java.util.Collections;
40 import java.util.Comparator;
41 import java.util.HashMap;
42 import java.util.HashSet;
43 import java.util.Iterator;
44 import java.util.List;
45 import java.util.Map;
46 import java.util.Map.Entry;
47 import java.util.PriorityQueue;
48 import java.util.Set;
49 import java.util.SortedMap;
50 import java.util.SortedSet;
51 import java.util.TreeMap;
52 import java.util.TreeSet;
53 import java.util.regex.Matcher;
54 import java.util.regex.Pattern;
55
56 import org.forester.application.surfacing;
57 import org.forester.evoinference.distance.NeighborJoining;
58 import org.forester.evoinference.matrix.character.BasicCharacterStateMatrix;
59 import org.forester.evoinference.matrix.character.CharacterStateMatrix;
60 import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates;
61 import org.forester.evoinference.matrix.character.CharacterStateMatrix.Format;
62 import org.forester.evoinference.matrix.character.CharacterStateMatrix.GainLossStates;
63 import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
64 import org.forester.evoinference.matrix.distance.DistanceMatrix;
65 import org.forester.go.GoId;
66 import org.forester.go.GoNameSpace;
67 import org.forester.go.GoTerm;
68 import org.forester.go.PfamToGoMapping;
69 import org.forester.io.parsers.nexus.NexusConstants;
70 import org.forester.io.writers.PhylogenyWriter;
71 import org.forester.phylogeny.Phylogeny;
72 import org.forester.phylogeny.PhylogenyMethods;
73 import org.forester.phylogeny.PhylogenyNode;
74 import org.forester.phylogeny.PhylogenyNode.NH_CONVERSION_SUPPORT_VALUE_STYLE;
75 import org.forester.phylogeny.data.BinaryCharacters;
76 import org.forester.phylogeny.data.Confidence;
77 import org.forester.phylogeny.data.Taxonomy;
78 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
79 import org.forester.protein.BasicDomain;
80 import org.forester.protein.BasicProtein;
81 import org.forester.protein.BinaryDomainCombination;
82 import org.forester.protein.Domain;
83 import org.forester.protein.Protein;
84 import org.forester.species.Species;
85 import org.forester.surfacing.DomainSimilarityCalculator.Detailedness;
86 import org.forester.surfacing.GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder;
87 import org.forester.surfacing.PrintableDomainSimilarity.PRINT_OPTION;
88 import org.forester.util.AsciiHistogram;
89 import org.forester.util.BasicDescriptiveStatistics;
90 import org.forester.util.BasicTable;
91 import org.forester.util.BasicTableParser;
92 import org.forester.util.DescriptiveStatistics;
93 import org.forester.util.ForesterUtil;
94
95 public final class SurfacingUtil {
96
97     private final static NumberFormat       FORMATTER_3                      = new DecimalFormat( "0.000" );
98     private static final Comparator<Domain> ASCENDING_CONFIDENCE_VALUE_ORDER = new Comparator<Domain>() {
99
100                                                                                  @Override
101                                                                                  public int compare( final Domain d1,
102                                                                                                      final Domain d2 ) {
103                                                                                      if ( d1.getPerSequenceEvalue() < d2
104                                                                                              .getPerSequenceEvalue() ) {
105                                                                                          return -1;
106                                                                                      }
107                                                                                      else if ( d1
108                                                                                              .getPerSequenceEvalue() > d2
109                                                                                              .getPerSequenceEvalue() ) {
110                                                                                          return 1;
111                                                                                      }
112                                                                                      else {
113                                                                                          return d1.compareTo( d2 );
114                                                                                      }
115                                                                                  }
116                                                                              };
117     public final static Pattern             PATTERN_SP_STYLE_TAXONOMY        = Pattern.compile( "^[A-Z0-9]{3,5}$" );
118
119     private SurfacingUtil() {
120         // Hidden constructor.
121     }
122
123     public static void addAllBinaryDomainCombinationToSet( final GenomeWideCombinableDomains genome,
124                                                            final SortedSet<BinaryDomainCombination> binary_domain_combinations ) {
125         final SortedMap<String, CombinableDomains> all_cd = genome.getAllCombinableDomainsIds();
126         for( final String domain_id : all_cd.keySet() ) {
127             binary_domain_combinations.addAll( all_cd.get( domain_id ).toBinaryDomainCombinations() );
128         }
129     }
130
131     public static void addAllDomainIdsToSet( final GenomeWideCombinableDomains genome,
132                                              final SortedSet<String> domain_ids ) {
133         final SortedSet<String> domains = genome.getAllDomainIds();
134         for( final String domain : domains ) {
135             domain_ids.add( domain );
136         }
137     }
138
139     public static void addHtmlHead( final Writer w, final String title ) throws IOException {
140         w.write( SurfacingConstants.NL );
141         w.write( "<head>" );
142         w.write( "<title>" );
143         w.write( title );
144         w.write( "</title>" );
145         w.write( SurfacingConstants.NL );
146         w.write( "<style>" );
147         w.write( SurfacingConstants.NL );
148         w.write( "a:visited { color : #6633FF; text-decoration : none; }" );
149         w.write( SurfacingConstants.NL );
150         w.write( "a:link { color : #6633FF; text-decoration : none; }" );
151         w.write( SurfacingConstants.NL );
152         w.write( "a:active { color : #99FF00; text-decoration : none; }" );
153         w.write( SurfacingConstants.NL );
154         w.write( "a:hover { color : #FFFFFF; background-color : #99FF00; text-decoration : none; }" );
155         w.write( SurfacingConstants.NL );
156         //
157         w.write( "a.pl:visited { color : #505050; text-decoration : none; font-size: 7pt;}" );
158         w.write( SurfacingConstants.NL );
159         w.write( "a.pl:link { color : #505050; text-decoration : none; font-size: 7pt;}" );
160         w.write( SurfacingConstants.NL );
161         w.write( "a.pl:active { color : #505050; text-decoration : none; font-size: 7pt;}" );
162         w.write( SurfacingConstants.NL );
163         w.write( "a.pl:hover { color : #FFFFFF; background-color : #99FF00; text-decoration : none; font-size: 7pt;}" );
164         w.write( SurfacingConstants.NL );
165         //
166         w.write( "a.ps:visited { color : #707070; text-decoration : none; font-size: 7pt;}" );
167         w.write( SurfacingConstants.NL );
168         w.write( "a.ps:link { color : #707070; text-decoration : none; font-size: 7pt;}" );
169         w.write( SurfacingConstants.NL );
170         w.write( "a.ps:active { color : #707070; text-decoration : none; font-size: 7pt;}" );
171         w.write( SurfacingConstants.NL );
172         w.write( "a.ps:hover { color : #FFFFFF; background-color : #99FF00; text-decoration : none; font-size: 7pt;}" );
173         w.write( SurfacingConstants.NL );
174         //
175         w.write( "td { text-align: left; vertical-align: top; font-family: Verdana, Arial, Helvetica; font-size: 8pt}" );
176         w.write( SurfacingConstants.NL );
177         w.write( "h1 { color : #0000FF; font-family: Verdana, Arial, Helvetica; font-size: 18pt; font-weight: bold }" );
178         w.write( SurfacingConstants.NL );
179         w.write( "h2 { color : #0000FF; font-family: Verdana, Arial, Helvetica; font-size: 16pt; font-weight: bold }" );
180         w.write( SurfacingConstants.NL );
181         w.write( "</style>" );
182         w.write( SurfacingConstants.NL );
183         w.write( "</head>" );
184         w.write( SurfacingConstants.NL );
185     }
186
187     public static DescriptiveStatistics calculateDescriptiveStatisticsForMeanValues( final Set<DomainSimilarity> similarities ) {
188         final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
189         for( final DomainSimilarity similarity : similarities ) {
190             stats.addValue( similarity.getMeanSimilarityScore() );
191         }
192         return stats;
193     }
194
195     public static void checkForOutputFileWriteability( final File outfile ) {
196         final String error = ForesterUtil.isWritableFile( outfile );
197         if ( !ForesterUtil.isEmpty( error ) ) {
198             ForesterUtil.fatalError( surfacing.PRG_NAME, error );
199         }
200     }
201
202     public static void collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( final CharacterStateMatrix<CharacterStateMatrix.GainLossStates> matrix,
203                                                                                            final BinaryDomainCombination.DomainCombinationType dc_type,
204                                                                                            final List<BinaryDomainCombination> all_binary_domains_combination_gained,
205                                                                                            final boolean get_gains ) {
206         final SortedSet<String> sorted_ids = new TreeSet<String>();
207         for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) {
208             sorted_ids.add( matrix.getIdentifier( i ) );
209         }
210         for( final String id : sorted_ids ) {
211             for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) {
212                 if ( ( get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) )
213                         || ( !get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.LOSS ) ) ) {
214                     if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT ) {
215                         all_binary_domains_combination_gained.add( AdjactantDirectedBinaryDomainCombination
216                                 .createInstance( matrix.getCharacter( c ) ) );
217                     }
218                     else if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED ) {
219                         all_binary_domains_combination_gained.add( DirectedBinaryDomainCombination
220                                 .createInstance( matrix.getCharacter( c ) ) );
221                     }
222                     else {
223                         all_binary_domains_combination_gained.add( BasicBinaryDomainCombination.createInstance( matrix
224                                 .getCharacter( c ) ) );
225                     }
226                 }
227             }
228         }
229     }
230
231     public static Map<String, List<GoId>> createDomainIdToGoIdMap( final List<PfamToGoMapping> pfam_to_go_mappings ) {
232         final Map<String, List<GoId>> domain_id_to_go_ids_map = new HashMap<String, List<GoId>>( pfam_to_go_mappings.size() );
233         for( final PfamToGoMapping pfam_to_go : pfam_to_go_mappings ) {
234             if ( !domain_id_to_go_ids_map.containsKey( pfam_to_go.getKey() ) ) {
235                 domain_id_to_go_ids_map.put( pfam_to_go.getKey(), new ArrayList<GoId>() );
236             }
237             domain_id_to_go_ids_map.get( pfam_to_go.getKey() ).add( pfam_to_go.getValue() );
238         }
239         return domain_id_to_go_ids_map;
240     }
241
242     public static Map<String, Set<String>> createDomainIdToSecondaryFeaturesMap( final File secondary_features_map_file )
243             throws IOException {
244         final BasicTable<String> primary_table = BasicTableParser.parse( secondary_features_map_file, '\t' );
245         final Map<String, Set<String>> map = new TreeMap<String, Set<String>>();
246         for( int r = 0; r < primary_table.getNumberOfRows(); ++r ) {
247             final String domain_id = primary_table.getValue( 0, r );
248             if ( !map.containsKey( domain_id ) ) {
249                 map.put( domain_id, new HashSet<String>() );
250             }
251             map.get( domain_id ).add( primary_table.getValue( 1, r ) );
252         }
253         return map;
254     }
255
256     public static Phylogeny createNjTreeBasedOnMatrixToFile( final File nj_tree_outfile, final DistanceMatrix distance ) {
257         checkForOutputFileWriteability( nj_tree_outfile );
258         final NeighborJoining nj = NeighborJoining.createInstance();
259         final Phylogeny phylogeny = nj.execute( ( BasicSymmetricalDistanceMatrix ) distance );
260         phylogeny.setName( nj_tree_outfile.getName() );
261         writePhylogenyToFile( phylogeny, nj_tree_outfile.toString() );
262         return phylogeny;
263     }
264
265     public static Map<String, Integer> createTaxCodeToIdMap( final Phylogeny phy ) {
266         final Map<String, Integer> m = new HashMap<String, Integer>();
267         for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) {
268             final PhylogenyNode n = iter.next();
269             if ( n.getNodeData().isHasTaxonomy() ) {
270                 final Taxonomy t = n.getNodeData().getTaxonomy();
271                 final String c = t.getTaxonomyCode();
272                 if ( !ForesterUtil.isEmpty( c ) ) {
273                     if ( n.getNodeData().getTaxonomy() == null ) {
274                         ForesterUtil.fatalError( surfacing.PRG_NAME, "no taxonomy id for node " + n );
275                     }
276                     final String id = n.getNodeData().getTaxonomy().getIdentifier().getValue();
277                     if ( ForesterUtil.isEmpty( id ) ) {
278                         ForesterUtil.fatalError( surfacing.PRG_NAME, "no taxonomy id for node " + n );
279                     }
280                     if ( m.containsKey( c ) ) {
281                         ForesterUtil.fatalError( surfacing.PRG_NAME, "taxonomy code " + c + " is not unique" );
282                     }
283                     final int iid = Integer.valueOf( id );
284                     if ( m.containsValue( iid ) ) {
285                         ForesterUtil.fatalError( surfacing.PRG_NAME, "taxonomy id " + iid + " is not unique" );
286                     }
287                     m.put( c, iid );
288                 }
289             }
290             else {
291                 ForesterUtil.fatalError( surfacing.PRG_NAME, "no taxonomy for node " + n );
292             }
293         }
294         return m;
295     }
296
297     public static void decoratePrintableDomainSimilarities( final SortedSet<DomainSimilarity> domain_similarities,
298                                                             final Detailedness detailedness ) {
299         for( final DomainSimilarity domain_similarity : domain_similarities ) {
300             if ( domain_similarity instanceof PrintableDomainSimilarity ) {
301                 final PrintableDomainSimilarity printable_domain_similarity = ( PrintableDomainSimilarity ) domain_similarity;
302                 printable_domain_similarity.setDetailedness( detailedness );
303             }
304         }
305     }
306
307     public static void doit( final List<Protein> proteins,
308                              final List<String> query_domain_ids_nc_order,
309                              final Writer out,
310                              final String separator,
311                              final String limit_to_species,
312                              final Map<String, List<Integer>> average_protein_lengths_by_dc ) throws IOException {
313         for( final Protein protein : proteins ) {
314             if ( ForesterUtil.isEmpty( limit_to_species )
315                     || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) {
316                 if ( protein.contains( query_domain_ids_nc_order, true ) ) {
317                     out.write( protein.getSpecies().getSpeciesId() );
318                     out.write( separator );
319                     out.write( protein.getProteinId().getId() );
320                     out.write( separator );
321                     out.write( "[" );
322                     final Set<String> visited_domain_ids = new HashSet<String>();
323                     boolean first = true;
324                     for( final Domain domain : protein.getProteinDomains() ) {
325                         if ( !visited_domain_ids.contains( domain.getDomainId() ) ) {
326                             visited_domain_ids.add( domain.getDomainId() );
327                             if ( first ) {
328                                 first = false;
329                             }
330                             else {
331                                 out.write( " " );
332                             }
333                             out.write( domain.getDomainId() );
334                             out.write( " {" );
335                             out.write( "" + domain.getTotalCount() );
336                             out.write( "}" );
337                         }
338                     }
339                     out.write( "]" );
340                     out.write( separator );
341                     if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription()
342                             .equals( SurfacingConstants.NONE ) ) ) {
343                         out.write( protein.getDescription() );
344                     }
345                     out.write( separator );
346                     if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession()
347                             .equals( SurfacingConstants.NONE ) ) ) {
348                         out.write( protein.getAccession() );
349                     }
350                     out.write( SurfacingConstants.NL );
351                 }
352             }
353         }
354         out.flush();
355     }
356
357     public static void domainsPerProteinsStatistics( final String genome,
358                                                      final List<Protein> protein_list,
359                                                      final DescriptiveStatistics all_genomes_domains_per_potein_stats,
360                                                      final SortedMap<Integer, Integer> all_genomes_domains_per_potein_histo,
361                                                      final SortedSet<String> domains_which_are_always_single,
362                                                      final SortedSet<String> domains_which_are_sometimes_single_sometimes_not,
363                                                      final SortedSet<String> domains_which_never_single,
364                                                      final Writer writer ) {
365         final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
366         for( final Protein protein : protein_list ) {
367             final int domains = protein.getNumberOfProteinDomains();
368             //System.out.println( domains );
369             stats.addValue( domains );
370             all_genomes_domains_per_potein_stats.addValue( domains );
371             if ( !all_genomes_domains_per_potein_histo.containsKey( domains ) ) {
372                 all_genomes_domains_per_potein_histo.put( domains, 1 );
373             }
374             else {
375                 all_genomes_domains_per_potein_histo.put( domains,
376                                                           1 + all_genomes_domains_per_potein_histo.get( domains ) );
377             }
378             if ( domains == 1 ) {
379                 final String domain = protein.getProteinDomain( 0 ).getDomainId();
380                 if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) {
381                     if ( domains_which_never_single.contains( domain ) ) {
382                         domains_which_never_single.remove( domain );
383                         domains_which_are_sometimes_single_sometimes_not.add( domain );
384                     }
385                     else {
386                         domains_which_are_always_single.add( domain );
387                     }
388                 }
389             }
390             else if ( domains > 1 ) {
391                 for( final Domain d : protein.getProteinDomains() ) {
392                     final String domain = d.getDomainId();
393                     // System.out.println( domain );
394                     if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) {
395                         if ( domains_which_are_always_single.contains( domain ) ) {
396                             domains_which_are_always_single.remove( domain );
397                             domains_which_are_sometimes_single_sometimes_not.add( domain );
398                         }
399                         else {
400                             domains_which_never_single.add( domain );
401                         }
402                     }
403                 }
404             }
405         }
406         try {
407             writer.write( genome );
408             writer.write( "\t" );
409             if ( stats.getN() >= 1 ) {
410                 writer.write( stats.arithmeticMean() + "" );
411                 writer.write( "\t" );
412                 if ( stats.getN() >= 2 ) {
413                     writer.write( stats.sampleStandardDeviation() + "" );
414                 }
415                 else {
416                     writer.write( "" );
417                 }
418                 writer.write( "\t" );
419                 writer.write( stats.median() + "" );
420                 writer.write( "\t" );
421                 writer.write( stats.getN() + "" );
422                 writer.write( "\t" );
423                 writer.write( stats.getMin() + "" );
424                 writer.write( "\t" );
425                 writer.write( stats.getMax() + "" );
426             }
427             else {
428                 writer.write( "\t" );
429                 writer.write( "\t" );
430                 writer.write( "\t" );
431                 writer.write( "0" );
432                 writer.write( "\t" );
433                 writer.write( "\t" );
434             }
435             writer.write( "\n" );
436         }
437         catch ( final IOException e ) {
438             e.printStackTrace();
439         }
440     }
441
442     public static void executeDomainLengthAnalysis( final String[][] input_file_properties,
443                                                     final int number_of_genomes,
444                                                     final DomainLengthsTable domain_lengths_table,
445                                                     final File outfile ) throws IOException {
446         final DecimalFormat df = new DecimalFormat( "#.00" );
447         checkForOutputFileWriteability( outfile );
448         final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
449         out.write( "MEAN BASED STATISTICS PER SPECIES" );
450         out.write( ForesterUtil.LINE_SEPARATOR );
451         out.write( domain_lengths_table.createMeanBasedStatisticsPerSpeciesTable().toString() );
452         out.write( ForesterUtil.LINE_SEPARATOR );
453         out.write( ForesterUtil.LINE_SEPARATOR );
454         final List<DomainLengths> domain_lengths_list = domain_lengths_table.getDomainLengthsList();
455         out.write( "OUTLIER SPECIES PER DOMAIN (Z>=1.5)" );
456         out.write( ForesterUtil.LINE_SEPARATOR );
457         for( final DomainLengths domain_lengths : domain_lengths_list ) {
458             final List<Species> species_list = domain_lengths.getMeanBasedOutlierSpecies( 1.5 );
459             if ( species_list.size() > 0 ) {
460                 out.write( domain_lengths.getDomainId() + "\t" );
461                 for( final Species species : species_list ) {
462                     out.write( species + "\t" );
463                 }
464                 out.write( ForesterUtil.LINE_SEPARATOR );
465             }
466         }
467         out.write( ForesterUtil.LINE_SEPARATOR );
468         out.write( ForesterUtil.LINE_SEPARATOR );
469         out.write( "OUTLIER SPECIES (Z 1.0)" );
470         out.write( ForesterUtil.LINE_SEPARATOR );
471         final DescriptiveStatistics stats_for_all_species = domain_lengths_table
472                 .calculateMeanBasedStatisticsForAllSpecies();
473         out.write( stats_for_all_species.asSummary() );
474         out.write( ForesterUtil.LINE_SEPARATOR );
475         final AsciiHistogram histo = new AsciiHistogram( stats_for_all_species );
476         out.write( histo.toStringBuffer( 40, '=', 60, 4 ).toString() );
477         out.write( ForesterUtil.LINE_SEPARATOR );
478         final double population_sd = stats_for_all_species.sampleStandardDeviation();
479         final double population_mean = stats_for_all_species.arithmeticMean();
480         for( final Species species : domain_lengths_table.getSpecies() ) {
481             final double x = domain_lengths_table.calculateMeanBasedStatisticsForSpecies( species ).arithmeticMean();
482             final double z = ( x - population_mean ) / population_sd;
483             out.write( species + "\t" + z );
484             out.write( ForesterUtil.LINE_SEPARATOR );
485         }
486         out.write( ForesterUtil.LINE_SEPARATOR );
487         for( final Species species : domain_lengths_table.getSpecies() ) {
488             final DescriptiveStatistics stats_for_species = domain_lengths_table
489                     .calculateMeanBasedStatisticsForSpecies( species );
490             final double x = stats_for_species.arithmeticMean();
491             final double z = ( x - population_mean ) / population_sd;
492             if ( ( z <= -1.0 ) || ( z >= 1.0 ) ) {
493                 out.write( species + "\t" + df.format( z ) + "\t" + stats_for_species.asSummary() );
494                 out.write( ForesterUtil.LINE_SEPARATOR );
495             }
496         }
497         out.close();
498         System.gc();
499     }
500
501     /**
502      * 
503      * @param all_binary_domains_combination_lost_fitch 
504      * @param use_last_in_fitch_parsimony 
505      * @param consider_directedness_and_adjacency_for_bin_combinations 
506      * @param all_binary_domains_combination_gained if null ignored, otherwise this is to list all binary domain combinations
507      * which were gained under unweighted (Fitch) parsimony.
508      */
509     public static void executeParsimonyAnalysis( final long random_number_seed_for_fitch_parsimony,
510                                                  final boolean radomize_fitch_parsimony,
511                                                  final String outfile_name,
512                                                  final DomainParsimonyCalculator domain_parsimony,
513                                                  final Phylogeny phylogeny,
514                                                  final Map<String, List<GoId>> domain_id_to_go_ids_map,
515                                                  final Map<GoId, GoTerm> go_id_to_term_map,
516                                                  final GoNameSpace go_namespace_limit,
517                                                  final String parameters_str,
518                                                  final Map<String, Set<String>>[] domain_id_to_secondary_features_maps,
519                                                  final SortedSet<String> positive_filter,
520                                                  final boolean output_binary_domain_combinations_for_graphs,
521                                                  final List<BinaryDomainCombination> all_binary_domains_combination_gained_fitch,
522                                                  final List<BinaryDomainCombination> all_binary_domains_combination_lost_fitch,
523                                                  final BinaryDomainCombination.DomainCombinationType dc_type,
524                                                  final Map<String, DescriptiveStatistics> protein_length_stats_by_dc,
525                                                  final Map<String, DescriptiveStatistics> domain_number_stats_by_dc,
526                                                  final Map<String, DescriptiveStatistics> domain_length_stats_by_domain,
527                                                  final Map<String, Integer> tax_code_to_id_map,
528                                                  final boolean write_to_nexus,
529                                                  final boolean use_last_in_fitch_parsimony ) {
530         final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR;
531         final String date_time = ForesterUtil.getCurrentDateTime();
532         final SortedSet<String> all_pfams_encountered = new TreeSet<String>();
533         final SortedSet<String> all_pfams_gained_as_domains = new TreeSet<String>();
534         final SortedSet<String> all_pfams_lost_as_domains = new TreeSet<String>();
535         final SortedSet<String> all_pfams_gained_as_dom_combinations = new TreeSet<String>();
536         final SortedSet<String> all_pfams_lost_as_dom_combinations = new TreeSet<String>();
537         if ( write_to_nexus ) {
538             writeToNexus( outfile_name, domain_parsimony, phylogeny );
539         }
540         // DOLLO DOMAINS
541         // -------------
542         Phylogeny local_phylogeny_l = phylogeny.copy();
543         if ( ( positive_filter != null ) && ( positive_filter.size() > 0 ) ) {
544             domain_parsimony.executeDolloParsimonyOnDomainPresence( positive_filter );
545         }
546         else {
547             domain_parsimony.executeDolloParsimonyOnDomainPresence();
548         }
549         SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossMatrix(), outfile_name
550                 + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_DOMAINS, Format.FORESTER );
551         SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossCountsMatrix(), outfile_name
552                 + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_DOMAINS, Format.FORESTER );
553         SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
554                                                            CharacterStateMatrix.GainLossStates.GAIN,
555                                                            outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_D,
556                                                            sep,
557                                                            ForesterUtil.LINE_SEPARATOR,
558                                                            null );
559         SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
560                                                            CharacterStateMatrix.GainLossStates.LOSS,
561                                                            outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_LOSSES_D,
562                                                            sep,
563                                                            ForesterUtil.LINE_SEPARATOR,
564                                                            null );
565         SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), null, outfile_name
566                 + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_D, sep, ForesterUtil.LINE_SEPARATOR, null );
567         //HTML:
568         writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
569                                        go_id_to_term_map,
570                                        go_namespace_limit,
571                                        false,
572                                        domain_parsimony.getGainLossMatrix(),
573                                        CharacterStateMatrix.GainLossStates.GAIN,
574                                        outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_HTML_D,
575                                        sep,
576                                        ForesterUtil.LINE_SEPARATOR,
577                                        "Dollo Parsimony | Gains | Domains",
578                                        "+",
579                                        domain_id_to_secondary_features_maps,
580                                        all_pfams_encountered,
581                                        all_pfams_gained_as_domains,
582                                        "_dollo_gains_d",
583                                        tax_code_to_id_map );
584         writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
585                                        go_id_to_term_map,
586                                        go_namespace_limit,
587                                        false,
588                                        domain_parsimony.getGainLossMatrix(),
589                                        CharacterStateMatrix.GainLossStates.LOSS,
590                                        outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_LOSSES_HTML_D,
591                                        sep,
592                                        ForesterUtil.LINE_SEPARATOR,
593                                        "Dollo Parsimony | Losses | Domains",
594                                        "-",
595                                        domain_id_to_secondary_features_maps,
596                                        all_pfams_encountered,
597                                        all_pfams_lost_as_domains,
598                                        "_dollo_losses_d",
599                                        tax_code_to_id_map );
600         //        writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
601         //                                       go_id_to_term_map,
602         //                                       go_namespace_limit,
603         //                                       false,
604         //                                       domain_parsimony.getGainLossMatrix(),
605         //                                       null,
606         //                                       outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_HTML_D,
607         //                                       sep,
608         //                                       ForesterUtil.LINE_SEPARATOR,
609         //                                       "Dollo Parsimony | Present | Domains",
610         //                                       "",
611         //                                       domain_id_to_secondary_features_maps,
612         //                                       all_pfams_encountered,
613         //                                       null,
614         //                                       "_dollo_present_d",
615         //                                       tax_code_to_id_map );
616         preparePhylogeny( local_phylogeny_l,
617                           domain_parsimony,
618                           date_time,
619                           "Dollo parsimony on domain presence/absence",
620                           "dollo_on_domains_" + outfile_name,
621                           parameters_str );
622         SurfacingUtil.writePhylogenyToFile( local_phylogeny_l, outfile_name
623                 + surfacing.DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO );
624         try {
625             writeAllDomainsChangedOnAllSubtrees( local_phylogeny_l, true, outfile_name, "_dollo_all_gains_d" );
626             writeAllDomainsChangedOnAllSubtrees( local_phylogeny_l, false, outfile_name, "_dollo_all_losses_d" );
627         }
628         catch ( final IOException e ) {
629             e.printStackTrace();
630             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
631         }
632         if ( domain_parsimony.calculateNumberOfBinaryDomainCombination() > 0 ) {
633             // FITCH DOMAIN COMBINATIONS
634             // -------------------------
635             local_phylogeny_l = phylogeny.copy();
636             String randomization = "no";
637             if ( radomize_fitch_parsimony ) {
638                 domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( random_number_seed_for_fitch_parsimony );
639                 randomization = "yes, seed = " + random_number_seed_for_fitch_parsimony;
640             }
641             else {
642                 domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( use_last_in_fitch_parsimony );
643             }
644             SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossMatrix(), outfile_name
645                     + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER );
646             SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossCountsMatrix(), outfile_name
647                     + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER );
648             SurfacingUtil
649                     .writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
650                                                           CharacterStateMatrix.GainLossStates.GAIN,
651                                                           outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_GAINS_BC,
652                                                           sep,
653                                                           ForesterUtil.LINE_SEPARATOR,
654                                                           null );
655             SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
656                                                                CharacterStateMatrix.GainLossStates.LOSS,
657                                                                outfile_name
658                                                                        + surfacing.PARSIMONY_OUTPUT_FITCH_LOSSES_BC,
659                                                                sep,
660                                                                ForesterUtil.LINE_SEPARATOR,
661                                                                null );
662             SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), null, outfile_name
663                     + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_BC, sep, ForesterUtil.LINE_SEPARATOR, null );
664             if ( all_binary_domains_combination_gained_fitch != null ) {
665                 collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
666                                                                                     dc_type,
667                                                                                     all_binary_domains_combination_gained_fitch,
668                                                                                     true );
669             }
670             if ( all_binary_domains_combination_lost_fitch != null ) {
671                 collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
672                                                                                     dc_type,
673                                                                                     all_binary_domains_combination_lost_fitch,
674                                                                                     false );
675             }
676             if ( output_binary_domain_combinations_for_graphs ) {
677                 SurfacingUtil
678                         .writeBinaryStatesMatrixAsListToFileForBinaryCombinationsForGraphAnalysis( domain_parsimony
679                                                                                                            .getGainLossMatrix(),
680                                                                                                    null,
681                                                                                                    outfile_name
682                                                                                                            + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_BC_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS,
683                                                                                                    sep,
684                                                                                                    ForesterUtil.LINE_SEPARATOR,
685                                                                                                    BinaryDomainCombination.OutputFormat.DOT );
686             }
687             // HTML:
688             writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
689                                            go_id_to_term_map,
690                                            go_namespace_limit,
691                                            true,
692                                            domain_parsimony.getGainLossMatrix(),
693                                            CharacterStateMatrix.GainLossStates.GAIN,
694                                            outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_GAINS_HTML_BC,
695                                            sep,
696                                            ForesterUtil.LINE_SEPARATOR,
697                                            "Fitch Parsimony | Gains | Domain Combinations",
698                                            "+",
699                                            null,
700                                            all_pfams_encountered,
701                                            all_pfams_gained_as_dom_combinations,
702                                            "_fitch_gains_dc",
703                                            tax_code_to_id_map );
704             writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
705                                            go_id_to_term_map,
706                                            go_namespace_limit,
707                                            true,
708                                            domain_parsimony.getGainLossMatrix(),
709                                            CharacterStateMatrix.GainLossStates.LOSS,
710                                            outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_LOSSES_HTML_BC,
711                                            sep,
712                                            ForesterUtil.LINE_SEPARATOR,
713                                            "Fitch Parsimony | Losses | Domain Combinations",
714                                            "-",
715                                            null,
716                                            all_pfams_encountered,
717                                            all_pfams_lost_as_dom_combinations,
718                                            "_fitch_losses_dc",
719                                            tax_code_to_id_map );
720             //            writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
721             //                                           go_id_to_term_map,
722             //                                           go_namespace_limit,
723             //                                           true,
724             //                                           domain_parsimony.getGainLossMatrix(),
725             //                                           null,
726             //                                           outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_HTML_BC,
727             //                                           sep,
728             //                                           ForesterUtil.LINE_SEPARATOR,
729             //                                           "Fitch Parsimony | Present | Domain Combinations",
730             //                                           "",
731             //                                           null,
732             //                                           all_pfams_encountered,
733             //                                           null,
734             //                                           "_fitch_present_dc",
735             //                                           tax_code_to_id_map );
736             writeAllEncounteredPfamsToFile( domain_id_to_go_ids_map,
737                                             go_id_to_term_map,
738                                             outfile_name,
739                                             all_pfams_encountered );
740             writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_GAINED_AS_DOMAINS_SUFFIX, all_pfams_gained_as_domains );
741             writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_LOST_AS_DOMAINS_SUFFIX, all_pfams_lost_as_domains );
742             writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_GAINED_AS_DC_SUFFIX,
743                               all_pfams_gained_as_dom_combinations );
744             writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_LOST_AS_DC_SUFFIX, all_pfams_lost_as_dom_combinations );
745             preparePhylogeny( local_phylogeny_l,
746                               domain_parsimony,
747                               date_time,
748                               "Fitch parsimony on binary domain combination presence/absence randomization: "
749                                       + randomization,
750                               "fitch_on_binary_domain_combinations_" + outfile_name,
751                               parameters_str );
752             SurfacingUtil.writePhylogenyToFile( local_phylogeny_l, outfile_name
753                     + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH );
754             calculateIndependentDomainCombinationGains( local_phylogeny_l,
755                                                         outfile_name
756                                                                 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX,
757                                                         outfile_name
758                                                                 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX,
759                                                         outfile_name
760                                                                 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX,
761                                                         outfile_name
762                                                                 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX,
763                                                         outfile_name + "_indep_dc_gains_fitch_lca_ranks.txt",
764                                                         outfile_name + "_indep_dc_gains_fitch_lca_taxonomies.txt",
765                                                         outfile_name + "_indep_dc_gains_fitch_protein_statistics.txt",
766                                                         protein_length_stats_by_dc,
767                                                         domain_number_stats_by_dc,
768                                                         domain_length_stats_by_domain );
769         }
770     }
771
772     public static void executeParsimonyAnalysisForSecondaryFeatures( final String outfile_name,
773                                                                      final DomainParsimonyCalculator secondary_features_parsimony,
774                                                                      final Phylogeny phylogeny,
775                                                                      final String parameters_str,
776                                                                      final Map<Species, MappingResults> mapping_results_map,
777                                                                      final boolean use_last_in_fitch_parsimony ) {
778         final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR;
779         final String date_time = ForesterUtil.getCurrentDateTime();
780         System.out.println();
781         writeToNexus( outfile_name + surfacing.NEXUS_SECONDARY_FEATURES,
782                       secondary_features_parsimony.createMatrixOfSecondaryFeaturePresenceOrAbsence( null ),
783                       phylogeny );
784         Phylogeny local_phylogeny_copy = phylogeny.copy();
785         secondary_features_parsimony.executeDolloParsimonyOnSecondaryFeatures( mapping_results_map );
786         SurfacingUtil.writeMatrixToFile( secondary_features_parsimony.getGainLossMatrix(), outfile_name
787                 + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER );
788         SurfacingUtil.writeMatrixToFile( secondary_features_parsimony.getGainLossCountsMatrix(), outfile_name
789                 + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER );
790         SurfacingUtil
791                 .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(),
792                                                       CharacterStateMatrix.GainLossStates.GAIN,
793                                                       outfile_name
794                                                               + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_SECONDARY_FEATURES,
795                                                       sep,
796                                                       ForesterUtil.LINE_SEPARATOR,
797                                                       null );
798         SurfacingUtil
799                 .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(),
800                                                       CharacterStateMatrix.GainLossStates.LOSS,
801                                                       outfile_name
802                                                               + surfacing.PARSIMONY_OUTPUT_DOLLO_LOSSES_SECONDARY_FEATURES,
803                                                       sep,
804                                                       ForesterUtil.LINE_SEPARATOR,
805                                                       null );
806         SurfacingUtil
807                 .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(),
808                                                       null,
809                                                       outfile_name
810                                                               + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_SECONDARY_FEATURES,
811                                                       sep,
812                                                       ForesterUtil.LINE_SEPARATOR,
813                                                       null );
814         preparePhylogeny( local_phylogeny_copy,
815                           secondary_features_parsimony,
816                           date_time,
817                           "Dollo parsimony on secondary feature presence/absence",
818                           "dollo_on_secondary_features_" + outfile_name,
819                           parameters_str );
820         SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name
821                 + surfacing.SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO );
822         // FITCH DOMAIN COMBINATIONS
823         // -------------------------
824         local_phylogeny_copy = phylogeny.copy();
825         final String randomization = "no";
826         secondary_features_parsimony
827                 .executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( use_last_in_fitch_parsimony );
828         preparePhylogeny( local_phylogeny_copy,
829                           secondary_features_parsimony,
830                           date_time,
831                           "Fitch parsimony on secondary binary domain combination presence/absence randomization: "
832                                   + randomization,
833                           "fitch_on_binary_domain_combinations_" + outfile_name,
834                           parameters_str );
835         SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name
836                 + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH_MAPPED );
837         calculateIndependentDomainCombinationGains( local_phylogeny_copy, outfile_name
838                 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_MAPPED_OUTPUT_SUFFIX, outfile_name
839                 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX, outfile_name
840                 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX, outfile_name
841                 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX, outfile_name
842                 + "_MAPPED_indep_dc_gains_fitch_lca_ranks.txt", outfile_name
843                 + "_MAPPED_indep_dc_gains_fitch_lca_taxonomies.txt", null, null, null, null );
844     }
845
846     public static void extractProteinNames( final List<Protein> proteins,
847                                             final List<String> query_domain_ids_nc_order,
848                                             final Writer out,
849                                             final String separator,
850                                             final String limit_to_species ) throws IOException {
851         for( final Protein protein : proteins ) {
852             if ( ForesterUtil.isEmpty( limit_to_species )
853                     || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) {
854                 if ( protein.contains( query_domain_ids_nc_order, true ) ) {
855                     out.write( protein.getSpecies().getSpeciesId() );
856                     out.write( separator );
857                     out.write( protein.getProteinId().getId() );
858                     out.write( separator );
859                     out.write( "[" );
860                     final Set<String> visited_domain_ids = new HashSet<String>();
861                     boolean first = true;
862                     for( final Domain domain : protein.getProteinDomains() ) {
863                         if ( !visited_domain_ids.contains( domain.getDomainId() ) ) {
864                             visited_domain_ids.add( domain.getDomainId() );
865                             if ( first ) {
866                                 first = false;
867                             }
868                             else {
869                                 out.write( " " );
870                             }
871                             out.write( domain.getDomainId() );
872                             out.write( " {" );
873                             out.write( "" + domain.getTotalCount() );
874                             out.write( "}" );
875                         }
876                     }
877                     out.write( "]" );
878                     out.write( separator );
879                     if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription()
880                             .equals( SurfacingConstants.NONE ) ) ) {
881                         out.write( protein.getDescription() );
882                     }
883                     out.write( separator );
884                     if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession()
885                             .equals( SurfacingConstants.NONE ) ) ) {
886                         out.write( protein.getAccession() );
887                     }
888                     out.write( SurfacingConstants.NL );
889                 }
890             }
891         }
892         out.flush();
893     }
894
895     public static void extractProteinNames( final SortedMap<Species, List<Protein>> protein_lists_per_species,
896                                             final String domain_id,
897                                             final Writer out,
898                                             final String separator,
899                                             final String limit_to_species,
900                                             final double domain_e_cutoff ) throws IOException {
901         System.out.println( "Per domain E-value: " + domain_e_cutoff );
902         for( final Species species : protein_lists_per_species.keySet() ) {
903             System.out.println( species + ":" );
904             for( final Protein protein : protein_lists_per_species.get( species ) ) {
905                 if ( ForesterUtil.isEmpty( limit_to_species )
906                         || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) {
907                     final List<Domain> domains = protein.getProteinDomains( domain_id );
908                     if ( domains.size() > 0 ) {
909                         out.write( protein.getSpecies().getSpeciesId() );
910                         out.write( separator );
911                         out.write( protein.getProteinId().getId() );
912                         out.write( separator );
913                         out.write( domain_id.toString() );
914                         out.write( separator );
915                         int prev_to = -1;
916                         for( final Domain domain : domains ) {
917                             if ( ( domain_e_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= domain_e_cutoff ) ) {
918                                 out.write( "/" );
919                                 out.write( domain.getFrom() + "-" + domain.getTo() );
920                                 if ( prev_to >= 0 ) {
921                                     final int l = domain.getFrom() - prev_to;
922                                     System.out.println( l );
923                                 }
924                                 prev_to = domain.getTo();
925                             }
926                         }
927                         out.write( "/" );
928                         out.write( separator );
929                         final List<Domain> domain_list = new ArrayList<Domain>();
930                         for( final Domain domain : protein.getProteinDomains() ) {
931                             if ( ( domain_e_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= domain_e_cutoff ) ) {
932                                 domain_list.add( domain );
933                             }
934                         }
935                         final Domain domain_ary[] = new Domain[ domain_list.size() ];
936                         for( int i = 0; i < domain_list.size(); ++i ) {
937                             domain_ary[ i ] = domain_list.get( i );
938                         }
939                         Arrays.sort( domain_ary, new DomainComparator( true ) );
940                         out.write( "{" );
941                         boolean first = true;
942                         for( final Domain domain : domain_ary ) {
943                             if ( first ) {
944                                 first = false;
945                             }
946                             else {
947                                 out.write( "," );
948                             }
949                             out.write( domain.getDomainId().toString() );
950                             out.write( ":" + domain.getFrom() + "-" + domain.getTo() );
951                             out.write( ":" + domain.getPerDomainEvalue() );
952                         }
953                         out.write( "}" );
954                         if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription()
955                                 .equals( SurfacingConstants.NONE ) ) ) {
956                             out.write( protein.getDescription() );
957                         }
958                         out.write( separator );
959                         if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession()
960                                 .equals( SurfacingConstants.NONE ) ) ) {
961                             out.write( protein.getAccession() );
962                         }
963                         out.write( SurfacingConstants.NL );
964                     }
965                 }
966             }
967         }
968         out.flush();
969     }
970
971     public static SortedSet<String> getAllDomainIds( final List<GenomeWideCombinableDomains> gwcd_list ) {
972         final SortedSet<String> all_domains_ids = new TreeSet<String>();
973         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
974             final Set<String> all_domains = gwcd.getAllDomainIds();
975             //    for( final Domain domain : all_domains ) {
976             all_domains_ids.addAll( all_domains );
977             //    }
978         }
979         return all_domains_ids;
980     }
981
982     public static SortedMap<String, Integer> getDomainCounts( final List<Protein> protein_domain_collections ) {
983         final SortedMap<String, Integer> map = new TreeMap<String, Integer>();
984         for( final Protein protein_domain_collection : protein_domain_collections ) {
985             for( final Object name : protein_domain_collection.getProteinDomains() ) {
986                 final BasicDomain protein_domain = ( BasicDomain ) name;
987                 final String id = protein_domain.getDomainId();
988                 if ( map.containsKey( id ) ) {
989                     map.put( id, map.get( id ) + 1 );
990                 }
991                 else {
992                     map.put( id, 1 );
993                 }
994             }
995         }
996         return map;
997     }
998
999     public static int getNumberOfNodesLackingName( final Phylogeny p, final StringBuilder names ) {
1000         final PhylogenyNodeIterator it = p.iteratorPostorder();
1001         int c = 0;
1002         while ( it.hasNext() ) {
1003             final PhylogenyNode n = it.next();
1004             if ( ForesterUtil.isEmpty( n.getName() )
1005                     && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy()
1006                             .getScientificName() ) )
1007                     && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy()
1008                             .getCommonName() ) ) ) {
1009                 if ( n.getParent() != null ) {
1010                     names.append( " " );
1011                     names.append( n.getParent().getName() );
1012                 }
1013                 final List l = n.getAllExternalDescendants();
1014                 for( final Object object : l ) {
1015                     System.out.println( l.toString() );
1016                 }
1017                 ++c;
1018             }
1019         }
1020         return c;
1021     }
1022
1023     public static void performDomainArchitectureAnalysis( final SortedMap<String, Set<String>> domain_architecutures,
1024                                                           final SortedMap<String, Integer> domain_architecuture_counts,
1025                                                           final int min_count,
1026                                                           final File da_counts_outfile,
1027                                                           final File unique_da_outfile ) {
1028         checkForOutputFileWriteability( da_counts_outfile );
1029         checkForOutputFileWriteability( unique_da_outfile );
1030         try {
1031             final BufferedWriter da_counts_out = new BufferedWriter( new FileWriter( da_counts_outfile ) );
1032             final BufferedWriter unique_da_out = new BufferedWriter( new FileWriter( unique_da_outfile ) );
1033             final Iterator<Entry<String, Integer>> it = domain_architecuture_counts.entrySet().iterator();
1034             while ( it.hasNext() ) {
1035                 final Map.Entry<String, Integer> e = it.next();
1036                 final String da = e.getKey();
1037                 final int count = e.getValue();
1038                 if ( count >= min_count ) {
1039                     da_counts_out.write( da );
1040                     da_counts_out.write( "\t" );
1041                     da_counts_out.write( String.valueOf( count ) );
1042                     da_counts_out.write( ForesterUtil.LINE_SEPARATOR );
1043                 }
1044                 if ( count == 1 ) {
1045                     final Iterator<Entry<String, Set<String>>> it2 = domain_architecutures.entrySet().iterator();
1046                     while ( it2.hasNext() ) {
1047                         final Map.Entry<String, Set<String>> e2 = it2.next();
1048                         final String genome = e2.getKey();
1049                         final Set<String> das = e2.getValue();
1050                         if ( das.contains( da ) ) {
1051                             unique_da_out.write( genome );
1052                             unique_da_out.write( "\t" );
1053                             unique_da_out.write( da );
1054                             unique_da_out.write( ForesterUtil.LINE_SEPARATOR );
1055                         }
1056                     }
1057                 }
1058             }
1059             unique_da_out.close();
1060             da_counts_out.close();
1061         }
1062         catch ( final IOException e ) {
1063             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1064         }
1065         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + da_counts_outfile + "\"" );
1066         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + unique_da_outfile + "\"" );
1067         //
1068     }
1069
1070     public static void preparePhylogeny( final Phylogeny p,
1071                                          final DomainParsimonyCalculator domain_parsimony,
1072                                          final String date_time,
1073                                          final String method,
1074                                          final String name,
1075                                          final String parameters_str ) {
1076         domain_parsimony.decoratePhylogenyWithDomains( p );
1077         final StringBuilder desc = new StringBuilder();
1078         desc.append( "[Method: " + method + "] [Date: " + date_time + "] " );
1079         desc.append( "[Cost: " + domain_parsimony.getCost() + "] " );
1080         desc.append( "[Gains: " + domain_parsimony.getTotalGains() + "] " );
1081         desc.append( "[Losses: " + domain_parsimony.getTotalLosses() + "] " );
1082         desc.append( "[Unchanged: " + domain_parsimony.getTotalUnchanged() + "] " );
1083         desc.append( "[Parameters: " + parameters_str + "]" );
1084         p.setName( name );
1085         p.setDescription( desc.toString() );
1086         p.setConfidence( new Confidence( domain_parsimony.getCost(), "parsimony" ) );
1087         p.setRerootable( false );
1088         p.setRooted( true );
1089     }
1090
1091     /*
1092      * species | protein id | n-terminal domain | c-terminal domain | n-terminal domain per domain E-value | c-terminal domain per domain E-value
1093      * 
1094      * 
1095      */
1096     static public StringBuffer proteinToDomainCombinations( final Protein protein,
1097                                                             final String protein_id,
1098                                                             final String separator ) {
1099         final StringBuffer sb = new StringBuffer();
1100         if ( protein.getSpecies() == null ) {
1101             throw new IllegalArgumentException( "species must not be null" );
1102         }
1103         if ( ForesterUtil.isEmpty( protein.getSpecies().getSpeciesId() ) ) {
1104             throw new IllegalArgumentException( "species id must not be empty" );
1105         }
1106         final List<Domain> domains = protein.getProteinDomains();
1107         if ( domains.size() > 1 ) {
1108             final Map<String, Integer> counts = new HashMap<String, Integer>();
1109             for( final Domain domain : domains ) {
1110                 final String id = domain.getDomainId();
1111                 if ( counts.containsKey( id ) ) {
1112                     counts.put( id, counts.get( id ) + 1 );
1113                 }
1114                 else {
1115                     counts.put( id, 1 );
1116                 }
1117             }
1118             final Set<String> dcs = new HashSet<String>();
1119             for( int i = 1; i < domains.size(); ++i ) {
1120                 for( int j = 0; j < i; ++j ) {
1121                     Domain domain_n = domains.get( i );
1122                     Domain domain_c = domains.get( j );
1123                     if ( domain_n.getFrom() > domain_c.getFrom() ) {
1124                         domain_n = domains.get( j );
1125                         domain_c = domains.get( i );
1126                     }
1127                     final String dc = domain_n.getDomainId() + domain_c.getDomainId();
1128                     if ( !dcs.contains( dc ) ) {
1129                         dcs.add( dc );
1130                         sb.append( protein.getSpecies() );
1131                         sb.append( separator );
1132                         sb.append( protein_id );
1133                         sb.append( separator );
1134                         sb.append( domain_n.getDomainId() );
1135                         sb.append( separator );
1136                         sb.append( domain_c.getDomainId() );
1137                         sb.append( separator );
1138                         sb.append( domain_n.getPerDomainEvalue() );
1139                         sb.append( separator );
1140                         sb.append( domain_c.getPerDomainEvalue() );
1141                         sb.append( separator );
1142                         sb.append( counts.get( domain_n.getDomainId() ) );
1143                         sb.append( separator );
1144                         sb.append( counts.get( domain_c.getDomainId() ) );
1145                         sb.append( ForesterUtil.LINE_SEPARATOR );
1146                     }
1147                 }
1148             }
1149         }
1150         else if ( domains.size() == 1 ) {
1151             sb.append( protein.getSpecies() );
1152             sb.append( separator );
1153             sb.append( protein_id );
1154             sb.append( separator );
1155             sb.append( domains.get( 0 ).getDomainId() );
1156             sb.append( separator );
1157             sb.append( separator );
1158             sb.append( domains.get( 0 ).getPerDomainEvalue() );
1159             sb.append( separator );
1160             sb.append( separator );
1161             sb.append( 1 );
1162             sb.append( separator );
1163             sb.append( ForesterUtil.LINE_SEPARATOR );
1164         }
1165         else {
1166             sb.append( protein.getSpecies() );
1167             sb.append( separator );
1168             sb.append( protein_id );
1169             sb.append( separator );
1170             sb.append( separator );
1171             sb.append( separator );
1172             sb.append( separator );
1173             sb.append( separator );
1174             sb.append( separator );
1175             sb.append( ForesterUtil.LINE_SEPARATOR );
1176         }
1177         return sb;
1178     }
1179
1180     public static List<Domain> sortDomainsWithAscendingConfidenceValues( final Protein protein ) {
1181         final List<Domain> domains = new ArrayList<Domain>();
1182         for( final Domain d : protein.getProteinDomains() ) {
1183             domains.add( d );
1184         }
1185         Collections.sort( domains, SurfacingUtil.ASCENDING_CONFIDENCE_VALUE_ORDER );
1186         return domains;
1187     }
1188
1189     public static int storeDomainArchitectures( final String genome,
1190                                                 final SortedMap<String, Set<String>> domain_architecutures,
1191                                                 final List<Protein> protein_list,
1192                                                 final Map<String, Integer> distinct_domain_architecuture_counts ) {
1193         final Set<String> da = new HashSet<String>();
1194         domain_architecutures.put( genome, da );
1195         for( final Protein protein : protein_list ) {
1196             final String da_str = ( ( BasicProtein ) protein ).toDomainArchitectureString( "~", 3, "=" );
1197             if ( !da.contains( da_str ) ) {
1198                 if ( !distinct_domain_architecuture_counts.containsKey( da_str ) ) {
1199                     distinct_domain_architecuture_counts.put( da_str, 1 );
1200                 }
1201                 else {
1202                     distinct_domain_architecuture_counts.put( da_str,
1203                                                               distinct_domain_architecuture_counts.get( da_str ) + 1 );
1204                 }
1205                 da.add( da_str );
1206             }
1207         }
1208         return da.size();
1209     }
1210
1211     public static void writeAllDomainsChangedOnAllSubtrees( final Phylogeny p,
1212                                                             final boolean get_gains,
1213                                                             final String outdir,
1214                                                             final String suffix_for_filename ) throws IOException {
1215         CharacterStateMatrix.GainLossStates state = CharacterStateMatrix.GainLossStates.GAIN;
1216         if ( !get_gains ) {
1217             state = CharacterStateMatrix.GainLossStates.LOSS;
1218         }
1219         final File base_dir = createBaseDirForPerNodeDomainFiles( surfacing.BASE_DIRECTORY_PER_SUBTREE_DOMAIN_GAIN_LOSS_FILES,
1220                                                                   false,
1221                                                                   state,
1222                                                                   outdir );
1223         for( final PhylogenyNodeIterator it = p.iteratorPostorder(); it.hasNext(); ) {
1224             final PhylogenyNode node = it.next();
1225             if ( !node.isExternal() ) {
1226                 final SortedSet<String> domains = collectAllDomainsChangedOnSubtree( node, get_gains );
1227                 if ( domains.size() > 0 ) {
1228                     final Writer writer = ForesterUtil.createBufferedWriter( base_dir + ForesterUtil.FILE_SEPARATOR
1229                             + node.getName() + suffix_for_filename );
1230                     for( final String domain : domains ) {
1231                         writer.write( domain );
1232                         writer.write( ForesterUtil.LINE_SEPARATOR );
1233                     }
1234                     writer.close();
1235                 }
1236             }
1237         }
1238     }
1239
1240     public static void writeBinaryDomainCombinationsFileForGraphAnalysis( final String[][] input_file_properties,
1241                                                                           final File output_dir,
1242                                                                           final GenomeWideCombinableDomains gwcd,
1243                                                                           final int i,
1244                                                                           final GenomeWideCombinableDomainsSortOrder dc_sort_order ) {
1245         File dc_outfile_dot = new File( input_file_properties[ i ][ 1 ]
1246                 + surfacing.DOMAIN_COMBINITONS_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS );
1247         if ( output_dir != null ) {
1248             dc_outfile_dot = new File( output_dir + ForesterUtil.FILE_SEPARATOR + dc_outfile_dot );
1249         }
1250         checkForOutputFileWriteability( dc_outfile_dot );
1251         final SortedSet<BinaryDomainCombination> binary_combinations = createSetOfAllBinaryDomainCombinationsPerGenome( gwcd );
1252         try {
1253             final BufferedWriter out_dot = new BufferedWriter( new FileWriter( dc_outfile_dot ) );
1254             for( final BinaryDomainCombination bdc : binary_combinations ) {
1255                 out_dot.write( bdc.toGraphDescribingLanguage( BinaryDomainCombination.OutputFormat.DOT, null, null )
1256                         .toString() );
1257                 out_dot.write( SurfacingConstants.NL );
1258             }
1259             out_dot.close();
1260         }
1261         catch ( final IOException e ) {
1262             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1263         }
1264         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote binary domain combination for \""
1265                 + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", "
1266                 + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile_dot + "\"" );
1267     }
1268
1269     public static void writeBinaryStatesMatrixAsListToFile( final CharacterStateMatrix<CharacterStateMatrix.GainLossStates> matrix,
1270                                                             final CharacterStateMatrix.GainLossStates state,
1271                                                             final String filename,
1272                                                             final String indentifier_characters_separator,
1273                                                             final String character_separator,
1274                                                             final Map<String, String> descriptions ) {
1275         final File outfile = new File( filename );
1276         checkForOutputFileWriteability( outfile );
1277         final SortedSet<String> sorted_ids = new TreeSet<String>();
1278         for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) {
1279             sorted_ids.add( matrix.getIdentifier( i ) );
1280         }
1281         try {
1282             final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
1283             for( final String id : sorted_ids ) {
1284                 out.write( indentifier_characters_separator );
1285                 out.write( "#" + id );
1286                 out.write( indentifier_characters_separator );
1287                 for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) {
1288                     // Not nice:
1289                     // using null to indicate either UNCHANGED_PRESENT or GAIN.
1290                     if ( ( matrix.getState( id, c ) == state )
1291                             || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) || ( matrix
1292                                     .getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) ) ) ) {
1293                         out.write( matrix.getCharacter( c ) );
1294                         if ( ( descriptions != null ) && !descriptions.isEmpty()
1295                                 && descriptions.containsKey( matrix.getCharacter( c ) ) ) {
1296                             out.write( "\t" );
1297                             out.write( descriptions.get( matrix.getCharacter( c ) ) );
1298                         }
1299                         out.write( character_separator );
1300                     }
1301                 }
1302             }
1303             out.flush();
1304             out.close();
1305         }
1306         catch ( final IOException e ) {
1307             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1308         }
1309         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters list: \"" + filename + "\"" );
1310     }
1311
1312     public static void writeBinaryStatesMatrixAsListToFileForBinaryCombinationsForGraphAnalysis( final CharacterStateMatrix<CharacterStateMatrix.GainLossStates> matrix,
1313                                                                                                  final CharacterStateMatrix.GainLossStates state,
1314                                                                                                  final String filename,
1315                                                                                                  final String indentifier_characters_separator,
1316                                                                                                  final String character_separator,
1317                                                                                                  final BinaryDomainCombination.OutputFormat bc_output_format ) {
1318         final File outfile = new File( filename );
1319         checkForOutputFileWriteability( outfile );
1320         final SortedSet<String> sorted_ids = new TreeSet<String>();
1321         for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) {
1322             sorted_ids.add( matrix.getIdentifier( i ) );
1323         }
1324         try {
1325             final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
1326             for( final String id : sorted_ids ) {
1327                 out.write( indentifier_characters_separator );
1328                 out.write( "#" + id );
1329                 out.write( indentifier_characters_separator );
1330                 for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) {
1331                     // Not nice:
1332                     // using null to indicate either UNCHANGED_PRESENT or GAIN.
1333                     if ( ( matrix.getState( id, c ) == state )
1334                             || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) || ( matrix
1335                                     .getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) ) ) ) {
1336                         BinaryDomainCombination bdc = null;
1337                         try {
1338                             bdc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( c ) );
1339                         }
1340                         catch ( final Exception e ) {
1341                             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
1342                         }
1343                         out.write( bdc.toGraphDescribingLanguage( bc_output_format, null, null ).toString() );
1344                         out.write( character_separator );
1345                     }
1346                 }
1347             }
1348             out.flush();
1349             out.close();
1350         }
1351         catch ( final IOException e ) {
1352             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1353         }
1354         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters list: \"" + filename + "\"" );
1355     }
1356
1357     public static void writeBinaryStatesMatrixToList( final Map<String, List<GoId>> domain_id_to_go_ids_map,
1358                                                       final Map<GoId, GoTerm> go_id_to_term_map,
1359                                                       final GoNameSpace go_namespace_limit,
1360                                                       final boolean domain_combinations,
1361                                                       final CharacterStateMatrix<CharacterStateMatrix.GainLossStates> matrix,
1362                                                       final CharacterStateMatrix.GainLossStates state,
1363                                                       final String filename,
1364                                                       final String indentifier_characters_separator,
1365                                                       final String character_separator,
1366                                                       final String title_for_html,
1367                                                       final String prefix_for_html,
1368                                                       final Map<String, Set<String>>[] domain_id_to_secondary_features_maps,
1369                                                       final SortedSet<String> all_pfams_encountered,
1370                                                       final SortedSet<String> pfams_gained_or_lost,
1371                                                       final String suffix_for_per_node_events_file,
1372                                                       final Map<String, Integer> tax_code_to_id_map ) {
1373         if ( ( go_namespace_limit != null ) && ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) {
1374             throw new IllegalArgumentException( "attempt to use GO namespace limit without a GO-id to term map" );
1375         }
1376         else if ( ( ( domain_id_to_go_ids_map == null ) || ( domain_id_to_go_ids_map.size() < 1 ) ) ) {
1377             throw new IllegalArgumentException( "attempt to output detailed HTML without a Pfam to GO map" );
1378         }
1379         else if ( ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) {
1380             throw new IllegalArgumentException( "attempt to output detailed HTML without a GO-id to term map" );
1381         }
1382         final File outfile = new File( filename );
1383         checkForOutputFileWriteability( outfile );
1384         final SortedSet<String> sorted_ids = new TreeSet<String>();
1385         for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) {
1386             sorted_ids.add( matrix.getIdentifier( i ) );
1387         }
1388         try {
1389             final Writer out = new BufferedWriter( new FileWriter( outfile ) );
1390             final File per_node_go_mapped_domain_gain_loss_files_base_dir = createBaseDirForPerNodeDomainFiles( surfacing.BASE_DIRECTORY_PER_NODE_DOMAIN_GAIN_LOSS_FILES,
1391                                                                                                                 domain_combinations,
1392                                                                                                                 state,
1393                                                                                                                 filename );
1394             Writer per_node_go_mapped_domain_gain_loss_outfile_writer = null;
1395             File per_node_go_mapped_domain_gain_loss_outfile = null;
1396             int per_node_counter = 0;
1397             out.write( "<html>" );
1398             out.write( SurfacingConstants.NL );
1399             addHtmlHead( out, title_for_html );
1400             out.write( SurfacingConstants.NL );
1401             out.write( "<body>" );
1402             out.write( SurfacingConstants.NL );
1403             out.write( "<h1>" );
1404             out.write( SurfacingConstants.NL );
1405             out.write( title_for_html );
1406             out.write( SurfacingConstants.NL );
1407             out.write( "</h1>" );
1408             out.write( SurfacingConstants.NL );
1409             out.write( "<table>" );
1410             out.write( SurfacingConstants.NL );
1411             for( final String id : sorted_ids ) {
1412                 final Matcher matcher = PATTERN_SP_STYLE_TAXONOMY.matcher( id );
1413                 if ( matcher.matches() ) {
1414                     continue;
1415                 }
1416                 out.write( "<tr>" );
1417                 out.write( "<td>" );
1418                 out.write( "<a href=\"#" + id + "\">" + id + "</a>" );
1419                 out.write( "</td>" );
1420                 out.write( "</tr>" );
1421                 out.write( SurfacingConstants.NL );
1422             }
1423             out.write( "</table>" );
1424             out.write( SurfacingConstants.NL );
1425             for( final String id : sorted_ids ) {
1426                 final Matcher matcher = PATTERN_SP_STYLE_TAXONOMY.matcher( id );
1427                 if ( matcher.matches() ) {
1428                     continue;
1429                 }
1430                 out.write( SurfacingConstants.NL );
1431                 out.write( "<h2>" );
1432                 out.write( "<a name=\"" + id + "\">" + id + "</a>" );
1433                 writeTaxonomyLinks( out, id, tax_code_to_id_map );
1434                 out.write( "</h2>" );
1435                 out.write( SurfacingConstants.NL );
1436                 out.write( "<table>" );
1437                 out.write( SurfacingConstants.NL );
1438                 out.write( "<tr>" );
1439                 out.write( "<td><b>" );
1440                 out.write( "Pfam domain(s)" );
1441                 out.write( "</b></td><td><b>" );
1442                 out.write( "GO term acc" );
1443                 out.write( "</b></td><td><b>" );
1444                 out.write( "GO term" );
1445                 out.write( "</b></td><td><b>" );
1446                 out.write( "GO namespace" );
1447                 out.write( "</b></td>" );
1448                 out.write( "</tr>" );
1449                 out.write( SurfacingConstants.NL );
1450                 out.write( "</tr>" );
1451                 out.write( SurfacingConstants.NL );
1452                 per_node_counter = 0;
1453                 if ( matrix.getNumberOfCharacters() > 0 ) {
1454                     per_node_go_mapped_domain_gain_loss_outfile = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
1455                             + ForesterUtil.FILE_SEPARATOR + id + suffix_for_per_node_events_file );
1456                     SurfacingUtil.checkForOutputFileWriteability( per_node_go_mapped_domain_gain_loss_outfile );
1457                     per_node_go_mapped_domain_gain_loss_outfile_writer = ForesterUtil
1458                             .createBufferedWriter( per_node_go_mapped_domain_gain_loss_outfile );
1459                 }
1460                 else {
1461                     per_node_go_mapped_domain_gain_loss_outfile = null;
1462                     per_node_go_mapped_domain_gain_loss_outfile_writer = null;
1463                 }
1464                 for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) {
1465                     // Not nice:
1466                     // using null to indicate either UNCHANGED_PRESENT or GAIN.
1467                     if ( ( matrix.getState( id, c ) == state )
1468                             || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) || ( matrix
1469                                     .getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) ) ) ) {
1470                         final String character = matrix.getCharacter( c );
1471                         String domain_0 = "";
1472                         String domain_1 = "";
1473                         if ( character.indexOf( BinaryDomainCombination.SEPARATOR ) > 0 ) {
1474                             final String[] s = character.split( BinaryDomainCombination.SEPARATOR );
1475                             if ( s.length != 2 ) {
1476                                 throw new AssertionError( "this should not have happened: unexpected format for domain combination: ["
1477                                         + character + "]" );
1478                             }
1479                             domain_0 = s[ 0 ];
1480                             domain_1 = s[ 1 ];
1481                         }
1482                         else {
1483                             domain_0 = character;
1484                         }
1485                         writeDomainData( domain_id_to_go_ids_map,
1486                                          go_id_to_term_map,
1487                                          go_namespace_limit,
1488                                          out,
1489                                          domain_0,
1490                                          domain_1,
1491                                          prefix_for_html,
1492                                          character_separator,
1493                                          domain_id_to_secondary_features_maps,
1494                                          null );
1495                         all_pfams_encountered.add( domain_0 );
1496                         if ( pfams_gained_or_lost != null ) {
1497                             pfams_gained_or_lost.add( domain_0 );
1498                         }
1499                         if ( !ForesterUtil.isEmpty( domain_1 ) ) {
1500                             all_pfams_encountered.add( domain_1 );
1501                             if ( pfams_gained_or_lost != null ) {
1502                                 pfams_gained_or_lost.add( domain_1 );
1503                             }
1504                         }
1505                         if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) {
1506                             writeDomainsToIndividualFilePerTreeNode( per_node_go_mapped_domain_gain_loss_outfile_writer,
1507                                                                      domain_0,
1508                                                                      domain_1 );
1509                             per_node_counter++;
1510                         }
1511                     }
1512                 }
1513                 if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) {
1514                     per_node_go_mapped_domain_gain_loss_outfile_writer.close();
1515                     if ( per_node_counter < 1 ) {
1516                         per_node_go_mapped_domain_gain_loss_outfile.delete();
1517                     }
1518                     per_node_counter = 0;
1519                 }
1520                 out.write( "</table>" );
1521                 out.write( SurfacingConstants.NL );
1522                 out.write( "<hr>" );
1523                 out.write( SurfacingConstants.NL );
1524             } // for( final String id : sorted_ids ) {  
1525             out.write( "</body>" );
1526             out.write( SurfacingConstants.NL );
1527             out.write( "</html>" );
1528             out.write( SurfacingConstants.NL );
1529             out.flush();
1530             out.close();
1531         }
1532         catch ( final IOException e ) {
1533             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1534         }
1535         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters detailed HTML list: \"" + filename + "\"" );
1536     }
1537
1538     public static void writeDomainCombinationsCountsFile( final String[][] input_file_properties,
1539                                                           final File output_dir,
1540                                                           final Writer per_genome_domain_promiscuity_statistics_writer,
1541                                                           final GenomeWideCombinableDomains gwcd,
1542                                                           final int i,
1543                                                           final GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder dc_sort_order ) {
1544         File dc_outfile = new File( input_file_properties[ i ][ 1 ]
1545                 + surfacing.DOMAIN_COMBINITON_COUNTS_OUTPUTFILE_SUFFIX );
1546         if ( output_dir != null ) {
1547             dc_outfile = new File( output_dir + ForesterUtil.FILE_SEPARATOR + dc_outfile );
1548         }
1549         checkForOutputFileWriteability( dc_outfile );
1550         try {
1551             final BufferedWriter out = new BufferedWriter( new FileWriter( dc_outfile ) );
1552             out.write( gwcd.toStringBuilder( dc_sort_order ).toString() );
1553             out.close();
1554         }
1555         catch ( final IOException e ) {
1556             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1557         }
1558         final DescriptiveStatistics stats = gwcd.getPerGenomeDomainPromiscuityStatistics();
1559         try {
1560             per_genome_domain_promiscuity_statistics_writer.write( input_file_properties[ i ][ 1 ] + "\t" );
1561             per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats.arithmeticMean() ) + "\t" );
1562             if ( stats.getN() < 2 ) {
1563                 per_genome_domain_promiscuity_statistics_writer.write( "n/a" + "\t" );
1564             }
1565             else {
1566                 per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats
1567                         .sampleStandardDeviation() ) + "\t" );
1568             }
1569             per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats.median() ) + "\t" );
1570             per_genome_domain_promiscuity_statistics_writer.write( ( int ) stats.getMin() + "\t" );
1571             per_genome_domain_promiscuity_statistics_writer.write( ( int ) stats.getMax() + "\t" );
1572             per_genome_domain_promiscuity_statistics_writer.write( stats.getN() + "\t" );
1573             final SortedSet<String> mpds = gwcd.getMostPromiscuosDomain();
1574             for( final String mpd : mpds ) {
1575                 per_genome_domain_promiscuity_statistics_writer.write( mpd + " " );
1576             }
1577             per_genome_domain_promiscuity_statistics_writer.write( ForesterUtil.LINE_SEPARATOR );
1578         }
1579         catch ( final IOException e ) {
1580             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1581         }
1582         if ( input_file_properties[ i ].length == 3 ) {
1583             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \""
1584                     + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", "
1585                     + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile + "\"" );
1586         }
1587         else {
1588             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \""
1589                     + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ") to: \""
1590                     + dc_outfile + "\"" );
1591         }
1592     }
1593
1594     public static void writeDomainSimilaritiesToFile( final StringBuilder html_desc,
1595                                                       final StringBuilder html_title,
1596                                                       final Writer simple_tab_writer,
1597                                                       final Writer single_writer,
1598                                                       Map<Character, Writer> split_writers,
1599                                                       final SortedSet<DomainSimilarity> similarities,
1600                                                       final boolean treat_as_binary,
1601                                                       final List<Species> species_order,
1602                                                       final PrintableDomainSimilarity.PRINT_OPTION print_option,
1603                                                       final DomainSimilarity.DomainSimilarityScoring scoring,
1604                                                       final boolean verbose,
1605                                                       final Map<String, Integer> tax_code_to_id_map,
1606                                                       Phylogeny phy ) throws IOException {
1607         if ( ( single_writer != null ) && ( ( split_writers == null ) || split_writers.isEmpty() ) ) {
1608             split_writers = new HashMap<Character, Writer>();
1609             split_writers.put( '_', single_writer );
1610         }
1611         switch ( print_option ) {
1612             case SIMPLE_TAB_DELIMITED:
1613                 break;
1614             case HTML:
1615                 for( final Character key : split_writers.keySet() ) {
1616                     final Writer w = split_writers.get( key );
1617                     w.write( "<html>" );
1618                     w.write( SurfacingConstants.NL );
1619                     if ( key != '_' ) {
1620                         addHtmlHead( w, "DC analysis (" + html_title + ") " + key.toString().toUpperCase() );
1621                     }
1622                     else {
1623                         addHtmlHead( w, "DC analysis (" + html_title + ")" );
1624                     }
1625                     w.write( SurfacingConstants.NL );
1626                     w.write( "<body>" );
1627                     w.write( SurfacingConstants.NL );
1628                     w.write( html_desc.toString() );
1629                     w.write( SurfacingConstants.NL );
1630                     w.write( "<hr>" );
1631                     w.write( SurfacingConstants.NL );
1632                     w.write( "<br>" );
1633                     w.write( SurfacingConstants.NL );
1634                     w.write( "<table>" );
1635                     w.write( SurfacingConstants.NL );
1636                     w.write( "<tr><td><b>Domains:</b></td></tr>" );
1637                     w.write( SurfacingConstants.NL );
1638                 }
1639                 break;
1640         }
1641         //
1642         for( final DomainSimilarity similarity : similarities ) {
1643             if ( ( species_order != null ) && !species_order.isEmpty() ) {
1644                 ( ( PrintableDomainSimilarity ) similarity ).setSpeciesOrder( species_order );
1645             }
1646             if ( single_writer != null ) {
1647                 single_writer.write( "<tr><td><b><a href=\"#" + similarity.getDomainId() + "\">"
1648                         + similarity.getDomainId() + "</a></b></td></tr>" );
1649                 single_writer.write( SurfacingConstants.NL );
1650             }
1651             else {
1652                 Writer local_writer = split_writers.get( ( similarity.getDomainId().charAt( 0 ) + "" ).toLowerCase()
1653                         .charAt( 0 ) );
1654                 if ( local_writer == null ) {
1655                     local_writer = split_writers.get( '0' );
1656                 }
1657                 local_writer.write( "<tr><td><b><a href=\"#" + similarity.getDomainId() + "\">"
1658                         + similarity.getDomainId() + "</a></b></td></tr>" );
1659                 local_writer.write( SurfacingConstants.NL );
1660             }
1661         }
1662         for( final Writer w : split_writers.values() ) {
1663             w.write( "</table>" );
1664             w.write( SurfacingConstants.NL );
1665             w.write( "<hr>" );
1666             w.write( SurfacingConstants.NL );
1667             w.write( "<table>" );
1668             w.write( SurfacingConstants.NL );
1669         }
1670         //
1671         for( final DomainSimilarity similarity : similarities ) {
1672             if ( ( species_order != null ) && !species_order.isEmpty() ) {
1673                 ( ( PrintableDomainSimilarity ) similarity ).setSpeciesOrder( species_order );
1674             }
1675             if ( simple_tab_writer != null ) {
1676                 simple_tab_writer.write( similarity.toStringBuffer( PRINT_OPTION.SIMPLE_TAB_DELIMITED,
1677                                                                     tax_code_to_id_map,
1678                                                                     null ).toString() );
1679             }
1680             if ( single_writer != null ) {
1681                 single_writer.write( similarity.toStringBuffer( print_option, tax_code_to_id_map, phy ).toString() );
1682                 single_writer.write( SurfacingConstants.NL );
1683             }
1684             else {
1685                 Writer local_writer = split_writers.get( ( similarity.getDomainId().charAt( 0 ) + "" ).toLowerCase()
1686                         .charAt( 0 ) );
1687                 if ( local_writer == null ) {
1688                     local_writer = split_writers.get( '0' );
1689                 }
1690                 local_writer.write( similarity.toStringBuffer( print_option, tax_code_to_id_map, phy ).toString() );
1691                 local_writer.write( SurfacingConstants.NL );
1692             }
1693         }
1694         switch ( print_option ) {
1695             case HTML:
1696                 for( final Writer w : split_writers.values() ) {
1697                     w.write( SurfacingConstants.NL );
1698                     w.write( "</table>" );
1699                     w.write( SurfacingConstants.NL );
1700                     w.write( "</font>" );
1701                     w.write( SurfacingConstants.NL );
1702                     w.write( "</body>" );
1703                     w.write( SurfacingConstants.NL );
1704                     w.write( "</html>" );
1705                     w.write( SurfacingConstants.NL );
1706                 }
1707                 break;
1708         }
1709         for( final Writer w : split_writers.values() ) {
1710             w.close();
1711         }
1712     }
1713
1714     private static void printSomeStats( final DescriptiveStatistics stats, final AsciiHistogram histo, final Writer w )
1715             throws IOException {
1716         w.write( "<hr>" );
1717         w.write( "<br>" );
1718         w.write( SurfacingConstants.NL );
1719         w.write( "<tt><pre>" );
1720         w.write( SurfacingConstants.NL );
1721         if ( histo != null ) {
1722             w.write( histo.toStringBuffer( 20, '|', 40, 5 ).toString() );
1723             w.write( SurfacingConstants.NL );
1724         }
1725         w.write( "</pre></tt>" );
1726         w.write( SurfacingConstants.NL );
1727         w.write( "<table>" );
1728         w.write( SurfacingConstants.NL );
1729         w.write( "<tr><td>N: </td><td>" + stats.getN() + "</td></tr>" );
1730         w.write( SurfacingConstants.NL );
1731         w.write( "<tr><td>Min: </td><td>" + stats.getMin() + "</td></tr>" );
1732         w.write( SurfacingConstants.NL );
1733         w.write( "<tr><td>Max: </td><td>" + stats.getMax() + "</td></tr>" );
1734         w.write( SurfacingConstants.NL );
1735         w.write( "<tr><td>Mean: </td><td>" + stats.arithmeticMean() + "</td></tr>" );
1736         w.write( SurfacingConstants.NL );
1737         if ( stats.getN() > 1 ) {
1738             w.write( "<tr><td>SD: </td><td>" + stats.sampleStandardDeviation() + "</td></tr>" );
1739         }
1740         else {
1741             w.write( "<tr><td>SD: </td><td>n/a</td></tr>" );
1742         }
1743         w.write( SurfacingConstants.NL );
1744         w.write( "</table>" );
1745         w.write( SurfacingConstants.NL );
1746         w.write( "<br>" );
1747         w.write( SurfacingConstants.NL );
1748     }
1749
1750     public static void writeMatrixToFile( final CharacterStateMatrix<?> matrix,
1751                                           final String filename,
1752                                           final Format format ) {
1753         final File outfile = new File( filename );
1754         checkForOutputFileWriteability( outfile );
1755         try {
1756             final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
1757             matrix.toWriter( out, format );
1758             out.flush();
1759             out.close();
1760         }
1761         catch ( final IOException e ) {
1762             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1763         }
1764         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote matrix: \"" + filename + "\"" );
1765     }
1766
1767     public static void writeMatrixToFile( final File matrix_outfile, final List<DistanceMatrix> matrices ) {
1768         checkForOutputFileWriteability( matrix_outfile );
1769         try {
1770             final BufferedWriter out = new BufferedWriter( new FileWriter( matrix_outfile ) );
1771             for( final DistanceMatrix distance_matrix : matrices ) {
1772                 out.write( distance_matrix.toStringBuffer( DistanceMatrix.Format.PHYLIP ).toString() );
1773                 out.write( ForesterUtil.LINE_SEPARATOR );
1774                 out.flush();
1775             }
1776             out.close();
1777         }
1778         catch ( final IOException e ) {
1779             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1780         }
1781         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + matrix_outfile + "\"" );
1782     }
1783
1784     public static void writePhylogenyToFile( final Phylogeny phylogeny, final String filename ) {
1785         final PhylogenyWriter writer = new PhylogenyWriter();
1786         try {
1787             writer.toPhyloXML( new File( filename ), phylogeny, 1 );
1788         }
1789         catch ( final IOException e ) {
1790             ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "failed to write phylogeny to \"" + filename + "\": "
1791                     + e );
1792         }
1793         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote phylogeny to \"" + filename + "\"" );
1794     }
1795
1796     public static void writeTaxonomyLinks( final Writer writer,
1797                                            final String species,
1798                                            final Map<String, Integer> tax_code_to_id_map ) throws IOException {
1799         if ( ( species.length() > 1 ) && ( species.indexOf( '_' ) < 1 ) ) {
1800             writer.write( " [" );
1801             if ( ( tax_code_to_id_map != null ) && tax_code_to_id_map.containsKey( species ) ) {
1802                 writer.write( "<a href=\"" + SurfacingConstants.UNIPROT_TAXONOMY_ID_LINK
1803                         + tax_code_to_id_map.get( species ) + "\" target=\"taxonomy_window\">uniprot</a>" );
1804             }
1805             else {
1806                 writer.write( "<a href=\"" + SurfacingConstants.EOL_LINK + species
1807                         + "\" target=\"taxonomy_window\">eol</a>" );
1808                 writer.write( "|" );
1809                 writer.write( "<a href=\"" + SurfacingConstants.GOOGLE_SCHOLAR_SEARCH + species
1810                         + "\" target=\"taxonomy_window\">scholar</a>" );
1811                 writer.write( "|" );
1812                 writer.write( "<a href=\"" + SurfacingConstants.GOOGLE_WEB_SEARCH_LINK + species
1813                         + "\" target=\"taxonomy_window\">google</a>" );
1814             }
1815             writer.write( "]" );
1816         }
1817     }
1818
1819     private final static void addToCountMap( final Map<String, Integer> map, final String s ) {
1820         if ( map.containsKey( s ) ) {
1821             map.put( s, map.get( s ) + 1 );
1822         }
1823         else {
1824             map.put( s, 1 );
1825         }
1826     }
1827
1828     private static void calculateIndependentDomainCombinationGains( final Phylogeny local_phylogeny_l,
1829                                                                     final String outfilename_for_counts,
1830                                                                     final String outfilename_for_dc,
1831                                                                     final String outfilename_for_dc_for_go_mapping,
1832                                                                     final String outfilename_for_dc_for_go_mapping_unique,
1833                                                                     final String outfilename_for_rank_counts,
1834                                                                     final String outfilename_for_ancestor_species_counts,
1835                                                                     final String outfilename_for_protein_stats,
1836                                                                     final Map<String, DescriptiveStatistics> protein_length_stats_by_dc,
1837                                                                     final Map<String, DescriptiveStatistics> domain_number_stats_by_dc,
1838                                                                     final Map<String, DescriptiveStatistics> domain_length_stats_by_domain ) {
1839         try {
1840             //
1841             //            if ( protein_length_stats_by_dc != null ) {
1842             //                for( final Entry<?, DescriptiveStatistics> entry : protein_length_stats_by_dc.entrySet() ) {
1843             //                    System.out.print( entry.getKey().toString() );
1844             //                    System.out.print( ": " );
1845             //                    double[] a = entry.getValue().getDataAsDoubleArray();
1846             //                    for( int i = 0; i < a.length; i++ ) {
1847             //                        System.out.print( a[ i ] + " " );
1848             //                    }
1849             //                    System.out.println();
1850             //                }
1851             //            }
1852             //            if ( domain_number_stats_by_dc != null ) {
1853             //                for( final Entry<?, DescriptiveStatistics> entry : domain_number_stats_by_dc.entrySet() ) {
1854             //                    System.out.print( entry.getKey().toString() );
1855             //                    System.out.print( ": " );
1856             //                    double[] a = entry.getValue().getDataAsDoubleArray();
1857             //                    for( int i = 0; i < a.length; i++ ) {
1858             //                        System.out.print( a[ i ] + " " );
1859             //                    }
1860             //                    System.out.println();
1861             //                }
1862             //            }
1863             //
1864             final BufferedWriter out_counts = new BufferedWriter( new FileWriter( outfilename_for_counts ) );
1865             final BufferedWriter out_dc = new BufferedWriter( new FileWriter( outfilename_for_dc ) );
1866             final BufferedWriter out_dc_for_go_mapping = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping ) );
1867             final BufferedWriter out_dc_for_go_mapping_unique = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping_unique ) );
1868             final SortedMap<String, Integer> dc_gain_counts = new TreeMap<String, Integer>();
1869             for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorPostorder(); it.hasNext(); ) {
1870                 final PhylogenyNode n = it.next();
1871                 final Set<String> gained_dc = n.getNodeData().getBinaryCharacters().getGainedCharacters();
1872                 for( final String dc : gained_dc ) {
1873                     if ( dc_gain_counts.containsKey( dc ) ) {
1874                         dc_gain_counts.put( dc, dc_gain_counts.get( dc ) + 1 );
1875                     }
1876                     else {
1877                         dc_gain_counts.put( dc, 1 );
1878                     }
1879                 }
1880             }
1881             final SortedMap<Integer, Integer> histogram = new TreeMap<Integer, Integer>();
1882             final SortedMap<Integer, StringBuilder> domain_lists = new TreeMap<Integer, StringBuilder>();
1883             final SortedMap<Integer, DescriptiveStatistics> dc_reapp_counts_to_protein_length_stats = new TreeMap<Integer, DescriptiveStatistics>();
1884             final SortedMap<Integer, DescriptiveStatistics> dc_reapp_counts_to_domain_number_stats = new TreeMap<Integer, DescriptiveStatistics>();
1885             final SortedMap<Integer, DescriptiveStatistics> dc_reapp_counts_to_domain_lengths_stats = new TreeMap<Integer, DescriptiveStatistics>();
1886             final SortedMap<Integer, PriorityQueue<String>> domain_lists_go = new TreeMap<Integer, PriorityQueue<String>>();
1887             final SortedMap<Integer, SortedSet<String>> domain_lists_go_unique = new TreeMap<Integer, SortedSet<String>>();
1888             final Set<String> dcs = dc_gain_counts.keySet();
1889             final SortedSet<String> more_than_once = new TreeSet<String>();
1890             DescriptiveStatistics gained_once_lengths_stats = new BasicDescriptiveStatistics();
1891             DescriptiveStatistics gained_once_domain_count_stats = new BasicDescriptiveStatistics();
1892             DescriptiveStatistics gained_multiple_times_lengths_stats = new BasicDescriptiveStatistics();
1893             final DescriptiveStatistics gained_multiple_times_domain_count_stats = new BasicDescriptiveStatistics();
1894             long gained_multiple_times_domain_length_sum = 0;
1895             long gained_once_domain_length_sum = 0;
1896             long gained_multiple_times_domain_length_count = 0;
1897             long gained_once_domain_length_count = 0;
1898             for( final String dc : dcs ) {
1899                 final int count = dc_gain_counts.get( dc );
1900                 if ( histogram.containsKey( count ) ) {
1901                     histogram.put( count, histogram.get( count ) + 1 );
1902                     domain_lists.get( count ).append( ", " + dc );
1903                     domain_lists_go.get( count ).addAll( splitDomainCombination( dc ) );
1904                     domain_lists_go_unique.get( count ).addAll( splitDomainCombination( dc ) );
1905                 }
1906                 else {
1907                     histogram.put( count, 1 );
1908                     domain_lists.put( count, new StringBuilder( dc ) );
1909                     final PriorityQueue<String> q = new PriorityQueue<String>();
1910                     q.addAll( splitDomainCombination( dc ) );
1911                     domain_lists_go.put( count, q );
1912                     final SortedSet<String> set = new TreeSet<String>();
1913                     set.addAll( splitDomainCombination( dc ) );
1914                     domain_lists_go_unique.put( count, set );
1915                 }
1916                 if ( protein_length_stats_by_dc != null ) {
1917                     if ( !dc_reapp_counts_to_protein_length_stats.containsKey( count ) ) {
1918                         dc_reapp_counts_to_protein_length_stats.put( count, new BasicDescriptiveStatistics() );
1919                     }
1920                     dc_reapp_counts_to_protein_length_stats.get( count ).addValue( protein_length_stats_by_dc.get( dc )
1921                             .arithmeticMean() );
1922                 }
1923                 if ( domain_number_stats_by_dc != null ) {
1924                     if ( !dc_reapp_counts_to_domain_number_stats.containsKey( count ) ) {
1925                         dc_reapp_counts_to_domain_number_stats.put( count, new BasicDescriptiveStatistics() );
1926                     }
1927                     dc_reapp_counts_to_domain_number_stats.get( count ).addValue( domain_number_stats_by_dc.get( dc )
1928                             .arithmeticMean() );
1929                 }
1930                 if ( domain_length_stats_by_domain != null ) {
1931                     if ( !dc_reapp_counts_to_domain_lengths_stats.containsKey( count ) ) {
1932                         dc_reapp_counts_to_domain_lengths_stats.put( count, new BasicDescriptiveStatistics() );
1933                     }
1934                     final String[] ds = dc.split( "=" );
1935                     dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain
1936                             .get( ds[ 0 ] ).arithmeticMean() );
1937                     dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain
1938                             .get( ds[ 1 ] ).arithmeticMean() );
1939                 }
1940                 if ( count > 1 ) {
1941                     more_than_once.add( dc );
1942                     if ( protein_length_stats_by_dc != null ) {
1943                         final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc );
1944                         for( final double element : s.getData() ) {
1945                             gained_multiple_times_lengths_stats.addValue( element );
1946                         }
1947                     }
1948                     if ( domain_number_stats_by_dc != null ) {
1949                         final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc );
1950                         for( final double element : s.getData() ) {
1951                             gained_multiple_times_domain_count_stats.addValue( element );
1952                         }
1953                     }
1954                     if ( domain_length_stats_by_domain != null ) {
1955                         final String[] ds = dc.split( "=" );
1956                         final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] );
1957                         final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] );
1958                         for( final double element : s0.getData() ) {
1959                             gained_multiple_times_domain_length_sum += element;
1960                             ++gained_multiple_times_domain_length_count;
1961                         }
1962                         for( final double element : s1.getData() ) {
1963                             gained_multiple_times_domain_length_sum += element;
1964                             ++gained_multiple_times_domain_length_count;
1965                         }
1966                     }
1967                 }
1968                 else {
1969                     if ( protein_length_stats_by_dc != null ) {
1970                         final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc );
1971                         for( final double element : s.getData() ) {
1972                             gained_once_lengths_stats.addValue( element );
1973                         }
1974                     }
1975                     if ( domain_number_stats_by_dc != null ) {
1976                         final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc );
1977                         for( final double element : s.getData() ) {
1978                             gained_once_domain_count_stats.addValue( element );
1979                         }
1980                     }
1981                     if ( domain_length_stats_by_domain != null ) {
1982                         final String[] ds = dc.split( "=" );
1983                         final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] );
1984                         final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] );
1985                         for( final double element : s0.getData() ) {
1986                             gained_once_domain_length_sum += element;
1987                             ++gained_once_domain_length_count;
1988                         }
1989                         for( final double element : s1.getData() ) {
1990                             gained_once_domain_length_sum += element;
1991                             ++gained_once_domain_length_count;
1992                         }
1993                     }
1994                 }
1995             }
1996             final Set<Integer> histogram_keys = histogram.keySet();
1997             for( final Integer histogram_key : histogram_keys ) {
1998                 final int count = histogram.get( histogram_key );
1999                 final StringBuilder dc = domain_lists.get( histogram_key );
2000                 out_counts.write( histogram_key + "\t" + count + ForesterUtil.LINE_SEPARATOR );
2001                 out_dc.write( histogram_key + "\t" + dc + ForesterUtil.LINE_SEPARATOR );
2002                 out_dc_for_go_mapping.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR );
2003                 final Object[] sorted = domain_lists_go.get( histogram_key ).toArray();
2004                 Arrays.sort( sorted );
2005                 for( final Object domain : sorted ) {
2006                     out_dc_for_go_mapping.write( domain + ForesterUtil.LINE_SEPARATOR );
2007                 }
2008                 out_dc_for_go_mapping_unique.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR );
2009                 for( final String domain : domain_lists_go_unique.get( histogram_key ) ) {
2010                     out_dc_for_go_mapping_unique.write( domain + ForesterUtil.LINE_SEPARATOR );
2011                 }
2012             }
2013             out_counts.close();
2014             out_dc.close();
2015             out_dc_for_go_mapping.close();
2016             out_dc_for_go_mapping_unique.close();
2017             final SortedMap<String, Integer> lca_rank_counts = new TreeMap<String, Integer>();
2018             final SortedMap<String, Integer> lca_ancestor_species_counts = new TreeMap<String, Integer>();
2019             for( final String dc : more_than_once ) {
2020                 final List<PhylogenyNode> nodes = new ArrayList<PhylogenyNode>();
2021                 for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorExternalForward(); it.hasNext(); ) {
2022                     final PhylogenyNode n = it.next();
2023                     if ( n.getNodeData().getBinaryCharacters().getGainedCharacters().contains( dc ) ) {
2024                         nodes.add( n );
2025                     }
2026                 }
2027                 for( int i = 0; i < ( nodes.size() - 1 ); ++i ) {
2028                     for( int j = i + 1; j < nodes.size(); ++j ) {
2029                         final PhylogenyNode lca = PhylogenyMethods.calculateLCA( nodes.get( i ), nodes.get( j ) );
2030                         String rank = "unknown";
2031                         if ( lca.getNodeData().isHasTaxonomy()
2032                                 && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getRank() ) ) {
2033                             rank = lca.getNodeData().getTaxonomy().getRank();
2034                         }
2035                         addToCountMap( lca_rank_counts, rank );
2036                         String lca_species;
2037                         if ( lca.getNodeData().isHasTaxonomy()
2038                                 && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getScientificName() ) ) {
2039                             lca_species = lca.getNodeData().getTaxonomy().getScientificName();
2040                         }
2041                         else if ( lca.getNodeData().isHasTaxonomy()
2042                                 && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getCommonName() ) ) {
2043                             lca_species = lca.getNodeData().getTaxonomy().getCommonName();
2044                         }
2045                         else {
2046                             lca_species = lca.getName();
2047                         }
2048                         addToCountMap( lca_ancestor_species_counts, lca_species );
2049                     }
2050                 }
2051             }
2052             final BufferedWriter out_for_rank_counts = new BufferedWriter( new FileWriter( outfilename_for_rank_counts ) );
2053             final BufferedWriter out_for_ancestor_species_counts = new BufferedWriter( new FileWriter( outfilename_for_ancestor_species_counts ) );
2054             ForesterUtil.map2writer( out_for_rank_counts, lca_rank_counts, "\t", ForesterUtil.LINE_SEPARATOR );
2055             ForesterUtil.map2writer( out_for_ancestor_species_counts,
2056                                      lca_ancestor_species_counts,
2057                                      "\t",
2058                                      ForesterUtil.LINE_SEPARATOR );
2059             out_for_rank_counts.close();
2060             out_for_ancestor_species_counts.close();
2061             if ( !ForesterUtil.isEmpty( outfilename_for_protein_stats )
2062                     && ( ( domain_length_stats_by_domain != null ) || ( protein_length_stats_by_dc != null ) || ( domain_number_stats_by_dc != null ) ) ) {
2063                 final BufferedWriter w = new BufferedWriter( new FileWriter( outfilename_for_protein_stats ) );
2064                 w.write( "Domain Lengths: " );
2065                 w.write( "\n" );
2066                 if ( domain_length_stats_by_domain != null ) {
2067                     for( final Entry<Integer, DescriptiveStatistics> entry : dc_reapp_counts_to_domain_lengths_stats
2068                             .entrySet() ) {
2069                         w.write( entry.getKey().toString() );
2070                         w.write( "\t" + entry.getValue().arithmeticMean() );
2071                         w.write( "\t" + entry.getValue().median() );
2072                         w.write( "\n" );
2073                     }
2074                 }
2075                 w.flush();
2076                 w.write( "\n" );
2077                 w.write( "\n" );
2078                 w.write( "Protein Lengths: " );
2079                 w.write( "\n" );
2080                 if ( protein_length_stats_by_dc != null ) {
2081                     for( final Entry<Integer, DescriptiveStatistics> entry : dc_reapp_counts_to_protein_length_stats
2082                             .entrySet() ) {
2083                         w.write( entry.getKey().toString() );
2084                         w.write( "\t" + entry.getValue().arithmeticMean() );
2085                         w.write( "\t" + entry.getValue().median() );
2086                         w.write( "\n" );
2087                     }
2088                 }
2089                 w.flush();
2090                 w.write( "\n" );
2091                 w.write( "\n" );
2092                 w.write( "Number of domains: " );
2093                 w.write( "\n" );
2094                 if ( domain_number_stats_by_dc != null ) {
2095                     for( final Entry<Integer, DescriptiveStatistics> entry : dc_reapp_counts_to_domain_number_stats
2096                             .entrySet() ) {
2097                         w.write( entry.getKey().toString() );
2098                         w.write( "\t" + entry.getValue().arithmeticMean() );
2099                         w.write( "\t" + entry.getValue().median() );
2100                         w.write( "\n" );
2101                     }
2102                 }
2103                 w.flush();
2104                 w.write( "\n" );
2105                 w.write( "\n" );
2106                 w.write( "Gained once, domain lengths:" );
2107                 w.write( "\n" );
2108                 w.write( "N: " + gained_once_domain_length_count );
2109                 w.write( "\n" );
2110                 w.write( "Avg: " + ( ( double ) gained_once_domain_length_sum / gained_once_domain_length_count ) );
2111                 w.write( "\n" );
2112                 w.write( "\n" );
2113                 w.write( "Gained multiple times, domain lengths:" );
2114                 w.write( "\n" );
2115                 w.write( "N: " + gained_multiple_times_domain_length_count );
2116                 w.write( "\n" );
2117                 w.write( "Avg: "
2118                         + ( ( double ) gained_multiple_times_domain_length_sum / gained_multiple_times_domain_length_count ) );
2119                 w.write( "\n" );
2120                 w.write( "\n" );
2121                 w.write( "\n" );
2122                 w.write( "\n" );
2123                 w.write( "Gained once, protein lengths:" );
2124                 w.write( "\n" );
2125                 w.write( gained_once_lengths_stats.toString() );
2126                 gained_once_lengths_stats = null;
2127                 w.write( "\n" );
2128                 w.write( "\n" );
2129                 w.write( "Gained once, domain counts:" );
2130                 w.write( "\n" );
2131                 w.write( gained_once_domain_count_stats.toString() );
2132                 gained_once_domain_count_stats = null;
2133                 w.write( "\n" );
2134                 w.write( "\n" );
2135                 w.write( "Gained multiple times, protein lengths:" );
2136                 w.write( "\n" );
2137                 w.write( gained_multiple_times_lengths_stats.toString() );
2138                 gained_multiple_times_lengths_stats = null;
2139                 w.write( "\n" );
2140                 w.write( "\n" );
2141                 w.write( "Gained multiple times, domain counts:" );
2142                 w.write( "\n" );
2143                 w.write( gained_multiple_times_domain_count_stats.toString() );
2144                 w.flush();
2145                 w.close();
2146             }
2147         }
2148         catch ( final IOException e ) {
2149             ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e );
2150         }
2151         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch counts to ["
2152                 + outfilename_for_counts + "]" );
2153         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch lists to ["
2154                 + outfilename_for_dc + "]" );
2155         ForesterUtil.programMessage( surfacing.PRG_NAME,
2156                                      "Wrote independent domain combination gains fitch lists to (for GO mapping) ["
2157                                              + outfilename_for_dc_for_go_mapping + "]" );
2158         ForesterUtil.programMessage( surfacing.PRG_NAME,
2159                                      "Wrote independent domain combination gains fitch lists to (for GO mapping, unique) ["
2160                                              + outfilename_for_dc_for_go_mapping_unique + "]" );
2161     }
2162
2163     private static SortedSet<String> collectAllDomainsChangedOnSubtree( final PhylogenyNode subtree_root,
2164                                                                         final boolean get_gains ) {
2165         final SortedSet<String> domains = new TreeSet<String>();
2166         for( final PhylogenyNode descendant : PhylogenyMethods.getAllDescendants( subtree_root ) ) {
2167             final BinaryCharacters chars = descendant.getNodeData().getBinaryCharacters();
2168             if ( get_gains ) {
2169                 domains.addAll( chars.getGainedCharacters() );
2170             }
2171             else {
2172                 domains.addAll( chars.getLostCharacters() );
2173             }
2174         }
2175         return domains;
2176     }
2177
2178     private static File createBaseDirForPerNodeDomainFiles( final String base_dir,
2179                                                             final boolean domain_combinations,
2180                                                             final CharacterStateMatrix.GainLossStates state,
2181                                                             final String outfile ) {
2182         File per_node_go_mapped_domain_gain_loss_files_base_dir = new File( new File( outfile ).getParent()
2183                 + ForesterUtil.FILE_SEPARATOR + base_dir );
2184         if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) {
2185             per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir();
2186         }
2187         if ( domain_combinations ) {
2188             per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
2189                     + ForesterUtil.FILE_SEPARATOR + "DC" );
2190         }
2191         else {
2192             per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
2193                     + ForesterUtil.FILE_SEPARATOR + "DOMAINS" );
2194         }
2195         if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) {
2196             per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir();
2197         }
2198         if ( state == GainLossStates.GAIN ) {
2199             per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
2200                     + ForesterUtil.FILE_SEPARATOR + "GAINS" );
2201         }
2202         else if ( state == GainLossStates.LOSS ) {
2203             per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
2204                     + ForesterUtil.FILE_SEPARATOR + "LOSSES" );
2205         }
2206         else {
2207             per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
2208                     + ForesterUtil.FILE_SEPARATOR + "PRESENT" );
2209         }
2210         if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) {
2211             per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir();
2212         }
2213         return per_node_go_mapped_domain_gain_loss_files_base_dir;
2214     }
2215
2216     private static SortedSet<BinaryDomainCombination> createSetOfAllBinaryDomainCombinationsPerGenome( final GenomeWideCombinableDomains gwcd ) {
2217         final SortedMap<String, CombinableDomains> cds = gwcd.getAllCombinableDomainsIds();
2218         final SortedSet<BinaryDomainCombination> binary_combinations = new TreeSet<BinaryDomainCombination>();
2219         for( final String domain_id : cds.keySet() ) {
2220             final CombinableDomains cd = cds.get( domain_id );
2221             binary_combinations.addAll( cd.toBinaryDomainCombinations() );
2222         }
2223         return binary_combinations;
2224     }
2225
2226     private static List<String> splitDomainCombination( final String dc ) {
2227         final String[] s = dc.split( "=" );
2228         if ( s.length != 2 ) {
2229             ForesterUtil.printErrorMessage( surfacing.PRG_NAME, "Stringyfied domain combination has illegal format: "
2230                     + dc );
2231             System.exit( -1 );
2232         }
2233         final List<String> l = new ArrayList<String>( 2 );
2234         l.add( s[ 0 ] );
2235         l.add( s[ 1 ] );
2236         return l;
2237     }
2238
2239     private static void writeAllEncounteredPfamsToFile( final Map<String, List<GoId>> domain_id_to_go_ids_map,
2240                                                         final Map<GoId, GoTerm> go_id_to_term_map,
2241                                                         final String outfile_name,
2242                                                         final SortedSet<String> all_pfams_encountered ) {
2243         final File all_pfams_encountered_file = new File( outfile_name + surfacing.ALL_PFAMS_ENCOUNTERED_SUFFIX );
2244         final File all_pfams_encountered_with_go_annotation_file = new File( outfile_name
2245                 + surfacing.ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX );
2246         final File encountered_pfams_summary_file = new File( outfile_name + surfacing.ENCOUNTERED_PFAMS_SUMMARY_SUFFIX );
2247         int biological_process_counter = 0;
2248         int cellular_component_counter = 0;
2249         int molecular_function_counter = 0;
2250         int pfams_with_mappings_counter = 0;
2251         int pfams_without_mappings_counter = 0;
2252         int pfams_without_mappings_to_bp_or_mf_counter = 0;
2253         int pfams_with_mappings_to_bp_or_mf_counter = 0;
2254         try {
2255             final Writer all_pfams_encountered_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_file ) );
2256             final Writer all_pfams_encountered_with_go_annotation_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_with_go_annotation_file ) );
2257             final Writer summary_writer = new BufferedWriter( new FileWriter( encountered_pfams_summary_file ) );
2258             summary_writer.write( "# Pfam to GO mapping summary" );
2259             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2260             summary_writer.write( "# Actual summary is at the end of this file." );
2261             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2262             summary_writer.write( "# Encountered Pfams without a GO mapping:" );
2263             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2264             for( final String pfam : all_pfams_encountered ) {
2265                 all_pfams_encountered_writer.write( pfam );
2266                 all_pfams_encountered_writer.write( ForesterUtil.LINE_SEPARATOR );
2267                 final String domain_id = new String( pfam );
2268                 if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) {
2269                     ++pfams_with_mappings_counter;
2270                     all_pfams_encountered_with_go_annotation_writer.write( pfam );
2271                     all_pfams_encountered_with_go_annotation_writer.write( ForesterUtil.LINE_SEPARATOR );
2272                     final List<GoId> go_ids = domain_id_to_go_ids_map.get( domain_id );
2273                     boolean maps_to_bp = false;
2274                     boolean maps_to_cc = false;
2275                     boolean maps_to_mf = false;
2276                     for( final GoId go_id : go_ids ) {
2277                         final GoTerm go_term = go_id_to_term_map.get( go_id );
2278                         if ( go_term.getGoNameSpace().isBiologicalProcess() ) {
2279                             maps_to_bp = true;
2280                         }
2281                         else if ( go_term.getGoNameSpace().isCellularComponent() ) {
2282                             maps_to_cc = true;
2283                         }
2284                         else if ( go_term.getGoNameSpace().isMolecularFunction() ) {
2285                             maps_to_mf = true;
2286                         }
2287                     }
2288                     if ( maps_to_bp ) {
2289                         ++biological_process_counter;
2290                     }
2291                     if ( maps_to_cc ) {
2292                         ++cellular_component_counter;
2293                     }
2294                     if ( maps_to_mf ) {
2295                         ++molecular_function_counter;
2296                     }
2297                     if ( maps_to_bp || maps_to_mf ) {
2298                         ++pfams_with_mappings_to_bp_or_mf_counter;
2299                     }
2300                     else {
2301                         ++pfams_without_mappings_to_bp_or_mf_counter;
2302                     }
2303                 }
2304                 else {
2305                     ++pfams_without_mappings_to_bp_or_mf_counter;
2306                     ++pfams_without_mappings_counter;
2307                     summary_writer.write( pfam );
2308                     summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2309                 }
2310             }
2311             all_pfams_encountered_writer.close();
2312             all_pfams_encountered_with_go_annotation_writer.close();
2313             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + all_pfams_encountered.size()
2314                     + "] encountered Pfams to: \"" + all_pfams_encountered_file + "\"" );
2315             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + pfams_with_mappings_counter
2316                     + "] encountered Pfams with GO mappings to: \"" + all_pfams_encountered_with_go_annotation_file
2317                     + "\"" );
2318             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote summary (including all ["
2319                     + pfams_without_mappings_counter + "] encountered Pfams without GO mappings) to: \""
2320                     + encountered_pfams_summary_file + "\"" );
2321             ForesterUtil.programMessage( surfacing.PRG_NAME, "Sum of Pfams encountered                : "
2322                     + all_pfams_encountered.size() );
2323             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without a mapping                 : "
2324                     + pfams_without_mappings_counter + " ["
2325                     + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
2326             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without mapping to proc. or func. : "
2327                     + pfams_without_mappings_to_bp_or_mf_counter + " ["
2328                     + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
2329             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping                    : "
2330                     + pfams_with_mappings_counter + " ["
2331                     + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
2332             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping to proc. or func.  : "
2333                     + pfams_with_mappings_to_bp_or_mf_counter + " ["
2334                     + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
2335             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to biological process: "
2336                     + biological_process_counter + " ["
2337                     + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" );
2338             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to molecular function: "
2339                     + molecular_function_counter + " ["
2340                     + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" );
2341             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to cellular component: "
2342                     + cellular_component_counter + " ["
2343                     + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" );
2344             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2345             summary_writer.write( "# Sum of Pfams encountered                : " + all_pfams_encountered.size() );
2346             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2347             summary_writer.write( "# Pfams without a mapping                 : " + pfams_without_mappings_counter
2348                     + " [" + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
2349             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2350             summary_writer.write( "# Pfams without mapping to proc. or func. : "
2351                     + pfams_without_mappings_to_bp_or_mf_counter + " ["
2352                     + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
2353             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2354             summary_writer.write( "# Pfams with a mapping                    : " + pfams_with_mappings_counter + " ["
2355                     + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
2356             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2357             summary_writer.write( "# Pfams with a mapping to proc. or func.  : "
2358                     + pfams_with_mappings_to_bp_or_mf_counter + " ["
2359                     + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
2360             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2361             summary_writer.write( "# Pfams with mapping to biological process: " + biological_process_counter + " ["
2362                     + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" );
2363             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2364             summary_writer.write( "# Pfams with mapping to molecular function: " + molecular_function_counter + " ["
2365                     + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" );
2366             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2367             summary_writer.write( "# Pfams with mapping to cellular component: " + cellular_component_counter + " ["
2368                     + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" );
2369             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2370             summary_writer.close();
2371         }
2372         catch ( final IOException e ) {
2373             ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e );
2374         }
2375     }
2376
2377     private static void writeDomainData( final Map<String, List<GoId>> domain_id_to_go_ids_map,
2378                                          final Map<GoId, GoTerm> go_id_to_term_map,
2379                                          final GoNameSpace go_namespace_limit,
2380                                          final Writer out,
2381                                          final String domain_0,
2382                                          final String domain_1,
2383                                          final String prefix_for_html,
2384                                          final String character_separator_for_non_html_output,
2385                                          final Map<String, Set<String>>[] domain_id_to_secondary_features_maps,
2386                                          final Set<GoId> all_go_ids ) throws IOException {
2387         boolean any_go_annotation_present = false;
2388         boolean first_has_no_go = false;
2389         int domain_count = 2; // To distinguish between domains and binary domain combinations.
2390         if ( ForesterUtil.isEmpty( domain_1 ) ) {
2391             domain_count = 1;
2392         }
2393         // The following has a difficult to understand logic.  
2394         for( int d = 0; d < domain_count; ++d ) {
2395             List<GoId> go_ids = null;
2396             boolean go_annotation_present = false;
2397             if ( d == 0 ) {
2398                 if ( domain_id_to_go_ids_map.containsKey( domain_0 ) ) {
2399                     go_annotation_present = true;
2400                     any_go_annotation_present = true;
2401                     go_ids = domain_id_to_go_ids_map.get( domain_0 );
2402                 }
2403                 else {
2404                     first_has_no_go = true;
2405                 }
2406             }
2407             else {
2408                 if ( domain_id_to_go_ids_map.containsKey( domain_1 ) ) {
2409                     go_annotation_present = true;
2410                     any_go_annotation_present = true;
2411                     go_ids = domain_id_to_go_ids_map.get( domain_1 );
2412                 }
2413             }
2414             if ( go_annotation_present ) {
2415                 boolean first = ( ( d == 0 ) || ( ( d == 1 ) && first_has_no_go ) );
2416                 for( final GoId go_id : go_ids ) {
2417                     out.write( "<tr>" );
2418                     if ( first ) {
2419                         first = false;
2420                         writeDomainIdsToHtml( out,
2421                                               domain_0,
2422                                               domain_1,
2423                                               prefix_for_html,
2424                                               domain_id_to_secondary_features_maps );
2425                     }
2426                     else {
2427                         out.write( "<td></td>" );
2428                     }
2429                     if ( !go_id_to_term_map.containsKey( go_id ) ) {
2430                         throw new IllegalArgumentException( "GO-id [" + go_id + "] not found in GO-id to GO-term map" );
2431                     }
2432                     final GoTerm go_term = go_id_to_term_map.get( go_id );
2433                     if ( ( go_namespace_limit == null ) || go_namespace_limit.equals( go_term.getGoNameSpace() ) ) {
2434                         // final String top = GoUtils.getPenultimateGoTerm( go_term, go_id_to_term_map ).getName();
2435                         final String go_id_str = go_id.getId();
2436                         out.write( "<td>" );
2437                         out.write( "<a href=\"" + SurfacingConstants.AMIGO_LINK + go_id_str
2438                                 + "\" target=\"amigo_window\">" + go_id_str + "</a>" );
2439                         out.write( "</td><td>" );
2440                         out.write( go_term.getName() );
2441                         if ( domain_count == 2 ) {
2442                             out.write( " (" + d + ")" );
2443                         }
2444                         out.write( "</td><td>" );
2445                         // out.write( top );
2446                         // out.write( "</td><td>" );
2447                         out.write( "[" );
2448                         out.write( go_term.getGoNameSpace().toShortString() );
2449                         out.write( "]" );
2450                         out.write( "</td>" );
2451                         if ( all_go_ids != null ) {
2452                             all_go_ids.add( go_id );
2453                         }
2454                     }
2455                     else {
2456                         out.write( "<td>" );
2457                         out.write( "</td><td>" );
2458                         out.write( "</td><td>" );
2459                         out.write( "</td><td>" );
2460                         out.write( "</td>" );
2461                     }
2462                     out.write( "</tr>" );
2463                     out.write( SurfacingConstants.NL );
2464                 }
2465             }
2466         } //  for( int d = 0; d < domain_count; ++d ) 
2467         if ( !any_go_annotation_present ) {
2468             out.write( "<tr>" );
2469             writeDomainIdsToHtml( out, domain_0, domain_1, prefix_for_html, domain_id_to_secondary_features_maps );
2470             out.write( "<td>" );
2471             out.write( "</td><td>" );
2472             out.write( "</td><td>" );
2473             out.write( "</td><td>" );
2474             out.write( "</td>" );
2475             out.write( "</tr>" );
2476             out.write( SurfacingConstants.NL );
2477         }
2478     }
2479
2480     private static void writeDomainIdsToHtml( final Writer out,
2481                                               final String domain_0,
2482                                               final String domain_1,
2483                                               final String prefix_for_detailed_html,
2484                                               final Map<String, Set<String>>[] domain_id_to_secondary_features_maps )
2485             throws IOException {
2486         out.write( "<td>" );
2487         if ( !ForesterUtil.isEmpty( prefix_for_detailed_html ) ) {
2488             out.write( prefix_for_detailed_html );
2489             out.write( " " );
2490         }
2491         out.write( "<a href=\"" + SurfacingConstants.PFAM_FAMILY_ID_LINK + domain_0 + "\">" + domain_0 + "</a>" );
2492         out.write( "</td>" );
2493     }
2494
2495     private static void writeDomainsToIndividualFilePerTreeNode( final Writer individual_files_writer,
2496                                                                  final String domain_0,
2497                                                                  final String domain_1 ) throws IOException {
2498         individual_files_writer.write( domain_0 );
2499         individual_files_writer.write( ForesterUtil.LINE_SEPARATOR );
2500         if ( !ForesterUtil.isEmpty( domain_1 ) ) {
2501             individual_files_writer.write( domain_1 );
2502             individual_files_writer.write( ForesterUtil.LINE_SEPARATOR );
2503         }
2504     }
2505
2506     private static void writePfamsToFile( final String outfile_name, final SortedSet<String> pfams ) {
2507         try {
2508             final Writer writer = new BufferedWriter( new FileWriter( new File( outfile_name ) ) );
2509             for( final String pfam : pfams ) {
2510                 writer.write( pfam );
2511                 writer.write( ForesterUtil.LINE_SEPARATOR );
2512             }
2513             writer.close();
2514             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote " + pfams.size() + " pfams to [" + outfile_name
2515                     + "]" );
2516         }
2517         catch ( final IOException e ) {
2518             ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e );
2519         }
2520     }
2521
2522     private static void writeToNexus( final String outfile_name,
2523                                       final CharacterStateMatrix<BinaryStates> matrix,
2524                                       final Phylogeny phylogeny ) {
2525         if ( !( matrix instanceof BasicCharacterStateMatrix ) ) {
2526             throw new IllegalArgumentException( "can only write matrices of type [" + BasicCharacterStateMatrix.class
2527                     + "] to nexus" );
2528         }
2529         final BasicCharacterStateMatrix<BinaryStates> my_matrix = ( org.forester.evoinference.matrix.character.BasicCharacterStateMatrix<BinaryStates> ) matrix;
2530         final List<Phylogeny> phylogenies = new ArrayList<Phylogeny>( 1 );
2531         phylogenies.add( phylogeny );
2532         try {
2533             final BufferedWriter w = new BufferedWriter( new FileWriter( outfile_name ) );
2534             w.write( NexusConstants.NEXUS );
2535             w.write( ForesterUtil.LINE_SEPARATOR );
2536             my_matrix.writeNexusTaxaBlock( w );
2537             my_matrix.writeNexusBinaryChractersBlock( w );
2538             PhylogenyWriter.writeNexusTreesBlock( w, phylogenies, NH_CONVERSION_SUPPORT_VALUE_STYLE.NONE );
2539             w.flush();
2540             w.close();
2541             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote Nexus file: \"" + outfile_name + "\"" );
2542         }
2543         catch ( final IOException e ) {
2544             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
2545         }
2546     }
2547
2548     private static void writeToNexus( final String outfile_name,
2549                                       final DomainParsimonyCalculator domain_parsimony,
2550                                       final Phylogeny phylogeny ) {
2551         writeToNexus( outfile_name + surfacing.NEXUS_EXTERNAL_DOMAINS,
2552                       domain_parsimony.createMatrixOfDomainPresenceOrAbsence(),
2553                       phylogeny );
2554         writeToNexus( outfile_name + surfacing.NEXUS_EXTERNAL_DOMAIN_COMBINATIONS,
2555                       domain_parsimony.createMatrixOfBinaryDomainCombinationPresenceOrAbsence(),
2556                       phylogeny );
2557     }
2558
2559     final static class DomainComparator implements Comparator<Domain> {
2560
2561         final private boolean _ascending;
2562
2563         public DomainComparator( final boolean ascending ) {
2564             _ascending = ascending;
2565         }
2566
2567         @Override
2568         public final int compare( final Domain d0, final Domain d1 ) {
2569             if ( d0.getFrom() < d1.getFrom() ) {
2570                 return _ascending ? -1 : 1;
2571             }
2572             else if ( d0.getFrom() > d1.getFrom() ) {
2573                 return _ascending ? 1 : -1;
2574             }
2575             return 0;
2576         }
2577     }
2578
2579     final static Color getColorForTaxCode( final String tax ) {
2580         if ( tax.equals( "Deuterostomia" ) ) {
2581             return ForesterUtil.DEUTEROSTOMIA_COLOR;
2582         }
2583         else if ( tax.equals( "Protostomia" ) ) {
2584             return ForesterUtil.PROTOSTOMIA_COLOR;
2585         }
2586         else if ( tax.equals( "Metazoa" ) ) {
2587             return ForesterUtil.METAZOA_COLOR;
2588         }
2589         else if ( tax.equals( "Holozoa" ) ) {
2590             return ForesterUtil.HOLOZOA_COLOR;
2591         }
2592         else if ( tax.equals( "Fungi" ) ) {
2593             return ForesterUtil.FUNGI_COLOR;
2594         }
2595         else if ( tax.equals( "Holomycota" ) ) {
2596             return ForesterUtil.HOLOMYCOTA_COLOR;
2597         }
2598         else if ( tax.equals( "Amoebozoa" ) ) {
2599             return ForesterUtil.AMOEBOZOA_COLOR;
2600         }
2601         else if ( tax.equals( "Viridiplantae" ) ) {
2602             return ForesterUtil.VIRIDPLANTAE_COLOR;
2603         }
2604         else if ( tax.equals( "Rhodophytaa" ) ) {
2605             return ForesterUtil.RHODOPHYTA_COLOR;
2606         }
2607         else if ( tax.startsWith( "Hacrobia" ) ) {
2608             return ForesterUtil.HACROBIA_COLOR;
2609         }
2610         else if ( tax.equals( "Stramenopiles" ) ) {
2611             return ForesterUtil.STRAMENOPILES_COLOR;
2612         }
2613         else if ( tax.equals( "Alveolata" ) ) {
2614             return ForesterUtil.ALVEOLATA_COLOR;
2615         }
2616         else if ( tax.equals( "Rhizaria" ) ) {
2617             return ForesterUtil.RHIZARIA_COLOR;
2618         }
2619         else if ( tax.equals( "Excavata" ) ) {
2620             return ForesterUtil.EXCAVATA_COLOR;
2621         }
2622         else if ( tax.equals( "Archaea" ) ) {
2623             return ForesterUtil.ARCHAEA_COLOR;
2624         }
2625         else if ( tax.equals( "Bacteria" ) ) {
2626             return ForesterUtil.BACTERIA_COLOR;
2627         }
2628         return null;
2629     }
2630 }