inprogress
[jalview.git] / forester / java / src / org / forester / surfacing / SurfacingUtil.java
1 // $Id:
2 //
3 // FORESTER -- software libraries and applications
4 // for evolutionary biology research and applications.
5 //
6 // Copyright (C) 2008-2009 Christian M. Zmasek
7 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
8 // All rights reserved
9 //
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
14 //
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 //
24 // Contact: phylosoft @ gmail . com
25 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26
27 package org.forester.surfacing;
28
29 import java.awt.Color;
30 import java.io.BufferedWriter;
31 import java.io.File;
32 import java.io.FileWriter;
33 import java.io.IOException;
34 import java.io.Writer;
35 import java.text.DecimalFormat;
36 import java.text.NumberFormat;
37 import java.util.ArrayList;
38 import java.util.Arrays;
39 import java.util.Collections;
40 import java.util.Comparator;
41 import java.util.HashMap;
42 import java.util.HashSet;
43 import java.util.Iterator;
44 import java.util.List;
45 import java.util.Map;
46 import java.util.Map.Entry;
47 import java.util.PriorityQueue;
48 import java.util.Set;
49 import java.util.SortedMap;
50 import java.util.SortedSet;
51 import java.util.TreeMap;
52 import java.util.TreeSet;
53 import java.util.regex.Matcher;
54 import java.util.regex.Pattern;
55
56 import org.forester.application.surfacing;
57 import org.forester.evoinference.distance.NeighborJoining;
58 import org.forester.evoinference.matrix.character.BasicCharacterStateMatrix;
59 import org.forester.evoinference.matrix.character.CharacterStateMatrix;
60 import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates;
61 import org.forester.evoinference.matrix.character.CharacterStateMatrix.Format;
62 import org.forester.evoinference.matrix.character.CharacterStateMatrix.GainLossStates;
63 import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
64 import org.forester.evoinference.matrix.distance.DistanceMatrix;
65 import org.forester.go.GoId;
66 import org.forester.go.GoNameSpace;
67 import org.forester.go.GoTerm;
68 import org.forester.go.PfamToGoMapping;
69 import org.forester.io.parsers.nexus.NexusConstants;
70 import org.forester.io.parsers.phyloxml.PhyloXmlUtil;
71 import org.forester.io.parsers.util.ParserUtils;
72 import org.forester.io.writers.PhylogenyWriter;
73 import org.forester.phylogeny.Phylogeny;
74 import org.forester.phylogeny.PhylogenyMethods;
75 import org.forester.phylogeny.PhylogenyNode;
76 import org.forester.phylogeny.PhylogenyNode.NH_CONVERSION_SUPPORT_VALUE_STYLE;
77 import org.forester.phylogeny.data.BinaryCharacters;
78 import org.forester.phylogeny.data.Confidence;
79 import org.forester.phylogeny.data.Taxonomy;
80 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
81 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
82 import org.forester.protein.BasicDomain;
83 import org.forester.protein.BasicProtein;
84 import org.forester.protein.BinaryDomainCombination;
85 import org.forester.protein.Domain;
86 import org.forester.protein.Protein;
87 import org.forester.species.Species;
88 import org.forester.surfacing.DomainSimilarity.PRINT_OPTION;
89 import org.forester.surfacing.DomainSimilarityCalculator.Detailedness;
90 import org.forester.surfacing.GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder;
91 import org.forester.util.AsciiHistogram;
92 import org.forester.util.BasicDescriptiveStatistics;
93 import org.forester.util.BasicTable;
94 import org.forester.util.BasicTableParser;
95 import org.forester.util.CommandLineArguments;
96 import org.forester.util.DescriptiveStatistics;
97 import org.forester.util.ForesterUtil;
98 import org.forester.util.TaxonomyColors;
99
100 public final class SurfacingUtil {
101
102     public final static Pattern              PATTERN_SP_STYLE_TAXONOMY        = Pattern.compile( "^[A-Z0-9]{3,5}$" );
103     private final static Map<String, String> _TAXCODE_HEXCOLORSTRING_MAP      = new HashMap<String, String>();
104     private final static Map<String, String> _TAXCODE_TAXGROUP_MAP            = new HashMap<String, String>();
105     private static final Comparator<Domain>  ASCENDING_CONFIDENCE_VALUE_ORDER = new Comparator<Domain>() {
106
107                                                                                   @Override
108                                                                                   public int compare( final Domain d1,
109                                                                                                       final Domain d2 ) {
110                                                                                       if ( d1.getPerSequenceEvalue() < d2
111                                                                                               .getPerSequenceEvalue() ) {
112                                                                                           return -1;
113                                                                                       }
114                                                                                       else if ( d1
115                                                                                               .getPerSequenceEvalue() > d2
116                                                                                               .getPerSequenceEvalue() ) {
117                                                                                           return 1;
118                                                                                       }
119                                                                                       else {
120                                                                                           return d1.compareTo( d2 );
121                                                                                       }
122                                                                                   }
123                                                                               };
124     private final static NumberFormat        FORMATTER_3                      = new DecimalFormat( "0.000" );
125
126     private SurfacingUtil() {
127         // Hidden constructor.
128     }
129
130     public static void addAllBinaryDomainCombinationToSet( final GenomeWideCombinableDomains genome,
131                                                            final SortedSet<BinaryDomainCombination> binary_domain_combinations ) {
132         final SortedMap<String, CombinableDomains> all_cd = genome.getAllCombinableDomainsIds();
133         for( final String domain_id : all_cd.keySet() ) {
134             binary_domain_combinations.addAll( all_cd.get( domain_id ).toBinaryDomainCombinations() );
135         }
136     }
137
138     public static void addAllDomainIdsToSet( final GenomeWideCombinableDomains genome,
139                                              final SortedSet<String> domain_ids ) {
140         final SortedSet<String> domains = genome.getAllDomainIds();
141         for( final String domain : domains ) {
142             domain_ids.add( domain );
143         }
144     }
145
146     public static DescriptiveStatistics calculateDescriptiveStatisticsForMeanValues( final Set<DomainSimilarity> similarities ) {
147         final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
148         for( final DomainSimilarity similarity : similarities ) {
149             stats.addValue( similarity.getMeanSimilarityScore() );
150         }
151         return stats;
152     }
153
154     public static void checkForOutputFileWriteability( final File outfile ) {
155         final String error = ForesterUtil.isWritableFile( outfile );
156         if ( !ForesterUtil.isEmpty( error ) ) {
157             ForesterUtil.fatalError( surfacing.PRG_NAME, error );
158         }
159     }
160
161     public static void checkWriteabilityForPairwiseComparisons( final DomainSimilarity.PRINT_OPTION domain_similarity_print_option,
162                                                                 final String[][] input_file_properties,
163                                                                 final String automated_pairwise_comparison_suffix,
164                                                                 final File outdir ) {
165         for( int i = 0; i < input_file_properties.length; ++i ) {
166             for( int j = 0; j < i; ++j ) {
167                 final String species_i = input_file_properties[ i ][ 1 ];
168                 final String species_j = input_file_properties[ j ][ 1 ];
169                 String pairwise_similarities_output_file_str = surfacing.PAIRWISE_DOMAIN_COMPARISONS_PREFIX + species_i
170                         + "_" + species_j + automated_pairwise_comparison_suffix;
171                 switch ( domain_similarity_print_option ) {
172                     case HTML:
173                         if ( !pairwise_similarities_output_file_str.endsWith( ".html" ) ) {
174                             pairwise_similarities_output_file_str += ".html";
175                         }
176                         break;
177                 }
178                 final String error = ForesterUtil
179                         .isWritableFile( new File( outdir == null ? pairwise_similarities_output_file_str : outdir
180                                 + ForesterUtil.FILE_SEPARATOR + pairwise_similarities_output_file_str ) );
181                 if ( !ForesterUtil.isEmpty( error ) ) {
182                     ForesterUtil.fatalError( surfacing.PRG_NAME, error );
183                 }
184             }
185         }
186     }
187
188     public static void collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( final CharacterStateMatrix<CharacterStateMatrix.GainLossStates> matrix,
189                                                                                            final BinaryDomainCombination.DomainCombinationType dc_type,
190                                                                                            final List<BinaryDomainCombination> all_binary_domains_combination_gained,
191                                                                                            final boolean get_gains ) {
192         final SortedSet<String> sorted_ids = new TreeSet<String>();
193         for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) {
194             sorted_ids.add( matrix.getIdentifier( i ) );
195         }
196         for( final String id : sorted_ids ) {
197             for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) {
198                 if ( ( get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) )
199                         || ( !get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.LOSS ) ) ) {
200                     if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT ) {
201                         all_binary_domains_combination_gained.add( AdjactantDirectedBinaryDomainCombination
202                                 .createInstance( matrix.getCharacter( c ) ) );
203                     }
204                     else if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED ) {
205                         all_binary_domains_combination_gained.add( DirectedBinaryDomainCombination
206                                 .createInstance( matrix.getCharacter( c ) ) );
207                     }
208                     else {
209                         all_binary_domains_combination_gained.add( BasicBinaryDomainCombination.createInstance( matrix
210                                 .getCharacter( c ) ) );
211                     }
212                 }
213             }
214         }
215     }
216
217     public static Map<String, List<GoId>> createDomainIdToGoIdMap( final List<PfamToGoMapping> pfam_to_go_mappings ) {
218         final Map<String, List<GoId>> domain_id_to_go_ids_map = new HashMap<String, List<GoId>>( pfam_to_go_mappings.size() );
219         for( final PfamToGoMapping pfam_to_go : pfam_to_go_mappings ) {
220             if ( !domain_id_to_go_ids_map.containsKey( pfam_to_go.getKey() ) ) {
221                 domain_id_to_go_ids_map.put( pfam_to_go.getKey(), new ArrayList<GoId>() );
222             }
223             domain_id_to_go_ids_map.get( pfam_to_go.getKey() ).add( pfam_to_go.getValue() );
224         }
225         return domain_id_to_go_ids_map;
226     }
227
228     public static Map<String, Set<String>> createDomainIdToSecondaryFeaturesMap( final File secondary_features_map_file )
229             throws IOException {
230         final BasicTable<String> primary_table = BasicTableParser.parse( secondary_features_map_file, '\t' );
231         final Map<String, Set<String>> map = new TreeMap<String, Set<String>>();
232         for( int r = 0; r < primary_table.getNumberOfRows(); ++r ) {
233             final String domain_id = primary_table.getValue( 0, r );
234             if ( !map.containsKey( domain_id ) ) {
235                 map.put( domain_id, new HashSet<String>() );
236             }
237             map.get( domain_id ).add( primary_table.getValue( 1, r ) );
238         }
239         return map;
240     }
241
242     public static Phylogeny createNjTreeBasedOnMatrixToFile( final File nj_tree_outfile, final DistanceMatrix distance ) {
243         checkForOutputFileWriteability( nj_tree_outfile );
244         final NeighborJoining nj = NeighborJoining.createInstance();
245         final Phylogeny phylogeny = nj.execute( ( BasicSymmetricalDistanceMatrix ) distance );
246         phylogeny.setName( nj_tree_outfile.getName() );
247         writePhylogenyToFile( phylogeny, nj_tree_outfile.toString() );
248         return phylogeny;
249     }
250
251     public static StringBuilder createParametersAsString( final boolean ignore_dufs,
252                                                           final double e_value_max,
253                                                           final int max_allowed_overlap,
254                                                           final boolean no_engulfing_overlaps,
255                                                           final File cutoff_scores_file,
256                                                           final BinaryDomainCombination.DomainCombinationType dc_type ) {
257         final StringBuilder parameters_sb = new StringBuilder();
258         parameters_sb.append( "E-value: " + e_value_max );
259         if ( cutoff_scores_file != null ) {
260             parameters_sb.append( ", Cutoff-scores-file: " + cutoff_scores_file );
261         }
262         else {
263             parameters_sb.append( ", Cutoff-scores-file: not-set" );
264         }
265         if ( max_allowed_overlap != surfacing.MAX_ALLOWED_OVERLAP_DEFAULT ) {
266             parameters_sb.append( ", Max-overlap: " + max_allowed_overlap );
267         }
268         else {
269             parameters_sb.append( ", Max-overlap: not-set" );
270         }
271         if ( no_engulfing_overlaps ) {
272             parameters_sb.append( ", Engulfing-overlaps: not-allowed" );
273         }
274         else {
275             parameters_sb.append( ", Engulfing-overlaps: allowed" );
276         }
277         if ( ignore_dufs ) {
278             parameters_sb.append( ", Ignore-dufs: true" );
279         }
280         else {
281             parameters_sb.append( ", Ignore-dufs: false" );
282         }
283         parameters_sb.append( ", DC type (if applicable): " + dc_type );
284         return parameters_sb;
285     }
286
287     public static void createSplitWriters( final File out_dir,
288                                            final String my_outfile,
289                                            final Map<Character, Writer> split_writers ) throws IOException {
290         split_writers.put( 'a', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
291                 + "_domains_A.html" ) ) );
292         split_writers.put( 'b', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
293                 + "_domains_B.html" ) ) );
294         split_writers.put( 'c', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
295                 + "_domains_C.html" ) ) );
296         split_writers.put( 'd', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
297                 + "_domains_D.html" ) ) );
298         split_writers.put( 'e', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
299                 + "_domains_E.html" ) ) );
300         split_writers.put( 'f', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
301                 + "_domains_F.html" ) ) );
302         split_writers.put( 'g', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
303                 + "_domains_G.html" ) ) );
304         split_writers.put( 'h', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
305                 + "_domains_H.html" ) ) );
306         split_writers.put( 'i', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
307                 + "_domains_I.html" ) ) );
308         split_writers.put( 'j', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
309                 + "_domains_J.html" ) ) );
310         split_writers.put( 'k', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
311                 + "_domains_K.html" ) ) );
312         split_writers.put( 'l', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
313                 + "_domains_L.html" ) ) );
314         split_writers.put( 'm', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
315                 + "_domains_M.html" ) ) );
316         split_writers.put( 'n', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
317                 + "_domains_N.html" ) ) );
318         split_writers.put( 'o', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
319                 + "_domains_O.html" ) ) );
320         split_writers.put( 'p', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
321                 + "_domains_P.html" ) ) );
322         split_writers.put( 'q', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
323                 + "_domains_Q.html" ) ) );
324         split_writers.put( 'r', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
325                 + "_domains_R.html" ) ) );
326         split_writers.put( 's', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
327                 + "_domains_S.html" ) ) );
328         split_writers.put( 't', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
329                 + "_domains_T.html" ) ) );
330         split_writers.put( 'u', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
331                 + "_domains_U.html" ) ) );
332         split_writers.put( 'v', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
333                 + "_domains_V.html" ) ) );
334         split_writers.put( 'w', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
335                 + "_domains_W.html" ) ) );
336         split_writers.put( 'x', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
337                 + "_domains_X.html" ) ) );
338         split_writers.put( 'y', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
339                 + "_domains_Y.html" ) ) );
340         split_writers.put( 'z', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
341                 + "_domains_Z.html" ) ) );
342         split_writers.put( '0', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
343                 + "_domains_0.html" ) ) );
344     }
345
346     public static Map<String, Integer> createTaxCodeToIdMap( final Phylogeny phy ) {
347         final Map<String, Integer> m = new HashMap<String, Integer>();
348         for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) {
349             final PhylogenyNode n = iter.next();
350             if ( n.getNodeData().isHasTaxonomy() ) {
351                 final Taxonomy t = n.getNodeData().getTaxonomy();
352                 final String c = t.getTaxonomyCode();
353                 if ( !ForesterUtil.isEmpty( c ) ) {
354                     if ( n.getNodeData().getTaxonomy() == null ) {
355                         ForesterUtil.fatalError( surfacing.PRG_NAME, "no taxonomy id for node " + n );
356                     }
357                     final String id = n.getNodeData().getTaxonomy().getIdentifier().getValue();
358                     if ( ForesterUtil.isEmpty( id ) ) {
359                         ForesterUtil.fatalError( surfacing.PRG_NAME, "no taxonomy id for node " + n );
360                     }
361                     if ( m.containsKey( c ) ) {
362                         ForesterUtil.fatalError( surfacing.PRG_NAME, "taxonomy code " + c + " is not unique" );
363                     }
364                     final int iid = Integer.valueOf( id );
365                     if ( m.containsValue( iid ) ) {
366                         ForesterUtil.fatalError( surfacing.PRG_NAME, "taxonomy id " + iid + " is not unique" );
367                     }
368                     m.put( c, iid );
369                 }
370             }
371             else {
372                 ForesterUtil.fatalError( surfacing.PRG_NAME, "no taxonomy for node " + n );
373             }
374         }
375         return m;
376     }
377
378     public static void decoratePrintableDomainSimilarities( final SortedSet<DomainSimilarity> domain_similarities,
379                                                             final Detailedness detailedness ) {
380         for( final DomainSimilarity domain_similarity : domain_similarities ) {
381             if ( domain_similarity instanceof DomainSimilarity ) {
382                 final DomainSimilarity printable_domain_similarity = domain_similarity;
383                 printable_domain_similarity.setDetailedness( detailedness );
384             }
385         }
386     }
387
388     public static void doit( final List<Protein> proteins,
389                              final List<String> query_domain_ids_nc_order,
390                              final Writer out,
391                              final String separator,
392                              final String limit_to_species,
393                              final Map<String, List<Integer>> average_protein_lengths_by_dc ) throws IOException {
394         for( final Protein protein : proteins ) {
395             if ( ForesterUtil.isEmpty( limit_to_species )
396                     || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) {
397                 if ( protein.contains( query_domain_ids_nc_order, true ) ) {
398                     out.write( protein.getSpecies().getSpeciesId() );
399                     out.write( separator );
400                     out.write( protein.getProteinId().getId() );
401                     out.write( separator );
402                     out.write( "[" );
403                     final Set<String> visited_domain_ids = new HashSet<String>();
404                     boolean first = true;
405                     for( final Domain domain : protein.getProteinDomains() ) {
406                         if ( !visited_domain_ids.contains( domain.getDomainId() ) ) {
407                             visited_domain_ids.add( domain.getDomainId() );
408                             if ( first ) {
409                                 first = false;
410                             }
411                             else {
412                                 out.write( " " );
413                             }
414                             out.write( domain.getDomainId() );
415                             out.write( " {" );
416                             out.write( "" + domain.getTotalCount() );
417                             out.write( "}" );
418                         }
419                     }
420                     out.write( "]" );
421                     out.write( separator );
422                     if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription()
423                             .equals( SurfacingConstants.NONE ) ) ) {
424                         out.write( protein.getDescription() );
425                     }
426                     out.write( separator );
427                     if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession()
428                             .equals( SurfacingConstants.NONE ) ) ) {
429                         out.write( protein.getAccession() );
430                     }
431                     out.write( SurfacingConstants.NL );
432                 }
433             }
434         }
435         out.flush();
436     }
437
438     public static void domainsPerProteinsStatistics( final String genome,
439                                                      final List<Protein> protein_list,
440                                                      final DescriptiveStatistics all_genomes_domains_per_potein_stats,
441                                                      final SortedMap<Integer, Integer> all_genomes_domains_per_potein_histo,
442                                                      final SortedSet<String> domains_which_are_always_single,
443                                                      final SortedSet<String> domains_which_are_sometimes_single_sometimes_not,
444                                                      final SortedSet<String> domains_which_never_single,
445                                                      final Writer writer ) {
446         final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
447         for( final Protein protein : protein_list ) {
448             final int domains = protein.getNumberOfProteinDomains();
449             //System.out.println( domains );
450             stats.addValue( domains );
451             all_genomes_domains_per_potein_stats.addValue( domains );
452             if ( !all_genomes_domains_per_potein_histo.containsKey( domains ) ) {
453                 all_genomes_domains_per_potein_histo.put( domains, 1 );
454             }
455             else {
456                 all_genomes_domains_per_potein_histo.put( domains,
457                                                           1 + all_genomes_domains_per_potein_histo.get( domains ) );
458             }
459             if ( domains == 1 ) {
460                 final String domain = protein.getProteinDomain( 0 ).getDomainId();
461                 if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) {
462                     if ( domains_which_never_single.contains( domain ) ) {
463                         domains_which_never_single.remove( domain );
464                         domains_which_are_sometimes_single_sometimes_not.add( domain );
465                     }
466                     else {
467                         domains_which_are_always_single.add( domain );
468                     }
469                 }
470             }
471             else if ( domains > 1 ) {
472                 for( final Domain d : protein.getProteinDomains() ) {
473                     final String domain = d.getDomainId();
474                     // System.out.println( domain );
475                     if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) {
476                         if ( domains_which_are_always_single.contains( domain ) ) {
477                             domains_which_are_always_single.remove( domain );
478                             domains_which_are_sometimes_single_sometimes_not.add( domain );
479                         }
480                         else {
481                             domains_which_never_single.add( domain );
482                         }
483                     }
484                 }
485             }
486         }
487         try {
488             writer.write( genome );
489             writer.write( "\t" );
490             if ( stats.getN() >= 1 ) {
491                 writer.write( stats.arithmeticMean() + "" );
492                 writer.write( "\t" );
493                 if ( stats.getN() >= 2 ) {
494                     writer.write( stats.sampleStandardDeviation() + "" );
495                 }
496                 else {
497                     writer.write( "" );
498                 }
499                 writer.write( "\t" );
500                 writer.write( stats.median() + "" );
501                 writer.write( "\t" );
502                 writer.write( stats.getN() + "" );
503                 writer.write( "\t" );
504                 writer.write( stats.getMin() + "" );
505                 writer.write( "\t" );
506                 writer.write( stats.getMax() + "" );
507             }
508             else {
509                 writer.write( "\t" );
510                 writer.write( "\t" );
511                 writer.write( "\t" );
512                 writer.write( "0" );
513                 writer.write( "\t" );
514                 writer.write( "\t" );
515             }
516             writer.write( "\n" );
517         }
518         catch ( final IOException e ) {
519             e.printStackTrace();
520         }
521     }
522
523     public static void executeDomainLengthAnalysis( final String[][] input_file_properties,
524                                                     final int number_of_genomes,
525                                                     final DomainLengthsTable domain_lengths_table,
526                                                     final File outfile ) throws IOException {
527         final DecimalFormat df = new DecimalFormat( "#.00" );
528         checkForOutputFileWriteability( outfile );
529         final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
530         out.write( "MEAN BASED STATISTICS PER SPECIES" );
531         out.write( ForesterUtil.LINE_SEPARATOR );
532         out.write( domain_lengths_table.createMeanBasedStatisticsPerSpeciesTable().toString() );
533         out.write( ForesterUtil.LINE_SEPARATOR );
534         out.write( ForesterUtil.LINE_SEPARATOR );
535         final List<DomainLengths> domain_lengths_list = domain_lengths_table.getDomainLengthsList();
536         out.write( "OUTLIER SPECIES PER DOMAIN (Z>=1.5)" );
537         out.write( ForesterUtil.LINE_SEPARATOR );
538         for( final DomainLengths domain_lengths : domain_lengths_list ) {
539             final List<Species> species_list = domain_lengths.getMeanBasedOutlierSpecies( 1.5 );
540             if ( species_list.size() > 0 ) {
541                 out.write( domain_lengths.getDomainId() + "\t" );
542                 for( final Species species : species_list ) {
543                     out.write( species + "\t" );
544                 }
545                 out.write( ForesterUtil.LINE_SEPARATOR );
546             }
547         }
548         out.write( ForesterUtil.LINE_SEPARATOR );
549         out.write( ForesterUtil.LINE_SEPARATOR );
550         out.write( "OUTLIER SPECIES (Z 1.0)" );
551         out.write( ForesterUtil.LINE_SEPARATOR );
552         final DescriptiveStatistics stats_for_all_species = domain_lengths_table
553                 .calculateMeanBasedStatisticsForAllSpecies();
554         out.write( stats_for_all_species.asSummary() );
555         out.write( ForesterUtil.LINE_SEPARATOR );
556         final AsciiHistogram histo = new AsciiHistogram( stats_for_all_species );
557         out.write( histo.toStringBuffer( 40, '=', 60, 4 ).toString() );
558         out.write( ForesterUtil.LINE_SEPARATOR );
559         final double population_sd = stats_for_all_species.sampleStandardDeviation();
560         final double population_mean = stats_for_all_species.arithmeticMean();
561         for( final Species species : domain_lengths_table.getSpecies() ) {
562             final double x = domain_lengths_table.calculateMeanBasedStatisticsForSpecies( species ).arithmeticMean();
563             final double z = ( x - population_mean ) / population_sd;
564             out.write( species + "\t" + z );
565             out.write( ForesterUtil.LINE_SEPARATOR );
566         }
567         out.write( ForesterUtil.LINE_SEPARATOR );
568         for( final Species species : domain_lengths_table.getSpecies() ) {
569             final DescriptiveStatistics stats_for_species = domain_lengths_table
570                     .calculateMeanBasedStatisticsForSpecies( species );
571             final double x = stats_for_species.arithmeticMean();
572             final double z = ( x - population_mean ) / population_sd;
573             if ( ( z <= -1.0 ) || ( z >= 1.0 ) ) {
574                 out.write( species + "\t" + df.format( z ) + "\t" + stats_for_species.asSummary() );
575                 out.write( ForesterUtil.LINE_SEPARATOR );
576             }
577         }
578         out.close();
579         System.gc();
580     }
581
582     /**
583      * Warning: This side-effects 'all_bin_domain_combinations_encountered'!
584      * 
585      * 
586      * @param output_file
587      * @param all_bin_domain_combinations_changed
588      * @param sum_of_all_domains_encountered
589      * @param all_bin_domain_combinations_encountered
590      * @param is_gains_analysis
591      * @param protein_length_stats_by_dc 
592      * @throws IOException
593      */
594     public static void executeFitchGainsAnalysis( final File output_file,
595                                                   final List<BinaryDomainCombination> all_bin_domain_combinations_changed,
596                                                   final int sum_of_all_domains_encountered,
597                                                   final SortedSet<BinaryDomainCombination> all_bin_domain_combinations_encountered,
598                                                   final boolean is_gains_analysis ) throws IOException {
599         checkForOutputFileWriteability( output_file );
600         final Writer out = ForesterUtil.createBufferedWriter( output_file );
601         final SortedMap<Object, Integer> bdc_to_counts = ForesterUtil
602                 .listToSortedCountsMap( all_bin_domain_combinations_changed );
603         final SortedSet<String> all_domains_in_combination_changed_more_than_once = new TreeSet<String>();
604         final SortedSet<String> all_domains_in_combination_changed_only_once = new TreeSet<String>();
605         int above_one = 0;
606         int one = 0;
607         for( final Object bdc_object : bdc_to_counts.keySet() ) {
608             final BinaryDomainCombination bdc = ( BinaryDomainCombination ) bdc_object;
609             final int count = bdc_to_counts.get( bdc_object );
610             if ( count < 1 ) {
611                 ForesterUtil.unexpectedFatalError( surfacing.PRG_NAME, "count < 1 " );
612             }
613             out.write( bdc + "\t" + count + ForesterUtil.LINE_SEPARATOR );
614             if ( count > 1 ) {
615                 all_domains_in_combination_changed_more_than_once.add( bdc.getId0() );
616                 all_domains_in_combination_changed_more_than_once.add( bdc.getId1() );
617                 above_one++;
618             }
619             else if ( count == 1 ) {
620                 all_domains_in_combination_changed_only_once.add( bdc.getId0() );
621                 all_domains_in_combination_changed_only_once.add( bdc.getId1() );
622                 one++;
623             }
624         }
625         final int all = all_bin_domain_combinations_encountered.size();
626         int never_lost = -1;
627         if ( !is_gains_analysis ) {
628             all_bin_domain_combinations_encountered.removeAll( all_bin_domain_combinations_changed );
629             never_lost = all_bin_domain_combinations_encountered.size();
630             for( final BinaryDomainCombination bdc : all_bin_domain_combinations_encountered ) {
631                 out.write( bdc + "\t" + "0" + ForesterUtil.LINE_SEPARATOR );
632             }
633         }
634         if ( is_gains_analysis ) {
635             out.write( "Sum of all distinct domain combinations appearing once               : " + one
636                     + ForesterUtil.LINE_SEPARATOR );
637             out.write( "Sum of all distinct domain combinations appearing more than once     : " + above_one
638                     + ForesterUtil.LINE_SEPARATOR );
639             out.write( "Sum of all distinct domains in combinations apppearing only once     : "
640                     + all_domains_in_combination_changed_only_once.size() + ForesterUtil.LINE_SEPARATOR );
641             out.write( "Sum of all distinct domains in combinations apppearing more than once: "
642                     + all_domains_in_combination_changed_more_than_once.size() + ForesterUtil.LINE_SEPARATOR );
643         }
644         else {
645             out.write( "Sum of all distinct domain combinations never lost                   : " + never_lost
646                     + ForesterUtil.LINE_SEPARATOR );
647             out.write( "Sum of all distinct domain combinations lost once                    : " + one
648                     + ForesterUtil.LINE_SEPARATOR );
649             out.write( "Sum of all distinct domain combinations lost more than once          : " + above_one
650                     + ForesterUtil.LINE_SEPARATOR );
651             out.write( "Sum of all distinct domains in combinations lost only once           : "
652                     + all_domains_in_combination_changed_only_once.size() + ForesterUtil.LINE_SEPARATOR );
653             out.write( "Sum of all distinct domains in combinations lost more than once: "
654                     + all_domains_in_combination_changed_more_than_once.size() + ForesterUtil.LINE_SEPARATOR );
655         }
656         out.write( "All binary combinations                                              : " + all
657                 + ForesterUtil.LINE_SEPARATOR );
658         out.write( "All domains                                                          : "
659                 + sum_of_all_domains_encountered );
660         out.close();
661         ForesterUtil.programMessage( surfacing.PRG_NAME,
662                                      "Wrote fitch domain combination dynamics counts analysis to \"" + output_file
663                                              + "\"" );
664     }
665
666     /**
667      * 
668      * @param all_binary_domains_combination_lost_fitch 
669      * @param use_last_in_fitch_parsimony 
670      * @param consider_directedness_and_adjacency_for_bin_combinations 
671      * @param all_binary_domains_combination_gained if null ignored, otherwise this is to list all binary domain combinations
672      * which were gained under unweighted (Fitch) parsimony.
673      */
674     public static void executeParsimonyAnalysis( final long random_number_seed_for_fitch_parsimony,
675                                                  final boolean radomize_fitch_parsimony,
676                                                  final String outfile_name,
677                                                  final DomainParsimonyCalculator domain_parsimony,
678                                                  final Phylogeny phylogeny,
679                                                  final Map<String, List<GoId>> domain_id_to_go_ids_map,
680                                                  final Map<GoId, GoTerm> go_id_to_term_map,
681                                                  final GoNameSpace go_namespace_limit,
682                                                  final String parameters_str,
683                                                  final Map<String, Set<String>>[] domain_id_to_secondary_features_maps,
684                                                  final SortedSet<String> positive_filter,
685                                                  final boolean output_binary_domain_combinations_for_graphs,
686                                                  final List<BinaryDomainCombination> all_binary_domains_combination_gained_fitch,
687                                                  final List<BinaryDomainCombination> all_binary_domains_combination_lost_fitch,
688                                                  final BinaryDomainCombination.DomainCombinationType dc_type,
689                                                  final Map<String, DescriptiveStatistics> protein_length_stats_by_dc,
690                                                  final Map<String, DescriptiveStatistics> domain_number_stats_by_dc,
691                                                  final Map<String, DescriptiveStatistics> domain_length_stats_by_domain,
692                                                  final Map<String, Integer> tax_code_to_id_map,
693                                                  final boolean write_to_nexus,
694                                                  final boolean use_last_in_fitch_parsimony ) {
695         final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR;
696         final String date_time = ForesterUtil.getCurrentDateTime();
697         final SortedSet<String> all_pfams_encountered = new TreeSet<String>();
698         final SortedSet<String> all_pfams_gained_as_domains = new TreeSet<String>();
699         final SortedSet<String> all_pfams_lost_as_domains = new TreeSet<String>();
700         final SortedSet<String> all_pfams_gained_as_dom_combinations = new TreeSet<String>();
701         final SortedSet<String> all_pfams_lost_as_dom_combinations = new TreeSet<String>();
702         if ( write_to_nexus ) {
703             writeToNexus( outfile_name, domain_parsimony, phylogeny );
704         }
705         // DOLLO DOMAINS
706         // -------------
707         Phylogeny local_phylogeny_l = phylogeny.copy();
708         if ( ( positive_filter != null ) && ( positive_filter.size() > 0 ) ) {
709             domain_parsimony.executeDolloParsimonyOnDomainPresence( positive_filter );
710         }
711         else {
712             domain_parsimony.executeDolloParsimonyOnDomainPresence();
713         }
714         SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossMatrix(), outfile_name
715                 + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_DOMAINS, Format.FORESTER );
716         SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossCountsMatrix(), outfile_name
717                 + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_DOMAINS, Format.FORESTER );
718         SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
719                                                            CharacterStateMatrix.GainLossStates.GAIN,
720                                                            outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_D,
721                                                            sep,
722                                                            ForesterUtil.LINE_SEPARATOR,
723                                                            null );
724         SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
725                                                            CharacterStateMatrix.GainLossStates.LOSS,
726                                                            outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_LOSSES_D,
727                                                            sep,
728                                                            ForesterUtil.LINE_SEPARATOR,
729                                                            null );
730         SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), null, outfile_name
731                 + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_D, sep, ForesterUtil.LINE_SEPARATOR, null );
732         //HTML:
733         writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
734                                        go_id_to_term_map,
735                                        go_namespace_limit,
736                                        false,
737                                        domain_parsimony.getGainLossMatrix(),
738                                        CharacterStateMatrix.GainLossStates.GAIN,
739                                        outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_HTML_D,
740                                        sep,
741                                        ForesterUtil.LINE_SEPARATOR,
742                                        "Dollo Parsimony | Gains | Domains",
743                                        "+",
744                                        domain_id_to_secondary_features_maps,
745                                        all_pfams_encountered,
746                                        all_pfams_gained_as_domains,
747                                        "_dollo_gains_d",
748                                        tax_code_to_id_map );
749         writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
750                                        go_id_to_term_map,
751                                        go_namespace_limit,
752                                        false,
753                                        domain_parsimony.getGainLossMatrix(),
754                                        CharacterStateMatrix.GainLossStates.LOSS,
755                                        outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_LOSSES_HTML_D,
756                                        sep,
757                                        ForesterUtil.LINE_SEPARATOR,
758                                        "Dollo Parsimony | Losses | Domains",
759                                        "-",
760                                        domain_id_to_secondary_features_maps,
761                                        all_pfams_encountered,
762                                        all_pfams_lost_as_domains,
763                                        "_dollo_losses_d",
764                                        tax_code_to_id_map );
765         //        writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
766         //                                       go_id_to_term_map,
767         //                                       go_namespace_limit,
768         //                                       false,
769         //                                       domain_parsimony.getGainLossMatrix(),
770         //                                       null,
771         //                                       outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_HTML_D,
772         //                                       sep,
773         //                                       ForesterUtil.LINE_SEPARATOR,
774         //                                       "Dollo Parsimony | Present | Domains",
775         //                                       "",
776         //                                       domain_id_to_secondary_features_maps,
777         //                                       all_pfams_encountered,
778         //                                       null,
779         //                                       "_dollo_present_d",
780         //                                       tax_code_to_id_map );
781         preparePhylogeny( local_phylogeny_l,
782                           domain_parsimony,
783                           date_time,
784                           "Dollo parsimony on domain presence/absence",
785                           "dollo_on_domains_" + outfile_name,
786                           parameters_str );
787         SurfacingUtil.writePhylogenyToFile( local_phylogeny_l, outfile_name
788                 + surfacing.DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO );
789         try {
790             writeAllDomainsChangedOnAllSubtrees( local_phylogeny_l, true, outfile_name, "_dollo_all_gains_d" );
791             writeAllDomainsChangedOnAllSubtrees( local_phylogeny_l, false, outfile_name, "_dollo_all_losses_d" );
792         }
793         catch ( final IOException e ) {
794             e.printStackTrace();
795             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
796         }
797         if ( domain_parsimony.calculateNumberOfBinaryDomainCombination() > 0 ) {
798             // FITCH DOMAIN COMBINATIONS
799             // -------------------------
800             local_phylogeny_l = phylogeny.copy();
801             String randomization = "no";
802             if ( radomize_fitch_parsimony ) {
803                 domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( random_number_seed_for_fitch_parsimony );
804                 randomization = "yes, seed = " + random_number_seed_for_fitch_parsimony;
805             }
806             else {
807                 domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( use_last_in_fitch_parsimony );
808             }
809             SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossMatrix(), outfile_name
810                     + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER );
811             SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossCountsMatrix(), outfile_name
812                     + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER );
813             SurfacingUtil
814                     .writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
815                                                           CharacterStateMatrix.GainLossStates.GAIN,
816                                                           outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_GAINS_BC,
817                                                           sep,
818                                                           ForesterUtil.LINE_SEPARATOR,
819                                                           null );
820             SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
821                                                                CharacterStateMatrix.GainLossStates.LOSS,
822                                                                outfile_name
823                                                                        + surfacing.PARSIMONY_OUTPUT_FITCH_LOSSES_BC,
824                                                                sep,
825                                                                ForesterUtil.LINE_SEPARATOR,
826                                                                null );
827             SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), null, outfile_name
828                     + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_BC, sep, ForesterUtil.LINE_SEPARATOR, null );
829             if ( all_binary_domains_combination_gained_fitch != null ) {
830                 collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
831                                                                                     dc_type,
832                                                                                     all_binary_domains_combination_gained_fitch,
833                                                                                     true );
834             }
835             if ( all_binary_domains_combination_lost_fitch != null ) {
836                 collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
837                                                                                     dc_type,
838                                                                                     all_binary_domains_combination_lost_fitch,
839                                                                                     false );
840             }
841             if ( output_binary_domain_combinations_for_graphs ) {
842                 SurfacingUtil
843                         .writeBinaryStatesMatrixAsListToFileForBinaryCombinationsForGraphAnalysis( domain_parsimony
844                                                                                                            .getGainLossMatrix(),
845                                                                                                    null,
846                                                                                                    outfile_name
847                                                                                                            + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_BC_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS,
848                                                                                                    sep,
849                                                                                                    ForesterUtil.LINE_SEPARATOR,
850                                                                                                    BinaryDomainCombination.OutputFormat.DOT );
851             }
852             // HTML:
853             writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
854                                            go_id_to_term_map,
855                                            go_namespace_limit,
856                                            true,
857                                            domain_parsimony.getGainLossMatrix(),
858                                            CharacterStateMatrix.GainLossStates.GAIN,
859                                            outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_GAINS_HTML_BC,
860                                            sep,
861                                            ForesterUtil.LINE_SEPARATOR,
862                                            "Fitch Parsimony | Gains | Domain Combinations",
863                                            "+",
864                                            null,
865                                            all_pfams_encountered,
866                                            all_pfams_gained_as_dom_combinations,
867                                            "_fitch_gains_dc",
868                                            tax_code_to_id_map );
869             writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
870                                            go_id_to_term_map,
871                                            go_namespace_limit,
872                                            true,
873                                            domain_parsimony.getGainLossMatrix(),
874                                            CharacterStateMatrix.GainLossStates.LOSS,
875                                            outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_LOSSES_HTML_BC,
876                                            sep,
877                                            ForesterUtil.LINE_SEPARATOR,
878                                            "Fitch Parsimony | Losses | Domain Combinations",
879                                            "-",
880                                            null,
881                                            all_pfams_encountered,
882                                            all_pfams_lost_as_dom_combinations,
883                                            "_fitch_losses_dc",
884                                            tax_code_to_id_map );
885             //            writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
886             //                                           go_id_to_term_map,
887             //                                           go_namespace_limit,
888             //                                           true,
889             //                                           domain_parsimony.getGainLossMatrix(),
890             //                                           null,
891             //                                           outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_HTML_BC,
892             //                                           sep,
893             //                                           ForesterUtil.LINE_SEPARATOR,
894             //                                           "Fitch Parsimony | Present | Domain Combinations",
895             //                                           "",
896             //                                           null,
897             //                                           all_pfams_encountered,
898             //                                           null,
899             //                                           "_fitch_present_dc",
900             //                                           tax_code_to_id_map );
901             writeAllEncounteredPfamsToFile( domain_id_to_go_ids_map,
902                                             go_id_to_term_map,
903                                             outfile_name,
904                                             all_pfams_encountered );
905             writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_GAINED_AS_DOMAINS_SUFFIX, all_pfams_gained_as_domains );
906             writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_LOST_AS_DOMAINS_SUFFIX, all_pfams_lost_as_domains );
907             writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_GAINED_AS_DC_SUFFIX,
908                               all_pfams_gained_as_dom_combinations );
909             writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_LOST_AS_DC_SUFFIX, all_pfams_lost_as_dom_combinations );
910             preparePhylogeny( local_phylogeny_l,
911                               domain_parsimony,
912                               date_time,
913                               "Fitch parsimony on binary domain combination presence/absence randomization: "
914                                       + randomization,
915                               "fitch_on_binary_domain_combinations_" + outfile_name,
916                               parameters_str );
917             SurfacingUtil.writePhylogenyToFile( local_phylogeny_l, outfile_name
918                     + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH );
919             calculateIndependentDomainCombinationGains( local_phylogeny_l,
920                                                         outfile_name
921                                                                 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX,
922                                                         outfile_name
923                                                                 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX,
924                                                         outfile_name
925                                                                 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX,
926                                                         outfile_name
927                                                                 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX,
928                                                         outfile_name + "_indep_dc_gains_fitch_lca_ranks.txt",
929                                                         outfile_name + "_indep_dc_gains_fitch_lca_taxonomies.txt",
930                                                         outfile_name + "_indep_dc_gains_fitch_protein_statistics.txt",
931                                                         protein_length_stats_by_dc,
932                                                         domain_number_stats_by_dc,
933                                                         domain_length_stats_by_domain );
934         }
935     }
936
937     public static void executeParsimonyAnalysisForSecondaryFeatures( final String outfile_name,
938                                                                      final DomainParsimonyCalculator secondary_features_parsimony,
939                                                                      final Phylogeny phylogeny,
940                                                                      final String parameters_str,
941                                                                      final Map<Species, MappingResults> mapping_results_map,
942                                                                      final boolean use_last_in_fitch_parsimony ) {
943         final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR;
944         final String date_time = ForesterUtil.getCurrentDateTime();
945         System.out.println();
946         writeToNexus( outfile_name + surfacing.NEXUS_SECONDARY_FEATURES,
947                       secondary_features_parsimony.createMatrixOfSecondaryFeaturePresenceOrAbsence( null ),
948                       phylogeny );
949         Phylogeny local_phylogeny_copy = phylogeny.copy();
950         secondary_features_parsimony.executeDolloParsimonyOnSecondaryFeatures( mapping_results_map );
951         SurfacingUtil.writeMatrixToFile( secondary_features_parsimony.getGainLossMatrix(), outfile_name
952                 + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER );
953         SurfacingUtil.writeMatrixToFile( secondary_features_parsimony.getGainLossCountsMatrix(), outfile_name
954                 + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER );
955         SurfacingUtil
956                 .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(),
957                                                       CharacterStateMatrix.GainLossStates.GAIN,
958                                                       outfile_name
959                                                               + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_SECONDARY_FEATURES,
960                                                       sep,
961                                                       ForesterUtil.LINE_SEPARATOR,
962                                                       null );
963         SurfacingUtil
964                 .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(),
965                                                       CharacterStateMatrix.GainLossStates.LOSS,
966                                                       outfile_name
967                                                               + surfacing.PARSIMONY_OUTPUT_DOLLO_LOSSES_SECONDARY_FEATURES,
968                                                       sep,
969                                                       ForesterUtil.LINE_SEPARATOR,
970                                                       null );
971         SurfacingUtil
972                 .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(),
973                                                       null,
974                                                       outfile_name
975                                                               + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_SECONDARY_FEATURES,
976                                                       sep,
977                                                       ForesterUtil.LINE_SEPARATOR,
978                                                       null );
979         preparePhylogeny( local_phylogeny_copy,
980                           secondary_features_parsimony,
981                           date_time,
982                           "Dollo parsimony on secondary feature presence/absence",
983                           "dollo_on_secondary_features_" + outfile_name,
984                           parameters_str );
985         SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name
986                 + surfacing.SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO );
987         // FITCH DOMAIN COMBINATIONS
988         // -------------------------
989         local_phylogeny_copy = phylogeny.copy();
990         final String randomization = "no";
991         secondary_features_parsimony
992                 .executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( use_last_in_fitch_parsimony );
993         preparePhylogeny( local_phylogeny_copy,
994                           secondary_features_parsimony,
995                           date_time,
996                           "Fitch parsimony on secondary binary domain combination presence/absence randomization: "
997                                   + randomization,
998                           "fitch_on_binary_domain_combinations_" + outfile_name,
999                           parameters_str );
1000         SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name
1001                 + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH_MAPPED );
1002         calculateIndependentDomainCombinationGains( local_phylogeny_copy, outfile_name
1003                 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_MAPPED_OUTPUT_SUFFIX, outfile_name
1004                 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX, outfile_name
1005                 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX, outfile_name
1006                 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX, outfile_name
1007                 + "_MAPPED_indep_dc_gains_fitch_lca_ranks.txt", outfile_name
1008                 + "_MAPPED_indep_dc_gains_fitch_lca_taxonomies.txt", null, null, null, null );
1009     }
1010
1011     public static void executePlusMinusAnalysis( final File output_file,
1012                                                  final List<String> plus_minus_analysis_high_copy_base,
1013                                                  final List<String> plus_minus_analysis_high_copy_target,
1014                                                  final List<String> plus_minus_analysis_low_copy,
1015                                                  final List<GenomeWideCombinableDomains> gwcd_list,
1016                                                  final SortedMap<Species, List<Protein>> protein_lists_per_species,
1017                                                  final Map<String, List<GoId>> domain_id_to_go_ids_map,
1018                                                  final Map<GoId, GoTerm> go_id_to_term_map,
1019                                                  final List<Object> plus_minus_analysis_numbers ) {
1020         final Set<String> all_spec = new HashSet<String>();
1021         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
1022             all_spec.add( gwcd.getSpecies().getSpeciesId() );
1023         }
1024         final File html_out_dom = new File( output_file + surfacing.PLUS_MINUS_DOM_SUFFIX_HTML );
1025         final File plain_out_dom = new File( output_file + surfacing.PLUS_MINUS_DOM_SUFFIX );
1026         final File html_out_dc = new File( output_file + surfacing.PLUS_MINUS_DC_SUFFIX_HTML );
1027         final File all_domains_go_ids_out_dom = new File( output_file + surfacing.PLUS_MINUS_ALL_GO_IDS_DOM_SUFFIX );
1028         final File passing_domains_go_ids_out_dom = new File( output_file
1029                 + surfacing.PLUS_MINUS_PASSING_GO_IDS_DOM_SUFFIX );
1030         final File proteins_file_base = new File( output_file + "" );
1031         final int min_diff = ( ( Integer ) plus_minus_analysis_numbers.get( 0 ) ).intValue();
1032         final double factor = ( ( Double ) plus_minus_analysis_numbers.get( 1 ) ).doubleValue();
1033         try {
1034             DomainCountsDifferenceUtil.calculateCopyNumberDifferences( gwcd_list,
1035                                                                        protein_lists_per_species,
1036                                                                        plus_minus_analysis_high_copy_base,
1037                                                                        plus_minus_analysis_high_copy_target,
1038                                                                        plus_minus_analysis_low_copy,
1039                                                                        min_diff,
1040                                                                        factor,
1041                                                                        plain_out_dom,
1042                                                                        html_out_dom,
1043                                                                        html_out_dc,
1044                                                                        domain_id_to_go_ids_map,
1045                                                                        go_id_to_term_map,
1046                                                                        all_domains_go_ids_out_dom,
1047                                                                        passing_domains_go_ids_out_dom,
1048                                                                        proteins_file_base );
1049         }
1050         catch ( final IOException e ) {
1051             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
1052         }
1053         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis results to \""
1054                 + html_out_dom + "\"" );
1055         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis results to \""
1056                 + plain_out_dom + "\"" );
1057         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis results to \"" + html_out_dc
1058                 + "\"" );
1059         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis based passing GO ids to \""
1060                 + passing_domains_go_ids_out_dom + "\"" );
1061         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis based all GO ids to \""
1062                 + all_domains_go_ids_out_dom + "\"" );
1063     }
1064
1065     public static void extractProteinNames( final List<Protein> proteins,
1066                                             final List<String> query_domain_ids_nc_order,
1067                                             final Writer out,
1068                                             final String separator,
1069                                             final String limit_to_species ) throws IOException {
1070         for( final Protein protein : proteins ) {
1071             if ( ForesterUtil.isEmpty( limit_to_species )
1072                     || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) {
1073                 if ( protein.contains( query_domain_ids_nc_order, true ) ) {
1074                     out.write( protein.getSpecies().getSpeciesId() );
1075                     out.write( separator );
1076                     out.write( protein.getProteinId().getId() );
1077                     out.write( separator );
1078                     out.write( "[" );
1079                     final Set<String> visited_domain_ids = new HashSet<String>();
1080                     boolean first = true;
1081                     for( final Domain domain : protein.getProteinDomains() ) {
1082                         if ( !visited_domain_ids.contains( domain.getDomainId() ) ) {
1083                             visited_domain_ids.add( domain.getDomainId() );
1084                             if ( first ) {
1085                                 first = false;
1086                             }
1087                             else {
1088                                 out.write( " " );
1089                             }
1090                             out.write( domain.getDomainId() );
1091                             out.write( " {" );
1092                             out.write( "" + domain.getTotalCount() );
1093                             out.write( "}" );
1094                         }
1095                     }
1096                     out.write( "]" );
1097                     out.write( separator );
1098                     if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription()
1099                             .equals( SurfacingConstants.NONE ) ) ) {
1100                         out.write( protein.getDescription() );
1101                     }
1102                     out.write( separator );
1103                     if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession()
1104                             .equals( SurfacingConstants.NONE ) ) ) {
1105                         out.write( protein.getAccession() );
1106                     }
1107                     out.write( SurfacingConstants.NL );
1108                 }
1109             }
1110         }
1111         out.flush();
1112     }
1113
1114     public static void extractProteinNames( final SortedMap<Species, List<Protein>> protein_lists_per_species,
1115                                             final String domain_id,
1116                                             final Writer out,
1117                                             final String separator,
1118                                             final String limit_to_species,
1119                                             final double domain_e_cutoff ) throws IOException {
1120         //System.out.println( "Per domain E-value: " + domain_e_cutoff );
1121         for( final Species species : protein_lists_per_species.keySet() ) {
1122             //System.out.println( species + ":" );
1123             for( final Protein protein : protein_lists_per_species.get( species ) ) {
1124                 if ( ForesterUtil.isEmpty( limit_to_species )
1125                         || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) {
1126                     final List<Domain> domains = protein.getProteinDomains( domain_id );
1127                     if ( domains.size() > 0 ) {
1128                         out.write( protein.getSpecies().getSpeciesId() );
1129                         out.write( separator );
1130                         out.write( protein.getProteinId().getId() );
1131                         out.write( separator );
1132                         out.write( domain_id.toString() );
1133                         out.write( separator );
1134                         int prev_to = -1;
1135                         for( final Domain domain : domains ) {
1136                             if ( ( domain_e_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= domain_e_cutoff ) ) {
1137                                 out.write( "/" );
1138                                 out.write( domain.getFrom() + "-" + domain.getTo() );
1139                                 if ( prev_to >= 0 ) {
1140                                     final int l = domain.getFrom() - prev_to;
1141                                     // System.out.println( l );
1142                                 }
1143                                 prev_to = domain.getTo();
1144                             }
1145                         }
1146                         out.write( "/" );
1147                         out.write( separator );
1148                         final List<Domain> domain_list = new ArrayList<Domain>();
1149                         for( final Domain domain : protein.getProteinDomains() ) {
1150                             if ( ( domain_e_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= domain_e_cutoff ) ) {
1151                                 domain_list.add( domain );
1152                             }
1153                         }
1154                         final Domain domain_ary[] = new Domain[ domain_list.size() ];
1155                         for( int i = 0; i < domain_list.size(); ++i ) {
1156                             domain_ary[ i ] = domain_list.get( i );
1157                         }
1158                         Arrays.sort( domain_ary, new DomainComparator( true ) );
1159                         out.write( "{" );
1160                         boolean first = true;
1161                         for( final Domain domain : domain_ary ) {
1162                             if ( first ) {
1163                                 first = false;
1164                             }
1165                             else {
1166                                 out.write( "," );
1167                             }
1168                             out.write( domain.getDomainId().toString() );
1169                             out.write( ":" + domain.getFrom() + "-" + domain.getTo() );
1170                             out.write( ":" + domain.getPerDomainEvalue() );
1171                         }
1172                         out.write( "}" );
1173                         if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription()
1174                                 .equals( SurfacingConstants.NONE ) ) ) {
1175                             out.write( protein.getDescription() );
1176                         }
1177                         out.write( separator );
1178                         if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession()
1179                                 .equals( SurfacingConstants.NONE ) ) ) {
1180                             out.write( protein.getAccession() );
1181                         }
1182                         out.write( SurfacingConstants.NL );
1183                     }
1184                 }
1185             }
1186         }
1187         out.flush();
1188     }
1189
1190     public static SortedSet<String> getAllDomainIds( final List<GenomeWideCombinableDomains> gwcd_list ) {
1191         final SortedSet<String> all_domains_ids = new TreeSet<String>();
1192         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
1193             final Set<String> all_domains = gwcd.getAllDomainIds();
1194             //    for( final Domain domain : all_domains ) {
1195             all_domains_ids.addAll( all_domains );
1196             //    }
1197         }
1198         return all_domains_ids;
1199     }
1200
1201     public static SortedMap<String, Integer> getDomainCounts( final List<Protein> protein_domain_collections ) {
1202         final SortedMap<String, Integer> map = new TreeMap<String, Integer>();
1203         for( final Protein protein_domain_collection : protein_domain_collections ) {
1204             for( final Object name : protein_domain_collection.getProteinDomains() ) {
1205                 final BasicDomain protein_domain = ( BasicDomain ) name;
1206                 final String id = protein_domain.getDomainId();
1207                 if ( map.containsKey( id ) ) {
1208                     map.put( id, map.get( id ) + 1 );
1209                 }
1210                 else {
1211                     map.put( id, 1 );
1212                 }
1213             }
1214         }
1215         return map;
1216     }
1217
1218     public static int getNumberOfNodesLackingName( final Phylogeny p, final StringBuilder names ) {
1219         final PhylogenyNodeIterator it = p.iteratorPostorder();
1220         int c = 0;
1221         while ( it.hasNext() ) {
1222             final PhylogenyNode n = it.next();
1223             if ( ForesterUtil.isEmpty( n.getName() )
1224                     && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy()
1225                             .getScientificName() ) )
1226                     && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy()
1227                             .getCommonName() ) ) ) {
1228                 if ( n.getParent() != null ) {
1229                     names.append( " " );
1230                     names.append( n.getParent().getName() );
1231                 }
1232                 final List l = n.getAllExternalDescendants();
1233                 for( final Object object : l ) {
1234                     System.out.println( l.toString() );
1235                 }
1236                 ++c;
1237             }
1238         }
1239         return c;
1240     }
1241
1242     public static void log( final String msg, final Writer w ) {
1243         try {
1244             w.write( msg );
1245             w.write( ForesterUtil.LINE_SEPARATOR );
1246         }
1247         catch ( final IOException e ) {
1248             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
1249         }
1250     }
1251
1252     public static Phylogeny[] obtainAndPreProcessIntrees( final File[] intree_files,
1253                                                           final int number_of_genomes,
1254                                                           final String[][] input_file_properties ) {
1255         final Phylogeny[] intrees = new Phylogeny[ intree_files.length ];
1256         int i = 0;
1257         for( final File intree_file : intree_files ) {
1258             Phylogeny intree = null;
1259             final String error = ForesterUtil.isReadableFile( intree_file );
1260             if ( !ForesterUtil.isEmpty( error ) ) {
1261                 ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read input tree file [" + intree_file + "]: "
1262                         + error );
1263             }
1264             try {
1265                 final Phylogeny[] p_array = ParserBasedPhylogenyFactory.getInstance()
1266                         .create( intree_file, ParserUtils.createParserDependingOnFileType( intree_file, true ) );
1267                 if ( p_array.length < 1 ) {
1268                     ForesterUtil.fatalError( surfacing.PRG_NAME, "file [" + intree_file
1269                             + "] does not contain any phylogeny in phyloXML format" );
1270                 }
1271                 else if ( p_array.length > 1 ) {
1272                     ForesterUtil.fatalError( surfacing.PRG_NAME, "file [" + intree_file
1273                             + "] contains more than one phylogeny in phyloXML format" );
1274                 }
1275                 intree = p_array[ 0 ];
1276             }
1277             catch ( final Exception e ) {
1278                 ForesterUtil.fatalError( surfacing.PRG_NAME, "failed to read input tree from file [" + intree_file
1279                         + "]: " + error );
1280             }
1281             if ( ( intree == null ) || intree.isEmpty() ) {
1282                 ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] is empty" );
1283             }
1284             if ( !intree.isRooted() ) {
1285                 ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] is not rooted" );
1286             }
1287             if ( intree.getNumberOfExternalNodes() < number_of_genomes ) {
1288                 ForesterUtil.fatalError( surfacing.PRG_NAME,
1289                                          "number of external nodes [" + intree.getNumberOfExternalNodes()
1290                                                  + "] of input tree [" + intree_file
1291                                                  + "] is smaller than the number of genomes the be analyzed ["
1292                                                  + number_of_genomes + "]" );
1293             }
1294             final StringBuilder parent_names = new StringBuilder();
1295             final int nodes_lacking_name = getNumberOfNodesLackingName( intree, parent_names );
1296             if ( nodes_lacking_name > 0 ) {
1297                 ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] has "
1298                         + nodes_lacking_name + " node(s) lacking a name [parent names:" + parent_names + "]" );
1299             }
1300             preparePhylogenyForParsimonyAnalyses( intree, input_file_properties );
1301             if ( !intree.isCompletelyBinary() ) {
1302                 ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "input tree [" + intree_file
1303                         + "] is not completely binary" );
1304             }
1305             intrees[ i++ ] = intree;
1306         }
1307         return intrees;
1308     }
1309
1310     public static Phylogeny obtainFirstIntree( final File intree_file ) {
1311         Phylogeny intree = null;
1312         final String error = ForesterUtil.isReadableFile( intree_file );
1313         if ( !ForesterUtil.isEmpty( error ) ) {
1314             ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read input tree file [" + intree_file + "]: " + error );
1315         }
1316         try {
1317             final Phylogeny[] phys = ParserBasedPhylogenyFactory.getInstance()
1318                     .create( intree_file, ParserUtils.createParserDependingOnFileType( intree_file, true ) );
1319             if ( phys.length < 1 ) {
1320                 ForesterUtil.fatalError( surfacing.PRG_NAME, "file [" + intree_file
1321                         + "] does not contain any phylogeny in phyloXML format" );
1322             }
1323             else if ( phys.length > 1 ) {
1324                 ForesterUtil.fatalError( surfacing.PRG_NAME, "file [" + intree_file
1325                         + "] contains more than one phylogeny in phyloXML format" );
1326             }
1327             intree = phys[ 0 ];
1328         }
1329         catch ( final Exception e ) {
1330             ForesterUtil.fatalError( surfacing.PRG_NAME, "failed to read input tree from file [" + intree_file + "]: "
1331                     + error );
1332         }
1333         if ( ( intree == null ) || intree.isEmpty() ) {
1334             ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] is empty" );
1335         }
1336         if ( !intree.isRooted() ) {
1337             ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] is not rooted" );
1338         }
1339         return intree;
1340     }
1341
1342     public static String obtainHexColorStringDependingOnTaxonomyGroup( final String tax_code, final Phylogeny phy )
1343             throws IllegalArgumentException {
1344         if ( !_TAXCODE_HEXCOLORSTRING_MAP.containsKey( tax_code ) ) {
1345             if ( ( phy != null ) && !phy.isEmpty() ) {
1346                 //                final List<PhylogenyNode> nodes = phy.getNodesViaTaxonomyCode( tax_code );
1347                 //                Color c = null;
1348                 //                if ( ( nodes == null ) || nodes.isEmpty() ) {
1349                 //                    throw new IllegalArgumentException( "code " + tax_code + " is not found" );
1350                 //                }
1351                 //                if ( nodes.size() != 1 ) {
1352                 //                    throw new IllegalArgumentException( "code " + tax_code + " is not unique" );
1353                 //                }
1354                 //                PhylogenyNode n = nodes.get( 0 );
1355                 //                while ( n != null ) {
1356                 //                    if ( n.getNodeData().isHasTaxonomy()
1357                 //                            && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getScientificName() ) ) {
1358                 //                        c = ForesterUtil.obtainColorDependingOnTaxonomyGroup( n.getNodeData().getTaxonomy()
1359                 //                                .getScientificName(), tax_code );
1360                 //                    }
1361                 //                    if ( ( c == null ) && !ForesterUtil.isEmpty( n.getName() ) ) {
1362                 //                        c = ForesterUtil.obtainColorDependingOnTaxonomyGroup( n.getName(), tax_code );
1363                 //                    }
1364                 //                    if ( c != null ) {
1365                 //                        break;
1366                 //                    }
1367                 //                    n = n.getParent();
1368                 //                }
1369                 final String group = obtainTaxonomyGroup( tax_code, phy );
1370                 final Color c = ForesterUtil.obtainColorDependingOnTaxonomyGroup( group );
1371                 if ( c == null ) {
1372                     throw new IllegalArgumentException( "no color found for taxonomy group \"" + group
1373                             + "\" for code \"" + tax_code + "\"" );
1374                 }
1375                 final String hex = String.format( "#%02x%02x%02x", c.getRed(), c.getGreen(), c.getBlue() );
1376                 _TAXCODE_HEXCOLORSTRING_MAP.put( tax_code, hex );
1377             }
1378             else {
1379                 throw new IllegalArgumentException( "unable to obtain color for code " + tax_code
1380                         + " (tree is null or empty and code is not in map)" );
1381             }
1382         }
1383         return _TAXCODE_HEXCOLORSTRING_MAP.get( tax_code );
1384     }
1385
1386     public static String obtainTaxonomyGroup( final String tax_code, final Phylogeny species_tree )
1387             throws IllegalArgumentException {
1388         if ( !_TAXCODE_TAXGROUP_MAP.containsKey( tax_code ) ) {
1389             if ( ( species_tree != null ) && !species_tree.isEmpty() ) {
1390                 final List<PhylogenyNode> nodes = species_tree.getNodesViaTaxonomyCode( tax_code );
1391                 if ( ( nodes == null ) || nodes.isEmpty() ) {
1392                     throw new IllegalArgumentException( "code " + tax_code + " is not found" );
1393                 }
1394                 if ( nodes.size() != 1 ) {
1395                     throw new IllegalArgumentException( "code " + tax_code + " is not unique" );
1396                 }
1397                 PhylogenyNode n = nodes.get( 0 );
1398                 String group = null;
1399                 while ( n != null ) {
1400                     if ( n.getNodeData().isHasTaxonomy()
1401                             && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getScientificName() ) ) {
1402                         group = ForesterUtil.obtainNormalizedTaxonomyGroup( n.getNodeData().getTaxonomy()
1403                                 .getScientificName() );
1404                     }
1405                     if ( ForesterUtil.isEmpty( group ) && !ForesterUtil.isEmpty( n.getName() ) ) {
1406                         group = ForesterUtil.obtainNormalizedTaxonomyGroup( n.getName() );
1407                     }
1408                     if ( !ForesterUtil.isEmpty( group ) ) {
1409                         break;
1410                     }
1411                     n = n.getParent();
1412                 }
1413                 if ( ForesterUtil.isEmpty( group ) ) {
1414                     throw new IllegalArgumentException( "no group found for taxonomy code \"" + tax_code + "\"" );
1415                 }
1416                 _TAXCODE_TAXGROUP_MAP.put( tax_code, group );
1417             }
1418             else {
1419                 throw new IllegalArgumentException( "unable to obtain group for code " + tax_code
1420                         + " (tree is null or empty and code is not in map)" );
1421             }
1422         }
1423         return _TAXCODE_TAXGROUP_MAP.get( tax_code );
1424     }
1425
1426     public static void performDomainArchitectureAnalysis( final SortedMap<String, Set<String>> domain_architecutures,
1427                                                           final SortedMap<String, Integer> domain_architecuture_counts,
1428                                                           final int min_count,
1429                                                           final File da_counts_outfile,
1430                                                           final File unique_da_outfile ) {
1431         checkForOutputFileWriteability( da_counts_outfile );
1432         checkForOutputFileWriteability( unique_da_outfile );
1433         try {
1434             final BufferedWriter da_counts_out = new BufferedWriter( new FileWriter( da_counts_outfile ) );
1435             final BufferedWriter unique_da_out = new BufferedWriter( new FileWriter( unique_da_outfile ) );
1436             final Iterator<Entry<String, Integer>> it = domain_architecuture_counts.entrySet().iterator();
1437             while ( it.hasNext() ) {
1438                 final Map.Entry<String, Integer> e = it.next();
1439                 final String da = e.getKey();
1440                 final int count = e.getValue();
1441                 if ( count >= min_count ) {
1442                     da_counts_out.write( da );
1443                     da_counts_out.write( "\t" );
1444                     da_counts_out.write( String.valueOf( count ) );
1445                     da_counts_out.write( ForesterUtil.LINE_SEPARATOR );
1446                 }
1447                 if ( count == 1 ) {
1448                     final Iterator<Entry<String, Set<String>>> it2 = domain_architecutures.entrySet().iterator();
1449                     while ( it2.hasNext() ) {
1450                         final Map.Entry<String, Set<String>> e2 = it2.next();
1451                         final String genome = e2.getKey();
1452                         final Set<String> das = e2.getValue();
1453                         if ( das.contains( da ) ) {
1454                             unique_da_out.write( genome );
1455                             unique_da_out.write( "\t" );
1456                             unique_da_out.write( da );
1457                             unique_da_out.write( ForesterUtil.LINE_SEPARATOR );
1458                         }
1459                     }
1460                 }
1461             }
1462             unique_da_out.close();
1463             da_counts_out.close();
1464         }
1465         catch ( final IOException e ) {
1466             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1467         }
1468         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + da_counts_outfile + "\"" );
1469         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + unique_da_outfile + "\"" );
1470         //
1471     }
1472
1473     public static void preparePhylogeny( final Phylogeny p,
1474                                          final DomainParsimonyCalculator domain_parsimony,
1475                                          final String date_time,
1476                                          final String method,
1477                                          final String name,
1478                                          final String parameters_str ) {
1479         domain_parsimony.decoratePhylogenyWithDomains( p );
1480         final StringBuilder desc = new StringBuilder();
1481         desc.append( "[Method: " + method + "] [Date: " + date_time + "] " );
1482         desc.append( "[Cost: " + domain_parsimony.getCost() + "] " );
1483         desc.append( "[Gains: " + domain_parsimony.getTotalGains() + "] " );
1484         desc.append( "[Losses: " + domain_parsimony.getTotalLosses() + "] " );
1485         desc.append( "[Unchanged: " + domain_parsimony.getTotalUnchanged() + "] " );
1486         desc.append( "[Parameters: " + parameters_str + "]" );
1487         p.setName( name );
1488         p.setDescription( desc.toString() );
1489         p.setConfidence( new Confidence( domain_parsimony.getCost(), "parsimony" ) );
1490         p.setRerootable( false );
1491         p.setRooted( true );
1492     }
1493
1494     public static void preparePhylogenyForParsimonyAnalyses( final Phylogeny intree,
1495                                                              final String[][] input_file_properties ) {
1496         final String[] genomes = new String[ input_file_properties.length ];
1497         for( int i = 0; i < input_file_properties.length; ++i ) {
1498             if ( intree.getNodes( input_file_properties[ i ][ 1 ] ).size() > 1 ) {
1499                 ForesterUtil.fatalError( surfacing.PRG_NAME, "node named [" + input_file_properties[ i ][ 1 ]
1500                         + "] is not unique in input tree " + intree.getName() );
1501             }
1502             genomes[ i ] = input_file_properties[ i ][ 1 ];
1503         }
1504         //
1505         final PhylogenyNodeIterator it = intree.iteratorPostorder();
1506         while ( it.hasNext() ) {
1507             final PhylogenyNode n = it.next();
1508             if ( ForesterUtil.isEmpty( n.getName() ) ) {
1509                 if ( n.getNodeData().isHasTaxonomy()
1510                         && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getTaxonomyCode() ) ) {
1511                     n.setName( n.getNodeData().getTaxonomy().getTaxonomyCode() );
1512                 }
1513                 else if ( n.getNodeData().isHasTaxonomy()
1514                         && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getScientificName() ) ) {
1515                     n.setName( n.getNodeData().getTaxonomy().getScientificName() );
1516                 }
1517                 else if ( n.getNodeData().isHasTaxonomy()
1518                         && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getCommonName() ) ) {
1519                     n.setName( n.getNodeData().getTaxonomy().getCommonName() );
1520                 }
1521                 else {
1522                     ForesterUtil
1523                             .fatalError( surfacing.PRG_NAME,
1524                                          "node with no name, scientific name, common name, or taxonomy code present" );
1525                 }
1526             }
1527         }
1528         //
1529         final List<String> igns = PhylogenyMethods.deleteExternalNodesPositiveSelection( genomes, intree );
1530         if ( igns.size() > 0 ) {
1531             System.out.println( "Not using the following " + igns.size() + " nodes:" );
1532             for( int i = 0; i < igns.size(); ++i ) {
1533                 System.out.println( " " + i + ": " + igns.get( i ) );
1534             }
1535             System.out.println( "--" );
1536         }
1537         for( final String[] input_file_propertie : input_file_properties ) {
1538             try {
1539                 intree.getNode( input_file_propertie[ 1 ] );
1540             }
1541             catch ( final IllegalArgumentException e ) {
1542                 ForesterUtil.fatalError( surfacing.PRG_NAME, "node named [" + input_file_propertie[ 1 ]
1543                         + "] not present/not unique in input tree" );
1544             }
1545         }
1546     }
1547
1548     public static void printOutPercentageOfMultidomainProteins( final SortedMap<Integer, Integer> all_genomes_domains_per_potein_histo,
1549                                                                 final Writer log_writer ) {
1550         int sum = 0;
1551         for( final Entry<Integer, Integer> entry : all_genomes_domains_per_potein_histo.entrySet() ) {
1552             sum += entry.getValue();
1553         }
1554         final double percentage = ( 100.0 * ( sum - all_genomes_domains_per_potein_histo.get( 1 ) ) ) / sum;
1555         ForesterUtil.programMessage( surfacing.PRG_NAME, "Percentage of multidomain proteins: " + percentage + "%" );
1556         log( "Percentage of multidomain proteins:            : " + percentage + "%", log_writer );
1557     }
1558
1559     public static void processFilter( final File filter_file, final SortedSet<String> filter ) {
1560         SortedSet<String> filter_str = null;
1561         try {
1562             filter_str = ForesterUtil.file2set( filter_file );
1563         }
1564         catch ( final IOException e ) {
1565             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1566         }
1567         if ( filter_str != null ) {
1568             for( final String string : filter_str ) {
1569                 filter.add( string );
1570             }
1571         }
1572         if ( surfacing.VERBOSE ) {
1573             System.out.println( "Filter:" );
1574             for( final String domainId : filter ) {
1575                 System.out.println( domainId );
1576             }
1577         }
1578     }
1579
1580     public static String[][] processInputGenomesFile( final File input_genomes ) {
1581         String[][] input_file_properties = null;
1582         try {
1583             input_file_properties = ForesterUtil.file22dArray( input_genomes );
1584         }
1585         catch ( final IOException e ) {
1586             ForesterUtil.fatalError( surfacing.PRG_NAME,
1587                                      "genomes files is to be in the following format \"<hmmpfam output file> <species>\": "
1588                                              + e.getLocalizedMessage() );
1589         }
1590         final Set<String> specs = new HashSet<String>();
1591         final Set<String> paths = new HashSet<String>();
1592         for( int i = 0; i < input_file_properties.length; ++i ) {
1593             if ( !PhyloXmlUtil.TAXOMONY_CODE_PATTERN.matcher( input_file_properties[ i ][ 1 ] ).matches() ) {
1594                 ForesterUtil.fatalError( surfacing.PRG_NAME, "illegal format for species code: "
1595                         + input_file_properties[ i ][ 1 ] );
1596             }
1597             if ( specs.contains( input_file_properties[ i ][ 1 ] ) ) {
1598                 ForesterUtil.fatalError( surfacing.PRG_NAME, "species code " + input_file_properties[ i ][ 1 ]
1599                         + " is not unique" );
1600             }
1601             specs.add( input_file_properties[ i ][ 1 ] );
1602             if ( paths.contains( input_file_properties[ i ][ 0 ] ) ) {
1603                 ForesterUtil.fatalError( surfacing.PRG_NAME, "path " + input_file_properties[ i ][ 0 ]
1604                         + " is not unique" );
1605             }
1606             paths.add( input_file_properties[ i ][ 0 ] );
1607             final String error = ForesterUtil.isReadableFile( new File( input_file_properties[ i ][ 0 ] ) );
1608             if ( !ForesterUtil.isEmpty( error ) ) {
1609                 ForesterUtil.fatalError( surfacing.PRG_NAME, error );
1610             }
1611         }
1612         return input_file_properties;
1613     }
1614
1615     public static void processPlusMinusAnalysisOption( final CommandLineArguments cla,
1616                                                        final List<String> high_copy_base,
1617                                                        final List<String> high_copy_target,
1618                                                        final List<String> low_copy,
1619                                                        final List<Object> numbers ) {
1620         if ( cla.isOptionSet( surfacing.PLUS_MINUS_ANALYSIS_OPTION ) ) {
1621             if ( !cla.isOptionValueSet( surfacing.PLUS_MINUS_ANALYSIS_OPTION ) ) {
1622                 ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for 'plus-minus' file: -"
1623                         + surfacing.PLUS_MINUS_ANALYSIS_OPTION + "=<file>" );
1624             }
1625             final File plus_minus_file = new File( cla.getOptionValue( surfacing.PLUS_MINUS_ANALYSIS_OPTION ) );
1626             final String msg = ForesterUtil.isReadableFile( plus_minus_file );
1627             if ( !ForesterUtil.isEmpty( msg ) ) {
1628                 ForesterUtil.fatalError( surfacing.PRG_NAME, "can not read from \"" + plus_minus_file + "\": " + msg );
1629             }
1630             processPlusMinusFile( plus_minus_file, high_copy_base, high_copy_target, low_copy, numbers );
1631         }
1632     }
1633
1634     // First numbers is minimal difference, second is factor.
1635     public static void processPlusMinusFile( final File plus_minus_file,
1636                                              final List<String> high_copy_base,
1637                                              final List<String> high_copy_target,
1638                                              final List<String> low_copy,
1639                                              final List<Object> numbers ) {
1640         Set<String> species_set = null;
1641         int min_diff = surfacing.PLUS_MINUS_ANALYSIS_MIN_DIFF_DEFAULT;
1642         double factor = surfacing.PLUS_MINUS_ANALYSIS_FACTOR_DEFAULT;
1643         try {
1644             species_set = ForesterUtil.file2set( plus_minus_file );
1645         }
1646         catch ( final IOException e ) {
1647             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1648         }
1649         if ( species_set != null ) {
1650             for( final String species : species_set ) {
1651                 final String species_trimmed = species.substring( 1 );
1652                 if ( species.startsWith( "+" ) ) {
1653                     if ( low_copy.contains( species_trimmed ) ) {
1654                         ForesterUtil.fatalError( surfacing.PRG_NAME,
1655                                                  "species/genome names can not appear with both '+' and '-' suffix, as appears the case for: \""
1656                                                          + species_trimmed + "\"" );
1657                     }
1658                     high_copy_base.add( species_trimmed );
1659                 }
1660                 else if ( species.startsWith( "*" ) ) {
1661                     if ( low_copy.contains( species_trimmed ) ) {
1662                         ForesterUtil.fatalError( surfacing.PRG_NAME,
1663                                                  "species/genome names can not appear with both '*' and '-' suffix, as appears the case for: \""
1664                                                          + species_trimmed + "\"" );
1665                     }
1666                     high_copy_target.add( species_trimmed );
1667                 }
1668                 else if ( species.startsWith( "-" ) ) {
1669                     if ( high_copy_base.contains( species_trimmed ) || high_copy_target.contains( species_trimmed ) ) {
1670                         ForesterUtil.fatalError( surfacing.PRG_NAME,
1671                                                  "species/genome names can not appear with both '+' or '*' and '-' suffix, as appears the case for: \""
1672                                                          + species_trimmed + "\"" );
1673                     }
1674                     low_copy.add( species_trimmed );
1675                 }
1676                 else if ( species.startsWith( "$D" ) ) {
1677                     try {
1678                         min_diff = Integer.parseInt( species.substring( 3 ) );
1679                     }
1680                     catch ( final NumberFormatException e ) {
1681                         ForesterUtil.fatalError( surfacing.PRG_NAME,
1682                                                  "could not parse integer value for minimal difference from: \""
1683                                                          + species.substring( 3 ) + "\"" );
1684                     }
1685                 }
1686                 else if ( species.startsWith( "$F" ) ) {
1687                     try {
1688                         factor = Double.parseDouble( species.substring( 3 ) );
1689                     }
1690                     catch ( final NumberFormatException e ) {
1691                         ForesterUtil.fatalError( surfacing.PRG_NAME, "could not parse double value for factor from: \""
1692                                 + species.substring( 3 ) + "\"" );
1693                     }
1694                 }
1695                 else if ( species.startsWith( "#" ) ) {
1696                     // Comment, ignore.
1697                 }
1698                 else {
1699                     ForesterUtil
1700                             .fatalError( surfacing.PRG_NAME,
1701                                          "species/genome names in 'plus minus' file must begin with '*' (high copy target genome), '+' (high copy base genomes), '-' (low copy genomes), '$D=<integer>' minimal Difference (default is 1), '$F=<double>' factor (default is 1.0), double), or '#' (ignore) suffix, encountered: \""
1702                                                  + species + "\"" );
1703                 }
1704                 numbers.add( new Integer( min_diff + "" ) );
1705                 numbers.add( new Double( factor + "" ) );
1706             }
1707         }
1708         else {
1709             ForesterUtil.fatalError( surfacing.PRG_NAME, "'plus minus' file [" + plus_minus_file + "] appears empty" );
1710         }
1711     }
1712
1713     /*
1714      * species | protein id | n-terminal domain | c-terminal domain | n-terminal domain per domain E-value | c-terminal domain per domain E-value
1715      * 
1716      * 
1717      */
1718     static public StringBuffer proteinToDomainCombinations( final Protein protein,
1719                                                             final String protein_id,
1720                                                             final String separator ) {
1721         final StringBuffer sb = new StringBuffer();
1722         if ( protein.getSpecies() == null ) {
1723             throw new IllegalArgumentException( "species must not be null" );
1724         }
1725         if ( ForesterUtil.isEmpty( protein.getSpecies().getSpeciesId() ) ) {
1726             throw new IllegalArgumentException( "species id must not be empty" );
1727         }
1728         final List<Domain> domains = protein.getProteinDomains();
1729         if ( domains.size() > 1 ) {
1730             final Map<String, Integer> counts = new HashMap<String, Integer>();
1731             for( final Domain domain : domains ) {
1732                 final String id = domain.getDomainId();
1733                 if ( counts.containsKey( id ) ) {
1734                     counts.put( id, counts.get( id ) + 1 );
1735                 }
1736                 else {
1737                     counts.put( id, 1 );
1738                 }
1739             }
1740             final Set<String> dcs = new HashSet<String>();
1741             for( int i = 1; i < domains.size(); ++i ) {
1742                 for( int j = 0; j < i; ++j ) {
1743                     Domain domain_n = domains.get( i );
1744                     Domain domain_c = domains.get( j );
1745                     if ( domain_n.getFrom() > domain_c.getFrom() ) {
1746                         domain_n = domains.get( j );
1747                         domain_c = domains.get( i );
1748                     }
1749                     final String dc = domain_n.getDomainId() + domain_c.getDomainId();
1750                     if ( !dcs.contains( dc ) ) {
1751                         dcs.add( dc );
1752                         sb.append( protein.getSpecies() );
1753                         sb.append( separator );
1754                         sb.append( protein_id );
1755                         sb.append( separator );
1756                         sb.append( domain_n.getDomainId() );
1757                         sb.append( separator );
1758                         sb.append( domain_c.getDomainId() );
1759                         sb.append( separator );
1760                         sb.append( domain_n.getPerDomainEvalue() );
1761                         sb.append( separator );
1762                         sb.append( domain_c.getPerDomainEvalue() );
1763                         sb.append( separator );
1764                         sb.append( counts.get( domain_n.getDomainId() ) );
1765                         sb.append( separator );
1766                         sb.append( counts.get( domain_c.getDomainId() ) );
1767                         sb.append( ForesterUtil.LINE_SEPARATOR );
1768                     }
1769                 }
1770             }
1771         }
1772         else if ( domains.size() == 1 ) {
1773             sb.append( protein.getSpecies() );
1774             sb.append( separator );
1775             sb.append( protein_id );
1776             sb.append( separator );
1777             sb.append( domains.get( 0 ).getDomainId() );
1778             sb.append( separator );
1779             sb.append( separator );
1780             sb.append( domains.get( 0 ).getPerDomainEvalue() );
1781             sb.append( separator );
1782             sb.append( separator );
1783             sb.append( 1 );
1784             sb.append( separator );
1785             sb.append( ForesterUtil.LINE_SEPARATOR );
1786         }
1787         else {
1788             sb.append( protein.getSpecies() );
1789             sb.append( separator );
1790             sb.append( protein_id );
1791             sb.append( separator );
1792             sb.append( separator );
1793             sb.append( separator );
1794             sb.append( separator );
1795             sb.append( separator );
1796             sb.append( separator );
1797             sb.append( ForesterUtil.LINE_SEPARATOR );
1798         }
1799         return sb;
1800     }
1801
1802     public static List<Domain> sortDomainsWithAscendingConfidenceValues( final Protein protein ) {
1803         final List<Domain> domains = new ArrayList<Domain>();
1804         for( final Domain d : protein.getProteinDomains() ) {
1805             domains.add( d );
1806         }
1807         Collections.sort( domains, SurfacingUtil.ASCENDING_CONFIDENCE_VALUE_ORDER );
1808         return domains;
1809     }
1810
1811     public static int storeDomainArchitectures( final String genome,
1812                                                 final SortedMap<String, Set<String>> domain_architecutures,
1813                                                 final List<Protein> protein_list,
1814                                                 final Map<String, Integer> distinct_domain_architecuture_counts ) {
1815         final Set<String> da = new HashSet<String>();
1816         domain_architecutures.put( genome, da );
1817         for( final Protein protein : protein_list ) {
1818             final String da_str = ( ( BasicProtein ) protein ).toDomainArchitectureString( "~", 3, "=" );
1819             if ( !da.contains( da_str ) ) {
1820                 if ( !distinct_domain_architecuture_counts.containsKey( da_str ) ) {
1821                     distinct_domain_architecuture_counts.put( da_str, 1 );
1822                 }
1823                 else {
1824                     distinct_domain_architecuture_counts.put( da_str,
1825                                                               distinct_domain_architecuture_counts.get( da_str ) + 1 );
1826                 }
1827                 da.add( da_str );
1828             }
1829         }
1830         return da.size();
1831     }
1832
1833     public static void writeAllDomainsChangedOnAllSubtrees( final Phylogeny p,
1834                                                             final boolean get_gains,
1835                                                             final String outdir,
1836                                                             final String suffix_for_filename ) throws IOException {
1837         CharacterStateMatrix.GainLossStates state = CharacterStateMatrix.GainLossStates.GAIN;
1838         if ( !get_gains ) {
1839             state = CharacterStateMatrix.GainLossStates.LOSS;
1840         }
1841         final File base_dir = createBaseDirForPerNodeDomainFiles( surfacing.BASE_DIRECTORY_PER_SUBTREE_DOMAIN_GAIN_LOSS_FILES,
1842                                                                   false,
1843                                                                   state,
1844                                                                   outdir );
1845         for( final PhylogenyNodeIterator it = p.iteratorPostorder(); it.hasNext(); ) {
1846             final PhylogenyNode node = it.next();
1847             if ( !node.isExternal() ) {
1848                 final SortedSet<String> domains = collectAllDomainsChangedOnSubtree( node, get_gains );
1849                 if ( domains.size() > 0 ) {
1850                     final Writer writer = ForesterUtil.createBufferedWriter( base_dir + ForesterUtil.FILE_SEPARATOR
1851                             + node.getName() + suffix_for_filename );
1852                     for( final String domain : domains ) {
1853                         writer.write( domain );
1854                         writer.write( ForesterUtil.LINE_SEPARATOR );
1855                     }
1856                     writer.close();
1857                 }
1858             }
1859         }
1860     }
1861
1862     public static void writeBinaryDomainCombinationsFileForGraphAnalysis( final String[][] input_file_properties,
1863                                                                           final File output_dir,
1864                                                                           final GenomeWideCombinableDomains gwcd,
1865                                                                           final int i,
1866                                                                           final GenomeWideCombinableDomainsSortOrder dc_sort_order ) {
1867         File dc_outfile_dot = new File( input_file_properties[ i ][ 1 ]
1868                 + surfacing.DOMAIN_COMBINITONS_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS );
1869         if ( output_dir != null ) {
1870             dc_outfile_dot = new File( output_dir + ForesterUtil.FILE_SEPARATOR + dc_outfile_dot );
1871         }
1872         checkForOutputFileWriteability( dc_outfile_dot );
1873         final SortedSet<BinaryDomainCombination> binary_combinations = createSetOfAllBinaryDomainCombinationsPerGenome( gwcd );
1874         try {
1875             final BufferedWriter out_dot = new BufferedWriter( new FileWriter( dc_outfile_dot ) );
1876             for( final BinaryDomainCombination bdc : binary_combinations ) {
1877                 out_dot.write( bdc.toGraphDescribingLanguage( BinaryDomainCombination.OutputFormat.DOT, null, null )
1878                         .toString() );
1879                 out_dot.write( SurfacingConstants.NL );
1880             }
1881             out_dot.close();
1882         }
1883         catch ( final IOException e ) {
1884             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1885         }
1886         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote binary domain combination for \""
1887                 + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", "
1888                 + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile_dot + "\"" );
1889     }
1890
1891     public static void writeBinaryStatesMatrixAsListToFile( final CharacterStateMatrix<CharacterStateMatrix.GainLossStates> matrix,
1892                                                             final CharacterStateMatrix.GainLossStates state,
1893                                                             final String filename,
1894                                                             final String indentifier_characters_separator,
1895                                                             final String character_separator,
1896                                                             final Map<String, String> descriptions ) {
1897         final File outfile = new File( filename );
1898         checkForOutputFileWriteability( outfile );
1899         final SortedSet<String> sorted_ids = new TreeSet<String>();
1900         for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) {
1901             sorted_ids.add( matrix.getIdentifier( i ) );
1902         }
1903         try {
1904             final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
1905             for( final String id : sorted_ids ) {
1906                 out.write( indentifier_characters_separator );
1907                 out.write( "#" + id );
1908                 out.write( indentifier_characters_separator );
1909                 for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) {
1910                     // Not nice:
1911                     // using null to indicate either UNCHANGED_PRESENT or GAIN.
1912                     if ( ( matrix.getState( id, c ) == state )
1913                             || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) || ( matrix
1914                                     .getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) ) ) ) {
1915                         out.write( matrix.getCharacter( c ) );
1916                         if ( ( descriptions != null ) && !descriptions.isEmpty()
1917                                 && descriptions.containsKey( matrix.getCharacter( c ) ) ) {
1918                             out.write( "\t" );
1919                             out.write( descriptions.get( matrix.getCharacter( c ) ) );
1920                         }
1921                         out.write( character_separator );
1922                     }
1923                 }
1924             }
1925             out.flush();
1926             out.close();
1927         }
1928         catch ( final IOException e ) {
1929             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1930         }
1931         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters list: \"" + filename + "\"" );
1932     }
1933
1934     public static void writeBinaryStatesMatrixAsListToFileForBinaryCombinationsForGraphAnalysis( final CharacterStateMatrix<CharacterStateMatrix.GainLossStates> matrix,
1935                                                                                                  final CharacterStateMatrix.GainLossStates state,
1936                                                                                                  final String filename,
1937                                                                                                  final String indentifier_characters_separator,
1938                                                                                                  final String character_separator,
1939                                                                                                  final BinaryDomainCombination.OutputFormat bc_output_format ) {
1940         final File outfile = new File( filename );
1941         checkForOutputFileWriteability( outfile );
1942         final SortedSet<String> sorted_ids = new TreeSet<String>();
1943         for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) {
1944             sorted_ids.add( matrix.getIdentifier( i ) );
1945         }
1946         try {
1947             final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
1948             for( final String id : sorted_ids ) {
1949                 out.write( indentifier_characters_separator );
1950                 out.write( "#" + id );
1951                 out.write( indentifier_characters_separator );
1952                 for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) {
1953                     // Not nice:
1954                     // using null to indicate either UNCHANGED_PRESENT or GAIN.
1955                     if ( ( matrix.getState( id, c ) == state )
1956                             || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) || ( matrix
1957                                     .getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) ) ) ) {
1958                         BinaryDomainCombination bdc = null;
1959                         try {
1960                             bdc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( c ) );
1961                         }
1962                         catch ( final Exception e ) {
1963                             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
1964                         }
1965                         out.write( bdc.toGraphDescribingLanguage( bc_output_format, null, null ).toString() );
1966                         out.write( character_separator );
1967                     }
1968                 }
1969             }
1970             out.flush();
1971             out.close();
1972         }
1973         catch ( final IOException e ) {
1974             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1975         }
1976         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters list: \"" + filename + "\"" );
1977     }
1978
1979     public static void writeBinaryStatesMatrixToList( final Map<String, List<GoId>> domain_id_to_go_ids_map,
1980                                                       final Map<GoId, GoTerm> go_id_to_term_map,
1981                                                       final GoNameSpace go_namespace_limit,
1982                                                       final boolean domain_combinations,
1983                                                       final CharacterStateMatrix<CharacterStateMatrix.GainLossStates> matrix,
1984                                                       final CharacterStateMatrix.GainLossStates state,
1985                                                       final String filename,
1986                                                       final String indentifier_characters_separator,
1987                                                       final String character_separator,
1988                                                       final String title_for_html,
1989                                                       final String prefix_for_html,
1990                                                       final Map<String, Set<String>>[] domain_id_to_secondary_features_maps,
1991                                                       final SortedSet<String> all_pfams_encountered,
1992                                                       final SortedSet<String> pfams_gained_or_lost,
1993                                                       final String suffix_for_per_node_events_file,
1994                                                       final Map<String, Integer> tax_code_to_id_map ) {
1995         if ( ( go_namespace_limit != null ) && ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) {
1996             throw new IllegalArgumentException( "attempt to use GO namespace limit without a GO-id to term map" );
1997         }
1998         else if ( ( ( domain_id_to_go_ids_map == null ) || ( domain_id_to_go_ids_map.size() < 1 ) ) ) {
1999             throw new IllegalArgumentException( "attempt to output detailed HTML without a Pfam to GO map" );
2000         }
2001         else if ( ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) {
2002             throw new IllegalArgumentException( "attempt to output detailed HTML without a GO-id to term map" );
2003         }
2004         final File outfile = new File( filename );
2005         checkForOutputFileWriteability( outfile );
2006         final SortedSet<String> sorted_ids = new TreeSet<String>();
2007         for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) {
2008             sorted_ids.add( matrix.getIdentifier( i ) );
2009         }
2010         try {
2011             final Writer out = new BufferedWriter( new FileWriter( outfile ) );
2012             final File per_node_go_mapped_domain_gain_loss_files_base_dir = createBaseDirForPerNodeDomainFiles( surfacing.BASE_DIRECTORY_PER_NODE_DOMAIN_GAIN_LOSS_FILES,
2013                                                                                                                 domain_combinations,
2014                                                                                                                 state,
2015                                                                                                                 filename );
2016             Writer per_node_go_mapped_domain_gain_loss_outfile_writer = null;
2017             File per_node_go_mapped_domain_gain_loss_outfile = null;
2018             int per_node_counter = 0;
2019             out.write( "<html>" );
2020             out.write( SurfacingConstants.NL );
2021             writeHtmlHead( out, title_for_html );
2022             out.write( SurfacingConstants.NL );
2023             out.write( "<body>" );
2024             out.write( SurfacingConstants.NL );
2025             out.write( "<h1>" );
2026             out.write( SurfacingConstants.NL );
2027             out.write( title_for_html );
2028             out.write( SurfacingConstants.NL );
2029             out.write( "</h1>" );
2030             out.write( SurfacingConstants.NL );
2031             out.write( "<table>" );
2032             out.write( SurfacingConstants.NL );
2033             for( final String id : sorted_ids ) {
2034                 final Matcher matcher = PATTERN_SP_STYLE_TAXONOMY.matcher( id );
2035                 if ( matcher.matches() ) {
2036                     continue;
2037                 }
2038                 out.write( "<tr>" );
2039                 out.write( "<td>" );
2040                 out.write( "<a href=\"#" + id + "\">" + id + "</a>" );
2041                 out.write( "</td>" );
2042                 out.write( "</tr>" );
2043                 out.write( SurfacingConstants.NL );
2044             }
2045             out.write( "</table>" );
2046             out.write( SurfacingConstants.NL );
2047             for( final String id : sorted_ids ) {
2048                 final Matcher matcher = PATTERN_SP_STYLE_TAXONOMY.matcher( id );
2049                 if ( matcher.matches() ) {
2050                     continue;
2051                 }
2052                 out.write( SurfacingConstants.NL );
2053                 out.write( "<h2>" );
2054                 out.write( "<a name=\"" + id + "\">" + id + "</a>" );
2055                 writeTaxonomyLinks( out, id, tax_code_to_id_map );
2056                 out.write( "</h2>" );
2057                 out.write( SurfacingConstants.NL );
2058                 out.write( "<table>" );
2059                 out.write( SurfacingConstants.NL );
2060                 out.write( "<tr>" );
2061                 out.write( "<td><b>" );
2062                 out.write( "Pfam domain(s)" );
2063                 out.write( "</b></td><td><b>" );
2064                 out.write( "GO term acc" );
2065                 out.write( "</b></td><td><b>" );
2066                 out.write( "GO term" );
2067                 out.write( "</b></td><td><b>" );
2068                 out.write( "GO namespace" );
2069                 out.write( "</b></td>" );
2070                 out.write( "</tr>" );
2071                 out.write( SurfacingConstants.NL );
2072                 out.write( "</tr>" );
2073                 out.write( SurfacingConstants.NL );
2074                 per_node_counter = 0;
2075                 if ( matrix.getNumberOfCharacters() > 0 ) {
2076                     per_node_go_mapped_domain_gain_loss_outfile = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
2077                             + ForesterUtil.FILE_SEPARATOR + id + suffix_for_per_node_events_file );
2078                     SurfacingUtil.checkForOutputFileWriteability( per_node_go_mapped_domain_gain_loss_outfile );
2079                     per_node_go_mapped_domain_gain_loss_outfile_writer = ForesterUtil
2080                             .createBufferedWriter( per_node_go_mapped_domain_gain_loss_outfile );
2081                 }
2082                 else {
2083                     per_node_go_mapped_domain_gain_loss_outfile = null;
2084                     per_node_go_mapped_domain_gain_loss_outfile_writer = null;
2085                 }
2086                 for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) {
2087                     // Not nice:
2088                     // using null to indicate either UNCHANGED_PRESENT or GAIN.
2089                     if ( ( matrix.getState( id, c ) == state )
2090                             || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) || ( matrix
2091                                     .getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) ) ) ) {
2092                         final String character = matrix.getCharacter( c );
2093                         String domain_0 = "";
2094                         String domain_1 = "";
2095                         if ( character.indexOf( BinaryDomainCombination.SEPARATOR ) > 0 ) {
2096                             final String[] s = character.split( BinaryDomainCombination.SEPARATOR );
2097                             if ( s.length != 2 ) {
2098                                 throw new AssertionError( "this should not have happened: unexpected format for domain combination: ["
2099                                         + character + "]" );
2100                             }
2101                             domain_0 = s[ 0 ];
2102                             domain_1 = s[ 1 ];
2103                         }
2104                         else {
2105                             domain_0 = character;
2106                         }
2107                         writeDomainData( domain_id_to_go_ids_map,
2108                                          go_id_to_term_map,
2109                                          go_namespace_limit,
2110                                          out,
2111                                          domain_0,
2112                                          domain_1,
2113                                          prefix_for_html,
2114                                          character_separator,
2115                                          domain_id_to_secondary_features_maps,
2116                                          null );
2117                         all_pfams_encountered.add( domain_0 );
2118                         if ( pfams_gained_or_lost != null ) {
2119                             pfams_gained_or_lost.add( domain_0 );
2120                         }
2121                         if ( !ForesterUtil.isEmpty( domain_1 ) ) {
2122                             all_pfams_encountered.add( domain_1 );
2123                             if ( pfams_gained_or_lost != null ) {
2124                                 pfams_gained_or_lost.add( domain_1 );
2125                             }
2126                         }
2127                         if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) {
2128                             writeDomainsToIndividualFilePerTreeNode( per_node_go_mapped_domain_gain_loss_outfile_writer,
2129                                                                      domain_0,
2130                                                                      domain_1 );
2131                             per_node_counter++;
2132                         }
2133                     }
2134                 }
2135                 if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) {
2136                     per_node_go_mapped_domain_gain_loss_outfile_writer.close();
2137                     if ( per_node_counter < 1 ) {
2138                         per_node_go_mapped_domain_gain_loss_outfile.delete();
2139                     }
2140                     per_node_counter = 0;
2141                 }
2142                 out.write( "</table>" );
2143                 out.write( SurfacingConstants.NL );
2144                 out.write( "<hr>" );
2145                 out.write( SurfacingConstants.NL );
2146             } // for( final String id : sorted_ids ) {  
2147             out.write( "</body>" );
2148             out.write( SurfacingConstants.NL );
2149             out.write( "</html>" );
2150             out.write( SurfacingConstants.NL );
2151             out.flush();
2152             out.close();
2153         }
2154         catch ( final IOException e ) {
2155             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
2156         }
2157         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters detailed HTML list: \"" + filename + "\"" );
2158     }
2159
2160     public static void writeDomainCombinationsCountsFile( final String[][] input_file_properties,
2161                                                           final File output_dir,
2162                                                           final Writer per_genome_domain_promiscuity_statistics_writer,
2163                                                           final GenomeWideCombinableDomains gwcd,
2164                                                           final int i,
2165                                                           final GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder dc_sort_order ) {
2166         File dc_outfile = new File( input_file_properties[ i ][ 1 ]
2167                 + surfacing.DOMAIN_COMBINITON_COUNTS_OUTPUTFILE_SUFFIX );
2168         if ( output_dir != null ) {
2169             dc_outfile = new File( output_dir + ForesterUtil.FILE_SEPARATOR + dc_outfile );
2170         }
2171         checkForOutputFileWriteability( dc_outfile );
2172         try {
2173             final BufferedWriter out = new BufferedWriter( new FileWriter( dc_outfile ) );
2174             out.write( gwcd.toStringBuilder( dc_sort_order ).toString() );
2175             out.close();
2176         }
2177         catch ( final IOException e ) {
2178             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
2179         }
2180         final DescriptiveStatistics stats = gwcd.getPerGenomeDomainPromiscuityStatistics();
2181         try {
2182             per_genome_domain_promiscuity_statistics_writer.write( input_file_properties[ i ][ 1 ] + "\t" );
2183             per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats.arithmeticMean() ) + "\t" );
2184             if ( stats.getN() < 2 ) {
2185                 per_genome_domain_promiscuity_statistics_writer.write( "n/a" + "\t" );
2186             }
2187             else {
2188                 per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats
2189                         .sampleStandardDeviation() ) + "\t" );
2190             }
2191             per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats.median() ) + "\t" );
2192             per_genome_domain_promiscuity_statistics_writer.write( ( int ) stats.getMin() + "\t" );
2193             per_genome_domain_promiscuity_statistics_writer.write( ( int ) stats.getMax() + "\t" );
2194             per_genome_domain_promiscuity_statistics_writer.write( stats.getN() + "\t" );
2195             final SortedSet<String> mpds = gwcd.getMostPromiscuosDomain();
2196             for( final String mpd : mpds ) {
2197                 per_genome_domain_promiscuity_statistics_writer.write( mpd + " " );
2198             }
2199             per_genome_domain_promiscuity_statistics_writer.write( ForesterUtil.LINE_SEPARATOR );
2200         }
2201         catch ( final IOException e ) {
2202             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
2203         }
2204         if ( input_file_properties[ i ].length == 3 ) {
2205             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \""
2206                     + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", "
2207                     + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile + "\"" );
2208         }
2209         else {
2210             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \""
2211                     + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ") to: \""
2212                     + dc_outfile + "\"" );
2213         }
2214     }
2215
2216     public static void writeDomainSimilaritiesToFile( final StringBuilder html_desc,
2217                                                       final StringBuilder html_title,
2218                                                       final Writer simple_tab_writer,
2219                                                       final Writer single_writer,
2220                                                       Map<Character, Writer> split_writers,
2221                                                       final SortedSet<DomainSimilarity> similarities,
2222                                                       final boolean treat_as_binary,
2223                                                       final List<Species> species_order,
2224                                                       final DomainSimilarity.PRINT_OPTION print_option,
2225                                                       final DomainSimilarity.DomainSimilarityScoring scoring,
2226                                                       final boolean verbose,
2227                                                       final Map<String, Integer> tax_code_to_id_map,
2228                                                       final Phylogeny phy,
2229                                                       final Set<String> pos_filter_doms ) throws IOException {
2230         if ( ( single_writer != null ) && ( ( split_writers == null ) || split_writers.isEmpty() ) ) {
2231             split_writers = new HashMap<Character, Writer>();
2232             split_writers.put( '_', single_writer );
2233         }
2234         switch ( print_option ) {
2235             case SIMPLE_TAB_DELIMITED:
2236                 break;
2237             case HTML:
2238                 for( final Character key : split_writers.keySet() ) {
2239                     final Writer w = split_writers.get( key );
2240                     w.write( "<html>" );
2241                     w.write( SurfacingConstants.NL );
2242                     if ( key != '_' ) {
2243                         writeHtmlHead( w, "DC analysis (" + html_title + ") " + key.toString().toUpperCase() );
2244                     }
2245                     else {
2246                         writeHtmlHead( w, "DC analysis (" + html_title + ")" );
2247                     }
2248                     w.write( SurfacingConstants.NL );
2249                     w.write( "<body>" );
2250                     w.write( SurfacingConstants.NL );
2251                     w.write( html_desc.toString() );
2252                     w.write( SurfacingConstants.NL );
2253                     w.write( "<hr>" );
2254                     w.write( SurfacingConstants.NL );
2255                     w.write( "<br>" );
2256                     w.write( SurfacingConstants.NL );
2257                     w.write( "<table>" );
2258                     w.write( SurfacingConstants.NL );
2259                     w.write( "<tr><td><b>Domains:</b></td></tr>" );
2260                     w.write( SurfacingConstants.NL );
2261                 }
2262                 break;
2263         }
2264         //
2265         for( final DomainSimilarity similarity : similarities ) {
2266             if ( ( species_order != null ) && !species_order.isEmpty() ) {
2267                 ( similarity ).setSpeciesOrder( species_order );
2268             }
2269             if ( single_writer != null ) {
2270                 if ( !ForesterUtil.isEmpty( pos_filter_doms ) && pos_filter_doms.contains( similarity.getDomainId() ) ) {
2271                     single_writer.write( "<tr><td><b><a href=\"#" + similarity.getDomainId()
2272                             + "\"><span style=\"color:#00ff00\">" + similarity.getDomainId()
2273                             + "</span></a></b></td></tr>" );
2274                 }
2275                 else {
2276                     single_writer.write( "<tr><td><b><a href=\"#" + similarity.getDomainId() + "\">"
2277                             + similarity.getDomainId() + "</a></b></td></tr>" );
2278                 }
2279                 single_writer.write( SurfacingConstants.NL );
2280             }
2281             else {
2282                 Writer local_writer = split_writers.get( ( similarity.getDomainId().charAt( 0 ) + "" ).toLowerCase()
2283                         .charAt( 0 ) );
2284                 if ( local_writer == null ) {
2285                     local_writer = split_writers.get( '0' );
2286                 }
2287                 if ( !ForesterUtil.isEmpty( pos_filter_doms ) && pos_filter_doms.contains( similarity.getDomainId() ) ) {
2288                     local_writer.write( "<tr><td><b><a href=\"#" + similarity.getDomainId()
2289                             + "\"><span style=\"color:#00ff00\">" + similarity.getDomainId()
2290                             + "</span></a></b></td></tr>" );
2291                 }
2292                 else {
2293                     local_writer.write( "<tr><td><b><a href=\"#" + similarity.getDomainId() + "\">"
2294                             + similarity.getDomainId() + "</a></b></td></tr>" );
2295                 }
2296                 local_writer.write( SurfacingConstants.NL );
2297             }
2298         }
2299         for( final Writer w : split_writers.values() ) {
2300             w.write( "</table>" );
2301             w.write( SurfacingConstants.NL );
2302             w.write( "<hr>" );
2303             w.write( SurfacingConstants.NL );
2304             //
2305             w.write( "<table>" );
2306             w.write( SurfacingConstants.NL );
2307             w.write( "<tr><td><b>" );
2308             w.write( "Species group colors:" );
2309             w.write( "</b></td></tr>" );
2310             w.write( SurfacingConstants.NL );
2311             writeColorLabels( "Deuterostomia", TaxonomyColors.DEUTEROSTOMIA_COLOR, w );
2312             writeColorLabels( "Protostomia", TaxonomyColors.PROTOSTOMIA_COLOR, w );
2313             writeColorLabels( "Cnidaria", TaxonomyColors.CNIDARIA_COLOR, w );
2314             writeColorLabels( "Placozoa", TaxonomyColors.PLACOZOA_COLOR, w );
2315             writeColorLabels( "Ctenophora (comb jellies)", TaxonomyColors.CTENOPHORA_COLOR, w );
2316             writeColorLabels( "Porifera (sponges)", TaxonomyColors.PORIFERA_COLOR, w );
2317             writeColorLabels( "Choanoflagellida", TaxonomyColors.CHOANOFLAGELLIDA, w );
2318             writeColorLabels( "Ichthyosporea & Filasterea", TaxonomyColors.ICHTHYOSPOREA_AND_FILASTEREA, w );
2319             writeColorLabels( "Dikarya (Ascomycota & Basidiomycota, so-called \"higher fungi\")",
2320                               TaxonomyColors.DIKARYA_COLOR,
2321                               w );
2322             writeColorLabels( "other Fungi", TaxonomyColors.OTHER_FUNGI_COLOR, w );
2323             writeColorLabels( "Nucleariidae and Fonticula group",
2324                               TaxonomyColors.NUCLEARIIDAE_AND_FONTICULA_GROUP_COLOR,
2325                               w );
2326             writeColorLabels( "Amoebozoa", TaxonomyColors.AMOEBOZOA_COLOR, w );
2327             writeColorLabels( "Embryophyta (plants)", TaxonomyColors.EMBRYOPHYTA_COLOR, w );
2328             writeColorLabels( "Chlorophyta (green algae)", TaxonomyColors.CHLOROPHYTA_COLOR, w );
2329             writeColorLabels( "Rhodophyta (red algae)", TaxonomyColors.RHODOPHYTA_COLOR, w );
2330             writeColorLabels( "Glaucocystophyce (Glaucophyta)", TaxonomyColors.GLAUCOPHYTA_COLOR, w );
2331             writeColorLabels( "Hacrobia (Cryptophyta & Haptophyceae & Centroheliozoa)",
2332                               TaxonomyColors.HACROBIA_COLOR,
2333                               w );
2334             writeColorLabels( "Stramenopiles (Chromophyta, heterokonts)", TaxonomyColors.STRAMENOPILES_COLOR, w );
2335             writeColorLabels( "Alveolata", TaxonomyColors.ALVEOLATA_COLOR, w );
2336             writeColorLabels( "Rhizaria", TaxonomyColors.RHIZARIA_COLOR, w );
2337             writeColorLabels( "Excavata", TaxonomyColors.EXCAVATA_COLOR, w );
2338             writeColorLabels( "Apusozoa", TaxonomyColors.APUSOZOA_COLOR, w );
2339             writeColorLabels( "Archaea", TaxonomyColors.ARCHAEA_COLOR, w );
2340             writeColorLabels( "Bacteria", TaxonomyColors.BACTERIA_COLOR, w );
2341             w.write( "</table>" );
2342             w.write( SurfacingConstants.NL );
2343             //
2344             w.write( "<hr>" );
2345             w.write( SurfacingConstants.NL );
2346             w.write( "<table>" );
2347             w.write( SurfacingConstants.NL );
2348         }
2349         //
2350         for( final DomainSimilarity similarity : similarities ) {
2351             if ( ( species_order != null ) && !species_order.isEmpty() ) {
2352                 ( similarity ).setSpeciesOrder( species_order );
2353             }
2354             if ( simple_tab_writer != null ) {
2355                 simple_tab_writer.write( similarity.toStringBuffer( PRINT_OPTION.SIMPLE_TAB_DELIMITED,
2356                                                                     tax_code_to_id_map,
2357                                                                     null ).toString() );
2358             }
2359             if ( single_writer != null ) {
2360                 single_writer.write( similarity.toStringBuffer( print_option, tax_code_to_id_map, phy ).toString() );
2361                 single_writer.write( SurfacingConstants.NL );
2362             }
2363             else {
2364                 Writer local_writer = split_writers.get( ( similarity.getDomainId().charAt( 0 ) + "" ).toLowerCase()
2365                         .charAt( 0 ) );
2366                 if ( local_writer == null ) {
2367                     local_writer = split_writers.get( '0' );
2368                 }
2369                 local_writer.write( similarity.toStringBuffer( print_option, tax_code_to_id_map, phy ).toString() );
2370                 local_writer.write( SurfacingConstants.NL );
2371             }
2372         }
2373         switch ( print_option ) {
2374             case HTML:
2375                 for( final Writer w : split_writers.values() ) {
2376                     w.write( SurfacingConstants.NL );
2377                     w.write( "</table>" );
2378                     w.write( SurfacingConstants.NL );
2379                     w.write( "</font>" );
2380                     w.write( SurfacingConstants.NL );
2381                     w.write( "</body>" );
2382                     w.write( SurfacingConstants.NL );
2383                     w.write( "</html>" );
2384                     w.write( SurfacingConstants.NL );
2385                 }
2386                 break;
2387             default:
2388                 break;
2389         }
2390         for( final Writer w : split_writers.values() ) {
2391             w.close();
2392         }
2393     }
2394
2395     public static void writeHtmlHead( final Writer w, final String title ) throws IOException {
2396         w.write( SurfacingConstants.NL );
2397         w.write( "<head>" );
2398         w.write( "<title>" );
2399         w.write( title );
2400         w.write( "</title>" );
2401         w.write( SurfacingConstants.NL );
2402         w.write( "<style>" );
2403         w.write( SurfacingConstants.NL );
2404         w.write( "a:visited { color : #000066; text-decoration : none; }" );
2405         w.write( SurfacingConstants.NL );
2406         w.write( "a:link { color : #000066; text-decoration : none; }" );
2407         w.write( SurfacingConstants.NL );
2408         w.write( "a:active { color : ##000066; text-decoration : none; }" );
2409         w.write( SurfacingConstants.NL );
2410         w.write( "a:hover { color : #FFFFFF; background-color : #000000; text-decoration : none; }" );
2411         w.write( SurfacingConstants.NL );
2412         //
2413         w.write( "a.pl:visited { color : #505050; text-decoration : none; font-size: 7px;}" );
2414         w.write( SurfacingConstants.NL );
2415         w.write( "a.pl:link { color : #505050; text-decoration : none; font-size: 7px;}" );
2416         w.write( SurfacingConstants.NL );
2417         w.write( "a.pl:active { color : #505050; text-decoration : none; font-size: 7px;}" );
2418         w.write( SurfacingConstants.NL );
2419         w.write( "a.pl:hover { color : #FFFFFF; background-color : #000000; text-decoration : none; font-size: 7px;}" );
2420         w.write( SurfacingConstants.NL );
2421         //
2422         w.write( "a.ps:visited { color : #707070; text-decoration : none; font-size: 7px;}" );
2423         w.write( SurfacingConstants.NL );
2424         w.write( "a.ps:link { color : #707070; text-decoration : none; font-size: 7px;}" );
2425         w.write( SurfacingConstants.NL );
2426         w.write( "a.ps:active { color : #707070; text-decoration : none; font-size: 7px;}" );
2427         w.write( SurfacingConstants.NL );
2428         w.write( "a.ps:hover { color : #FFFFFF; background-color : #000000; text-decoration : none; font-size: 7px;}" );
2429         w.write( SurfacingConstants.NL );
2430         //
2431         w.write( "td { text-align: left; vertical-align: top; font-family: Verdana, Arial, Helvetica; font-size: 8pt}" );
2432         w.write( SurfacingConstants.NL );
2433         w.write( "h1 { color : #0000FF; font-family: Verdana, Arial, Helvetica; font-size: 18pt; font-weight: bold }" );
2434         w.write( SurfacingConstants.NL );
2435         w.write( "h2 { color : #0000FF; font-family: Verdana, Arial, Helvetica; font-size: 16pt; font-weight: bold }" );
2436         w.write( SurfacingConstants.NL );
2437         w.write( "</style>" );
2438         w.write( SurfacingConstants.NL );
2439         w.write( "</head>" );
2440         w.write( SurfacingConstants.NL );
2441     }
2442
2443     public static void writeMatrixToFile( final CharacterStateMatrix<?> matrix,
2444                                           final String filename,
2445                                           final Format format ) {
2446         final File outfile = new File( filename );
2447         checkForOutputFileWriteability( outfile );
2448         try {
2449             final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
2450             matrix.toWriter( out, format );
2451             out.flush();
2452             out.close();
2453         }
2454         catch ( final IOException e ) {
2455             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
2456         }
2457         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote matrix: \"" + filename + "\"" );
2458     }
2459
2460     public static void writeMatrixToFile( final File matrix_outfile, final List<DistanceMatrix> matrices ) {
2461         checkForOutputFileWriteability( matrix_outfile );
2462         try {
2463             final BufferedWriter out = new BufferedWriter( new FileWriter( matrix_outfile ) );
2464             for( final DistanceMatrix distance_matrix : matrices ) {
2465                 out.write( distance_matrix.toStringBuffer( DistanceMatrix.Format.PHYLIP ).toString() );
2466                 out.write( ForesterUtil.LINE_SEPARATOR );
2467                 out.flush();
2468             }
2469             out.close();
2470         }
2471         catch ( final IOException e ) {
2472             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
2473         }
2474         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + matrix_outfile + "\"" );
2475     }
2476
2477     public static void writePhylogenyToFile( final Phylogeny phylogeny, final String filename ) {
2478         final PhylogenyWriter writer = new PhylogenyWriter();
2479         try {
2480             writer.toPhyloXML( new File( filename ), phylogeny, 1 );
2481         }
2482         catch ( final IOException e ) {
2483             ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "failed to write phylogeny to \"" + filename + "\": "
2484                     + e );
2485         }
2486         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote phylogeny to \"" + filename + "\"" );
2487     }
2488
2489     public static void writePresentToNexus( final File output_file,
2490                                             final File positive_filter_file,
2491                                             final SortedSet<String> filter,
2492                                             final List<GenomeWideCombinableDomains> gwcd_list ) {
2493         try {
2494             writeMatrixToFile( DomainParsimonyCalculator.createMatrixOfDomainPresenceOrAbsence( gwcd_list,
2495                                                                                                 positive_filter_file == null ? null
2496                                                                                                         : filter ),
2497                                output_file + surfacing.DOMAINS_PRESENT_NEXUS,
2498                                Format.NEXUS_BINARY );
2499             writeMatrixToFile( DomainParsimonyCalculator.createMatrixOfBinaryDomainCombinationPresenceOrAbsence( gwcd_list ),
2500                                output_file + surfacing.BDC_PRESENT_NEXUS,
2501                                Format.NEXUS_BINARY );
2502         }
2503         catch ( final Exception e ) {
2504             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
2505         }
2506     }
2507
2508     public static void writeProteinListsForAllSpecies( final File output_dir,
2509                                                        final SortedMap<Species, List<Protein>> protein_lists_per_species,
2510                                                        final List<GenomeWideCombinableDomains> gwcd_list,
2511                                                        final double domain_e_cutoff ) {
2512         final SortedSet<String> all_domains = new TreeSet<String>();
2513         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
2514             all_domains.addAll( gwcd.getAllDomainIds() );
2515         }
2516         for( final String domain : all_domains ) {
2517             final File out = new File( output_dir + ForesterUtil.FILE_SEPARATOR + domain + surfacing.SEQ_EXTRACT_SUFFIX );
2518             checkForOutputFileWriteability( out );
2519             try {
2520                 final Writer proteins_file_writer = new BufferedWriter( new FileWriter( out ) );
2521                 extractProteinNames( protein_lists_per_species,
2522                                      domain,
2523                                      proteins_file_writer,
2524                                      "\t",
2525                                      surfacing.LIMIT_SPEC_FOR_PROT_EX,
2526                                      domain_e_cutoff );
2527                 proteins_file_writer.close();
2528             }
2529             catch ( final IOException e ) {
2530                 ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
2531             }
2532             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote proteins list to \"" + out + "\"" );
2533         }
2534     }
2535
2536     public static void writeTaxonomyLinks( final Writer writer,
2537                                            final String species,
2538                                            final Map<String, Integer> tax_code_to_id_map ) throws IOException {
2539         if ( ( species.length() > 1 ) && ( species.indexOf( '_' ) < 1 ) ) {
2540             writer.write( " [" );
2541             if ( ( tax_code_to_id_map != null ) && tax_code_to_id_map.containsKey( species ) ) {
2542                 writer.write( "<a href=\"" + SurfacingConstants.UNIPROT_TAXONOMY_ID_LINK
2543                         + tax_code_to_id_map.get( species ) + "\" target=\"taxonomy_window\">uniprot</a>" );
2544             }
2545             else {
2546                 writer.write( "<a href=\"" + SurfacingConstants.EOL_LINK + species
2547                         + "\" target=\"taxonomy_window\">eol</a>" );
2548                 writer.write( "|" );
2549                 writer.write( "<a href=\"" + SurfacingConstants.GOOGLE_SCHOLAR_SEARCH + species
2550                         + "\" target=\"taxonomy_window\">scholar</a>" );
2551                 writer.write( "|" );
2552                 writer.write( "<a href=\"" + SurfacingConstants.GOOGLE_WEB_SEARCH_LINK + species
2553                         + "\" target=\"taxonomy_window\">google</a>" );
2554             }
2555             writer.write( "]" );
2556         }
2557     }
2558
2559     private final static void addToCountMap( final Map<String, Integer> map, final String s ) {
2560         if ( map.containsKey( s ) ) {
2561             map.put( s, map.get( s ) + 1 );
2562         }
2563         else {
2564             map.put( s, 1 );
2565         }
2566     }
2567
2568     private static void calculateIndependentDomainCombinationGains( final Phylogeny local_phylogeny_l,
2569                                                                     final String outfilename_for_counts,
2570                                                                     final String outfilename_for_dc,
2571                                                                     final String outfilename_for_dc_for_go_mapping,
2572                                                                     final String outfilename_for_dc_for_go_mapping_unique,
2573                                                                     final String outfilename_for_rank_counts,
2574                                                                     final String outfilename_for_ancestor_species_counts,
2575                                                                     final String outfilename_for_protein_stats,
2576                                                                     final Map<String, DescriptiveStatistics> protein_length_stats_by_dc,
2577                                                                     final Map<String, DescriptiveStatistics> domain_number_stats_by_dc,
2578                                                                     final Map<String, DescriptiveStatistics> domain_length_stats_by_domain ) {
2579         try {
2580             //
2581             //            if ( protein_length_stats_by_dc != null ) {
2582             //                for( final Entry<?, DescriptiveStatistics> entry : protein_length_stats_by_dc.entrySet() ) {
2583             //                    System.out.print( entry.getKey().toString() );
2584             //                    System.out.print( ": " );
2585             //                    double[] a = entry.getValue().getDataAsDoubleArray();
2586             //                    for( int i = 0; i < a.length; i++ ) {
2587             //                        System.out.print( a[ i ] + " " );
2588             //                    }
2589             //                    System.out.println();
2590             //                }
2591             //            }
2592             //            if ( domain_number_stats_by_dc != null ) {
2593             //                for( final Entry<?, DescriptiveStatistics> entry : domain_number_stats_by_dc.entrySet() ) {
2594             //                    System.out.print( entry.getKey().toString() );
2595             //                    System.out.print( ": " );
2596             //                    double[] a = entry.getValue().getDataAsDoubleArray();
2597             //                    for( int i = 0; i < a.length; i++ ) {
2598             //                        System.out.print( a[ i ] + " " );
2599             //                    }
2600             //                    System.out.println();
2601             //                }
2602             //            }
2603             //
2604             final BufferedWriter out_counts = new BufferedWriter( new FileWriter( outfilename_for_counts ) );
2605             final BufferedWriter out_dc = new BufferedWriter( new FileWriter( outfilename_for_dc ) );
2606             final BufferedWriter out_dc_for_go_mapping = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping ) );
2607             final BufferedWriter out_dc_for_go_mapping_unique = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping_unique ) );
2608             final SortedMap<String, Integer> dc_gain_counts = new TreeMap<String, Integer>();
2609             for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorPostorder(); it.hasNext(); ) {
2610                 final PhylogenyNode n = it.next();
2611                 final Set<String> gained_dc = n.getNodeData().getBinaryCharacters().getGainedCharacters();
2612                 for( final String dc : gained_dc ) {
2613                     if ( dc_gain_counts.containsKey( dc ) ) {
2614                         dc_gain_counts.put( dc, dc_gain_counts.get( dc ) + 1 );
2615                     }
2616                     else {
2617                         dc_gain_counts.put( dc, 1 );
2618                     }
2619                 }
2620             }
2621             final SortedMap<Integer, Integer> histogram = new TreeMap<Integer, Integer>();
2622             final SortedMap<Integer, StringBuilder> domain_lists = new TreeMap<Integer, StringBuilder>();
2623             final SortedMap<Integer, DescriptiveStatistics> dc_reapp_counts_to_protein_length_stats = new TreeMap<Integer, DescriptiveStatistics>();
2624             final SortedMap<Integer, DescriptiveStatistics> dc_reapp_counts_to_domain_number_stats = new TreeMap<Integer, DescriptiveStatistics>();
2625             final SortedMap<Integer, DescriptiveStatistics> dc_reapp_counts_to_domain_lengths_stats = new TreeMap<Integer, DescriptiveStatistics>();
2626             final SortedMap<Integer, PriorityQueue<String>> domain_lists_go = new TreeMap<Integer, PriorityQueue<String>>();
2627             final SortedMap<Integer, SortedSet<String>> domain_lists_go_unique = new TreeMap<Integer, SortedSet<String>>();
2628             final Set<String> dcs = dc_gain_counts.keySet();
2629             final SortedSet<String> more_than_once = new TreeSet<String>();
2630             DescriptiveStatistics gained_once_lengths_stats = new BasicDescriptiveStatistics();
2631             DescriptiveStatistics gained_once_domain_count_stats = new BasicDescriptiveStatistics();
2632             DescriptiveStatistics gained_multiple_times_lengths_stats = new BasicDescriptiveStatistics();
2633             final DescriptiveStatistics gained_multiple_times_domain_count_stats = new BasicDescriptiveStatistics();
2634             long gained_multiple_times_domain_length_sum = 0;
2635             long gained_once_domain_length_sum = 0;
2636             long gained_multiple_times_domain_length_count = 0;
2637             long gained_once_domain_length_count = 0;
2638             for( final String dc : dcs ) {
2639                 final int count = dc_gain_counts.get( dc );
2640                 if ( histogram.containsKey( count ) ) {
2641                     histogram.put( count, histogram.get( count ) + 1 );
2642                     domain_lists.get( count ).append( ", " + dc );
2643                     domain_lists_go.get( count ).addAll( splitDomainCombination( dc ) );
2644                     domain_lists_go_unique.get( count ).addAll( splitDomainCombination( dc ) );
2645                 }
2646                 else {
2647                     histogram.put( count, 1 );
2648                     domain_lists.put( count, new StringBuilder( dc ) );
2649                     final PriorityQueue<String> q = new PriorityQueue<String>();
2650                     q.addAll( splitDomainCombination( dc ) );
2651                     domain_lists_go.put( count, q );
2652                     final SortedSet<String> set = new TreeSet<String>();
2653                     set.addAll( splitDomainCombination( dc ) );
2654                     domain_lists_go_unique.put( count, set );
2655                 }
2656                 if ( protein_length_stats_by_dc != null ) {
2657                     if ( !dc_reapp_counts_to_protein_length_stats.containsKey( count ) ) {
2658                         dc_reapp_counts_to_protein_length_stats.put( count, new BasicDescriptiveStatistics() );
2659                     }
2660                     dc_reapp_counts_to_protein_length_stats.get( count ).addValue( protein_length_stats_by_dc.get( dc )
2661                             .arithmeticMean() );
2662                 }
2663                 if ( domain_number_stats_by_dc != null ) {
2664                     if ( !dc_reapp_counts_to_domain_number_stats.containsKey( count ) ) {
2665                         dc_reapp_counts_to_domain_number_stats.put( count, new BasicDescriptiveStatistics() );
2666                     }
2667                     dc_reapp_counts_to_domain_number_stats.get( count ).addValue( domain_number_stats_by_dc.get( dc )
2668                             .arithmeticMean() );
2669                 }
2670                 if ( domain_length_stats_by_domain != null ) {
2671                     if ( !dc_reapp_counts_to_domain_lengths_stats.containsKey( count ) ) {
2672                         dc_reapp_counts_to_domain_lengths_stats.put( count, new BasicDescriptiveStatistics() );
2673                     }
2674                     final String[] ds = dc.split( "=" );
2675                     dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain
2676                             .get( ds[ 0 ] ).arithmeticMean() );
2677                     dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain
2678                             .get( ds[ 1 ] ).arithmeticMean() );
2679                 }
2680                 if ( count > 1 ) {
2681                     more_than_once.add( dc );
2682                     if ( protein_length_stats_by_dc != null ) {
2683                         final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc );
2684                         for( final double element : s.getData() ) {
2685                             gained_multiple_times_lengths_stats.addValue( element );
2686                         }
2687                     }
2688                     if ( domain_number_stats_by_dc != null ) {
2689                         final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc );
2690                         for( final double element : s.getData() ) {
2691                             gained_multiple_times_domain_count_stats.addValue( element );
2692                         }
2693                     }
2694                     if ( domain_length_stats_by_domain != null ) {
2695                         final String[] ds = dc.split( "=" );
2696                         final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] );
2697                         final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] );
2698                         for( final double element : s0.getData() ) {
2699                             gained_multiple_times_domain_length_sum += element;
2700                             ++gained_multiple_times_domain_length_count;
2701                         }
2702                         for( final double element : s1.getData() ) {
2703                             gained_multiple_times_domain_length_sum += element;
2704                             ++gained_multiple_times_domain_length_count;
2705                         }
2706                     }
2707                 }
2708                 else {
2709                     if ( protein_length_stats_by_dc != null ) {
2710                         final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc );
2711                         for( final double element : s.getData() ) {
2712                             gained_once_lengths_stats.addValue( element );
2713                         }
2714                     }
2715                     if ( domain_number_stats_by_dc != null ) {
2716                         final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc );
2717                         for( final double element : s.getData() ) {
2718                             gained_once_domain_count_stats.addValue( element );
2719                         }
2720                     }
2721                     if ( domain_length_stats_by_domain != null ) {
2722                         final String[] ds = dc.split( "=" );
2723                         final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] );
2724                         final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] );
2725                         for( final double element : s0.getData() ) {
2726                             gained_once_domain_length_sum += element;
2727                             ++gained_once_domain_length_count;
2728                         }
2729                         for( final double element : s1.getData() ) {
2730                             gained_once_domain_length_sum += element;
2731                             ++gained_once_domain_length_count;
2732                         }
2733                     }
2734                 }
2735             }
2736             final Set<Integer> histogram_keys = histogram.keySet();
2737             for( final Integer histogram_key : histogram_keys ) {
2738                 final int count = histogram.get( histogram_key );
2739                 final StringBuilder dc = domain_lists.get( histogram_key );
2740                 out_counts.write( histogram_key + "\t" + count + ForesterUtil.LINE_SEPARATOR );
2741                 out_dc.write( histogram_key + "\t" + dc + ForesterUtil.LINE_SEPARATOR );
2742                 out_dc_for_go_mapping.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR );
2743                 final Object[] sorted = domain_lists_go.get( histogram_key ).toArray();
2744                 Arrays.sort( sorted );
2745                 for( final Object domain : sorted ) {
2746                     out_dc_for_go_mapping.write( domain + ForesterUtil.LINE_SEPARATOR );
2747                 }
2748                 out_dc_for_go_mapping_unique.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR );
2749                 for( final String domain : domain_lists_go_unique.get( histogram_key ) ) {
2750                     out_dc_for_go_mapping_unique.write( domain + ForesterUtil.LINE_SEPARATOR );
2751                 }
2752             }
2753             out_counts.close();
2754             out_dc.close();
2755             out_dc_for_go_mapping.close();
2756             out_dc_for_go_mapping_unique.close();
2757             final SortedMap<String, Integer> lca_rank_counts = new TreeMap<String, Integer>();
2758             final SortedMap<String, Integer> lca_ancestor_species_counts = new TreeMap<String, Integer>();
2759             for( final String dc : more_than_once ) {
2760                 final List<PhylogenyNode> nodes = new ArrayList<PhylogenyNode>();
2761                 for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorExternalForward(); it.hasNext(); ) {
2762                     final PhylogenyNode n = it.next();
2763                     if ( n.getNodeData().getBinaryCharacters().getGainedCharacters().contains( dc ) ) {
2764                         nodes.add( n );
2765                     }
2766                 }
2767                 for( int i = 0; i < ( nodes.size() - 1 ); ++i ) {
2768                     for( int j = i + 1; j < nodes.size(); ++j ) {
2769                         final PhylogenyNode lca = PhylogenyMethods.calculateLCA( nodes.get( i ), nodes.get( j ) );
2770                         String rank = "unknown";
2771                         if ( lca.getNodeData().isHasTaxonomy()
2772                                 && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getRank() ) ) {
2773                             rank = lca.getNodeData().getTaxonomy().getRank();
2774                         }
2775                         addToCountMap( lca_rank_counts, rank );
2776                         String lca_species;
2777                         if ( lca.getNodeData().isHasTaxonomy()
2778                                 && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getScientificName() ) ) {
2779                             lca_species = lca.getNodeData().getTaxonomy().getScientificName();
2780                         }
2781                         else if ( lca.getNodeData().isHasTaxonomy()
2782                                 && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getCommonName() ) ) {
2783                             lca_species = lca.getNodeData().getTaxonomy().getCommonName();
2784                         }
2785                         else {
2786                             lca_species = lca.getName();
2787                         }
2788                         addToCountMap( lca_ancestor_species_counts, lca_species );
2789                     }
2790                 }
2791             }
2792             final BufferedWriter out_for_rank_counts = new BufferedWriter( new FileWriter( outfilename_for_rank_counts ) );
2793             final BufferedWriter out_for_ancestor_species_counts = new BufferedWriter( new FileWriter( outfilename_for_ancestor_species_counts ) );
2794             ForesterUtil.map2writer( out_for_rank_counts, lca_rank_counts, "\t", ForesterUtil.LINE_SEPARATOR );
2795             ForesterUtil.map2writer( out_for_ancestor_species_counts,
2796                                      lca_ancestor_species_counts,
2797                                      "\t",
2798                                      ForesterUtil.LINE_SEPARATOR );
2799             out_for_rank_counts.close();
2800             out_for_ancestor_species_counts.close();
2801             if ( !ForesterUtil.isEmpty( outfilename_for_protein_stats )
2802                     && ( ( domain_length_stats_by_domain != null ) || ( protein_length_stats_by_dc != null ) || ( domain_number_stats_by_dc != null ) ) ) {
2803                 final BufferedWriter w = new BufferedWriter( new FileWriter( outfilename_for_protein_stats ) );
2804                 w.write( "Domain Lengths: " );
2805                 w.write( "\n" );
2806                 if ( domain_length_stats_by_domain != null ) {
2807                     for( final Entry<Integer, DescriptiveStatistics> entry : dc_reapp_counts_to_domain_lengths_stats
2808                             .entrySet() ) {
2809                         w.write( entry.getKey().toString() );
2810                         w.write( "\t" + entry.getValue().arithmeticMean() );
2811                         w.write( "\t" + entry.getValue().median() );
2812                         w.write( "\n" );
2813                     }
2814                 }
2815                 w.flush();
2816                 w.write( "\n" );
2817                 w.write( "\n" );
2818                 w.write( "Protein Lengths: " );
2819                 w.write( "\n" );
2820                 if ( protein_length_stats_by_dc != null ) {
2821                     for( final Entry<Integer, DescriptiveStatistics> entry : dc_reapp_counts_to_protein_length_stats
2822                             .entrySet() ) {
2823                         w.write( entry.getKey().toString() );
2824                         w.write( "\t" + entry.getValue().arithmeticMean() );
2825                         w.write( "\t" + entry.getValue().median() );
2826                         w.write( "\n" );
2827                     }
2828                 }
2829                 w.flush();
2830                 w.write( "\n" );
2831                 w.write( "\n" );
2832                 w.write( "Number of domains: " );
2833                 w.write( "\n" );
2834                 if ( domain_number_stats_by_dc != null ) {
2835                     for( final Entry<Integer, DescriptiveStatistics> entry : dc_reapp_counts_to_domain_number_stats
2836                             .entrySet() ) {
2837                         w.write( entry.getKey().toString() );
2838                         w.write( "\t" + entry.getValue().arithmeticMean() );
2839                         w.write( "\t" + entry.getValue().median() );
2840                         w.write( "\n" );
2841                     }
2842                 }
2843                 w.flush();
2844                 w.write( "\n" );
2845                 w.write( "\n" );
2846                 w.write( "Gained once, domain lengths:" );
2847                 w.write( "\n" );
2848                 w.write( "N: " + gained_once_domain_length_count );
2849                 w.write( "\n" );
2850                 w.write( "Avg: " + ( ( double ) gained_once_domain_length_sum / gained_once_domain_length_count ) );
2851                 w.write( "\n" );
2852                 w.write( "\n" );
2853                 w.write( "Gained multiple times, domain lengths:" );
2854                 w.write( "\n" );
2855                 w.write( "N: " + gained_multiple_times_domain_length_count );
2856                 w.write( "\n" );
2857                 w.write( "Avg: "
2858                         + ( ( double ) gained_multiple_times_domain_length_sum / gained_multiple_times_domain_length_count ) );
2859                 w.write( "\n" );
2860                 w.write( "\n" );
2861                 w.write( "\n" );
2862                 w.write( "\n" );
2863                 w.write( "Gained once, protein lengths:" );
2864                 w.write( "\n" );
2865                 w.write( gained_once_lengths_stats.toString() );
2866                 gained_once_lengths_stats = null;
2867                 w.write( "\n" );
2868                 w.write( "\n" );
2869                 w.write( "Gained once, domain counts:" );
2870                 w.write( "\n" );
2871                 w.write( gained_once_domain_count_stats.toString() );
2872                 gained_once_domain_count_stats = null;
2873                 w.write( "\n" );
2874                 w.write( "\n" );
2875                 w.write( "Gained multiple times, protein lengths:" );
2876                 w.write( "\n" );
2877                 w.write( gained_multiple_times_lengths_stats.toString() );
2878                 gained_multiple_times_lengths_stats = null;
2879                 w.write( "\n" );
2880                 w.write( "\n" );
2881                 w.write( "Gained multiple times, domain counts:" );
2882                 w.write( "\n" );
2883                 w.write( gained_multiple_times_domain_count_stats.toString() );
2884                 w.flush();
2885                 w.close();
2886             }
2887         }
2888         catch ( final IOException e ) {
2889             ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e );
2890         }
2891         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch counts to ["
2892                 + outfilename_for_counts + "]" );
2893         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch lists to ["
2894                 + outfilename_for_dc + "]" );
2895         ForesterUtil.programMessage( surfacing.PRG_NAME,
2896                                      "Wrote independent domain combination gains fitch lists to (for GO mapping) ["
2897                                              + outfilename_for_dc_for_go_mapping + "]" );
2898         ForesterUtil.programMessage( surfacing.PRG_NAME,
2899                                      "Wrote independent domain combination gains fitch lists to (for GO mapping, unique) ["
2900                                              + outfilename_for_dc_for_go_mapping_unique + "]" );
2901     }
2902
2903     private static SortedSet<String> collectAllDomainsChangedOnSubtree( final PhylogenyNode subtree_root,
2904                                                                         final boolean get_gains ) {
2905         final SortedSet<String> domains = new TreeSet<String>();
2906         for( final PhylogenyNode descendant : PhylogenyMethods.getAllDescendants( subtree_root ) ) {
2907             final BinaryCharacters chars = descendant.getNodeData().getBinaryCharacters();
2908             if ( get_gains ) {
2909                 domains.addAll( chars.getGainedCharacters() );
2910             }
2911             else {
2912                 domains.addAll( chars.getLostCharacters() );
2913             }
2914         }
2915         return domains;
2916     }
2917
2918     private static File createBaseDirForPerNodeDomainFiles( final String base_dir,
2919                                                             final boolean domain_combinations,
2920                                                             final CharacterStateMatrix.GainLossStates state,
2921                                                             final String outfile ) {
2922         File per_node_go_mapped_domain_gain_loss_files_base_dir = new File( new File( outfile ).getParent()
2923                 + ForesterUtil.FILE_SEPARATOR + base_dir );
2924         if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) {
2925             per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir();
2926         }
2927         if ( domain_combinations ) {
2928             per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
2929                     + ForesterUtil.FILE_SEPARATOR + "DC" );
2930         }
2931         else {
2932             per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
2933                     + ForesterUtil.FILE_SEPARATOR + "DOMAINS" );
2934         }
2935         if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) {
2936             per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir();
2937         }
2938         if ( state == GainLossStates.GAIN ) {
2939             per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
2940                     + ForesterUtil.FILE_SEPARATOR + "GAINS" );
2941         }
2942         else if ( state == GainLossStates.LOSS ) {
2943             per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
2944                     + ForesterUtil.FILE_SEPARATOR + "LOSSES" );
2945         }
2946         else {
2947             per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
2948                     + ForesterUtil.FILE_SEPARATOR + "PRESENT" );
2949         }
2950         if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) {
2951             per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir();
2952         }
2953         return per_node_go_mapped_domain_gain_loss_files_base_dir;
2954     }
2955
2956     private static SortedSet<BinaryDomainCombination> createSetOfAllBinaryDomainCombinationsPerGenome( final GenomeWideCombinableDomains gwcd ) {
2957         final SortedMap<String, CombinableDomains> cds = gwcd.getAllCombinableDomainsIds();
2958         final SortedSet<BinaryDomainCombination> binary_combinations = new TreeSet<BinaryDomainCombination>();
2959         for( final String domain_id : cds.keySet() ) {
2960             final CombinableDomains cd = cds.get( domain_id );
2961             binary_combinations.addAll( cd.toBinaryDomainCombinations() );
2962         }
2963         return binary_combinations;
2964     }
2965
2966     private static void printSomeStats( final DescriptiveStatistics stats, final AsciiHistogram histo, final Writer w )
2967             throws IOException {
2968         w.write( "<hr>" );
2969         w.write( "<br>" );
2970         w.write( SurfacingConstants.NL );
2971         w.write( "<tt><pre>" );
2972         w.write( SurfacingConstants.NL );
2973         if ( histo != null ) {
2974             w.write( histo.toStringBuffer( 20, '|', 40, 5 ).toString() );
2975             w.write( SurfacingConstants.NL );
2976         }
2977         w.write( "</pre></tt>" );
2978         w.write( SurfacingConstants.NL );
2979         w.write( "<table>" );
2980         w.write( SurfacingConstants.NL );
2981         w.write( "<tr><td>N: </td><td>" + stats.getN() + "</td></tr>" );
2982         w.write( SurfacingConstants.NL );
2983         w.write( "<tr><td>Min: </td><td>" + stats.getMin() + "</td></tr>" );
2984         w.write( SurfacingConstants.NL );
2985         w.write( "<tr><td>Max: </td><td>" + stats.getMax() + "</td></tr>" );
2986         w.write( SurfacingConstants.NL );
2987         w.write( "<tr><td>Mean: </td><td>" + stats.arithmeticMean() + "</td></tr>" );
2988         w.write( SurfacingConstants.NL );
2989         if ( stats.getN() > 1 ) {
2990             w.write( "<tr><td>SD: </td><td>" + stats.sampleStandardDeviation() + "</td></tr>" );
2991         }
2992         else {
2993             w.write( "<tr><td>SD: </td><td>n/a</td></tr>" );
2994         }
2995         w.write( SurfacingConstants.NL );
2996         w.write( "</table>" );
2997         w.write( SurfacingConstants.NL );
2998         w.write( "<br>" );
2999         w.write( SurfacingConstants.NL );
3000     }
3001
3002     private static List<String> splitDomainCombination( final String dc ) {
3003         final String[] s = dc.split( "=" );
3004         if ( s.length != 2 ) {
3005             ForesterUtil.printErrorMessage( surfacing.PRG_NAME, "Stringyfied domain combination has illegal format: "
3006                     + dc );
3007             System.exit( -1 );
3008         }
3009         final List<String> l = new ArrayList<String>( 2 );
3010         l.add( s[ 0 ] );
3011         l.add( s[ 1 ] );
3012         return l;
3013     }
3014
3015     private static void writeAllEncounteredPfamsToFile( final Map<String, List<GoId>> domain_id_to_go_ids_map,
3016                                                         final Map<GoId, GoTerm> go_id_to_term_map,
3017                                                         final String outfile_name,
3018                                                         final SortedSet<String> all_pfams_encountered ) {
3019         final File all_pfams_encountered_file = new File( outfile_name + surfacing.ALL_PFAMS_ENCOUNTERED_SUFFIX );
3020         final File all_pfams_encountered_with_go_annotation_file = new File( outfile_name
3021                 + surfacing.ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX );
3022         final File encountered_pfams_summary_file = new File( outfile_name + surfacing.ENCOUNTERED_PFAMS_SUMMARY_SUFFIX );
3023         int biological_process_counter = 0;
3024         int cellular_component_counter = 0;
3025         int molecular_function_counter = 0;
3026         int pfams_with_mappings_counter = 0;
3027         int pfams_without_mappings_counter = 0;
3028         int pfams_without_mappings_to_bp_or_mf_counter = 0;
3029         int pfams_with_mappings_to_bp_or_mf_counter = 0;
3030         try {
3031             final Writer all_pfams_encountered_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_file ) );
3032             final Writer all_pfams_encountered_with_go_annotation_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_with_go_annotation_file ) );
3033             final Writer summary_writer = new BufferedWriter( new FileWriter( encountered_pfams_summary_file ) );
3034             summary_writer.write( "# Pfam to GO mapping summary" );
3035             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
3036             summary_writer.write( "# Actual summary is at the end of this file." );
3037             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
3038             summary_writer.write( "# Encountered Pfams without a GO mapping:" );
3039             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
3040             for( final String pfam : all_pfams_encountered ) {
3041                 all_pfams_encountered_writer.write( pfam );
3042                 all_pfams_encountered_writer.write( ForesterUtil.LINE_SEPARATOR );
3043                 final String domain_id = new String( pfam );
3044                 if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) {
3045                     ++pfams_with_mappings_counter;
3046                     all_pfams_encountered_with_go_annotation_writer.write( pfam );
3047                     all_pfams_encountered_with_go_annotation_writer.write( ForesterUtil.LINE_SEPARATOR );
3048                     final List<GoId> go_ids = domain_id_to_go_ids_map.get( domain_id );
3049                     boolean maps_to_bp = false;
3050                     boolean maps_to_cc = false;
3051                     boolean maps_to_mf = false;
3052                     for( final GoId go_id : go_ids ) {
3053                         final GoTerm go_term = go_id_to_term_map.get( go_id );
3054                         if ( go_term.getGoNameSpace().isBiologicalProcess() ) {
3055                             maps_to_bp = true;
3056                         }
3057                         else if ( go_term.getGoNameSpace().isCellularComponent() ) {
3058                             maps_to_cc = true;
3059                         }
3060                         else if ( go_term.getGoNameSpace().isMolecularFunction() ) {
3061                             maps_to_mf = true;
3062                         }
3063                     }
3064                     if ( maps_to_bp ) {
3065                         ++biological_process_counter;
3066                     }
3067                     if ( maps_to_cc ) {
3068                         ++cellular_component_counter;
3069                     }
3070                     if ( maps_to_mf ) {
3071                         ++molecular_function_counter;
3072                     }
3073                     if ( maps_to_bp || maps_to_mf ) {
3074                         ++pfams_with_mappings_to_bp_or_mf_counter;
3075                     }
3076                     else {
3077                         ++pfams_without_mappings_to_bp_or_mf_counter;
3078                     }
3079                 }
3080                 else {
3081                     ++pfams_without_mappings_to_bp_or_mf_counter;
3082                     ++pfams_without_mappings_counter;
3083                     summary_writer.write( pfam );
3084                     summary_writer.write( ForesterUtil.LINE_SEPARATOR );
3085                 }
3086             }
3087             all_pfams_encountered_writer.close();
3088             all_pfams_encountered_with_go_annotation_writer.close();
3089             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + all_pfams_encountered.size()
3090                     + "] encountered Pfams to: \"" + all_pfams_encountered_file + "\"" );
3091             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + pfams_with_mappings_counter
3092                     + "] encountered Pfams with GO mappings to: \"" + all_pfams_encountered_with_go_annotation_file
3093                     + "\"" );
3094             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote summary (including all ["
3095                     + pfams_without_mappings_counter + "] encountered Pfams without GO mappings) to: \""
3096                     + encountered_pfams_summary_file + "\"" );
3097             ForesterUtil.programMessage( surfacing.PRG_NAME, "Sum of Pfams encountered                : "
3098                     + all_pfams_encountered.size() );
3099             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without a mapping                 : "
3100                     + pfams_without_mappings_counter + " ["
3101                     + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
3102             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without mapping to proc. or func. : "
3103                     + pfams_without_mappings_to_bp_or_mf_counter + " ["
3104                     + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
3105             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping                    : "
3106                     + pfams_with_mappings_counter + " ["
3107                     + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
3108             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping to proc. or func.  : "
3109                     + pfams_with_mappings_to_bp_or_mf_counter + " ["
3110                     + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
3111             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to biological process: "
3112                     + biological_process_counter + " ["
3113                     + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" );
3114             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to molecular function: "
3115                     + molecular_function_counter + " ["
3116                     + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" );
3117             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to cellular component: "
3118                     + cellular_component_counter + " ["
3119                     + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" );
3120             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
3121             summary_writer.write( "# Sum of Pfams encountered                : " + all_pfams_encountered.size() );
3122             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
3123             summary_writer.write( "# Pfams without a mapping                 : " + pfams_without_mappings_counter
3124                     + " [" + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
3125             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
3126             summary_writer.write( "# Pfams without mapping to proc. or func. : "
3127                     + pfams_without_mappings_to_bp_or_mf_counter + " ["
3128                     + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
3129             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
3130             summary_writer.write( "# Pfams with a mapping                    : " + pfams_with_mappings_counter + " ["
3131                     + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
3132             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
3133             summary_writer.write( "# Pfams with a mapping to proc. or func.  : "
3134                     + pfams_with_mappings_to_bp_or_mf_counter + " ["
3135                     + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
3136             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
3137             summary_writer.write( "# Pfams with mapping to biological process: " + biological_process_counter + " ["
3138                     + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" );
3139             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
3140             summary_writer.write( "# Pfams with mapping to molecular function: " + molecular_function_counter + " ["
3141                     + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" );
3142             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
3143             summary_writer.write( "# Pfams with mapping to cellular component: " + cellular_component_counter + " ["
3144                     + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" );
3145             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
3146             summary_writer.close();
3147         }
3148         catch ( final IOException e ) {
3149             ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e );
3150         }
3151     }
3152
3153     private final static void writeColorLabels( final String l, final Color c, final Writer w ) throws IOException {
3154         w.write( "<tr><td><b><span style=\"color:" );
3155         w.write( String.format( "#%02x%02x%02x", c.getRed(), c.getGreen(), c.getBlue() ) );
3156         w.write( "\">" );
3157         w.write( l );
3158         w.write( "</span></b></td></tr>" );
3159         w.write( SurfacingConstants.NL );
3160     }
3161
3162     private static void writeDomainData( final Map<String, List<GoId>> domain_id_to_go_ids_map,
3163                                          final Map<GoId, GoTerm> go_id_to_term_map,
3164                                          final GoNameSpace go_namespace_limit,
3165                                          final Writer out,
3166                                          final String domain_0,
3167                                          final String domain_1,
3168                                          final String prefix_for_html,
3169                                          final String character_separator_for_non_html_output,
3170                                          final Map<String, Set<String>>[] domain_id_to_secondary_features_maps,
3171                                          final Set<GoId> all_go_ids ) throws IOException {
3172         boolean any_go_annotation_present = false;
3173         boolean first_has_no_go = false;
3174         int domain_count = 2; // To distinguish between domains and binary domain combinations.
3175         if ( ForesterUtil.isEmpty( domain_1 ) ) {
3176             domain_count = 1;
3177         }
3178         // The following has a difficult to understand logic.  
3179         for( int d = 0; d < domain_count; ++d ) {
3180             List<GoId> go_ids = null;
3181             boolean go_annotation_present = false;
3182             if ( d == 0 ) {
3183                 if ( domain_id_to_go_ids_map.containsKey( domain_0 ) ) {
3184                     go_annotation_present = true;
3185                     any_go_annotation_present = true;
3186                     go_ids = domain_id_to_go_ids_map.get( domain_0 );
3187                 }
3188                 else {
3189                     first_has_no_go = true;
3190                 }
3191             }
3192             else {
3193                 if ( domain_id_to_go_ids_map.containsKey( domain_1 ) ) {
3194                     go_annotation_present = true;
3195                     any_go_annotation_present = true;
3196                     go_ids = domain_id_to_go_ids_map.get( domain_1 );
3197                 }
3198             }
3199             if ( go_annotation_present ) {
3200                 boolean first = ( ( d == 0 ) || ( ( d == 1 ) && first_has_no_go ) );
3201                 for( final GoId go_id : go_ids ) {
3202                     out.write( "<tr>" );
3203                     if ( first ) {
3204                         first = false;
3205                         writeDomainIdsToHtml( out,
3206                                               domain_0,
3207                                               domain_1,
3208                                               prefix_for_html,
3209                                               domain_id_to_secondary_features_maps );
3210                     }
3211                     else {
3212                         out.write( "<td></td>" );
3213                     }
3214                     if ( !go_id_to_term_map.containsKey( go_id ) ) {
3215                         throw new IllegalArgumentException( "GO-id [" + go_id + "] not found in GO-id to GO-term map" );
3216                     }
3217                     final GoTerm go_term = go_id_to_term_map.get( go_id );
3218                     if ( ( go_namespace_limit == null ) || go_namespace_limit.equals( go_term.getGoNameSpace() ) ) {
3219                         // final String top = GoUtils.getPenultimateGoTerm( go_term, go_id_to_term_map ).getName();
3220                         final String go_id_str = go_id.getId();
3221                         out.write( "<td>" );
3222                         out.write( "<a href=\"" + SurfacingConstants.AMIGO_LINK + go_id_str
3223                                 + "\" target=\"amigo_window\">" + go_id_str + "</a>" );
3224                         out.write( "</td><td>" );
3225                         out.write( go_term.getName() );
3226                         if ( domain_count == 2 ) {
3227                             out.write( " (" + d + ")" );
3228                         }
3229                         out.write( "</td><td>" );
3230                         // out.write( top );
3231                         // out.write( "</td><td>" );
3232                         out.write( "[" );
3233                         out.write( go_term.getGoNameSpace().toShortString() );
3234                         out.write( "]" );
3235                         out.write( "</td>" );
3236                         if ( all_go_ids != null ) {
3237                             all_go_ids.add( go_id );
3238                         }
3239                     }
3240                     else {
3241                         out.write( "<td>" );
3242                         out.write( "</td><td>" );
3243                         out.write( "</td><td>" );
3244                         out.write( "</td><td>" );
3245                         out.write( "</td>" );
3246                     }
3247                     out.write( "</tr>" );
3248                     out.write( SurfacingConstants.NL );
3249                 }
3250             }
3251         } //  for( int d = 0; d < domain_count; ++d ) 
3252         if ( !any_go_annotation_present ) {
3253             out.write( "<tr>" );
3254             writeDomainIdsToHtml( out, domain_0, domain_1, prefix_for_html, domain_id_to_secondary_features_maps );
3255             out.write( "<td>" );
3256             out.write( "</td><td>" );
3257             out.write( "</td><td>" );
3258             out.write( "</td><td>" );
3259             out.write( "</td>" );
3260             out.write( "</tr>" );
3261             out.write( SurfacingConstants.NL );
3262         }
3263     }
3264
3265     private static void writeDomainIdsToHtml( final Writer out,
3266                                               final String domain_0,
3267                                               final String domain_1,
3268                                               final String prefix_for_detailed_html,
3269                                               final Map<String, Set<String>>[] domain_id_to_secondary_features_maps )
3270             throws IOException {
3271         out.write( "<td>" );
3272         if ( !ForesterUtil.isEmpty( prefix_for_detailed_html ) ) {
3273             out.write( prefix_for_detailed_html );
3274             out.write( " " );
3275         }
3276         out.write( "<a href=\"" + SurfacingConstants.PFAM_FAMILY_ID_LINK + domain_0 + "\">" + domain_0 + "</a>" );
3277         out.write( "</td>" );
3278     }
3279
3280     private static void writeDomainsToIndividualFilePerTreeNode( final Writer individual_files_writer,
3281                                                                  final String domain_0,
3282                                                                  final String domain_1 ) throws IOException {
3283         individual_files_writer.write( domain_0 );
3284         individual_files_writer.write( ForesterUtil.LINE_SEPARATOR );
3285         if ( !ForesterUtil.isEmpty( domain_1 ) ) {
3286             individual_files_writer.write( domain_1 );
3287             individual_files_writer.write( ForesterUtil.LINE_SEPARATOR );
3288         }
3289     }
3290
3291     private static void writePfamsToFile( final String outfile_name, final SortedSet<String> pfams ) {
3292         try {
3293             final Writer writer = new BufferedWriter( new FileWriter( new File( outfile_name ) ) );
3294             for( final String pfam : pfams ) {
3295                 writer.write( pfam );
3296                 writer.write( ForesterUtil.LINE_SEPARATOR );
3297             }
3298             writer.close();
3299             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote " + pfams.size() + " pfams to [" + outfile_name
3300                     + "]" );
3301         }
3302         catch ( final IOException e ) {
3303             ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e );
3304         }
3305     }
3306
3307     private static void writeToNexus( final String outfile_name,
3308                                       final CharacterStateMatrix<BinaryStates> matrix,
3309                                       final Phylogeny phylogeny ) {
3310         if ( !( matrix instanceof BasicCharacterStateMatrix ) ) {
3311             throw new IllegalArgumentException( "can only write matrices of type [" + BasicCharacterStateMatrix.class
3312                     + "] to nexus" );
3313         }
3314         final BasicCharacterStateMatrix<BinaryStates> my_matrix = ( org.forester.evoinference.matrix.character.BasicCharacterStateMatrix<BinaryStates> ) matrix;
3315         final List<Phylogeny> phylogenies = new ArrayList<Phylogeny>( 1 );
3316         phylogenies.add( phylogeny );
3317         try {
3318             final BufferedWriter w = new BufferedWriter( new FileWriter( outfile_name ) );
3319             w.write( NexusConstants.NEXUS );
3320             w.write( ForesterUtil.LINE_SEPARATOR );
3321             my_matrix.writeNexusTaxaBlock( w );
3322             my_matrix.writeNexusBinaryChractersBlock( w );
3323             PhylogenyWriter.writeNexusTreesBlock( w, phylogenies, NH_CONVERSION_SUPPORT_VALUE_STYLE.NONE );
3324             w.flush();
3325             w.close();
3326             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote Nexus file: \"" + outfile_name + "\"" );
3327         }
3328         catch ( final IOException e ) {
3329             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
3330         }
3331     }
3332
3333     private static void writeToNexus( final String outfile_name,
3334                                       final DomainParsimonyCalculator domain_parsimony,
3335                                       final Phylogeny phylogeny ) {
3336         writeToNexus( outfile_name + surfacing.NEXUS_EXTERNAL_DOMAINS,
3337                       domain_parsimony.createMatrixOfDomainPresenceOrAbsence(),
3338                       phylogeny );
3339         writeToNexus( outfile_name + surfacing.NEXUS_EXTERNAL_DOMAIN_COMBINATIONS,
3340                       domain_parsimony.createMatrixOfBinaryDomainCombinationPresenceOrAbsence(),
3341                       phylogeny );
3342     }
3343
3344     final static class DomainComparator implements Comparator<Domain> {
3345
3346         final private boolean _ascending;
3347
3348         public DomainComparator( final boolean ascending ) {
3349             _ascending = ascending;
3350         }
3351
3352         @Override
3353         public final int compare( final Domain d0, final Domain d1 ) {
3354             if ( d0.getFrom() < d1.getFrom() ) {
3355                 return _ascending ? -1 : 1;
3356             }
3357             else if ( d0.getFrom() > d1.getFrom() ) {
3358                 return _ascending ? 1 : -1;
3359             }
3360             return 0;
3361         }
3362     }
3363 }