in progress
[jalview.git] / forester / java / src / org / forester / surfacing / SurfacingUtil.java
1 // $Id:
2 //
3 // FORESTER -- software libraries and applications
4 // for evolutionary biology research and applications.
5 //
6 // Copyright (C) 2008-2009 Christian M. Zmasek
7 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
8 // All rights reserved
9 //
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
14 //
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 //
24 // Contact: phylosoft @ gmail . com
25 // WWW: www.phylosoft.org/forester
26
27 package org.forester.surfacing;
28
29 import java.io.BufferedWriter;
30 import java.io.File;
31 import java.io.FileWriter;
32 import java.io.IOException;
33 import java.io.Writer;
34 import java.text.DecimalFormat;
35 import java.text.NumberFormat;
36 import java.util.ArrayList;
37 import java.util.Collections;
38 import java.util.Comparator;
39 import java.util.HashMap;
40 import java.util.HashSet;
41 import java.util.List;
42 import java.util.Map;
43 import java.util.PriorityQueue;
44 import java.util.Set;
45 import java.util.SortedMap;
46 import java.util.SortedSet;
47 import java.util.TreeMap;
48 import java.util.TreeSet;
49 import java.util.regex.Matcher;
50 import java.util.regex.Pattern;
51
52 import org.forester.application.surfacing;
53 import org.forester.evoinference.distance.NeighborJoining;
54 import org.forester.evoinference.matrix.character.BasicCharacterStateMatrix;
55 import org.forester.evoinference.matrix.character.CharacterStateMatrix;
56 import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates;
57 import org.forester.evoinference.matrix.character.CharacterStateMatrix.Format;
58 import org.forester.evoinference.matrix.character.CharacterStateMatrix.GainLossStates;
59 import org.forester.evoinference.matrix.distance.DistanceMatrix;
60 import org.forester.go.GoId;
61 import org.forester.go.GoNameSpace;
62 import org.forester.go.GoTerm;
63 import org.forester.go.PfamToGoMapping;
64 import org.forester.io.parsers.nexus.NexusConstants;
65 import org.forester.io.writers.PhylogenyWriter;
66 import org.forester.phylogeny.Phylogeny;
67 import org.forester.phylogeny.PhylogenyMethods;
68 import org.forester.phylogeny.PhylogenyNode;
69 import org.forester.phylogeny.data.BinaryCharacters;
70 import org.forester.phylogeny.data.Confidence;
71 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
72 import org.forester.surfacing.DomainSimilarityCalculator.Detailedness;
73 import org.forester.surfacing.DomainSimilarityCalculator.GoAnnotationOutput;
74 import org.forester.surfacing.GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder;
75 import org.forester.util.AsciiHistogram;
76 import org.forester.util.BasicDescriptiveStatistics;
77 import org.forester.util.BasicTable;
78 import org.forester.util.BasicTableParser;
79 import org.forester.util.DescriptiveStatistics;
80 import org.forester.util.ForesterUtil;
81
82 public final class SurfacingUtil {
83
84     private final static NumberFormat       FORMATTER                        = new DecimalFormat( "0.0E0" );
85     private final static NumberFormat       FORMATTER_3                      = new DecimalFormat( "0.000" );
86     private static final Comparator<Domain> ASCENDING_CONFIDENCE_VALUE_ORDER = new Comparator<Domain>() {
87
88                                                                                  @Override
89                                                                                  public int compare( final Domain d1,
90                                                                                                      final Domain d2 ) {
91                                                                                      if ( d1.getPerSequenceEvalue() < d2
92                                                                                              .getPerSequenceEvalue() ) {
93                                                                                          return -1;
94                                                                                      }
95                                                                                      else if ( d1
96                                                                                              .getPerSequenceEvalue() > d2
97                                                                                              .getPerSequenceEvalue() ) {
98                                                                                          return 1;
99                                                                                      }
100                                                                                      else {
101                                                                                          return d1.compareTo( d2 );
102                                                                                      }
103                                                                                  }
104                                                                              };
105     public final static Pattern             PATTERN_SP_STYLE_TAXONOMY        = Pattern.compile( "^[A-Z0-9]{3,5}$" );
106
107     private SurfacingUtil() {
108         // Hidden constructor.
109     }
110
111     public static void addAllBinaryDomainCombinationToSet( final GenomeWideCombinableDomains genome,
112                                                            final SortedSet<BinaryDomainCombination> binary_domain_combinations ) {
113         final SortedMap<DomainId, CombinableDomains> all_cd = genome.getAllCombinableDomainsIds();
114         for( final DomainId domain_id : all_cd.keySet() ) {
115             binary_domain_combinations.addAll( all_cd.get( domain_id ).toBinaryDomainCombinations() );
116         }
117     }
118
119     public static void addAllDomainIdsToSet( final GenomeWideCombinableDomains genome,
120                                              final SortedSet<DomainId> domain_ids ) {
121         final SortedSet<DomainId> domains = genome.getAllDomainIds();
122         for( final DomainId domain : domains ) {
123             domain_ids.add( domain );
124         }
125     }
126
127     public static void addHtmlHead( final Writer w, final String title ) throws IOException {
128         w.write( SurfacingConstants.NL );
129         w.write( "<head>" );
130         w.write( "<title>" );
131         w.write( title );
132         w.write( "</title>" );
133         w.write( SurfacingConstants.NL );
134         w.write( "<style>" );
135         w.write( SurfacingConstants.NL );
136         w.write( "a:visited { color : #6633FF; text-decoration : none; }" );
137         w.write( SurfacingConstants.NL );
138         w.write( "a:link { color : #6633FF; text-decoration : none; }" );
139         w.write( SurfacingConstants.NL );
140         w.write( "a:active { color : #99FF00; text-decoration : none; }" );
141         w.write( SurfacingConstants.NL );
142         w.write( "a:hover { color : #FFFFFF; background-color : #99FF00; text-decoration : none; }" );
143         w.write( SurfacingConstants.NL );
144         w.write( "td { text-align: left; vertical-align: top; font-family: Verdana, Arial, Helvetica; font-size: 8pt}" );
145         w.write( SurfacingConstants.NL );
146         w.write( "h1 { color : #0000FF; font-family: Verdana, Arial, Helvetica; font-size: 18pt; font-weight: bold }" );
147         w.write( SurfacingConstants.NL );
148         w.write( "h2 { color : #0000FF; font-family: Verdana, Arial, Helvetica; font-size: 16pt; font-weight: bold }" );
149         w.write( SurfacingConstants.NL );
150         w.write( "</style>" );
151         w.write( SurfacingConstants.NL );
152         w.write( "</head>" );
153         w.write( SurfacingConstants.NL );
154     }
155
156     public static DescriptiveStatistics calculateDescriptiveStatisticsForMeanValues( final Set<DomainSimilarity> similarities ) {
157         final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
158         for( final DomainSimilarity similarity : similarities ) {
159             stats.addValue( similarity.getMeanSimilarityScore() );
160         }
161         return stats;
162     }
163
164     private static void calculateIndependentDomainCombinationGains( final Phylogeny local_phylogeny_l,
165                                                                     final String outfilename_for_counts,
166                                                                     final String outfilename_for_dc,
167                                                                     final String outfilename_for_dc_for_go_mapping ) {
168         try {
169             final BufferedWriter out_counts = new BufferedWriter( new FileWriter( outfilename_for_counts ) );
170             final BufferedWriter out_dc = new BufferedWriter( new FileWriter( outfilename_for_dc ) );
171             final BufferedWriter out_dc_for_go_mapping = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping ) );
172             final SortedMap<String, Integer> dc_gain_counts = new TreeMap<String, Integer>();
173             for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorPostorder(); it.hasNext(); ) {
174                 final PhylogenyNode n = it.next();
175                 final Set<String> gained_dc = n.getNodeData().getBinaryCharacters().getGainedCharacters();
176                 for( final String dc : gained_dc ) {
177                     if ( dc_gain_counts.containsKey( dc ) ) {
178                         dc_gain_counts.put( dc, dc_gain_counts.get( dc ) + 1 );
179                     }
180                     else {
181                         dc_gain_counts.put( dc, 1 );
182                     }
183                 }
184             }
185             final SortedMap<Integer, Integer> histogram = new TreeMap<Integer, Integer>();
186             final SortedMap<Integer, StringBuilder> domain_lists = new TreeMap<Integer, StringBuilder>();
187             final SortedMap<Integer, PriorityQueue<String>> domain_lists_go = new TreeMap<Integer, PriorityQueue<String>>();
188             final Set<String> dcs = dc_gain_counts.keySet();
189             for( final String dc : dcs ) {
190                 final int count = dc_gain_counts.get( dc );
191                 if ( histogram.containsKey( count ) ) {
192                     histogram.put( count, histogram.get( count ) + 1 );
193                     domain_lists.put( count, domain_lists.get( count ).append( ", " + dc ) );
194                     domain_lists_go.get( count ).add( dc );
195                 }
196                 else {
197                     histogram.put( count, 1 );
198                     domain_lists.put( count, new StringBuilder( dc ) );
199                     final PriorityQueue<String> q = new PriorityQueue<String>();
200                     q.add( dc );
201                     domain_lists_go.put( count, q );
202                 }
203             }
204             final Set<Integer> histogram_keys = histogram.keySet();
205             for( final Integer histogram_key : histogram_keys ) {
206                 final int count = histogram.get( histogram_key );
207                 final StringBuilder dc = domain_lists.get( histogram_key );
208                 out_counts.write( histogram_key + "\t" + count + ForesterUtil.LINE_SEPARATOR );
209                 out_dc.write( histogram_key + "\t" + dc + ForesterUtil.LINE_SEPARATOR );
210             }
211             out_counts.close();
212             out_dc.close();
213             out_dc_for_go_mapping.close();
214         }
215         catch ( final IOException e ) {
216             ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e );
217         }
218         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch counts to ["
219                 + outfilename_for_counts + "]" );
220         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch lists to ["
221                 + outfilename_for_dc + "]" );
222         ForesterUtil.programMessage( surfacing.PRG_NAME,
223                                      "Wrote independent domain combination gains fitch lists to (for GO mapping) ["
224                                              + outfilename_for_dc_for_go_mapping + "]" );
225     }
226
227     public static int calculateOverlap( final Domain domain, final List<Boolean> covered_positions ) {
228         int overlap_count = 0;
229         for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) {
230             if ( ( i < covered_positions.size() ) && ( covered_positions.get( i ) == true ) ) {
231                 ++overlap_count;
232             }
233         }
234         return overlap_count;
235     }
236
237     public static void checkForOutputFileWriteability( final File outfile ) {
238         final String error = ForesterUtil.isWritableFile( outfile );
239         if ( !ForesterUtil.isEmpty( error ) ) {
240             ForesterUtil.fatalError( surfacing.PRG_NAME, error );
241         }
242     }
243
244     private static SortedSet<String> collectAllDomainsChangedOnSubtree( final PhylogenyNode subtree_root,
245                                                                         final boolean get_gains ) {
246         final SortedSet<String> domains = new TreeSet<String>();
247         for( final PhylogenyNode descendant : PhylogenyMethods.getAllDescendants( subtree_root ) ) {
248             final BinaryCharacters chars = descendant.getNodeData().getBinaryCharacters();
249             if ( get_gains ) {
250                 domains.addAll( chars.getGainedCharacters() );
251             }
252             else {
253                 domains.addAll( chars.getLostCharacters() );
254             }
255         }
256         return domains;
257     }
258
259     public static void collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( final CharacterStateMatrix<CharacterStateMatrix.GainLossStates> matrix,
260                                                                                            final BinaryDomainCombination.DomainCombinationType dc_type,
261                                                                                            final List<BinaryDomainCombination> all_binary_domains_combination_gained,
262                                                                                            final boolean get_gains ) {
263         final SortedSet<String> sorted_ids = new TreeSet<String>();
264         for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) {
265             sorted_ids.add( matrix.getIdentifier( i ) );
266         }
267         for( final String id : sorted_ids ) {
268             for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) {
269                 if ( ( get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) )
270                         || ( !get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.LOSS ) ) ) {
271                     if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT ) {
272                         all_binary_domains_combination_gained.add( AdjactantDirectedBinaryDomainCombination
273                                 .createInstance( matrix.getCharacter( c ) ) );
274                     }
275                     else if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED ) {
276                         all_binary_domains_combination_gained.add( DirectedBinaryDomainCombination
277                                 .createInstance( matrix.getCharacter( c ) ) );
278                     }
279                     else {
280                         all_binary_domains_combination_gained.add( BasicBinaryDomainCombination.createInstance( matrix
281                                 .getCharacter( c ) ) );
282                     }
283                 }
284             }
285         }
286     }
287
288     private static File createBaseDirForPerNodeDomainFiles( final String base_dir,
289                                                             final boolean domain_combinations,
290                                                             final CharacterStateMatrix.GainLossStates state,
291                                                             final String outfile ) {
292         File per_node_go_mapped_domain_gain_loss_files_base_dir = new File( new File( outfile ).getParent()
293                 + ForesterUtil.FILE_SEPARATOR + base_dir );
294         if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) {
295             per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir();
296         }
297         if ( domain_combinations ) {
298             per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
299                     + ForesterUtil.FILE_SEPARATOR + "DC" );
300         }
301         else {
302             per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
303                     + ForesterUtil.FILE_SEPARATOR + "DOMAINS" );
304         }
305         if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) {
306             per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir();
307         }
308         if ( state == GainLossStates.GAIN ) {
309             per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
310                     + ForesterUtil.FILE_SEPARATOR + "GAINS" );
311         }
312         else if ( state == GainLossStates.LOSS ) {
313             per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
314                     + ForesterUtil.FILE_SEPARATOR + "LOSSES" );
315         }
316         else {
317             per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
318                     + ForesterUtil.FILE_SEPARATOR + "PRESENT" );
319         }
320         if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) {
321             per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir();
322         }
323         return per_node_go_mapped_domain_gain_loss_files_base_dir;
324     }
325
326     public static Map<DomainId, List<GoId>> createDomainIdToGoIdMap( final List<PfamToGoMapping> pfam_to_go_mappings ) {
327         final Map<DomainId, List<GoId>> domain_id_to_go_ids_map = new HashMap<DomainId, List<GoId>>( pfam_to_go_mappings
328                 .size() );
329         for( final PfamToGoMapping pfam_to_go : pfam_to_go_mappings ) {
330             if ( !domain_id_to_go_ids_map.containsKey( pfam_to_go.getKey() ) ) {
331                 domain_id_to_go_ids_map.put( pfam_to_go.getKey(), new ArrayList<GoId>() );
332             }
333             domain_id_to_go_ids_map.get( pfam_to_go.getKey() ).add( pfam_to_go.getValue() );
334         }
335         return domain_id_to_go_ids_map;
336     }
337
338     public static Map<DomainId, Set<String>> createDomainIdToSecondaryFeaturesMap( final File secondary_features_map_file )
339             throws IOException {
340         final BasicTable<String> primary_table = BasicTableParser.parse( secondary_features_map_file, "\t" );
341         final Map<DomainId, Set<String>> map = new TreeMap<DomainId, Set<String>>();
342         for( int r = 0; r < primary_table.getNumberOfRows(); ++r ) {
343             final DomainId domain_id = new DomainId( primary_table.getValue( 0, r ) );
344             if ( !map.containsKey( domain_id ) ) {
345                 map.put( domain_id, new HashSet<String>() );
346             }
347             map.get( domain_id ).add( primary_table.getValue( 1, r ) );
348         }
349         return map;
350     }
351
352     public static Phylogeny createNjTreeBasedOnMatrixToFile( final File nj_tree_outfile, final DistanceMatrix distance ) {
353         checkForOutputFileWriteability( nj_tree_outfile );
354         final NeighborJoining nj = NeighborJoining.createInstance();
355         final Phylogeny phylogeny = nj.execute( distance );
356         phylogeny.setName( nj_tree_outfile.getName() );
357         writePhylogenyToFile( phylogeny, nj_tree_outfile.toString() );
358         return phylogeny;
359     }
360
361     private static SortedSet<BinaryDomainCombination> createSetOfAllBinaryDomainCombinationsPerGenome( final GenomeWideCombinableDomains gwcd ) {
362         final SortedMap<DomainId, CombinableDomains> cds = gwcd.getAllCombinableDomainsIds();
363         final SortedSet<BinaryDomainCombination> binary_combinations = new TreeSet<BinaryDomainCombination>();
364         for( final DomainId domain_id : cds.keySet() ) {
365             final CombinableDomains cd = cds.get( domain_id );
366             binary_combinations.addAll( cd.toBinaryDomainCombinations() );
367         }
368         return binary_combinations;
369     }
370
371     public static void decoratePrintableDomainSimilarities( final SortedSet<DomainSimilarity> domain_similarities,
372                                                             final Detailedness detailedness,
373                                                             final GoAnnotationOutput go_annotation_output,
374                                                             final Map<GoId, GoTerm> go_id_to_term_map,
375                                                             final GoNameSpace go_namespace_limit ) {
376         if ( ( go_namespace_limit != null ) && ( ( go_id_to_term_map == null ) || go_id_to_term_map.isEmpty() ) ) {
377             throw new IllegalArgumentException( "attempt to use a GO namespace limit without a GO id to term map" );
378         }
379         for( final DomainSimilarity domain_similarity : domain_similarities ) {
380             if ( domain_similarity instanceof PrintableDomainSimilarity ) {
381                 final PrintableDomainSimilarity printable_domain_similarity = ( PrintableDomainSimilarity ) domain_similarity;
382                 printable_domain_similarity.setDetailedness( detailedness );
383                 printable_domain_similarity.setGoAnnotationOutput( go_annotation_output );
384                 printable_domain_similarity.setGoIdToTermMap( go_id_to_term_map );
385                 printable_domain_similarity.setGoNamespaceLimit( go_namespace_limit );
386             }
387         }
388     }
389
390     public static void executeDomainLengthAnalysis( final String[][] input_file_properties,
391                                                     final int number_of_genomes,
392                                                     final DomainLengthsTable domain_lengths_table,
393                                                     final File outfile ) throws IOException {
394         final DecimalFormat df = new DecimalFormat( "#.00" );
395         checkForOutputFileWriteability( outfile );
396         final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
397         out.write( "MEAN BASED STATISTICS PER SPECIES" );
398         out.write( ForesterUtil.LINE_SEPARATOR );
399         out.write( domain_lengths_table.createMeanBasedStatisticsPerSpeciesTable().toString() );
400         out.write( ForesterUtil.LINE_SEPARATOR );
401         out.write( ForesterUtil.LINE_SEPARATOR );
402         final List<DomainLengths> domain_lengths_list = domain_lengths_table.getDomainLengthsList();
403         out.write( "OUTLIER SPECIES PER DOMAIN (Z>=1.5)" );
404         out.write( ForesterUtil.LINE_SEPARATOR );
405         for( final DomainLengths domain_lengths : domain_lengths_list ) {
406             final List<Species> species_list = domain_lengths.getMeanBasedOutlierSpecies( 1.5 );
407             if ( species_list.size() > 0 ) {
408                 out.write( domain_lengths.getDomainId() + "\t" );
409                 for( final Species species : species_list ) {
410                     out.write( species + "\t" );
411                 }
412                 out.write( ForesterUtil.LINE_SEPARATOR );
413                 // DescriptiveStatistics stats_for_domain = domain_lengths
414                 //         .calculateMeanBasedStatistics();
415                 //AsciiHistogram histo = new AsciiHistogram( stats_for_domain );
416                 //System.out.println( histo.toStringBuffer( 40, '=', 60, 4 ).toString() );
417             }
418         }
419         out.write( ForesterUtil.LINE_SEPARATOR );
420         out.write( ForesterUtil.LINE_SEPARATOR );
421         out.write( "OUTLIER SPECIES (Z 1.0)" );
422         out.write( ForesterUtil.LINE_SEPARATOR );
423         final DescriptiveStatistics stats_for_all_species = domain_lengths_table
424                 .calculateMeanBasedStatisticsForAllSpecies();
425         out.write( stats_for_all_species.asSummary() );
426         out.write( ForesterUtil.LINE_SEPARATOR );
427         final AsciiHistogram histo = new AsciiHistogram( stats_for_all_species );
428         out.write( histo.toStringBuffer( 40, '=', 60, 4 ).toString() );
429         out.write( ForesterUtil.LINE_SEPARATOR );
430         final double population_sd = stats_for_all_species.sampleStandardDeviation();
431         final double population_mean = stats_for_all_species.arithmeticMean();
432         for( final Species species : domain_lengths_table.getSpecies() ) {
433             final double x = domain_lengths_table.calculateMeanBasedStatisticsForSpecies( species ).arithmeticMean();
434             final double z = ( x - population_mean ) / population_sd;
435             out.write( species + "\t" + z );
436             out.write( ForesterUtil.LINE_SEPARATOR );
437         }
438         out.write( ForesterUtil.LINE_SEPARATOR );
439         for( final Species species : domain_lengths_table.getSpecies() ) {
440             final DescriptiveStatistics stats_for_species = domain_lengths_table
441                     .calculateMeanBasedStatisticsForSpecies( species );
442             final double x = stats_for_species.arithmeticMean();
443             final double z = ( x - population_mean ) / population_sd;
444             if ( ( z <= -1.0 ) || ( z >= 1.0 ) ) {
445                 out.write( species + "\t" + df.format( z ) + "\t" + stats_for_species.asSummary() );
446                 out.write( ForesterUtil.LINE_SEPARATOR );
447             }
448         }
449         out.close();
450         //        final List<HistogramData> histogram_datas = new ArrayList<HistogramData>();
451         //        for( int i = 0; i < number_of_genomes; ++i ) {
452         //            final Species species = new BasicSpecies( input_file_properties[ i ][ 0 ] );
453         //            histogram_datas
454         //                    .add( new HistogramData( species.toString(), domain_lengths_table
455         //                            .calculateMeanBasedStatisticsForSpecies( species )
456         //                            .getDataAsDoubleArray(), 5, 600, null, 60 ) );
457         //        }
458         //        final HistogramsFrame hf = new HistogramsFrame( histogram_datas );
459         //        hf.setVisible( true );
460         System.gc();
461     }
462
463     /**
464      * 
465      * @param all_binary_domains_combination_lost_fitch 
466      * @param consider_directedness_and_adjacency_for_bin_combinations 
467      * @param all_binary_domains_combination_gained if null ignored, otherwise this is to list all binary domain combinations
468      * which were gained under unweighted (Fitch) parsimony.
469      */
470     public static void executeParsimonyAnalysis( final long random_number_seed_for_fitch_parsimony,
471                                                  final boolean radomize_fitch_parsimony,
472                                                  final String outfile_name,
473                                                  final DomainParsimonyCalculator domain_parsimony,
474                                                  final Phylogeny phylogeny,
475                                                  final Map<DomainId, List<GoId>> domain_id_to_go_ids_map,
476                                                  final Map<GoId, GoTerm> go_id_to_term_map,
477                                                  final GoNameSpace go_namespace_limit,
478                                                  final String parameters_str,
479                                                  final Map<DomainId, Set<String>>[] domain_id_to_secondary_features_maps,
480                                                  final SortedSet<DomainId> positive_filter,
481                                                  final boolean output_binary_domain_combinations_for_graphs,
482                                                  final List<BinaryDomainCombination> all_binary_domains_combination_gained_fitch,
483                                                  final List<BinaryDomainCombination> all_binary_domains_combination_lost_fitch,
484                                                  final BinaryDomainCombination.DomainCombinationType dc_type ) {
485         final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR;
486         final String date_time = ForesterUtil.getCurrentDateTime();
487         final SortedSet<String> all_pfams_encountered = new TreeSet<String>();
488         final SortedSet<String> all_pfams_gained_as_domains = new TreeSet<String>();
489         final SortedSet<String> all_pfams_lost_as_domains = new TreeSet<String>();
490         final SortedSet<String> all_pfams_gained_as_dom_combinations = new TreeSet<String>();
491         final SortedSet<String> all_pfams_lost_as_dom_combinations = new TreeSet<String>();
492         writeToNexus( outfile_name, domain_parsimony, phylogeny );
493         // DOLLO DOMAINS
494         // -------------
495         Phylogeny local_phylogeny_l = phylogeny.copy();
496         if ( ( positive_filter != null ) && ( positive_filter.size() > 0 ) ) {
497             domain_parsimony.executeDolloParsimonyOnDomainPresence( positive_filter );
498         }
499         else {
500             domain_parsimony.executeDolloParsimonyOnDomainPresence();
501         }
502         SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossMatrix(), outfile_name
503                 + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_DOMAINS, Format.FORESTER );
504         SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossCountsMatrix(), outfile_name
505                 + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_DOMAINS, Format.FORESTER );
506         SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
507                                                            CharacterStateMatrix.GainLossStates.GAIN,
508                                                            outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_D,
509                                                            sep,
510                                                            ForesterUtil.LINE_SEPARATOR,
511                                                            null );
512         SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
513                                                            CharacterStateMatrix.GainLossStates.LOSS,
514                                                            outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_LOSSES_D,
515                                                            sep,
516                                                            ForesterUtil.LINE_SEPARATOR,
517                                                            null );
518         SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), null, outfile_name
519                 + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_D, sep, ForesterUtil.LINE_SEPARATOR, null );
520         //HTML:
521         writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
522                                        go_id_to_term_map,
523                                        go_namespace_limit,
524                                        false,
525                                        domain_parsimony.getGainLossMatrix(),
526                                        CharacterStateMatrix.GainLossStates.GAIN,
527                                        outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_HTML_D,
528                                        sep,
529                                        ForesterUtil.LINE_SEPARATOR,
530                                        "Dollo Parsimony | Gains | Domains",
531                                        "+",
532                                        domain_id_to_secondary_features_maps,
533                                        all_pfams_encountered,
534                                        all_pfams_gained_as_domains,
535                                        "_dollo_gains_d" );
536         writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
537                                        go_id_to_term_map,
538                                        go_namespace_limit,
539                                        false,
540                                        domain_parsimony.getGainLossMatrix(),
541                                        CharacterStateMatrix.GainLossStates.LOSS,
542                                        outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_LOSSES_HTML_D,
543                                        sep,
544                                        ForesterUtil.LINE_SEPARATOR,
545                                        "Dollo Parsimony | Losses | Domains",
546                                        "-",
547                                        domain_id_to_secondary_features_maps,
548                                        all_pfams_encountered,
549                                        all_pfams_lost_as_domains,
550                                        "_dollo_losses_d" );
551         writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
552                                        go_id_to_term_map,
553                                        go_namespace_limit,
554                                        false,
555                                        domain_parsimony.getGainLossMatrix(),
556                                        null,
557                                        outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_HTML_D,
558                                        sep,
559                                        ForesterUtil.LINE_SEPARATOR,
560                                        "Dollo Parsimony | Present | Domains",
561                                        "",
562                                        domain_id_to_secondary_features_maps,
563                                        all_pfams_encountered,
564                                        null,
565                                        "_dollo_present_d" );
566         preparePhylogeny( local_phylogeny_l,
567                           domain_parsimony,
568                           date_time,
569                           "Dollo parsimony on domain presence/absence",
570                           "dollo_on_domains_" + outfile_name,
571                           parameters_str );
572         SurfacingUtil.writePhylogenyToFile( local_phylogeny_l, outfile_name
573                 + surfacing.DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO );
574         try {
575             writeAllDomainsChangedOnAllSubtrees( local_phylogeny_l, true, outfile_name, "_dollo_all_gains_d" );
576             writeAllDomainsChangedOnAllSubtrees( local_phylogeny_l, false, outfile_name, "_dollo_all_losses_d" );
577         }
578         catch ( final IOException e ) {
579             e.printStackTrace();
580             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
581         }
582         if ( domain_parsimony.calculateNumberOfBinaryDomainCombination() > 0 ) {
583             // FITCH DOMAIN COMBINATIONS
584             // -------------------------
585             local_phylogeny_l = phylogeny.copy();
586             String randomization = "no";
587             if ( radomize_fitch_parsimony ) {
588                 domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( random_number_seed_for_fitch_parsimony );
589                 randomization = "yes, seed = " + random_number_seed_for_fitch_parsimony;
590             }
591             else {
592                 domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( false );
593             }
594             SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossMatrix(), outfile_name
595                     + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER );
596             SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossCountsMatrix(), outfile_name
597                     + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER );
598             SurfacingUtil
599                     .writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
600                                                           CharacterStateMatrix.GainLossStates.GAIN,
601                                                           outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_GAINS_BC,
602                                                           sep,
603                                                           ForesterUtil.LINE_SEPARATOR,
604                                                           null );
605             SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
606                                                                CharacterStateMatrix.GainLossStates.LOSS,
607                                                                outfile_name
608                                                                        + surfacing.PARSIMONY_OUTPUT_FITCH_LOSSES_BC,
609                                                                sep,
610                                                                ForesterUtil.LINE_SEPARATOR,
611                                                                null );
612             SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), null, outfile_name
613                     + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_BC, sep, ForesterUtil.LINE_SEPARATOR, null );
614             if ( all_binary_domains_combination_gained_fitch != null ) {
615                 collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
616                                                                                     dc_type,
617                                                                                     all_binary_domains_combination_gained_fitch,
618                                                                                     true );
619             }
620             if ( all_binary_domains_combination_lost_fitch != null ) {
621                 collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
622                                                                                     dc_type,
623                                                                                     all_binary_domains_combination_lost_fitch,
624                                                                                     false );
625             }
626             if ( output_binary_domain_combinations_for_graphs ) {
627                 SurfacingUtil
628                         .writeBinaryStatesMatrixAsListToFileForBinaryCombinationsForGraphAnalysis( domain_parsimony
629                                                                                                            .getGainLossMatrix(),
630                                                                                                    null,
631                                                                                                    outfile_name
632                                                                                                            + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_BC_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS,
633                                                                                                    sep,
634                                                                                                    ForesterUtil.LINE_SEPARATOR,
635                                                                                                    BinaryDomainCombination.OutputFormat.DOT );
636             }
637             // HTML:
638             writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
639                                            go_id_to_term_map,
640                                            go_namespace_limit,
641                                            true,
642                                            domain_parsimony.getGainLossMatrix(),
643                                            CharacterStateMatrix.GainLossStates.GAIN,
644                                            outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_GAINS_HTML_BC,
645                                            sep,
646                                            ForesterUtil.LINE_SEPARATOR,
647                                            "Fitch Parsimony | Gains | Domain Combinations",
648                                            "+",
649                                            null,
650                                            all_pfams_encountered,
651                                            all_pfams_gained_as_dom_combinations,
652                                            "_fitch_gains_dc" );
653             writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
654                                            go_id_to_term_map,
655                                            go_namespace_limit,
656                                            true,
657                                            domain_parsimony.getGainLossMatrix(),
658                                            CharacterStateMatrix.GainLossStates.LOSS,
659                                            outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_LOSSES_HTML_BC,
660                                            sep,
661                                            ForesterUtil.LINE_SEPARATOR,
662                                            "Fitch Parsimony | Losses | Domain Combinations",
663                                            "-",
664                                            null,
665                                            all_pfams_encountered,
666                                            all_pfams_lost_as_dom_combinations,
667                                            "_fitch_losses_dc" );
668             writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
669                                            go_id_to_term_map,
670                                            go_namespace_limit,
671                                            true,
672                                            domain_parsimony.getGainLossMatrix(),
673                                            null,
674                                            outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_HTML_BC,
675                                            sep,
676                                            ForesterUtil.LINE_SEPARATOR,
677                                            "Fitch Parsimony | Present | Domain Combinations",
678                                            "",
679                                            null,
680                                            all_pfams_encountered,
681                                            null,
682                                            "_fitch_present_dc" );
683             writeAllEncounteredPfamsToFile( domain_id_to_go_ids_map,
684                                             go_id_to_term_map,
685                                             outfile_name,
686                                             all_pfams_encountered );
687             writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_GAINED_AS_DOMAINS_SUFFIX, all_pfams_gained_as_domains );
688             writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_LOST_AS_DOMAINS_SUFFIX, all_pfams_lost_as_domains );
689             writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_GAINED_AS_DC_SUFFIX,
690                               all_pfams_gained_as_dom_combinations );
691             writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_LOST_AS_DC_SUFFIX, all_pfams_lost_as_dom_combinations );
692             preparePhylogeny( local_phylogeny_l,
693                               domain_parsimony,
694                               date_time,
695                               "Fitch parsimony on binary domain combination presence/absence randomization: "
696                                       + randomization,
697                               "fitch_on_binary_domain_combinations_" + outfile_name,
698                               parameters_str );
699             SurfacingUtil.writePhylogenyToFile( local_phylogeny_l, outfile_name
700                     + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH );
701             calculateIndependentDomainCombinationGains( local_phylogeny_l, outfile_name
702                     + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX, outfile_name
703                     + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX, outfile_name
704                     + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX );
705         }
706     }
707
708     public static void executeParsimonyAnalysisForSecondaryFeatures( final String outfile_name,
709                                                                      final DomainParsimonyCalculator secondary_features_parsimony,
710                                                                      final Phylogeny phylogeny,
711                                                                      final String parameters_str,
712                                                                      final Map<Species, MappingResults> mapping_results_map ) {
713         final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR;
714         final String date_time = ForesterUtil.getCurrentDateTime();
715         System.out.println();
716         writeToNexus( outfile_name + surfacing.NEXUS_SECONDARY_FEATURES,
717                       secondary_features_parsimony.createMatrixOfSecondaryFeaturePresenceOrAbsence( null ),
718                       phylogeny );
719         final Phylogeny local_phylogeny_copy = phylogeny.copy();
720         secondary_features_parsimony.executeDolloParsimonyOnSecondaryFeatures( mapping_results_map );
721         SurfacingUtil.writeMatrixToFile( secondary_features_parsimony.getGainLossMatrix(), outfile_name
722                 + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER );
723         SurfacingUtil.writeMatrixToFile( secondary_features_parsimony.getGainLossCountsMatrix(), outfile_name
724                 + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER );
725         SurfacingUtil
726                 .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(),
727                                                       CharacterStateMatrix.GainLossStates.GAIN,
728                                                       outfile_name
729                                                               + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_SECONDARY_FEATURES,
730                                                       sep,
731                                                       ForesterUtil.LINE_SEPARATOR,
732                                                       null );
733         SurfacingUtil
734                 .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(),
735                                                       CharacterStateMatrix.GainLossStates.LOSS,
736                                                       outfile_name
737                                                               + surfacing.PARSIMONY_OUTPUT_DOLLO_LOSSES_SECONDARY_FEATURES,
738                                                       sep,
739                                                       ForesterUtil.LINE_SEPARATOR,
740                                                       null );
741         SurfacingUtil
742                 .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(),
743                                                       null,
744                                                       outfile_name
745                                                               + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_SECONDARY_FEATURES,
746                                                       sep,
747                                                       ForesterUtil.LINE_SEPARATOR,
748                                                       null );
749         preparePhylogeny( local_phylogeny_copy,
750                           secondary_features_parsimony,
751                           date_time,
752                           "Dollo parsimony on secondary feature presence/absence",
753                           "dollo_on_secondary_features_" + outfile_name,
754                           parameters_str );
755         SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name
756                 + surfacing.SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO );
757     }
758
759     public static void extractProteinNames( final List<Protein> proteins,
760                                             final List<DomainId> query_domain_ids_nc_order,
761                                             final Writer out,
762                                             final String separator ) throws IOException {
763         for( final Protein protein : proteins ) {
764             if ( protein.contains( query_domain_ids_nc_order, true ) ) {
765                 out.write( protein.getSpecies().getSpeciesId() );
766                 out.write( separator );
767                 out.write( protein.getProteinId().getId() );
768                 out.write( separator );
769                 out.write( "[" );
770                 final Set<DomainId> visited_domain_ids = new HashSet<DomainId>();
771                 boolean first = true;
772                 for( final Domain domain : protein.getProteinDomains() ) {
773                     if ( !visited_domain_ids.contains( domain.getDomainId() ) ) {
774                         visited_domain_ids.add( domain.getDomainId() );
775                         if ( first ) {
776                             first = false;
777                         }
778                         else {
779                             out.write( " " );
780                         }
781                         out.write( domain.getDomainId().getId() );
782                         out.write( " {" );
783                         out.write( "" + domain.getTotalCount() );
784                         out.write( "}" );
785                     }
786                 }
787                 out.write( "]" );
788                 out.write( separator );
789                 if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription()
790                         .equals( SurfacingConstants.NONE ) ) ) {
791                     out.write( protein.getDescription() );
792                 }
793                 out.write( separator );
794                 if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession()
795                         .equals( SurfacingConstants.NONE ) ) ) {
796                     out.write( protein.getAccession() );
797                 }
798                 out.write( SurfacingConstants.NL );
799             }
800         }
801         out.flush();
802     }
803
804     public static void extractProteinNames( final SortedMap<Species, List<Protein>> protein_lists_per_species,
805                                             final DomainId domain_id,
806                                             final Writer out,
807                                             final String separator ) throws IOException {
808         for( final Species species : protein_lists_per_species.keySet() ) {
809             for( final Protein protein : protein_lists_per_species.get( species ) ) {
810                 final List<Domain> domains = protein.getProteinDomains( domain_id );
811                 if ( domains.size() > 0 ) {
812                     final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
813                     for( final Domain domain : domains ) {
814                         stats.addValue( domain.getPerSequenceEvalue() );
815                     }
816                     out.write( protein.getSpecies().getSpeciesId() );
817                     out.write( separator );
818                     out.write( protein.getProteinId().getId() );
819                     out.write( separator );
820                     out.write( "[" + FORMATTER.format( stats.median() ) + "]" );
821                     out.write( separator );
822                     if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription()
823                             .equals( SurfacingConstants.NONE ) ) ) {
824                         out.write( protein.getDescription() );
825                     }
826                     out.write( separator );
827                     if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession()
828                             .equals( SurfacingConstants.NONE ) ) ) {
829                         out.write( protein.getAccession() );
830                     }
831                     out.write( SurfacingConstants.NL );
832                 }
833             }
834         }
835         out.flush();
836     }
837
838     public static SortedSet<DomainId> getAllDomainIds( final List<GenomeWideCombinableDomains> gwcd_list ) {
839         final SortedSet<DomainId> all_domains_ids = new TreeSet<DomainId>();
840         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
841             final Set<DomainId> all_domains = gwcd.getAllDomainIds();
842             //    for( final Domain domain : all_domains ) {
843             all_domains_ids.addAll( all_domains );
844             //    }
845         }
846         return all_domains_ids;
847     }
848
849     public static SortedMap<String, Integer> getDomainCounts( final List<Protein> protein_domain_collections ) {
850         final SortedMap<String, Integer> map = new TreeMap<String, Integer>();
851         for( final Protein protein_domain_collection : protein_domain_collections ) {
852             for( final Object name : protein_domain_collection.getProteinDomains() ) {
853                 final BasicDomain protein_domain = ( BasicDomain ) name;
854                 final String id = protein_domain.getDomainId().getId();
855                 if ( map.containsKey( id ) ) {
856                     map.put( id, map.get( id ) + 1 );
857                 }
858                 else {
859                     map.put( id, 1 );
860                 }
861             }
862         }
863         return map;
864     }
865
866     public static int getNumberOfNodesLackingName( final Phylogeny p, final StringBuilder names ) {
867         final PhylogenyNodeIterator it = p.iteratorPostorder();
868         int c = 0;
869         while ( it.hasNext() ) {
870             final PhylogenyNode n = it.next();
871             if ( ForesterUtil.isEmpty( n.getName() )
872                     && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy()
873                             .getScientificName() ) ) ) {
874                 if ( n.getParent() != null ) {
875                     names.append( " " );
876                     names.append( n.getParent().getName() );
877                 }
878                 ++c;
879             }
880         }
881         return c;
882     }
883
884     /**
885      * Returns true is Domain domain falls in an uninterrupted stretch of
886      * covered positions.
887      * 
888      * @param domain
889      * @param covered_positions
890      * @return
891      */
892     public static boolean isEngulfed( final Domain domain, final List<Boolean> covered_positions ) {
893         for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) {
894             if ( ( i >= covered_positions.size() ) || ( covered_positions.get( i ) != true ) ) {
895                 return false;
896             }
897         }
898         return true;
899     }
900
901     public static void preparePhylogeny( final Phylogeny p,
902                                          final DomainParsimonyCalculator domain_parsimony,
903                                          final String date_time,
904                                          final String method,
905                                          final String name,
906                                          final String parameters_str ) {
907         domain_parsimony.decoratePhylogenyWithDomains( p );
908         final StringBuilder desc = new StringBuilder();
909         desc.append( "[Method: " + method + "] [Date: " + date_time + "] " );
910         desc.append( "[Cost: " + domain_parsimony.getCost() + "] " );
911         desc.append( "[Gains: " + domain_parsimony.getTotalGains() + "] " );
912         desc.append( "[Losses: " + domain_parsimony.getTotalLosses() + "] " );
913         desc.append( "[Unchanged: " + domain_parsimony.getTotalUnchanged() + "] " );
914         desc.append( "[Parameters: " + parameters_str + "]" );
915         p.setName( name );
916         p.setDescription( desc.toString() );
917         p.setConfidence( new Confidence( domain_parsimony.getCost(), "parsimony" ) );
918         p.setRerootable( false );
919         p.setRooted( true );
920     }
921
922     /*
923      * species | protein id | n-terminal domain | c-terminal domain | n-terminal domain per domain E-value | c-terminal domain per domain E-value
924      * 
925      * 
926      */
927     static public StringBuffer proteinToDomainCombinations( final Protein protein,
928                                                             final String protein_id,
929                                                             final String separator ) {
930         final StringBuffer sb = new StringBuffer();
931         if ( protein.getSpecies() == null ) {
932             throw new IllegalArgumentException( "species must not be null" );
933         }
934         if ( ForesterUtil.isEmpty( protein.getSpecies().getSpeciesId() ) ) {
935             throw new IllegalArgumentException( "species id must not be empty" );
936         }
937         final List<Domain> domains = protein.getProteinDomains();
938         if ( domains.size() > 1 ) {
939             final Map<String, Integer> counts = new HashMap<String, Integer>();
940             for( final Domain domain : domains ) {
941                 final String id = domain.getDomainId().getId();
942                 if ( counts.containsKey( id ) ) {
943                     counts.put( id, counts.get( id ) + 1 );
944                 }
945                 else {
946                     counts.put( id, 1 );
947                 }
948             }
949             final Set<String> dcs = new HashSet<String>();
950             for( int i = 1; i < domains.size(); ++i ) {
951                 for( int j = 0; j < i; ++j ) {
952                     Domain domain_n = domains.get( i );
953                     Domain domain_c = domains.get( j );
954                     if ( domain_n.getFrom() > domain_c.getFrom() ) {
955                         domain_n = domains.get( j );
956                         domain_c = domains.get( i );
957                     }
958                     final String dc = domain_n.getDomainId().getId() + domain_c.getDomainId().getId();
959                     if ( !dcs.contains( dc ) ) {
960                         dcs.add( dc );
961                         sb.append( protein.getSpecies() );
962                         sb.append( separator );
963                         sb.append( protein_id );
964                         sb.append( separator );
965                         sb.append( domain_n.getDomainId().getId() );
966                         sb.append( separator );
967                         sb.append( domain_c.getDomainId().getId() );
968                         sb.append( separator );
969                         sb.append( domain_n.getPerDomainEvalue() );
970                         sb.append( separator );
971                         sb.append( domain_c.getPerDomainEvalue() );
972                         sb.append( separator );
973                         sb.append( counts.get( domain_n.getDomainId().getId() ) );
974                         sb.append( separator );
975                         sb.append( counts.get( domain_c.getDomainId().getId() ) );
976                         sb.append( ForesterUtil.LINE_SEPARATOR );
977                     }
978                 }
979             }
980         }
981         else if ( domains.size() == 1 ) {
982             sb.append( protein.getSpecies() );
983             sb.append( separator );
984             sb.append( protein_id );
985             sb.append( separator );
986             sb.append( domains.get( 0 ).getDomainId().getId() );
987             sb.append( separator );
988             sb.append( separator );
989             sb.append( domains.get( 0 ).getPerDomainEvalue() );
990             sb.append( separator );
991             sb.append( separator );
992             sb.append( 1 );
993             sb.append( separator );
994             sb.append( ForesterUtil.LINE_SEPARATOR );
995         }
996         else {
997             sb.append( protein.getSpecies() );
998             sb.append( separator );
999             sb.append( protein_id );
1000             sb.append( separator );
1001             sb.append( separator );
1002             sb.append( separator );
1003             sb.append( separator );
1004             sb.append( separator );
1005             sb.append( separator );
1006             sb.append( ForesterUtil.LINE_SEPARATOR );
1007         }
1008         return sb;
1009     }
1010
1011     /**
1012      * 
1013      * Example regarding engulfment: ------------0.1 ----------0.2 --0.3 =>
1014      * domain with 0.3 is ignored
1015      * 
1016      * -----------0.1 ----------0.2 --0.3 => domain with 0.3 is ignored
1017      * 
1018      * 
1019      * ------------0.1 ----------0.3 --0.2 => domains with 0.3 and 0.2 are _not_
1020      * ignored
1021      * 
1022      * @param max_allowed_overlap
1023      *            maximal allowed overlap (inclusive) to be still considered not
1024      *            overlapping (zero or negative value to allow any overlap)
1025      * @param remove_engulfed_domains
1026      *            to remove domains which are completely engulfed by coverage of
1027      *            domains with better support
1028      * @param protein
1029      * @return
1030      */
1031     public static Protein removeOverlappingDomains( final int max_allowed_overlap,
1032                                                     final boolean remove_engulfed_domains,
1033                                                     final Protein protein ) {
1034         final Protein pruned_protein = new BasicProtein( protein.getProteinId().getId(), protein.getSpecies()
1035                 .getSpeciesId() );
1036         final List<Domain> sorted = SurfacingUtil.sortDomainsWithAscendingConfidenceValues( protein );
1037         final List<Boolean> covered_positions = new ArrayList<Boolean>();
1038         for( final Domain domain : sorted ) {
1039             if ( ( ( max_allowed_overlap < 0 ) || ( SurfacingUtil.calculateOverlap( domain, covered_positions ) <= max_allowed_overlap ) )
1040                     && ( !remove_engulfed_domains || !isEngulfed( domain, covered_positions ) ) ) {
1041                 final int covered_positions_size = covered_positions.size();
1042                 for( int i = covered_positions_size; i < domain.getFrom(); ++i ) {
1043                     covered_positions.add( false );
1044                 }
1045                 final int new_covered_positions_size = covered_positions.size();
1046                 for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) {
1047                     if ( i < new_covered_positions_size ) {
1048                         covered_positions.set( i, true );
1049                     }
1050                     else {
1051                         covered_positions.add( true );
1052                     }
1053                 }
1054                 pruned_protein.addProteinDomain( domain );
1055             }
1056         }
1057         return pruned_protein;
1058     }
1059
1060     public static List<Domain> sortDomainsWithAscendingConfidenceValues( final Protein protein ) {
1061         final List<Domain> domains = new ArrayList<Domain>();
1062         for( final Domain d : protein.getProteinDomains() ) {
1063             domains.add( d );
1064         }
1065         Collections.sort( domains, SurfacingUtil.ASCENDING_CONFIDENCE_VALUE_ORDER );
1066         return domains;
1067     }
1068
1069     public static void writeAllDomainsChangedOnAllSubtrees( final Phylogeny p,
1070                                                             final boolean get_gains,
1071                                                             final String outdir,
1072                                                             final String suffix_for_filename ) throws IOException {
1073         CharacterStateMatrix.GainLossStates state = CharacterStateMatrix.GainLossStates.GAIN;
1074         if ( !get_gains ) {
1075             state = CharacterStateMatrix.GainLossStates.LOSS;
1076         }
1077         final File base_dir = createBaseDirForPerNodeDomainFiles( surfacing.BASE_DIRECTORY_PER_SUBTREE_DOMAIN_GAIN_LOSS_FILES,
1078                                                                   false,
1079                                                                   state,
1080                                                                   outdir );
1081         for( final PhylogenyNodeIterator it = p.iteratorPostorder(); it.hasNext(); ) {
1082             final PhylogenyNode node = it.next();
1083             if ( !node.isExternal() ) {
1084                 final SortedSet<String> domains = collectAllDomainsChangedOnSubtree( node, get_gains );
1085                 if ( domains.size() > 0 ) {
1086                     final Writer writer = ForesterUtil.createBufferedWriter( base_dir + ForesterUtil.FILE_SEPARATOR
1087                             + node.getName() + suffix_for_filename );
1088                     for( final String domain : domains ) {
1089                         writer.write( domain );
1090                         writer.write( ForesterUtil.LINE_SEPARATOR );
1091                     }
1092                     writer.close();
1093                 }
1094             }
1095         }
1096     }
1097
1098     private static void writeAllEncounteredPfamsToFile( final Map<DomainId, List<GoId>> domain_id_to_go_ids_map,
1099                                                         final Map<GoId, GoTerm> go_id_to_term_map,
1100                                                         final String outfile_name,
1101                                                         final SortedSet<String> all_pfams_encountered ) {
1102         final File all_pfams_encountered_file = new File( outfile_name + surfacing.ALL_PFAMS_ENCOUNTERED_SUFFIX );
1103         final File all_pfams_encountered_with_go_annotation_file = new File( outfile_name
1104                 + surfacing.ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX );
1105         final File encountered_pfams_summary_file = new File( outfile_name + surfacing.ENCOUNTERED_PFAMS_SUMMARY_SUFFIX );
1106         int biological_process_counter = 0;
1107         int cellular_component_counter = 0;
1108         int molecular_function_counter = 0;
1109         int pfams_with_mappings_counter = 0;
1110         int pfams_without_mappings_counter = 0;
1111         int pfams_without_mappings_to_bp_or_mf_counter = 0;
1112         int pfams_with_mappings_to_bp_or_mf_counter = 0;
1113         try {
1114             final Writer all_pfams_encountered_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_file ) );
1115             final Writer all_pfams_encountered_with_go_annotation_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_with_go_annotation_file ) );
1116             final Writer summary_writer = new BufferedWriter( new FileWriter( encountered_pfams_summary_file ) );
1117             summary_writer.write( "# Pfam to GO mapping summary" );
1118             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
1119             summary_writer.write( "# Actual summary is at the end of this file." );
1120             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
1121             summary_writer.write( "# Encountered Pfams without a GO mapping:" );
1122             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
1123             for( final String pfam : all_pfams_encountered ) {
1124                 all_pfams_encountered_writer.write( pfam );
1125                 all_pfams_encountered_writer.write( ForesterUtil.LINE_SEPARATOR );
1126                 final DomainId domain_id = new DomainId( pfam );
1127                 if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) {
1128                     ++pfams_with_mappings_counter;
1129                     all_pfams_encountered_with_go_annotation_writer.write( pfam );
1130                     all_pfams_encountered_with_go_annotation_writer.write( ForesterUtil.LINE_SEPARATOR );
1131                     final List<GoId> go_ids = domain_id_to_go_ids_map.get( domain_id );
1132                     boolean maps_to_bp = false;
1133                     boolean maps_to_cc = false;
1134                     boolean maps_to_mf = false;
1135                     for( final GoId go_id : go_ids ) {
1136                         final GoTerm go_term = go_id_to_term_map.get( go_id );
1137                         if ( go_term.getGoNameSpace().isBiologicalProcess() ) {
1138                             maps_to_bp = true;
1139                         }
1140                         else if ( go_term.getGoNameSpace().isCellularComponent() ) {
1141                             maps_to_cc = true;
1142                         }
1143                         else if ( go_term.getGoNameSpace().isMolecularFunction() ) {
1144                             maps_to_mf = true;
1145                         }
1146                     }
1147                     if ( maps_to_bp ) {
1148                         ++biological_process_counter;
1149                     }
1150                     if ( maps_to_cc ) {
1151                         ++cellular_component_counter;
1152                     }
1153                     if ( maps_to_mf ) {
1154                         ++molecular_function_counter;
1155                     }
1156                     if ( maps_to_bp || maps_to_mf ) {
1157                         ++pfams_with_mappings_to_bp_or_mf_counter;
1158                     }
1159                     else {
1160                         ++pfams_without_mappings_to_bp_or_mf_counter;
1161                     }
1162                 }
1163                 else {
1164                     ++pfams_without_mappings_to_bp_or_mf_counter;
1165                     ++pfams_without_mappings_counter;
1166                     summary_writer.write( pfam );
1167                     summary_writer.write( ForesterUtil.LINE_SEPARATOR );
1168                 }
1169             }
1170             all_pfams_encountered_writer.close();
1171             all_pfams_encountered_with_go_annotation_writer.close();
1172             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + all_pfams_encountered.size()
1173                     + "] encountered Pfams to: \"" + all_pfams_encountered_file + "\"" );
1174             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + pfams_with_mappings_counter
1175                     + "] encountered Pfams with GO mappings to: \"" + all_pfams_encountered_with_go_annotation_file
1176                     + "\"" );
1177             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote summary (including all ["
1178                     + pfams_without_mappings_counter + "] encountered Pfams without GO mappings) to: \""
1179                     + encountered_pfams_summary_file + "\"" );
1180             ForesterUtil.programMessage( surfacing.PRG_NAME, "Sum of Pfams encountered                : "
1181                     + all_pfams_encountered.size() );
1182             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without a mapping                 : "
1183                     + pfams_without_mappings_counter + " ["
1184                     + ( 100 * pfams_without_mappings_counter / all_pfams_encountered.size() ) + "%]" );
1185             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without mapping to proc. or func. : "
1186                     + pfams_without_mappings_to_bp_or_mf_counter + " ["
1187                     + ( 100 * pfams_without_mappings_to_bp_or_mf_counter / all_pfams_encountered.size() ) + "%]" );
1188             ForesterUtil.programMessage( surfacing.PRG_NAME,
1189                                          "Pfams with a mapping                    : " + pfams_with_mappings_counter
1190                                                  + " ["
1191                                                  + ( 100 * pfams_with_mappings_counter / all_pfams_encountered.size() )
1192                                                  + "%]" );
1193             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping to proc. or func.  : "
1194                     + pfams_with_mappings_to_bp_or_mf_counter + " ["
1195                     + ( 100 * pfams_with_mappings_to_bp_or_mf_counter / all_pfams_encountered.size() ) + "%]" );
1196             ForesterUtil.programMessage( surfacing.PRG_NAME,
1197                                          "Pfams with mapping to biological process: " + biological_process_counter
1198                                                  + " ["
1199                                                  + ( 100 * biological_process_counter / all_pfams_encountered.size() )
1200                                                  + "%]" );
1201             ForesterUtil.programMessage( surfacing.PRG_NAME,
1202                                          "Pfams with mapping to molecular function: " + molecular_function_counter
1203                                                  + " ["
1204                                                  + ( 100 * molecular_function_counter / all_pfams_encountered.size() )
1205                                                  + "%]" );
1206             ForesterUtil.programMessage( surfacing.PRG_NAME,
1207                                          "Pfams with mapping to cellular component: " + cellular_component_counter
1208                                                  + " ["
1209                                                  + ( 100 * cellular_component_counter / all_pfams_encountered.size() )
1210                                                  + "%]" );
1211             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
1212             summary_writer.write( "# Sum of Pfams encountered                : " + all_pfams_encountered.size() );
1213             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
1214             summary_writer.write( "# Pfams without a mapping                 : " + pfams_without_mappings_counter
1215                     + " [" + ( 100 * pfams_without_mappings_counter / all_pfams_encountered.size() ) + "%]" );
1216             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
1217             summary_writer.write( "# Pfams without mapping to proc. or func. : "
1218                     + pfams_without_mappings_to_bp_or_mf_counter + " ["
1219                     + ( 100 * pfams_without_mappings_to_bp_or_mf_counter / all_pfams_encountered.size() ) + "%]" );
1220             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
1221             summary_writer.write( "# Pfams with a mapping                    : " + pfams_with_mappings_counter + " ["
1222                     + ( 100 * pfams_with_mappings_counter / all_pfams_encountered.size() ) + "%]" );
1223             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
1224             summary_writer.write( "# Pfams with a mapping to proc. or func.  : "
1225                     + pfams_with_mappings_to_bp_or_mf_counter + " ["
1226                     + ( 100 * pfams_with_mappings_to_bp_or_mf_counter / all_pfams_encountered.size() ) + "%]" );
1227             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
1228             summary_writer.write( "# Pfams with mapping to biological process: " + biological_process_counter + " ["
1229                     + ( 100 * biological_process_counter / all_pfams_encountered.size() ) + "%]" );
1230             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
1231             summary_writer.write( "# Pfams with mapping to molecular function: " + molecular_function_counter + " ["
1232                     + ( 100 * molecular_function_counter / all_pfams_encountered.size() ) + "%]" );
1233             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
1234             summary_writer.write( "# Pfams with mapping to cellular component: " + cellular_component_counter + " ["
1235                     + ( 100 * cellular_component_counter / all_pfams_encountered.size() ) + "%]" );
1236             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
1237             summary_writer.close();
1238         }
1239         catch ( final IOException e ) {
1240             ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e );
1241         }
1242     }
1243
1244     public static void writeBinaryDomainCombinationsFileForGraphAnalysis( final String[][] input_file_properties,
1245                                                                           final File output_dir,
1246                                                                           final GenomeWideCombinableDomains gwcd,
1247                                                                           final int i,
1248                                                                           final GenomeWideCombinableDomainsSortOrder dc_sort_order ) {
1249         File dc_outfile_dot = new File( input_file_properties[ i ][ 0 ]
1250                 + surfacing.DOMAIN_COMBINITONS_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS );
1251         if ( output_dir != null ) {
1252             dc_outfile_dot = new File( output_dir + ForesterUtil.FILE_SEPARATOR + dc_outfile_dot );
1253         }
1254         checkForOutputFileWriteability( dc_outfile_dot );
1255         final SortedSet<BinaryDomainCombination> binary_combinations = createSetOfAllBinaryDomainCombinationsPerGenome( gwcd );
1256         try {
1257             final BufferedWriter out_dot = new BufferedWriter( new FileWriter( dc_outfile_dot ) );
1258             for( final BinaryDomainCombination bdc : binary_combinations ) {
1259                 out_dot.write( bdc.toGraphDescribingLanguage( BinaryDomainCombination.OutputFormat.DOT, null, null )
1260                         .toString() );
1261                 out_dot.write( SurfacingConstants.NL );
1262             }
1263             out_dot.close();
1264         }
1265         catch ( final IOException e ) {
1266             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1267         }
1268         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote binary domain combination for \""
1269                 + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", "
1270                 + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile_dot + "\"" );
1271     }
1272
1273     public static void writeBinaryStatesMatrixAsListToFile( final CharacterStateMatrix<CharacterStateMatrix.GainLossStates> matrix,
1274                                                             final CharacterStateMatrix.GainLossStates state,
1275                                                             final String filename,
1276                                                             final String indentifier_characters_separator,
1277                                                             final String character_separator,
1278                                                             final Map<String, String> descriptions ) {
1279         final File outfile = new File( filename );
1280         checkForOutputFileWriteability( outfile );
1281         final SortedSet<String> sorted_ids = new TreeSet<String>();
1282         for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) {
1283             sorted_ids.add( matrix.getIdentifier( i ) );
1284         }
1285         try {
1286             final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
1287             for( final String id : sorted_ids ) {
1288                 out.write( indentifier_characters_separator );
1289                 out.write( "#" + id );
1290                 out.write( indentifier_characters_separator );
1291                 for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) {
1292                     // Not nice:
1293                     // using null to indicate either UNCHANGED_PRESENT or GAIN.
1294                     if ( ( matrix.getState( id, c ) == state )
1295                             || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) || ( matrix
1296                                     .getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) ) ) ) {
1297                         out.write( matrix.getCharacter( c ) );
1298                         if ( ( descriptions != null ) && !descriptions.isEmpty()
1299                                 && descriptions.containsKey( matrix.getCharacter( c ) ) ) {
1300                             out.write( "\t" );
1301                             out.write( descriptions.get( matrix.getCharacter( c ) ) );
1302                         }
1303                         out.write( character_separator );
1304                     }
1305                 }
1306             }
1307             out.flush();
1308             out.close();
1309         }
1310         catch ( final IOException e ) {
1311             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1312         }
1313         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters list: \"" + filename + "\"" );
1314     }
1315
1316     public static void writeBinaryStatesMatrixAsListToFileForBinaryCombinationsForGraphAnalysis( final CharacterStateMatrix<CharacterStateMatrix.GainLossStates> matrix,
1317                                                                                                  final CharacterStateMatrix.GainLossStates state,
1318                                                                                                  final String filename,
1319                                                                                                  final String indentifier_characters_separator,
1320                                                                                                  final String character_separator,
1321                                                                                                  final BinaryDomainCombination.OutputFormat bc_output_format ) {
1322         final File outfile = new File( filename );
1323         checkForOutputFileWriteability( outfile );
1324         final SortedSet<String> sorted_ids = new TreeSet<String>();
1325         for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) {
1326             sorted_ids.add( matrix.getIdentifier( i ) );
1327         }
1328         try {
1329             final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
1330             for( final String id : sorted_ids ) {
1331                 out.write( indentifier_characters_separator );
1332                 out.write( "#" + id );
1333                 out.write( indentifier_characters_separator );
1334                 for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) {
1335                     // Not nice:
1336                     // using null to indicate either UNCHANGED_PRESENT or GAIN.
1337                     if ( ( matrix.getState( id, c ) == state )
1338                             || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) || ( matrix
1339                                     .getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) ) ) ) {
1340                         BinaryDomainCombination bdc = null;
1341                         try {
1342                             bdc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( c ) );
1343                         }
1344                         catch ( final Exception e ) {
1345                             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
1346                         }
1347                         out.write( bdc.toGraphDescribingLanguage( bc_output_format, null, null ).toString() );
1348                         out.write( character_separator );
1349                     }
1350                 }
1351             }
1352             out.flush();
1353             out.close();
1354         }
1355         catch ( final IOException e ) {
1356             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1357         }
1358         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters list: \"" + filename + "\"" );
1359     }
1360
1361     public static void writeBinaryStatesMatrixToList( final Map<DomainId, List<GoId>> domain_id_to_go_ids_map,
1362                                                       final Map<GoId, GoTerm> go_id_to_term_map,
1363                                                       final GoNameSpace go_namespace_limit,
1364                                                       final boolean domain_combinations,
1365                                                       final CharacterStateMatrix<CharacterStateMatrix.GainLossStates> matrix,
1366                                                       final CharacterStateMatrix.GainLossStates state,
1367                                                       final String filename,
1368                                                       final String indentifier_characters_separator,
1369                                                       final String character_separator,
1370                                                       final String title_for_html,
1371                                                       final String prefix_for_html,
1372                                                       final Map<DomainId, Set<String>>[] domain_id_to_secondary_features_maps,
1373                                                       final SortedSet<String> all_pfams_encountered,
1374                                                       final SortedSet<String> pfams_gained_or_lost,
1375                                                       final String suffix_for_per_node_events_file ) {
1376         if ( ( go_namespace_limit != null ) && ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) {
1377             throw new IllegalArgumentException( "attempt to use GO namespace limit without a GO-id to term map" );
1378         }
1379         else if ( ( ( domain_id_to_go_ids_map == null ) || ( domain_id_to_go_ids_map.size() < 1 ) ) ) {
1380             throw new IllegalArgumentException( "attempt to output detailed HTML without a Pfam to GO map" );
1381         }
1382         else if ( ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) {
1383             throw new IllegalArgumentException( "attempt to output detailed HTML without a GO-id to term map" );
1384         }
1385         final File outfile = new File( filename );
1386         checkForOutputFileWriteability( outfile );
1387         final SortedSet<String> sorted_ids = new TreeSet<String>();
1388         for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) {
1389             sorted_ids.add( matrix.getIdentifier( i ) );
1390         }
1391         try {
1392             final Writer out = new BufferedWriter( new FileWriter( outfile ) );
1393             final File per_node_go_mapped_domain_gain_loss_files_base_dir = createBaseDirForPerNodeDomainFiles( surfacing.BASE_DIRECTORY_PER_NODE_DOMAIN_GAIN_LOSS_FILES,
1394                                                                                                                 domain_combinations,
1395                                                                                                                 state,
1396                                                                                                                 filename );
1397             Writer per_node_go_mapped_domain_gain_loss_outfile_writer = null;
1398             File per_node_go_mapped_domain_gain_loss_outfile = null;
1399             int per_node_counter = 0;
1400             out.write( "<html>" );
1401             out.write( SurfacingConstants.NL );
1402             addHtmlHead( out, title_for_html );
1403             out.write( SurfacingConstants.NL );
1404             out.write( "<body>" );
1405             out.write( SurfacingConstants.NL );
1406             out.write( "<h1>" );
1407             out.write( SurfacingConstants.NL );
1408             out.write( title_for_html );
1409             out.write( SurfacingConstants.NL );
1410             out.write( "</h1>" );
1411             out.write( SurfacingConstants.NL );
1412             out.write( "<table>" );
1413             out.write( SurfacingConstants.NL );
1414             for( final String id : sorted_ids ) {
1415                 final Matcher matcher = PATTERN_SP_STYLE_TAXONOMY.matcher( id );
1416                 if ( matcher.matches() ) {
1417                     continue;
1418                 }
1419                 out.write( "<tr>" );
1420                 out.write( "<td>" );
1421                 out.write( "<a href=\"#" + id + "\">" + id + "</a>" );
1422                 out.write( "</td>" );
1423                 out.write( "</tr>" );
1424                 out.write( SurfacingConstants.NL );
1425             }
1426             out.write( "</table>" );
1427             out.write( SurfacingConstants.NL );
1428             for( final String id : sorted_ids ) {
1429                 final Matcher matcher = PATTERN_SP_STYLE_TAXONOMY.matcher( id );
1430                 if ( matcher.matches() ) {
1431                     continue;
1432                 }
1433                 out.write( SurfacingConstants.NL );
1434                 out.write( "<h2>" );
1435                 out.write( "<a name=\"" + id + "\">" + id + "</a>" );
1436                 writeTaxonomyLinks( out, id );
1437                 out.write( "</h2>" );
1438                 out.write( SurfacingConstants.NL );
1439                 out.write( "<table>" );
1440                 out.write( SurfacingConstants.NL );
1441                 out.write( "<tr>" );
1442                 out.write( "<td><b>" );
1443                 out.write( "Pfam domain(s)" );
1444                 out.write( "</b></td><td><b>" );
1445                 out.write( "GO term acc" );
1446                 out.write( "</b></td><td><b>" );
1447                 out.write( "GO term" );
1448                 out.write( "</b></td><td><b>" );
1449                 out.write( "GO namespace" );
1450                 out.write( "</b></td>" );
1451                 out.write( "</tr>" );
1452                 out.write( SurfacingConstants.NL );
1453                 out.write( "</tr>" );
1454                 out.write( SurfacingConstants.NL );
1455                 per_node_counter = 0;
1456                 if ( matrix.getNumberOfCharacters() > 0 ) {
1457                     per_node_go_mapped_domain_gain_loss_outfile = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
1458                             + ForesterUtil.FILE_SEPARATOR + id + suffix_for_per_node_events_file );
1459                     SurfacingUtil.checkForOutputFileWriteability( per_node_go_mapped_domain_gain_loss_outfile );
1460                     per_node_go_mapped_domain_gain_loss_outfile_writer = ForesterUtil
1461                             .createBufferedWriter( per_node_go_mapped_domain_gain_loss_outfile );
1462                 }
1463                 else {
1464                     per_node_go_mapped_domain_gain_loss_outfile = null;
1465                     per_node_go_mapped_domain_gain_loss_outfile_writer = null;
1466                 }
1467                 for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) {
1468                     // Not nice:
1469                     // using null to indicate either UNCHANGED_PRESENT or GAIN.
1470                     if ( ( matrix.getState( id, c ) == state )
1471                             || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) || ( matrix
1472                                     .getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) ) ) ) {
1473                         final String character = matrix.getCharacter( c );
1474                         String domain_0 = "";
1475                         String domain_1 = "";
1476                         if ( character.indexOf( BinaryDomainCombination.SEPARATOR ) > 0 ) {
1477                             final String[] s = character.split( BinaryDomainCombination.SEPARATOR );
1478                             if ( s.length != 2 ) {
1479                                 throw new AssertionError( "this should not have happened: unexpected format for domain combination: ["
1480                                         + character + "]" );
1481                             }
1482                             domain_0 = s[ 0 ];
1483                             domain_1 = s[ 1 ];
1484                         }
1485                         else {
1486                             domain_0 = character;
1487                         }
1488                         writeDomainData( domain_id_to_go_ids_map,
1489                                          go_id_to_term_map,
1490                                          go_namespace_limit,
1491                                          out,
1492                                          domain_0,
1493                                          domain_1,
1494                                          prefix_for_html,
1495                                          character_separator,
1496                                          domain_id_to_secondary_features_maps,
1497                                          null );
1498                         all_pfams_encountered.add( domain_0 );
1499                         if ( pfams_gained_or_lost != null ) {
1500                             pfams_gained_or_lost.add( domain_0 );
1501                         }
1502                         if ( !ForesterUtil.isEmpty( domain_1 ) ) {
1503                             all_pfams_encountered.add( domain_1 );
1504                             if ( pfams_gained_or_lost != null ) {
1505                                 pfams_gained_or_lost.add( domain_1 );
1506                             }
1507                         }
1508                         if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) {
1509                             writeDomainsToIndividualFilePerTreeNode( per_node_go_mapped_domain_gain_loss_outfile_writer,
1510                                                                      domain_0,
1511                                                                      domain_1 );
1512                             per_node_counter++;
1513                         }
1514                     }
1515                 }
1516                 if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) {
1517                     per_node_go_mapped_domain_gain_loss_outfile_writer.close();
1518                     if ( per_node_counter < 1 ) {
1519                         per_node_go_mapped_domain_gain_loss_outfile.delete();
1520                     }
1521                     per_node_counter = 0;
1522                 }
1523                 out.write( "</table>" );
1524                 out.write( SurfacingConstants.NL );
1525                 out.write( "<hr>" );
1526                 out.write( SurfacingConstants.NL );
1527             } // for( final String id : sorted_ids ) {  
1528             out.write( "</body>" );
1529             out.write( SurfacingConstants.NL );
1530             out.write( "</html>" );
1531             out.write( SurfacingConstants.NL );
1532             out.flush();
1533             out.close();
1534         }
1535         catch ( final IOException e ) {
1536             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1537         }
1538         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters detailed HTML list: \"" + filename + "\"" );
1539     }
1540
1541     public static void writeBinaryStatesMatrixToListORIGIG( final Map<DomainId, List<GoId>> domain_id_to_go_ids_map,
1542                                                             final Map<GoId, GoTerm> go_id_to_term_map,
1543                                                             final GoNameSpace go_namespace_limit,
1544                                                             final boolean domain_combinations,
1545                                                             final CharacterStateMatrix<CharacterStateMatrix.GainLossStates> matrix,
1546                                                             final CharacterStateMatrix.GainLossStates state,
1547                                                             final String filename,
1548                                                             final String indentifier_characters_separator,
1549                                                             final String character_separator,
1550                                                             final String title_for_html,
1551                                                             final String prefix_for_html,
1552                                                             final Map<DomainId, Set<String>>[] domain_id_to_secondary_features_maps,
1553                                                             final SortedSet<String> all_pfams_encountered,
1554                                                             final SortedSet<String> pfams_gained_or_lost,
1555                                                             final String suffix_for_per_node_events_file ) {
1556         if ( ( go_namespace_limit != null ) && ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) {
1557             throw new IllegalArgumentException( "attempt to use GO namespace limit without a GO-id to term map" );
1558         }
1559         else if ( ( ( domain_id_to_go_ids_map == null ) || ( domain_id_to_go_ids_map.size() < 1 ) ) ) {
1560             throw new IllegalArgumentException( "attempt to output detailed HTML without a Pfam to GO map" );
1561         }
1562         else if ( ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) {
1563             throw new IllegalArgumentException( "attempt to output detailed HTML without a GO-id to term map" );
1564         }
1565         final File outfile = new File( filename );
1566         checkForOutputFileWriteability( outfile );
1567         final SortedSet<String> sorted_ids = new TreeSet<String>();
1568         for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) {
1569             sorted_ids.add( matrix.getIdentifier( i ) );
1570         }
1571         try {
1572             final Writer out = new BufferedWriter( new FileWriter( outfile ) );
1573             final File per_node_go_mapped_domain_gain_loss_files_base_dir = createBaseDirForPerNodeDomainFiles( surfacing.BASE_DIRECTORY_PER_NODE_DOMAIN_GAIN_LOSS_FILES,
1574                                                                                                                 domain_combinations,
1575                                                                                                                 state,
1576                                                                                                                 filename );
1577             Writer per_node_go_mapped_domain_gain_loss_outfile_writer = null;
1578             File per_node_go_mapped_domain_gain_loss_outfile = null;
1579             int per_node_counter = 0;
1580             out.write( "<html>" );
1581             out.write( SurfacingConstants.NL );
1582             addHtmlHead( out, title_for_html );
1583             out.write( SurfacingConstants.NL );
1584             out.write( "<body>" );
1585             out.write( SurfacingConstants.NL );
1586             out.write( "<h1>" );
1587             out.write( SurfacingConstants.NL );
1588             out.write( title_for_html );
1589             out.write( SurfacingConstants.NL );
1590             out.write( "</h1>" );
1591             out.write( SurfacingConstants.NL );
1592             out.write( "<table>" );
1593             out.write( SurfacingConstants.NL );
1594             for( final String id : sorted_ids ) {
1595                 out.write( "<tr>" );
1596                 out.write( "<td>" );
1597                 out.write( "<a href=\"#" + id + "\">" + id + "</a>" );
1598                 writeTaxonomyLinks( out, id );
1599                 out.write( "</td>" );
1600                 out.write( "</tr>" );
1601                 out.write( SurfacingConstants.NL );
1602             }
1603             out.write( "</table>" );
1604             out.write( SurfacingConstants.NL );
1605             for( final String id : sorted_ids ) {
1606                 out.write( SurfacingConstants.NL );
1607                 out.write( "<h2>" );
1608                 out.write( "<a name=\"" + id + "\">" + id + "</a>" );
1609                 writeTaxonomyLinks( out, id );
1610                 out.write( "</h2>" );
1611                 out.write( SurfacingConstants.NL );
1612                 out.write( "<table>" );
1613                 out.write( SurfacingConstants.NL );
1614                 out.write( "<tr>" );
1615                 out.write( "<td><b>" );
1616                 out.write( "Pfam domain(s)" );
1617                 out.write( "</b></td><td><b>" );
1618                 out.write( "GO term acc" );
1619                 out.write( "</b></td><td><b>" );
1620                 out.write( "GO term" );
1621                 out.write( "</b></td><td><b>" );
1622                 out.write( "Penultimate GO term" );
1623                 out.write( "</b></td><td><b>" );
1624                 out.write( "GO namespace" );
1625                 out.write( "</b></td>" );
1626                 out.write( "</tr>" );
1627                 out.write( SurfacingConstants.NL );
1628                 out.write( "</tr>" );
1629                 out.write( SurfacingConstants.NL );
1630                 per_node_counter = 0;
1631                 if ( matrix.getNumberOfCharacters() > 0 ) {
1632                     per_node_go_mapped_domain_gain_loss_outfile = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
1633                             + ForesterUtil.FILE_SEPARATOR + id + suffix_for_per_node_events_file );
1634                     SurfacingUtil.checkForOutputFileWriteability( per_node_go_mapped_domain_gain_loss_outfile );
1635                     per_node_go_mapped_domain_gain_loss_outfile_writer = ForesterUtil
1636                             .createBufferedWriter( per_node_go_mapped_domain_gain_loss_outfile );
1637                 }
1638                 else {
1639                     per_node_go_mapped_domain_gain_loss_outfile = null;
1640                     per_node_go_mapped_domain_gain_loss_outfile_writer = null;
1641                 }
1642                 for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) {
1643                     // Not nice:
1644                     // using null to indicate either UNCHANGED_PRESENT or GAIN.
1645                     if ( ( matrix.getState( id, c ) == state )
1646                             || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) || ( matrix
1647                                     .getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) ) ) ) {
1648                         final String character = matrix.getCharacter( c );
1649                         String domain_0 = "";
1650                         String domain_1 = "";
1651                         if ( character.indexOf( BinaryDomainCombination.SEPARATOR ) > 0 ) {
1652                             final String[] s = character.split( BinaryDomainCombination.SEPARATOR );
1653                             if ( s.length != 2 ) {
1654                                 throw new AssertionError( "this should not have happened: unexpected format for domain combination: ["
1655                                         + character + "]" );
1656                             }
1657                             domain_0 = s[ 0 ];
1658                             domain_1 = s[ 1 ];
1659                         }
1660                         else {
1661                             domain_0 = character;
1662                         }
1663                         writeDomainData( domain_id_to_go_ids_map,
1664                                          go_id_to_term_map,
1665                                          go_namespace_limit,
1666                                          out,
1667                                          domain_0,
1668                                          domain_1,
1669                                          prefix_for_html,
1670                                          character_separator,
1671                                          domain_id_to_secondary_features_maps,
1672                                          null );
1673                         all_pfams_encountered.add( domain_0 );
1674                         if ( pfams_gained_or_lost != null ) {
1675                             pfams_gained_or_lost.add( domain_0 );
1676                         }
1677                         if ( !ForesterUtil.isEmpty( domain_1 ) ) {
1678                             all_pfams_encountered.add( domain_1 );
1679                             if ( pfams_gained_or_lost != null ) {
1680                                 pfams_gained_or_lost.add( domain_1 );
1681                             }
1682                         }
1683                         if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) {
1684                             writeDomainsToIndividualFilePerTreeNode( per_node_go_mapped_domain_gain_loss_outfile_writer,
1685                                                                      domain_0,
1686                                                                      domain_1 );
1687                             per_node_counter++;
1688                         }
1689                     }
1690                 }
1691                 if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) {
1692                     per_node_go_mapped_domain_gain_loss_outfile_writer.close();
1693                     if ( per_node_counter < 1 ) {
1694                         per_node_go_mapped_domain_gain_loss_outfile.delete();
1695                     }
1696                     per_node_counter = 0;
1697                 }
1698                 out.write( "</table>" );
1699                 out.write( SurfacingConstants.NL );
1700                 out.write( "<hr>" );
1701                 out.write( SurfacingConstants.NL );
1702             } // for( final String id : sorted_ids ) {  
1703             out.write( "</body>" );
1704             out.write( SurfacingConstants.NL );
1705             out.write( "</html>" );
1706             out.write( SurfacingConstants.NL );
1707             out.flush();
1708             out.close();
1709         }
1710         catch ( final IOException e ) {
1711             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1712         }
1713         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters detailed HTML list: \"" + filename + "\"" );
1714     }
1715
1716     public static void writeDomainCombinationsCountsFile( final String[][] input_file_properties,
1717                                                           final File output_dir,
1718                                                           final Writer per_genome_domain_promiscuity_statistics_writer,
1719                                                           final GenomeWideCombinableDomains gwcd,
1720                                                           final int i,
1721                                                           final GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder dc_sort_order ) {
1722         File dc_outfile = new File( input_file_properties[ i ][ 0 ]
1723                 + surfacing.DOMAIN_COMBINITON_COUNTS_OUTPUTFILE_SUFFIX );
1724         if ( output_dir != null ) {
1725             dc_outfile = new File( output_dir + ForesterUtil.FILE_SEPARATOR + dc_outfile );
1726         }
1727         checkForOutputFileWriteability( dc_outfile );
1728         try {
1729             final BufferedWriter out = new BufferedWriter( new FileWriter( dc_outfile ) );
1730             out.write( gwcd.toStringBuilder( dc_sort_order ).toString() );
1731             out.close();
1732         }
1733         catch ( final IOException e ) {
1734             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1735         }
1736         final DescriptiveStatistics stats = gwcd.getPerGenomeDomainPromiscuityStatistics();
1737         try {
1738             per_genome_domain_promiscuity_statistics_writer.write( input_file_properties[ i ][ 0 ] + "\t" );
1739             per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats.arithmeticMean() ) + "\t" );
1740             if ( stats.getN() < 2 ) {
1741                 per_genome_domain_promiscuity_statistics_writer.write( "n/a" + "\t" );
1742             }
1743             else {
1744                 per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats
1745                         .sampleStandardDeviation() ) + "\t" );
1746             }
1747             per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats.median() ) + "\t" );
1748             per_genome_domain_promiscuity_statistics_writer.write( ( int ) stats.getMin() + "\t" );
1749             per_genome_domain_promiscuity_statistics_writer.write( ( int ) stats.getMax() + "\t" );
1750             per_genome_domain_promiscuity_statistics_writer.write( stats.getN() + "\t" );
1751             final SortedSet<DomainId> mpds = gwcd.getMostPromiscuosDomain();
1752             for( final DomainId mpd : mpds ) {
1753                 per_genome_domain_promiscuity_statistics_writer.write( mpd.getId() + " " );
1754             }
1755             per_genome_domain_promiscuity_statistics_writer.write( ForesterUtil.LINE_SEPARATOR );
1756         }
1757         catch ( final IOException e ) {
1758             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1759         }
1760         if ( input_file_properties[ i ].length == 3 ) {
1761             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \""
1762                     + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", "
1763                     + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile + "\"" );
1764         }
1765         else {
1766             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \""
1767                     + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ") to: \""
1768                     + dc_outfile + "\"" );
1769         }
1770     }
1771
1772     private static void writeDomainData( final Map<DomainId, List<GoId>> domain_id_to_go_ids_map,
1773                                          final Map<GoId, GoTerm> go_id_to_term_map,
1774                                          final GoNameSpace go_namespace_limit,
1775                                          final Writer out,
1776                                          final String domain_0,
1777                                          final String domain_1,
1778                                          final String prefix_for_html,
1779                                          final String character_separator_for_non_html_output,
1780                                          final Map<DomainId, Set<String>>[] domain_id_to_secondary_features_maps,
1781                                          final Set<GoId> all_go_ids ) throws IOException {
1782         boolean any_go_annotation_present = false;
1783         boolean first_has_no_go = false;
1784         int domain_count = 2; // To distinguish between domains and binary domain combinations.
1785         if ( ForesterUtil.isEmpty( domain_1 ) ) {
1786             domain_count = 1;
1787         }
1788         // The following has a difficult to understand logic.  
1789         for( int d = 0; d < domain_count; ++d ) {
1790             List<GoId> go_ids = null;
1791             boolean go_annotation_present = false;
1792             if ( d == 0 ) {
1793                 final DomainId domain_id = new DomainId( domain_0 );
1794                 if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) {
1795                     go_annotation_present = true;
1796                     any_go_annotation_present = true;
1797                     go_ids = domain_id_to_go_ids_map.get( domain_id );
1798                 }
1799                 else {
1800                     first_has_no_go = true;
1801                 }
1802             }
1803             else {
1804                 final DomainId domain_id = new DomainId( domain_1 );
1805                 if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) {
1806                     go_annotation_present = true;
1807                     any_go_annotation_present = true;
1808                     go_ids = domain_id_to_go_ids_map.get( domain_id );
1809                 }
1810             }
1811             if ( go_annotation_present ) {
1812                 boolean first = ( ( d == 0 ) || ( ( d == 1 ) && first_has_no_go ) );
1813                 for( final GoId go_id : go_ids ) {
1814                     out.write( "<tr>" );
1815                     if ( first ) {
1816                         first = false;
1817                         writeDomainIdsToHtml( out,
1818                                               domain_0,
1819                                               domain_1,
1820                                               prefix_for_html,
1821                                               domain_id_to_secondary_features_maps );
1822                     }
1823                     else {
1824                         out.write( "<td></td>" );
1825                     }
1826                     if ( !go_id_to_term_map.containsKey( go_id ) ) {
1827                         throw new IllegalArgumentException( "GO-id [" + go_id + "] not found in GO-id to GO-term map" );
1828                     }
1829                     final GoTerm go_term = go_id_to_term_map.get( go_id );
1830                     if ( ( go_namespace_limit == null ) || go_namespace_limit.equals( go_term.getGoNameSpace() ) ) {
1831                         // final String top = GoUtils.getPenultimateGoTerm( go_term, go_id_to_term_map ).getName();
1832                         final String go_id_str = go_id.getId();
1833                         out.write( "<td>" );
1834                         out.write( "<a href=\"" + SurfacingConstants.AMIGO_LINK + go_id_str
1835                                 + "\" target=\"amigo_window\">" + go_id_str + "</a>" );
1836                         out.write( "</td><td>" );
1837                         out.write( go_term.getName() );
1838                         if ( domain_count == 2 ) {
1839                             out.write( " (" + d + ")" );
1840                         }
1841                         out.write( "</td><td>" );
1842                         // out.write( top );
1843                         // out.write( "</td><td>" );
1844                         out.write( "[" );
1845                         out.write( go_term.getGoNameSpace().toShortString() );
1846                         out.write( "]" );
1847                         out.write( "</td>" );
1848                         if ( all_go_ids != null ) {
1849                             all_go_ids.add( go_id );
1850                         }
1851                     }
1852                     else {
1853                         out.write( "<td>" );
1854                         out.write( "</td><td>" );
1855                         out.write( "</td><td>" );
1856                         out.write( "</td><td>" );
1857                         out.write( "</td>" );
1858                     }
1859                     out.write( "</tr>" );
1860                     out.write( SurfacingConstants.NL );
1861                 }
1862             }
1863         } //  for( int d = 0; d < domain_count; ++d ) 
1864         if ( !any_go_annotation_present ) {
1865             out.write( "<tr>" );
1866             writeDomainIdsToHtml( out, domain_0, domain_1, prefix_for_html, domain_id_to_secondary_features_maps );
1867             out.write( "<td>" );
1868             out.write( "</td><td>" );
1869             out.write( "</td><td>" );
1870             out.write( "</td><td>" );
1871             out.write( "</td>" );
1872             out.write( "</tr>" );
1873             out.write( SurfacingConstants.NL );
1874         }
1875     }
1876
1877     private static void writeDomainIdsToHtml( final Writer out,
1878                                               final String domain_0,
1879                                               final String domain_1,
1880                                               final String prefix_for_detailed_html,
1881                                               final Map<DomainId, Set<String>>[] domain_id_to_secondary_features_maps )
1882             throws IOException {
1883         out.write( "<td>" );
1884         if ( !ForesterUtil.isEmpty( prefix_for_detailed_html ) ) {
1885             out.write( prefix_for_detailed_html );
1886             out.write( " " );
1887         }
1888         out.write( "<a href=\"" + SurfacingConstants.PFAM_FAMILY_ID_LINK + domain_0 + "\">" + domain_0 + "</a>" );
1889         out.write( "</td>" );
1890     }
1891
1892     public static DescriptiveStatistics writeDomainSimilaritiesToFile( final StringBuilder html_desc,
1893                                                                        final StringBuilder html_title,
1894                                                                        final Writer w,
1895                                                                        final SortedSet<DomainSimilarity> similarities,
1896                                                                        final boolean treat_as_binary,
1897                                                                        final List<Species> species_order,
1898                                                                        final PrintableDomainSimilarity.PRINT_OPTION print_option,
1899                                                                        final DomainSimilarity.DomainSimilaritySortField sort_field,
1900                                                                        final DomainSimilarity.DomainSimilarityScoring scoring,
1901                                                                        final boolean verbose ) throws IOException {
1902         final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
1903         String histogram_title = null;
1904         switch ( sort_field ) {
1905             case ABS_MAX_COUNTS_DIFFERENCE:
1906                 if ( treat_as_binary ) {
1907                     histogram_title = "absolute counts difference:";
1908                 }
1909                 else {
1910                     histogram_title = "absolute (maximal) counts difference:";
1911                 }
1912                 break;
1913             case MAX_COUNTS_DIFFERENCE:
1914                 if ( treat_as_binary ) {
1915                     histogram_title = "counts difference:";
1916                 }
1917                 else {
1918                     histogram_title = "(maximal) counts difference:";
1919                 }
1920                 break;
1921             case DOMAIN_ID:
1922                 histogram_title = "score mean:";
1923                 break;
1924             case MIN:
1925                 histogram_title = "score minimum:";
1926                 break;
1927             case MAX:
1928                 histogram_title = "score maximum:";
1929                 break;
1930             case MAX_DIFFERENCE:
1931                 if ( treat_as_binary ) {
1932                     histogram_title = "difference:";
1933                 }
1934                 else {
1935                     histogram_title = "(maximal) difference:";
1936                 }
1937                 break;
1938             case MEAN:
1939                 histogram_title = "score mean:";
1940                 break;
1941             case SD:
1942                 histogram_title = "score standard deviation:";
1943                 break;
1944             case SPECIES_COUNT:
1945                 histogram_title = "species number:";
1946                 break;
1947             default:
1948                 throw new AssertionError( "Unknown sort field: " + sort_field );
1949         }
1950         for( final DomainSimilarity similarity : similarities ) {
1951             switch ( sort_field ) {
1952                 case ABS_MAX_COUNTS_DIFFERENCE:
1953                     stats.addValue( Math.abs( similarity.getMaximalDifferenceInCounts() ) );
1954                     break;
1955                 case MAX_COUNTS_DIFFERENCE:
1956                     stats.addValue( similarity.getMaximalDifferenceInCounts() );
1957                     break;
1958                 case DOMAIN_ID:
1959                     stats.addValue( similarity.getMeanSimilarityScore() );
1960                     break;
1961                 case MIN:
1962                     stats.addValue( similarity.getMinimalSimilarityScore() );
1963                     break;
1964                 case MAX:
1965                     stats.addValue( similarity.getMaximalSimilarityScore() );
1966                     break;
1967                 case MAX_DIFFERENCE:
1968                     stats.addValue( similarity.getMaximalDifference() );
1969                     break;
1970                 case MEAN:
1971                     stats.addValue( similarity.getMeanSimilarityScore() );
1972                     break;
1973                 case SD:
1974                     stats.addValue( similarity.getStandardDeviationOfSimilarityScore() );
1975                     break;
1976                 case SPECIES_COUNT:
1977                     stats.addValue( similarity.getSpecies().size() );
1978                     break;
1979                 default:
1980                     throw new AssertionError( "Unknown sort field: " + sort_field );
1981             }
1982         }
1983         //
1984         // final HistogramData[] hists = new HistogramData[ 1 ];
1985         //      
1986         //        
1987         // List<HistogramDataItem> data_items = new
1988         // ArrayList<HistogramDataItem>();
1989         // double[] values = stats.getDataAsDoubleArray();
1990         // for( int i = 0; i < values.length; i++ ) {
1991         // HistogramDataItem data_item = new BasicHistogramDataItem( "", values[
1992         // i ] );
1993         // data_items.add( data_item );
1994         // }
1995         //        
1996         //        
1997         // HistogramData hd0 = new HistogramData( "name",
1998         // data_items,
1999         // null, 20,
2000         // 40 );
2001         //        
2002         //        
2003         //        
2004         //        
2005         // hists[ 0 ] = hd0;
2006         //       
2007         // final HistogramsFrame hf = new HistogramsFrame( hists );
2008         // hf.setVisible( true );
2009         //
2010         AsciiHistogram histo = null;
2011         if ( stats.getMin() < stats.getMin() ) {
2012             histo = new AsciiHistogram( stats, histogram_title );
2013         }
2014         if ( verbose ) {
2015             if ( histo != null ) {
2016                 System.out.println( histo.toStringBuffer( 20, '|', 40, 5 ) );
2017             }
2018             System.out.println();
2019             System.out.println( "N                   : " + stats.getN() );
2020             System.out.println( "Min                 : " + stats.getMin() );
2021             System.out.println( "Max                 : " + stats.getMax() );
2022             System.out.println( "Mean                : " + stats.arithmeticMean() );
2023             if ( stats.getN() > 1 ) {
2024                 System.out.println( "SD                  : " + stats.sampleStandardDeviation() );
2025             }
2026             else {
2027                 System.out.println( "SD                  : n/a" );
2028             }
2029             System.out.println( "Median              : " + stats.median() );
2030             if ( stats.getN() > 1 ) {
2031                 System.out.println( "Pearsonian skewness : " + stats.pearsonianSkewness() );
2032             }
2033             else {
2034                 System.out.println( "Pearsonian skewness : n/a" );
2035             }
2036         }
2037         switch ( print_option ) {
2038             case SIMPLE_TAB_DELIMITED:
2039                 break;
2040             case HTML:
2041                 w.write( "<html>" );
2042                 w.write( SurfacingConstants.NL );
2043                 addHtmlHead( w, "SURFACING :: " + html_title );
2044                 w.write( SurfacingConstants.NL );
2045                 w.write( "<body>" );
2046                 w.write( SurfacingConstants.NL );
2047                 w.write( html_desc.toString() );
2048                 w.write( SurfacingConstants.NL );
2049                 w.write( "<hr>" );
2050                 w.write( "<br>" );
2051                 w.write( SurfacingConstants.NL );
2052                 w.write( "<tt><pre>" );
2053                 w.write( SurfacingConstants.NL );
2054                 if ( histo != null ) {
2055                     w.write( histo.toStringBuffer( 20, '|', 40, 5 ).toString() );
2056                     w.write( SurfacingConstants.NL );
2057                 }
2058                 w.write( "</pre></tt>" );
2059                 w.write( SurfacingConstants.NL );
2060                 w.write( "<table>" );
2061                 w.write( SurfacingConstants.NL );
2062                 w.write( "<tr><td>N: </td><td>" + stats.getN() + "</td></tr>" );
2063                 w.write( SurfacingConstants.NL );
2064                 w.write( "<tr><td>Min: </td><td>" + stats.getMin() + "</td></tr>" );
2065                 w.write( SurfacingConstants.NL );
2066                 w.write( "<tr><td>Max: </td><td>" + stats.getMax() + "</td></tr>" );
2067                 w.write( SurfacingConstants.NL );
2068                 w.write( "<tr><td>Mean: </td><td>" + stats.arithmeticMean() + "</td></tr>" );
2069                 w.write( SurfacingConstants.NL );
2070                 if ( stats.getN() > 1 ) {
2071                     w.write( "<tr><td>SD: </td><td>" + stats.sampleStandardDeviation() + "</td></tr>" );
2072                 }
2073                 else {
2074                     w.write( "<tr><td>SD: </td><td>n/a</td></tr>" );
2075                 }
2076                 w.write( SurfacingConstants.NL );
2077                 w.write( "<tr><td>Median: </td><td>" + stats.median() + "</td></tr>" );
2078                 w.write( SurfacingConstants.NL );
2079                 if ( stats.getN() > 1 ) {
2080                     w.write( "<tr><td>Pearsonian skewness: </td><td>" + stats.pearsonianSkewness() + "</td></tr>" );
2081                 }
2082                 else {
2083                     w.write( "<tr><td>Pearsonian skewness: </td><td>n/a</td></tr>" );
2084                 }
2085                 w.write( SurfacingConstants.NL );
2086                 w.write( "</table>" );
2087                 w.write( SurfacingConstants.NL );
2088                 w.write( "<br>" );
2089                 w.write( SurfacingConstants.NL );
2090                 w.write( "<hr>" );
2091                 w.write( SurfacingConstants.NL );
2092                 w.write( "<br>" );
2093                 w.write( SurfacingConstants.NL );
2094                 w.write( "<table>" );
2095                 w.write( SurfacingConstants.NL );
2096                 break;
2097         }
2098         w.write( SurfacingConstants.NL );
2099         for( final DomainSimilarity similarity : similarities ) {
2100             if ( ( species_order != null ) && !species_order.isEmpty() ) {
2101                 ( ( PrintableDomainSimilarity ) similarity ).setSpeciesOrder( species_order );
2102             }
2103             w.write( similarity.toStringBuffer( print_option ).toString() );
2104             w.write( SurfacingConstants.NL );
2105         }
2106         switch ( print_option ) {
2107             case HTML:
2108                 w.write( SurfacingConstants.NL );
2109                 w.write( "</table>" );
2110                 w.write( SurfacingConstants.NL );
2111                 w.write( "</font>" );
2112                 w.write( SurfacingConstants.NL );
2113                 w.write( "</body>" );
2114                 w.write( SurfacingConstants.NL );
2115                 w.write( "</html>" );
2116                 w.write( SurfacingConstants.NL );
2117                 break;
2118         }
2119         w.flush();
2120         w.close();
2121         return stats;
2122     }
2123
2124     private static void writeDomainsToIndividualFilePerTreeNode( final Writer individual_files_writer,
2125                                                                  final String domain_0,
2126                                                                  final String domain_1 ) throws IOException {
2127         individual_files_writer.write( domain_0 );
2128         individual_files_writer.write( ForesterUtil.LINE_SEPARATOR );
2129         if ( !ForesterUtil.isEmpty( domain_1 ) ) {
2130             individual_files_writer.write( domain_1 );
2131             individual_files_writer.write( ForesterUtil.LINE_SEPARATOR );
2132         }
2133     }
2134
2135     public static void writeMatrixToFile( final CharacterStateMatrix<?> matrix,
2136                                           final String filename,
2137                                           final Format format ) {
2138         final File outfile = new File( filename );
2139         checkForOutputFileWriteability( outfile );
2140         try {
2141             final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
2142             matrix.toWriter( out, format );
2143             out.flush();
2144             out.close();
2145         }
2146         catch ( final IOException e ) {
2147             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
2148         }
2149         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote matrix: \"" + filename + "\"" );
2150     }
2151
2152     public static void writeMatrixToFile( final File matrix_outfile, final List<DistanceMatrix> matrices ) {
2153         checkForOutputFileWriteability( matrix_outfile );
2154         try {
2155             final BufferedWriter out = new BufferedWriter( new FileWriter( matrix_outfile ) );
2156             for( final DistanceMatrix distance_matrix : matrices ) {
2157                 out.write( distance_matrix.toStringBuffer( DistanceMatrix.Format.PHYLIP ).toString() );
2158                 out.write( ForesterUtil.LINE_SEPARATOR );
2159                 out.flush();
2160             }
2161             out.close();
2162         }
2163         catch ( final IOException e ) {
2164             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
2165         }
2166         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + matrix_outfile + "\"" );
2167     }
2168
2169     private static void writePfamsToFile( final String outfile_name, final SortedSet<String> pfams ) {
2170         try {
2171             final Writer writer = new BufferedWriter( new FileWriter( new File( outfile_name ) ) );
2172             for( final String pfam : pfams ) {
2173                 writer.write( pfam );
2174                 writer.write( ForesterUtil.LINE_SEPARATOR );
2175             }
2176             writer.close();
2177             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote " + pfams.size() + " pfams to [" + outfile_name
2178                     + "]" );
2179         }
2180         catch ( final IOException e ) {
2181             ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e );
2182         }
2183     }
2184
2185     public static void writePhylogenyToFile( final Phylogeny phylogeny, final String filename ) {
2186         final PhylogenyWriter writer = new PhylogenyWriter();
2187         try {
2188             writer.toPhyloXML( new File( filename ), phylogeny, 1 );
2189         }
2190         catch ( final IOException e ) {
2191             ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "failed to write phylogeny to \"" + filename + "\": "
2192                     + e );
2193         }
2194         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote phylogeny to \"" + filename + "\"" );
2195     }
2196
2197     public static void writeTaxonomyLinks( final Writer writer, final String species ) throws IOException {
2198         if ( ( species.length() > 1 ) && ( species.indexOf( '_' ) < 1 ) ) {
2199             final Matcher matcher = PATTERN_SP_STYLE_TAXONOMY.matcher( species );
2200             writer.write( " [" );
2201             if ( matcher.matches() ) {
2202                 writer.write( "<a href=\"" + SurfacingConstants.UNIPROT_LINK + species
2203                         + "\" target=\"taxonomy_window\">uniprot</a>" );
2204             }
2205             else {
2206                 writer.write( "<a href=\"" + SurfacingConstants.EOL_LINK + species
2207                         + "\" target=\"taxonomy_window\">eol</a>" );
2208                 writer.write( "|" );
2209                 writer.write( "<a href=\"" + SurfacingConstants.TOL_LINK + species
2210                         + "\" target=\"taxonomy_window\">tol</a>" );
2211             }
2212             writer.write( "]" );
2213         }
2214     }
2215
2216     private static void writeToNexus( final String outfile_name,
2217                                       final CharacterStateMatrix<BinaryStates> matrix,
2218                                       final Phylogeny phylogeny ) {
2219         if ( !( matrix instanceof BasicCharacterStateMatrix ) ) {
2220             throw new IllegalArgumentException( "can only write matrices of type [" + BasicCharacterStateMatrix.class
2221                     + "] to nexus" );
2222         }
2223         final BasicCharacterStateMatrix<BinaryStates> my_matrix = ( org.forester.evoinference.matrix.character.BasicCharacterStateMatrix<BinaryStates> ) matrix;
2224         final List<Phylogeny> phylogenies = new ArrayList<Phylogeny>( 1 );
2225         phylogenies.add( phylogeny );
2226         try {
2227             final BufferedWriter w = new BufferedWriter( new FileWriter( outfile_name ) );
2228             w.write( NexusConstants.NEXUS );
2229             w.write( ForesterUtil.LINE_SEPARATOR );
2230             my_matrix.writeNexusTaxaBlock( w );
2231             my_matrix.writeNexusBinaryChractersBlock( w );
2232             PhylogenyWriter.writeNexusTreesBlock( w, phylogenies );
2233             w.flush();
2234             w.close();
2235             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote Nexus file: \"" + outfile_name + "\"" );
2236         }
2237         catch ( final IOException e ) {
2238             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
2239         }
2240     }
2241
2242     private static void writeToNexus( final String outfile_name,
2243                                       final DomainParsimonyCalculator domain_parsimony,
2244                                       final Phylogeny phylogeny ) {
2245         writeToNexus( outfile_name + surfacing.NEXUS_EXTERNAL_DOMAINS,
2246                       domain_parsimony.createMatrixOfDomainPresenceOrAbsence(),
2247                       phylogeny );
2248         writeToNexus( outfile_name + surfacing.NEXUS_EXTERNAL_DOMAIN_COMBINATIONS,
2249                       domain_parsimony.createMatrixOfBinaryDomainCombinationPresenceOrAbsence(),
2250                       phylogeny );
2251     }
2252 }