25fdf17d918501b4fd59cbc4b42ca8a45fe81d22
[jalview.git] / forester / java / src / org / forester / surfacing / SurfacingUtil.java
1 // $Id:
2 //
3 // FORESTER -- software libraries and applications
4 // for evolutionary biology research and applications.
5 //
6 // Copyright (C) 2008-2009 Christian M. Zmasek
7 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
8 // All rights reserved
9 //
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
14 //
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 //
24 // Contact: phylosoft @ gmail . com
25 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26
27 package org.forester.surfacing;
28
29 import java.awt.Color;
30 import java.io.BufferedWriter;
31 import java.io.File;
32 import java.io.FileWriter;
33 import java.io.IOException;
34 import java.io.Writer;
35 import java.text.DecimalFormat;
36 import java.text.NumberFormat;
37 import java.util.ArrayList;
38 import java.util.Arrays;
39 import java.util.Collections;
40 import java.util.Comparator;
41 import java.util.HashMap;
42 import java.util.HashSet;
43 import java.util.Iterator;
44 import java.util.List;
45 import java.util.Map;
46 import java.util.Map.Entry;
47 import java.util.PriorityQueue;
48 import java.util.Set;
49 import java.util.SortedMap;
50 import java.util.SortedSet;
51 import java.util.TreeMap;
52 import java.util.TreeSet;
53 import java.util.regex.Matcher;
54 import java.util.regex.Pattern;
55
56 import org.forester.application.surfacing;
57 import org.forester.evoinference.distance.NeighborJoining;
58 import org.forester.evoinference.matrix.character.BasicCharacterStateMatrix;
59 import org.forester.evoinference.matrix.character.CharacterStateMatrix;
60 import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates;
61 import org.forester.evoinference.matrix.character.CharacterStateMatrix.Format;
62 import org.forester.evoinference.matrix.character.CharacterStateMatrix.GainLossStates;
63 import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
64 import org.forester.evoinference.matrix.distance.DistanceMatrix;
65 import org.forester.go.GoId;
66 import org.forester.go.GoNameSpace;
67 import org.forester.go.GoTerm;
68 import org.forester.go.PfamToGoMapping;
69 import org.forester.io.parsers.nexus.NexusConstants;
70 import org.forester.io.parsers.phyloxml.PhyloXmlUtil;
71 import org.forester.io.parsers.util.ParserUtils;
72 import org.forester.io.writers.PhylogenyWriter;
73 import org.forester.phylogeny.Phylogeny;
74 import org.forester.phylogeny.PhylogenyMethods;
75 import org.forester.phylogeny.PhylogenyNode;
76 import org.forester.phylogeny.PhylogenyNode.NH_CONVERSION_SUPPORT_VALUE_STYLE;
77 import org.forester.phylogeny.data.BinaryCharacters;
78 import org.forester.phylogeny.data.Confidence;
79 import org.forester.phylogeny.data.Taxonomy;
80 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
81 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
82 import org.forester.protein.BasicDomain;
83 import org.forester.protein.BasicProtein;
84 import org.forester.protein.BinaryDomainCombination;
85 import org.forester.protein.Domain;
86 import org.forester.protein.Protein;
87 import org.forester.species.Species;
88 import org.forester.surfacing.DomainSimilarityCalculator.Detailedness;
89 import org.forester.surfacing.GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder;
90 import org.forester.surfacing.PrintableDomainSimilarity.PRINT_OPTION;
91 import org.forester.util.AsciiHistogram;
92 import org.forester.util.BasicDescriptiveStatistics;
93 import org.forester.util.BasicTable;
94 import org.forester.util.BasicTableParser;
95 import org.forester.util.CommandLineArguments;
96 import org.forester.util.DescriptiveStatistics;
97 import org.forester.util.ForesterUtil;
98 import org.forester.util.TaxonomyColors;
99
100 public final class SurfacingUtil {
101
102     final static class DomainComparator implements Comparator<Domain> {
103
104         final private boolean _ascending;
105
106         public DomainComparator( final boolean ascending ) {
107             _ascending = ascending;
108         }
109
110         @Override
111         public final int compare( final Domain d0, final Domain d1 ) {
112             if ( d0.getFrom() < d1.getFrom() ) {
113                 return _ascending ? -1 : 1;
114             }
115             else if ( d0.getFrom() > d1.getFrom() ) {
116                 return _ascending ? 1 : -1;
117             }
118             return 0;
119         }
120     }
121     private final static NumberFormat        FORMATTER_3                      = new DecimalFormat( "0.000" );
122     private static final Comparator<Domain>  ASCENDING_CONFIDENCE_VALUE_ORDER = new Comparator<Domain>() {
123
124                                                                                   @Override
125                                                                                   public int compare( final Domain d1,
126                                                                                                       final Domain d2 ) {
127                                                                                       if ( d1.getPerSequenceEvalue() < d2
128                                                                                               .getPerSequenceEvalue() ) {
129                                                                                           return -1;
130                                                                                       }
131                                                                                       else if ( d1
132                                                                                               .getPerSequenceEvalue() > d2
133                                                                                               .getPerSequenceEvalue() ) {
134                                                                                           return 1;
135                                                                                       }
136                                                                                       else {
137                                                                                           return d1.compareTo( d2 );
138                                                                                       }
139                                                                                   }
140                                                                               };
141     public final static Pattern              PATTERN_SP_STYLE_TAXONOMY        = Pattern.compile( "^[A-Z0-9]{3,5}$" );
142     private final static Map<String, String> _TAXCODE_HEXCOLORSTRING_MAP      = new HashMap<String, String>();
143
144     public static void addAllBinaryDomainCombinationToSet( final GenomeWideCombinableDomains genome,
145                                                            final SortedSet<BinaryDomainCombination> binary_domain_combinations ) {
146         final SortedMap<String, CombinableDomains> all_cd = genome.getAllCombinableDomainsIds();
147         for( final String domain_id : all_cd.keySet() ) {
148             binary_domain_combinations.addAll( all_cd.get( domain_id ).toBinaryDomainCombinations() );
149         }
150     }
151
152     public static void addAllDomainIdsToSet( final GenomeWideCombinableDomains genome,
153                                              final SortedSet<String> domain_ids ) {
154         final SortedSet<String> domains = genome.getAllDomainIds();
155         for( final String domain : domains ) {
156             domain_ids.add( domain );
157         }
158     }
159
160     public static void writeHtmlHead( final Writer w, final String title ) throws IOException {
161         w.write( SurfacingConstants.NL );
162         w.write( "<head>" );
163         w.write( "<title>" );
164         w.write( title );
165         w.write( "</title>" );
166         w.write( SurfacingConstants.NL );
167         w.write( "<style>" );
168         w.write( SurfacingConstants.NL );
169         w.write( "a:visited { color : #000066; text-decoration : none; }" );
170         w.write( SurfacingConstants.NL );
171         w.write( "a:link { color : #000066; text-decoration : none; }" );
172         w.write( SurfacingConstants.NL );
173         w.write( "a:active { color : ##000066; text-decoration : none; }" );
174         w.write( SurfacingConstants.NL );
175         w.write( "a:hover { color : #FFFFFF; background-color : #000000; text-decoration : none; }" );
176         w.write( SurfacingConstants.NL );
177         //
178         w.write( "a.pl:visited { color : #505050; text-decoration : none; font-size: 7px;}" );
179         w.write( SurfacingConstants.NL );
180         w.write( "a.pl:link { color : #505050; text-decoration : none; font-size: 7px;}" );
181         w.write( SurfacingConstants.NL );
182         w.write( "a.pl:active { color : #505050; text-decoration : none; font-size: 7px;}" );
183         w.write( SurfacingConstants.NL );
184         w.write( "a.pl:hover { color : #FFFFFF; background-color : #000000; text-decoration : none; font-size: 7px;}" );
185         w.write( SurfacingConstants.NL );
186         //
187         w.write( "a.ps:visited { color : #707070; text-decoration : none; font-size: 7px;}" );
188         w.write( SurfacingConstants.NL );
189         w.write( "a.ps:link { color : #707070; text-decoration : none; font-size: 7px;}" );
190         w.write( SurfacingConstants.NL );
191         w.write( "a.ps:active { color : #707070; text-decoration : none; font-size: 7px;}" );
192         w.write( SurfacingConstants.NL );
193         w.write( "a.ps:hover { color : #FFFFFF; background-color : #000000; text-decoration : none; font-size: 7px;}" );
194         w.write( SurfacingConstants.NL );
195         //
196         w.write( "td { text-align: left; vertical-align: top; font-family: Verdana, Arial, Helvetica; font-size: 8pt}" );
197         w.write( SurfacingConstants.NL );
198         w.write( "h1 { color : #0000FF; font-family: Verdana, Arial, Helvetica; font-size: 18pt; font-weight: bold }" );
199         w.write( SurfacingConstants.NL );
200         w.write( "h2 { color : #0000FF; font-family: Verdana, Arial, Helvetica; font-size: 16pt; font-weight: bold }" );
201         w.write( SurfacingConstants.NL );
202         w.write( "</style>" );
203         w.write( SurfacingConstants.NL );
204         w.write( "</head>" );
205         w.write( SurfacingConstants.NL );
206     }
207
208     private final static void addToCountMap( final Map<String, Integer> map, final String s ) {
209         if ( map.containsKey( s ) ) {
210             map.put( s, map.get( s ) + 1 );
211         }
212         else {
213             map.put( s, 1 );
214         }
215     }
216
217     public static DescriptiveStatistics calculateDescriptiveStatisticsForMeanValues( final Set<DomainSimilarity> similarities ) {
218         final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
219         for( final DomainSimilarity similarity : similarities ) {
220             stats.addValue( similarity.getMeanSimilarityScore() );
221         }
222         return stats;
223     }
224
225     private static void calculateIndependentDomainCombinationGains( final Phylogeny local_phylogeny_l,
226                                                                     final String outfilename_for_counts,
227                                                                     final String outfilename_for_dc,
228                                                                     final String outfilename_for_dc_for_go_mapping,
229                                                                     final String outfilename_for_dc_for_go_mapping_unique,
230                                                                     final String outfilename_for_rank_counts,
231                                                                     final String outfilename_for_ancestor_species_counts,
232                                                                     final String outfilename_for_protein_stats,
233                                                                     final Map<String, DescriptiveStatistics> protein_length_stats_by_dc,
234                                                                     final Map<String, DescriptiveStatistics> domain_number_stats_by_dc,
235                                                                     final Map<String, DescriptiveStatistics> domain_length_stats_by_domain ) {
236         try {
237             //
238             //            if ( protein_length_stats_by_dc != null ) {
239             //                for( final Entry<?, DescriptiveStatistics> entry : protein_length_stats_by_dc.entrySet() ) {
240             //                    System.out.print( entry.getKey().toString() );
241             //                    System.out.print( ": " );
242             //                    double[] a = entry.getValue().getDataAsDoubleArray();
243             //                    for( int i = 0; i < a.length; i++ ) {
244             //                        System.out.print( a[ i ] + " " );
245             //                    }
246             //                    System.out.println();
247             //                }
248             //            }
249             //            if ( domain_number_stats_by_dc != null ) {
250             //                for( final Entry<?, DescriptiveStatistics> entry : domain_number_stats_by_dc.entrySet() ) {
251             //                    System.out.print( entry.getKey().toString() );
252             //                    System.out.print( ": " );
253             //                    double[] a = entry.getValue().getDataAsDoubleArray();
254             //                    for( int i = 0; i < a.length; i++ ) {
255             //                        System.out.print( a[ i ] + " " );
256             //                    }
257             //                    System.out.println();
258             //                }
259             //            }
260             //
261             final BufferedWriter out_counts = new BufferedWriter( new FileWriter( outfilename_for_counts ) );
262             final BufferedWriter out_dc = new BufferedWriter( new FileWriter( outfilename_for_dc ) );
263             final BufferedWriter out_dc_for_go_mapping = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping ) );
264             final BufferedWriter out_dc_for_go_mapping_unique = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping_unique ) );
265             final SortedMap<String, Integer> dc_gain_counts = new TreeMap<String, Integer>();
266             for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorPostorder(); it.hasNext(); ) {
267                 final PhylogenyNode n = it.next();
268                 final Set<String> gained_dc = n.getNodeData().getBinaryCharacters().getGainedCharacters();
269                 for( final String dc : gained_dc ) {
270                     if ( dc_gain_counts.containsKey( dc ) ) {
271                         dc_gain_counts.put( dc, dc_gain_counts.get( dc ) + 1 );
272                     }
273                     else {
274                         dc_gain_counts.put( dc, 1 );
275                     }
276                 }
277             }
278             final SortedMap<Integer, Integer> histogram = new TreeMap<Integer, Integer>();
279             final SortedMap<Integer, StringBuilder> domain_lists = new TreeMap<Integer, StringBuilder>();
280             final SortedMap<Integer, DescriptiveStatistics> dc_reapp_counts_to_protein_length_stats = new TreeMap<Integer, DescriptiveStatistics>();
281             final SortedMap<Integer, DescriptiveStatistics> dc_reapp_counts_to_domain_number_stats = new TreeMap<Integer, DescriptiveStatistics>();
282             final SortedMap<Integer, DescriptiveStatistics> dc_reapp_counts_to_domain_lengths_stats = new TreeMap<Integer, DescriptiveStatistics>();
283             final SortedMap<Integer, PriorityQueue<String>> domain_lists_go = new TreeMap<Integer, PriorityQueue<String>>();
284             final SortedMap<Integer, SortedSet<String>> domain_lists_go_unique = new TreeMap<Integer, SortedSet<String>>();
285             final Set<String> dcs = dc_gain_counts.keySet();
286             final SortedSet<String> more_than_once = new TreeSet<String>();
287             DescriptiveStatistics gained_once_lengths_stats = new BasicDescriptiveStatistics();
288             DescriptiveStatistics gained_once_domain_count_stats = new BasicDescriptiveStatistics();
289             DescriptiveStatistics gained_multiple_times_lengths_stats = new BasicDescriptiveStatistics();
290             final DescriptiveStatistics gained_multiple_times_domain_count_stats = new BasicDescriptiveStatistics();
291             long gained_multiple_times_domain_length_sum = 0;
292             long gained_once_domain_length_sum = 0;
293             long gained_multiple_times_domain_length_count = 0;
294             long gained_once_domain_length_count = 0;
295             for( final String dc : dcs ) {
296                 final int count = dc_gain_counts.get( dc );
297                 if ( histogram.containsKey( count ) ) {
298                     histogram.put( count, histogram.get( count ) + 1 );
299                     domain_lists.get( count ).append( ", " + dc );
300                     domain_lists_go.get( count ).addAll( splitDomainCombination( dc ) );
301                     domain_lists_go_unique.get( count ).addAll( splitDomainCombination( dc ) );
302                 }
303                 else {
304                     histogram.put( count, 1 );
305                     domain_lists.put( count, new StringBuilder( dc ) );
306                     final PriorityQueue<String> q = new PriorityQueue<String>();
307                     q.addAll( splitDomainCombination( dc ) );
308                     domain_lists_go.put( count, q );
309                     final SortedSet<String> set = new TreeSet<String>();
310                     set.addAll( splitDomainCombination( dc ) );
311                     domain_lists_go_unique.put( count, set );
312                 }
313                 if ( protein_length_stats_by_dc != null ) {
314                     if ( !dc_reapp_counts_to_protein_length_stats.containsKey( count ) ) {
315                         dc_reapp_counts_to_protein_length_stats.put( count, new BasicDescriptiveStatistics() );
316                     }
317                     dc_reapp_counts_to_protein_length_stats.get( count ).addValue( protein_length_stats_by_dc.get( dc )
318                             .arithmeticMean() );
319                 }
320                 if ( domain_number_stats_by_dc != null ) {
321                     if ( !dc_reapp_counts_to_domain_number_stats.containsKey( count ) ) {
322                         dc_reapp_counts_to_domain_number_stats.put( count, new BasicDescriptiveStatistics() );
323                     }
324                     dc_reapp_counts_to_domain_number_stats.get( count ).addValue( domain_number_stats_by_dc.get( dc )
325                             .arithmeticMean() );
326                 }
327                 if ( domain_length_stats_by_domain != null ) {
328                     if ( !dc_reapp_counts_to_domain_lengths_stats.containsKey( count ) ) {
329                         dc_reapp_counts_to_domain_lengths_stats.put( count, new BasicDescriptiveStatistics() );
330                     }
331                     final String[] ds = dc.split( "=" );
332                     dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain
333                             .get( ds[ 0 ] ).arithmeticMean() );
334                     dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain
335                             .get( ds[ 1 ] ).arithmeticMean() );
336                 }
337                 if ( count > 1 ) {
338                     more_than_once.add( dc );
339                     if ( protein_length_stats_by_dc != null ) {
340                         final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc );
341                         for( final double element : s.getData() ) {
342                             gained_multiple_times_lengths_stats.addValue( element );
343                         }
344                     }
345                     if ( domain_number_stats_by_dc != null ) {
346                         final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc );
347                         for( final double element : s.getData() ) {
348                             gained_multiple_times_domain_count_stats.addValue( element );
349                         }
350                     }
351                     if ( domain_length_stats_by_domain != null ) {
352                         final String[] ds = dc.split( "=" );
353                         final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] );
354                         final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] );
355                         for( final double element : s0.getData() ) {
356                             gained_multiple_times_domain_length_sum += element;
357                             ++gained_multiple_times_domain_length_count;
358                         }
359                         for( final double element : s1.getData() ) {
360                             gained_multiple_times_domain_length_sum += element;
361                             ++gained_multiple_times_domain_length_count;
362                         }
363                     }
364                 }
365                 else {
366                     if ( protein_length_stats_by_dc != null ) {
367                         final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc );
368                         for( final double element : s.getData() ) {
369                             gained_once_lengths_stats.addValue( element );
370                         }
371                     }
372                     if ( domain_number_stats_by_dc != null ) {
373                         final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc );
374                         for( final double element : s.getData() ) {
375                             gained_once_domain_count_stats.addValue( element );
376                         }
377                     }
378                     if ( domain_length_stats_by_domain != null ) {
379                         final String[] ds = dc.split( "=" );
380                         final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] );
381                         final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] );
382                         for( final double element : s0.getData() ) {
383                             gained_once_domain_length_sum += element;
384                             ++gained_once_domain_length_count;
385                         }
386                         for( final double element : s1.getData() ) {
387                             gained_once_domain_length_sum += element;
388                             ++gained_once_domain_length_count;
389                         }
390                     }
391                 }
392             }
393             final Set<Integer> histogram_keys = histogram.keySet();
394             for( final Integer histogram_key : histogram_keys ) {
395                 final int count = histogram.get( histogram_key );
396                 final StringBuilder dc = domain_lists.get( histogram_key );
397                 out_counts.write( histogram_key + "\t" + count + ForesterUtil.LINE_SEPARATOR );
398                 out_dc.write( histogram_key + "\t" + dc + ForesterUtil.LINE_SEPARATOR );
399                 out_dc_for_go_mapping.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR );
400                 final Object[] sorted = domain_lists_go.get( histogram_key ).toArray();
401                 Arrays.sort( sorted );
402                 for( final Object domain : sorted ) {
403                     out_dc_for_go_mapping.write( domain + ForesterUtil.LINE_SEPARATOR );
404                 }
405                 out_dc_for_go_mapping_unique.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR );
406                 for( final String domain : domain_lists_go_unique.get( histogram_key ) ) {
407                     out_dc_for_go_mapping_unique.write( domain + ForesterUtil.LINE_SEPARATOR );
408                 }
409             }
410             out_counts.close();
411             out_dc.close();
412             out_dc_for_go_mapping.close();
413             out_dc_for_go_mapping_unique.close();
414             final SortedMap<String, Integer> lca_rank_counts = new TreeMap<String, Integer>();
415             final SortedMap<String, Integer> lca_ancestor_species_counts = new TreeMap<String, Integer>();
416             for( final String dc : more_than_once ) {
417                 final List<PhylogenyNode> nodes = new ArrayList<PhylogenyNode>();
418                 for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorExternalForward(); it.hasNext(); ) {
419                     final PhylogenyNode n = it.next();
420                     if ( n.getNodeData().getBinaryCharacters().getGainedCharacters().contains( dc ) ) {
421                         nodes.add( n );
422                     }
423                 }
424                 for( int i = 0; i < ( nodes.size() - 1 ); ++i ) {
425                     for( int j = i + 1; j < nodes.size(); ++j ) {
426                         final PhylogenyNode lca = PhylogenyMethods.calculateLCA( nodes.get( i ), nodes.get( j ) );
427                         String rank = "unknown";
428                         if ( lca.getNodeData().isHasTaxonomy()
429                                 && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getRank() ) ) {
430                             rank = lca.getNodeData().getTaxonomy().getRank();
431                         }
432                         addToCountMap( lca_rank_counts, rank );
433                         String lca_species;
434                         if ( lca.getNodeData().isHasTaxonomy()
435                                 && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getScientificName() ) ) {
436                             lca_species = lca.getNodeData().getTaxonomy().getScientificName();
437                         }
438                         else if ( lca.getNodeData().isHasTaxonomy()
439                                 && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getCommonName() ) ) {
440                             lca_species = lca.getNodeData().getTaxonomy().getCommonName();
441                         }
442                         else {
443                             lca_species = lca.getName();
444                         }
445                         addToCountMap( lca_ancestor_species_counts, lca_species );
446                     }
447                 }
448             }
449             final BufferedWriter out_for_rank_counts = new BufferedWriter( new FileWriter( outfilename_for_rank_counts ) );
450             final BufferedWriter out_for_ancestor_species_counts = new BufferedWriter( new FileWriter( outfilename_for_ancestor_species_counts ) );
451             ForesterUtil.map2writer( out_for_rank_counts, lca_rank_counts, "\t", ForesterUtil.LINE_SEPARATOR );
452             ForesterUtil.map2writer( out_for_ancestor_species_counts,
453                                      lca_ancestor_species_counts,
454                                      "\t",
455                                      ForesterUtil.LINE_SEPARATOR );
456             out_for_rank_counts.close();
457             out_for_ancestor_species_counts.close();
458             if ( !ForesterUtil.isEmpty( outfilename_for_protein_stats )
459                     && ( ( domain_length_stats_by_domain != null ) || ( protein_length_stats_by_dc != null ) || ( domain_number_stats_by_dc != null ) ) ) {
460                 final BufferedWriter w = new BufferedWriter( new FileWriter( outfilename_for_protein_stats ) );
461                 w.write( "Domain Lengths: " );
462                 w.write( "\n" );
463                 if ( domain_length_stats_by_domain != null ) {
464                     for( final Entry<Integer, DescriptiveStatistics> entry : dc_reapp_counts_to_domain_lengths_stats
465                             .entrySet() ) {
466                         w.write( entry.getKey().toString() );
467                         w.write( "\t" + entry.getValue().arithmeticMean() );
468                         w.write( "\t" + entry.getValue().median() );
469                         w.write( "\n" );
470                     }
471                 }
472                 w.flush();
473                 w.write( "\n" );
474                 w.write( "\n" );
475                 w.write( "Protein Lengths: " );
476                 w.write( "\n" );
477                 if ( protein_length_stats_by_dc != null ) {
478                     for( final Entry<Integer, DescriptiveStatistics> entry : dc_reapp_counts_to_protein_length_stats
479                             .entrySet() ) {
480                         w.write( entry.getKey().toString() );
481                         w.write( "\t" + entry.getValue().arithmeticMean() );
482                         w.write( "\t" + entry.getValue().median() );
483                         w.write( "\n" );
484                     }
485                 }
486                 w.flush();
487                 w.write( "\n" );
488                 w.write( "\n" );
489                 w.write( "Number of domains: " );
490                 w.write( "\n" );
491                 if ( domain_number_stats_by_dc != null ) {
492                     for( final Entry<Integer, DescriptiveStatistics> entry : dc_reapp_counts_to_domain_number_stats
493                             .entrySet() ) {
494                         w.write( entry.getKey().toString() );
495                         w.write( "\t" + entry.getValue().arithmeticMean() );
496                         w.write( "\t" + entry.getValue().median() );
497                         w.write( "\n" );
498                     }
499                 }
500                 w.flush();
501                 w.write( "\n" );
502                 w.write( "\n" );
503                 w.write( "Gained once, domain lengths:" );
504                 w.write( "\n" );
505                 w.write( "N: " + gained_once_domain_length_count );
506                 w.write( "\n" );
507                 w.write( "Avg: " + ( ( double ) gained_once_domain_length_sum / gained_once_domain_length_count ) );
508                 w.write( "\n" );
509                 w.write( "\n" );
510                 w.write( "Gained multiple times, domain lengths:" );
511                 w.write( "\n" );
512                 w.write( "N: " + gained_multiple_times_domain_length_count );
513                 w.write( "\n" );
514                 w.write( "Avg: "
515                         + ( ( double ) gained_multiple_times_domain_length_sum / gained_multiple_times_domain_length_count ) );
516                 w.write( "\n" );
517                 w.write( "\n" );
518                 w.write( "\n" );
519                 w.write( "\n" );
520                 w.write( "Gained once, protein lengths:" );
521                 w.write( "\n" );
522                 w.write( gained_once_lengths_stats.toString() );
523                 gained_once_lengths_stats = null;
524                 w.write( "\n" );
525                 w.write( "\n" );
526                 w.write( "Gained once, domain counts:" );
527                 w.write( "\n" );
528                 w.write( gained_once_domain_count_stats.toString() );
529                 gained_once_domain_count_stats = null;
530                 w.write( "\n" );
531                 w.write( "\n" );
532                 w.write( "Gained multiple times, protein lengths:" );
533                 w.write( "\n" );
534                 w.write( gained_multiple_times_lengths_stats.toString() );
535                 gained_multiple_times_lengths_stats = null;
536                 w.write( "\n" );
537                 w.write( "\n" );
538                 w.write( "Gained multiple times, domain counts:" );
539                 w.write( "\n" );
540                 w.write( gained_multiple_times_domain_count_stats.toString() );
541                 w.flush();
542                 w.close();
543             }
544         }
545         catch ( final IOException e ) {
546             ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e );
547         }
548         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch counts to ["
549                 + outfilename_for_counts + "]" );
550         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch lists to ["
551                 + outfilename_for_dc + "]" );
552         ForesterUtil.programMessage( surfacing.PRG_NAME,
553                                      "Wrote independent domain combination gains fitch lists to (for GO mapping) ["
554                                              + outfilename_for_dc_for_go_mapping + "]" );
555         ForesterUtil.programMessage( surfacing.PRG_NAME,
556                                      "Wrote independent domain combination gains fitch lists to (for GO mapping, unique) ["
557                                              + outfilename_for_dc_for_go_mapping_unique + "]" );
558     }
559
560     public static void checkForOutputFileWriteability( final File outfile ) {
561         final String error = ForesterUtil.isWritableFile( outfile );
562         if ( !ForesterUtil.isEmpty( error ) ) {
563             ForesterUtil.fatalError( surfacing.PRG_NAME, error );
564         }
565     }
566
567     public static void checkWriteabilityForPairwiseComparisons( final PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option,
568                                                                 final String[][] input_file_properties,
569                                                                 final String automated_pairwise_comparison_suffix,
570                                                                 final File outdir ) {
571         for( int i = 0; i < input_file_properties.length; ++i ) {
572             for( int j = 0; j < i; ++j ) {
573                 final String species_i = input_file_properties[ i ][ 1 ];
574                 final String species_j = input_file_properties[ j ][ 1 ];
575                 String pairwise_similarities_output_file_str = surfacing.PAIRWISE_DOMAIN_COMPARISONS_PREFIX + species_i
576                         + "_" + species_j + automated_pairwise_comparison_suffix;
577                 switch ( domain_similarity_print_option ) {
578                     case HTML:
579                         if ( !pairwise_similarities_output_file_str.endsWith( ".html" ) ) {
580                             pairwise_similarities_output_file_str += ".html";
581                         }
582                         break;
583                 }
584                 final String error = ForesterUtil
585                         .isWritableFile( new File( outdir == null ? pairwise_similarities_output_file_str : outdir
586                                 + ForesterUtil.FILE_SEPARATOR + pairwise_similarities_output_file_str ) );
587                 if ( !ForesterUtil.isEmpty( error ) ) {
588                     ForesterUtil.fatalError( surfacing.PRG_NAME, error );
589                 }
590             }
591         }
592     }
593
594     private static SortedSet<String> collectAllDomainsChangedOnSubtree( final PhylogenyNode subtree_root,
595                                                                         final boolean get_gains ) {
596         final SortedSet<String> domains = new TreeSet<String>();
597         for( final PhylogenyNode descendant : PhylogenyMethods.getAllDescendants( subtree_root ) ) {
598             final BinaryCharacters chars = descendant.getNodeData().getBinaryCharacters();
599             if ( get_gains ) {
600                 domains.addAll( chars.getGainedCharacters() );
601             }
602             else {
603                 domains.addAll( chars.getLostCharacters() );
604             }
605         }
606         return domains;
607     }
608
609     public static void collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( final CharacterStateMatrix<CharacterStateMatrix.GainLossStates> matrix,
610                                                                                            final BinaryDomainCombination.DomainCombinationType dc_type,
611                                                                                            final List<BinaryDomainCombination> all_binary_domains_combination_gained,
612                                                                                            final boolean get_gains ) {
613         final SortedSet<String> sorted_ids = new TreeSet<String>();
614         for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) {
615             sorted_ids.add( matrix.getIdentifier( i ) );
616         }
617         for( final String id : sorted_ids ) {
618             for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) {
619                 if ( ( get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) )
620                         || ( !get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.LOSS ) ) ) {
621                     if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT ) {
622                         all_binary_domains_combination_gained.add( AdjactantDirectedBinaryDomainCombination
623                                 .createInstance( matrix.getCharacter( c ) ) );
624                     }
625                     else if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED ) {
626                         all_binary_domains_combination_gained.add( DirectedBinaryDomainCombination
627                                 .createInstance( matrix.getCharacter( c ) ) );
628                     }
629                     else {
630                         all_binary_domains_combination_gained.add( BasicBinaryDomainCombination.createInstance( matrix
631                                 .getCharacter( c ) ) );
632                     }
633                 }
634             }
635         }
636     }
637
638     private static File createBaseDirForPerNodeDomainFiles( final String base_dir,
639                                                             final boolean domain_combinations,
640                                                             final CharacterStateMatrix.GainLossStates state,
641                                                             final String outfile ) {
642         File per_node_go_mapped_domain_gain_loss_files_base_dir = new File( new File( outfile ).getParent()
643                 + ForesterUtil.FILE_SEPARATOR + base_dir );
644         if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) {
645             per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir();
646         }
647         if ( domain_combinations ) {
648             per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
649                     + ForesterUtil.FILE_SEPARATOR + "DC" );
650         }
651         else {
652             per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
653                     + ForesterUtil.FILE_SEPARATOR + "DOMAINS" );
654         }
655         if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) {
656             per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir();
657         }
658         if ( state == GainLossStates.GAIN ) {
659             per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
660                     + ForesterUtil.FILE_SEPARATOR + "GAINS" );
661         }
662         else if ( state == GainLossStates.LOSS ) {
663             per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
664                     + ForesterUtil.FILE_SEPARATOR + "LOSSES" );
665         }
666         else {
667             per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
668                     + ForesterUtil.FILE_SEPARATOR + "PRESENT" );
669         }
670         if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) {
671             per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir();
672         }
673         return per_node_go_mapped_domain_gain_loss_files_base_dir;
674     }
675
676     public static Map<String, List<GoId>> createDomainIdToGoIdMap( final List<PfamToGoMapping> pfam_to_go_mappings ) {
677         final Map<String, List<GoId>> domain_id_to_go_ids_map = new HashMap<String, List<GoId>>( pfam_to_go_mappings.size() );
678         for( final PfamToGoMapping pfam_to_go : pfam_to_go_mappings ) {
679             if ( !domain_id_to_go_ids_map.containsKey( pfam_to_go.getKey() ) ) {
680                 domain_id_to_go_ids_map.put( pfam_to_go.getKey(), new ArrayList<GoId>() );
681             }
682             domain_id_to_go_ids_map.get( pfam_to_go.getKey() ).add( pfam_to_go.getValue() );
683         }
684         return domain_id_to_go_ids_map;
685     }
686
687     public static Map<String, Set<String>> createDomainIdToSecondaryFeaturesMap( final File secondary_features_map_file )
688             throws IOException {
689         final BasicTable<String> primary_table = BasicTableParser.parse( secondary_features_map_file, '\t' );
690         final Map<String, Set<String>> map = new TreeMap<String, Set<String>>();
691         for( int r = 0; r < primary_table.getNumberOfRows(); ++r ) {
692             final String domain_id = primary_table.getValue( 0, r );
693             if ( !map.containsKey( domain_id ) ) {
694                 map.put( domain_id, new HashSet<String>() );
695             }
696             map.get( domain_id ).add( primary_table.getValue( 1, r ) );
697         }
698         return map;
699     }
700
701     public static Phylogeny createNjTreeBasedOnMatrixToFile( final File nj_tree_outfile, final DistanceMatrix distance ) {
702         checkForOutputFileWriteability( nj_tree_outfile );
703         final NeighborJoining nj = NeighborJoining.createInstance();
704         final Phylogeny phylogeny = nj.execute( ( BasicSymmetricalDistanceMatrix ) distance );
705         phylogeny.setName( nj_tree_outfile.getName() );
706         writePhylogenyToFile( phylogeny, nj_tree_outfile.toString() );
707         return phylogeny;
708     }
709
710     public static StringBuilder createParametersAsString( final boolean ignore_dufs,
711                                                           final double e_value_max,
712                                                           final int max_allowed_overlap,
713                                                           final boolean no_engulfing_overlaps,
714                                                           final File cutoff_scores_file,
715                                                           final BinaryDomainCombination.DomainCombinationType dc_type ) {
716         final StringBuilder parameters_sb = new StringBuilder();
717         parameters_sb.append( "E-value: " + e_value_max );
718         if ( cutoff_scores_file != null ) {
719             parameters_sb.append( ", Cutoff-scores-file: " + cutoff_scores_file );
720         }
721         else {
722             parameters_sb.append( ", Cutoff-scores-file: not-set" );
723         }
724         if ( max_allowed_overlap != surfacing.MAX_ALLOWED_OVERLAP_DEFAULT ) {
725             parameters_sb.append( ", Max-overlap: " + max_allowed_overlap );
726         }
727         else {
728             parameters_sb.append( ", Max-overlap: not-set" );
729         }
730         if ( no_engulfing_overlaps ) {
731             parameters_sb.append( ", Engulfing-overlaps: not-allowed" );
732         }
733         else {
734             parameters_sb.append( ", Engulfing-overlaps: allowed" );
735         }
736         if ( ignore_dufs ) {
737             parameters_sb.append( ", Ignore-dufs: true" );
738         }
739         else {
740             parameters_sb.append( ", Ignore-dufs: false" );
741         }
742         parameters_sb.append( ", DC type (if applicable): " + dc_type );
743         return parameters_sb;
744     }
745
746     private static SortedSet<BinaryDomainCombination> createSetOfAllBinaryDomainCombinationsPerGenome( final GenomeWideCombinableDomains gwcd ) {
747         final SortedMap<String, CombinableDomains> cds = gwcd.getAllCombinableDomainsIds();
748         final SortedSet<BinaryDomainCombination> binary_combinations = new TreeSet<BinaryDomainCombination>();
749         for( final String domain_id : cds.keySet() ) {
750             final CombinableDomains cd = cds.get( domain_id );
751             binary_combinations.addAll( cd.toBinaryDomainCombinations() );
752         }
753         return binary_combinations;
754     }
755
756     public static void createSplitWriters( final File out_dir,
757                                            final String my_outfile,
758                                            final Map<Character, Writer> split_writers ) throws IOException {
759         split_writers.put( 'a', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
760                 + "_domains_A.html" ) ) );
761         split_writers.put( 'b', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
762                 + "_domains_B.html" ) ) );
763         split_writers.put( 'c', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
764                 + "_domains_C.html" ) ) );
765         split_writers.put( 'd', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
766                 + "_domains_D.html" ) ) );
767         split_writers.put( 'e', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
768                 + "_domains_E.html" ) ) );
769         split_writers.put( 'f', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
770                 + "_domains_F.html" ) ) );
771         split_writers.put( 'g', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
772                 + "_domains_G.html" ) ) );
773         split_writers.put( 'h', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
774                 + "_domains_H.html" ) ) );
775         split_writers.put( 'i', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
776                 + "_domains_I.html" ) ) );
777         split_writers.put( 'j', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
778                 + "_domains_J.html" ) ) );
779         split_writers.put( 'k', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
780                 + "_domains_K.html" ) ) );
781         split_writers.put( 'l', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
782                 + "_domains_L.html" ) ) );
783         split_writers.put( 'm', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
784                 + "_domains_M.html" ) ) );
785         split_writers.put( 'n', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
786                 + "_domains_N.html" ) ) );
787         split_writers.put( 'o', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
788                 + "_domains_O.html" ) ) );
789         split_writers.put( 'p', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
790                 + "_domains_P.html" ) ) );
791         split_writers.put( 'q', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
792                 + "_domains_Q.html" ) ) );
793         split_writers.put( 'r', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
794                 + "_domains_R.html" ) ) );
795         split_writers.put( 's', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
796                 + "_domains_S.html" ) ) );
797         split_writers.put( 't', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
798                 + "_domains_T.html" ) ) );
799         split_writers.put( 'u', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
800                 + "_domains_U.html" ) ) );
801         split_writers.put( 'v', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
802                 + "_domains_V.html" ) ) );
803         split_writers.put( 'w', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
804                 + "_domains_W.html" ) ) );
805         split_writers.put( 'x', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
806                 + "_domains_X.html" ) ) );
807         split_writers.put( 'y', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
808                 + "_domains_Y.html" ) ) );
809         split_writers.put( 'z', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
810                 + "_domains_Z.html" ) ) );
811         split_writers.put( '0', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile
812                 + "_domains_0.html" ) ) );
813     }
814
815     public static Map<String, Integer> createTaxCodeToIdMap( final Phylogeny phy ) {
816         final Map<String, Integer> m = new HashMap<String, Integer>();
817         for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) {
818             final PhylogenyNode n = iter.next();
819             if ( n.getNodeData().isHasTaxonomy() ) {
820                 final Taxonomy t = n.getNodeData().getTaxonomy();
821                 final String c = t.getTaxonomyCode();
822                 if ( !ForesterUtil.isEmpty( c ) ) {
823                     if ( n.getNodeData().getTaxonomy() == null ) {
824                         ForesterUtil.fatalError( surfacing.PRG_NAME, "no taxonomy id for node " + n );
825                     }
826                     final String id = n.getNodeData().getTaxonomy().getIdentifier().getValue();
827                     if ( ForesterUtil.isEmpty( id ) ) {
828                         ForesterUtil.fatalError( surfacing.PRG_NAME, "no taxonomy id for node " + n );
829                     }
830                     if ( m.containsKey( c ) ) {
831                         ForesterUtil.fatalError( surfacing.PRG_NAME, "taxonomy code " + c + " is not unique" );
832                     }
833                     final int iid = Integer.valueOf( id );
834                     if ( m.containsValue( iid ) ) {
835                         ForesterUtil.fatalError( surfacing.PRG_NAME, "taxonomy id " + iid + " is not unique" );
836                     }
837                     m.put( c, iid );
838                 }
839             }
840             else {
841                 ForesterUtil.fatalError( surfacing.PRG_NAME, "no taxonomy for node " + n );
842             }
843         }
844         return m;
845     }
846
847     public static void decoratePrintableDomainSimilarities( final SortedSet<DomainSimilarity> domain_similarities,
848                                                             final Detailedness detailedness ) {
849         for( final DomainSimilarity domain_similarity : domain_similarities ) {
850             if ( domain_similarity instanceof PrintableDomainSimilarity ) {
851                 final PrintableDomainSimilarity printable_domain_similarity = ( PrintableDomainSimilarity ) domain_similarity;
852                 printable_domain_similarity.setDetailedness( detailedness );
853             }
854         }
855     }
856
857     public static void doit( final List<Protein> proteins,
858                              final List<String> query_domain_ids_nc_order,
859                              final Writer out,
860                              final String separator,
861                              final String limit_to_species,
862                              final Map<String, List<Integer>> average_protein_lengths_by_dc ) throws IOException {
863         for( final Protein protein : proteins ) {
864             if ( ForesterUtil.isEmpty( limit_to_species )
865                     || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) {
866                 if ( protein.contains( query_domain_ids_nc_order, true ) ) {
867                     out.write( protein.getSpecies().getSpeciesId() );
868                     out.write( separator );
869                     out.write( protein.getProteinId().getId() );
870                     out.write( separator );
871                     out.write( "[" );
872                     final Set<String> visited_domain_ids = new HashSet<String>();
873                     boolean first = true;
874                     for( final Domain domain : protein.getProteinDomains() ) {
875                         if ( !visited_domain_ids.contains( domain.getDomainId() ) ) {
876                             visited_domain_ids.add( domain.getDomainId() );
877                             if ( first ) {
878                                 first = false;
879                             }
880                             else {
881                                 out.write( " " );
882                             }
883                             out.write( domain.getDomainId() );
884                             out.write( " {" );
885                             out.write( "" + domain.getTotalCount() );
886                             out.write( "}" );
887                         }
888                     }
889                     out.write( "]" );
890                     out.write( separator );
891                     if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription()
892                             .equals( SurfacingConstants.NONE ) ) ) {
893                         out.write( protein.getDescription() );
894                     }
895                     out.write( separator );
896                     if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession()
897                             .equals( SurfacingConstants.NONE ) ) ) {
898                         out.write( protein.getAccession() );
899                     }
900                     out.write( SurfacingConstants.NL );
901                 }
902             }
903         }
904         out.flush();
905     }
906
907     public static void domainsPerProteinsStatistics( final String genome,
908                                                      final List<Protein> protein_list,
909                                                      final DescriptiveStatistics all_genomes_domains_per_potein_stats,
910                                                      final SortedMap<Integer, Integer> all_genomes_domains_per_potein_histo,
911                                                      final SortedSet<String> domains_which_are_always_single,
912                                                      final SortedSet<String> domains_which_are_sometimes_single_sometimes_not,
913                                                      final SortedSet<String> domains_which_never_single,
914                                                      final Writer writer ) {
915         final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
916         for( final Protein protein : protein_list ) {
917             final int domains = protein.getNumberOfProteinDomains();
918             //System.out.println( domains );
919             stats.addValue( domains );
920             all_genomes_domains_per_potein_stats.addValue( domains );
921             if ( !all_genomes_domains_per_potein_histo.containsKey( domains ) ) {
922                 all_genomes_domains_per_potein_histo.put( domains, 1 );
923             }
924             else {
925                 all_genomes_domains_per_potein_histo.put( domains,
926                                                           1 + all_genomes_domains_per_potein_histo.get( domains ) );
927             }
928             if ( domains == 1 ) {
929                 final String domain = protein.getProteinDomain( 0 ).getDomainId();
930                 if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) {
931                     if ( domains_which_never_single.contains( domain ) ) {
932                         domains_which_never_single.remove( domain );
933                         domains_which_are_sometimes_single_sometimes_not.add( domain );
934                     }
935                     else {
936                         domains_which_are_always_single.add( domain );
937                     }
938                 }
939             }
940             else if ( domains > 1 ) {
941                 for( final Domain d : protein.getProteinDomains() ) {
942                     final String domain = d.getDomainId();
943                     // System.out.println( domain );
944                     if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) {
945                         if ( domains_which_are_always_single.contains( domain ) ) {
946                             domains_which_are_always_single.remove( domain );
947                             domains_which_are_sometimes_single_sometimes_not.add( domain );
948                         }
949                         else {
950                             domains_which_never_single.add( domain );
951                         }
952                     }
953                 }
954             }
955         }
956         try {
957             writer.write( genome );
958             writer.write( "\t" );
959             if ( stats.getN() >= 1 ) {
960                 writer.write( stats.arithmeticMean() + "" );
961                 writer.write( "\t" );
962                 if ( stats.getN() >= 2 ) {
963                     writer.write( stats.sampleStandardDeviation() + "" );
964                 }
965                 else {
966                     writer.write( "" );
967                 }
968                 writer.write( "\t" );
969                 writer.write( stats.median() + "" );
970                 writer.write( "\t" );
971                 writer.write( stats.getN() + "" );
972                 writer.write( "\t" );
973                 writer.write( stats.getMin() + "" );
974                 writer.write( "\t" );
975                 writer.write( stats.getMax() + "" );
976             }
977             else {
978                 writer.write( "\t" );
979                 writer.write( "\t" );
980                 writer.write( "\t" );
981                 writer.write( "0" );
982                 writer.write( "\t" );
983                 writer.write( "\t" );
984             }
985             writer.write( "\n" );
986         }
987         catch ( final IOException e ) {
988             e.printStackTrace();
989         }
990     }
991
992     public static void executeDomainLengthAnalysis( final String[][] input_file_properties,
993                                                     final int number_of_genomes,
994                                                     final DomainLengthsTable domain_lengths_table,
995                                                     final File outfile ) throws IOException {
996         final DecimalFormat df = new DecimalFormat( "#.00" );
997         checkForOutputFileWriteability( outfile );
998         final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
999         out.write( "MEAN BASED STATISTICS PER SPECIES" );
1000         out.write( ForesterUtil.LINE_SEPARATOR );
1001         out.write( domain_lengths_table.createMeanBasedStatisticsPerSpeciesTable().toString() );
1002         out.write( ForesterUtil.LINE_SEPARATOR );
1003         out.write( ForesterUtil.LINE_SEPARATOR );
1004         final List<DomainLengths> domain_lengths_list = domain_lengths_table.getDomainLengthsList();
1005         out.write( "OUTLIER SPECIES PER DOMAIN (Z>=1.5)" );
1006         out.write( ForesterUtil.LINE_SEPARATOR );
1007         for( final DomainLengths domain_lengths : domain_lengths_list ) {
1008             final List<Species> species_list = domain_lengths.getMeanBasedOutlierSpecies( 1.5 );
1009             if ( species_list.size() > 0 ) {
1010                 out.write( domain_lengths.getDomainId() + "\t" );
1011                 for( final Species species : species_list ) {
1012                     out.write( species + "\t" );
1013                 }
1014                 out.write( ForesterUtil.LINE_SEPARATOR );
1015             }
1016         }
1017         out.write( ForesterUtil.LINE_SEPARATOR );
1018         out.write( ForesterUtil.LINE_SEPARATOR );
1019         out.write( "OUTLIER SPECIES (Z 1.0)" );
1020         out.write( ForesterUtil.LINE_SEPARATOR );
1021         final DescriptiveStatistics stats_for_all_species = domain_lengths_table
1022                 .calculateMeanBasedStatisticsForAllSpecies();
1023         out.write( stats_for_all_species.asSummary() );
1024         out.write( ForesterUtil.LINE_SEPARATOR );
1025         final AsciiHistogram histo = new AsciiHistogram( stats_for_all_species );
1026         out.write( histo.toStringBuffer( 40, '=', 60, 4 ).toString() );
1027         out.write( ForesterUtil.LINE_SEPARATOR );
1028         final double population_sd = stats_for_all_species.sampleStandardDeviation();
1029         final double population_mean = stats_for_all_species.arithmeticMean();
1030         for( final Species species : domain_lengths_table.getSpecies() ) {
1031             final double x = domain_lengths_table.calculateMeanBasedStatisticsForSpecies( species ).arithmeticMean();
1032             final double z = ( x - population_mean ) / population_sd;
1033             out.write( species + "\t" + z );
1034             out.write( ForesterUtil.LINE_SEPARATOR );
1035         }
1036         out.write( ForesterUtil.LINE_SEPARATOR );
1037         for( final Species species : domain_lengths_table.getSpecies() ) {
1038             final DescriptiveStatistics stats_for_species = domain_lengths_table
1039                     .calculateMeanBasedStatisticsForSpecies( species );
1040             final double x = stats_for_species.arithmeticMean();
1041             final double z = ( x - population_mean ) / population_sd;
1042             if ( ( z <= -1.0 ) || ( z >= 1.0 ) ) {
1043                 out.write( species + "\t" + df.format( z ) + "\t" + stats_for_species.asSummary() );
1044                 out.write( ForesterUtil.LINE_SEPARATOR );
1045             }
1046         }
1047         out.close();
1048         System.gc();
1049     }
1050
1051     /**
1052      * Warning: This side-effects 'all_bin_domain_combinations_encountered'!
1053      * 
1054      * 
1055      * @param output_file
1056      * @param all_bin_domain_combinations_changed
1057      * @param sum_of_all_domains_encountered
1058      * @param all_bin_domain_combinations_encountered
1059      * @param is_gains_analysis
1060      * @param protein_length_stats_by_dc 
1061      * @throws IOException
1062      */
1063     public static void executeFitchGainsAnalysis( final File output_file,
1064                                                   final List<BinaryDomainCombination> all_bin_domain_combinations_changed,
1065                                                   final int sum_of_all_domains_encountered,
1066                                                   final SortedSet<BinaryDomainCombination> all_bin_domain_combinations_encountered,
1067                                                   final boolean is_gains_analysis ) throws IOException {
1068         checkForOutputFileWriteability( output_file );
1069         final Writer out = ForesterUtil.createBufferedWriter( output_file );
1070         final SortedMap<Object, Integer> bdc_to_counts = ForesterUtil
1071                 .listToSortedCountsMap( all_bin_domain_combinations_changed );
1072         final SortedSet<String> all_domains_in_combination_changed_more_than_once = new TreeSet<String>();
1073         final SortedSet<String> all_domains_in_combination_changed_only_once = new TreeSet<String>();
1074         int above_one = 0;
1075         int one = 0;
1076         for( final Object bdc_object : bdc_to_counts.keySet() ) {
1077             final BinaryDomainCombination bdc = ( BinaryDomainCombination ) bdc_object;
1078             final int count = bdc_to_counts.get( bdc_object );
1079             if ( count < 1 ) {
1080                 ForesterUtil.unexpectedFatalError( surfacing.PRG_NAME, "count < 1 " );
1081             }
1082             out.write( bdc + "\t" + count + ForesterUtil.LINE_SEPARATOR );
1083             if ( count > 1 ) {
1084                 all_domains_in_combination_changed_more_than_once.add( bdc.getId0() );
1085                 all_domains_in_combination_changed_more_than_once.add( bdc.getId1() );
1086                 above_one++;
1087             }
1088             else if ( count == 1 ) {
1089                 all_domains_in_combination_changed_only_once.add( bdc.getId0() );
1090                 all_domains_in_combination_changed_only_once.add( bdc.getId1() );
1091                 one++;
1092             }
1093         }
1094         final int all = all_bin_domain_combinations_encountered.size();
1095         int never_lost = -1;
1096         if ( !is_gains_analysis ) {
1097             all_bin_domain_combinations_encountered.removeAll( all_bin_domain_combinations_changed );
1098             never_lost = all_bin_domain_combinations_encountered.size();
1099             for( final BinaryDomainCombination bdc : all_bin_domain_combinations_encountered ) {
1100                 out.write( bdc + "\t" + "0" + ForesterUtil.LINE_SEPARATOR );
1101             }
1102         }
1103         if ( is_gains_analysis ) {
1104             out.write( "Sum of all distinct domain combinations appearing once               : " + one
1105                     + ForesterUtil.LINE_SEPARATOR );
1106             out.write( "Sum of all distinct domain combinations appearing more than once     : " + above_one
1107                     + ForesterUtil.LINE_SEPARATOR );
1108             out.write( "Sum of all distinct domains in combinations apppearing only once     : "
1109                     + all_domains_in_combination_changed_only_once.size() + ForesterUtil.LINE_SEPARATOR );
1110             out.write( "Sum of all distinct domains in combinations apppearing more than once: "
1111                     + all_domains_in_combination_changed_more_than_once.size() + ForesterUtil.LINE_SEPARATOR );
1112         }
1113         else {
1114             out.write( "Sum of all distinct domain combinations never lost                   : " + never_lost
1115                     + ForesterUtil.LINE_SEPARATOR );
1116             out.write( "Sum of all distinct domain combinations lost once                    : " + one
1117                     + ForesterUtil.LINE_SEPARATOR );
1118             out.write( "Sum of all distinct domain combinations lost more than once          : " + above_one
1119                     + ForesterUtil.LINE_SEPARATOR );
1120             out.write( "Sum of all distinct domains in combinations lost only once           : "
1121                     + all_domains_in_combination_changed_only_once.size() + ForesterUtil.LINE_SEPARATOR );
1122             out.write( "Sum of all distinct domains in combinations lost more than once: "
1123                     + all_domains_in_combination_changed_more_than_once.size() + ForesterUtil.LINE_SEPARATOR );
1124         }
1125         out.write( "All binary combinations                                              : " + all
1126                 + ForesterUtil.LINE_SEPARATOR );
1127         out.write( "All domains                                                          : "
1128                 + sum_of_all_domains_encountered );
1129         out.close();
1130         ForesterUtil.programMessage( surfacing.PRG_NAME,
1131                                      "Wrote fitch domain combination dynamics counts analysis to \"" + output_file
1132                                              + "\"" );
1133     }
1134
1135     /**
1136      * 
1137      * @param all_binary_domains_combination_lost_fitch 
1138      * @param use_last_in_fitch_parsimony 
1139      * @param consider_directedness_and_adjacency_for_bin_combinations 
1140      * @param all_binary_domains_combination_gained if null ignored, otherwise this is to list all binary domain combinations
1141      * which were gained under unweighted (Fitch) parsimony.
1142      */
1143     public static void executeParsimonyAnalysis( final long random_number_seed_for_fitch_parsimony,
1144                                                  final boolean radomize_fitch_parsimony,
1145                                                  final String outfile_name,
1146                                                  final DomainParsimonyCalculator domain_parsimony,
1147                                                  final Phylogeny phylogeny,
1148                                                  final Map<String, List<GoId>> domain_id_to_go_ids_map,
1149                                                  final Map<GoId, GoTerm> go_id_to_term_map,
1150                                                  final GoNameSpace go_namespace_limit,
1151                                                  final String parameters_str,
1152                                                  final Map<String, Set<String>>[] domain_id_to_secondary_features_maps,
1153                                                  final SortedSet<String> positive_filter,
1154                                                  final boolean output_binary_domain_combinations_for_graphs,
1155                                                  final List<BinaryDomainCombination> all_binary_domains_combination_gained_fitch,
1156                                                  final List<BinaryDomainCombination> all_binary_domains_combination_lost_fitch,
1157                                                  final BinaryDomainCombination.DomainCombinationType dc_type,
1158                                                  final Map<String, DescriptiveStatistics> protein_length_stats_by_dc,
1159                                                  final Map<String, DescriptiveStatistics> domain_number_stats_by_dc,
1160                                                  final Map<String, DescriptiveStatistics> domain_length_stats_by_domain,
1161                                                  final Map<String, Integer> tax_code_to_id_map,
1162                                                  final boolean write_to_nexus,
1163                                                  final boolean use_last_in_fitch_parsimony ) {
1164         final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR;
1165         final String date_time = ForesterUtil.getCurrentDateTime();
1166         final SortedSet<String> all_pfams_encountered = new TreeSet<String>();
1167         final SortedSet<String> all_pfams_gained_as_domains = new TreeSet<String>();
1168         final SortedSet<String> all_pfams_lost_as_domains = new TreeSet<String>();
1169         final SortedSet<String> all_pfams_gained_as_dom_combinations = new TreeSet<String>();
1170         final SortedSet<String> all_pfams_lost_as_dom_combinations = new TreeSet<String>();
1171         if ( write_to_nexus ) {
1172             writeToNexus( outfile_name, domain_parsimony, phylogeny );
1173         }
1174         // DOLLO DOMAINS
1175         // -------------
1176         Phylogeny local_phylogeny_l = phylogeny.copy();
1177         if ( ( positive_filter != null ) && ( positive_filter.size() > 0 ) ) {
1178             domain_parsimony.executeDolloParsimonyOnDomainPresence( positive_filter );
1179         }
1180         else {
1181             domain_parsimony.executeDolloParsimonyOnDomainPresence();
1182         }
1183         SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossMatrix(), outfile_name
1184                 + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_DOMAINS, Format.FORESTER );
1185         SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossCountsMatrix(), outfile_name
1186                 + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_DOMAINS, Format.FORESTER );
1187         SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
1188                                                            CharacterStateMatrix.GainLossStates.GAIN,
1189                                                            outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_D,
1190                                                            sep,
1191                                                            ForesterUtil.LINE_SEPARATOR,
1192                                                            null );
1193         SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
1194                                                            CharacterStateMatrix.GainLossStates.LOSS,
1195                                                            outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_LOSSES_D,
1196                                                            sep,
1197                                                            ForesterUtil.LINE_SEPARATOR,
1198                                                            null );
1199         SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), null, outfile_name
1200                 + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_D, sep, ForesterUtil.LINE_SEPARATOR, null );
1201         //HTML:
1202         writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
1203                                        go_id_to_term_map,
1204                                        go_namespace_limit,
1205                                        false,
1206                                        domain_parsimony.getGainLossMatrix(),
1207                                        CharacterStateMatrix.GainLossStates.GAIN,
1208                                        outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_HTML_D,
1209                                        sep,
1210                                        ForesterUtil.LINE_SEPARATOR,
1211                                        "Dollo Parsimony | Gains | Domains",
1212                                        "+",
1213                                        domain_id_to_secondary_features_maps,
1214                                        all_pfams_encountered,
1215                                        all_pfams_gained_as_domains,
1216                                        "_dollo_gains_d",
1217                                        tax_code_to_id_map );
1218         writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
1219                                        go_id_to_term_map,
1220                                        go_namespace_limit,
1221                                        false,
1222                                        domain_parsimony.getGainLossMatrix(),
1223                                        CharacterStateMatrix.GainLossStates.LOSS,
1224                                        outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_LOSSES_HTML_D,
1225                                        sep,
1226                                        ForesterUtil.LINE_SEPARATOR,
1227                                        "Dollo Parsimony | Losses | Domains",
1228                                        "-",
1229                                        domain_id_to_secondary_features_maps,
1230                                        all_pfams_encountered,
1231                                        all_pfams_lost_as_domains,
1232                                        "_dollo_losses_d",
1233                                        tax_code_to_id_map );
1234         //        writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
1235         //                                       go_id_to_term_map,
1236         //                                       go_namespace_limit,
1237         //                                       false,
1238         //                                       domain_parsimony.getGainLossMatrix(),
1239         //                                       null,
1240         //                                       outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_HTML_D,
1241         //                                       sep,
1242         //                                       ForesterUtil.LINE_SEPARATOR,
1243         //                                       "Dollo Parsimony | Present | Domains",
1244         //                                       "",
1245         //                                       domain_id_to_secondary_features_maps,
1246         //                                       all_pfams_encountered,
1247         //                                       null,
1248         //                                       "_dollo_present_d",
1249         //                                       tax_code_to_id_map );
1250         preparePhylogeny( local_phylogeny_l,
1251                           domain_parsimony,
1252                           date_time,
1253                           "Dollo parsimony on domain presence/absence",
1254                           "dollo_on_domains_" + outfile_name,
1255                           parameters_str );
1256         SurfacingUtil.writePhylogenyToFile( local_phylogeny_l, outfile_name
1257                 + surfacing.DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO );
1258         try {
1259             writeAllDomainsChangedOnAllSubtrees( local_phylogeny_l, true, outfile_name, "_dollo_all_gains_d" );
1260             writeAllDomainsChangedOnAllSubtrees( local_phylogeny_l, false, outfile_name, "_dollo_all_losses_d" );
1261         }
1262         catch ( final IOException e ) {
1263             e.printStackTrace();
1264             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
1265         }
1266         if ( domain_parsimony.calculateNumberOfBinaryDomainCombination() > 0 ) {
1267             // FITCH DOMAIN COMBINATIONS
1268             // -------------------------
1269             local_phylogeny_l = phylogeny.copy();
1270             String randomization = "no";
1271             if ( radomize_fitch_parsimony ) {
1272                 domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( random_number_seed_for_fitch_parsimony );
1273                 randomization = "yes, seed = " + random_number_seed_for_fitch_parsimony;
1274             }
1275             else {
1276                 domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( use_last_in_fitch_parsimony );
1277             }
1278             SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossMatrix(), outfile_name
1279                     + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER );
1280             SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossCountsMatrix(), outfile_name
1281                     + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER );
1282             SurfacingUtil
1283                     .writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
1284                                                           CharacterStateMatrix.GainLossStates.GAIN,
1285                                                           outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_GAINS_BC,
1286                                                           sep,
1287                                                           ForesterUtil.LINE_SEPARATOR,
1288                                                           null );
1289             SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
1290                                                                CharacterStateMatrix.GainLossStates.LOSS,
1291                                                                outfile_name
1292                                                                        + surfacing.PARSIMONY_OUTPUT_FITCH_LOSSES_BC,
1293                                                                sep,
1294                                                                ForesterUtil.LINE_SEPARATOR,
1295                                                                null );
1296             SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), null, outfile_name
1297                     + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_BC, sep, ForesterUtil.LINE_SEPARATOR, null );
1298             if ( all_binary_domains_combination_gained_fitch != null ) {
1299                 collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
1300                                                                                     dc_type,
1301                                                                                     all_binary_domains_combination_gained_fitch,
1302                                                                                     true );
1303             }
1304             if ( all_binary_domains_combination_lost_fitch != null ) {
1305                 collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(),
1306                                                                                     dc_type,
1307                                                                                     all_binary_domains_combination_lost_fitch,
1308                                                                                     false );
1309             }
1310             if ( output_binary_domain_combinations_for_graphs ) {
1311                 SurfacingUtil
1312                         .writeBinaryStatesMatrixAsListToFileForBinaryCombinationsForGraphAnalysis( domain_parsimony
1313                                                                                                            .getGainLossMatrix(),
1314                                                                                                    null,
1315                                                                                                    outfile_name
1316                                                                                                            + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_BC_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS,
1317                                                                                                    sep,
1318                                                                                                    ForesterUtil.LINE_SEPARATOR,
1319                                                                                                    BinaryDomainCombination.OutputFormat.DOT );
1320             }
1321             // HTML:
1322             writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
1323                                            go_id_to_term_map,
1324                                            go_namespace_limit,
1325                                            true,
1326                                            domain_parsimony.getGainLossMatrix(),
1327                                            CharacterStateMatrix.GainLossStates.GAIN,
1328                                            outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_GAINS_HTML_BC,
1329                                            sep,
1330                                            ForesterUtil.LINE_SEPARATOR,
1331                                            "Fitch Parsimony | Gains | Domain Combinations",
1332                                            "+",
1333                                            null,
1334                                            all_pfams_encountered,
1335                                            all_pfams_gained_as_dom_combinations,
1336                                            "_fitch_gains_dc",
1337                                            tax_code_to_id_map );
1338             writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
1339                                            go_id_to_term_map,
1340                                            go_namespace_limit,
1341                                            true,
1342                                            domain_parsimony.getGainLossMatrix(),
1343                                            CharacterStateMatrix.GainLossStates.LOSS,
1344                                            outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_LOSSES_HTML_BC,
1345                                            sep,
1346                                            ForesterUtil.LINE_SEPARATOR,
1347                                            "Fitch Parsimony | Losses | Domain Combinations",
1348                                            "-",
1349                                            null,
1350                                            all_pfams_encountered,
1351                                            all_pfams_lost_as_dom_combinations,
1352                                            "_fitch_losses_dc",
1353                                            tax_code_to_id_map );
1354             //            writeBinaryStatesMatrixToList( domain_id_to_go_ids_map,
1355             //                                           go_id_to_term_map,
1356             //                                           go_namespace_limit,
1357             //                                           true,
1358             //                                           domain_parsimony.getGainLossMatrix(),
1359             //                                           null,
1360             //                                           outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_HTML_BC,
1361             //                                           sep,
1362             //                                           ForesterUtil.LINE_SEPARATOR,
1363             //                                           "Fitch Parsimony | Present | Domain Combinations",
1364             //                                           "",
1365             //                                           null,
1366             //                                           all_pfams_encountered,
1367             //                                           null,
1368             //                                           "_fitch_present_dc",
1369             //                                           tax_code_to_id_map );
1370             writeAllEncounteredPfamsToFile( domain_id_to_go_ids_map,
1371                                             go_id_to_term_map,
1372                                             outfile_name,
1373                                             all_pfams_encountered );
1374             writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_GAINED_AS_DOMAINS_SUFFIX, all_pfams_gained_as_domains );
1375             writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_LOST_AS_DOMAINS_SUFFIX, all_pfams_lost_as_domains );
1376             writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_GAINED_AS_DC_SUFFIX,
1377                               all_pfams_gained_as_dom_combinations );
1378             writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_LOST_AS_DC_SUFFIX, all_pfams_lost_as_dom_combinations );
1379             preparePhylogeny( local_phylogeny_l,
1380                               domain_parsimony,
1381                               date_time,
1382                               "Fitch parsimony on binary domain combination presence/absence randomization: "
1383                                       + randomization,
1384                               "fitch_on_binary_domain_combinations_" + outfile_name,
1385                               parameters_str );
1386             SurfacingUtil.writePhylogenyToFile( local_phylogeny_l, outfile_name
1387                     + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH );
1388             calculateIndependentDomainCombinationGains( local_phylogeny_l,
1389                                                         outfile_name
1390                                                                 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX,
1391                                                         outfile_name
1392                                                                 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX,
1393                                                         outfile_name
1394                                                                 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX,
1395                                                         outfile_name
1396                                                                 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX,
1397                                                         outfile_name + "_indep_dc_gains_fitch_lca_ranks.txt",
1398                                                         outfile_name + "_indep_dc_gains_fitch_lca_taxonomies.txt",
1399                                                         outfile_name + "_indep_dc_gains_fitch_protein_statistics.txt",
1400                                                         protein_length_stats_by_dc,
1401                                                         domain_number_stats_by_dc,
1402                                                         domain_length_stats_by_domain );
1403         }
1404     }
1405
1406     public static void executeParsimonyAnalysisForSecondaryFeatures( final String outfile_name,
1407                                                                      final DomainParsimonyCalculator secondary_features_parsimony,
1408                                                                      final Phylogeny phylogeny,
1409                                                                      final String parameters_str,
1410                                                                      final Map<Species, MappingResults> mapping_results_map,
1411                                                                      final boolean use_last_in_fitch_parsimony ) {
1412         final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR;
1413         final String date_time = ForesterUtil.getCurrentDateTime();
1414         System.out.println();
1415         writeToNexus( outfile_name + surfacing.NEXUS_SECONDARY_FEATURES,
1416                       secondary_features_parsimony.createMatrixOfSecondaryFeaturePresenceOrAbsence( null ),
1417                       phylogeny );
1418         Phylogeny local_phylogeny_copy = phylogeny.copy();
1419         secondary_features_parsimony.executeDolloParsimonyOnSecondaryFeatures( mapping_results_map );
1420         SurfacingUtil.writeMatrixToFile( secondary_features_parsimony.getGainLossMatrix(), outfile_name
1421                 + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER );
1422         SurfacingUtil.writeMatrixToFile( secondary_features_parsimony.getGainLossCountsMatrix(), outfile_name
1423                 + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER );
1424         SurfacingUtil
1425                 .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(),
1426                                                       CharacterStateMatrix.GainLossStates.GAIN,
1427                                                       outfile_name
1428                                                               + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_SECONDARY_FEATURES,
1429                                                       sep,
1430                                                       ForesterUtil.LINE_SEPARATOR,
1431                                                       null );
1432         SurfacingUtil
1433                 .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(),
1434                                                       CharacterStateMatrix.GainLossStates.LOSS,
1435                                                       outfile_name
1436                                                               + surfacing.PARSIMONY_OUTPUT_DOLLO_LOSSES_SECONDARY_FEATURES,
1437                                                       sep,
1438                                                       ForesterUtil.LINE_SEPARATOR,
1439                                                       null );
1440         SurfacingUtil
1441                 .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(),
1442                                                       null,
1443                                                       outfile_name
1444                                                               + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_SECONDARY_FEATURES,
1445                                                       sep,
1446                                                       ForesterUtil.LINE_SEPARATOR,
1447                                                       null );
1448         preparePhylogeny( local_phylogeny_copy,
1449                           secondary_features_parsimony,
1450                           date_time,
1451                           "Dollo parsimony on secondary feature presence/absence",
1452                           "dollo_on_secondary_features_" + outfile_name,
1453                           parameters_str );
1454         SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name
1455                 + surfacing.SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO );
1456         // FITCH DOMAIN COMBINATIONS
1457         // -------------------------
1458         local_phylogeny_copy = phylogeny.copy();
1459         final String randomization = "no";
1460         secondary_features_parsimony
1461                 .executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( use_last_in_fitch_parsimony );
1462         preparePhylogeny( local_phylogeny_copy,
1463                           secondary_features_parsimony,
1464                           date_time,
1465                           "Fitch parsimony on secondary binary domain combination presence/absence randomization: "
1466                                   + randomization,
1467                           "fitch_on_binary_domain_combinations_" + outfile_name,
1468                           parameters_str );
1469         SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name
1470                 + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH_MAPPED );
1471         calculateIndependentDomainCombinationGains( local_phylogeny_copy, outfile_name
1472                 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_MAPPED_OUTPUT_SUFFIX, outfile_name
1473                 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX, outfile_name
1474                 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX, outfile_name
1475                 + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX, outfile_name
1476                 + "_MAPPED_indep_dc_gains_fitch_lca_ranks.txt", outfile_name
1477                 + "_MAPPED_indep_dc_gains_fitch_lca_taxonomies.txt", null, null, null, null );
1478     }
1479
1480     public static void executePlusMinusAnalysis( final File output_file,
1481                                                  final List<String> plus_minus_analysis_high_copy_base,
1482                                                  final List<String> plus_minus_analysis_high_copy_target,
1483                                                  final List<String> plus_minus_analysis_low_copy,
1484                                                  final List<GenomeWideCombinableDomains> gwcd_list,
1485                                                  final SortedMap<Species, List<Protein>> protein_lists_per_species,
1486                                                  final Map<String, List<GoId>> domain_id_to_go_ids_map,
1487                                                  final Map<GoId, GoTerm> go_id_to_term_map,
1488                                                  final List<Object> plus_minus_analysis_numbers ) {
1489         final Set<String> all_spec = new HashSet<String>();
1490         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
1491             all_spec.add( gwcd.getSpecies().getSpeciesId() );
1492         }
1493         final File html_out_dom = new File( output_file + surfacing.PLUS_MINUS_DOM_SUFFIX_HTML );
1494         final File plain_out_dom = new File( output_file + surfacing.PLUS_MINUS_DOM_SUFFIX );
1495         final File html_out_dc = new File( output_file + surfacing.PLUS_MINUS_DC_SUFFIX_HTML );
1496         final File all_domains_go_ids_out_dom = new File( output_file + surfacing.PLUS_MINUS_ALL_GO_IDS_DOM_SUFFIX );
1497         final File passing_domains_go_ids_out_dom = new File( output_file
1498                 + surfacing.PLUS_MINUS_PASSING_GO_IDS_DOM_SUFFIX );
1499         final File proteins_file_base = new File( output_file + "" );
1500         final int min_diff = ( ( Integer ) plus_minus_analysis_numbers.get( 0 ) ).intValue();
1501         final double factor = ( ( Double ) plus_minus_analysis_numbers.get( 1 ) ).doubleValue();
1502         try {
1503             DomainCountsDifferenceUtil.calculateCopyNumberDifferences( gwcd_list,
1504                                                                        protein_lists_per_species,
1505                                                                        plus_minus_analysis_high_copy_base,
1506                                                                        plus_minus_analysis_high_copy_target,
1507                                                                        plus_minus_analysis_low_copy,
1508                                                                        min_diff,
1509                                                                        factor,
1510                                                                        plain_out_dom,
1511                                                                        html_out_dom,
1512                                                                        html_out_dc,
1513                                                                        domain_id_to_go_ids_map,
1514                                                                        go_id_to_term_map,
1515                                                                        all_domains_go_ids_out_dom,
1516                                                                        passing_domains_go_ids_out_dom,
1517                                                                        proteins_file_base );
1518         }
1519         catch ( final IOException e ) {
1520             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
1521         }
1522         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis results to \""
1523                 + html_out_dom + "\"" );
1524         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis results to \""
1525                 + plain_out_dom + "\"" );
1526         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis results to \"" + html_out_dc
1527                 + "\"" );
1528         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis based passing GO ids to \""
1529                 + passing_domains_go_ids_out_dom + "\"" );
1530         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis based all GO ids to \""
1531                 + all_domains_go_ids_out_dom + "\"" );
1532     }
1533
1534     public static void extractProteinNames( final List<Protein> proteins,
1535                                             final List<String> query_domain_ids_nc_order,
1536                                             final Writer out,
1537                                             final String separator,
1538                                             final String limit_to_species ) throws IOException {
1539         for( final Protein protein : proteins ) {
1540             if ( ForesterUtil.isEmpty( limit_to_species )
1541                     || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) {
1542                 if ( protein.contains( query_domain_ids_nc_order, true ) ) {
1543                     out.write( protein.getSpecies().getSpeciesId() );
1544                     out.write( separator );
1545                     out.write( protein.getProteinId().getId() );
1546                     out.write( separator );
1547                     out.write( "[" );
1548                     final Set<String> visited_domain_ids = new HashSet<String>();
1549                     boolean first = true;
1550                     for( final Domain domain : protein.getProteinDomains() ) {
1551                         if ( !visited_domain_ids.contains( domain.getDomainId() ) ) {
1552                             visited_domain_ids.add( domain.getDomainId() );
1553                             if ( first ) {
1554                                 first = false;
1555                             }
1556                             else {
1557                                 out.write( " " );
1558                             }
1559                             out.write( domain.getDomainId() );
1560                             out.write( " {" );
1561                             out.write( "" + domain.getTotalCount() );
1562                             out.write( "}" );
1563                         }
1564                     }
1565                     out.write( "]" );
1566                     out.write( separator );
1567                     if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription()
1568                             .equals( SurfacingConstants.NONE ) ) ) {
1569                         out.write( protein.getDescription() );
1570                     }
1571                     out.write( separator );
1572                     if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession()
1573                             .equals( SurfacingConstants.NONE ) ) ) {
1574                         out.write( protein.getAccession() );
1575                     }
1576                     out.write( SurfacingConstants.NL );
1577                 }
1578             }
1579         }
1580         out.flush();
1581     }
1582
1583     public static void extractProteinNames( final SortedMap<Species, List<Protein>> protein_lists_per_species,
1584                                             final String domain_id,
1585                                             final Writer out,
1586                                             final String separator,
1587                                             final String limit_to_species,
1588                                             final double domain_e_cutoff ) throws IOException {
1589         //System.out.println( "Per domain E-value: " + domain_e_cutoff );
1590         for( final Species species : protein_lists_per_species.keySet() ) {
1591             //System.out.println( species + ":" );
1592             for( final Protein protein : protein_lists_per_species.get( species ) ) {
1593                 if ( ForesterUtil.isEmpty( limit_to_species )
1594                         || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) {
1595                     final List<Domain> domains = protein.getProteinDomains( domain_id );
1596                     if ( domains.size() > 0 ) {
1597                         out.write( protein.getSpecies().getSpeciesId() );
1598                         out.write( separator );
1599                         out.write( protein.getProteinId().getId() );
1600                         out.write( separator );
1601                         out.write( domain_id.toString() );
1602                         out.write( separator );
1603                         int prev_to = -1;
1604                         for( final Domain domain : domains ) {
1605                             if ( ( domain_e_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= domain_e_cutoff ) ) {
1606                                 out.write( "/" );
1607                                 out.write( domain.getFrom() + "-" + domain.getTo() );
1608                                 if ( prev_to >= 0 ) {
1609                                     final int l = domain.getFrom() - prev_to;
1610                                     // System.out.println( l );
1611                                 }
1612                                 prev_to = domain.getTo();
1613                             }
1614                         }
1615                         out.write( "/" );
1616                         out.write( separator );
1617                         final List<Domain> domain_list = new ArrayList<Domain>();
1618                         for( final Domain domain : protein.getProteinDomains() ) {
1619                             if ( ( domain_e_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= domain_e_cutoff ) ) {
1620                                 domain_list.add( domain );
1621                             }
1622                         }
1623                         final Domain domain_ary[] = new Domain[ domain_list.size() ];
1624                         for( int i = 0; i < domain_list.size(); ++i ) {
1625                             domain_ary[ i ] = domain_list.get( i );
1626                         }
1627                         Arrays.sort( domain_ary, new DomainComparator( true ) );
1628                         out.write( "{" );
1629                         boolean first = true;
1630                         for( final Domain domain : domain_ary ) {
1631                             if ( first ) {
1632                                 first = false;
1633                             }
1634                             else {
1635                                 out.write( "," );
1636                             }
1637                             out.write( domain.getDomainId().toString() );
1638                             out.write( ":" + domain.getFrom() + "-" + domain.getTo() );
1639                             out.write( ":" + domain.getPerDomainEvalue() );
1640                         }
1641                         out.write( "}" );
1642                         if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription()
1643                                 .equals( SurfacingConstants.NONE ) ) ) {
1644                             out.write( protein.getDescription() );
1645                         }
1646                         out.write( separator );
1647                         if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession()
1648                                 .equals( SurfacingConstants.NONE ) ) ) {
1649                             out.write( protein.getAccession() );
1650                         }
1651                         out.write( SurfacingConstants.NL );
1652                     }
1653                 }
1654             }
1655         }
1656         out.flush();
1657     }
1658
1659     public static SortedSet<String> getAllDomainIds( final List<GenomeWideCombinableDomains> gwcd_list ) {
1660         final SortedSet<String> all_domains_ids = new TreeSet<String>();
1661         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
1662             final Set<String> all_domains = gwcd.getAllDomainIds();
1663             //    for( final Domain domain : all_domains ) {
1664             all_domains_ids.addAll( all_domains );
1665             //    }
1666         }
1667         return all_domains_ids;
1668     }
1669
1670     public static SortedMap<String, Integer> getDomainCounts( final List<Protein> protein_domain_collections ) {
1671         final SortedMap<String, Integer> map = new TreeMap<String, Integer>();
1672         for( final Protein protein_domain_collection : protein_domain_collections ) {
1673             for( final Object name : protein_domain_collection.getProteinDomains() ) {
1674                 final BasicDomain protein_domain = ( BasicDomain ) name;
1675                 final String id = protein_domain.getDomainId();
1676                 if ( map.containsKey( id ) ) {
1677                     map.put( id, map.get( id ) + 1 );
1678                 }
1679                 else {
1680                     map.put( id, 1 );
1681                 }
1682             }
1683         }
1684         return map;
1685     }
1686
1687     public static int getNumberOfNodesLackingName( final Phylogeny p, final StringBuilder names ) {
1688         final PhylogenyNodeIterator it = p.iteratorPostorder();
1689         int c = 0;
1690         while ( it.hasNext() ) {
1691             final PhylogenyNode n = it.next();
1692             if ( ForesterUtil.isEmpty( n.getName() )
1693                     && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy()
1694                             .getScientificName() ) )
1695                     && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy()
1696                             .getCommonName() ) ) ) {
1697                 if ( n.getParent() != null ) {
1698                     names.append( " " );
1699                     names.append( n.getParent().getName() );
1700                 }
1701                 final List l = n.getAllExternalDescendants();
1702                 for( final Object object : l ) {
1703                     System.out.println( l.toString() );
1704                 }
1705                 ++c;
1706             }
1707         }
1708         return c;
1709     }
1710
1711     public static void log( final String msg, final Writer w ) {
1712         try {
1713             w.write( msg );
1714             w.write( ForesterUtil.LINE_SEPARATOR );
1715         }
1716         catch ( final IOException e ) {
1717             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
1718         }
1719     }
1720
1721     public static String obtainHexColorStringDependingOnTaxonomyGroup( final String tax_code, final Phylogeny phy )
1722             throws IllegalArgumentException {
1723         if ( !_TAXCODE_HEXCOLORSTRING_MAP.containsKey( tax_code ) ) {
1724             if ( phy != null && !phy.isEmpty() ) {
1725                 List<PhylogenyNode> nodes = phy.getNodesViaTaxonomyCode( tax_code );
1726                 Color c = null;
1727                 if ( nodes == null || nodes.isEmpty() ) {
1728                     throw new IllegalArgumentException( "code " + tax_code + " is not found" );
1729                 }
1730                 if ( nodes.size() != 1 ) {
1731                     throw new IllegalArgumentException( "code " + tax_code + " is not unique" );
1732                 }
1733                 PhylogenyNode n = nodes.get( 0 );
1734                 while ( n != null ) {
1735                     if ( n.getNodeData().isHasTaxonomy()
1736                             && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getScientificName() ) ) {
1737                         c = ForesterUtil.obtainColorDependingOnTaxonomyGroup( n.getNodeData().getTaxonomy()
1738                                 .getScientificName(), tax_code );
1739                     }
1740                     if ( c == null && !ForesterUtil.isEmpty( n.getName() ) ) {
1741                         c = ForesterUtil.obtainColorDependingOnTaxonomyGroup( n.getName(), tax_code );
1742                     }
1743                     if ( c != null ) {
1744                         break;
1745                     }
1746                     n = n.getParent();
1747                 }
1748                 if ( c == null ) {
1749                     throw new IllegalArgumentException( "no color found for taxonomy code \"" + tax_code + "\"" );
1750                 }
1751                 final String hex = String.format( "#%02x%02x%02x", c.getRed(), c.getGreen(), c.getBlue() );
1752                 _TAXCODE_HEXCOLORSTRING_MAP.put( tax_code, hex );
1753             }
1754             else {
1755                 throw new IllegalArgumentException( "unable to obtain color for code " + tax_code
1756                         + " (tree is null or empty and code is not in map)" );
1757             }
1758         }
1759         return _TAXCODE_HEXCOLORSTRING_MAP.get( tax_code );
1760     }
1761
1762     public static Phylogeny[] obtainAndPreProcessIntrees( final File[] intree_files,
1763                                                           final int number_of_genomes,
1764                                                           final String[][] input_file_properties ) {
1765         final Phylogeny[] intrees = new Phylogeny[ intree_files.length ];
1766         int i = 0;
1767         for( final File intree_file : intree_files ) {
1768             Phylogeny intree = null;
1769             final String error = ForesterUtil.isReadableFile( intree_file );
1770             if ( !ForesterUtil.isEmpty( error ) ) {
1771                 ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read input tree file [" + intree_file + "]: "
1772                         + error );
1773             }
1774             try {
1775                 final Phylogeny[] p_array = ParserBasedPhylogenyFactory.getInstance()
1776                         .create( intree_file, ParserUtils.createParserDependingOnFileType( intree_file, true ) );
1777                 if ( p_array.length < 1 ) {
1778                     ForesterUtil.fatalError( surfacing.PRG_NAME, "file [" + intree_file
1779                             + "] does not contain any phylogeny in phyloXML format" );
1780                 }
1781                 else if ( p_array.length > 1 ) {
1782                     ForesterUtil.fatalError( surfacing.PRG_NAME, "file [" + intree_file
1783                             + "] contains more than one phylogeny in phyloXML format" );
1784                 }
1785                 intree = p_array[ 0 ];
1786             }
1787             catch ( final Exception e ) {
1788                 ForesterUtil.fatalError( surfacing.PRG_NAME, "failed to read input tree from file [" + intree_file
1789                         + "]: " + error );
1790             }
1791             if ( ( intree == null ) || intree.isEmpty() ) {
1792                 ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] is empty" );
1793             }
1794             if ( !intree.isRooted() ) {
1795                 ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] is not rooted" );
1796             }
1797             if ( intree.getNumberOfExternalNodes() < number_of_genomes ) {
1798                 ForesterUtil.fatalError( surfacing.PRG_NAME,
1799                                          "number of external nodes [" + intree.getNumberOfExternalNodes()
1800                                                  + "] of input tree [" + intree_file
1801                                                  + "] is smaller than the number of genomes the be analyzed ["
1802                                                  + number_of_genomes + "]" );
1803             }
1804             final StringBuilder parent_names = new StringBuilder();
1805             final int nodes_lacking_name = getNumberOfNodesLackingName( intree, parent_names );
1806             if ( nodes_lacking_name > 0 ) {
1807                 ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] has "
1808                         + nodes_lacking_name + " node(s) lacking a name [parent names:" + parent_names + "]" );
1809             }
1810             preparePhylogenyForParsimonyAnalyses( intree, input_file_properties );
1811             if ( !intree.isCompletelyBinary() ) {
1812                 ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "input tree [" + intree_file
1813                         + "] is not completely binary" );
1814             }
1815             intrees[ i++ ] = intree;
1816         }
1817         return intrees;
1818     }
1819
1820     public static Phylogeny obtainFirstIntree( final File intree_file ) {
1821         Phylogeny intree = null;
1822         final String error = ForesterUtil.isReadableFile( intree_file );
1823         if ( !ForesterUtil.isEmpty( error ) ) {
1824             ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read input tree file [" + intree_file + "]: " + error );
1825         }
1826         try {
1827             final Phylogeny[] phys = ParserBasedPhylogenyFactory.getInstance()
1828                     .create( intree_file, ParserUtils.createParserDependingOnFileType( intree_file, true ) );
1829             if ( phys.length < 1 ) {
1830                 ForesterUtil.fatalError( surfacing.PRG_NAME, "file [" + intree_file
1831                         + "] does not contain any phylogeny in phyloXML format" );
1832             }
1833             else if ( phys.length > 1 ) {
1834                 ForesterUtil.fatalError( surfacing.PRG_NAME, "file [" + intree_file
1835                         + "] contains more than one phylogeny in phyloXML format" );
1836             }
1837             intree = phys[ 0 ];
1838         }
1839         catch ( final Exception e ) {
1840             ForesterUtil.fatalError( surfacing.PRG_NAME, "failed to read input tree from file [" + intree_file + "]: "
1841                     + error );
1842         }
1843         if ( ( intree == null ) || intree.isEmpty() ) {
1844             ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] is empty" );
1845         }
1846         if ( !intree.isRooted() ) {
1847             ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] is not rooted" );
1848         }
1849         return intree;
1850     }
1851
1852     public static void performDomainArchitectureAnalysis( final SortedMap<String, Set<String>> domain_architecutures,
1853                                                           final SortedMap<String, Integer> domain_architecuture_counts,
1854                                                           final int min_count,
1855                                                           final File da_counts_outfile,
1856                                                           final File unique_da_outfile ) {
1857         checkForOutputFileWriteability( da_counts_outfile );
1858         checkForOutputFileWriteability( unique_da_outfile );
1859         try {
1860             final BufferedWriter da_counts_out = new BufferedWriter( new FileWriter( da_counts_outfile ) );
1861             final BufferedWriter unique_da_out = new BufferedWriter( new FileWriter( unique_da_outfile ) );
1862             final Iterator<Entry<String, Integer>> it = domain_architecuture_counts.entrySet().iterator();
1863             while ( it.hasNext() ) {
1864                 final Map.Entry<String, Integer> e = it.next();
1865                 final String da = e.getKey();
1866                 final int count = e.getValue();
1867                 if ( count >= min_count ) {
1868                     da_counts_out.write( da );
1869                     da_counts_out.write( "\t" );
1870                     da_counts_out.write( String.valueOf( count ) );
1871                     da_counts_out.write( ForesterUtil.LINE_SEPARATOR );
1872                 }
1873                 if ( count == 1 ) {
1874                     final Iterator<Entry<String, Set<String>>> it2 = domain_architecutures.entrySet().iterator();
1875                     while ( it2.hasNext() ) {
1876                         final Map.Entry<String, Set<String>> e2 = it2.next();
1877                         final String genome = e2.getKey();
1878                         final Set<String> das = e2.getValue();
1879                         if ( das.contains( da ) ) {
1880                             unique_da_out.write( genome );
1881                             unique_da_out.write( "\t" );
1882                             unique_da_out.write( da );
1883                             unique_da_out.write( ForesterUtil.LINE_SEPARATOR );
1884                         }
1885                     }
1886                 }
1887             }
1888             unique_da_out.close();
1889             da_counts_out.close();
1890         }
1891         catch ( final IOException e ) {
1892             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
1893         }
1894         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + da_counts_outfile + "\"" );
1895         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + unique_da_outfile + "\"" );
1896         //
1897     }
1898
1899     public static void preparePhylogeny( final Phylogeny p,
1900                                          final DomainParsimonyCalculator domain_parsimony,
1901                                          final String date_time,
1902                                          final String method,
1903                                          final String name,
1904                                          final String parameters_str ) {
1905         domain_parsimony.decoratePhylogenyWithDomains( p );
1906         final StringBuilder desc = new StringBuilder();
1907         desc.append( "[Method: " + method + "] [Date: " + date_time + "] " );
1908         desc.append( "[Cost: " + domain_parsimony.getCost() + "] " );
1909         desc.append( "[Gains: " + domain_parsimony.getTotalGains() + "] " );
1910         desc.append( "[Losses: " + domain_parsimony.getTotalLosses() + "] " );
1911         desc.append( "[Unchanged: " + domain_parsimony.getTotalUnchanged() + "] " );
1912         desc.append( "[Parameters: " + parameters_str + "]" );
1913         p.setName( name );
1914         p.setDescription( desc.toString() );
1915         p.setConfidence( new Confidence( domain_parsimony.getCost(), "parsimony" ) );
1916         p.setRerootable( false );
1917         p.setRooted( true );
1918     }
1919
1920     public static void preparePhylogenyForParsimonyAnalyses( final Phylogeny intree,
1921                                                              final String[][] input_file_properties ) {
1922         final String[] genomes = new String[ input_file_properties.length ];
1923         for( int i = 0; i < input_file_properties.length; ++i ) {
1924             if ( intree.getNodes( input_file_properties[ i ][ 1 ] ).size() > 1 ) {
1925                 ForesterUtil.fatalError( surfacing.PRG_NAME, "node named [" + input_file_properties[ i ][ 1 ]
1926                         + "] is not unique in input tree " + intree.getName() );
1927             }
1928             genomes[ i ] = input_file_properties[ i ][ 1 ];
1929         }
1930         //
1931         final PhylogenyNodeIterator it = intree.iteratorPostorder();
1932         while ( it.hasNext() ) {
1933             final PhylogenyNode n = it.next();
1934             if ( ForesterUtil.isEmpty( n.getName() ) ) {
1935                 if ( n.getNodeData().isHasTaxonomy()
1936                         && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getTaxonomyCode() ) ) {
1937                     n.setName( n.getNodeData().getTaxonomy().getTaxonomyCode() );
1938                 }
1939                 else if ( n.getNodeData().isHasTaxonomy()
1940                         && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getScientificName() ) ) {
1941                     n.setName( n.getNodeData().getTaxonomy().getScientificName() );
1942                 }
1943                 else if ( n.getNodeData().isHasTaxonomy()
1944                         && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getCommonName() ) ) {
1945                     n.setName( n.getNodeData().getTaxonomy().getCommonName() );
1946                 }
1947                 else {
1948                     ForesterUtil
1949                             .fatalError( surfacing.PRG_NAME,
1950                                          "node with no name, scientific name, common name, or taxonomy code present" );
1951                 }
1952             }
1953         }
1954         //
1955         final List<String> igns = PhylogenyMethods.deleteExternalNodesPositiveSelection( genomes, intree );
1956         if ( igns.size() > 0 ) {
1957             System.out.println( "Not using the following " + igns.size() + " nodes:" );
1958             for( int i = 0; i < igns.size(); ++i ) {
1959                 System.out.println( " " + i + ": " + igns.get( i ) );
1960             }
1961             System.out.println( "--" );
1962         }
1963         for( final String[] input_file_propertie : input_file_properties ) {
1964             try {
1965                 intree.getNode( input_file_propertie[ 1 ] );
1966             }
1967             catch ( final IllegalArgumentException e ) {
1968                 ForesterUtil.fatalError( surfacing.PRG_NAME, "node named [" + input_file_propertie[ 1 ]
1969                         + "] not present/not unique in input tree" );
1970             }
1971         }
1972     }
1973
1974     public static void printOutPercentageOfMultidomainProteins( final SortedMap<Integer, Integer> all_genomes_domains_per_potein_histo,
1975                                                                 final Writer log_writer ) {
1976         int sum = 0;
1977         for( final Entry<Integer, Integer> entry : all_genomes_domains_per_potein_histo.entrySet() ) {
1978             sum += entry.getValue();
1979         }
1980         final double percentage = ( 100.0 * ( sum - all_genomes_domains_per_potein_histo.get( 1 ) ) ) / sum;
1981         ForesterUtil.programMessage( surfacing.PRG_NAME, "Percentage of multidomain proteins: " + percentage + "%" );
1982         log( "Percentage of multidomain proteins:            : " + percentage + "%", log_writer );
1983     }
1984
1985     private static void printSomeStats( final DescriptiveStatistics stats, final AsciiHistogram histo, final Writer w )
1986             throws IOException {
1987         w.write( "<hr>" );
1988         w.write( "<br>" );
1989         w.write( SurfacingConstants.NL );
1990         w.write( "<tt><pre>" );
1991         w.write( SurfacingConstants.NL );
1992         if ( histo != null ) {
1993             w.write( histo.toStringBuffer( 20, '|', 40, 5 ).toString() );
1994             w.write( SurfacingConstants.NL );
1995         }
1996         w.write( "</pre></tt>" );
1997         w.write( SurfacingConstants.NL );
1998         w.write( "<table>" );
1999         w.write( SurfacingConstants.NL );
2000         w.write( "<tr><td>N: </td><td>" + stats.getN() + "</td></tr>" );
2001         w.write( SurfacingConstants.NL );
2002         w.write( "<tr><td>Min: </td><td>" + stats.getMin() + "</td></tr>" );
2003         w.write( SurfacingConstants.NL );
2004         w.write( "<tr><td>Max: </td><td>" + stats.getMax() + "</td></tr>" );
2005         w.write( SurfacingConstants.NL );
2006         w.write( "<tr><td>Mean: </td><td>" + stats.arithmeticMean() + "</td></tr>" );
2007         w.write( SurfacingConstants.NL );
2008         if ( stats.getN() > 1 ) {
2009             w.write( "<tr><td>SD: </td><td>" + stats.sampleStandardDeviation() + "</td></tr>" );
2010         }
2011         else {
2012             w.write( "<tr><td>SD: </td><td>n/a</td></tr>" );
2013         }
2014         w.write( SurfacingConstants.NL );
2015         w.write( "</table>" );
2016         w.write( SurfacingConstants.NL );
2017         w.write( "<br>" );
2018         w.write( SurfacingConstants.NL );
2019     }
2020
2021     public static void processFilter( final File filter_file, final SortedSet<String> filter ) {
2022         SortedSet<String> filter_str = null;
2023         try {
2024             filter_str = ForesterUtil.file2set( filter_file );
2025         }
2026         catch ( final IOException e ) {
2027             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
2028         }
2029         if ( filter_str != null ) {
2030             for( final String string : filter_str ) {
2031                 filter.add( string );
2032             }
2033         }
2034         if ( surfacing.VERBOSE ) {
2035             System.out.println( "Filter:" );
2036             for( final String domainId : filter ) {
2037                 System.out.println( domainId );
2038             }
2039         }
2040     }
2041
2042     public static String[][] processInputGenomesFile( final File input_genomes ) {
2043         String[][] input_file_properties = null;
2044         try {
2045             input_file_properties = ForesterUtil.file22dArray( input_genomes );
2046         }
2047         catch ( final IOException e ) {
2048             ForesterUtil.fatalError( surfacing.PRG_NAME,
2049                                      "genomes files is to be in the following format \"<hmmpfam output file> <species>\": "
2050                                              + e.getLocalizedMessage() );
2051         }
2052         final Set<String> specs = new HashSet<String>();
2053         final Set<String> paths = new HashSet<String>();
2054         for( int i = 0; i < input_file_properties.length; ++i ) {
2055             if ( !PhyloXmlUtil.TAXOMONY_CODE_PATTERN.matcher( input_file_properties[ i ][ 1 ] ).matches() ) {
2056                 ForesterUtil.fatalError( surfacing.PRG_NAME, "illegal format for species code: "
2057                         + input_file_properties[ i ][ 1 ] );
2058             }
2059             if ( specs.contains( input_file_properties[ i ][ 1 ] ) ) {
2060                 ForesterUtil.fatalError( surfacing.PRG_NAME, "species code " + input_file_properties[ i ][ 1 ]
2061                         + " is not unique" );
2062             }
2063             specs.add( input_file_properties[ i ][ 1 ] );
2064             if ( paths.contains( input_file_properties[ i ][ 0 ] ) ) {
2065                 ForesterUtil.fatalError( surfacing.PRG_NAME, "path " + input_file_properties[ i ][ 0 ]
2066                         + " is not unique" );
2067             }
2068             paths.add( input_file_properties[ i ][ 0 ] );
2069             final String error = ForesterUtil.isReadableFile( new File( input_file_properties[ i ][ 0 ] ) );
2070             if ( !ForesterUtil.isEmpty( error ) ) {
2071                 ForesterUtil.fatalError( surfacing.PRG_NAME, error );
2072             }
2073         }
2074         return input_file_properties;
2075     }
2076
2077     public static void processPlusMinusAnalysisOption( final CommandLineArguments cla,
2078                                                        final List<String> high_copy_base,
2079                                                        final List<String> high_copy_target,
2080                                                        final List<String> low_copy,
2081                                                        final List<Object> numbers ) {
2082         if ( cla.isOptionSet( surfacing.PLUS_MINUS_ANALYSIS_OPTION ) ) {
2083             if ( !cla.isOptionValueSet( surfacing.PLUS_MINUS_ANALYSIS_OPTION ) ) {
2084                 ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for 'plus-minus' file: -"
2085                         + surfacing.PLUS_MINUS_ANALYSIS_OPTION + "=<file>" );
2086             }
2087             final File plus_minus_file = new File( cla.getOptionValue( surfacing.PLUS_MINUS_ANALYSIS_OPTION ) );
2088             final String msg = ForesterUtil.isReadableFile( plus_minus_file );
2089             if ( !ForesterUtil.isEmpty( msg ) ) {
2090                 ForesterUtil.fatalError( surfacing.PRG_NAME, "can not read from \"" + plus_minus_file + "\": " + msg );
2091             }
2092             processPlusMinusFile( plus_minus_file, high_copy_base, high_copy_target, low_copy, numbers );
2093         }
2094     }
2095
2096     // First numbers is minimal difference, second is factor.
2097     public static void processPlusMinusFile( final File plus_minus_file,
2098                                              final List<String> high_copy_base,
2099                                              final List<String> high_copy_target,
2100                                              final List<String> low_copy,
2101                                              final List<Object> numbers ) {
2102         Set<String> species_set = null;
2103         int min_diff = surfacing.PLUS_MINUS_ANALYSIS_MIN_DIFF_DEFAULT;
2104         double factor = surfacing.PLUS_MINUS_ANALYSIS_FACTOR_DEFAULT;
2105         try {
2106             species_set = ForesterUtil.file2set( plus_minus_file );
2107         }
2108         catch ( final IOException e ) {
2109             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
2110         }
2111         if ( species_set != null ) {
2112             for( final String species : species_set ) {
2113                 final String species_trimmed = species.substring( 1 );
2114                 if ( species.startsWith( "+" ) ) {
2115                     if ( low_copy.contains( species_trimmed ) ) {
2116                         ForesterUtil.fatalError( surfacing.PRG_NAME,
2117                                                  "species/genome names can not appear with both '+' and '-' suffix, as appears the case for: \""
2118                                                          + species_trimmed + "\"" );
2119                     }
2120                     high_copy_base.add( species_trimmed );
2121                 }
2122                 else if ( species.startsWith( "*" ) ) {
2123                     if ( low_copy.contains( species_trimmed ) ) {
2124                         ForesterUtil.fatalError( surfacing.PRG_NAME,
2125                                                  "species/genome names can not appear with both '*' and '-' suffix, as appears the case for: \""
2126                                                          + species_trimmed + "\"" );
2127                     }
2128                     high_copy_target.add( species_trimmed );
2129                 }
2130                 else if ( species.startsWith( "-" ) ) {
2131                     if ( high_copy_base.contains( species_trimmed ) || high_copy_target.contains( species_trimmed ) ) {
2132                         ForesterUtil.fatalError( surfacing.PRG_NAME,
2133                                                  "species/genome names can not appear with both '+' or '*' and '-' suffix, as appears the case for: \""
2134                                                          + species_trimmed + "\"" );
2135                     }
2136                     low_copy.add( species_trimmed );
2137                 }
2138                 else if ( species.startsWith( "$D" ) ) {
2139                     try {
2140                         min_diff = Integer.parseInt( species.substring( 3 ) );
2141                     }
2142                     catch ( final NumberFormatException e ) {
2143                         ForesterUtil.fatalError( surfacing.PRG_NAME,
2144                                                  "could not parse integer value for minimal difference from: \""
2145                                                          + species.substring( 3 ) + "\"" );
2146                     }
2147                 }
2148                 else if ( species.startsWith( "$F" ) ) {
2149                     try {
2150                         factor = Double.parseDouble( species.substring( 3 ) );
2151                     }
2152                     catch ( final NumberFormatException e ) {
2153                         ForesterUtil.fatalError( surfacing.PRG_NAME, "could not parse double value for factor from: \""
2154                                 + species.substring( 3 ) + "\"" );
2155                     }
2156                 }
2157                 else if ( species.startsWith( "#" ) ) {
2158                     // Comment, ignore.
2159                 }
2160                 else {
2161                     ForesterUtil
2162                             .fatalError( surfacing.PRG_NAME,
2163                                          "species/genome names in 'plus minus' file must begin with '*' (high copy target genome), '+' (high copy base genomes), '-' (low copy genomes), '$D=<integer>' minimal Difference (default is 1), '$F=<double>' factor (default is 1.0), double), or '#' (ignore) suffix, encountered: \""
2164                                                  + species + "\"" );
2165                 }
2166                 numbers.add( new Integer( min_diff + "" ) );
2167                 numbers.add( new Double( factor + "" ) );
2168             }
2169         }
2170         else {
2171             ForesterUtil.fatalError( surfacing.PRG_NAME, "'plus minus' file [" + plus_minus_file + "] appears empty" );
2172         }
2173     }
2174
2175     /*
2176      * species | protein id | n-terminal domain | c-terminal domain | n-terminal domain per domain E-value | c-terminal domain per domain E-value
2177      * 
2178      * 
2179      */
2180     static public StringBuffer proteinToDomainCombinations( final Protein protein,
2181                                                             final String protein_id,
2182                                                             final String separator ) {
2183         final StringBuffer sb = new StringBuffer();
2184         if ( protein.getSpecies() == null ) {
2185             throw new IllegalArgumentException( "species must not be null" );
2186         }
2187         if ( ForesterUtil.isEmpty( protein.getSpecies().getSpeciesId() ) ) {
2188             throw new IllegalArgumentException( "species id must not be empty" );
2189         }
2190         final List<Domain> domains = protein.getProteinDomains();
2191         if ( domains.size() > 1 ) {
2192             final Map<String, Integer> counts = new HashMap<String, Integer>();
2193             for( final Domain domain : domains ) {
2194                 final String id = domain.getDomainId();
2195                 if ( counts.containsKey( id ) ) {
2196                     counts.put( id, counts.get( id ) + 1 );
2197                 }
2198                 else {
2199                     counts.put( id, 1 );
2200                 }
2201             }
2202             final Set<String> dcs = new HashSet<String>();
2203             for( int i = 1; i < domains.size(); ++i ) {
2204                 for( int j = 0; j < i; ++j ) {
2205                     Domain domain_n = domains.get( i );
2206                     Domain domain_c = domains.get( j );
2207                     if ( domain_n.getFrom() > domain_c.getFrom() ) {
2208                         domain_n = domains.get( j );
2209                         domain_c = domains.get( i );
2210                     }
2211                     final String dc = domain_n.getDomainId() + domain_c.getDomainId();
2212                     if ( !dcs.contains( dc ) ) {
2213                         dcs.add( dc );
2214                         sb.append( protein.getSpecies() );
2215                         sb.append( separator );
2216                         sb.append( protein_id );
2217                         sb.append( separator );
2218                         sb.append( domain_n.getDomainId() );
2219                         sb.append( separator );
2220                         sb.append( domain_c.getDomainId() );
2221                         sb.append( separator );
2222                         sb.append( domain_n.getPerDomainEvalue() );
2223                         sb.append( separator );
2224                         sb.append( domain_c.getPerDomainEvalue() );
2225                         sb.append( separator );
2226                         sb.append( counts.get( domain_n.getDomainId() ) );
2227                         sb.append( separator );
2228                         sb.append( counts.get( domain_c.getDomainId() ) );
2229                         sb.append( ForesterUtil.LINE_SEPARATOR );
2230                     }
2231                 }
2232             }
2233         }
2234         else if ( domains.size() == 1 ) {
2235             sb.append( protein.getSpecies() );
2236             sb.append( separator );
2237             sb.append( protein_id );
2238             sb.append( separator );
2239             sb.append( domains.get( 0 ).getDomainId() );
2240             sb.append( separator );
2241             sb.append( separator );
2242             sb.append( domains.get( 0 ).getPerDomainEvalue() );
2243             sb.append( separator );
2244             sb.append( separator );
2245             sb.append( 1 );
2246             sb.append( separator );
2247             sb.append( ForesterUtil.LINE_SEPARATOR );
2248         }
2249         else {
2250             sb.append( protein.getSpecies() );
2251             sb.append( separator );
2252             sb.append( protein_id );
2253             sb.append( separator );
2254             sb.append( separator );
2255             sb.append( separator );
2256             sb.append( separator );
2257             sb.append( separator );
2258             sb.append( separator );
2259             sb.append( ForesterUtil.LINE_SEPARATOR );
2260         }
2261         return sb;
2262     }
2263
2264     public static List<Domain> sortDomainsWithAscendingConfidenceValues( final Protein protein ) {
2265         final List<Domain> domains = new ArrayList<Domain>();
2266         for( final Domain d : protein.getProteinDomains() ) {
2267             domains.add( d );
2268         }
2269         Collections.sort( domains, SurfacingUtil.ASCENDING_CONFIDENCE_VALUE_ORDER );
2270         return domains;
2271     }
2272
2273     private static List<String> splitDomainCombination( final String dc ) {
2274         final String[] s = dc.split( "=" );
2275         if ( s.length != 2 ) {
2276             ForesterUtil.printErrorMessage( surfacing.PRG_NAME, "Stringyfied domain combination has illegal format: "
2277                     + dc );
2278             System.exit( -1 );
2279         }
2280         final List<String> l = new ArrayList<String>( 2 );
2281         l.add( s[ 0 ] );
2282         l.add( s[ 1 ] );
2283         return l;
2284     }
2285
2286     public static int storeDomainArchitectures( final String genome,
2287                                                 final SortedMap<String, Set<String>> domain_architecutures,
2288                                                 final List<Protein> protein_list,
2289                                                 final Map<String, Integer> distinct_domain_architecuture_counts ) {
2290         final Set<String> da = new HashSet<String>();
2291         domain_architecutures.put( genome, da );
2292         for( final Protein protein : protein_list ) {
2293             final String da_str = ( ( BasicProtein ) protein ).toDomainArchitectureString( "~", 3, "=" );
2294             if ( !da.contains( da_str ) ) {
2295                 if ( !distinct_domain_architecuture_counts.containsKey( da_str ) ) {
2296                     distinct_domain_architecuture_counts.put( da_str, 1 );
2297                 }
2298                 else {
2299                     distinct_domain_architecuture_counts.put( da_str,
2300                                                               distinct_domain_architecuture_counts.get( da_str ) + 1 );
2301                 }
2302                 da.add( da_str );
2303             }
2304         }
2305         return da.size();
2306     }
2307
2308     public static void writeAllDomainsChangedOnAllSubtrees( final Phylogeny p,
2309                                                             final boolean get_gains,
2310                                                             final String outdir,
2311                                                             final String suffix_for_filename ) throws IOException {
2312         CharacterStateMatrix.GainLossStates state = CharacterStateMatrix.GainLossStates.GAIN;
2313         if ( !get_gains ) {
2314             state = CharacterStateMatrix.GainLossStates.LOSS;
2315         }
2316         final File base_dir = createBaseDirForPerNodeDomainFiles( surfacing.BASE_DIRECTORY_PER_SUBTREE_DOMAIN_GAIN_LOSS_FILES,
2317                                                                   false,
2318                                                                   state,
2319                                                                   outdir );
2320         for( final PhylogenyNodeIterator it = p.iteratorPostorder(); it.hasNext(); ) {
2321             final PhylogenyNode node = it.next();
2322             if ( !node.isExternal() ) {
2323                 final SortedSet<String> domains = collectAllDomainsChangedOnSubtree( node, get_gains );
2324                 if ( domains.size() > 0 ) {
2325                     final Writer writer = ForesterUtil.createBufferedWriter( base_dir + ForesterUtil.FILE_SEPARATOR
2326                             + node.getName() + suffix_for_filename );
2327                     for( final String domain : domains ) {
2328                         writer.write( domain );
2329                         writer.write( ForesterUtil.LINE_SEPARATOR );
2330                     }
2331                     writer.close();
2332                 }
2333             }
2334         }
2335     }
2336
2337     private static void writeAllEncounteredPfamsToFile( final Map<String, List<GoId>> domain_id_to_go_ids_map,
2338                                                         final Map<GoId, GoTerm> go_id_to_term_map,
2339                                                         final String outfile_name,
2340                                                         final SortedSet<String> all_pfams_encountered ) {
2341         final File all_pfams_encountered_file = new File( outfile_name + surfacing.ALL_PFAMS_ENCOUNTERED_SUFFIX );
2342         final File all_pfams_encountered_with_go_annotation_file = new File( outfile_name
2343                 + surfacing.ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX );
2344         final File encountered_pfams_summary_file = new File( outfile_name + surfacing.ENCOUNTERED_PFAMS_SUMMARY_SUFFIX );
2345         int biological_process_counter = 0;
2346         int cellular_component_counter = 0;
2347         int molecular_function_counter = 0;
2348         int pfams_with_mappings_counter = 0;
2349         int pfams_without_mappings_counter = 0;
2350         int pfams_without_mappings_to_bp_or_mf_counter = 0;
2351         int pfams_with_mappings_to_bp_or_mf_counter = 0;
2352         try {
2353             final Writer all_pfams_encountered_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_file ) );
2354             final Writer all_pfams_encountered_with_go_annotation_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_with_go_annotation_file ) );
2355             final Writer summary_writer = new BufferedWriter( new FileWriter( encountered_pfams_summary_file ) );
2356             summary_writer.write( "# Pfam to GO mapping summary" );
2357             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2358             summary_writer.write( "# Actual summary is at the end of this file." );
2359             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2360             summary_writer.write( "# Encountered Pfams without a GO mapping:" );
2361             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2362             for( final String pfam : all_pfams_encountered ) {
2363                 all_pfams_encountered_writer.write( pfam );
2364                 all_pfams_encountered_writer.write( ForesterUtil.LINE_SEPARATOR );
2365                 final String domain_id = new String( pfam );
2366                 if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) {
2367                     ++pfams_with_mappings_counter;
2368                     all_pfams_encountered_with_go_annotation_writer.write( pfam );
2369                     all_pfams_encountered_with_go_annotation_writer.write( ForesterUtil.LINE_SEPARATOR );
2370                     final List<GoId> go_ids = domain_id_to_go_ids_map.get( domain_id );
2371                     boolean maps_to_bp = false;
2372                     boolean maps_to_cc = false;
2373                     boolean maps_to_mf = false;
2374                     for( final GoId go_id : go_ids ) {
2375                         final GoTerm go_term = go_id_to_term_map.get( go_id );
2376                         if ( go_term.getGoNameSpace().isBiologicalProcess() ) {
2377                             maps_to_bp = true;
2378                         }
2379                         else if ( go_term.getGoNameSpace().isCellularComponent() ) {
2380                             maps_to_cc = true;
2381                         }
2382                         else if ( go_term.getGoNameSpace().isMolecularFunction() ) {
2383                             maps_to_mf = true;
2384                         }
2385                     }
2386                     if ( maps_to_bp ) {
2387                         ++biological_process_counter;
2388                     }
2389                     if ( maps_to_cc ) {
2390                         ++cellular_component_counter;
2391                     }
2392                     if ( maps_to_mf ) {
2393                         ++molecular_function_counter;
2394                     }
2395                     if ( maps_to_bp || maps_to_mf ) {
2396                         ++pfams_with_mappings_to_bp_or_mf_counter;
2397                     }
2398                     else {
2399                         ++pfams_without_mappings_to_bp_or_mf_counter;
2400                     }
2401                 }
2402                 else {
2403                     ++pfams_without_mappings_to_bp_or_mf_counter;
2404                     ++pfams_without_mappings_counter;
2405                     summary_writer.write( pfam );
2406                     summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2407                 }
2408             }
2409             all_pfams_encountered_writer.close();
2410             all_pfams_encountered_with_go_annotation_writer.close();
2411             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + all_pfams_encountered.size()
2412                     + "] encountered Pfams to: \"" + all_pfams_encountered_file + "\"" );
2413             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + pfams_with_mappings_counter
2414                     + "] encountered Pfams with GO mappings to: \"" + all_pfams_encountered_with_go_annotation_file
2415                     + "\"" );
2416             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote summary (including all ["
2417                     + pfams_without_mappings_counter + "] encountered Pfams without GO mappings) to: \""
2418                     + encountered_pfams_summary_file + "\"" );
2419             ForesterUtil.programMessage( surfacing.PRG_NAME, "Sum of Pfams encountered                : "
2420                     + all_pfams_encountered.size() );
2421             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without a mapping                 : "
2422                     + pfams_without_mappings_counter + " ["
2423                     + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
2424             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without mapping to proc. or func. : "
2425                     + pfams_without_mappings_to_bp_or_mf_counter + " ["
2426                     + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
2427             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping                    : "
2428                     + pfams_with_mappings_counter + " ["
2429                     + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
2430             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping to proc. or func.  : "
2431                     + pfams_with_mappings_to_bp_or_mf_counter + " ["
2432                     + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
2433             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to biological process: "
2434                     + biological_process_counter + " ["
2435                     + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" );
2436             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to molecular function: "
2437                     + molecular_function_counter + " ["
2438                     + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" );
2439             ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to cellular component: "
2440                     + cellular_component_counter + " ["
2441                     + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" );
2442             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2443             summary_writer.write( "# Sum of Pfams encountered                : " + all_pfams_encountered.size() );
2444             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2445             summary_writer.write( "# Pfams without a mapping                 : " + pfams_without_mappings_counter
2446                     + " [" + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
2447             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2448             summary_writer.write( "# Pfams without mapping to proc. or func. : "
2449                     + pfams_without_mappings_to_bp_or_mf_counter + " ["
2450                     + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
2451             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2452             summary_writer.write( "# Pfams with a mapping                    : " + pfams_with_mappings_counter + " ["
2453                     + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
2454             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2455             summary_writer.write( "# Pfams with a mapping to proc. or func.  : "
2456                     + pfams_with_mappings_to_bp_or_mf_counter + " ["
2457                     + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
2458             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2459             summary_writer.write( "# Pfams with mapping to biological process: " + biological_process_counter + " ["
2460                     + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" );
2461             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2462             summary_writer.write( "# Pfams with mapping to molecular function: " + molecular_function_counter + " ["
2463                     + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" );
2464             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2465             summary_writer.write( "# Pfams with mapping to cellular component: " + cellular_component_counter + " ["
2466                     + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" );
2467             summary_writer.write( ForesterUtil.LINE_SEPARATOR );
2468             summary_writer.close();
2469         }
2470         catch ( final IOException e ) {
2471             ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e );
2472         }
2473     }
2474
2475     public static void writeBinaryDomainCombinationsFileForGraphAnalysis( final String[][] input_file_properties,
2476                                                                           final File output_dir,
2477                                                                           final GenomeWideCombinableDomains gwcd,
2478                                                                           final int i,
2479                                                                           final GenomeWideCombinableDomainsSortOrder dc_sort_order ) {
2480         File dc_outfile_dot = new File( input_file_properties[ i ][ 1 ]
2481                 + surfacing.DOMAIN_COMBINITONS_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS );
2482         if ( output_dir != null ) {
2483             dc_outfile_dot = new File( output_dir + ForesterUtil.FILE_SEPARATOR + dc_outfile_dot );
2484         }
2485         checkForOutputFileWriteability( dc_outfile_dot );
2486         final SortedSet<BinaryDomainCombination> binary_combinations = createSetOfAllBinaryDomainCombinationsPerGenome( gwcd );
2487         try {
2488             final BufferedWriter out_dot = new BufferedWriter( new FileWriter( dc_outfile_dot ) );
2489             for( final BinaryDomainCombination bdc : binary_combinations ) {
2490                 out_dot.write( bdc.toGraphDescribingLanguage( BinaryDomainCombination.OutputFormat.DOT, null, null )
2491                         .toString() );
2492                 out_dot.write( SurfacingConstants.NL );
2493             }
2494             out_dot.close();
2495         }
2496         catch ( final IOException e ) {
2497             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
2498         }
2499         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote binary domain combination for \""
2500                 + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", "
2501                 + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile_dot + "\"" );
2502     }
2503
2504     public static void writeBinaryStatesMatrixAsListToFile( final CharacterStateMatrix<CharacterStateMatrix.GainLossStates> matrix,
2505                                                             final CharacterStateMatrix.GainLossStates state,
2506                                                             final String filename,
2507                                                             final String indentifier_characters_separator,
2508                                                             final String character_separator,
2509                                                             final Map<String, String> descriptions ) {
2510         final File outfile = new File( filename );
2511         checkForOutputFileWriteability( outfile );
2512         final SortedSet<String> sorted_ids = new TreeSet<String>();
2513         for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) {
2514             sorted_ids.add( matrix.getIdentifier( i ) );
2515         }
2516         try {
2517             final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
2518             for( final String id : sorted_ids ) {
2519                 out.write( indentifier_characters_separator );
2520                 out.write( "#" + id );
2521                 out.write( indentifier_characters_separator );
2522                 for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) {
2523                     // Not nice:
2524                     // using null to indicate either UNCHANGED_PRESENT or GAIN.
2525                     if ( ( matrix.getState( id, c ) == state )
2526                             || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) || ( matrix
2527                                     .getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) ) ) ) {
2528                         out.write( matrix.getCharacter( c ) );
2529                         if ( ( descriptions != null ) && !descriptions.isEmpty()
2530                                 && descriptions.containsKey( matrix.getCharacter( c ) ) ) {
2531                             out.write( "\t" );
2532                             out.write( descriptions.get( matrix.getCharacter( c ) ) );
2533                         }
2534                         out.write( character_separator );
2535                     }
2536                 }
2537             }
2538             out.flush();
2539             out.close();
2540         }
2541         catch ( final IOException e ) {
2542             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
2543         }
2544         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters list: \"" + filename + "\"" );
2545     }
2546
2547     public static void writeBinaryStatesMatrixAsListToFileForBinaryCombinationsForGraphAnalysis( final CharacterStateMatrix<CharacterStateMatrix.GainLossStates> matrix,
2548                                                                                                  final CharacterStateMatrix.GainLossStates state,
2549                                                                                                  final String filename,
2550                                                                                                  final String indentifier_characters_separator,
2551                                                                                                  final String character_separator,
2552                                                                                                  final BinaryDomainCombination.OutputFormat bc_output_format ) {
2553         final File outfile = new File( filename );
2554         checkForOutputFileWriteability( outfile );
2555         final SortedSet<String> sorted_ids = new TreeSet<String>();
2556         for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) {
2557             sorted_ids.add( matrix.getIdentifier( i ) );
2558         }
2559         try {
2560             final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
2561             for( final String id : sorted_ids ) {
2562                 out.write( indentifier_characters_separator );
2563                 out.write( "#" + id );
2564                 out.write( indentifier_characters_separator );
2565                 for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) {
2566                     // Not nice:
2567                     // using null to indicate either UNCHANGED_PRESENT or GAIN.
2568                     if ( ( matrix.getState( id, c ) == state )
2569                             || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) || ( matrix
2570                                     .getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) ) ) ) {
2571                         BinaryDomainCombination bdc = null;
2572                         try {
2573                             bdc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( c ) );
2574                         }
2575                         catch ( final Exception e ) {
2576                             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
2577                         }
2578                         out.write( bdc.toGraphDescribingLanguage( bc_output_format, null, null ).toString() );
2579                         out.write( character_separator );
2580                     }
2581                 }
2582             }
2583             out.flush();
2584             out.close();
2585         }
2586         catch ( final IOException e ) {
2587             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
2588         }
2589         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters list: \"" + filename + "\"" );
2590     }
2591
2592     public static void writeBinaryStatesMatrixToList( final Map<String, List<GoId>> domain_id_to_go_ids_map,
2593                                                       final Map<GoId, GoTerm> go_id_to_term_map,
2594                                                       final GoNameSpace go_namespace_limit,
2595                                                       final boolean domain_combinations,
2596                                                       final CharacterStateMatrix<CharacterStateMatrix.GainLossStates> matrix,
2597                                                       final CharacterStateMatrix.GainLossStates state,
2598                                                       final String filename,
2599                                                       final String indentifier_characters_separator,
2600                                                       final String character_separator,
2601                                                       final String title_for_html,
2602                                                       final String prefix_for_html,
2603                                                       final Map<String, Set<String>>[] domain_id_to_secondary_features_maps,
2604                                                       final SortedSet<String> all_pfams_encountered,
2605                                                       final SortedSet<String> pfams_gained_or_lost,
2606                                                       final String suffix_for_per_node_events_file,
2607                                                       final Map<String, Integer> tax_code_to_id_map ) {
2608         if ( ( go_namespace_limit != null ) && ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) {
2609             throw new IllegalArgumentException( "attempt to use GO namespace limit without a GO-id to term map" );
2610         }
2611         else if ( ( ( domain_id_to_go_ids_map == null ) || ( domain_id_to_go_ids_map.size() < 1 ) ) ) {
2612             throw new IllegalArgumentException( "attempt to output detailed HTML without a Pfam to GO map" );
2613         }
2614         else if ( ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) {
2615             throw new IllegalArgumentException( "attempt to output detailed HTML without a GO-id to term map" );
2616         }
2617         final File outfile = new File( filename );
2618         checkForOutputFileWriteability( outfile );
2619         final SortedSet<String> sorted_ids = new TreeSet<String>();
2620         for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) {
2621             sorted_ids.add( matrix.getIdentifier( i ) );
2622         }
2623         try {
2624             final Writer out = new BufferedWriter( new FileWriter( outfile ) );
2625             final File per_node_go_mapped_domain_gain_loss_files_base_dir = createBaseDirForPerNodeDomainFiles( surfacing.BASE_DIRECTORY_PER_NODE_DOMAIN_GAIN_LOSS_FILES,
2626                                                                                                                 domain_combinations,
2627                                                                                                                 state,
2628                                                                                                                 filename );
2629             Writer per_node_go_mapped_domain_gain_loss_outfile_writer = null;
2630             File per_node_go_mapped_domain_gain_loss_outfile = null;
2631             int per_node_counter = 0;
2632             out.write( "<html>" );
2633             out.write( SurfacingConstants.NL );
2634             writeHtmlHead( out, title_for_html );
2635             out.write( SurfacingConstants.NL );
2636             out.write( "<body>" );
2637             out.write( SurfacingConstants.NL );
2638             out.write( "<h1>" );
2639             out.write( SurfacingConstants.NL );
2640             out.write( title_for_html );
2641             out.write( SurfacingConstants.NL );
2642             out.write( "</h1>" );
2643             out.write( SurfacingConstants.NL );
2644             out.write( "<table>" );
2645             out.write( SurfacingConstants.NL );
2646             for( final String id : sorted_ids ) {
2647                 final Matcher matcher = PATTERN_SP_STYLE_TAXONOMY.matcher( id );
2648                 if ( matcher.matches() ) {
2649                     continue;
2650                 }
2651                 out.write( "<tr>" );
2652                 out.write( "<td>" );
2653                 out.write( "<a href=\"#" + id + "\">" + id + "</a>" );
2654                 out.write( "</td>" );
2655                 out.write( "</tr>" );
2656                 out.write( SurfacingConstants.NL );
2657             }
2658             out.write( "</table>" );
2659             out.write( SurfacingConstants.NL );
2660             for( final String id : sorted_ids ) {
2661                 final Matcher matcher = PATTERN_SP_STYLE_TAXONOMY.matcher( id );
2662                 if ( matcher.matches() ) {
2663                     continue;
2664                 }
2665                 out.write( SurfacingConstants.NL );
2666                 out.write( "<h2>" );
2667                 out.write( "<a name=\"" + id + "\">" + id + "</a>" );
2668                 writeTaxonomyLinks( out, id, tax_code_to_id_map );
2669                 out.write( "</h2>" );
2670                 out.write( SurfacingConstants.NL );
2671                 out.write( "<table>" );
2672                 out.write( SurfacingConstants.NL );
2673                 out.write( "<tr>" );
2674                 out.write( "<td><b>" );
2675                 out.write( "Pfam domain(s)" );
2676                 out.write( "</b></td><td><b>" );
2677                 out.write( "GO term acc" );
2678                 out.write( "</b></td><td><b>" );
2679                 out.write( "GO term" );
2680                 out.write( "</b></td><td><b>" );
2681                 out.write( "GO namespace" );
2682                 out.write( "</b></td>" );
2683                 out.write( "</tr>" );
2684                 out.write( SurfacingConstants.NL );
2685                 out.write( "</tr>" );
2686                 out.write( SurfacingConstants.NL );
2687                 per_node_counter = 0;
2688                 if ( matrix.getNumberOfCharacters() > 0 ) {
2689                     per_node_go_mapped_domain_gain_loss_outfile = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
2690                             + ForesterUtil.FILE_SEPARATOR + id + suffix_for_per_node_events_file );
2691                     SurfacingUtil.checkForOutputFileWriteability( per_node_go_mapped_domain_gain_loss_outfile );
2692                     per_node_go_mapped_domain_gain_loss_outfile_writer = ForesterUtil
2693                             .createBufferedWriter( per_node_go_mapped_domain_gain_loss_outfile );
2694                 }
2695                 else {
2696                     per_node_go_mapped_domain_gain_loss_outfile = null;
2697                     per_node_go_mapped_domain_gain_loss_outfile_writer = null;
2698                 }
2699                 for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) {
2700                     // Not nice:
2701                     // using null to indicate either UNCHANGED_PRESENT or GAIN.
2702                     if ( ( matrix.getState( id, c ) == state )
2703                             || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) || ( matrix
2704                                     .getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) ) ) ) {
2705                         final String character = matrix.getCharacter( c );
2706                         String domain_0 = "";
2707                         String domain_1 = "";
2708                         if ( character.indexOf( BinaryDomainCombination.SEPARATOR ) > 0 ) {
2709                             final String[] s = character.split( BinaryDomainCombination.SEPARATOR );
2710                             if ( s.length != 2 ) {
2711                                 throw new AssertionError( "this should not have happened: unexpected format for domain combination: ["
2712                                         + character + "]" );
2713                             }
2714                             domain_0 = s[ 0 ];
2715                             domain_1 = s[ 1 ];
2716                         }
2717                         else {
2718                             domain_0 = character;
2719                         }
2720                         writeDomainData( domain_id_to_go_ids_map,
2721                                          go_id_to_term_map,
2722                                          go_namespace_limit,
2723                                          out,
2724                                          domain_0,
2725                                          domain_1,
2726                                          prefix_for_html,
2727                                          character_separator,
2728                                          domain_id_to_secondary_features_maps,
2729                                          null );
2730                         all_pfams_encountered.add( domain_0 );
2731                         if ( pfams_gained_or_lost != null ) {
2732                             pfams_gained_or_lost.add( domain_0 );
2733                         }
2734                         if ( !ForesterUtil.isEmpty( domain_1 ) ) {
2735                             all_pfams_encountered.add( domain_1 );
2736                             if ( pfams_gained_or_lost != null ) {
2737                                 pfams_gained_or_lost.add( domain_1 );
2738                             }
2739                         }
2740                         if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) {
2741                             writeDomainsToIndividualFilePerTreeNode( per_node_go_mapped_domain_gain_loss_outfile_writer,
2742                                                                      domain_0,
2743                                                                      domain_1 );
2744                             per_node_counter++;
2745                         }
2746                     }
2747                 }
2748                 if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) {
2749                     per_node_go_mapped_domain_gain_loss_outfile_writer.close();
2750                     if ( per_node_counter < 1 ) {
2751                         per_node_go_mapped_domain_gain_loss_outfile.delete();
2752                     }
2753                     per_node_counter = 0;
2754                 }
2755                 out.write( "</table>" );
2756                 out.write( SurfacingConstants.NL );
2757                 out.write( "<hr>" );
2758                 out.write( SurfacingConstants.NL );
2759             } // for( final String id : sorted_ids ) {  
2760             out.write( "</body>" );
2761             out.write( SurfacingConstants.NL );
2762             out.write( "</html>" );
2763             out.write( SurfacingConstants.NL );
2764             out.flush();
2765             out.close();
2766         }
2767         catch ( final IOException e ) {
2768             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
2769         }
2770         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters detailed HTML list: \"" + filename + "\"" );
2771     }
2772
2773     public static void writeDomainCombinationsCountsFile( final String[][] input_file_properties,
2774                                                           final File output_dir,
2775                                                           final Writer per_genome_domain_promiscuity_statistics_writer,
2776                                                           final GenomeWideCombinableDomains gwcd,
2777                                                           final int i,
2778                                                           final GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder dc_sort_order ) {
2779         File dc_outfile = new File( input_file_properties[ i ][ 1 ]
2780                 + surfacing.DOMAIN_COMBINITON_COUNTS_OUTPUTFILE_SUFFIX );
2781         if ( output_dir != null ) {
2782             dc_outfile = new File( output_dir + ForesterUtil.FILE_SEPARATOR + dc_outfile );
2783         }
2784         checkForOutputFileWriteability( dc_outfile );
2785         try {
2786             final BufferedWriter out = new BufferedWriter( new FileWriter( dc_outfile ) );
2787             out.write( gwcd.toStringBuilder( dc_sort_order ).toString() );
2788             out.close();
2789         }
2790         catch ( final IOException e ) {
2791             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
2792         }
2793         final DescriptiveStatistics stats = gwcd.getPerGenomeDomainPromiscuityStatistics();
2794         try {
2795             per_genome_domain_promiscuity_statistics_writer.write( input_file_properties[ i ][ 1 ] + "\t" );
2796             per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats.arithmeticMean() ) + "\t" );
2797             if ( stats.getN() < 2 ) {
2798                 per_genome_domain_promiscuity_statistics_writer.write( "n/a" + "\t" );
2799             }
2800             else {
2801                 per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats
2802                         .sampleStandardDeviation() ) + "\t" );
2803             }
2804             per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats.median() ) + "\t" );
2805             per_genome_domain_promiscuity_statistics_writer.write( ( int ) stats.getMin() + "\t" );
2806             per_genome_domain_promiscuity_statistics_writer.write( ( int ) stats.getMax() + "\t" );
2807             per_genome_domain_promiscuity_statistics_writer.write( stats.getN() + "\t" );
2808             final SortedSet<String> mpds = gwcd.getMostPromiscuosDomain();
2809             for( final String mpd : mpds ) {
2810                 per_genome_domain_promiscuity_statistics_writer.write( mpd + " " );
2811             }
2812             per_genome_domain_promiscuity_statistics_writer.write( ForesterUtil.LINE_SEPARATOR );
2813         }
2814         catch ( final IOException e ) {
2815             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
2816         }
2817         if ( input_file_properties[ i ].length == 3 ) {
2818             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \""
2819                     + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", "
2820                     + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile + "\"" );
2821         }
2822         else {
2823             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \""
2824                     + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ") to: \""
2825                     + dc_outfile + "\"" );
2826         }
2827     }
2828
2829     private static void writeDomainData( final Map<String, List<GoId>> domain_id_to_go_ids_map,
2830                                          final Map<GoId, GoTerm> go_id_to_term_map,
2831                                          final GoNameSpace go_namespace_limit,
2832                                          final Writer out,
2833                                          final String domain_0,
2834                                          final String domain_1,
2835                                          final String prefix_for_html,
2836                                          final String character_separator_for_non_html_output,
2837                                          final Map<String, Set<String>>[] domain_id_to_secondary_features_maps,
2838                                          final Set<GoId> all_go_ids ) throws IOException {
2839         boolean any_go_annotation_present = false;
2840         boolean first_has_no_go = false;
2841         int domain_count = 2; // To distinguish between domains and binary domain combinations.
2842         if ( ForesterUtil.isEmpty( domain_1 ) ) {
2843             domain_count = 1;
2844         }
2845         // The following has a difficult to understand logic.  
2846         for( int d = 0; d < domain_count; ++d ) {
2847             List<GoId> go_ids = null;
2848             boolean go_annotation_present = false;
2849             if ( d == 0 ) {
2850                 if ( domain_id_to_go_ids_map.containsKey( domain_0 ) ) {
2851                     go_annotation_present = true;
2852                     any_go_annotation_present = true;
2853                     go_ids = domain_id_to_go_ids_map.get( domain_0 );
2854                 }
2855                 else {
2856                     first_has_no_go = true;
2857                 }
2858             }
2859             else {
2860                 if ( domain_id_to_go_ids_map.containsKey( domain_1 ) ) {
2861                     go_annotation_present = true;
2862                     any_go_annotation_present = true;
2863                     go_ids = domain_id_to_go_ids_map.get( domain_1 );
2864                 }
2865             }
2866             if ( go_annotation_present ) {
2867                 boolean first = ( ( d == 0 ) || ( ( d == 1 ) && first_has_no_go ) );
2868                 for( final GoId go_id : go_ids ) {
2869                     out.write( "<tr>" );
2870                     if ( first ) {
2871                         first = false;
2872                         writeDomainIdsToHtml( out,
2873                                               domain_0,
2874                                               domain_1,
2875                                               prefix_for_html,
2876                                               domain_id_to_secondary_features_maps );
2877                     }
2878                     else {
2879                         out.write( "<td></td>" );
2880                     }
2881                     if ( !go_id_to_term_map.containsKey( go_id ) ) {
2882                         throw new IllegalArgumentException( "GO-id [" + go_id + "] not found in GO-id to GO-term map" );
2883                     }
2884                     final GoTerm go_term = go_id_to_term_map.get( go_id );
2885                     if ( ( go_namespace_limit == null ) || go_namespace_limit.equals( go_term.getGoNameSpace() ) ) {
2886                         // final String top = GoUtils.getPenultimateGoTerm( go_term, go_id_to_term_map ).getName();
2887                         final String go_id_str = go_id.getId();
2888                         out.write( "<td>" );
2889                         out.write( "<a href=\"" + SurfacingConstants.AMIGO_LINK + go_id_str
2890                                 + "\" target=\"amigo_window\">" + go_id_str + "</a>" );
2891                         out.write( "</td><td>" );
2892                         out.write( go_term.getName() );
2893                         if ( domain_count == 2 ) {
2894                             out.write( " (" + d + ")" );
2895                         }
2896                         out.write( "</td><td>" );
2897                         // out.write( top );
2898                         // out.write( "</td><td>" );
2899                         out.write( "[" );
2900                         out.write( go_term.getGoNameSpace().toShortString() );
2901                         out.write( "]" );
2902                         out.write( "</td>" );
2903                         if ( all_go_ids != null ) {
2904                             all_go_ids.add( go_id );
2905                         }
2906                     }
2907                     else {
2908                         out.write( "<td>" );
2909                         out.write( "</td><td>" );
2910                         out.write( "</td><td>" );
2911                         out.write( "</td><td>" );
2912                         out.write( "</td>" );
2913                     }
2914                     out.write( "</tr>" );
2915                     out.write( SurfacingConstants.NL );
2916                 }
2917             }
2918         } //  for( int d = 0; d < domain_count; ++d ) 
2919         if ( !any_go_annotation_present ) {
2920             out.write( "<tr>" );
2921             writeDomainIdsToHtml( out, domain_0, domain_1, prefix_for_html, domain_id_to_secondary_features_maps );
2922             out.write( "<td>" );
2923             out.write( "</td><td>" );
2924             out.write( "</td><td>" );
2925             out.write( "</td><td>" );
2926             out.write( "</td>" );
2927             out.write( "</tr>" );
2928             out.write( SurfacingConstants.NL );
2929         }
2930     }
2931
2932     private static void writeDomainIdsToHtml( final Writer out,
2933                                               final String domain_0,
2934                                               final String domain_1,
2935                                               final String prefix_for_detailed_html,
2936                                               final Map<String, Set<String>>[] domain_id_to_secondary_features_maps )
2937             throws IOException {
2938         out.write( "<td>" );
2939         if ( !ForesterUtil.isEmpty( prefix_for_detailed_html ) ) {
2940             out.write( prefix_for_detailed_html );
2941             out.write( " " );
2942         }
2943         out.write( "<a href=\"" + SurfacingConstants.PFAM_FAMILY_ID_LINK + domain_0 + "\">" + domain_0 + "</a>" );
2944         out.write( "</td>" );
2945     }
2946
2947     private final static void writeColorLabels( String l, Color c, Writer w ) throws IOException {
2948         w.write( "<tr><td><b><span style=\"color:" );
2949         w.write( String.format( "#%02x%02x%02x", c.getRed(), c.getGreen(), c.getBlue() ) );
2950         w.write( "\">" );
2951         w.write( l );
2952         w.write( "</span></b></td></tr>" );
2953         w.write( SurfacingConstants.NL );
2954     }
2955
2956     public static void writeDomainSimilaritiesToFile( final StringBuilder html_desc,
2957                                                       final StringBuilder html_title,
2958                                                       final Writer simple_tab_writer,
2959                                                       final Writer single_writer,
2960                                                       Map<Character, Writer> split_writers,
2961                                                       final SortedSet<DomainSimilarity> similarities,
2962                                                       final boolean treat_as_binary,
2963                                                       final List<Species> species_order,
2964                                                       final PrintableDomainSimilarity.PRINT_OPTION print_option,
2965                                                       final DomainSimilarity.DomainSimilarityScoring scoring,
2966                                                       final boolean verbose,
2967                                                       final Map<String, Integer> tax_code_to_id_map,
2968                                                       Phylogeny phy ) throws IOException {
2969         if ( ( single_writer != null ) && ( ( split_writers == null ) || split_writers.isEmpty() ) ) {
2970             split_writers = new HashMap<Character, Writer>();
2971             split_writers.put( '_', single_writer );
2972         }
2973         switch ( print_option ) {
2974             case SIMPLE_TAB_DELIMITED:
2975                 break;
2976             case HTML:
2977                 for( final Character key : split_writers.keySet() ) {
2978                     final Writer w = split_writers.get( key );
2979                     w.write( "<html>" );
2980                     w.write( SurfacingConstants.NL );
2981                     if ( key != '_' ) {
2982                         writeHtmlHead( w, "DC analysis (" + html_title + ") " + key.toString().toUpperCase() );
2983                     }
2984                     else {
2985                         writeHtmlHead( w, "DC analysis (" + html_title + ")" );
2986                     }
2987                     w.write( SurfacingConstants.NL );
2988                     w.write( "<body>" );
2989                     w.write( SurfacingConstants.NL );
2990                     w.write( html_desc.toString() );
2991                     w.write( SurfacingConstants.NL );
2992                     w.write( "<hr>" );
2993                     w.write( SurfacingConstants.NL );
2994                     w.write( "<br>" );
2995                     w.write( SurfacingConstants.NL );
2996                     w.write( "<table>" );
2997                     w.write( SurfacingConstants.NL );
2998                     w.write( "<tr><td><b>Domains:</b></td></tr>" );
2999                     w.write( SurfacingConstants.NL );
3000                 }
3001                 break;
3002         }
3003         //
3004         for( final DomainSimilarity similarity : similarities ) {
3005             if ( ( species_order != null ) && !species_order.isEmpty() ) {
3006                 ( ( PrintableDomainSimilarity ) similarity ).setSpeciesOrder( species_order );
3007             }
3008             if ( single_writer != null ) {
3009                 single_writer.write( "<tr><td><b><a href=\"#" + similarity.getDomainId() + "\">"
3010                         + similarity.getDomainId() + "</a></b></td></tr>" );
3011                 single_writer.write( SurfacingConstants.NL );
3012             }
3013             else {
3014                 Writer local_writer = split_writers.get( ( similarity.getDomainId().charAt( 0 ) + "" ).toLowerCase()
3015                         .charAt( 0 ) );
3016                 if ( local_writer == null ) {
3017                     local_writer = split_writers.get( '0' );
3018                 }
3019                 local_writer.write( "<tr><td><b><a href=\"#" + similarity.getDomainId() + "\">"
3020                         + similarity.getDomainId() + "</a></b></td></tr>" );
3021                 local_writer.write( SurfacingConstants.NL );
3022             }
3023         }
3024         for( final Writer w : split_writers.values() ) {
3025             w.write( "</table>" );
3026             w.write( SurfacingConstants.NL );
3027             w.write( "<hr>" );
3028             w.write( SurfacingConstants.NL );
3029             //
3030             w.write( "<table>" );
3031             w.write( SurfacingConstants.NL );
3032             w.write( "<tr><td><b>" );
3033             w.write( "Species group colors:" );
3034             w.write( "</b></td></tr>" );
3035             w.write( SurfacingConstants.NL );
3036             writeColorLabels( "Deuterostomia", TaxonomyColors.DEUTEROSTOMIA_COLOR, w );
3037             writeColorLabels( "Protostomia", TaxonomyColors.PROTOSTOMIA_COLOR, w );
3038             writeColorLabels( "Cnidaria", TaxonomyColors.CNIDARIA_COLOR, w );
3039             writeColorLabels( "Placozoa", TaxonomyColors.PLACOZOA_COLOR, w );
3040             writeColorLabels( "Ctenophora (comb jellies)", TaxonomyColors.CTENOPHORA_COLOR, w );
3041             writeColorLabels( "Porifera (sponges)", TaxonomyColors.PORIFERA_COLOR, w );
3042             writeColorLabels( "Choanoflagellida", TaxonomyColors.CHOANOFLAGELLIDA, w );
3043             writeColorLabels( "Ichthyosporea & Filasterea", TaxonomyColors.ICHTHYOSPOREA_AND_FILASTEREA, w );
3044             writeColorLabels( "Fungi", TaxonomyColors.FUNGI_COLOR, w );
3045             writeColorLabels( "Nucleariidae and Fonticula group",
3046                               TaxonomyColors.NUCLEARIIDAE_AND_FONTICULA_GROUP_COLOR,
3047                               w );
3048             writeColorLabels( "Amoebozoa", TaxonomyColors.AMOEBOZOA_COLOR, w );
3049             writeColorLabels( "Embryophyta (plants)", TaxonomyColors.EMBRYOPHYTA_COLOR, w );
3050             writeColorLabels( "Chlorophyta (green algae)", TaxonomyColors.CHLOROPHYTA_COLOR, w );
3051             writeColorLabels( "Rhodophyta (red algae)", TaxonomyColors.RHODOPHYTA_COLOR, w );
3052             writeColorLabels( "Glaucocystophyce (Glaucophyta)", TaxonomyColors.GLAUCOPHYTA_COLOR, w );
3053             writeColorLabels( "Hacrobia (Cryptophyta & Haptophyceae & Centroheliozoa)",
3054                               TaxonomyColors.HACROBIA_COLOR,
3055                               w );
3056             writeColorLabels( "Stramenopiles (Chromophyta, heterokonts)", TaxonomyColors.STRAMENOPILES_COLOR, w );
3057             writeColorLabels( "Alveolata", TaxonomyColors.ALVEOLATA_COLOR, w );
3058             writeColorLabels( "Rhizaria", TaxonomyColors.RHIZARIA_COLOR, w );
3059             writeColorLabels( "Excavata", TaxonomyColors.EXCAVATA_COLOR, w );
3060             writeColorLabels( "Apusozoa", TaxonomyColors.APUSOZOA_COLOR, w );
3061             writeColorLabels( "Archaea", TaxonomyColors.ARCHAEA_COLOR, w );
3062             writeColorLabels( "Bacteria", TaxonomyColors.BACTERIA_COLOR, w );
3063             w.write( "</table>" );
3064             w.write( SurfacingConstants.NL );
3065             //
3066             w.write( "<hr>" );
3067             w.write( SurfacingConstants.NL );
3068             w.write( "<table>" );
3069             w.write( SurfacingConstants.NL );
3070         }
3071         //
3072         for( final DomainSimilarity similarity : similarities ) {
3073             if ( ( species_order != null ) && !species_order.isEmpty() ) {
3074                 ( ( PrintableDomainSimilarity ) similarity ).setSpeciesOrder( species_order );
3075             }
3076             if ( simple_tab_writer != null ) {
3077                 simple_tab_writer.write( similarity.toStringBuffer( PRINT_OPTION.SIMPLE_TAB_DELIMITED,
3078                                                                     tax_code_to_id_map,
3079                                                                     null ).toString() );
3080             }
3081             if ( single_writer != null ) {
3082                 single_writer.write( similarity.toStringBuffer( print_option, tax_code_to_id_map, phy ).toString() );
3083                 single_writer.write( SurfacingConstants.NL );
3084             }
3085             else {
3086                 Writer local_writer = split_writers.get( ( similarity.getDomainId().charAt( 0 ) + "" ).toLowerCase()
3087                         .charAt( 0 ) );
3088                 if ( local_writer == null ) {
3089                     local_writer = split_writers.get( '0' );
3090                 }
3091                 local_writer.write( similarity.toStringBuffer( print_option, tax_code_to_id_map, phy ).toString() );
3092                 local_writer.write( SurfacingConstants.NL );
3093             }
3094         }
3095         switch ( print_option ) {
3096             case HTML:
3097                 for( final Writer w : split_writers.values() ) {
3098                     w.write( SurfacingConstants.NL );
3099                     w.write( "</table>" );
3100                     w.write( SurfacingConstants.NL );
3101                     w.write( "</font>" );
3102                     w.write( SurfacingConstants.NL );
3103                     w.write( "</body>" );
3104                     w.write( SurfacingConstants.NL );
3105                     w.write( "</html>" );
3106                     w.write( SurfacingConstants.NL );
3107                 }
3108                 break;
3109             default:
3110                 break;
3111         }
3112         for( final Writer w : split_writers.values() ) {
3113             w.close();
3114         }
3115     }
3116
3117     private static void writeDomainsToIndividualFilePerTreeNode( final Writer individual_files_writer,
3118                                                                  final String domain_0,
3119                                                                  final String domain_1 ) throws IOException {
3120         individual_files_writer.write( domain_0 );
3121         individual_files_writer.write( ForesterUtil.LINE_SEPARATOR );
3122         if ( !ForesterUtil.isEmpty( domain_1 ) ) {
3123             individual_files_writer.write( domain_1 );
3124             individual_files_writer.write( ForesterUtil.LINE_SEPARATOR );
3125         }
3126     }
3127
3128     public static void writeMatrixToFile( final CharacterStateMatrix<?> matrix,
3129                                           final String filename,
3130                                           final Format format ) {
3131         final File outfile = new File( filename );
3132         checkForOutputFileWriteability( outfile );
3133         try {
3134             final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
3135             matrix.toWriter( out, format );
3136             out.flush();
3137             out.close();
3138         }
3139         catch ( final IOException e ) {
3140             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
3141         }
3142         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote matrix: \"" + filename + "\"" );
3143     }
3144
3145     public static void writeMatrixToFile( final File matrix_outfile, final List<DistanceMatrix> matrices ) {
3146         checkForOutputFileWriteability( matrix_outfile );
3147         try {
3148             final BufferedWriter out = new BufferedWriter( new FileWriter( matrix_outfile ) );
3149             for( final DistanceMatrix distance_matrix : matrices ) {
3150                 out.write( distance_matrix.toStringBuffer( DistanceMatrix.Format.PHYLIP ).toString() );
3151                 out.write( ForesterUtil.LINE_SEPARATOR );
3152                 out.flush();
3153             }
3154             out.close();
3155         }
3156         catch ( final IOException e ) {
3157             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
3158         }
3159         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + matrix_outfile + "\"" );
3160     }
3161
3162     private static void writePfamsToFile( final String outfile_name, final SortedSet<String> pfams ) {
3163         try {
3164             final Writer writer = new BufferedWriter( new FileWriter( new File( outfile_name ) ) );
3165             for( final String pfam : pfams ) {
3166                 writer.write( pfam );
3167                 writer.write( ForesterUtil.LINE_SEPARATOR );
3168             }
3169             writer.close();
3170             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote " + pfams.size() + " pfams to [" + outfile_name
3171                     + "]" );
3172         }
3173         catch ( final IOException e ) {
3174             ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e );
3175         }
3176     }
3177
3178     public static void writePhylogenyToFile( final Phylogeny phylogeny, final String filename ) {
3179         final PhylogenyWriter writer = new PhylogenyWriter();
3180         try {
3181             writer.toPhyloXML( new File( filename ), phylogeny, 1 );
3182         }
3183         catch ( final IOException e ) {
3184             ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "failed to write phylogeny to \"" + filename + "\": "
3185                     + e );
3186         }
3187         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote phylogeny to \"" + filename + "\"" );
3188     }
3189
3190     public static void writePresentToNexus( final File output_file,
3191                                             final File positive_filter_file,
3192                                             final SortedSet<String> filter,
3193                                             final List<GenomeWideCombinableDomains> gwcd_list ) {
3194         try {
3195             writeMatrixToFile( DomainParsimonyCalculator.createMatrixOfDomainPresenceOrAbsence( gwcd_list,
3196                                                                                                 positive_filter_file == null ? null
3197                                                                                                         : filter ),
3198                                output_file + surfacing.DOMAINS_PRESENT_NEXUS,
3199                                Format.NEXUS_BINARY );
3200             writeMatrixToFile( DomainParsimonyCalculator.createMatrixOfBinaryDomainCombinationPresenceOrAbsence( gwcd_list ),
3201                                output_file + surfacing.BDC_PRESENT_NEXUS,
3202                                Format.NEXUS_BINARY );
3203         }
3204         catch ( final Exception e ) {
3205             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
3206         }
3207     }
3208
3209     public static void writeProteinListsForAllSpecies( final File output_dir,
3210                                                        final SortedMap<Species, List<Protein>> protein_lists_per_species,
3211                                                        final List<GenomeWideCombinableDomains> gwcd_list,
3212                                                        final double domain_e_cutoff ) {
3213         final SortedSet<String> all_domains = new TreeSet<String>();
3214         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
3215             all_domains.addAll( gwcd.getAllDomainIds() );
3216         }
3217         for( final String domain : all_domains ) {
3218             final File out = new File( output_dir + ForesterUtil.FILE_SEPARATOR + domain + surfacing.SEQ_EXTRACT_SUFFIX );
3219             checkForOutputFileWriteability( out );
3220             try {
3221                 final Writer proteins_file_writer = new BufferedWriter( new FileWriter( out ) );
3222                 extractProteinNames( protein_lists_per_species,
3223                                      domain,
3224                                      proteins_file_writer,
3225                                      "\t",
3226                                      surfacing.LIMIT_SPEC_FOR_PROT_EX,
3227                                      domain_e_cutoff );
3228                 proteins_file_writer.close();
3229             }
3230             catch ( final IOException e ) {
3231                 ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
3232             }
3233             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote proteins list to \"" + out + "\"" );
3234         }
3235     }
3236
3237     public static void writeTaxonomyLinks( final Writer writer,
3238                                            final String species,
3239                                            final Map<String, Integer> tax_code_to_id_map ) throws IOException {
3240         if ( ( species.length() > 1 ) && ( species.indexOf( '_' ) < 1 ) ) {
3241             writer.write( " [" );
3242             if ( ( tax_code_to_id_map != null ) && tax_code_to_id_map.containsKey( species ) ) {
3243                 writer.write( "<a href=\"" + SurfacingConstants.UNIPROT_TAXONOMY_ID_LINK
3244                         + tax_code_to_id_map.get( species ) + "\" target=\"taxonomy_window\">uniprot</a>" );
3245             }
3246             else {
3247                 writer.write( "<a href=\"" + SurfacingConstants.EOL_LINK + species
3248                         + "\" target=\"taxonomy_window\">eol</a>" );
3249                 writer.write( "|" );
3250                 writer.write( "<a href=\"" + SurfacingConstants.GOOGLE_SCHOLAR_SEARCH + species
3251                         + "\" target=\"taxonomy_window\">scholar</a>" );
3252                 writer.write( "|" );
3253                 writer.write( "<a href=\"" + SurfacingConstants.GOOGLE_WEB_SEARCH_LINK + species
3254                         + "\" target=\"taxonomy_window\">google</a>" );
3255             }
3256             writer.write( "]" );
3257         }
3258     }
3259
3260     private static void writeToNexus( final String outfile_name,
3261                                       final CharacterStateMatrix<BinaryStates> matrix,
3262                                       final Phylogeny phylogeny ) {
3263         if ( !( matrix instanceof BasicCharacterStateMatrix ) ) {
3264             throw new IllegalArgumentException( "can only write matrices of type [" + BasicCharacterStateMatrix.class
3265                     + "] to nexus" );
3266         }
3267         final BasicCharacterStateMatrix<BinaryStates> my_matrix = ( org.forester.evoinference.matrix.character.BasicCharacterStateMatrix<BinaryStates> ) matrix;
3268         final List<Phylogeny> phylogenies = new ArrayList<Phylogeny>( 1 );
3269         phylogenies.add( phylogeny );
3270         try {
3271             final BufferedWriter w = new BufferedWriter( new FileWriter( outfile_name ) );
3272             w.write( NexusConstants.NEXUS );
3273             w.write( ForesterUtil.LINE_SEPARATOR );
3274             my_matrix.writeNexusTaxaBlock( w );
3275             my_matrix.writeNexusBinaryChractersBlock( w );
3276             PhylogenyWriter.writeNexusTreesBlock( w, phylogenies, NH_CONVERSION_SUPPORT_VALUE_STYLE.NONE );
3277             w.flush();
3278             w.close();
3279             ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote Nexus file: \"" + outfile_name + "\"" );
3280         }
3281         catch ( final IOException e ) {
3282             ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
3283         }
3284     }
3285
3286     private static void writeToNexus( final String outfile_name,
3287                                       final DomainParsimonyCalculator domain_parsimony,
3288                                       final Phylogeny phylogeny ) {
3289         writeToNexus( outfile_name + surfacing.NEXUS_EXTERNAL_DOMAINS,
3290                       domain_parsimony.createMatrixOfDomainPresenceOrAbsence(),
3291                       phylogeny );
3292         writeToNexus( outfile_name + surfacing.NEXUS_EXTERNAL_DOMAIN_COMBINATIONS,
3293                       domain_parsimony.createMatrixOfBinaryDomainCombinationPresenceOrAbsence(),
3294                       phylogeny );
3295     }
3296
3297     private SurfacingUtil() {
3298         // Hidden constructor.
3299     }
3300 }