JAL-2844 colouring subtree method made public
[jalview.git] / forester / java / src / org / forester / rio / RIO.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // Copyright (C) 2000-2001 Washington University School of Medicine
8 // and Howard Hughes Medical Institute
9 // All rights reserved
10 //
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
15 //
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
20 //
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 //
25 // Contact: phylosoft @ gmail . com
26 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
27
28 package org.forester.rio;
29
30 import java.io.File;
31 import java.io.FileNotFoundException;
32 import java.io.IOException;
33 import java.util.ArrayList;
34 import java.util.Collections;
35 import java.util.HashMap;
36 import java.util.HashSet;
37 import java.util.List;
38 import java.util.Map;
39 import java.util.Set;
40 import java.util.SortedSet;
41 import java.util.TreeSet;
42
43 import org.forester.datastructures.IntMatrix;
44 import org.forester.io.parsers.IteratingPhylogenyParser;
45 import org.forester.io.parsers.PhylogenyParser;
46 import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
47 import org.forester.io.parsers.nhx.NHXParser;
48 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
49 import org.forester.io.parsers.util.ParserUtils;
50 import org.forester.phylogeny.Phylogeny;
51 import org.forester.phylogeny.PhylogenyMethods;
52 import org.forester.phylogeny.PhylogenyNode;
53 import org.forester.phylogeny.data.Taxonomy;
54 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
55 import org.forester.phylogeny.factories.PhylogenyFactory;
56 import org.forester.sdi.GSDI;
57 import org.forester.sdi.GSDIR;
58 import org.forester.sdi.SDIException;
59 import org.forester.sdi.SDIR;
60 import org.forester.sdi.SDIutil;
61 import org.forester.sdi.SDIutil.ALGORITHM;
62 import org.forester.sdi.SDIutil.TaxonomyComparisonBase;
63 import org.forester.util.BasicDescriptiveStatistics;
64 import org.forester.util.ForesterUtil;
65
66 public final class RIO {
67
68     public static final int                  DEFAULT_RANGE = -1;
69     private static final int                 END_OF_GT     = Integer.MAX_VALUE;
70     private static IntMatrix                 _m;
71     private Phylogeny[]                      _analyzed_gene_trees;
72     private List<PhylogenyNode>              _removed_gene_tree_nodes;
73     private int                              _ext_nodes;
74     private int                              _int_nodes;
75     private TaxonomyComparisonBase           _gsdir_tax_comp_base;
76     private final StringBuilder              _log;
77     private final BasicDescriptiveStatistics _duplications_stats;
78     private final boolean                    _produce_log;
79     private final boolean                    _verbose;
80     private final REROOTING                  _rerooting;
81     private final Phylogeny                  _species_tree;
82     private Phylogeny                        _min_dub_gene_tree;
83     private Map<Integer, Phylogeny>          _dup_to_tree_map;
84
85     private RIO( final IteratingPhylogenyParser p,
86                  final Phylogeny species_tree,
87                  final ALGORITHM algorithm,
88                  final REROOTING rerooting,
89                  final String outgroup,
90                  int first,
91                  int last,
92                  final boolean produce_log,
93                  final boolean verbose,
94                  final boolean transfer_taxonomy )
95             throws IOException, SDIException, RIOException {
96         if ( ( last == DEFAULT_RANGE ) && ( first >= 0 ) ) {
97             last = END_OF_GT;
98         }
99         else if ( ( first == DEFAULT_RANGE ) && ( last >= 0 ) ) {
100             first = 0;
101         }
102         removeSingleDescendentsNodes( species_tree, verbose );
103         p.reset();
104         checkPreconditions( p, species_tree, rerooting, outgroup, first, last );
105         _produce_log = produce_log;
106         _verbose = verbose;
107         _rerooting = rerooting;
108         _ext_nodes = -1;
109         _int_nodes = -1;
110         _log = new StringBuilder();
111         _gsdir_tax_comp_base = null;
112         _analyzed_gene_trees = null;
113         _removed_gene_tree_nodes = null;
114         _duplications_stats = new BasicDescriptiveStatistics();
115         p.reset();
116         inferOrthologs( p, species_tree, algorithm, outgroup, first, last, transfer_taxonomy );
117         _species_tree = species_tree;
118     }
119
120     private RIO( final Phylogeny[] gene_trees,
121                  final Phylogeny species_tree,
122                  final ALGORITHM algorithm,
123                  final REROOTING rerooting,
124                  final String outgroup,
125                  int first,
126                  int last,
127                  final boolean produce_log,
128                  final boolean verbose,
129                  final boolean transfer_taxonomy )
130             throws IOException, SDIException, RIOException {
131         if ( ( last == DEFAULT_RANGE ) && ( first >= 0 ) ) {
132             last = gene_trees.length - 1;
133         }
134         else if ( ( first == DEFAULT_RANGE ) && ( last >= 0 ) ) {
135             first = 0;
136         }
137         removeSingleDescendentsNodes( species_tree, verbose );
138         checkPreconditions( gene_trees, species_tree, rerooting, outgroup, first, last );
139         _produce_log = produce_log;
140         _verbose = verbose;
141         _rerooting = rerooting;
142         _ext_nodes = -1;
143         _int_nodes = -1;
144         _log = new StringBuilder();
145         _gsdir_tax_comp_base = null;
146         _analyzed_gene_trees = null;
147         _removed_gene_tree_nodes = null;
148         _duplications_stats = new BasicDescriptiveStatistics();
149         inferOrthologs( gene_trees, species_tree, algorithm, outgroup, first, last, transfer_taxonomy );
150         _species_tree = species_tree;
151     }
152
153     public final Phylogeny[] getAnalyzedGeneTrees() {
154         return _analyzed_gene_trees;
155     }
156
157     public final BasicDescriptiveStatistics getDuplicationsStatistics() {
158         return _duplications_stats;
159     }
160
161     /**
162      * Returns the numbers of number of ext nodes in gene trees analyzed (after
163      * stripping).
164      *
165      * @return number of ext nodes in gene trees analyzed (after stripping)
166      */
167     public final int getExtNodesOfAnalyzedGeneTrees() {
168         return _ext_nodes;
169     }
170
171     public final TaxonomyComparisonBase getGSDIRtaxCompBase() {
172         return _gsdir_tax_comp_base;
173     }
174
175     /**
176      * Returns the numbers of number of int nodes in gene trees analyzed (after
177      * stripping).
178      *
179      * @return number of int nodes in gene trees analyzed (after stripping)
180      */
181     public final int getIntNodesOfAnalyzedGeneTrees() {
182         return _int_nodes;
183     }
184
185     public final StringBuilder getLog() {
186         return _log;
187     }
188
189     final public Phylogeny getMinDuplicationsGeneTree() {
190         return _min_dub_gene_tree;
191     }
192
193     public final IntMatrix getOrthologTable() {
194         return _m;
195     }
196
197     public final List<PhylogenyNode> getRemovedGeneTreeNodes() {
198         return _removed_gene_tree_nodes;
199     }
200
201     public final Phylogeny getSpeciesTree() {
202         return _species_tree;
203     }
204
205     private final void inferOrthologs( final IteratingPhylogenyParser parser,
206                                        final Phylogeny species_tree,
207                                        final ALGORITHM algorithm,
208                                        final String outgroup,
209                                        int first,
210                                        final int last,
211                                        final boolean transfer_taxonomy )
212             throws SDIException, RIOException, FileNotFoundException, IOException {
213         if ( !parser.hasNext() ) {
214             throw new RIOException( "no gene trees to analyze" );
215         }
216         if ( log() ) {
217             preLog( -1, species_tree, algorithm, outgroup );
218         }
219         if ( _verbose ) {
220             System.out.println();
221         }
222         int gene_tree_ext_nodes = 0;
223         int i = 0;
224         int counter = 0;
225         final boolean no_range = ( first < 0 ) || ( last < first );
226         while ( parser.hasNext() ) {
227             final Phylogeny gt = parser.next();
228             if ( no_range || ( ( i >= first ) && ( i <= last ) ) ) {
229                 if ( gt.isEmpty() ) {
230                     throw new RIOException( "gene tree #" + i + " is empty" );
231                 }
232                 if ( gt.getNumberOfExternalNodes() == 1 ) {
233                     throw new RIOException( "gene tree #" + i + " has only one external node" );
234                 }
235                 if ( _verbose ) {
236                     System.out.print( "\r" + i );
237                 }
238                 if ( counter == 0 ) {
239                     if ( algorithm == ALGORITHM.SDIR ) {
240                         // Removes from species_tree all species not found in gene_tree.
241                         PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gt, species_tree );
242                         if ( species_tree.isEmpty() ) {
243                             throw new RIOException( "failed to establish species based mapping between gene and species trees" );
244                         }
245                     }
246                     gene_tree_ext_nodes = gt.getNumberOfExternalNodes();
247                 }
248                 else if ( gene_tree_ext_nodes != gt.getNumberOfExternalNodes() ) {
249                     throw new RIOException( "gene tree #" + i + " has a different number of external nodes ("
250                             + gt.getNumberOfExternalNodes() + ") than the preceding gene tree(s) ("
251                             + gene_tree_ext_nodes + ")" );
252                 }
253                 if ( algorithm == ALGORITHM.SDIR ) {
254                     // Removes from gene_tree all species not found in species_tree.
255                     PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gt );
256                     if ( gt.isEmpty() ) {
257                         throw new RIOException( "failed to establish species based mapping between gene and species trees" );
258                     }
259                 }
260                 final Phylogeny analyzed_gt = performOrthologInference( gt,
261                                                                         species_tree,
262                                                                         algorithm,
263                                                                         outgroup,
264                                                                         counter,
265                                                                         transfer_taxonomy );
266                 RIO.calculateOrthologTable( analyzed_gt, true, counter );
267                 ++counter;
268             }
269             ++i;
270         }
271         if ( _verbose ) {
272             System.out.print( "\rGene trees analyzed                 :\t" + counter );
273         }
274         if ( ( first >= 0 ) && ( counter == 0 ) && ( i > 0 ) ) {
275             throw new RIOException( "attempt to analyze first gene tree #" + first + " in a set of " + i );
276         }
277         if ( no_range ) {
278             first = 0;
279         }
280         if ( log() ) {
281             postLog( species_tree, first, ( first + counter ) - 1 );
282         }
283         if ( _verbose ) {
284             System.out.println();
285             System.out.println();
286         }
287     }
288
289     private final void inferOrthologs( final Phylogeny[] gene_trees,
290                                        final Phylogeny species_tree,
291                                        final ALGORITHM algorithm,
292                                        final String outgroup,
293                                        final int first,
294                                        final int last,
295                                        final boolean transfer_taxonomy )
296             throws SDIException, RIOException, FileNotFoundException, IOException {
297         if ( algorithm == ALGORITHM.SDIR ) {
298             // Removes from species_tree all species not found in gene_tree.
299             PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gene_trees[ 0 ], species_tree );
300             if ( species_tree.isEmpty() ) {
301                 throw new RIOException( "failed to establish species based mapping between gene and species trees" );
302             }
303         }
304         final Phylogeny[] my_gene_trees;
305         if ( ( first >= 0 ) && ( last >= first ) && ( last < gene_trees.length ) ) {
306             my_gene_trees = new Phylogeny[ ( 1 + last ) - first ];
307             int c = 0;
308             for( int i = first; i <= last; ++i ) {
309                 my_gene_trees[ c++ ] = gene_trees[ i ];
310             }
311         }
312         else {
313             my_gene_trees = gene_trees;
314         }
315         if ( log() ) {
316             preLog( gene_trees.length, species_tree, algorithm, outgroup );
317         }
318         if ( _verbose && ( my_gene_trees.length >= 4 ) ) {
319             System.out.println();
320         }
321         _analyzed_gene_trees = new Phylogeny[ my_gene_trees.length ];
322         int gene_tree_ext_nodes = 0;
323         for( int i = 0; i < my_gene_trees.length; ++i ) {
324             final Phylogeny gt = my_gene_trees[ i ];
325             if ( gt.isEmpty() ) {
326                 throw new RIOException( "gene tree #" + i + " is empty" );
327             }
328             if ( gt.getNumberOfExternalNodes() == 1 ) {
329                 throw new RIOException( "gene tree #" + i + " has only one external node" );
330             }
331             if ( _verbose && ( my_gene_trees.length > 4 ) ) {
332                 ForesterUtil.updateProgress( ( ( double ) i ) / my_gene_trees.length );
333             }
334             if ( i == 0 ) {
335                 gene_tree_ext_nodes = gt.getNumberOfExternalNodes();
336             }
337             else if ( gene_tree_ext_nodes != gt.getNumberOfExternalNodes() ) {
338                 throw new RIOException( "gene tree #" + i + " has a different number of external nodes ("
339                         + gt.getNumberOfExternalNodes() + ") than the preceding gene tree(s) (" + gene_tree_ext_nodes
340                         + ")" );
341             }
342             if ( algorithm == ALGORITHM.SDIR ) {
343                 // Removes from gene_tree all species not found in species_tree.
344                 PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gt );
345                 if ( gt.isEmpty() ) {
346                     throw new RIOException( "failed to establish species based mapping between gene and species trees" );
347                 }
348             }
349             _analyzed_gene_trees[ i ] = performOrthologInference( gt,
350                                                                   species_tree,
351                                                                   algorithm,
352                                                                   outgroup,
353                                                                   i,
354                                                                   transfer_taxonomy );
355         }
356         if ( log() ) {
357             postLog( species_tree, first, last );
358         }
359         if ( _verbose && ( my_gene_trees.length > 4 ) ) {
360             System.out.println();
361             System.out.println();
362         }
363     }
364
365     private final boolean log() {
366         return _produce_log;
367     }
368
369     private final void log( final String s ) {
370         _log.append( s );
371         _log.append( ForesterUtil.LINE_SEPARATOR );
372     }
373
374     private final void logRemovedGeneTreeNodes() {
375         final SortedSet<String> rn = new TreeSet<String>();
376         for( final PhylogenyNode n : getRemovedGeneTreeNodes() ) {
377             final Taxonomy t = n.getNodeData().getTaxonomy();
378             switch ( getGSDIRtaxCompBase() ) {
379                 case CODE: {
380                     rn.add( t.getTaxonomyCode() );
381                     break;
382                 }
383                 case ID: {
384                     rn.add( t.getIdentifier().toString() );
385                     break;
386                 }
387                 case SCIENTIFIC_NAME: {
388                     rn.add( t.getScientificName() );
389                     break;
390                 }
391             }
392         }
393         final StringBuilder sb = new StringBuilder();
394         for( final String s : rn ) {
395             sb.append( '\t' );
396             sb.append( s );
397         }
398         log( "Species stripped from gene trees    :" + sb );
399     }
400
401     private final Phylogeny performOrthologInference( final Phylogeny gene_tree,
402                                                       final Phylogeny species_tree,
403                                                       final ALGORITHM algorithm,
404                                                       final String outgroup,
405                                                       final int i,
406                                                       final boolean transfer_taxonomy )
407             throws SDIException, RIOException {
408         final Phylogeny assigned_tree;
409         switch ( algorithm ) {
410             case SDIR: {
411                 assigned_tree = performOrthologInferenceBySDI( gene_tree, species_tree );
412                 break;
413             }
414             case GSDIR: {
415                 assigned_tree = performOrthologInferenceByGSDI( gene_tree,
416                                                                 species_tree,
417                                                                 outgroup,
418                                                                 i,
419                                                                 transfer_taxonomy );
420                 break;
421             }
422             default: {
423                 throw new IllegalArgumentException( "illegal algorithm: " + algorithm );
424             }
425         }
426         if ( i == 0 ) {
427             _ext_nodes = assigned_tree.getNumberOfExternalNodes();
428             _int_nodes = assigned_tree.getNumberOfInternalNodes();
429         }
430         else if ( _ext_nodes != assigned_tree.getNumberOfExternalNodes() ) {
431             throw new RIOException( "after stripping gene tree #" + i + " has a different number of external nodes ("
432                     + assigned_tree.getNumberOfExternalNodes() + ") than the preceding gene tree(s) (" + _ext_nodes
433                     + ")" );
434         }
435         return assigned_tree;
436     }
437
438     private final Phylogeny performOrthologInferenceByGSDI( final Phylogeny gene_tree,
439                                                             final Phylogeny species_tree,
440                                                             final String outgroup,
441                                                             final int i,
442                                                             final boolean transfer_taxonomy )
443             throws SDIException, RIOException {
444         final Phylogeny assigned_tree;
445         final int dups;
446         if ( _rerooting == REROOTING.BY_ALGORITHM ) {
447             final GSDIR gsdir = new GSDIR( gene_tree, species_tree, true, i == 0, transfer_taxonomy );
448             assigned_tree = gsdir.getMinDuplicationsSumGeneTree();
449             if ( i == 0 ) {
450                 _removed_gene_tree_nodes = gsdir.getStrippedExternalGeneTreeNodes();
451                 for( final PhylogenyNode r : _removed_gene_tree_nodes ) {
452                     if ( !r.getNodeData().isHasTaxonomy() ) {
453                         throw new RIOException( "node with no (appropriate) taxonomic information found in gene tree #"
454                                 + i + ": " + r.toString() );
455                     }
456                 }
457             }
458             if ( i == 0 ) {
459                 _gsdir_tax_comp_base = gsdir.getTaxCompBase();
460             }
461             dups = gsdir.getMinDuplicationsSum();
462         }
463         else {
464             if ( _rerooting == REROOTING.MIDPOINT ) {
465                 PhylogenyMethods.midpointRoot( gene_tree );
466             }
467             else if ( _rerooting == REROOTING.OUTGROUP ) {
468                 final PhylogenyNode n = gene_tree.getNode( outgroup );
469                 gene_tree.reRoot( n );
470             }
471             final GSDI gsdi = new GSDI( gene_tree, species_tree, true, true, true, transfer_taxonomy );
472             _removed_gene_tree_nodes = gsdi.getStrippedExternalGeneTreeNodes();
473             for( final PhylogenyNode r : _removed_gene_tree_nodes ) {
474                 if ( !r.getNodeData().isHasTaxonomy() ) {
475                     throw new RIOException( "node with no (appropriate) taxonomic information found in gene tree #" + i
476                             + ": " + r.toString() );
477                 }
478             }
479             assigned_tree = gene_tree;
480             if ( i == 0 ) {
481                 _gsdir_tax_comp_base = gsdi.getTaxCompBase();
482             }
483             dups = gsdi.getDuplicationsSum();
484         }
485         assigned_tree.setRerootable( false );
486         double new_dist = -1;
487         if ( ( i == 0 ) || ( dups < _duplications_stats.getMin() ) ) {
488             _min_dub_gene_tree = assigned_tree;
489         }
490         else if ( dups == _duplications_stats.getMin() ) {
491             new_dist = PhylogenyMethods.calculateMaxDistanceToRoot( assigned_tree );
492             if ( new_dist < PhylogenyMethods.calculateMaxDistanceToRoot( _min_dub_gene_tree ) ) {
493                 _min_dub_gene_tree = assigned_tree;
494             }
495         }
496         if ( _dup_to_tree_map == null ) {
497             _dup_to_tree_map = new HashMap<Integer, Phylogeny>();
498         }
499         if ( !_dup_to_tree_map.containsKey( dups ) ) {
500             _dup_to_tree_map.put( dups, assigned_tree );
501         }
502         else {
503             if ( new_dist == -1 ) {
504                 new_dist = PhylogenyMethods.calculateMaxDistanceToRoot( assigned_tree );
505             }
506             if ( new_dist < PhylogenyMethods.calculateMaxDistanceToRoot( _dup_to_tree_map.get( dups ) ) ) {
507                 _dup_to_tree_map.put( dups, assigned_tree );
508             }
509         }
510         _duplications_stats.addValue( dups );
511         return assigned_tree;
512     }
513
514     final public Map<Integer, Phylogeny> getDuplicationsToTreeMap() {
515         return _dup_to_tree_map;
516     }
517
518     private final Phylogeny performOrthologInferenceBySDI( final Phylogeny gene_tree, final Phylogeny species_tree )
519             throws SDIException {
520         final SDIR sdir = new SDIR();
521         final Phylogeny r = sdir.infer( gene_tree, species_tree, false, true, true, true, 1 )[ 0 ];
522         r.setRerootable( false );
523         final int dups = sdir.getMinimalDuplications();
524         _duplications_stats.addValue( dups );
525         return r;
526     }
527
528     private final void postLog( final Phylogeny species_tree, final int first, final int last ) {
529         final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.##" );
530         final int min = ( int ) getDuplicationsStatistics().getMin();
531         final int max = ( int ) getDuplicationsStatistics().getMax();
532         final int median = ( int ) getDuplicationsStatistics().median();
533         int min_count = 0;
534         int max_count = 0;
535         int median_count = 0;
536         for( double d : getDuplicationsStatistics().getData() ) {
537             if ( ( ( int ) d ) == min ) {
538                 ++min_count;
539             }
540             if ( ( ( int ) d ) == max ) {
541                 ++max_count;
542             }
543             if ( ( ( int ) d ) == median ) {
544                 ++median_count;
545             }
546         }
547         final double min_count_percentage = ( 100.0 * min_count ) / getDuplicationsStatistics().getN();
548         final double max_count_percentage = ( 100.0 * max_count ) / getDuplicationsStatistics().getN();
549         final double median_count_percentage = ( 100.0 * median_count ) / getDuplicationsStatistics().getN();
550         if ( ( getRemovedGeneTreeNodes() != null ) && ( getRemovedGeneTreeNodes().size() > 0 ) ) {
551             logRemovedGeneTreeNodes();
552         }
553         log( "Gene trees analyzed                 :\t" + getDuplicationsStatistics().getN() );
554         if ( ( first >= 0 ) && ( last >= 0 ) ) {
555             log( "Gene trees analyzed range           :\t" + first + "-" + last );
556         }
557         log( "Gene tree internal nodes            :\t" + getIntNodesOfAnalyzedGeneTrees() );
558         log( "Gene tree external nodes            :\t" + getExtNodesOfAnalyzedGeneTrees() );
559         log( "Removed ext gene tree nodes         :\t" + getRemovedGeneTreeNodes().size() );
560         log( "Spec tree ext nodes (after strip)   :\t" + species_tree.getNumberOfExternalNodes() );
561         log( "Spec tree polytomies (after strip)  :\t" + PhylogenyMethods.countNumberOfPolytomies( species_tree ) );
562         log( "Taxonomy linking based on           :\t" + getGSDIRtaxCompBase() );
563         log( "Mean number of duplications         :\t" + df.format( getDuplicationsStatistics().arithmeticMean() )
564                 + "\t"
565                 + df.format( ( 100.0 * getDuplicationsStatistics().arithmeticMean() )
566                         / getIntNodesOfAnalyzedGeneTrees() )
567                 + "%\t(sd: " + df.format( getDuplicationsStatistics().sampleStandardDeviation() ) + ")" );
568         if ( getDuplicationsStatistics().getN() > 3 ) {
569             log( "Median number of duplications       :\t" + df.format( median ) + "\t"
570                     + df.format( ( 100.0 * median ) / getIntNodesOfAnalyzedGeneTrees() ) + "%" );
571         }
572         log( "Minimum duplications                :\t" + min + "\t"
573                 + df.format( ( 100.0 * min ) / getIntNodesOfAnalyzedGeneTrees() ) + "%" );
574         log( "Maximum duplications                :\t" + ( int ) max + "\t"
575                 + df.format( ( 100.0 * max ) / getIntNodesOfAnalyzedGeneTrees() ) + "%" );
576         log( "Gene trees with median duplications :\t" + median_count + "\t" + df.format( median_count_percentage )
577                 + "%" );
578         log( "Gene trees with minimum duplications:\t" + min_count + "\t" + df.format( min_count_percentage ) + "%" );
579         log( "Gene trees with maximum duplications:\t" + max_count + "\t" + df.format( max_count_percentage ) + "%" );
580     }
581
582     private final void preLog( final int gene_trees,
583                                final Phylogeny species_tree,
584                                final ALGORITHM algorithm,
585                                final String outgroup ) {
586         if ( gene_trees > 0 ) {
587             log( "Number of gene trees (total)        :\t" + gene_trees );
588         }
589         log( "Algorithm                           :\t" + algorithm );
590         log( "Spec tree ext nodes (prior strip)   :\t" + species_tree.getNumberOfExternalNodes() );
591         log( "Spec tree polytomies (prior strip)  :\t" + PhylogenyMethods.countNumberOfPolytomies( species_tree ) );
592         String rs = "";
593         switch ( _rerooting ) {
594             case BY_ALGORITHM: {
595                 rs = "minimizing duplications";
596                 break;
597             }
598             case MIDPOINT: {
599                 rs = "midpoint";
600                 break;
601             }
602             case OUTGROUP: {
603                 rs = "outgroup: " + outgroup;
604                 break;
605             }
606             case NONE: {
607                 rs = "none";
608                 break;
609             }
610         }
611         log( "Re-rooting                          :\t" + rs );
612     }
613
614     public final static IntMatrix calculateOrthologTable( final Phylogeny[] analyzed_gene_trees, final boolean sort )
615             throws RIOException {
616         final List<String> labels = new ArrayList<String>();
617         final Set<String> labels_set = new HashSet<String>();
618         for( final PhylogenyNode n : analyzed_gene_trees[ 0 ].getExternalNodes() ) {
619             final String label = obtainLabel( labels_set, n );
620             labels_set.add( label );
621             labels.add( label );
622         }
623         if ( sort ) {
624             Collections.sort( labels );
625         }
626         final IntMatrix m = new IntMatrix( labels );
627         int counter = 0;
628         for( final Phylogeny gt : analyzed_gene_trees ) {
629             counter++;
630             updateCounts( m, counter, gt );
631         }
632         return m;
633     }
634
635     public final static RIO executeAnalysis( final File gene_trees_file,
636                                              final File species_tree_file,
637                                              final ALGORITHM algorithm,
638                                              final REROOTING rerooting,
639                                              final String outgroup,
640                                              final int first,
641                                              final int last,
642                                              final boolean produce_log,
643                                              final boolean verbose,
644                                              final boolean transfer_taxonomy )
645             throws IOException, SDIException, RIOException {
646         final Phylogeny[] gene_trees = parseGeneTrees( gene_trees_file );
647         if ( gene_trees.length < 1 ) {
648             throw new RIOException( "\"" + gene_trees_file + "\" is devoid of appropriate gene trees" );
649         }
650         final Phylogeny species_tree = SDIutil
651                 .parseSpeciesTree( gene_trees[ 0 ], species_tree_file, false, true, TAXONOMY_EXTRACTION.NO );
652         return new RIO( gene_trees,
653                         species_tree,
654                         algorithm,
655                         rerooting,
656                         outgroup,
657                         first,
658                         last,
659                         produce_log,
660                         verbose,
661                         transfer_taxonomy );
662     }
663
664     public final static RIO executeAnalysis( final File gene_trees_file,
665                                              final Phylogeny species_tree,
666                                              final ALGORITHM algorithm,
667                                              final REROOTING rerooting,
668                                              final String outgroup,
669                                              final boolean produce_log,
670                                              final boolean verbose,
671                                              final boolean transfer_taxonomy )
672             throws IOException, SDIException, RIOException {
673         return new RIO( parseGeneTrees( gene_trees_file ),
674                         species_tree,
675                         algorithm,
676                         rerooting,
677                         outgroup,
678                         DEFAULT_RANGE,
679                         DEFAULT_RANGE,
680                         produce_log,
681                         verbose,
682                         transfer_taxonomy );
683     }
684
685     public final static RIO executeAnalysis( final File gene_trees_file,
686                                              final Phylogeny species_tree,
687                                              final ALGORITHM algorithm,
688                                              final REROOTING rerooting,
689                                              final String outgroup,
690                                              final int first,
691                                              final int last,
692                                              final boolean produce_log,
693                                              final boolean verbose,
694                                              final boolean transfer_taxonomy )
695             throws IOException, SDIException, RIOException {
696         return new RIO( parseGeneTrees( gene_trees_file ),
697                         species_tree,
698                         algorithm,
699                         rerooting,
700                         outgroup,
701                         first,
702                         last,
703                         produce_log,
704                         verbose,
705                         transfer_taxonomy );
706     }
707
708     public final static RIO executeAnalysis( final IteratingPhylogenyParser p,
709                                              final File species_tree_file,
710                                              final ALGORITHM algorithm,
711                                              final REROOTING rerooting,
712                                              final String outgroup,
713                                              final int first,
714                                              final int last,
715                                              final boolean produce_log,
716                                              final boolean verbose,
717                                              final boolean transfer_taxonomy )
718             throws IOException, SDIException, RIOException {
719         final Phylogeny g0 = p.next();
720         if ( ( g0 == null ) || g0.isEmpty() || ( g0.getNumberOfExternalNodes() < 2 ) ) {
721             throw new RIOException( "input file does not seem to contain any gene trees" );
722         }
723         final Phylogeny species_tree = SDIutil
724                 .parseSpeciesTree( g0, species_tree_file, false, true, TAXONOMY_EXTRACTION.NO );
725         p.reset();
726         return new RIO( p,
727                         species_tree,
728                         algorithm,
729                         rerooting,
730                         outgroup,
731                         first,
732                         last,
733                         produce_log,
734                         verbose,
735                         transfer_taxonomy );
736     }
737
738     public final static RIO executeAnalysis( final IteratingPhylogenyParser p,
739                                              final Phylogeny species_tree,
740                                              final ALGORITHM algorithm,
741                                              final REROOTING rerooting,
742                                              final String outgroup,
743                                              final boolean produce_log,
744                                              final boolean verbose,
745                                              final boolean transfer_taxonomy )
746             throws IOException, SDIException, RIOException {
747         return new RIO( p,
748                         species_tree,
749                         algorithm,
750                         rerooting,
751                         outgroup,
752                         DEFAULT_RANGE,
753                         DEFAULT_RANGE,
754                         produce_log,
755                         verbose,
756                         transfer_taxonomy );
757     }
758
759     public final static RIO executeAnalysis( final IteratingPhylogenyParser p,
760                                              final Phylogeny species_tree,
761                                              final ALGORITHM algorithm,
762                                              final REROOTING rerooting,
763                                              final String outgroup,
764                                              final int first,
765                                              final int last,
766                                              final boolean produce_log,
767                                              final boolean verbose,
768                                              final boolean transfer_taxonomy )
769             throws IOException, SDIException, RIOException {
770         return new RIO( p,
771                         species_tree,
772                         algorithm,
773                         rerooting,
774                         outgroup,
775                         first,
776                         last,
777                         produce_log,
778                         verbose,
779                         transfer_taxonomy );
780     }
781
782     public final static RIO executeAnalysis( final Phylogeny[] gene_trees, final Phylogeny species_tree )
783             throws IOException, SDIException, RIOException {
784         return new RIO( gene_trees,
785                         species_tree,
786                         ALGORITHM.GSDIR,
787                         REROOTING.BY_ALGORITHM,
788                         null,
789                         DEFAULT_RANGE,
790                         DEFAULT_RANGE,
791                         false,
792                         false,
793                         false );
794     }
795
796     public final static RIO executeAnalysis( final Phylogeny[] gene_trees,
797                                              final Phylogeny species_tree,
798                                              final ALGORITHM algorithm,
799                                              final REROOTING rerooting,
800                                              final String outgroup,
801                                              final boolean produce_log,
802                                              final boolean verbose,
803                                              final boolean transfer_taxonomy )
804             throws IOException, SDIException, RIOException {
805         return new RIO( gene_trees,
806                         species_tree,
807                         algorithm,
808                         rerooting,
809                         outgroup,
810                         DEFAULT_RANGE,
811                         DEFAULT_RANGE,
812                         produce_log,
813                         verbose,
814                         transfer_taxonomy );
815     }
816
817     public final static RIO executeAnalysis( final Phylogeny[] gene_trees,
818                                              final Phylogeny species_tree,
819                                              final ALGORITHM algorithm,
820                                              final REROOTING rerooting,
821                                              final String outgroup,
822                                              final int first,
823                                              final int last,
824                                              final boolean produce_log,
825                                              final boolean verbose,
826                                              final boolean transfer_taxonomy )
827             throws IOException, SDIException, RIOException {
828         return new RIO( gene_trees,
829                         species_tree,
830                         algorithm,
831                         rerooting,
832                         outgroup,
833                         first,
834                         last,
835                         produce_log,
836                         verbose,
837                         transfer_taxonomy );
838     }
839
840     private final static void calculateOrthologTable( final Phylogeny g, final boolean sort, final int counter )
841             throws RIOException {
842         if ( counter == 0 ) {
843             final List<String> labels = new ArrayList<String>();
844             final Set<String> labels_set = new HashSet<String>();
845             for( final PhylogenyNode n : g.getExternalNodes() ) {
846                 final String label = obtainLabel( labels_set, n );
847                 labels_set.add( label );
848                 labels.add( label );
849             }
850             if ( sort ) {
851                 Collections.sort( labels );
852             }
853             _m = new IntMatrix( labels );
854         }
855         updateCounts( _m, counter, g );
856     }
857
858     private final static void checkPreconditions( final IteratingPhylogenyParser p,
859                                                   final Phylogeny species_tree,
860                                                   final REROOTING rerooting,
861                                                   final String outgroup,
862                                                   final int first,
863                                                   final int last )
864             throws RIOException, IOException {
865         final Phylogeny g0 = p.next();
866         if ( ( g0 == null ) || g0.isEmpty() ) {
867             throw new RIOException( "input file does not seem to contain any gene trees" );
868         }
869         if ( g0.getNumberOfExternalNodes() < 2 ) {
870             throw new RIOException( "input file does not seem to contain any useable gene trees" );
871         }
872         if ( !species_tree.isRooted() ) {
873             throw new RIOException( "species tree is not rooted" );
874         }
875         if ( !( ( last == DEFAULT_RANGE ) && ( first == DEFAULT_RANGE ) )
876                 && ( ( last < first ) || ( last < 0 ) || ( first < 0 ) ) ) {
877             throw new RIOException( "attempt to set range (0-based) of gene to analyze to: from " + first + " to "
878                     + last );
879         }
880         if ( ( rerooting == REROOTING.OUTGROUP ) && ForesterUtil.isEmpty( outgroup ) ) {
881             throw new RIOException( "outgroup not set for midpoint rooting" );
882         }
883         if ( ( rerooting != REROOTING.OUTGROUP ) && !ForesterUtil.isEmpty( outgroup ) ) {
884             throw new RIOException( "outgroup only used for midpoint rooting" );
885         }
886         if ( ( rerooting == REROOTING.MIDPOINT ) && ( PhylogenyMethods.calculateMaxDistanceToRoot( g0 ) <= 0 ) ) {
887             throw new RIOException( "attempt to use midpoint rooting on gene trees which seem to have no (positive) branch lengths (cladograms)" );
888         }
889         if ( rerooting == REROOTING.OUTGROUP ) {
890             try {
891                 g0.getNode( outgroup );
892             }
893             catch ( final IllegalArgumentException e ) {
894                 throw new RIOException( "cannot perform re-rooting by outgroup: " + e.getLocalizedMessage() );
895             }
896         }
897     }
898
899     private final static void checkPreconditions( final Phylogeny[] gene_trees,
900                                                   final Phylogeny species_tree,
901                                                   final REROOTING rerooting,
902                                                   final String outgroup,
903                                                   final int first,
904                                                   final int last )
905             throws RIOException {
906         if ( !species_tree.isRooted() ) {
907             throw new RIOException( "species tree is not rooted" );
908         }
909         if ( !( ( last == DEFAULT_RANGE ) && ( first == DEFAULT_RANGE ) )
910                 && ( ( last < first ) || ( last >= gene_trees.length ) || ( last < 0 ) || ( first < 0 ) ) ) {
911             throw new RIOException( "attempt to set range (0-based) of gene to analyze to: from " + first + " to "
912                     + last + " (out of " + gene_trees.length + ")" );
913         }
914         if ( ( rerooting == REROOTING.OUTGROUP ) && ForesterUtil.isEmpty( outgroup ) ) {
915             throw new RIOException( "outgroup not set for midpoint rooting" );
916         }
917         if ( ( rerooting != REROOTING.OUTGROUP ) && !ForesterUtil.isEmpty( outgroup ) ) {
918             throw new RIOException( "outgroup only used for midpoint rooting" );
919         }
920         if ( ( rerooting == REROOTING.MIDPOINT )
921                 && ( PhylogenyMethods.calculateMaxDistanceToRoot( gene_trees[ 0 ] ) <= 0 ) ) {
922             throw new RIOException( "attempt to use midpoint rooting on gene trees which seem to have no (positive) branch lengths (cladograms)" );
923         }
924         if ( rerooting == REROOTING.OUTGROUP ) {
925             try {
926                 gene_trees[ 0 ].getNode( outgroup );
927             }
928             catch ( final IllegalArgumentException e ) {
929                 throw new RIOException( "cannot perform re-rooting by outgroup: " + e.getLocalizedMessage() );
930             }
931         }
932     }
933
934     private final static String obtainLabel( final Set<String> labels_set, final PhylogenyNode n ) throws RIOException {
935         String label;
936         if ( n.getNodeData().isHasSequence() && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getName() ) ) {
937             label = n.getNodeData().getSequence().getName();
938         }
939         else if ( n.getNodeData().isHasSequence()
940                 && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getSymbol() ) ) {
941             label = n.getNodeData().getSequence().getSymbol();
942         }
943         else if ( n.getNodeData().isHasSequence()
944                 && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getGeneName() ) ) {
945             label = n.getNodeData().getSequence().getGeneName();
946         }
947         else if ( !ForesterUtil.isEmpty( n.getName() ) ) {
948             label = n.getName();
949         }
950         else {
951             throw new RIOException( "node " + n + " has no appropriate label" );
952         }
953         if ( labels_set.contains( label ) ) {
954             throw new RIOException( "label " + label + " is not unique" );
955         }
956         return label;
957     }
958
959     private final static Phylogeny[] parseGeneTrees( final File gene_trees_file )
960             throws FileNotFoundException, IOException {
961         final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
962         final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
963         if ( p instanceof NHXParser ) {
964             final NHXParser nhx = ( NHXParser ) p;
965             nhx.setReplaceUnderscores( false );
966             nhx.setIgnoreQuotes( true );
967             nhx.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
968         }
969         else if ( p instanceof NexusPhylogeniesParser ) {
970             final NexusPhylogeniesParser nex = ( NexusPhylogeniesParser ) p;
971             nex.setReplaceUnderscores( false );
972             nex.setIgnoreQuotes( true );
973             nex.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
974         }
975         return factory.create( gene_trees_file, p );
976     }
977
978     private final static void removeSingleDescendentsNodes( final Phylogeny species_tree, final boolean verbose ) {
979         final int o = PhylogenyMethods.countNumberOfOneDescendantNodes( species_tree );
980         if ( o > 0 ) {
981             if ( verbose ) {
982                 System.out.println( "warning: species tree has " + o
983                         + " internal nodes with only one descendent which are therefore going to be removed" );
984             }
985             PhylogenyMethods.deleteInternalNodesWithOnlyOneDescendent( species_tree );
986         }
987     }
988
989     private final static void updateCounts( final IntMatrix m, final int counter, final Phylogeny g )
990             throws RIOException {
991         PhylogenyMethods.preOrderReId( g );
992         final HashMap<String, PhylogenyNode> map = PhylogenyMethods.createNameToExtNodeMap( g );
993         for( int x = 0; x < m.size(); ++x ) {
994             final String mx = m.getLabel( x );
995             final PhylogenyNode nx = map.get( mx );
996             if ( nx == null ) {
997                 throw new RIOException( "node \"" + mx + "\" not present in gene tree #" + counter );
998             }
999             String my;
1000             PhylogenyNode ny;
1001             for( int y = 0; y < m.size(); ++y ) {
1002                 my = m.getLabel( y );
1003                 ny = map.get( my );
1004                 if ( ny == null ) {
1005                     throw new RIOException( "node \"" + my + "\" not present in gene tree #" + counter );
1006                 }
1007                 if ( !PhylogenyMethods.calculateLCAonTreeWithIdsInPreOrder( nx, ny ).isDuplication() ) {
1008                     m.inreaseByOne( x, y );
1009                 }
1010             }
1011         }
1012     }
1013
1014     public enum REROOTING {
1015                            NONE,
1016                            BY_ALGORITHM,
1017                            MIDPOINT,
1018                            OUTGROUP;
1019     }
1020 }