in progress...
[jalview.git] / forester / java / src / org / forester / rio / RIO.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // Copyright (C) 2000-2001 Washington University School of Medicine
8 // and Howard Hughes Medical Institute
9 // All rights reserved
10 //
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
15 //
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
20 //
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 //
25 // Contact: phylosoft @ gmail . com
26 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
27
28 package org.forester.rio;
29
30 import java.io.File;
31 import java.io.FileNotFoundException;
32 import java.io.IOException;
33 import java.util.ArrayList;
34 import java.util.Collections;
35 import java.util.HashMap;
36 import java.util.HashSet;
37 import java.util.List;
38 import java.util.Set;
39 import java.util.SortedSet;
40 import java.util.TreeSet;
41
42 import org.forester.datastructures.IntMatrix;
43 import org.forester.io.parsers.IteratingPhylogenyParser;
44 import org.forester.io.parsers.PhylogenyParser;
45 import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
46 import org.forester.io.parsers.nhx.NHXParser;
47 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
48 import org.forester.io.parsers.util.ParserUtils;
49 import org.forester.phylogeny.Phylogeny;
50 import org.forester.phylogeny.PhylogenyMethods;
51 import org.forester.phylogeny.PhylogenyNode;
52 import org.forester.phylogeny.data.Taxonomy;
53 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
54 import org.forester.phylogeny.factories.PhylogenyFactory;
55 import org.forester.sdi.GSDI;
56 import org.forester.sdi.GSDIR;
57 import org.forester.sdi.SDIException;
58 import org.forester.sdi.SDIR;
59 import org.forester.sdi.SDIutil;
60 import org.forester.sdi.SDIutil.ALGORITHM;
61 import org.forester.sdi.SDIutil.TaxonomyComparisonBase;
62 import org.forester.util.BasicDescriptiveStatistics;
63 import org.forester.util.ForesterUtil;
64
65 public final class RIO {
66
67     public static final int                  DEFAULT_RANGE = -1;
68     private static final int                 END_OF_GT     = Integer.MAX_VALUE;
69     private static IntMatrix                 _m;
70     private Phylogeny[]                      _analyzed_gene_trees;
71     private List<PhylogenyNode>              _removed_gene_tree_nodes;
72     private int                              _ext_nodes;
73     private int                              _int_nodes;
74     private TaxonomyComparisonBase           _gsdir_tax_comp_base;
75     private final StringBuilder              _log;
76     private final BasicDescriptiveStatistics _duplications_stats;
77     private final boolean                    _produce_log;
78     private final boolean                    _verbose;
79     private final REROOTING                  _rerooting;
80     private final Phylogeny                  _species_tree;
81     private Phylogeny                        _min_dub_gene_tree;
82
83     private RIO( final IteratingPhylogenyParser p,
84                  final Phylogeny species_tree,
85                  final ALGORITHM algorithm,
86                  final REROOTING rerooting,
87                  final String outgroup,
88                  int first,
89                  int last,
90                  final boolean produce_log,
91                  final boolean verbose,
92                  final boolean transfer_taxonomy )
93             throws IOException, SDIException, RIOException {
94         if ( ( last == DEFAULT_RANGE ) && ( first >= 0 ) ) {
95             last = END_OF_GT;
96         }
97         else if ( ( first == DEFAULT_RANGE ) && ( last >= 0 ) ) {
98             first = 0;
99         }
100         removeSingleDescendentsNodes( species_tree, verbose );
101         p.reset();
102         checkPreconditions( p, species_tree, rerooting, outgroup, first, last );
103         _produce_log = produce_log;
104         _verbose = verbose;
105         _rerooting = rerooting;
106         _ext_nodes = -1;
107         _int_nodes = -1;
108         _log = new StringBuilder();
109         _gsdir_tax_comp_base = null;
110         _analyzed_gene_trees = null;
111         _removed_gene_tree_nodes = null;
112         _duplications_stats = new BasicDescriptiveStatistics();
113         p.reset();
114         inferOrthologs( p, species_tree, algorithm, outgroup, first, last, transfer_taxonomy );
115         _species_tree = species_tree;
116     }
117
118     private RIO( final Phylogeny[] gene_trees,
119                  final Phylogeny species_tree,
120                  final ALGORITHM algorithm,
121                  final REROOTING rerooting,
122                  final String outgroup,
123                  int first,
124                  int last,
125                  final boolean produce_log,
126                  final boolean verbose,
127                  final boolean transfer_taxonomy )
128             throws IOException, SDIException, RIOException {
129         if ( ( last == DEFAULT_RANGE ) && ( first >= 0 ) ) {
130             last = gene_trees.length - 1;
131         }
132         else if ( ( first == DEFAULT_RANGE ) && ( last >= 0 ) ) {
133             first = 0;
134         }
135         removeSingleDescendentsNodes( species_tree, verbose );
136         checkPreconditions( gene_trees, species_tree, rerooting, outgroup, first, last );
137         _produce_log = produce_log;
138         _verbose = verbose;
139         _rerooting = rerooting;
140         _ext_nodes = -1;
141         _int_nodes = -1;
142         _log = new StringBuilder();
143         _gsdir_tax_comp_base = null;
144         _analyzed_gene_trees = null;
145         _removed_gene_tree_nodes = null;
146         _duplications_stats = new BasicDescriptiveStatistics();
147         inferOrthologs( gene_trees, species_tree, algorithm, outgroup, first, last, transfer_taxonomy );
148         _species_tree = species_tree;
149     }
150
151     public final Phylogeny[] getAnalyzedGeneTrees() {
152         return _analyzed_gene_trees;
153     }
154
155     public final BasicDescriptiveStatistics getDuplicationsStatistics() {
156         return _duplications_stats;
157     }
158
159     /**
160      * Returns the numbers of number of ext nodes in gene trees analyzed (after
161      * stripping).
162      *
163      * @return number of ext nodes in gene trees analyzed (after stripping)
164      */
165     public final int getExtNodesOfAnalyzedGeneTrees() {
166         return _ext_nodes;
167     }
168
169     public final TaxonomyComparisonBase getGSDIRtaxCompBase() {
170         return _gsdir_tax_comp_base;
171     }
172
173     /**
174      * Returns the numbers of number of int nodes in gene trees analyzed (after
175      * stripping).
176      *
177      * @return number of int nodes in gene trees analyzed (after stripping)
178      */
179     public final int getIntNodesOfAnalyzedGeneTrees() {
180         return _int_nodes;
181     }
182
183     public final StringBuilder getLog() {
184         return _log;
185     }
186
187     final public Phylogeny getMinDuplicationsGeneTree() {
188         return _min_dub_gene_tree;
189     }
190
191     public final IntMatrix getOrthologTable() {
192         return _m;
193     }
194
195     public final List<PhylogenyNode> getRemovedGeneTreeNodes() {
196         return _removed_gene_tree_nodes;
197     }
198
199     public final Phylogeny getSpeciesTree() {
200         return _species_tree;
201     }
202
203     private final void inferOrthologs( final IteratingPhylogenyParser parser,
204                                        final Phylogeny species_tree,
205                                        final ALGORITHM algorithm,
206                                        final String outgroup,
207                                        int first,
208                                        final int last,
209                                        final boolean transfer_taxonomy )
210             throws SDIException, RIOException, FileNotFoundException, IOException {
211         if ( !parser.hasNext() ) {
212             throw new RIOException( "no gene trees to analyze" );
213         }
214         if ( log() ) {
215             preLog( -1, species_tree, algorithm, outgroup );
216         }
217         if ( _verbose ) {
218             System.out.println();
219         }
220         int gene_tree_ext_nodes = 0;
221         int i = 0;
222         int counter = 0;
223         final boolean no_range = ( first < 0 ) || ( last < first );
224         while ( parser.hasNext() ) {
225             final Phylogeny gt = parser.next();
226             if ( no_range || ( ( i >= first ) && ( i <= last ) ) ) {
227                 if ( gt.isEmpty() ) {
228                     throw new RIOException( "gene tree #" + i + " is empty" );
229                 }
230                 if ( gt.getNumberOfExternalNodes() == 1 ) {
231                     throw new RIOException( "gene tree #" + i + " has only one external node" );
232                 }
233                 if ( _verbose ) {
234                     System.out.print( "\r" + i );
235                 }
236                 if ( counter == 0 ) {
237                     if ( algorithm == ALGORITHM.SDIR ) {
238                         // Removes from species_tree all species not found in gene_tree.
239                         PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gt, species_tree );
240                         if ( species_tree.isEmpty() ) {
241                             throw new RIOException( "failed to establish species based mapping between gene and species trees" );
242                         }
243                     }
244                     gene_tree_ext_nodes = gt.getNumberOfExternalNodes();
245                 }
246                 else if ( gene_tree_ext_nodes != gt.getNumberOfExternalNodes() ) {
247                     throw new RIOException( "gene tree #" + i + " has a different number of external nodes ("
248                             + gt.getNumberOfExternalNodes() + ") than the preceding gene tree(s) ("
249                             + gene_tree_ext_nodes + ")" );
250                 }
251                 if ( algorithm == ALGORITHM.SDIR ) {
252                     // Removes from gene_tree all species not found in species_tree.
253                     PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gt );
254                     if ( gt.isEmpty() ) {
255                         throw new RIOException( "failed to establish species based mapping between gene and species trees" );
256                     }
257                 }
258                 final Phylogeny analyzed_gt = performOrthologInference( gt,
259                                                                         species_tree,
260                                                                         algorithm,
261                                                                         outgroup,
262                                                                         counter,
263                                                                         transfer_taxonomy );
264                 RIO.calculateOrthologTable( analyzed_gt, true, counter );
265                 ++counter;
266             }
267             ++i;
268         }
269         if ( _verbose ) {
270             System.out.print( "\rGene trees analyzed                 :\t" + counter );
271         }
272         if ( ( first >= 0 ) && ( counter == 0 ) && ( i > 0 ) ) {
273             throw new RIOException( "attempt to analyze first gene tree #" + first + " in a set of " + i );
274         }
275         if ( no_range ) {
276             first = 0;
277         }
278         if ( log() ) {
279             postLog( species_tree, first, ( first + counter ) - 1 );
280         }
281         if ( _verbose ) {
282             System.out.println();
283             System.out.println();
284         }
285     }
286
287     private final void inferOrthologs( final Phylogeny[] gene_trees,
288                                        final Phylogeny species_tree,
289                                        final ALGORITHM algorithm,
290                                        final String outgroup,
291                                        final int first,
292                                        final int last,
293                                        final boolean transfer_taxonomy )
294             throws SDIException, RIOException, FileNotFoundException, IOException {
295         if ( algorithm == ALGORITHM.SDIR ) {
296             // Removes from species_tree all species not found in gene_tree.
297             PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gene_trees[ 0 ], species_tree );
298             if ( species_tree.isEmpty() ) {
299                 throw new RIOException( "failed to establish species based mapping between gene and species trees" );
300             }
301         }
302         final Phylogeny[] my_gene_trees;
303         if ( ( first >= 0 ) && ( last >= first ) && ( last < gene_trees.length ) ) {
304             my_gene_trees = new Phylogeny[ ( 1 + last ) - first ];
305             int c = 0;
306             for( int i = first; i <= last; ++i ) {
307                 my_gene_trees[ c++ ] = gene_trees[ i ];
308             }
309         }
310         else {
311             my_gene_trees = gene_trees;
312         }
313         if ( log() ) {
314             preLog( gene_trees.length, species_tree, algorithm, outgroup );
315         }
316         if ( _verbose && ( my_gene_trees.length >= 4 ) ) {
317             System.out.println();
318         }
319         _analyzed_gene_trees = new Phylogeny[ my_gene_trees.length ];
320         int gene_tree_ext_nodes = 0;
321         for( int i = 0; i < my_gene_trees.length; ++i ) {
322             final Phylogeny gt = my_gene_trees[ i ];
323             if ( gt.isEmpty() ) {
324                 throw new RIOException( "gene tree #" + i + " is empty" );
325             }
326             if ( gt.getNumberOfExternalNodes() == 1 ) {
327                 throw new RIOException( "gene tree #" + i + " has only one external node" );
328             }
329             if ( _verbose && ( my_gene_trees.length > 4 ) ) {
330                 ForesterUtil.updateProgress( ( ( double ) i ) / my_gene_trees.length );
331             }
332             if ( i == 0 ) {
333                 gene_tree_ext_nodes = gt.getNumberOfExternalNodes();
334             }
335             else if ( gene_tree_ext_nodes != gt.getNumberOfExternalNodes() ) {
336                 throw new RIOException( "gene tree #" + i + " has a different number of external nodes ("
337                         + gt.getNumberOfExternalNodes() + ") than the preceding gene tree(s) (" + gene_tree_ext_nodes
338                         + ")" );
339             }
340             if ( algorithm == ALGORITHM.SDIR ) {
341                 // Removes from gene_tree all species not found in species_tree.
342                 PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gt );
343                 if ( gt.isEmpty() ) {
344                     throw new RIOException( "failed to establish species based mapping between gene and species trees" );
345                 }
346             }
347             _analyzed_gene_trees[ i ] = performOrthologInference( gt,
348                                                                   species_tree,
349                                                                   algorithm,
350                                                                   outgroup,
351                                                                   i,
352                                                                   transfer_taxonomy );
353         }
354         if ( log() ) {
355             postLog( species_tree, first, last );
356         }
357         if ( _verbose && ( my_gene_trees.length > 4 ) ) {
358             System.out.println();
359             System.out.println();
360         }
361     }
362
363     private final boolean log() {
364         return _produce_log;
365     }
366
367     private final void log( final String s ) {
368         _log.append( s );
369         _log.append( ForesterUtil.LINE_SEPARATOR );
370     }
371
372     private final void logRemovedGeneTreeNodes() {
373         final SortedSet<String> rn = new TreeSet<String>();
374         for( final PhylogenyNode n : getRemovedGeneTreeNodes() ) {
375             final Taxonomy t = n.getNodeData().getTaxonomy();
376             switch ( getGSDIRtaxCompBase() ) {
377                 case CODE: {
378                     rn.add( t.getTaxonomyCode() );
379                     break;
380                 }
381                 case ID: {
382                     rn.add( t.getIdentifier().toString() );
383                     break;
384                 }
385                 case SCIENTIFIC_NAME: {
386                     rn.add( t.getScientificName() );
387                     break;
388                 }
389             }
390         }
391         final StringBuilder sb = new StringBuilder();
392         for( final String s : rn ) {
393             sb.append( '\t' );
394             sb.append( s );
395         }
396         log( "Species stripped from gene trees    :" + sb );
397     }
398
399     private final Phylogeny performOrthologInference( final Phylogeny gene_tree,
400                                                       final Phylogeny species_tree,
401                                                       final ALGORITHM algorithm,
402                                                       final String outgroup,
403                                                       final int i,
404                                                       final boolean transfer_taxonomy )
405             throws SDIException, RIOException {
406         final Phylogeny assigned_tree;
407         switch ( algorithm ) {
408             case SDIR: {
409                 assigned_tree = performOrthologInferenceBySDI( gene_tree, species_tree );
410                 break;
411             }
412             case GSDIR: {
413                 assigned_tree = performOrthologInferenceByGSDI( gene_tree,
414                                                                 species_tree,
415                                                                 outgroup,
416                                                                 i,
417                                                                 transfer_taxonomy );
418                 break;
419             }
420             default: {
421                 throw new IllegalArgumentException( "illegal algorithm: " + algorithm );
422             }
423         }
424         if ( i == 0 ) {
425             _ext_nodes = assigned_tree.getNumberOfExternalNodes();
426             _int_nodes = assigned_tree.getNumberOfInternalNodes();
427         }
428         else if ( _ext_nodes != assigned_tree.getNumberOfExternalNodes() ) {
429             throw new RIOException( "after stripping gene tree #" + i + " has a different number of external nodes ("
430                     + assigned_tree.getNumberOfExternalNodes() + ") than the preceding gene tree(s) (" + _ext_nodes
431                     + ")" );
432         }
433         return assigned_tree;
434     }
435
436     private final Phylogeny performOrthologInferenceByGSDI( final Phylogeny gene_tree,
437                                                             final Phylogeny species_tree,
438                                                             final String outgroup,
439                                                             final int i,
440                                                             final boolean transfer_taxonomy )
441             throws SDIException, RIOException {
442         final Phylogeny assigned_tree;
443         final int dups;
444         if ( _rerooting == REROOTING.BY_ALGORITHM ) {
445             final GSDIR gsdir = new GSDIR( gene_tree, species_tree, true, i == 0, transfer_taxonomy );
446             assigned_tree = gsdir.getMinDuplicationsSumGeneTree();
447             if ( i == 0 ) {
448                 _removed_gene_tree_nodes = gsdir.getStrippedExternalGeneTreeNodes();
449                 for( final PhylogenyNode r : _removed_gene_tree_nodes ) {
450                     if ( !r.getNodeData().isHasTaxonomy() ) {
451                         throw new RIOException( "node with no (appropriate) taxonomic information found in gene tree #"
452                                 + i + ": " + r.toString() );
453                     }
454                 }
455             }
456             if ( i == 0 ) {
457                 _gsdir_tax_comp_base = gsdir.getTaxCompBase();
458             }
459             dups = gsdir.getMinDuplicationsSum();
460         }
461         else {
462             if ( _rerooting == REROOTING.MIDPOINT ) {
463                 PhylogenyMethods.midpointRoot( gene_tree );
464             }
465             else if ( _rerooting == REROOTING.OUTGROUP ) {
466                 final PhylogenyNode n = gene_tree.getNode( outgroup );
467                 gene_tree.reRoot( n );
468             }
469             final GSDI gsdi = new GSDI( gene_tree, species_tree, true, true, true, transfer_taxonomy );
470             _removed_gene_tree_nodes = gsdi.getStrippedExternalGeneTreeNodes();
471             for( final PhylogenyNode r : _removed_gene_tree_nodes ) {
472                 if ( !r.getNodeData().isHasTaxonomy() ) {
473                     throw new RIOException( "node with no (appropriate) taxonomic information found in gene tree #" + i
474                             + ": " + r.toString() );
475                 }
476             }
477             assigned_tree = gene_tree;
478             if ( i == 0 ) {
479                 _gsdir_tax_comp_base = gsdi.getTaxCompBase();
480             }
481             dups = gsdi.getDuplicationsSum();
482         }
483         if ( ( i == 0 ) || ( dups < _duplications_stats.getMin() ) ) {
484             _min_dub_gene_tree = assigned_tree;
485             _min_dub_gene_tree.setRerootable( false );
486         }
487         _duplications_stats.addValue( dups );
488         return assigned_tree;
489     }
490
491     private final Phylogeny performOrthologInferenceBySDI( final Phylogeny gene_tree, final Phylogeny species_tree )
492             throws SDIException {
493         final SDIR sdir = new SDIR();
494         return sdir.infer( gene_tree, species_tree, false, true, true, true, 1 )[ 0 ];
495     }
496
497     private final void postLog( final Phylogeny species_tree, final int first, final int last ) {
498         final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.##" );
499         final int min = ( int ) getDuplicationsStatistics().getMin();
500         final int max = ( int ) getDuplicationsStatistics().getMax();
501         final int median = ( int ) getDuplicationsStatistics().median();
502         int min_count = 0;
503         int max_count = 0;
504         int median_count = 0;
505         for( double d : getDuplicationsStatistics().getData() ) {
506             if ( ( ( int ) d ) == min ) {
507                 ++min_count;
508             }
509             if ( ( ( int ) d ) == max ) {
510                 ++max_count;
511             }
512             if ( ( ( int ) d ) == median ) {
513                 ++median_count;
514             }
515         }
516         final double min_count_percentage = ( 100.0 * min_count ) / getDuplicationsStatistics().getN();
517         final double max_count_percentage = ( 100.0 * max_count ) / getDuplicationsStatistics().getN();
518         final double median_count_percentage = ( 100.0 * median_count ) / getDuplicationsStatistics().getN();
519         if ( ( getRemovedGeneTreeNodes() != null ) && ( getRemovedGeneTreeNodes().size() > 0 ) ) {
520             logRemovedGeneTreeNodes();
521         }
522         log( "Gene trees analyzed                 :\t" + getDuplicationsStatistics().getN() );
523         if ( ( first >= 0 ) && ( last >= 0 ) ) {
524             log( "Gene trees analyzed range           :\t" + first + "-" + last );
525         }
526         log( "Gene tree internal nodes            :\t" + getIntNodesOfAnalyzedGeneTrees() );
527         log( "Gene tree external nodes            :\t" + getExtNodesOfAnalyzedGeneTrees() );
528         log( "Removed ext gene tree nodes         :\t" + getRemovedGeneTreeNodes().size() );
529         log( "Spec tree ext nodes (after strip)   :\t" + species_tree.getNumberOfExternalNodes() );
530         log( "Spec tree polytomies (after strip)  :\t" + PhylogenyMethods.countNumberOfPolytomies( species_tree ) );
531         log( "Taxonomy linking based on           :\t" + getGSDIRtaxCompBase() );
532         log( "Mean number of duplications         :\t" + df.format( getDuplicationsStatistics().arithmeticMean() )
533                 + "\t"
534                 + df.format( ( 100.0 * getDuplicationsStatistics().arithmeticMean() )
535                         / getIntNodesOfAnalyzedGeneTrees() )
536                 + "%\t(sd: " + df.format( getDuplicationsStatistics().sampleStandardDeviation() ) + ")" );
537         if ( getDuplicationsStatistics().getN() > 3 ) {
538             log( "Median number of duplications       :\t" + df.format( median ) + "\t"
539                     + df.format( ( 100.0 * median ) / getIntNodesOfAnalyzedGeneTrees() ) + "%" );
540         }
541         log( "Minimum duplications                :\t" + min + "\t"
542                 + df.format( ( 100.0 * min ) / getIntNodesOfAnalyzedGeneTrees() ) + "%" );
543         log( "Maximum duplications                :\t" + ( int ) max + "\t"
544                 + df.format( ( 100.0 * max ) / getIntNodesOfAnalyzedGeneTrees() ) + "%" );
545         log( "Gene trees with median duplications :\t" + median_count + "\t" + df.format( median_count_percentage )
546                 + "%" );
547         log( "Gene trees with minimum duplications:\t" + min_count + "\t" + df.format( min_count_percentage ) + "%" );
548         log( "Gene trees with maximum duplications:\t" + max_count + "\t" + df.format( max_count_percentage ) + "%" );
549     }
550
551     private final void preLog( final int gene_trees,
552                                final Phylogeny species_tree,
553                                final ALGORITHM algorithm,
554                                final String outgroup ) {
555         if ( gene_trees > 0 ) {
556             log( "Number of gene trees (total)        :\t" + gene_trees );
557         }
558         log( "Algorithm                           :\t" + algorithm );
559         log( "Spec tree ext nodes (prior strip)   :\t" + species_tree.getNumberOfExternalNodes() );
560         log( "Spec tree polytomies (prior strip)  :\t" + PhylogenyMethods.countNumberOfPolytomies( species_tree ) );
561         String rs = "";
562         switch ( _rerooting ) {
563             case BY_ALGORITHM: {
564                 rs = "minimizing duplications";
565                 break;
566             }
567             case MIDPOINT: {
568                 rs = "midpoint";
569                 break;
570             }
571             case OUTGROUP: {
572                 rs = "outgroup: " + outgroup;
573                 break;
574             }
575             case NONE: {
576                 rs = "none";
577                 break;
578             }
579         }
580         log( "Re-rooting                          :\t" + rs );
581     }
582
583     public final static IntMatrix calculateOrthologTable( final Phylogeny[] analyzed_gene_trees, final boolean sort )
584             throws RIOException {
585         final List<String> labels = new ArrayList<String>();
586         final Set<String> labels_set = new HashSet<String>();
587         for( final PhylogenyNode n : analyzed_gene_trees[ 0 ].getExternalNodes() ) {
588             final String label = obtainLabel( labels_set, n );
589             labels_set.add( label );
590             labels.add( label );
591         }
592         if ( sort ) {
593             Collections.sort( labels );
594         }
595         final IntMatrix m = new IntMatrix( labels );
596         int counter = 0;
597         for( final Phylogeny gt : analyzed_gene_trees ) {
598             counter++;
599             updateCounts( m, counter, gt );
600         }
601         return m;
602     }
603
604     public final static RIO executeAnalysis( final File gene_trees_file,
605                                              final File species_tree_file,
606                                              final ALGORITHM algorithm,
607                                              final REROOTING rerooting,
608                                              final String outgroup,
609                                              final int first,
610                                              final int last,
611                                              final boolean produce_log,
612                                              final boolean verbose,
613                                              final boolean transfer_taxonomy )
614             throws IOException, SDIException, RIOException {
615         final Phylogeny[] gene_trees = parseGeneTrees( gene_trees_file );
616         if ( gene_trees.length < 1 ) {
617             throw new RIOException( "\"" + gene_trees_file + "\" is devoid of appropriate gene trees" );
618         }
619         final Phylogeny species_tree = SDIutil
620                 .parseSpeciesTree( gene_trees[ 0 ], species_tree_file, false, true, TAXONOMY_EXTRACTION.NO );
621         return new RIO( gene_trees,
622                         species_tree,
623                         algorithm,
624                         rerooting,
625                         outgroup,
626                         first,
627                         last,
628                         produce_log,
629                         verbose,
630                         transfer_taxonomy );
631     }
632
633     public final static RIO executeAnalysis( final File gene_trees_file,
634                                              final Phylogeny species_tree,
635                                              final ALGORITHM algorithm,
636                                              final REROOTING rerooting,
637                                              final String outgroup,
638                                              final boolean produce_log,
639                                              final boolean verbose,
640                                              final boolean transfer_taxonomy )
641             throws IOException, SDIException, RIOException {
642         return new RIO( parseGeneTrees( gene_trees_file ),
643                         species_tree,
644                         algorithm,
645                         rerooting,
646                         outgroup,
647                         DEFAULT_RANGE,
648                         DEFAULT_RANGE,
649                         produce_log,
650                         verbose,
651                         transfer_taxonomy );
652     }
653
654     public final static RIO executeAnalysis( final File gene_trees_file,
655                                              final Phylogeny species_tree,
656                                              final ALGORITHM algorithm,
657                                              final REROOTING rerooting,
658                                              final String outgroup,
659                                              final int first,
660                                              final int last,
661                                              final boolean produce_log,
662                                              final boolean verbose,
663                                              final boolean transfer_taxonomy )
664             throws IOException, SDIException, RIOException {
665         return new RIO( parseGeneTrees( gene_trees_file ),
666                         species_tree,
667                         algorithm,
668                         rerooting,
669                         outgroup,
670                         first,
671                         last,
672                         produce_log,
673                         verbose,
674                         transfer_taxonomy );
675     }
676
677     public final static RIO executeAnalysis( final IteratingPhylogenyParser p,
678                                              final File species_tree_file,
679                                              final ALGORITHM algorithm,
680                                              final REROOTING rerooting,
681                                              final String outgroup,
682                                              final int first,
683                                              final int last,
684                                              final boolean produce_log,
685                                              final boolean verbose,
686                                              final boolean transfer_taxonomy )
687             throws IOException, SDIException, RIOException {
688         final Phylogeny g0 = p.next();
689         if ( ( g0 == null ) || g0.isEmpty() || ( g0.getNumberOfExternalNodes() < 2 ) ) {
690             throw new RIOException( "input file does not seem to contain any gene trees" );
691         }
692         final Phylogeny species_tree = SDIutil
693                 .parseSpeciesTree( g0, species_tree_file, false, true, TAXONOMY_EXTRACTION.NO );
694         p.reset();
695         return new RIO( p,
696                         species_tree,
697                         algorithm,
698                         rerooting,
699                         outgroup,
700                         first,
701                         last,
702                         produce_log,
703                         verbose,
704                         transfer_taxonomy );
705     }
706
707     public final static RIO executeAnalysis( final IteratingPhylogenyParser p,
708                                              final Phylogeny species_tree,
709                                              final ALGORITHM algorithm,
710                                              final REROOTING rerooting,
711                                              final String outgroup,
712                                              final boolean produce_log,
713                                              final boolean verbose,
714                                              final boolean transfer_taxonomy )
715             throws IOException, SDIException, RIOException {
716         return new RIO( p,
717                         species_tree,
718                         algorithm,
719                         rerooting,
720                         outgroup,
721                         DEFAULT_RANGE,
722                         DEFAULT_RANGE,
723                         produce_log,
724                         verbose,
725                         transfer_taxonomy );
726     }
727
728     public final static RIO executeAnalysis( final IteratingPhylogenyParser p,
729                                              final Phylogeny species_tree,
730                                              final ALGORITHM algorithm,
731                                              final REROOTING rerooting,
732                                              final String outgroup,
733                                              final int first,
734                                              final int last,
735                                              final boolean produce_log,
736                                              final boolean verbose,
737                                              final boolean transfer_taxonomy )
738             throws IOException, SDIException, RIOException {
739         return new RIO( p,
740                         species_tree,
741                         algorithm,
742                         rerooting,
743                         outgroup,
744                         first,
745                         last,
746                         produce_log,
747                         verbose,
748                         transfer_taxonomy );
749     }
750
751     public final static RIO executeAnalysis( final Phylogeny[] gene_trees, final Phylogeny species_tree )
752             throws IOException, SDIException, RIOException {
753         return new RIO( gene_trees,
754                         species_tree,
755                         ALGORITHM.GSDIR,
756                         REROOTING.BY_ALGORITHM,
757                         null,
758                         DEFAULT_RANGE,
759                         DEFAULT_RANGE,
760                         false,
761                         false,
762                         false );
763     }
764
765     public final static RIO executeAnalysis( final Phylogeny[] gene_trees,
766                                              final Phylogeny species_tree,
767                                              final ALGORITHM algorithm,
768                                              final REROOTING rerooting,
769                                              final String outgroup,
770                                              final boolean produce_log,
771                                              final boolean verbose,
772                                              final boolean transfer_taxonomy )
773             throws IOException, SDIException, RIOException {
774         return new RIO( gene_trees,
775                         species_tree,
776                         algorithm,
777                         rerooting,
778                         outgroup,
779                         DEFAULT_RANGE,
780                         DEFAULT_RANGE,
781                         produce_log,
782                         verbose,
783                         transfer_taxonomy );
784     }
785
786     public final static RIO executeAnalysis( final Phylogeny[] gene_trees,
787                                              final Phylogeny species_tree,
788                                              final ALGORITHM algorithm,
789                                              final REROOTING rerooting,
790                                              final String outgroup,
791                                              final int first,
792                                              final int last,
793                                              final boolean produce_log,
794                                              final boolean verbose,
795                                              final boolean transfer_taxonomy )
796             throws IOException, SDIException, RIOException {
797         return new RIO( gene_trees,
798                         species_tree,
799                         algorithm,
800                         rerooting,
801                         outgroup,
802                         first,
803                         last,
804                         produce_log,
805                         verbose,
806                         transfer_taxonomy );
807     }
808
809     private final static void calculateOrthologTable( final Phylogeny g, final boolean sort, final int counter )
810             throws RIOException {
811         if ( counter == 0 ) {
812             final List<String> labels = new ArrayList<String>();
813             final Set<String> labels_set = new HashSet<String>();
814             for( final PhylogenyNode n : g.getExternalNodes() ) {
815                 final String label = obtainLabel( labels_set, n );
816                 labels_set.add( label );
817                 labels.add( label );
818             }
819             if ( sort ) {
820                 Collections.sort( labels );
821             }
822             _m = new IntMatrix( labels );
823         }
824         updateCounts( _m, counter, g );
825     }
826
827     private final static void checkPreconditions( final IteratingPhylogenyParser p,
828                                                   final Phylogeny species_tree,
829                                                   final REROOTING rerooting,
830                                                   final String outgroup,
831                                                   final int first,
832                                                   final int last )
833             throws RIOException, IOException {
834         final Phylogeny g0 = p.next();
835         if ( ( g0 == null ) || g0.isEmpty() ) {
836             throw new RIOException( "input file does not seem to contain any gene trees" );
837         }
838         if ( g0.getNumberOfExternalNodes() < 2 ) {
839             throw new RIOException( "input file does not seem to contain any useable gene trees" );
840         }
841         if ( !species_tree.isRooted() ) {
842             throw new RIOException( "species tree is not rooted" );
843         }
844         if ( !( ( last == DEFAULT_RANGE ) && ( first == DEFAULT_RANGE ) )
845                 && ( ( last < first ) || ( last < 0 ) || ( first < 0 ) ) ) {
846             throw new RIOException( "attempt to set range (0-based) of gene to analyze to: from " + first + " to "
847                     + last );
848         }
849         if ( ( rerooting == REROOTING.OUTGROUP ) && ForesterUtil.isEmpty( outgroup ) ) {
850             throw new RIOException( "outgroup not set for midpoint rooting" );
851         }
852         if ( ( rerooting != REROOTING.OUTGROUP ) && !ForesterUtil.isEmpty( outgroup ) ) {
853             throw new RIOException( "outgroup only used for midpoint rooting" );
854         }
855         if ( ( rerooting == REROOTING.MIDPOINT ) && ( PhylogenyMethods.calculateMaxDistanceToRoot( g0 ) <= 0 ) ) {
856             throw new RIOException( "attempt to use midpoint rooting on gene trees which seem to have no (positive) branch lengths (cladograms)" );
857         }
858         if ( rerooting == REROOTING.OUTGROUP ) {
859             try {
860                 g0.getNode( outgroup );
861             }
862             catch ( final IllegalArgumentException e ) {
863                 throw new RIOException( "cannot perform re-rooting by outgroup: " + e.getLocalizedMessage() );
864             }
865         }
866     }
867
868     private final static void checkPreconditions( final Phylogeny[] gene_trees,
869                                                   final Phylogeny species_tree,
870                                                   final REROOTING rerooting,
871                                                   final String outgroup,
872                                                   final int first,
873                                                   final int last )
874             throws RIOException {
875         if ( !species_tree.isRooted() ) {
876             throw new RIOException( "species tree is not rooted" );
877         }
878         if ( !( ( last == DEFAULT_RANGE ) && ( first == DEFAULT_RANGE ) )
879                 && ( ( last < first ) || ( last >= gene_trees.length ) || ( last < 0 ) || ( first < 0 ) ) ) {
880             throw new RIOException( "attempt to set range (0-based) of gene to analyze to: from " + first + " to "
881                     + last + " (out of " + gene_trees.length + ")" );
882         }
883         if ( ( rerooting == REROOTING.OUTGROUP ) && ForesterUtil.isEmpty( outgroup ) ) {
884             throw new RIOException( "outgroup not set for midpoint rooting" );
885         }
886         if ( ( rerooting != REROOTING.OUTGROUP ) && !ForesterUtil.isEmpty( outgroup ) ) {
887             throw new RIOException( "outgroup only used for midpoint rooting" );
888         }
889         if ( ( rerooting == REROOTING.MIDPOINT )
890                 && ( PhylogenyMethods.calculateMaxDistanceToRoot( gene_trees[ 0 ] ) <= 0 ) ) {
891             throw new RIOException( "attempt to use midpoint rooting on gene trees which seem to have no (positive) branch lengths (cladograms)" );
892         }
893         if ( rerooting == REROOTING.OUTGROUP ) {
894             try {
895                 gene_trees[ 0 ].getNode( outgroup );
896             }
897             catch ( final IllegalArgumentException e ) {
898                 throw new RIOException( "cannot perform re-rooting by outgroup: " + e.getLocalizedMessage() );
899             }
900         }
901     }
902
903     private final static String obtainLabel( final Set<String> labels_set, final PhylogenyNode n ) throws RIOException {
904         String label;
905         if ( n.getNodeData().isHasSequence() && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getName() ) ) {
906             label = n.getNodeData().getSequence().getName();
907         }
908         else if ( n.getNodeData().isHasSequence()
909                 && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getSymbol() ) ) {
910             label = n.getNodeData().getSequence().getSymbol();
911         }
912         else if ( n.getNodeData().isHasSequence()
913                 && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getGeneName() ) ) {
914             label = n.getNodeData().getSequence().getGeneName();
915         }
916         else if ( !ForesterUtil.isEmpty( n.getName() ) ) {
917             label = n.getName();
918         }
919         else {
920             throw new RIOException( "node " + n + " has no appropriate label" );
921         }
922         if ( labels_set.contains( label ) ) {
923             throw new RIOException( "label " + label + " is not unique" );
924         }
925         return label;
926     }
927
928     private final static Phylogeny[] parseGeneTrees( final File gene_trees_file )
929             throws FileNotFoundException, IOException {
930         final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
931         final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
932         if ( p instanceof NHXParser ) {
933             final NHXParser nhx = ( NHXParser ) p;
934             nhx.setReplaceUnderscores( false );
935             nhx.setIgnoreQuotes( true );
936             nhx.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
937         }
938         else if ( p instanceof NexusPhylogeniesParser ) {
939             final NexusPhylogeniesParser nex = ( NexusPhylogeniesParser ) p;
940             nex.setReplaceUnderscores( false );
941             nex.setIgnoreQuotes( true );
942             nex.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
943         }
944         return factory.create( gene_trees_file, p );
945     }
946
947     private final static void removeSingleDescendentsNodes( final Phylogeny species_tree, final boolean verbose ) {
948         final int o = PhylogenyMethods.countNumberOfOneDescendantNodes( species_tree );
949         if ( o > 0 ) {
950             if ( verbose ) {
951                 System.out.println( "warning: species tree has " + o
952                         + " internal nodes with only one descendent which are therefore going to be removed" );
953             }
954             PhylogenyMethods.deleteInternalNodesWithOnlyOneDescendent( species_tree );
955         }
956     }
957
958     private final static void updateCounts( final IntMatrix m, final int counter, final Phylogeny g )
959             throws RIOException {
960         PhylogenyMethods.preOrderReId( g );
961         final HashMap<String, PhylogenyNode> map = PhylogenyMethods.createNameToExtNodeMap( g );
962         for( int x = 0; x < m.size(); ++x ) {
963             final String mx = m.getLabel( x );
964             final PhylogenyNode nx = map.get( mx );
965             if ( nx == null ) {
966                 throw new RIOException( "node \"" + mx + "\" not present in gene tree #" + counter );
967             }
968             String my;
969             PhylogenyNode ny;
970             for( int y = 0; y < m.size(); ++y ) {
971                 my = m.getLabel( y );
972                 ny = map.get( my );
973                 if ( ny == null ) {
974                     throw new RIOException( "node \"" + my + "\" not present in gene tree #" + counter );
975                 }
976                 if ( !PhylogenyMethods.calculateLCAonTreeWithIdsInPreOrder( nx, ny ).isDuplication() ) {
977                     m.inreaseByOne( x, y );
978                 }
979             }
980         }
981     }
982
983     public enum REROOTING {
984                            NONE,
985                            BY_ALGORITHM,
986                            MIDPOINT,
987                            OUTGROUP;
988     }
989 }