09020348807dcf3652935b6e44b6d049c7594968
[jalview.git] / forester / java / src / org / forester / rio / RIO.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // Copyright (C) 2000-2001 Washington University School of Medicine
8 // and Howard Hughes Medical Institute
9 // All rights reserved
10 //
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
15 //
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
20 //
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 //
25 // Contact: phylosoft @ gmail . com
26 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
27
28 package org.forester.rio;
29
30 import java.io.File;
31 import java.io.FileNotFoundException;
32 import java.io.IOException;
33 import java.util.ArrayList;
34 import java.util.Collections;
35 import java.util.HashMap;
36 import java.util.HashSet;
37 import java.util.List;
38 import java.util.Set;
39 import java.util.SortedSet;
40 import java.util.TreeSet;
41
42 import org.forester.datastructures.IntMatrix;
43 import org.forester.io.parsers.IteratingPhylogenyParser;
44 import org.forester.io.parsers.PhylogenyParser;
45 import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
46 import org.forester.io.parsers.nhx.NHXParser;
47 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
48 import org.forester.io.parsers.util.ParserUtils;
49 import org.forester.phylogeny.Phylogeny;
50 import org.forester.phylogeny.PhylogenyMethods;
51 import org.forester.phylogeny.PhylogenyNode;
52 import org.forester.phylogeny.data.Taxonomy;
53 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
54 import org.forester.phylogeny.factories.PhylogenyFactory;
55 import org.forester.sdi.GSDI;
56 import org.forester.sdi.GSDIR;
57 import org.forester.sdi.SDIException;
58 import org.forester.sdi.SDIR;
59 import org.forester.sdi.SDIutil;
60 import org.forester.sdi.SDIutil.ALGORITHM;
61 import org.forester.sdi.SDIutil.TaxonomyComparisonBase;
62 import org.forester.util.BasicDescriptiveStatistics;
63 import org.forester.util.ForesterUtil;
64
65 public final class RIO {
66
67     public static final int                  DEFAULT_RANGE = -1;
68     private static final int                 END_OF_GT     = Integer.MAX_VALUE;
69     private static IntMatrix                 _m;
70     private Phylogeny[]                      _analyzed_gene_trees;
71     private List<PhylogenyNode>              _removed_gene_tree_nodes;
72     private int                              _ext_nodes;
73     private int                              _int_nodes;
74     private TaxonomyComparisonBase           _gsdir_tax_comp_base;
75     private final StringBuilder              _log;
76     private final BasicDescriptiveStatistics _duplications_stats;
77     private final boolean                    _produce_log;
78     private final boolean                    _verbose;
79     private final REROOTING                  _rerooting;
80     private final Phylogeny                  _species_tree;
81     private Phylogeny                        _min_dub_gene_tree;
82
83     private RIO( final IteratingPhylogenyParser p,
84                  final Phylogeny species_tree,
85                  final ALGORITHM algorithm,
86                  final REROOTING rerooting,
87                  final String outgroup,
88                  int first,
89                  int last,
90                  final boolean produce_log,
91                  final boolean verbose,
92                  final boolean transfer_taxonomy ) throws IOException, SDIException, RIOException {
93         if ( ( last == DEFAULT_RANGE ) && ( first >= 0 ) ) {
94             last = END_OF_GT;
95         }
96         else if ( ( first == DEFAULT_RANGE ) && ( last >= 0 ) ) {
97             first = 0;
98         }
99         removeSingleDescendentsNodes( species_tree, verbose );
100         p.reset();
101         checkPreconditions( p, species_tree, rerooting, outgroup, first, last );
102         _produce_log = produce_log;
103         _verbose = verbose;
104         _rerooting = rerooting;
105         _ext_nodes = -1;
106         _int_nodes = -1;
107         _log = new StringBuilder();
108         _gsdir_tax_comp_base = null;
109         _analyzed_gene_trees = null;
110         _removed_gene_tree_nodes = null;
111         _duplications_stats = new BasicDescriptiveStatistics();
112         p.reset();
113         inferOrthologs( p, species_tree, algorithm, outgroup, first, last, transfer_taxonomy );
114         _species_tree = species_tree;
115     }
116
117     private RIO( final Phylogeny[] gene_trees,
118                  final Phylogeny species_tree,
119                  final ALGORITHM algorithm,
120                  final REROOTING rerooting,
121                  final String outgroup,
122                  int first,
123                  int last,
124                  final boolean produce_log,
125                  final boolean verbose,
126                  final boolean transfer_taxonomy ) throws IOException, SDIException, RIOException {
127         if ( ( last == DEFAULT_RANGE ) && ( first >= 0 ) ) {
128             last = gene_trees.length - 1;
129         }
130         else if ( ( first == DEFAULT_RANGE ) && ( last >= 0 ) ) {
131             first = 0;
132         }
133         removeSingleDescendentsNodes( species_tree, verbose );
134         checkPreconditions( gene_trees, species_tree, rerooting, outgroup, first, last );
135         _produce_log = produce_log;
136         _verbose = verbose;
137         _rerooting = rerooting;
138         _ext_nodes = -1;
139         _int_nodes = -1;
140         _log = new StringBuilder();
141         _gsdir_tax_comp_base = null;
142         _analyzed_gene_trees = null;
143         _removed_gene_tree_nodes = null;
144         _duplications_stats = new BasicDescriptiveStatistics();
145         inferOrthologs( gene_trees, species_tree, algorithm, outgroup, first, last, transfer_taxonomy );
146         _species_tree = species_tree;
147     }
148
149     public final Phylogeny[] getAnalyzedGeneTrees() {
150         return _analyzed_gene_trees;
151     }
152
153     public final BasicDescriptiveStatistics getDuplicationsStatistics() {
154         return _duplications_stats;
155     }
156
157     /**
158      * Returns the numbers of number of ext nodes in gene trees analyzed (after
159      * stripping).
160      *
161      * @return number of ext nodes in gene trees analyzed (after stripping)
162      */
163     public final int getExtNodesOfAnalyzedGeneTrees() {
164         return _ext_nodes;
165     }
166
167     public final TaxonomyComparisonBase getGSDIRtaxCompBase() {
168         return _gsdir_tax_comp_base;
169     }
170
171     /**
172      * Returns the numbers of number of int nodes in gene trees analyzed (after
173      * stripping).
174      *
175      * @return number of int nodes in gene trees analyzed (after stripping)
176      */
177     public final int getIntNodesOfAnalyzedGeneTrees() {
178         return _int_nodes;
179     }
180
181     public final StringBuilder getLog() {
182         return _log;
183     }
184
185     final public Phylogeny getMinDuplicationsGeneTree() {
186         return _min_dub_gene_tree;
187     }
188
189     public final IntMatrix getOrthologTable() {
190         return _m;
191     }
192
193     public final List<PhylogenyNode> getRemovedGeneTreeNodes() {
194         return _removed_gene_tree_nodes;
195     }
196
197     public final Phylogeny getSpeciesTree() {
198         return _species_tree;
199     }
200
201     private final void inferOrthologs( final IteratingPhylogenyParser parser,
202                                        final Phylogeny species_tree,
203                                        final ALGORITHM algorithm,
204                                        final String outgroup,
205                                        int first,
206                                        final int last,
207                                        final boolean transfer_taxonomy ) throws SDIException, RIOException,
208                                        FileNotFoundException, IOException {
209         if ( !parser.hasNext() ) {
210             throw new RIOException( "no gene trees to analyze" );
211         }
212         if ( log() ) {
213             preLog( -1, species_tree, algorithm, outgroup );
214         }
215         if ( _verbose ) {
216             System.out.println();
217         }
218         int gene_tree_ext_nodes = 0;
219         int i = 0;
220         int counter = 0;
221         final boolean no_range = ( first < 0 ) || ( last < first );
222         while ( parser.hasNext() ) {
223             final Phylogeny gt = parser.next();
224             if ( no_range || ( ( i >= first ) && ( i <= last ) ) ) {
225                 if ( gt.isEmpty() ) {
226                     throw new RIOException( "gene tree #" + i + " is empty" );
227                 }
228                 if ( gt.getNumberOfExternalNodes() == 1 ) {
229                     throw new RIOException( "gene tree #" + i + " has only one external node" );
230                 }
231                 if ( _verbose ) {
232                     System.out.print( "\r" + i );
233                 }
234                 if ( counter == 0 ) {
235                     if ( algorithm == ALGORITHM.SDIR ) {
236                         // Removes from species_tree all species not found in gene_tree.
237                         PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gt, species_tree );
238                         if ( species_tree.isEmpty() ) {
239                             throw new RIOException( "failed to establish species based mapping between gene and species trees" );
240                         }
241                     }
242                     gene_tree_ext_nodes = gt.getNumberOfExternalNodes();
243                 }
244                 else if ( gene_tree_ext_nodes != gt.getNumberOfExternalNodes() ) {
245                     throw new RIOException( "gene tree #" + i + " has a different number of external nodes ("
246                             + gt.getNumberOfExternalNodes() + ") than the preceding gene tree(s) ("
247                             + gene_tree_ext_nodes + ")" );
248                 }
249                 if ( algorithm == ALGORITHM.SDIR ) {
250                     // Removes from gene_tree all species not found in species_tree.
251                     PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gt );
252                     if ( gt.isEmpty() ) {
253                         throw new RIOException( "failed to establish species based mapping between gene and species trees" );
254                     }
255                 }
256                 final Phylogeny analyzed_gt = performOrthologInference( gt,
257                                                                         species_tree,
258                                                                         algorithm,
259                                                                         outgroup,
260                                                                         counter,
261                                                                         transfer_taxonomy );
262                 RIO.calculateOrthologTable( analyzed_gt, true, counter );
263                 ++counter;
264             }
265             ++i;
266         }
267         if ( _verbose ) {
268             System.out.print( "\rGene trees analyzed                 :\t" + counter  );
269         }
270         if ( ( first >= 0 ) && ( counter == 0 ) && ( i > 0 ) ) {
271             throw new RIOException( "attempt to analyze first gene tree #" + first + " in a set of " + i );
272         }
273         if ( no_range ) {
274             first = 0;
275         }
276         if ( log() ) {
277             postLog( species_tree, first, ( first + counter ) - 1 );
278         }
279         if ( _verbose ) {
280             System.out.println();
281             System.out.println();
282         }
283     }
284
285     private final void inferOrthologs( final Phylogeny[] gene_trees,
286                                        final Phylogeny species_tree,
287                                        final ALGORITHM algorithm,
288                                        final String outgroup,
289                                        final int first,
290                                        final int last,
291                                        final boolean transfer_taxonomy ) throws SDIException, RIOException,
292                                        FileNotFoundException, IOException {
293         if ( algorithm == ALGORITHM.SDIR ) {
294             // Removes from species_tree all species not found in gene_tree.
295             PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gene_trees[ 0 ], species_tree );
296             if ( species_tree.isEmpty() ) {
297                 throw new RIOException( "failed to establish species based mapping between gene and species trees" );
298             }
299         }
300         final Phylogeny[] my_gene_trees;
301         if ( ( first >= 0 ) && ( last >= first ) && ( last < gene_trees.length ) ) {
302             my_gene_trees = new Phylogeny[ ( 1 + last ) - first ];
303             int c = 0;
304             for( int i = first; i <= last; ++i ) {
305                 my_gene_trees[ c++ ] = gene_trees[ i ];
306             }
307         }
308         else {
309             my_gene_trees = gene_trees;
310         }
311         if ( log() ) {
312             preLog( gene_trees.length, species_tree, algorithm, outgroup );
313         }
314         if ( _verbose && ( my_gene_trees.length >= 4 ) ) {
315             System.out.println();
316         }
317         _analyzed_gene_trees = new Phylogeny[ my_gene_trees.length ];
318         int gene_tree_ext_nodes = 0;
319         for( int i = 0; i < my_gene_trees.length; ++i ) {
320             final Phylogeny gt = my_gene_trees[ i ];
321             if ( gt.isEmpty() ) {
322                 throw new RIOException( "gene tree #" + i + " is empty" );
323             }
324             if ( gt.getNumberOfExternalNodes() == 1 ) {
325                 throw new RIOException( "gene tree #" + i + " has only one external node" );
326             }
327             if ( _verbose && ( my_gene_trees.length > 4 ) ) {
328                 ForesterUtil.updateProgress( ( ( double ) i ) / my_gene_trees.length );
329             }
330             if ( i == 0 ) {
331                 gene_tree_ext_nodes = gt.getNumberOfExternalNodes();
332             }
333             else if ( gene_tree_ext_nodes != gt.getNumberOfExternalNodes() ) {
334                 throw new RIOException( "gene tree #" + i + " has a different number of external nodes ("
335                         + gt.getNumberOfExternalNodes() + ") than the preceding gene tree(s) (" + gene_tree_ext_nodes
336                         + ")" );
337             }
338             if ( algorithm == ALGORITHM.SDIR ) {
339                 // Removes from gene_tree all species not found in species_tree.
340                 PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gt );
341                 if ( gt.isEmpty() ) {
342                     throw new RIOException( "failed to establish species based mapping between gene and species trees" );
343                 }
344             }
345             _analyzed_gene_trees[ i ] = performOrthologInference( gt,
346                                                                   species_tree,
347                                                                   algorithm,
348                                                                   outgroup,
349                                                                   i,
350                                                                   transfer_taxonomy );
351         }
352         if ( log() ) {
353             postLog( species_tree, first, last );
354         }
355         if ( _verbose && ( my_gene_trees.length > 4 ) ) {
356             System.out.println();
357             System.out.println();
358         }
359     }
360
361     private final boolean log() {
362         return _produce_log;
363     }
364
365     private final void log( final String s ) {
366         _log.append( s );
367         _log.append( ForesterUtil.LINE_SEPARATOR );
368     }
369
370     private final void logRemovedGeneTreeNodes() {
371         final SortedSet<String> rn = new TreeSet<String>();
372         for( final PhylogenyNode n : getRemovedGeneTreeNodes() ) {
373             final Taxonomy t = n.getNodeData().getTaxonomy();
374             switch ( getGSDIRtaxCompBase() ) {
375                 case CODE: {
376                     rn.add( t.getTaxonomyCode() );
377                     break;
378                 }
379                 case ID: {
380                     rn.add( t.getIdentifier().toString() );
381                     break;
382                 }
383                 case SCIENTIFIC_NAME: {
384                     rn.add( t.getScientificName() );
385                     break;
386                 }
387             }
388         }
389         final StringBuilder sb = new StringBuilder();
390         for( final String s : rn ) {
391             sb.append( '\t' );
392             sb.append( s );
393         }
394         log( "Species stripped from gene trees    :" + sb);
395     }
396
397     private final Phylogeny performOrthologInference( final Phylogeny gene_tree,
398                                                       final Phylogeny species_tree,
399                                                       final ALGORITHM algorithm,
400                                                       final String outgroup,
401                                                       final int i,
402                                                       final boolean transfer_taxonomy ) throws SDIException,
403                                                       RIOException {
404         final Phylogeny assigned_tree;
405         switch ( algorithm ) {
406             case SDIR: {
407                 assigned_tree = performOrthologInferenceBySDI( gene_tree, species_tree );
408                 break;
409             }
410             case GSDIR: {
411                 assigned_tree = performOrthologInferenceByGSDI( gene_tree, species_tree, outgroup, i, transfer_taxonomy );
412                 break;
413             }
414             default: {
415                 throw new IllegalArgumentException( "illegal algorithm: " + algorithm );
416             }
417         }
418         if ( i == 0 ) {
419             _ext_nodes = assigned_tree.getNumberOfExternalNodes();
420             _int_nodes = assigned_tree.getNumberOfInternalNodes();
421         }
422         else if ( _ext_nodes != assigned_tree.getNumberOfExternalNodes() ) {
423             throw new RIOException( "after stripping gene tree #" + i + " has a different number of external nodes ("
424                     + assigned_tree.getNumberOfExternalNodes() + ") than the preceding gene tree(s) (" + _ext_nodes
425                     + ")" );
426         }
427         return assigned_tree;
428     }
429
430     private final Phylogeny performOrthologInferenceByGSDI( final Phylogeny gene_tree,
431                                                             final Phylogeny species_tree,
432                                                             final String outgroup,
433                                                             final int i,
434                                                             final boolean transfer_taxonomy ) throws SDIException,
435                                                             RIOException {
436         final Phylogeny assigned_tree;
437         final int dups;
438         if ( _rerooting == REROOTING.BY_ALGORITHM ) {
439             final GSDIR gsdir = new GSDIR( gene_tree, species_tree, true, i == 0, transfer_taxonomy );
440             assigned_tree = gsdir.getMinDuplicationsSumGeneTree();
441             if ( i == 0 ) {
442                 _removed_gene_tree_nodes = gsdir.getStrippedExternalGeneTreeNodes();
443                 for( final PhylogenyNode r : _removed_gene_tree_nodes ) {
444                     if ( !r.getNodeData().isHasTaxonomy() ) {
445                         throw new RIOException( "node with no (appropriate) taxonomic information found in gene tree #"
446                                 + i + ": " + r.toString() );
447                     }
448                 }
449             }
450             if ( i == 0 ) {
451                 _gsdir_tax_comp_base = gsdir.getTaxCompBase();
452             }
453             dups = gsdir.getMinDuplicationsSum();
454         }
455         else {
456             if ( _rerooting == REROOTING.MIDPOINT ) {
457                 PhylogenyMethods.midpointRoot( gene_tree );
458             }
459             else if ( _rerooting == REROOTING.OUTGROUP ) {
460                 final PhylogenyNode n = gene_tree.getNode( outgroup );
461                 gene_tree.reRoot( n );
462             }
463             final GSDI gsdi = new GSDI( gene_tree, species_tree, true, true, true, transfer_taxonomy );
464             _removed_gene_tree_nodes = gsdi.getStrippedExternalGeneTreeNodes();
465             for( final PhylogenyNode r : _removed_gene_tree_nodes ) {
466                 if ( !r.getNodeData().isHasTaxonomy() ) {
467                     throw new RIOException( "node with no (appropriate) taxonomic information found in gene tree #" + i
468                                             + ": " + r.toString() );
469                 }
470             }
471             assigned_tree = gene_tree;
472             if ( i == 0 ) {
473                 _gsdir_tax_comp_base = gsdi.getTaxCompBase();
474             }
475             dups = gsdi.getDuplicationsSum();
476         }
477         if ( ( i == 0 ) || ( dups < _duplications_stats.getMin() ) ) {
478             _min_dub_gene_tree = assigned_tree;
479             _min_dub_gene_tree.setRerootable( false );
480         }
481         _duplications_stats.addValue( dups );
482         return assigned_tree;
483     }
484
485     private final Phylogeny performOrthologInferenceBySDI( final Phylogeny gene_tree, final Phylogeny species_tree )
486             throws SDIException {
487         final SDIR sdir = new SDIR();
488         return sdir.infer( gene_tree, species_tree, false, true, true, true, 1 )[ 0 ];
489     }
490
491     private final void postLog( final Phylogeny species_tree, final int first, final int last ) {
492         final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.##" );
493         final int min = ( int ) getDuplicationsStatistics().getMin();
494         final int max = ( int ) getDuplicationsStatistics().getMax();
495         final int median = ( int ) getDuplicationsStatistics().median();
496         int min_count = 0;
497         int max_count = 0;
498         int median_count = 0;
499         for( double d : getDuplicationsStatistics().getData() ) {
500             if ( ( ( int ) d ) == min ) {
501                 ++min_count;
502             }
503             if ( ( ( int ) d ) == max ) {
504                 ++max_count;
505             }
506             if ( ( ( int ) d ) == median ) {
507                 ++median_count;
508             }
509         }
510         final double min_count_percentage = ( 100.0 * min_count ) / getDuplicationsStatistics().getN();
511         final double max_count_percentage = ( 100.0 * max_count ) / getDuplicationsStatistics().getN();
512         final double median_count_percentage = ( 100.0 * median_count ) / getDuplicationsStatistics().getN();
513         
514        
515         if ( ( getRemovedGeneTreeNodes() != null ) && ( getRemovedGeneTreeNodes().size() > 0 ) ) {
516             logRemovedGeneTreeNodes();
517         }
518        
519         log( "Gene trees analyzed                 :\t" + getDuplicationsStatistics().getN() );
520         if ( ( first >= 0 ) && ( last >= 0 ) ) {
521             log( "Gene trees analyzed range           :\t" + first + "-" + last );
522         }
523         log( "Gene tree internal nodes            :\t" + getIntNodesOfAnalyzedGeneTrees() );
524         log( "Gene tree external nodes            :\t" + getExtNodesOfAnalyzedGeneTrees() );
525         log( "Removed ext gene tree nodes         :\t" + getRemovedGeneTreeNodes().size() );
526         log( "Spec tree ext nodes (after strip)   :\t" + species_tree.getNumberOfExternalNodes() );
527         log( "Spec tree polytomies (after strip)  :\t"
528                 + PhylogenyMethods.countNumberOfPolytomies( species_tree ) );
529         log( "Taxonomy linking based on           :\t" + getGSDIRtaxCompBase() );
530         log( "Mean number of duplications         :\t" + df.format( getDuplicationsStatistics().arithmeticMean() )
531                 + "\t" + df.format( ( 100.0 * getDuplicationsStatistics().arithmeticMean() ) / getIntNodesOfAnalyzedGeneTrees() )
532                 + "%\t(sd: " + df.format( getDuplicationsStatistics().sampleStandardDeviation() ) + ")" );
533         if ( getDuplicationsStatistics().getN() > 3 ) {
534             log( "Median number of duplications       :\t" + df.format( median ) + "\t"
535                     + df.format( ( 100.0 * median ) / getIntNodesOfAnalyzedGeneTrees() ) + "%" );
536         }
537         log( "Minimum duplications                :\t" + min + "\t"
538                 + df.format( ( 100.0 * min ) / getIntNodesOfAnalyzedGeneTrees() ) + "%" );
539         log( "Maximum duplications                :\t" + ( int ) max + "\t"
540                 + df.format( ( 100.0 * max ) / getIntNodesOfAnalyzedGeneTrees() ) + "%" );
541         log( "Gene trees with median duplications :\t" + median_count + "\t"
542                 + df.format( median_count_percentage ) + "%" );
543         log( "Gene trees with minimum duplications:\t" + min_count + "\t"
544                 + df.format( min_count_percentage ) + "%" );
545         log( "Gene trees with maximum duplications:\t" + max_count + "\t"
546                 + df.format( max_count_percentage ) + "%" );
547         
548     }
549
550     private final void preLog( final int gene_trees,
551                                final Phylogeny species_tree,
552                                final ALGORITHM algorithm,
553                                final String outgroup ) {
554         if ( gene_trees > 0 ) {
555             log( "Number of gene trees (total)        :\t" + gene_trees );
556         }
557
558         log( "Algorithm                           :\t" + algorithm );
559         log( "Spec tree ext nodes (prior strip)   :\t" + species_tree.getNumberOfExternalNodes() );
560         log( "Spec tree polytomies (prior strip)  :\t"
561                 + PhylogenyMethods.countNumberOfPolytomies( species_tree ) );
562         String rs = "";
563         switch ( _rerooting ) {
564             case BY_ALGORITHM: {
565                 rs = "minimizing duplications";
566                 break;
567             }
568             case MIDPOINT: {
569                 rs = "midpoint";
570                 break;
571             }
572             case OUTGROUP: {
573                 rs = "outgroup: " + outgroup;
574                 break;
575             }
576             case NONE: {
577                 rs = "none";
578                 break;
579             }
580         }
581         log( "Re-rooting                          :\t" + rs );
582         
583     }
584
585     public final static IntMatrix calculateOrthologTable( final Phylogeny[] analyzed_gene_trees, final boolean sort )
586             throws RIOException {
587         final List<String> labels = new ArrayList<String>();
588         final Set<String> labels_set = new HashSet<String>();
589         for( final PhylogenyNode n : analyzed_gene_trees[ 0 ].getExternalNodes() ) {
590             final String label = obtainLabel( labels_set, n );
591             labels_set.add( label );
592             labels.add( label );
593         }
594         if ( sort ) {
595             Collections.sort( labels );
596         }
597         final IntMatrix m = new IntMatrix( labels );
598         int counter = 0;
599         for( final Phylogeny gt : analyzed_gene_trees ) {
600             counter++;
601             updateCounts( m, counter, gt );
602         }
603         return m;
604     }
605
606     public final static RIO executeAnalysis( final File gene_trees_file,
607                                              final File species_tree_file,
608                                              final ALGORITHM algorithm,
609                                              final REROOTING rerooting,
610                                              final String outgroup,
611                                              final int first,
612                                              final int last,
613                                              final boolean produce_log,
614                                              final boolean verbose,
615                                              final boolean transfer_taxonomy ) throws IOException, SDIException,
616                                              RIOException {
617         final Phylogeny[] gene_trees = parseGeneTrees( gene_trees_file );
618         if ( gene_trees.length < 1 ) {
619             throw new RIOException( "\"" + gene_trees_file + "\" is devoid of appropriate gene trees" );
620         }
621         final Phylogeny species_tree = SDIutil.parseSpeciesTree( gene_trees[ 0 ],
622                                                                  species_tree_file,
623                                                                  false,
624                                                                  true,
625                                                                  TAXONOMY_EXTRACTION.NO );
626         return new RIO( gene_trees,
627                         species_tree,
628                         algorithm,
629                         rerooting,
630                         outgroup,
631                         first,
632                         last,
633                         produce_log,
634                         verbose,
635                         transfer_taxonomy );
636     }
637
638     public final static RIO executeAnalysis( final File gene_trees_file,
639                                              final Phylogeny species_tree,
640                                              final ALGORITHM algorithm,
641                                              final REROOTING rerooting,
642                                              final String outgroup,
643                                              final boolean produce_log,
644                                              final boolean verbose,
645                                              final boolean transfer_taxonomy ) throws IOException, SDIException,
646                                              RIOException {
647         return new RIO( parseGeneTrees( gene_trees_file ),
648                         species_tree,
649                         algorithm,
650                         rerooting,
651                         outgroup,
652                         DEFAULT_RANGE,
653                         DEFAULT_RANGE,
654                         produce_log,
655                         verbose,
656                         transfer_taxonomy );
657     }
658
659     public final static RIO executeAnalysis( final File gene_trees_file,
660                                              final Phylogeny species_tree,
661                                              final ALGORITHM algorithm,
662                                              final REROOTING rerooting,
663                                              final String outgroup,
664                                              final int first,
665                                              final int last,
666                                              final boolean produce_log,
667                                              final boolean verbose,
668                                              final boolean transfer_taxonomy ) throws IOException, SDIException,
669                                              RIOException {
670         return new RIO( parseGeneTrees( gene_trees_file ),
671                         species_tree,
672                         algorithm,
673                         rerooting,
674                         outgroup,
675                         first,
676                         last,
677                         produce_log,
678                         verbose,
679                         transfer_taxonomy );
680     }
681
682     public final static RIO executeAnalysis( final IteratingPhylogenyParser p,
683                                              final File species_tree_file,
684                                              final ALGORITHM algorithm,
685                                              final REROOTING rerooting,
686                                              final String outgroup,
687                                              final int first,
688                                              final int last,
689                                              final boolean produce_log,
690                                              final boolean verbose,
691                                              final boolean transfer_taxonomy ) throws IOException, SDIException,
692                                              RIOException {
693         final Phylogeny g0 = p.next();
694         if ( ( g0 == null ) || g0.isEmpty() || ( g0.getNumberOfExternalNodes() < 2 ) ) {
695             throw new RIOException( "input file does not seem to contain any gene trees" );
696         }
697         final Phylogeny species_tree = SDIutil.parseSpeciesTree( g0,
698                                                                  species_tree_file,
699                                                                  false,
700                                                                  true,
701                                                                  TAXONOMY_EXTRACTION.NO );
702         p.reset();
703         return new RIO( p,
704                         species_tree,
705                         algorithm,
706                         rerooting,
707                         outgroup,
708                         first,
709                         last,
710                         produce_log,
711                         verbose,
712                         transfer_taxonomy );
713     }
714
715     public final static RIO executeAnalysis( final IteratingPhylogenyParser p,
716                                              final Phylogeny species_tree,
717                                              final ALGORITHM algorithm,
718                                              final REROOTING rerooting,
719                                              final String outgroup,
720                                              final boolean produce_log,
721                                              final boolean verbose,
722                                              final boolean transfer_taxonomy ) throws IOException, SDIException,
723                                              RIOException {
724         return new RIO( p,
725                         species_tree,
726                         algorithm,
727                         rerooting,
728                         outgroup,
729                         DEFAULT_RANGE,
730                         DEFAULT_RANGE,
731                         produce_log,
732                         verbose,
733                         transfer_taxonomy );
734     }
735
736     public final static RIO executeAnalysis( final IteratingPhylogenyParser p,
737                                              final Phylogeny species_tree,
738                                              final ALGORITHM algorithm,
739                                              final REROOTING rerooting,
740                                              final String outgroup,
741                                              final int first,
742                                              final int last,
743                                              final boolean produce_log,
744                                              final boolean verbose,
745                                              final boolean transfer_taxonomy ) throws IOException, SDIException,
746                                              RIOException {
747         return new RIO( p,
748                         species_tree,
749                         algorithm,
750                         rerooting,
751                         outgroup,
752                         first,
753                         last,
754                         produce_log,
755                         verbose,
756                         transfer_taxonomy );
757     }
758
759     public final static RIO executeAnalysis( final Phylogeny[] gene_trees, final Phylogeny species_tree )
760             throws IOException, SDIException, RIOException {
761         return new RIO( gene_trees,
762                         species_tree,
763                         ALGORITHM.GSDIR,
764                         REROOTING.BY_ALGORITHM,
765                         null,
766                         DEFAULT_RANGE,
767                         DEFAULT_RANGE,
768                         false,
769                         false,
770                         false );
771     }
772
773     public final static RIO executeAnalysis( final Phylogeny[] gene_trees,
774                                              final Phylogeny species_tree,
775                                              final ALGORITHM algorithm,
776                                              final REROOTING rerooting,
777                                              final String outgroup,
778                                              final boolean produce_log,
779                                              final boolean verbose,
780                                              final boolean transfer_taxonomy ) throws IOException, SDIException,
781                                              RIOException {
782         return new RIO( gene_trees,
783                         species_tree,
784                         algorithm,
785                         rerooting,
786                         outgroup,
787                         DEFAULT_RANGE,
788                         DEFAULT_RANGE,
789                         produce_log,
790                         verbose,
791                         transfer_taxonomy );
792     }
793
794     public final static RIO executeAnalysis( final Phylogeny[] gene_trees,
795                                              final Phylogeny species_tree,
796                                              final ALGORITHM algorithm,
797                                              final REROOTING rerooting,
798                                              final String outgroup,
799                                              final int first,
800                                              final int last,
801                                              final boolean produce_log,
802                                              final boolean verbose,
803                                              final boolean transfer_taxonomy ) throws IOException, SDIException,
804                                              RIOException {
805         return new RIO( gene_trees,
806                         species_tree,
807                         algorithm,
808                         rerooting,
809                         outgroup,
810                         first,
811                         last,
812                         produce_log,
813                         verbose,
814                         transfer_taxonomy );
815     }
816
817     private final static void calculateOrthologTable( final Phylogeny g, final boolean sort, final int counter )
818             throws RIOException {
819         if ( counter == 0 ) {
820             final List<String> labels = new ArrayList<String>();
821             final Set<String> labels_set = new HashSet<String>();
822             for( final PhylogenyNode n : g.getExternalNodes() ) {
823                 final String label = obtainLabel( labels_set, n );
824                 labels_set.add( label );
825                 labels.add( label );
826             }
827             if ( sort ) {
828                 Collections.sort( labels );
829             }
830             _m = new IntMatrix( labels );
831         }
832         updateCounts( _m, counter, g );
833     }
834
835     private final static void checkPreconditions( final IteratingPhylogenyParser p,
836                                                   final Phylogeny species_tree,
837                                                   final REROOTING rerooting,
838                                                   final String outgroup,
839                                                   final int first,
840                                                   final int last ) throws RIOException, IOException {
841         final Phylogeny g0 = p.next();
842         if ( ( g0 == null ) || g0.isEmpty() ) {
843             throw new RIOException( "input file does not seem to contain any gene trees" );
844         }
845         if ( g0.getNumberOfExternalNodes() < 2 ) {
846             throw new RIOException( "input file does not seem to contain any useable gene trees" );
847         }
848         if ( !species_tree.isRooted() ) {
849             throw new RIOException( "species tree is not rooted" );
850         }
851         if ( !( ( last == DEFAULT_RANGE ) && ( first == DEFAULT_RANGE ) )
852                 && ( ( last < first ) || ( last < 0 ) || ( first < 0 ) ) ) {
853             throw new RIOException( "attempt to set range (0-based) of gene to analyze to: from " + first + " to "
854                     + last );
855         }
856         if ( ( rerooting == REROOTING.OUTGROUP ) && ForesterUtil.isEmpty( outgroup ) ) {
857             throw new RIOException( "outgroup not set for midpoint rooting" );
858         }
859         if ( ( rerooting != REROOTING.OUTGROUP ) && !ForesterUtil.isEmpty( outgroup ) ) {
860             throw new RIOException( "outgroup only used for midpoint rooting" );
861         }
862         if ( ( rerooting == REROOTING.MIDPOINT ) && ( PhylogenyMethods.calculateMaxDistanceToRoot( g0 ) <= 0 ) ) {
863             throw new RIOException( "attempt to use midpoint rooting on gene trees which seem to have no (positive) branch lengths (cladograms)" );
864         }
865         if ( rerooting == REROOTING.OUTGROUP ) {
866             try {
867                 g0.getNode( outgroup );
868             }
869             catch ( final IllegalArgumentException e ) {
870                 throw new RIOException( "cannot perform re-rooting by outgroup: " + e.getLocalizedMessage() );
871             }
872         }
873     }
874
875     private final static void checkPreconditions( final Phylogeny[] gene_trees,
876                                                   final Phylogeny species_tree,
877                                                   final REROOTING rerooting,
878                                                   final String outgroup,
879                                                   final int first,
880                                                   final int last ) throws RIOException {
881         if ( !species_tree.isRooted() ) {
882             throw new RIOException( "species tree is not rooted" );
883         }
884         if ( !( ( last == DEFAULT_RANGE ) && ( first == DEFAULT_RANGE ) )
885                 && ( ( last < first ) || ( last >= gene_trees.length ) || ( last < 0 ) || ( first < 0 ) ) ) {
886             throw new RIOException( "attempt to set range (0-based) of gene to analyze to: from " + first + " to "
887                     + last + " (out of " + gene_trees.length + ")" );
888         }
889         if ( ( rerooting == REROOTING.OUTGROUP ) && ForesterUtil.isEmpty( outgroup ) ) {
890             throw new RIOException( "outgroup not set for midpoint rooting" );
891         }
892         if ( ( rerooting != REROOTING.OUTGROUP ) && !ForesterUtil.isEmpty( outgroup ) ) {
893             throw new RIOException( "outgroup only used for midpoint rooting" );
894         }
895         if ( ( rerooting == REROOTING.MIDPOINT )
896                 && ( PhylogenyMethods.calculateMaxDistanceToRoot( gene_trees[ 0 ] ) <= 0 ) ) {
897             throw new RIOException( "attempt to use midpoint rooting on gene trees which seem to have no (positive) branch lengths (cladograms)" );
898         }
899         if ( rerooting == REROOTING.OUTGROUP ) {
900             try {
901                 gene_trees[ 0 ].getNode( outgroup );
902             }
903             catch ( final IllegalArgumentException e ) {
904                 throw new RIOException( "cannot perform re-rooting by outgroup: " + e.getLocalizedMessage() );
905             }
906         }
907     }
908
909     private final static String obtainLabel( final Set<String> labels_set, final PhylogenyNode n ) throws RIOException {
910         String label;
911         if ( n.getNodeData().isHasSequence() && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getName() ) ) {
912             label = n.getNodeData().getSequence().getName();
913         }
914         else if ( n.getNodeData().isHasSequence() && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getSymbol() ) ) {
915             label = n.getNodeData().getSequence().getSymbol();
916         }
917         else if ( n.getNodeData().isHasSequence()
918                 && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getGeneName() ) ) {
919             label = n.getNodeData().getSequence().getGeneName();
920         }
921         else if ( !ForesterUtil.isEmpty( n.getName() ) ) {
922             label = n.getName();
923         }
924         else {
925             throw new RIOException( "node " + n + " has no appropriate label" );
926         }
927         if ( labels_set.contains( label ) ) {
928             throw new RIOException( "label " + label + " is not unique" );
929         }
930         return label;
931     }
932
933     private final static Phylogeny[] parseGeneTrees( final File gene_trees_file ) throws FileNotFoundException,
934     IOException {
935         final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
936         final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
937         if ( p instanceof NHXParser ) {
938             final NHXParser nhx = ( NHXParser ) p;
939             nhx.setReplaceUnderscores( false );
940             nhx.setIgnoreQuotes( true );
941             nhx.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
942         }
943         else if ( p instanceof NexusPhylogeniesParser ) {
944             final NexusPhylogeniesParser nex = ( NexusPhylogeniesParser ) p;
945             nex.setReplaceUnderscores( false );
946             nex.setIgnoreQuotes( true );
947             nex.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
948         }
949         return factory.create( gene_trees_file, p );
950     }
951
952     private final static void removeSingleDescendentsNodes( final Phylogeny species_tree, final boolean verbose ) {
953         final int o = PhylogenyMethods.countNumberOfOneDescendantNodes( species_tree );
954         if ( o > 0 ) {
955             if ( verbose ) {
956                 System.out.println( "warning: species tree has " + o
957                                     + " internal nodes with only one descendent which are therefore going to be removed" );
958             }
959             PhylogenyMethods.deleteInternalNodesWithOnlyOneDescendent( species_tree );
960         }
961     }
962
963     private final static void updateCounts( final IntMatrix m, final int counter, final Phylogeny g )
964             throws RIOException {
965         PhylogenyMethods.preOrderReId( g );
966         final HashMap<String, PhylogenyNode> map = PhylogenyMethods.createNameToExtNodeMap( g );
967         for( int x = 0; x < m.size(); ++x ) {
968             final String mx = m.getLabel( x );
969             final PhylogenyNode nx = map.get( mx );
970             if ( nx == null ) {
971                 throw new RIOException( "node \"" + mx + "\" not present in gene tree #" + counter );
972             }
973             String my;
974             PhylogenyNode ny;
975             for( int y = 0; y < m.size(); ++y ) {
976                 my = m.getLabel( y );
977                 ny = map.get( my );
978                 if ( ny == null ) {
979                     throw new RIOException( "node \"" + my + "\" not present in gene tree #" + counter );
980                 }
981                 if ( !PhylogenyMethods.calculateLCAonTreeWithIdsInPreOrder( nx, ny ).isDuplication() ) {
982                     m.inreaseByOne( x, y );
983                 }
984             }
985         }
986     }
987
988     public enum REROOTING {
989         NONE, BY_ALGORITHM, MIDPOINT, OUTGROUP;
990     }
991 }