kepp only one dup min gt
[jalview.git] / forester / java / src / org / forester / rio / RIO.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // Copyright (C) 2000-2001 Washington University School of Medicine
8 // and Howard Hughes Medical Institute
9 // All rights reserved
10 //
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
15 //
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
20 //
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 //
25 // Contact: phylosoft @ gmail . com
26 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
27
28 package org.forester.rio;
29
30 import java.io.File;
31 import java.io.FileNotFoundException;
32 import java.io.IOException;
33 import java.text.DecimalFormat;
34 import java.util.ArrayList;
35 import java.util.Collections;
36 import java.util.HashMap;
37 import java.util.HashSet;
38 import java.util.List;
39 import java.util.Set;
40 import java.util.SortedSet;
41 import java.util.TreeSet;
42
43 import org.forester.datastructures.IntMatrix;
44 import org.forester.io.parsers.IteratingPhylogenyParser;
45 import org.forester.io.parsers.PhylogenyParser;
46 import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
47 import org.forester.io.parsers.nhx.NHXParser;
48 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
49 import org.forester.io.parsers.util.ParserUtils;
50 import org.forester.phylogeny.Phylogeny;
51 import org.forester.phylogeny.PhylogenyMethods;
52 import org.forester.phylogeny.PhylogenyNode;
53 import org.forester.phylogeny.data.Taxonomy;
54 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
55 import org.forester.phylogeny.factories.PhylogenyFactory;
56 import org.forester.sdi.GSDI;
57 import org.forester.sdi.GSDIR;
58 import org.forester.sdi.SDIException;
59 import org.forester.sdi.SDIR;
60 import org.forester.sdi.SDIutil;
61 import org.forester.sdi.SDIutil.ALGORITHM;
62 import org.forester.sdi.SDIutil.TaxonomyComparisonBase;
63 import org.forester.util.BasicDescriptiveStatistics;
64 import org.forester.util.ForesterUtil;
65
66 public final class RIO {
67
68     public static final int                  DEFAULT_RANGE = -1;
69     private static final int                 END_OF_GT     = Integer.MAX_VALUE;
70     private static IntMatrix                 _m;
71     private Phylogeny[]                      _analyzed_gene_trees;
72     private List<PhylogenyNode>              _removed_gene_tree_nodes;
73     private int                              _ext_nodes;
74     private int                              _int_nodes;
75     private TaxonomyComparisonBase           _gsdir_tax_comp_base;
76     private final StringBuilder              _log;
77     private final BasicDescriptiveStatistics _duplications_stats;
78     private final boolean                    _produce_log;
79     private final boolean                    _verbose;
80     private final REROOTING                  _rerooting;
81
82     private RIO( final Phylogeny[] gene_trees,
83                  final Phylogeny species_tree,
84                  final ALGORITHM algorithm,
85                  final REROOTING rerooting,
86                  final String outgroup,
87                  int first,
88                  int last,
89                  final boolean produce_log,
90                  final boolean verbose ) throws IOException, SDIException, RIOException {
91         if ( ( last == DEFAULT_RANGE ) && ( first >= 0 ) ) {
92             last = gene_trees.length - 1;
93         }
94         else if ( ( first == DEFAULT_RANGE ) && ( last >= 0 ) ) {
95             first = 0;
96         }
97         removeSingleDescendentsNodes( species_tree, verbose );
98         checkPreconditions( gene_trees, species_tree, rerooting, outgroup, first, last );
99         _produce_log = produce_log;
100         _verbose = verbose;
101         _rerooting = rerooting;
102         _ext_nodes = -1;
103         _int_nodes = -1;
104         _log = new StringBuilder();
105         _gsdir_tax_comp_base = null;
106         _analyzed_gene_trees = null;
107         _removed_gene_tree_nodes = null;
108         _duplications_stats = new BasicDescriptiveStatistics();
109         inferOrthologs( gene_trees, species_tree, algorithm, outgroup, first, last );
110     }
111
112     private RIO( final IteratingPhylogenyParser p,
113                  final Phylogeny species_tree,
114                  final ALGORITHM algorithm,
115                  final REROOTING rerooting,
116                  final String outgroup,
117                  int first,
118                  int last,
119                  final boolean produce_log,
120                  final boolean verbose ) throws IOException, SDIException, RIOException {
121         if ( ( last == DEFAULT_RANGE ) && ( first >= 0 ) ) {
122             last = END_OF_GT;
123         }
124         else if ( ( first == DEFAULT_RANGE ) && ( last >= 0 ) ) {
125             first = 0;
126         }
127         removeSingleDescendentsNodes( species_tree, verbose );
128         p.reset();
129         checkPreconditions( p, species_tree, rerooting, outgroup, first, last );
130         _produce_log = produce_log;
131         _verbose = verbose;
132         _rerooting = rerooting;
133         _ext_nodes = -1;
134         _int_nodes = -1;
135         _log = new StringBuilder();
136         _gsdir_tax_comp_base = null;
137         _analyzed_gene_trees = null;
138         _removed_gene_tree_nodes = null;
139         _duplications_stats = new BasicDescriptiveStatistics();
140         p.reset();
141         inferOrthologs( p, species_tree, algorithm, outgroup, first, last );
142     }
143
144     public final Phylogeny[] getAnalyzedGeneTrees() {
145         return _analyzed_gene_trees;
146     }
147
148     public final BasicDescriptiveStatistics getDuplicationsStatistics() {
149         return _duplications_stats;
150     }
151
152     /**
153      * Returns the numbers of number of ext nodes in gene trees analyzed (after
154      * stripping).
155      * 
156      * @return number of ext nodes in gene trees analyzed (after stripping)
157      */
158     public final int getExtNodesOfAnalyzedGeneTrees() {
159         return _ext_nodes;
160     }
161
162     /**
163      * Returns the numbers of number of int nodes in gene trees analyzed (after
164      * stripping).
165      * 
166      * @return number of int nodes in gene trees analyzed (after stripping)
167      */
168     public final int getIntNodesOfAnalyzedGeneTrees() {
169         return _int_nodes;
170     }
171
172     public final TaxonomyComparisonBase getGSDIRtaxCompBase() {
173         return _gsdir_tax_comp_base;
174     }
175
176     public final StringBuilder getLog() {
177         return _log;
178     }
179
180     public final List<PhylogenyNode> getRemovedGeneTreeNodes() {
181         return _removed_gene_tree_nodes;
182     }
183
184     private final void inferOrthologs( final Phylogeny[] gene_trees,
185                                        final Phylogeny species_tree,
186                                        final ALGORITHM algorithm,
187                                        final String outgroup,
188                                        final int first,
189                                        final int last ) throws SDIException, RIOException, FileNotFoundException,
190             IOException {
191         if ( algorithm == ALGORITHM.SDIR ) {
192             // Removes from species_tree all species not found in gene_tree.
193             PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gene_trees[ 0 ], species_tree );
194             if ( species_tree.isEmpty() ) {
195                 throw new RIOException( "failed to establish species based mapping between gene and species trees" );
196             }
197         }
198         final Phylogeny[] my_gene_trees;
199         if ( ( first >= 0 ) && ( last >= first ) && ( last < gene_trees.length ) ) {
200             my_gene_trees = new Phylogeny[ ( 1 + last ) - first ];
201             int c = 0;
202             for( int i = first; i <= last; ++i ) {
203                 my_gene_trees[ c++ ] = gene_trees[ i ];
204             }
205         }
206         else {
207             my_gene_trees = gene_trees;
208         }
209         if ( log() ) {
210             preLog( gene_trees.length, species_tree, algorithm, outgroup );
211         }
212         if ( _verbose && ( my_gene_trees.length >= 4 ) ) {
213             System.out.println();
214         }
215         _analyzed_gene_trees = new Phylogeny[ my_gene_trees.length ];
216         int gene_tree_ext_nodes = 0;
217         for( int i = 0; i < my_gene_trees.length; ++i ) {
218             final Phylogeny gt = my_gene_trees[ i ];
219             if ( _verbose && ( my_gene_trees.length > 4 ) ) {
220                 ForesterUtil.updateProgress( ( ( double ) i ) / my_gene_trees.length );
221             }
222             if ( i == 0 ) {
223                 gene_tree_ext_nodes = gt.getNumberOfExternalNodes();
224             }
225             else if ( gene_tree_ext_nodes != gt.getNumberOfExternalNodes() ) {
226                 throw new RIOException( "gene tree #" + i + " has a different number of external nodes ("
227                         + gt.getNumberOfExternalNodes() + ") than the preceding gene tree(s) (" + gene_tree_ext_nodes
228                         + ")" );
229             }
230             if ( algorithm == ALGORITHM.SDIR ) {
231                 // Removes from gene_tree all species not found in species_tree.
232                 PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gt );
233                 if ( gt.isEmpty() ) {
234                     throw new RIOException( "failed to establish species based mapping between gene and species trees" );
235                 }
236             }
237             _analyzed_gene_trees[ i ] = performOrthologInference( gt, species_tree, algorithm, outgroup, i );
238         }
239         if ( log() ) {
240             postLog( species_tree, first, last );
241         }
242         if ( _verbose && ( my_gene_trees.length > 4 ) ) {
243             System.out.println();
244             System.out.println();
245         }
246     }
247
248     private final void inferOrthologs( final IteratingPhylogenyParser parser,
249                                        final Phylogeny species_tree,
250                                        final ALGORITHM algorithm,
251                                        final String outgroup,
252                                        int first,
253                                        final int last ) throws SDIException, RIOException, FileNotFoundException,
254             IOException {
255         if ( !parser.hasNext() ) {
256             throw new RIOException( "no gene trees to analyze" );
257         }
258         if ( log() ) {
259             preLog( -1, species_tree, algorithm, outgroup );
260         }
261         if ( _verbose ) {
262             System.out.println();
263         }
264         final DecimalFormat pf = new java.text.DecimalFormat( "000" );
265         int gene_tree_ext_nodes = 0;
266         int i = 0;
267         int counter = 0;
268         final boolean no_range = ( first < 0 ) || ( last < first );
269         while ( parser.hasNext() ) {
270             final Phylogeny gt = parser.next();
271             if ( no_range || ( ( i >= first ) && ( i <= last ) ) ) {
272                 if ( _verbose ) {
273                     ForesterUtil.updateProgress( i, pf );
274                 }
275                 if ( counter == 0 ) {
276                     if ( algorithm == ALGORITHM.SDIR ) {
277                         // Removes from species_tree all species not found in gene_tree.
278                         PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gt, species_tree );
279                         if ( species_tree.isEmpty() ) {
280                             throw new RIOException( "failed to establish species based mapping between gene and species trees" );
281                         }
282                     }
283                     gene_tree_ext_nodes = gt.getNumberOfExternalNodes();
284                 }
285                 else if ( gene_tree_ext_nodes != gt.getNumberOfExternalNodes() ) {
286                     throw new RIOException( "gene tree #" + i + " has a different number of external nodes ("
287                             + gt.getNumberOfExternalNodes() + ") than the preceding gene tree(s) ("
288                             + gene_tree_ext_nodes + ")" );
289                 }
290                 if ( algorithm == ALGORITHM.SDIR ) {
291                     // Removes from gene_tree all species not found in species_tree.
292                     PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gt );
293                     if ( gt.isEmpty() ) {
294                         throw new RIOException( "failed to establish species based mapping between gene and species trees" );
295                     }
296                 }
297                 final Phylogeny analyzed_gt = performOrthologInference( gt, species_tree, algorithm, outgroup, counter );
298                 RIO.calculateOrthologTable( analyzed_gt, true, counter );
299                 ++counter;
300             }
301             ++i;
302         }
303         if ( ( first >= 0 ) && ( counter == 0 ) && ( i > 0 ) ) {
304             throw new RIOException( "attempt to analyze first gene tree #" + first + " in a set of " + i );
305         }
306         if ( no_range ) {
307             first = 0;
308         }
309         if ( log() ) {
310             postLog( species_tree, first, first + counter - 1 );
311         }
312         if ( _verbose ) {
313             System.out.println();
314             System.out.println();
315         }
316     }
317
318     private final boolean log() {
319         return _produce_log;
320     }
321
322     private final void log( final String s ) {
323         _log.append( s );
324         _log.append( ForesterUtil.LINE_SEPARATOR );
325     }
326
327     private final void logRemovedGeneTreeNodes() {
328         log( "Species stripped from gene trees:" );
329         final SortedSet<String> rn = new TreeSet<String>();
330         for( final PhylogenyNode n : getRemovedGeneTreeNodes() ) {
331             final Taxonomy t = n.getNodeData().getTaxonomy();
332             switch ( getGSDIRtaxCompBase() ) {
333                 case CODE: {
334                     rn.add( t.getTaxonomyCode() );
335                     break;
336                 }
337                 case ID: {
338                     rn.add( t.getIdentifier().toString() );
339                     break;
340                 }
341                 case SCIENTIFIC_NAME: {
342                     rn.add( t.getScientificName() );
343                     break;
344                 }
345             }
346         }
347         for( final String s : rn ) {
348             log( s );
349         }
350         log( "" );
351     }
352
353     private final Phylogeny performOrthologInference( final Phylogeny gene_tree,
354                                                       final Phylogeny species_tree,
355                                                       final ALGORITHM algorithm,
356                                                       final String outgroup,
357                                                       final int i ) throws SDIException, RIOException {
358         final Phylogeny assigned_tree;
359         switch ( algorithm ) {
360             case SDIR: {
361                 assigned_tree = performOrthologInferenceBySDI( gene_tree, species_tree );
362                 break;
363             }
364             case GSDIR: {
365                 assigned_tree = performOrthologInferenceByGSDI( gene_tree, species_tree, outgroup, i );
366                 break;
367             }
368             default: {
369                 throw new IllegalArgumentException( "illegal algorithm: " + algorithm );
370             }
371         }
372         if ( i == 0 ) {
373             _ext_nodes = assigned_tree.getNumberOfExternalNodes();
374             _int_nodes = assigned_tree.getNumberOfInternalNodes();
375         }
376         else if ( _ext_nodes != assigned_tree.getNumberOfExternalNodes() ) {
377             throw new RIOException( "after stripping gene tree #" + i + " has a different number of external nodes ("
378                     + assigned_tree.getNumberOfExternalNodes() + ") than the preceding gene tree(s) (" + _ext_nodes
379                     + ")" );
380         }
381         return assigned_tree;
382     }
383
384     private final Phylogeny performOrthologInferenceByGSDI( final Phylogeny gene_tree,
385                                                             final Phylogeny species_tree,
386                                                             final String outgroup,
387                                                             final int i ) throws SDIException, RIOException {
388         final Phylogeny assigned_tree;
389         if ( _rerooting == REROOTING.BY_ALGORITHM ) {
390             final GSDIR gsdir = new GSDIR( gene_tree, species_tree, true, i == 0 );
391             assigned_tree = gsdir.getMinDuplicationsSumGeneTree();
392             if ( i == 0 ) {
393                 _removed_gene_tree_nodes = gsdir.getStrippedExternalGeneTreeNodes();
394                 for( final PhylogenyNode r : _removed_gene_tree_nodes ) {
395                     if ( !r.getNodeData().isHasTaxonomy() ) {
396                         throw new RIOException( "node with no (appropriate) taxonomic information found in gene tree #"
397                                 + i + ": " + r.toString() );
398                     }
399                 }
400             }
401             if ( i == 0 ) {
402                 _gsdir_tax_comp_base = gsdir.getTaxCompBase();
403             }
404             _duplications_stats.addValue( gsdir.getMinDuplicationsSum() );
405         }
406         else {
407             if ( _rerooting == REROOTING.MIDPOINT ) {
408                 PhylogenyMethods.midpointRoot( gene_tree );
409             }
410             else if ( _rerooting == REROOTING.OUTGROUP ) {
411                 final PhylogenyNode n = gene_tree.getNode( outgroup );
412                 gene_tree.reRoot( n );
413             }
414             final GSDI gsdi = new GSDI( gene_tree, species_tree, true, true, true );
415             _removed_gene_tree_nodes = gsdi.getStrippedExternalGeneTreeNodes();
416             for( final PhylogenyNode r : _removed_gene_tree_nodes ) {
417                 if ( !r.getNodeData().isHasTaxonomy() ) {
418                     throw new RIOException( "node with no (appropriate) taxonomic information found in gene tree #" + i
419                             + ": " + r.toString() );
420                 }
421             }
422             assigned_tree = gene_tree;
423             if ( i == 0 ) {
424                 _gsdir_tax_comp_base = gsdi.getTaxCompBase();
425             }
426             _duplications_stats.addValue( gsdi.getDuplicationsSum() );
427         }
428         return assigned_tree;
429     }
430
431     private final Phylogeny performOrthologInferenceBySDI( final Phylogeny gene_tree, final Phylogeny species_tree )
432             throws SDIException {
433         final SDIR sdir = new SDIR();
434         return sdir.infer( gene_tree, species_tree, false, true, true, true, 1 )[ 0 ];
435     }
436
437     private final void postLog( final Phylogeny species_tree, final int first, final int last ) {
438         log( "" );
439         if ( ( getRemovedGeneTreeNodes() != null ) && ( getRemovedGeneTreeNodes().size() > 0 ) ) {
440             logRemovedGeneTreeNodes();
441         }
442         log( "Species tree external nodes (after stripping)   : " + species_tree.getNumberOfExternalNodes() );
443         log( "Species tree polytomies (after stripping)       : "
444                 + PhylogenyMethods.countNumberOfPolytomies( species_tree ) );
445         log( "Taxonomy linking based on                       : " + getGSDIRtaxCompBase() );
446         final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.#" );
447         if ( ( first >= 0 ) && ( last >= 0 ) ) {
448             log( "Gene trees analyzed range                       : " + first + "-" + last );
449         }
450         log( "Gene trees analyzed                             : " + _duplications_stats.getN() );
451         log( "Mean number of duplications                     : " + df.format( _duplications_stats.arithmeticMean() )
452                 + " (sd: " + df.format( _duplications_stats.sampleStandardDeviation() ) + ")" + " ("
453                 + df.format( ( 100.0 * _duplications_stats.arithmeticMean() ) / getIntNodesOfAnalyzedGeneTrees() )
454                 + "%)" );
455         if ( _duplications_stats.getN() > 3 ) {
456             log( "Median number of duplications                   : " + df.format( _duplications_stats.median() )
457                     + " (" + df.format( ( 100.0 * _duplications_stats.median() ) / getIntNodesOfAnalyzedGeneTrees() )
458                     + "%)" );
459         }
460         log( "Minimum duplications                            : " + ( int ) _duplications_stats.getMin() + " ("
461                 + df.format( ( 100.0 * _duplications_stats.getMin() ) / getIntNodesOfAnalyzedGeneTrees() ) + "%)" );
462         log( "Maximum duplications                            : " + ( int ) _duplications_stats.getMax() + " ("
463                 + df.format( ( 100.0 * _duplications_stats.getMax() ) / getIntNodesOfAnalyzedGeneTrees() ) + "%)" );
464         log( "Gene tree internal nodes                        : " + getIntNodesOfAnalyzedGeneTrees() );
465         log( "Gene tree external nodes                        : " + getExtNodesOfAnalyzedGeneTrees() );
466     }
467
468     private final void preLog( final int gene_trees,
469                                final Phylogeny species_tree,
470                                final ALGORITHM algorithm,
471                                final String outgroup ) {
472         if ( gene_trees > 0 ) {
473             log( "Number of gene trees (total)                    : " + gene_trees );
474         }
475         log( "Algorithm                                       : " + algorithm );
476         log( "Species tree external nodes (prior to stripping): " + species_tree.getNumberOfExternalNodes() );
477         log( "Species tree polytomies (prior to stripping)    : "
478                 + PhylogenyMethods.countNumberOfPolytomies( species_tree ) );
479         String rs = "";
480         switch ( _rerooting ) {
481             case BY_ALGORITHM: {
482                 rs = "minimizing duplications";
483                 break;
484             }
485             case MIDPOINT: {
486                 rs = "midpoint";
487                 break;
488             }
489             case OUTGROUP: {
490                 rs = "outgroup: " + outgroup;
491                 break;
492             }
493             case NONE: {
494                 rs = "none";
495                 break;
496             }
497         }
498         log( "Re-rooting                                      : " + rs );
499     }
500
501     public final IntMatrix getOrthologTable() {
502         return _m;
503     }
504
505     private final static void calculateOrthologTable( final Phylogeny g, final boolean sort, final int counter )
506             throws RIOException {
507         final List<String> labels = new ArrayList<String>();
508         final Set<String> labels_set = new HashSet<String>();
509         if ( counter == 0 ) {
510             for( final PhylogenyNode n : g.getExternalNodes() ) {
511                 final String label = obtainLabel( labels_set, n );
512                 labels_set.add( label );
513                 labels.add( label );
514             }
515             if ( sort ) {
516                 Collections.sort( labels );
517             }
518             _m = new IntMatrix( labels );
519         }
520         updateCounts( _m, counter, g );
521     }
522
523     private final static String obtainLabel( final Set<String> labels_set, final PhylogenyNode n ) throws RIOException {
524         String label;
525         if ( n.getNodeData().isHasSequence() && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getName() ) ) {
526             label = n.getNodeData().getSequence().getName();
527         }
528         else if ( n.getNodeData().isHasSequence() && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getSymbol() ) ) {
529             label = n.getNodeData().getSequence().getSymbol();
530         }
531         else if ( !ForesterUtil.isEmpty( n.getName() ) ) {
532             label = n.getName();
533         }
534         else {
535             throw new RIOException( "node " + n + " has no appropriate label" );
536         }
537         if ( labels_set.contains( label ) ) {
538             throw new RIOException( "label " + label + " is not unique" );
539         }
540         return label;
541     }
542
543     private final static void updateCounts( final IntMatrix m, final int counter, final Phylogeny g )
544             throws RIOException {
545         PhylogenyMethods.preOrderReId( g );
546         final HashMap<String, PhylogenyNode> map = PhylogenyMethods.createNameToExtNodeMap( g );
547         for( int x = 0; x < m.size(); ++x ) {
548             final String mx = m.getLabel( x );
549             final PhylogenyNode nx = map.get( mx );
550             if ( nx == null ) {
551                 throw new RIOException( "node \"" + mx + "\" not present in gene tree #" + counter );
552             }
553             String my;
554             PhylogenyNode ny;
555             for( int y = 0; y < m.size(); ++y ) {
556                 my = m.getLabel( y );
557                 ny = map.get( my );
558                 if ( ny == null ) {
559                     throw new RIOException( "node \"" + my + "\" not present in gene tree #" + counter );
560                 }
561                 if ( !PhylogenyMethods.calculateLCAonTreeWithIdsInPreOrder( nx, ny ).isDuplication() ) {
562                     m.inreaseByOne( x, y );
563                 }
564             }
565         }
566     }
567
568     public final static IntMatrix calculateOrthologTable( final Phylogeny[] analyzed_gene_trees, final boolean sort )
569             throws RIOException {
570         final List<String> labels = new ArrayList<String>();
571         final Set<String> labels_set = new HashSet<String>();
572         for( final PhylogenyNode n : analyzed_gene_trees[ 0 ].getExternalNodes() ) {
573             final String label = obtainLabel( labels_set, n );
574             labels_set.add( label );
575             labels.add( label );
576         }
577         if ( sort ) {
578             Collections.sort( labels );
579         }
580         final IntMatrix m = new IntMatrix( labels );
581         int counter = 0;
582         for( final Phylogeny gt : analyzed_gene_trees ) {
583             counter++;
584             updateCounts( m, counter, gt );
585         }
586         return m;
587     }
588
589     public final static RIO executeAnalysis( final File gene_trees_file,
590                                              final File species_tree_file,
591                                              final ALGORITHM algorithm,
592                                              final REROOTING rerooting,
593                                              final String outgroup,
594                                              final int first,
595                                              final int last,
596                                              final boolean produce_log,
597                                              final boolean verbose ) throws IOException, SDIException, RIOException {
598         final Phylogeny[] gene_trees = parseGeneTrees( gene_trees_file );
599         if ( gene_trees.length < 1 ) {
600             throw new RIOException( "\"" + gene_trees_file + "\" is devoid of appropriate gene trees" );
601         }
602         final Phylogeny species_tree = SDIutil.parseSpeciesTree( gene_trees[ 0 ],
603                                                                  species_tree_file,
604                                                                  false,
605                                                                  true,
606                                                                  TAXONOMY_EXTRACTION.NO );
607         return new RIO( gene_trees, species_tree, algorithm, rerooting, outgroup, first, last, produce_log, verbose );
608     }
609
610     public final static RIO executeAnalysis( final IteratingPhylogenyParser p,
611                                              final File species_tree_file,
612                                              final ALGORITHM algorithm,
613                                              final REROOTING rerooting,
614                                              final String outgroup,
615                                              final int first,
616                                              final int last,
617                                              final boolean produce_log,
618                                              final boolean verbose ) throws IOException, SDIException, RIOException {
619         final Phylogeny g0 = p.next();
620         if ( ( g0 == null ) || g0.isEmpty() || ( g0.getNumberOfExternalNodes() < 2 ) ) {
621             throw new RIOException( "input file does not seem to contain any gene trees" );
622         }
623         final Phylogeny species_tree = SDIutil.parseSpeciesTree( g0,
624                                                                  species_tree_file,
625                                                                  false,
626                                                                  true,
627                                                                  TAXONOMY_EXTRACTION.NO );
628         p.reset();
629         return new RIO( p, species_tree, algorithm, rerooting, outgroup, first, last, produce_log, verbose );
630     }
631
632     public final static RIO executeAnalysis( final File gene_trees_file,
633                                              final Phylogeny species_tree,
634                                              final ALGORITHM algorithm,
635                                              final REROOTING rerooting,
636                                              final String outgroup,
637                                              final boolean produce_log,
638                                              final boolean verbose ) throws IOException, SDIException, RIOException {
639         return new RIO( parseGeneTrees( gene_trees_file ),
640                         species_tree,
641                         algorithm,
642                         rerooting,
643                         outgroup,
644                         DEFAULT_RANGE,
645                         DEFAULT_RANGE,
646                         produce_log,
647                         verbose );
648     }
649
650     public final static RIO executeAnalysis( final IteratingPhylogenyParser p,
651                                              final Phylogeny species_tree,
652                                              final ALGORITHM algorithm,
653                                              final REROOTING rerooting,
654                                              final String outgroup,
655                                              final boolean produce_log,
656                                              final boolean verbose ) throws IOException, SDIException, RIOException {
657         return new RIO( p,
658                         species_tree,
659                         algorithm,
660                         rerooting,
661                         outgroup,
662                         DEFAULT_RANGE,
663                         DEFAULT_RANGE,
664                         produce_log,
665                         verbose );
666     }
667
668     public final static RIO executeAnalysis( final File gene_trees_file,
669                                              final Phylogeny species_tree,
670                                              final ALGORITHM algorithm,
671                                              final REROOTING rerooting,
672                                              final String outgroup,
673                                              final int first,
674                                              final int last,
675                                              final boolean produce_log,
676                                              final boolean verbose ) throws IOException, SDIException, RIOException {
677         return new RIO( parseGeneTrees( gene_trees_file ),
678                         species_tree,
679                         algorithm,
680                         rerooting,
681                         outgroup,
682                         first,
683                         last,
684                         produce_log,
685                         verbose );
686     }
687
688     public final static RIO executeAnalysis( final Phylogeny[] gene_trees, final Phylogeny species_tree )
689             throws IOException, SDIException, RIOException {
690         return new RIO( gene_trees,
691                         species_tree,
692                         ALGORITHM.GSDIR,
693                         REROOTING.BY_ALGORITHM,
694                         null,
695                         DEFAULT_RANGE,
696                         DEFAULT_RANGE,
697                         false,
698                         false );
699     }
700
701     public final static RIO executeAnalysis( final Phylogeny[] gene_trees,
702                                              final Phylogeny species_tree,
703                                              final ALGORITHM algorithm,
704                                              final REROOTING rerooting,
705                                              final String outgroup,
706                                              final boolean produce_log,
707                                              final boolean verbose ) throws IOException, SDIException, RIOException {
708         return new RIO( gene_trees,
709                         species_tree,
710                         algorithm,
711                         rerooting,
712                         outgroup,
713                         DEFAULT_RANGE,
714                         DEFAULT_RANGE,
715                         produce_log,
716                         verbose );
717     }
718
719     public final static RIO executeAnalysis( final Phylogeny[] gene_trees,
720                                              final Phylogeny species_tree,
721                                              final ALGORITHM algorithm,
722                                              final REROOTING rerooting,
723                                              final String outgroup,
724                                              final int first,
725                                              final int last,
726                                              final boolean produce_log,
727                                              final boolean verbose ) throws IOException, SDIException, RIOException {
728         return new RIO( gene_trees, species_tree, algorithm, rerooting, outgroup, first, last, produce_log, verbose );
729     }
730
731     public final static RIO executeAnalysis( final IteratingPhylogenyParser p,
732                                              final Phylogeny species_tree,
733                                              final ALGORITHM algorithm,
734                                              final REROOTING rerooting,
735                                              final String outgroup,
736                                              final int first,
737                                              final int last,
738                                              final boolean produce_log,
739                                              final boolean verbose ) throws IOException, SDIException, RIOException {
740         return new RIO( p, species_tree, algorithm, rerooting, outgroup, first, last, produce_log, verbose );
741     }
742
743     private final static void checkPreconditions( final Phylogeny[] gene_trees,
744                                                   final Phylogeny species_tree,
745                                                   final REROOTING rerooting,
746                                                   final String outgroup,
747                                                   final int first,
748                                                   final int last ) throws RIOException {
749         if ( !species_tree.isRooted() ) {
750             throw new RIOException( "species tree is not rooted" );
751         }
752         if ( !( ( last == DEFAULT_RANGE ) && ( first == DEFAULT_RANGE ) )
753                 && ( ( last < first ) || ( last >= gene_trees.length ) || ( last < 0 ) || ( first < 0 ) ) ) {
754             throw new RIOException( "attempt to set range (0-based) of gene to analyze to: from " + first + " to "
755                     + last + " (out of " + gene_trees.length + ")" );
756         }
757         if ( ( rerooting == REROOTING.OUTGROUP ) && ForesterUtil.isEmpty( outgroup ) ) {
758             throw new RIOException( "outgroup not set for midpoint rooting" );
759         }
760         if ( ( rerooting != REROOTING.OUTGROUP ) && !ForesterUtil.isEmpty( outgroup ) ) {
761             throw new RIOException( "outgroup only used for midpoint rooting" );
762         }
763         if ( ( rerooting == REROOTING.MIDPOINT )
764                 && ( PhylogenyMethods.calculateMaxDistanceToRoot( gene_trees[ 0 ] ) <= 0 ) ) {
765             throw new RIOException( "attempt to use midpoint rooting on gene trees which seem to have no (positive) branch lengths (cladograms)" );
766         }
767         if ( rerooting == REROOTING.OUTGROUP ) {
768             try {
769                 gene_trees[ 0 ].getNode( outgroup );
770             }
771             catch ( final IllegalArgumentException e ) {
772                 throw new RIOException( "cannot perform re-rooting by outgroup: " + e.getLocalizedMessage() );
773             }
774         }
775     }
776
777     private final static void checkPreconditions( final IteratingPhylogenyParser p,
778                                                   final Phylogeny species_tree,
779                                                   final REROOTING rerooting,
780                                                   final String outgroup,
781                                                   final int first,
782                                                   final int last ) throws RIOException, IOException {
783         final Phylogeny g0 = p.next();
784         if ( ( g0 == null ) || g0.isEmpty() ) {
785             throw new RIOException( "input file does not seem to contain any gene trees" );
786         }
787         if ( g0.getNumberOfExternalNodes() < 2 ) {
788             throw new RIOException( "input file does not seem to contain any useable gene trees" );
789         }
790         if ( !species_tree.isRooted() ) {
791             throw new RIOException( "species tree is not rooted" );
792         }
793         if ( !( ( last == DEFAULT_RANGE ) && ( first == DEFAULT_RANGE ) )
794                 && ( ( last < first ) || ( last < 0 ) || ( first < 0 ) ) ) {
795             throw new RIOException( "attempt to set range (0-based) of gene to analyze to: from " + first + " to "
796                     + last );
797         }
798         if ( ( rerooting == REROOTING.OUTGROUP ) && ForesterUtil.isEmpty( outgroup ) ) {
799             throw new RIOException( "outgroup not set for midpoint rooting" );
800         }
801         if ( ( rerooting != REROOTING.OUTGROUP ) && !ForesterUtil.isEmpty( outgroup ) ) {
802             throw new RIOException( "outgroup only used for midpoint rooting" );
803         }
804         if ( ( rerooting == REROOTING.MIDPOINT ) && ( PhylogenyMethods.calculateMaxDistanceToRoot( g0 ) <= 0 ) ) {
805             throw new RIOException( "attempt to use midpoint rooting on gene trees which seem to have no (positive) branch lengths (cladograms)" );
806         }
807         if ( rerooting == REROOTING.OUTGROUP ) {
808             try {
809                 g0.getNode( outgroup );
810             }
811             catch ( final IllegalArgumentException e ) {
812                 throw new RIOException( "cannot perform re-rooting by outgroup: " + e.getLocalizedMessage() );
813             }
814         }
815     }
816
817     private final static Phylogeny[] parseGeneTrees( final File gene_trees_file ) throws FileNotFoundException,
818             IOException {
819         final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
820         final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
821         if ( p instanceof NHXParser ) {
822             final NHXParser nhx = ( NHXParser ) p;
823             nhx.setReplaceUnderscores( false );
824             nhx.setIgnoreQuotes( true );
825             nhx.setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.YES );
826         }
827         else if ( p instanceof NexusPhylogeniesParser ) {
828             final NexusPhylogeniesParser nex = ( NexusPhylogeniesParser ) p;
829             nex.setReplaceUnderscores( false );
830             nex.setIgnoreQuotes( true );
831             nex.setTaxonomyExtraction( TAXONOMY_EXTRACTION.YES );
832         }
833         return factory.create( gene_trees_file, p );
834     }
835
836     private final static void removeSingleDescendentsNodes( final Phylogeny species_tree, final boolean verbose ) {
837         final int o = PhylogenyMethods.countNumberOfOneDescendantNodes( species_tree );
838         if ( o > 0 ) {
839             if ( verbose ) {
840                 System.out.println( "warning: species tree has " + o
841                         + " internal nodes with only one descendent which are therefore going to be removed" );
842             }
843             PhylogenyMethods.deleteInternalNodesWithOnlyOneDescendent( species_tree );
844         }
845     }
846
847     public enum REROOTING {
848         NONE, BY_ALGORITHM, MIDPOINT, OUTGROUP;
849     }
850 }