in progress...
[jalview.git] / forester / java / src / org / forester / rio / RIO.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // Copyright (C) 2000-2001 Washington University School of Medicine
8 // and Howard Hughes Medical Institute
9 // All rights reserved
10 //
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
15 //
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
20 //
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 //
25 // Contact: phylosoft @ gmail . com
26 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
27
28 package org.forester.rio;
29
30 import java.io.File;
31 import java.io.FileNotFoundException;
32 import java.io.IOException;
33 import java.text.DecimalFormat;
34 import java.util.ArrayList;
35 import java.util.Collections;
36 import java.util.HashMap;
37 import java.util.HashSet;
38 import java.util.List;
39 import java.util.Set;
40 import java.util.SortedSet;
41 import java.util.TreeSet;
42
43 import org.forester.datastructures.IntMatrix;
44 import org.forester.io.parsers.IteratingPhylogenyParser;
45 import org.forester.io.parsers.PhylogenyParser;
46 import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
47 import org.forester.io.parsers.nhx.NHXParser;
48 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
49 import org.forester.io.parsers.util.ParserUtils;
50 import org.forester.phylogeny.Phylogeny;
51 import org.forester.phylogeny.PhylogenyMethods;
52 import org.forester.phylogeny.PhylogenyNode;
53 import org.forester.phylogeny.data.Taxonomy;
54 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
55 import org.forester.phylogeny.factories.PhylogenyFactory;
56 import org.forester.sdi.GSDI;
57 import org.forester.sdi.GSDIR;
58 import org.forester.sdi.SDIException;
59 import org.forester.sdi.SDIR;
60 import org.forester.sdi.SDIutil;
61 import org.forester.sdi.SDIutil.ALGORITHM;
62 import org.forester.sdi.SDIutil.TaxonomyComparisonBase;
63 import org.forester.util.BasicDescriptiveStatistics;
64 import org.forester.util.ForesterUtil;
65
66 public final class RIO {
67
68     public static final int                  DEFAULT_RANGE = -1;
69     private static final int                 END_OF_GT     = Integer.MAX_VALUE;
70     private static IntMatrix                 _m;
71     private Phylogeny[]                      _analyzed_gene_trees;
72     private List<PhylogenyNode>              _removed_gene_tree_nodes;
73     private int                              _ext_nodes;
74     private int                              _int_nodes;
75     private TaxonomyComparisonBase           _gsdir_tax_comp_base;
76     private final StringBuilder              _log;
77     private final BasicDescriptiveStatistics _duplications_stats;
78     private final boolean                    _produce_log;
79     private final boolean                    _verbose;
80     private final REROOTING                  _rerooting;
81     private final Phylogeny                  _species_tree;
82     private Phylogeny                        _min_dub_gene_tree;
83
84     private RIO( final IteratingPhylogenyParser p,
85                  final Phylogeny species_tree,
86                  final ALGORITHM algorithm,
87                  final REROOTING rerooting,
88                  final String outgroup,
89                  int first,
90                  int last,
91                  final boolean produce_log,
92                  final boolean verbose,
93                  final boolean transfer_taxonomy ) throws IOException, SDIException, RIOException {
94         if ( ( last == DEFAULT_RANGE ) && ( first >= 0 ) ) {
95             last = END_OF_GT;
96         }
97         else if ( ( first == DEFAULT_RANGE ) && ( last >= 0 ) ) {
98             first = 0;
99         }
100         removeSingleDescendentsNodes( species_tree, verbose );
101         p.reset();
102         checkPreconditions( p, species_tree, rerooting, outgroup, first, last );
103         _produce_log = produce_log;
104         _verbose = verbose;
105         _rerooting = rerooting;
106         _ext_nodes = -1;
107         _int_nodes = -1;
108         _log = new StringBuilder();
109         _gsdir_tax_comp_base = null;
110         _analyzed_gene_trees = null;
111         _removed_gene_tree_nodes = null;
112         _duplications_stats = new BasicDescriptiveStatistics();
113         p.reset();
114         inferOrthologs( p, species_tree, algorithm, outgroup, first, last, transfer_taxonomy );
115         _species_tree = species_tree;
116     }
117
118     private RIO( final Phylogeny[] gene_trees,
119                  final Phylogeny species_tree,
120                  final ALGORITHM algorithm,
121                  final REROOTING rerooting,
122                  final String outgroup,
123                  int first,
124                  int last,
125                  final boolean produce_log,
126                  final boolean verbose,
127                  final boolean transfer_taxonomy ) throws IOException, SDIException, RIOException {
128         if ( ( last == DEFAULT_RANGE ) && ( first >= 0 ) ) {
129             last = gene_trees.length - 1;
130         }
131         else if ( ( first == DEFAULT_RANGE ) && ( last >= 0 ) ) {
132             first = 0;
133         }
134         removeSingleDescendentsNodes( species_tree, verbose );
135         checkPreconditions( gene_trees, species_tree, rerooting, outgroup, first, last );
136         _produce_log = produce_log;
137         _verbose = verbose;
138         _rerooting = rerooting;
139         _ext_nodes = -1;
140         _int_nodes = -1;
141         _log = new StringBuilder();
142         _gsdir_tax_comp_base = null;
143         _analyzed_gene_trees = null;
144         _removed_gene_tree_nodes = null;
145         _duplications_stats = new BasicDescriptiveStatistics();
146         inferOrthologs( gene_trees, species_tree, algorithm, outgroup, first, last, transfer_taxonomy );
147         _species_tree = species_tree;
148     }
149
150     public final Phylogeny[] getAnalyzedGeneTrees() {
151         return _analyzed_gene_trees;
152     }
153
154     public final BasicDescriptiveStatistics getDuplicationsStatistics() {
155         return _duplications_stats;
156     }
157
158     /**
159      * Returns the numbers of number of ext nodes in gene trees analyzed (after
160      * stripping).
161      *
162      * @return number of ext nodes in gene trees analyzed (after stripping)
163      */
164     public final int getExtNodesOfAnalyzedGeneTrees() {
165         return _ext_nodes;
166     }
167
168     public final TaxonomyComparisonBase getGSDIRtaxCompBase() {
169         return _gsdir_tax_comp_base;
170     }
171
172     /**
173      * Returns the numbers of number of int nodes in gene trees analyzed (after
174      * stripping).
175      *
176      * @return number of int nodes in gene trees analyzed (after stripping)
177      */
178     public final int getIntNodesOfAnalyzedGeneTrees() {
179         return _int_nodes;
180     }
181
182     public final StringBuilder getLog() {
183         return _log;
184     }
185
186     final public Phylogeny getMinDuplicationsGeneTree() {
187         return _min_dub_gene_tree;
188     }
189
190     public final IntMatrix getOrthologTable() {
191         return _m;
192     }
193
194     public final List<PhylogenyNode> getRemovedGeneTreeNodes() {
195         return _removed_gene_tree_nodes;
196     }
197
198     public final Phylogeny getSpeciesTree() {
199         return _species_tree;
200     }
201
202     private final void inferOrthologs( final IteratingPhylogenyParser parser,
203                                        final Phylogeny species_tree,
204                                        final ALGORITHM algorithm,
205                                        final String outgroup,
206                                        int first,
207                                        final int last,
208                                        final boolean transfer_taxonomy ) throws SDIException, RIOException,
209                                        FileNotFoundException, IOException {
210         if ( !parser.hasNext() ) {
211             throw new RIOException( "no gene trees to analyze" );
212         }
213         if ( log() ) {
214             preLog( -1, species_tree, algorithm, outgroup );
215         }
216         if ( _verbose ) {
217             System.out.println();
218         }
219         int gene_tree_ext_nodes = 0;
220         int i = 0;
221         int counter = 0;
222         final boolean no_range = ( first < 0 ) || ( last < first );
223         while ( parser.hasNext() ) {
224             final Phylogeny gt = parser.next();
225             if ( no_range || ( ( i >= first ) && ( i <= last ) ) ) {
226                 if ( gt.isEmpty() ) {
227                     throw new RIOException( "gene tree #" + i + " is empty" );
228                 }
229                 if ( gt.getNumberOfExternalNodes() == 1 ) {
230                     throw new RIOException( "gene tree #" + i + " has only one external node" );
231                 }
232                 if ( _verbose ) {
233                     System.out.print( "\r" + i );
234                 }
235                 if ( counter == 0 ) {
236                     if ( algorithm == ALGORITHM.SDIR ) {
237                         // Removes from species_tree all species not found in gene_tree.
238                         PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gt, species_tree );
239                         if ( species_tree.isEmpty() ) {
240                             throw new RIOException( "failed to establish species based mapping between gene and species trees" );
241                         }
242                     }
243                     gene_tree_ext_nodes = gt.getNumberOfExternalNodes();
244                 }
245                 else if ( gene_tree_ext_nodes != gt.getNumberOfExternalNodes() ) {
246                     throw new RIOException( "gene tree #" + i + " has a different number of external nodes ("
247                             + gt.getNumberOfExternalNodes() + ") than the preceding gene tree(s) ("
248                             + gene_tree_ext_nodes + ")" );
249                 }
250                 if ( algorithm == ALGORITHM.SDIR ) {
251                     // Removes from gene_tree all species not found in species_tree.
252                     PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gt );
253                     if ( gt.isEmpty() ) {
254                         throw new RIOException( "failed to establish species based mapping between gene and species trees" );
255                     }
256                 }
257                 final Phylogeny analyzed_gt = performOrthologInference( gt,
258                                                                         species_tree,
259                                                                         algorithm,
260                                                                         outgroup,
261                                                                         counter,
262                                                                         transfer_taxonomy );
263                 RIO.calculateOrthologTable( analyzed_gt, true, counter );
264                 ++counter;
265             }
266             ++i;
267         }
268         if ( _verbose ) {
269             System.out.print( "\rGene trees analyzed                 :\t" + counter  );
270         }
271         if ( ( first >= 0 ) && ( counter == 0 ) && ( i > 0 ) ) {
272             throw new RIOException( "attempt to analyze first gene tree #" + first + " in a set of " + i );
273         }
274         if ( no_range ) {
275             first = 0;
276         }
277         if ( log() ) {
278             postLog( species_tree, first, ( first + counter ) - 1 );
279         }
280         if ( _verbose ) {
281             System.out.println();
282             System.out.println();
283         }
284     }
285
286     private final void inferOrthologs( final Phylogeny[] gene_trees,
287                                        final Phylogeny species_tree,
288                                        final ALGORITHM algorithm,
289                                        final String outgroup,
290                                        final int first,
291                                        final int last,
292                                        final boolean transfer_taxonomy ) throws SDIException, RIOException,
293                                        FileNotFoundException, IOException {
294         if ( algorithm == ALGORITHM.SDIR ) {
295             // Removes from species_tree all species not found in gene_tree.
296             PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gene_trees[ 0 ], species_tree );
297             if ( species_tree.isEmpty() ) {
298                 throw new RIOException( "failed to establish species based mapping between gene and species trees" );
299             }
300         }
301         final Phylogeny[] my_gene_trees;
302         if ( ( first >= 0 ) && ( last >= first ) && ( last < gene_trees.length ) ) {
303             my_gene_trees = new Phylogeny[ ( 1 + last ) - first ];
304             int c = 0;
305             for( int i = first; i <= last; ++i ) {
306                 my_gene_trees[ c++ ] = gene_trees[ i ];
307             }
308         }
309         else {
310             my_gene_trees = gene_trees;
311         }
312         if ( log() ) {
313             preLog( gene_trees.length, species_tree, algorithm, outgroup );
314         }
315         if ( _verbose && ( my_gene_trees.length >= 4 ) ) {
316             System.out.println();
317         }
318         _analyzed_gene_trees = new Phylogeny[ my_gene_trees.length ];
319         int gene_tree_ext_nodes = 0;
320         for( int i = 0; i < my_gene_trees.length; ++i ) {
321             final Phylogeny gt = my_gene_trees[ i ];
322             if ( gt.isEmpty() ) {
323                 throw new RIOException( "gene tree #" + i + " is empty" );
324             }
325             if ( gt.getNumberOfExternalNodes() == 1 ) {
326                 throw new RIOException( "gene tree #" + i + " has only one external node" );
327             }
328             if ( _verbose && ( my_gene_trees.length > 4 ) ) {
329                 ForesterUtil.updateProgress( ( ( double ) i ) / my_gene_trees.length );
330             }
331             if ( i == 0 ) {
332                 gene_tree_ext_nodes = gt.getNumberOfExternalNodes();
333             }
334             else if ( gene_tree_ext_nodes != gt.getNumberOfExternalNodes() ) {
335                 throw new RIOException( "gene tree #" + i + " has a different number of external nodes ("
336                         + gt.getNumberOfExternalNodes() + ") than the preceding gene tree(s) (" + gene_tree_ext_nodes
337                         + ")" );
338             }
339             if ( algorithm == ALGORITHM.SDIR ) {
340                 // Removes from gene_tree all species not found in species_tree.
341                 PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gt );
342                 if ( gt.isEmpty() ) {
343                     throw new RIOException( "failed to establish species based mapping between gene and species trees" );
344                 }
345             }
346             _analyzed_gene_trees[ i ] = performOrthologInference( gt,
347                                                                   species_tree,
348                                                                   algorithm,
349                                                                   outgroup,
350                                                                   i,
351                                                                   transfer_taxonomy );
352         }
353         if ( log() ) {
354             postLog( species_tree, first, last );
355         }
356         if ( _verbose && ( my_gene_trees.length > 4 ) ) {
357             System.out.println();
358             System.out.println();
359         }
360     }
361
362     private final boolean log() {
363         return _produce_log;
364     }
365
366     private final void log( final String s ) {
367         _log.append( s );
368         _log.append( ForesterUtil.LINE_SEPARATOR );
369     }
370
371     private final void logRemovedGeneTreeNodes() {
372         log( "Species stripped from gene trees:" );
373         final SortedSet<String> rn = new TreeSet<String>();
374         for( final PhylogenyNode n : getRemovedGeneTreeNodes() ) {
375             final Taxonomy t = n.getNodeData().getTaxonomy();
376             switch ( getGSDIRtaxCompBase() ) {
377                 case CODE: {
378                     rn.add( t.getTaxonomyCode() );
379                     break;
380                 }
381                 case ID: {
382                     rn.add( t.getIdentifier().toString() );
383                     break;
384                 }
385                 case SCIENTIFIC_NAME: {
386                     rn.add( t.getScientificName() );
387                     break;
388                 }
389             }
390         }
391         for( final String s : rn ) {
392             log( s );
393         }
394         log( "" );
395     }
396
397     private final Phylogeny performOrthologInference( final Phylogeny gene_tree,
398                                                       final Phylogeny species_tree,
399                                                       final ALGORITHM algorithm,
400                                                       final String outgroup,
401                                                       final int i,
402                                                       final boolean transfer_taxonomy ) throws SDIException,
403                                                       RIOException {
404         final Phylogeny assigned_tree;
405         switch ( algorithm ) {
406             case SDIR: {
407                 assigned_tree = performOrthologInferenceBySDI( gene_tree, species_tree );
408                 break;
409             }
410             case GSDIR: {
411                 assigned_tree = performOrthologInferenceByGSDI( gene_tree, species_tree, outgroup, i, transfer_taxonomy );
412                 break;
413             }
414             default: {
415                 throw new IllegalArgumentException( "illegal algorithm: " + algorithm );
416             }
417         }
418         if ( i == 0 ) {
419             _ext_nodes = assigned_tree.getNumberOfExternalNodes();
420             _int_nodes = assigned_tree.getNumberOfInternalNodes();
421         }
422         else if ( _ext_nodes != assigned_tree.getNumberOfExternalNodes() ) {
423             throw new RIOException( "after stripping gene tree #" + i + " has a different number of external nodes ("
424                     + assigned_tree.getNumberOfExternalNodes() + ") than the preceding gene tree(s) (" + _ext_nodes
425                     + ")" );
426         }
427         return assigned_tree;
428     }
429
430     private final Phylogeny performOrthologInferenceByGSDI( final Phylogeny gene_tree,
431                                                             final Phylogeny species_tree,
432                                                             final String outgroup,
433                                                             final int i,
434                                                             final boolean transfer_taxonomy ) throws SDIException,
435                                                             RIOException {
436         final Phylogeny assigned_tree;
437         final int dups;
438         if ( _rerooting == REROOTING.BY_ALGORITHM ) {
439             final GSDIR gsdir = new GSDIR( gene_tree, species_tree, true, i == 0, transfer_taxonomy );
440             assigned_tree = gsdir.getMinDuplicationsSumGeneTree();
441             if ( i == 0 ) {
442                 _removed_gene_tree_nodes = gsdir.getStrippedExternalGeneTreeNodes();
443                 for( final PhylogenyNode r : _removed_gene_tree_nodes ) {
444                     if ( !r.getNodeData().isHasTaxonomy() ) {
445                         throw new RIOException( "node with no (appropriate) taxonomic information found in gene tree #"
446                                 + i + ": " + r.toString() );
447                     }
448                 }
449             }
450             if ( i == 0 ) {
451                 _gsdir_tax_comp_base = gsdir.getTaxCompBase();
452             }
453             dups = gsdir.getMinDuplicationsSum();
454         }
455         else {
456             if ( _rerooting == REROOTING.MIDPOINT ) {
457                 PhylogenyMethods.midpointRoot( gene_tree );
458             }
459             else if ( _rerooting == REROOTING.OUTGROUP ) {
460                 final PhylogenyNode n = gene_tree.getNode( outgroup );
461                 gene_tree.reRoot( n );
462             }
463             final GSDI gsdi = new GSDI( gene_tree, species_tree, true, true, true, transfer_taxonomy );
464             _removed_gene_tree_nodes = gsdi.getStrippedExternalGeneTreeNodes();
465             for( final PhylogenyNode r : _removed_gene_tree_nodes ) {
466                 if ( !r.getNodeData().isHasTaxonomy() ) {
467                     throw new RIOException( "node with no (appropriate) taxonomic information found in gene tree #" + i
468                                             + ": " + r.toString() );
469                 }
470             }
471             assigned_tree = gene_tree;
472             if ( i == 0 ) {
473                 _gsdir_tax_comp_base = gsdi.getTaxCompBase();
474             }
475             dups = gsdi.getDuplicationsSum();
476         }
477         if ( ( i == 0 ) || ( dups < _duplications_stats.getMin() ) ) {
478             _min_dub_gene_tree = assigned_tree;
479         }
480         _duplications_stats.addValue( dups );
481         return assigned_tree;
482     }
483
484     private final Phylogeny performOrthologInferenceBySDI( final Phylogeny gene_tree, final Phylogeny species_tree )
485             throws SDIException {
486         final SDIR sdir = new SDIR();
487         return sdir.infer( gene_tree, species_tree, false, true, true, true, 1 )[ 0 ];
488     }
489
490     private final void postLog( final Phylogeny species_tree, final int first, final int last ) {
491         log( "" );
492         if ( ( getRemovedGeneTreeNodes() != null ) && ( getRemovedGeneTreeNodes().size() > 0 ) ) {
493             logRemovedGeneTreeNodes();
494         }
495         log( "Species tree external nodes (after stripping)   : " + species_tree.getNumberOfExternalNodes() );
496         log( "Species tree polytomies (after stripping)       : "
497                 + PhylogenyMethods.countNumberOfPolytomies( species_tree ) );
498         log( "Taxonomy linking based on                       : " + getGSDIRtaxCompBase() );
499         final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.#" );
500         if ( ( first >= 0 ) && ( last >= 0 ) ) {
501             log( "Gene trees analyzed range                       : " + first + "-" + last );
502         }
503         log( "Gene trees analyzed                             : " + _duplications_stats.getN() );
504         log( "Mean number of duplications                     : " + df.format( _duplications_stats.arithmeticMean() )
505              + " (sd: " + df.format( _duplications_stats.sampleStandardDeviation() ) + ")" + " ("
506              + df.format( ( 100.0 * _duplications_stats.arithmeticMean() ) / getIntNodesOfAnalyzedGeneTrees() )
507              + "%)" );
508         if ( _duplications_stats.getN() > 3 ) {
509             log( "Median number of duplications                   : " + df.format( _duplications_stats.median() )
510                  + " (" + df.format( ( 100.0 * _duplications_stats.median() ) / getIntNodesOfAnalyzedGeneTrees() )
511                  + "%)" );
512         }
513         log( "Minimum duplications                            : " + ( int ) _duplications_stats.getMin() + " ("
514                 + df.format( ( 100.0 * _duplications_stats.getMin() ) / getIntNodesOfAnalyzedGeneTrees() ) + "%)" );
515         log( "Maximum duplications                            : " + ( int ) _duplications_stats.getMax() + " ("
516                 + df.format( ( 100.0 * _duplications_stats.getMax() ) / getIntNodesOfAnalyzedGeneTrees() ) + "%)" );
517         log( "Gene tree internal nodes                        : " + getIntNodesOfAnalyzedGeneTrees() );
518         log( "Gene tree external nodes                        : " + getExtNodesOfAnalyzedGeneTrees() );
519     }
520
521     private final void preLog( final int gene_trees,
522                                final Phylogeny species_tree,
523                                final ALGORITHM algorithm,
524                                final String outgroup ) {
525         if ( gene_trees > 0 ) {
526             log( "Number of gene trees (total)                    : " + gene_trees );
527         }
528         log( "Algorithm                                       : " + algorithm );
529         log( "Species tree external nodes (prior to stripping): " + species_tree.getNumberOfExternalNodes() );
530         log( "Species tree polytomies (prior to stripping)    : "
531                 + PhylogenyMethods.countNumberOfPolytomies( species_tree ) );
532         String rs = "";
533         switch ( _rerooting ) {
534             case BY_ALGORITHM: {
535                 rs = "minimizing duplications";
536                 break;
537             }
538             case MIDPOINT: {
539                 rs = "midpoint";
540                 break;
541             }
542             case OUTGROUP: {
543                 rs = "outgroup: " + outgroup;
544                 break;
545             }
546             case NONE: {
547                 rs = "none";
548                 break;
549             }
550         }
551         log( "Re-rooting                                      : " + rs );
552     }
553
554     public final static IntMatrix calculateOrthologTable( final Phylogeny[] analyzed_gene_trees, final boolean sort )
555             throws RIOException {
556         final List<String> labels = new ArrayList<String>();
557         final Set<String> labels_set = new HashSet<String>();
558         for( final PhylogenyNode n : analyzed_gene_trees[ 0 ].getExternalNodes() ) {
559             final String label = obtainLabel( labels_set, n );
560             labels_set.add( label );
561             labels.add( label );
562         }
563         if ( sort ) {
564             Collections.sort( labels );
565         }
566         final IntMatrix m = new IntMatrix( labels );
567         int counter = 0;
568         for( final Phylogeny gt : analyzed_gene_trees ) {
569             counter++;
570             updateCounts( m, counter, gt );
571         }
572         return m;
573     }
574
575     public final static RIO executeAnalysis( final File gene_trees_file,
576                                              final File species_tree_file,
577                                              final ALGORITHM algorithm,
578                                              final REROOTING rerooting,
579                                              final String outgroup,
580                                              final int first,
581                                              final int last,
582                                              final boolean produce_log,
583                                              final boolean verbose,
584                                              final boolean transfer_taxonomy ) throws IOException, SDIException,
585                                              RIOException {
586         final Phylogeny[] gene_trees = parseGeneTrees( gene_trees_file );
587         if ( gene_trees.length < 1 ) {
588             throw new RIOException( "\"" + gene_trees_file + "\" is devoid of appropriate gene trees" );
589         }
590         final Phylogeny species_tree = SDIutil.parseSpeciesTree( gene_trees[ 0 ],
591                                                                  species_tree_file,
592                                                                  false,
593                                                                  true,
594                                                                  TAXONOMY_EXTRACTION.NO );
595         return new RIO( gene_trees,
596                         species_tree,
597                         algorithm,
598                         rerooting,
599                         outgroup,
600                         first,
601                         last,
602                         produce_log,
603                         verbose,
604                         transfer_taxonomy );
605     }
606
607     public final static RIO executeAnalysis( final File gene_trees_file,
608                                              final Phylogeny species_tree,
609                                              final ALGORITHM algorithm,
610                                              final REROOTING rerooting,
611                                              final String outgroup,
612                                              final boolean produce_log,
613                                              final boolean verbose,
614                                              final boolean transfer_taxonomy ) throws IOException, SDIException,
615                                              RIOException {
616         return new RIO( parseGeneTrees( gene_trees_file ),
617                         species_tree,
618                         algorithm,
619                         rerooting,
620                         outgroup,
621                         DEFAULT_RANGE,
622                         DEFAULT_RANGE,
623                         produce_log,
624                         verbose,
625                         transfer_taxonomy );
626     }
627
628     public final static RIO executeAnalysis( final File gene_trees_file,
629                                              final Phylogeny species_tree,
630                                              final ALGORITHM algorithm,
631                                              final REROOTING rerooting,
632                                              final String outgroup,
633                                              final int first,
634                                              final int last,
635                                              final boolean produce_log,
636                                              final boolean verbose,
637                                              final boolean transfer_taxonomy ) throws IOException, SDIException,
638                                              RIOException {
639         return new RIO( parseGeneTrees( gene_trees_file ),
640                         species_tree,
641                         algorithm,
642                         rerooting,
643                         outgroup,
644                         first,
645                         last,
646                         produce_log,
647                         verbose,
648                         transfer_taxonomy );
649     }
650
651     public final static RIO executeAnalysis( final IteratingPhylogenyParser p,
652                                              final File species_tree_file,
653                                              final ALGORITHM algorithm,
654                                              final REROOTING rerooting,
655                                              final String outgroup,
656                                              final int first,
657                                              final int last,
658                                              final boolean produce_log,
659                                              final boolean verbose,
660                                              final boolean transfer_taxonomy ) throws IOException, SDIException,
661                                              RIOException {
662         final Phylogeny g0 = p.next();
663         if ( ( g0 == null ) || g0.isEmpty() || ( g0.getNumberOfExternalNodes() < 2 ) ) {
664             throw new RIOException( "input file does not seem to contain any gene trees" );
665         }
666         final Phylogeny species_tree = SDIutil.parseSpeciesTree( g0,
667                                                                  species_tree_file,
668                                                                  false,
669                                                                  true,
670                                                                  TAXONOMY_EXTRACTION.NO );
671         p.reset();
672         return new RIO( p,
673                         species_tree,
674                         algorithm,
675                         rerooting,
676                         outgroup,
677                         first,
678                         last,
679                         produce_log,
680                         verbose,
681                         transfer_taxonomy );
682     }
683
684     public final static RIO executeAnalysis( final IteratingPhylogenyParser p,
685                                              final Phylogeny species_tree,
686                                              final ALGORITHM algorithm,
687                                              final REROOTING rerooting,
688                                              final String outgroup,
689                                              final boolean produce_log,
690                                              final boolean verbose,
691                                              final boolean transfer_taxonomy ) throws IOException, SDIException,
692                                              RIOException {
693         return new RIO( p,
694                         species_tree,
695                         algorithm,
696                         rerooting,
697                         outgroup,
698                         DEFAULT_RANGE,
699                         DEFAULT_RANGE,
700                         produce_log,
701                         verbose,
702                         transfer_taxonomy );
703     }
704
705     public final static RIO executeAnalysis( final IteratingPhylogenyParser p,
706                                              final Phylogeny species_tree,
707                                              final ALGORITHM algorithm,
708                                              final REROOTING rerooting,
709                                              final String outgroup,
710                                              final int first,
711                                              final int last,
712                                              final boolean produce_log,
713                                              final boolean verbose,
714                                              final boolean transfer_taxonomy ) throws IOException, SDIException,
715                                              RIOException {
716         return new RIO( p,
717                         species_tree,
718                         algorithm,
719                         rerooting,
720                         outgroup,
721                         first,
722                         last,
723                         produce_log,
724                         verbose,
725                         transfer_taxonomy );
726     }
727
728     public final static RIO executeAnalysis( final Phylogeny[] gene_trees, final Phylogeny species_tree )
729             throws IOException, SDIException, RIOException {
730         return new RIO( gene_trees,
731                         species_tree,
732                         ALGORITHM.GSDIR,
733                         REROOTING.BY_ALGORITHM,
734                         null,
735                         DEFAULT_RANGE,
736                         DEFAULT_RANGE,
737                         false,
738                         false,
739                         false );
740     }
741
742     public final static RIO executeAnalysis( final Phylogeny[] gene_trees,
743                                              final Phylogeny species_tree,
744                                              final ALGORITHM algorithm,
745                                              final REROOTING rerooting,
746                                              final String outgroup,
747                                              final boolean produce_log,
748                                              final boolean verbose,
749                                              final boolean transfer_taxonomy ) throws IOException, SDIException,
750                                              RIOException {
751         return new RIO( gene_trees,
752                         species_tree,
753                         algorithm,
754                         rerooting,
755                         outgroup,
756                         DEFAULT_RANGE,
757                         DEFAULT_RANGE,
758                         produce_log,
759                         verbose,
760                         transfer_taxonomy );
761     }
762
763     public final static RIO executeAnalysis( final Phylogeny[] gene_trees,
764                                              final Phylogeny species_tree,
765                                              final ALGORITHM algorithm,
766                                              final REROOTING rerooting,
767                                              final String outgroup,
768                                              final int first,
769                                              final int last,
770                                              final boolean produce_log,
771                                              final boolean verbose,
772                                              final boolean transfer_taxonomy ) throws IOException, SDIException,
773                                              RIOException {
774         return new RIO( gene_trees,
775                         species_tree,
776                         algorithm,
777                         rerooting,
778                         outgroup,
779                         first,
780                         last,
781                         produce_log,
782                         verbose,
783                         transfer_taxonomy );
784     }
785
786     private final static void calculateOrthologTable( final Phylogeny g, final boolean sort, final int counter )
787             throws RIOException {
788         if ( counter == 0 ) {
789             final List<String> labels = new ArrayList<String>();
790             final Set<String> labels_set = new HashSet<String>();
791             for( final PhylogenyNode n : g.getExternalNodes() ) {
792                 final String label = obtainLabel( labels_set, n );
793                 labels_set.add( label );
794                 labels.add( label );
795             }
796             if ( sort ) {
797                 Collections.sort( labels );
798             }
799             _m = new IntMatrix( labels );
800         }
801         updateCounts( _m, counter, g );
802     }
803
804     private final static void checkPreconditions( final IteratingPhylogenyParser p,
805                                                   final Phylogeny species_tree,
806                                                   final REROOTING rerooting,
807                                                   final String outgroup,
808                                                   final int first,
809                                                   final int last ) throws RIOException, IOException {
810         final Phylogeny g0 = p.next();
811         if ( ( g0 == null ) || g0.isEmpty() ) {
812             throw new RIOException( "input file does not seem to contain any gene trees" );
813         }
814         if ( g0.getNumberOfExternalNodes() < 2 ) {
815             throw new RIOException( "input file does not seem to contain any useable gene trees" );
816         }
817         if ( !species_tree.isRooted() ) {
818             throw new RIOException( "species tree is not rooted" );
819         }
820         if ( !( ( last == DEFAULT_RANGE ) && ( first == DEFAULT_RANGE ) )
821                 && ( ( last < first ) || ( last < 0 ) || ( first < 0 ) ) ) {
822             throw new RIOException( "attempt to set range (0-based) of gene to analyze to: from " + first + " to "
823                     + last );
824         }
825         if ( ( rerooting == REROOTING.OUTGROUP ) && ForesterUtil.isEmpty( outgroup ) ) {
826             throw new RIOException( "outgroup not set for midpoint rooting" );
827         }
828         if ( ( rerooting != REROOTING.OUTGROUP ) && !ForesterUtil.isEmpty( outgroup ) ) {
829             throw new RIOException( "outgroup only used for midpoint rooting" );
830         }
831         if ( ( rerooting == REROOTING.MIDPOINT ) && ( PhylogenyMethods.calculateMaxDistanceToRoot( g0 ) <= 0 ) ) {
832             throw new RIOException( "attempt to use midpoint rooting on gene trees which seem to have no (positive) branch lengths (cladograms)" );
833         }
834         if ( rerooting == REROOTING.OUTGROUP ) {
835             try {
836                 g0.getNode( outgroup );
837             }
838             catch ( final IllegalArgumentException e ) {
839                 throw new RIOException( "cannot perform re-rooting by outgroup: " + e.getLocalizedMessage() );
840             }
841         }
842     }
843
844     private final static void checkPreconditions( final Phylogeny[] gene_trees,
845                                                   final Phylogeny species_tree,
846                                                   final REROOTING rerooting,
847                                                   final String outgroup,
848                                                   final int first,
849                                                   final int last ) throws RIOException {
850         if ( !species_tree.isRooted() ) {
851             throw new RIOException( "species tree is not rooted" );
852         }
853         if ( !( ( last == DEFAULT_RANGE ) && ( first == DEFAULT_RANGE ) )
854                 && ( ( last < first ) || ( last >= gene_trees.length ) || ( last < 0 ) || ( first < 0 ) ) ) {
855             throw new RIOException( "attempt to set range (0-based) of gene to analyze to: from " + first + " to "
856                     + last + " (out of " + gene_trees.length + ")" );
857         }
858         if ( ( rerooting == REROOTING.OUTGROUP ) && ForesterUtil.isEmpty( outgroup ) ) {
859             throw new RIOException( "outgroup not set for midpoint rooting" );
860         }
861         if ( ( rerooting != REROOTING.OUTGROUP ) && !ForesterUtil.isEmpty( outgroup ) ) {
862             throw new RIOException( "outgroup only used for midpoint rooting" );
863         }
864         if ( ( rerooting == REROOTING.MIDPOINT )
865                 && ( PhylogenyMethods.calculateMaxDistanceToRoot( gene_trees[ 0 ] ) <= 0 ) ) {
866             throw new RIOException( "attempt to use midpoint rooting on gene trees which seem to have no (positive) branch lengths (cladograms)" );
867         }
868         if ( rerooting == REROOTING.OUTGROUP ) {
869             try {
870                 gene_trees[ 0 ].getNode( outgroup );
871             }
872             catch ( final IllegalArgumentException e ) {
873                 throw new RIOException( "cannot perform re-rooting by outgroup: " + e.getLocalizedMessage() );
874             }
875         }
876     }
877
878     private final static String obtainLabel( final Set<String> labels_set, final PhylogenyNode n ) throws RIOException {
879         String label;
880         if ( n.getNodeData().isHasSequence() && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getName() ) ) {
881             label = n.getNodeData().getSequence().getName();
882         }
883         else if ( n.getNodeData().isHasSequence() && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getSymbol() ) ) {
884             label = n.getNodeData().getSequence().getSymbol();
885         }
886         else if ( n.getNodeData().isHasSequence()
887                 && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getGeneName() ) ) {
888             label = n.getNodeData().getSequence().getGeneName();
889         }
890         else if ( !ForesterUtil.isEmpty( n.getName() ) ) {
891             label = n.getName();
892         }
893         else {
894             throw new RIOException( "node " + n + " has no appropriate label" );
895         }
896         if ( labels_set.contains( label ) ) {
897             throw new RIOException( "label " + label + " is not unique" );
898         }
899         return label;
900     }
901
902     private final static Phylogeny[] parseGeneTrees( final File gene_trees_file ) throws FileNotFoundException,
903     IOException {
904         final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
905         final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
906         if ( p instanceof NHXParser ) {
907             final NHXParser nhx = ( NHXParser ) p;
908             nhx.setReplaceUnderscores( false );
909             nhx.setIgnoreQuotes( true );
910             nhx.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
911         }
912         else if ( p instanceof NexusPhylogeniesParser ) {
913             final NexusPhylogeniesParser nex = ( NexusPhylogeniesParser ) p;
914             nex.setReplaceUnderscores( false );
915             nex.setIgnoreQuotes( true );
916             nex.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
917         }
918         return factory.create( gene_trees_file, p );
919     }
920
921     private final static void removeSingleDescendentsNodes( final Phylogeny species_tree, final boolean verbose ) {
922         final int o = PhylogenyMethods.countNumberOfOneDescendantNodes( species_tree );
923         if ( o > 0 ) {
924             if ( verbose ) {
925                 System.out.println( "warning: species tree has " + o
926                                     + " internal nodes with only one descendent which are therefore going to be removed" );
927             }
928             PhylogenyMethods.deleteInternalNodesWithOnlyOneDescendent( species_tree );
929         }
930     }
931
932     private final static void updateCounts( final IntMatrix m, final int counter, final Phylogeny g )
933             throws RIOException {
934         PhylogenyMethods.preOrderReId( g );
935         final HashMap<String, PhylogenyNode> map = PhylogenyMethods.createNameToExtNodeMap( g );
936         for( int x = 0; x < m.size(); ++x ) {
937             final String mx = m.getLabel( x );
938             final PhylogenyNode nx = map.get( mx );
939             if ( nx == null ) {
940                 throw new RIOException( "node \"" + mx + "\" not present in gene tree #" + counter );
941             }
942             String my;
943             PhylogenyNode ny;
944             for( int y = 0; y < m.size(); ++y ) {
945                 my = m.getLabel( y );
946                 ny = map.get( my );
947                 if ( ny == null ) {
948                     throw new RIOException( "node \"" + my + "\" not present in gene tree #" + counter );
949                 }
950                 if ( !PhylogenyMethods.calculateLCAonTreeWithIdsInPreOrder( nx, ny ).isDuplication() ) {
951                     m.inreaseByOne( x, y );
952                 }
953             }
954         }
955     }
956
957     public enum REROOTING {
958         NONE, BY_ALGORITHM, MIDPOINT, OUTGROUP;
959     }
960 }