make PH, aPH, CL selection
[jalview.git] / forester / java / src / org / forester / rio / RIO.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // Copyright (C) 2000-2001 Washington University School of Medicine
8 // and Howard Hughes Medical Institute
9 // All rights reserved
10 //
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
15 //
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
20 //
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 //
25 // Contact: phylosoft @ gmail . com
26 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
27
28 package org.forester.rio;
29
30 import java.io.File;
31 import java.io.FileNotFoundException;
32 import java.io.IOException;
33 import java.text.DecimalFormat;
34 import java.util.ArrayList;
35 import java.util.Collections;
36 import java.util.HashMap;
37 import java.util.HashSet;
38 import java.util.List;
39 import java.util.Set;
40 import java.util.SortedSet;
41 import java.util.TreeSet;
42
43 import org.forester.datastructures.IntMatrix;
44 import org.forester.io.parsers.IteratingPhylogenyParser;
45 import org.forester.io.parsers.PhylogenyParser;
46 import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
47 import org.forester.io.parsers.nhx.NHXParser;
48 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
49 import org.forester.io.parsers.util.ParserUtils;
50 import org.forester.phylogeny.Phylogeny;
51 import org.forester.phylogeny.PhylogenyMethods;
52 import org.forester.phylogeny.PhylogenyNode;
53 import org.forester.phylogeny.data.Taxonomy;
54 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
55 import org.forester.phylogeny.factories.PhylogenyFactory;
56 import org.forester.sdi.GSDI;
57 import org.forester.sdi.GSDIR;
58 import org.forester.sdi.SDIException;
59 import org.forester.sdi.SDIR;
60 import org.forester.sdi.SDIutil;
61 import org.forester.sdi.SDIutil.ALGORITHM;
62 import org.forester.sdi.SDIutil.TaxonomyComparisonBase;
63 import org.forester.util.BasicDescriptiveStatistics;
64 import org.forester.util.ForesterUtil;
65
66 public final class RIO {
67
68     public static final int                  DEFAULT_RANGE = -1;
69     private static final int                 END_OF_GT     = Integer.MAX_VALUE;
70     private static IntMatrix                 _m;
71     private Phylogeny[]                      _analyzed_gene_trees;
72     private List<PhylogenyNode>              _removed_gene_tree_nodes;
73     private int                              _ext_nodes;
74     private int                              _int_nodes;
75     private TaxonomyComparisonBase           _gsdir_tax_comp_base;
76     private final StringBuilder              _log;
77     private final BasicDescriptiveStatistics _duplications_stats;
78     private final boolean                    _produce_log;
79     private final boolean                    _verbose;
80     private final REROOTING                  _rerooting;
81     private final Phylogeny                  _species_tree;
82     private Phylogeny                        _min_dub_gene_tree;
83
84     private RIO( final IteratingPhylogenyParser p,
85                  final Phylogeny species_tree,
86                  final ALGORITHM algorithm,
87                  final REROOTING rerooting,
88                  final String outgroup,
89                  int first,
90                  int last,
91                  final boolean produce_log,
92                  final boolean verbose,
93                  final boolean transfer_taxonomy ) throws IOException, SDIException, RIOException {
94         if ( ( last == DEFAULT_RANGE ) && ( first >= 0 ) ) {
95             last = END_OF_GT;
96         }
97         else if ( ( first == DEFAULT_RANGE ) && ( last >= 0 ) ) {
98             first = 0;
99         }
100         removeSingleDescendentsNodes( species_tree, verbose );
101         p.reset();
102         checkPreconditions( p, species_tree, rerooting, outgroup, first, last );
103         _produce_log = produce_log;
104         _verbose = verbose;
105         _rerooting = rerooting;
106         _ext_nodes = -1;
107         _int_nodes = -1;
108         _log = new StringBuilder();
109         _gsdir_tax_comp_base = null;
110         _analyzed_gene_trees = null;
111         _removed_gene_tree_nodes = null;
112         _duplications_stats = new BasicDescriptiveStatistics();
113         p.reset();
114         inferOrthologs( p, species_tree, algorithm, outgroup, first, last, transfer_taxonomy );
115         _species_tree = species_tree;
116     }
117
118     private RIO( final Phylogeny[] gene_trees,
119                  final Phylogeny species_tree,
120                  final ALGORITHM algorithm,
121                  final REROOTING rerooting,
122                  final String outgroup,
123                  int first,
124                  int last,
125                  final boolean produce_log,
126                  final boolean verbose,
127                  final boolean transfer_taxonomy ) throws IOException, SDIException, RIOException {
128         if ( ( last == DEFAULT_RANGE ) && ( first >= 0 ) ) {
129             last = gene_trees.length - 1;
130         }
131         else if ( ( first == DEFAULT_RANGE ) && ( last >= 0 ) ) {
132             first = 0;
133         }
134         removeSingleDescendentsNodes( species_tree, verbose );
135         checkPreconditions( gene_trees, species_tree, rerooting, outgroup, first, last );
136         _produce_log = produce_log;
137         _verbose = verbose;
138         _rerooting = rerooting;
139         _ext_nodes = -1;
140         _int_nodes = -1;
141         _log = new StringBuilder();
142         _gsdir_tax_comp_base = null;
143         _analyzed_gene_trees = null;
144         _removed_gene_tree_nodes = null;
145         _duplications_stats = new BasicDescriptiveStatistics();
146         inferOrthologs( gene_trees, species_tree, algorithm, outgroup, first, last, transfer_taxonomy );
147         _species_tree = species_tree;
148     }
149
150     public final Phylogeny[] getAnalyzedGeneTrees() {
151         return _analyzed_gene_trees;
152     }
153
154     public final BasicDescriptiveStatistics getDuplicationsStatistics() {
155         return _duplications_stats;
156     }
157
158     /**
159      * Returns the numbers of number of ext nodes in gene trees analyzed (after
160      * stripping).
161      *
162      * @return number of ext nodes in gene trees analyzed (after stripping)
163      */
164     public final int getExtNodesOfAnalyzedGeneTrees() {
165         return _ext_nodes;
166     }
167
168     public final TaxonomyComparisonBase getGSDIRtaxCompBase() {
169         return _gsdir_tax_comp_base;
170     }
171
172     /**
173      * Returns the numbers of number of int nodes in gene trees analyzed (after
174      * stripping).
175      *
176      * @return number of int nodes in gene trees analyzed (after stripping)
177      */
178     public final int getIntNodesOfAnalyzedGeneTrees() {
179         return _int_nodes;
180     }
181
182     public final StringBuilder getLog() {
183         return _log;
184     }
185
186     final public Phylogeny getMinDuplicationsGeneTree() {
187         return _min_dub_gene_tree;
188     }
189
190     public final IntMatrix getOrthologTable() {
191         return _m;
192     }
193
194     public final List<PhylogenyNode> getRemovedGeneTreeNodes() {
195         return _removed_gene_tree_nodes;
196     }
197
198     public final Phylogeny getSpeciesTree() {
199         return _species_tree;
200     }
201
202     private final void inferOrthologs( final IteratingPhylogenyParser parser,
203                                        final Phylogeny species_tree,
204                                        final ALGORITHM algorithm,
205                                        final String outgroup,
206                                        int first,
207                                        final int last,
208                                        final boolean transfer_taxonomy ) throws SDIException, RIOException,
209                                        FileNotFoundException, IOException {
210         if ( !parser.hasNext() ) {
211             throw new RIOException( "no gene trees to analyze" );
212         }
213         if ( log() ) {
214             preLog( -1, species_tree, algorithm, outgroup );
215         }
216         if ( _verbose ) {
217             System.out.println();
218         }
219         final DecimalFormat pf = new java.text.DecimalFormat( "000" );
220         int gene_tree_ext_nodes = 0;
221         int i = 0;
222         int counter = 0;
223         final boolean no_range = ( first < 0 ) || ( last < first );
224         while ( parser.hasNext() ) {
225             final Phylogeny gt = parser.next();
226             if ( no_range || ( ( i >= first ) && ( i <= last ) ) ) {
227                 if ( gt.isEmpty() ) {
228                     throw new RIOException( "gene tree #" + i + " is empty" );
229                 }
230                 if ( gt.getNumberOfExternalNodes() == 1 ) {
231                     throw new RIOException( "gene tree #" + i + " has only one external node" );
232                 }
233                 if ( _verbose ) {
234                     ForesterUtil.updateProgress( i, pf );
235                 }
236                 if ( counter == 0 ) {
237                     if ( algorithm == ALGORITHM.SDIR ) {
238                         // Removes from species_tree all species not found in gene_tree.
239                         PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gt, species_tree );
240                         if ( species_tree.isEmpty() ) {
241                             throw new RIOException( "failed to establish species based mapping between gene and species trees" );
242                         }
243                     }
244                     gene_tree_ext_nodes = gt.getNumberOfExternalNodes();
245                 }
246                 else if ( gene_tree_ext_nodes != gt.getNumberOfExternalNodes() ) {
247                     throw new RIOException( "gene tree #" + i + " has a different number of external nodes ("
248                             + gt.getNumberOfExternalNodes() + ") than the preceding gene tree(s) ("
249                             + gene_tree_ext_nodes + ")" );
250                 }
251                 if ( algorithm == ALGORITHM.SDIR ) {
252                     // Removes from gene_tree all species not found in species_tree.
253                     PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gt );
254                     if ( gt.isEmpty() ) {
255                         throw new RIOException( "failed to establish species based mapping between gene and species trees" );
256                     }
257                 }
258                 final Phylogeny analyzed_gt = performOrthologInference( gt,
259                                                                         species_tree,
260                                                                         algorithm,
261                                                                         outgroup,
262                                                                         counter,
263                                                                         transfer_taxonomy );
264                 RIO.calculateOrthologTable( analyzed_gt, true, counter );
265                 ++counter;
266             }
267             ++i;
268         }
269         if ( ( first >= 0 ) && ( counter == 0 ) && ( i > 0 ) ) {
270             throw new RIOException( "attempt to analyze first gene tree #" + first + " in a set of " + i );
271         }
272         if ( no_range ) {
273             first = 0;
274         }
275         if ( log() ) {
276             postLog( species_tree, first, ( first + counter ) - 1 );
277         }
278         if ( _verbose ) {
279             System.out.println();
280             System.out.println();
281         }
282     }
283
284     private final void inferOrthologs( final Phylogeny[] gene_trees,
285                                        final Phylogeny species_tree,
286                                        final ALGORITHM algorithm,
287                                        final String outgroup,
288                                        final int first,
289                                        final int last,
290                                        final boolean transfer_taxonomy ) throws SDIException, RIOException,
291                                        FileNotFoundException, IOException {
292         if ( algorithm == ALGORITHM.SDIR ) {
293             // Removes from species_tree all species not found in gene_tree.
294             PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gene_trees[ 0 ], species_tree );
295             if ( species_tree.isEmpty() ) {
296                 throw new RIOException( "failed to establish species based mapping between gene and species trees" );
297             }
298         }
299         final Phylogeny[] my_gene_trees;
300         if ( ( first >= 0 ) && ( last >= first ) && ( last < gene_trees.length ) ) {
301             my_gene_trees = new Phylogeny[ ( 1 + last ) - first ];
302             int c = 0;
303             for( int i = first; i <= last; ++i ) {
304                 my_gene_trees[ c++ ] = gene_trees[ i ];
305             }
306         }
307         else {
308             my_gene_trees = gene_trees;
309         }
310         if ( log() ) {
311             preLog( gene_trees.length, species_tree, algorithm, outgroup );
312         }
313         if ( _verbose && ( my_gene_trees.length >= 4 ) ) {
314             System.out.println();
315         }
316         _analyzed_gene_trees = new Phylogeny[ my_gene_trees.length ];
317         int gene_tree_ext_nodes = 0;
318         for( int i = 0; i < my_gene_trees.length; ++i ) {
319             final Phylogeny gt = my_gene_trees[ i ];
320             if ( gt.isEmpty() ) {
321                 throw new RIOException( "gene tree #" + i + " is empty" );
322             }
323             if ( gt.getNumberOfExternalNodes() == 1 ) {
324                 throw new RIOException( "gene tree #" + i + " has only one external node" );
325             }
326             if ( _verbose && ( my_gene_trees.length > 4 ) ) {
327                 ForesterUtil.updateProgress( ( ( double ) i ) / my_gene_trees.length );
328             }
329             if ( i == 0 ) {
330                 gene_tree_ext_nodes = gt.getNumberOfExternalNodes();
331             }
332             else if ( gene_tree_ext_nodes != gt.getNumberOfExternalNodes() ) {
333                 throw new RIOException( "gene tree #" + i + " has a different number of external nodes ("
334                         + gt.getNumberOfExternalNodes() + ") than the preceding gene tree(s) (" + gene_tree_ext_nodes
335                         + ")" );
336             }
337             if ( algorithm == ALGORITHM.SDIR ) {
338                 // Removes from gene_tree all species not found in species_tree.
339                 PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gt );
340                 if ( gt.isEmpty() ) {
341                     throw new RIOException( "failed to establish species based mapping between gene and species trees" );
342                 }
343             }
344             _analyzed_gene_trees[ i ] = performOrthologInference( gt,
345                                                                   species_tree,
346                                                                   algorithm,
347                                                                   outgroup,
348                                                                   i,
349                                                                   transfer_taxonomy );
350         }
351         if ( log() ) {
352             postLog( species_tree, first, last );
353         }
354         if ( _verbose && ( my_gene_trees.length > 4 ) ) {
355             System.out.println();
356             System.out.println();
357         }
358     }
359
360     private final boolean log() {
361         return _produce_log;
362     }
363
364     private final void log( final String s ) {
365         _log.append( s );
366         _log.append( ForesterUtil.LINE_SEPARATOR );
367     }
368
369     private final void logRemovedGeneTreeNodes() {
370         log( "Species stripped from gene trees:" );
371         final SortedSet<String> rn = new TreeSet<String>();
372         for( final PhylogenyNode n : getRemovedGeneTreeNodes() ) {
373             final Taxonomy t = n.getNodeData().getTaxonomy();
374             switch ( getGSDIRtaxCompBase() ) {
375                 case CODE: {
376                     rn.add( t.getTaxonomyCode() );
377                     break;
378                 }
379                 case ID: {
380                     rn.add( t.getIdentifier().toString() );
381                     break;
382                 }
383                 case SCIENTIFIC_NAME: {
384                     rn.add( t.getScientificName() );
385                     break;
386                 }
387             }
388         }
389         for( final String s : rn ) {
390             log( s );
391         }
392         log( "" );
393     }
394
395     private final Phylogeny performOrthologInference( final Phylogeny gene_tree,
396                                                       final Phylogeny species_tree,
397                                                       final ALGORITHM algorithm,
398                                                       final String outgroup,
399                                                       final int i,
400                                                       final boolean transfer_taxonomy ) throws SDIException,
401                                                       RIOException {
402         final Phylogeny assigned_tree;
403         switch ( algorithm ) {
404             case SDIR: {
405                 assigned_tree = performOrthologInferenceBySDI( gene_tree, species_tree );
406                 break;
407             }
408             case GSDIR: {
409                 assigned_tree = performOrthologInferenceByGSDI( gene_tree, species_tree, outgroup, i, transfer_taxonomy );
410                 break;
411             }
412             default: {
413                 throw new IllegalArgumentException( "illegal algorithm: " + algorithm );
414             }
415         }
416         if ( i == 0 ) {
417             _ext_nodes = assigned_tree.getNumberOfExternalNodes();
418             _int_nodes = assigned_tree.getNumberOfInternalNodes();
419         }
420         else if ( _ext_nodes != assigned_tree.getNumberOfExternalNodes() ) {
421             throw new RIOException( "after stripping gene tree #" + i + " has a different number of external nodes ("
422                     + assigned_tree.getNumberOfExternalNodes() + ") than the preceding gene tree(s) (" + _ext_nodes
423                     + ")" );
424         }
425         return assigned_tree;
426     }
427
428     private final Phylogeny performOrthologInferenceByGSDI( final Phylogeny gene_tree,
429                                                             final Phylogeny species_tree,
430                                                             final String outgroup,
431                                                             final int i,
432                                                             final boolean transfer_taxonomy ) throws SDIException,
433                                                             RIOException {
434         final Phylogeny assigned_tree;
435         final int dups;
436         if ( _rerooting == REROOTING.BY_ALGORITHM ) {
437             final GSDIR gsdir = new GSDIR( gene_tree, species_tree, true, i == 0, transfer_taxonomy );
438             assigned_tree = gsdir.getMinDuplicationsSumGeneTree();
439             if ( i == 0 ) {
440                 _removed_gene_tree_nodes = gsdir.getStrippedExternalGeneTreeNodes();
441                 for( final PhylogenyNode r : _removed_gene_tree_nodes ) {
442                     if ( !r.getNodeData().isHasTaxonomy() ) {
443                         throw new RIOException( "node with no (appropriate) taxonomic information found in gene tree #"
444                                 + i + ": " + r.toString() );
445                     }
446                 }
447             }
448             if ( i == 0 ) {
449                 _gsdir_tax_comp_base = gsdir.getTaxCompBase();
450             }
451             dups = gsdir.getMinDuplicationsSum();
452         }
453         else {
454             if ( _rerooting == REROOTING.MIDPOINT ) {
455                 PhylogenyMethods.midpointRoot( gene_tree );
456             }
457             else if ( _rerooting == REROOTING.OUTGROUP ) {
458                 final PhylogenyNode n = gene_tree.getNode( outgroup );
459                 gene_tree.reRoot( n );
460             }
461             final GSDI gsdi = new GSDI( gene_tree, species_tree, true, true, true, transfer_taxonomy );
462             _removed_gene_tree_nodes = gsdi.getStrippedExternalGeneTreeNodes();
463             for( final PhylogenyNode r : _removed_gene_tree_nodes ) {
464                 if ( !r.getNodeData().isHasTaxonomy() ) {
465                     throw new RIOException( "node with no (appropriate) taxonomic information found in gene tree #" + i
466                                             + ": " + r.toString() );
467                 }
468             }
469             assigned_tree = gene_tree;
470             if ( i == 0 ) {
471                 _gsdir_tax_comp_base = gsdi.getTaxCompBase();
472             }
473             dups = gsdi.getDuplicationsSum();
474         }
475         if ( ( i == 0 ) || ( dups < _duplications_stats.getMin() ) ) {
476             _min_dub_gene_tree = assigned_tree;
477         }
478         _duplications_stats.addValue( dups );
479         return assigned_tree;
480     }
481
482     private final Phylogeny performOrthologInferenceBySDI( final Phylogeny gene_tree, final Phylogeny species_tree )
483             throws SDIException {
484         final SDIR sdir = new SDIR();
485         return sdir.infer( gene_tree, species_tree, false, true, true, true, 1 )[ 0 ];
486     }
487
488     private final void postLog( final Phylogeny species_tree, final int first, final int last ) {
489         log( "" );
490         if ( ( getRemovedGeneTreeNodes() != null ) && ( getRemovedGeneTreeNodes().size() > 0 ) ) {
491             logRemovedGeneTreeNodes();
492         }
493         log( "Species tree external nodes (after stripping)   : " + species_tree.getNumberOfExternalNodes() );
494         log( "Species tree polytomies (after stripping)       : "
495                 + PhylogenyMethods.countNumberOfPolytomies( species_tree ) );
496         log( "Taxonomy linking based on                       : " + getGSDIRtaxCompBase() );
497         final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.#" );
498         if ( ( first >= 0 ) && ( last >= 0 ) ) {
499             log( "Gene trees analyzed range                       : " + first + "-" + last );
500         }
501         log( "Gene trees analyzed                             : " + _duplications_stats.getN() );
502         log( "Mean number of duplications                     : " + df.format( _duplications_stats.arithmeticMean() )
503              + " (sd: " + df.format( _duplications_stats.sampleStandardDeviation() ) + ")" + " ("
504              + df.format( ( 100.0 * _duplications_stats.arithmeticMean() ) / getIntNodesOfAnalyzedGeneTrees() )
505              + "%)" );
506         if ( _duplications_stats.getN() > 3 ) {
507             log( "Median number of duplications                   : " + df.format( _duplications_stats.median() )
508                  + " (" + df.format( ( 100.0 * _duplications_stats.median() ) / getIntNodesOfAnalyzedGeneTrees() )
509                  + "%)" );
510         }
511         log( "Minimum duplications                            : " + ( int ) _duplications_stats.getMin() + " ("
512                 + df.format( ( 100.0 * _duplications_stats.getMin() ) / getIntNodesOfAnalyzedGeneTrees() ) + "%)" );
513         log( "Maximum duplications                            : " + ( int ) _duplications_stats.getMax() + " ("
514                 + df.format( ( 100.0 * _duplications_stats.getMax() ) / getIntNodesOfAnalyzedGeneTrees() ) + "%)" );
515         log( "Gene tree internal nodes                        : " + getIntNodesOfAnalyzedGeneTrees() );
516         log( "Gene tree external nodes                        : " + getExtNodesOfAnalyzedGeneTrees() );
517     }
518
519     private final void preLog( final int gene_trees,
520                                final Phylogeny species_tree,
521                                final ALGORITHM algorithm,
522                                final String outgroup ) {
523         if ( gene_trees > 0 ) {
524             log( "Number of gene trees (total)                    : " + gene_trees );
525         }
526         log( "Algorithm                                       : " + algorithm );
527         log( "Species tree external nodes (prior to stripping): " + species_tree.getNumberOfExternalNodes() );
528         log( "Species tree polytomies (prior to stripping)    : "
529                 + PhylogenyMethods.countNumberOfPolytomies( species_tree ) );
530         String rs = "";
531         switch ( _rerooting ) {
532             case BY_ALGORITHM: {
533                 rs = "minimizing duplications";
534                 break;
535             }
536             case MIDPOINT: {
537                 rs = "midpoint";
538                 break;
539             }
540             case OUTGROUP: {
541                 rs = "outgroup: " + outgroup;
542                 break;
543             }
544             case NONE: {
545                 rs = "none";
546                 break;
547             }
548         }
549         log( "Re-rooting                                      : " + rs );
550     }
551
552     public final static IntMatrix calculateOrthologTable( final Phylogeny[] analyzed_gene_trees, final boolean sort )
553             throws RIOException {
554         final List<String> labels = new ArrayList<String>();
555         final Set<String> labels_set = new HashSet<String>();
556         for( final PhylogenyNode n : analyzed_gene_trees[ 0 ].getExternalNodes() ) {
557             final String label = obtainLabel( labels_set, n );
558             labels_set.add( label );
559             labels.add( label );
560         }
561         if ( sort ) {
562             Collections.sort( labels );
563         }
564         final IntMatrix m = new IntMatrix( labels );
565         int counter = 0;
566         for( final Phylogeny gt : analyzed_gene_trees ) {
567             counter++;
568             updateCounts( m, counter, gt );
569         }
570         return m;
571     }
572
573     public final static RIO executeAnalysis( final File gene_trees_file,
574                                              final File species_tree_file,
575                                              final ALGORITHM algorithm,
576                                              final REROOTING rerooting,
577                                              final String outgroup,
578                                              final int first,
579                                              final int last,
580                                              final boolean produce_log,
581                                              final boolean verbose,
582                                              final boolean transfer_taxonomy ) throws IOException, SDIException,
583                                              RIOException {
584         final Phylogeny[] gene_trees = parseGeneTrees( gene_trees_file );
585         if ( gene_trees.length < 1 ) {
586             throw new RIOException( "\"" + gene_trees_file + "\" is devoid of appropriate gene trees" );
587         }
588         final Phylogeny species_tree = SDIutil.parseSpeciesTree( gene_trees[ 0 ],
589                                                                  species_tree_file,
590                                                                  false,
591                                                                  true,
592                                                                  TAXONOMY_EXTRACTION.NO );
593         return new RIO( gene_trees,
594                         species_tree,
595                         algorithm,
596                         rerooting,
597                         outgroup,
598                         first,
599                         last,
600                         produce_log,
601                         verbose,
602                         transfer_taxonomy );
603     }
604
605     public final static RIO executeAnalysis( final File gene_trees_file,
606                                              final Phylogeny species_tree,
607                                              final ALGORITHM algorithm,
608                                              final REROOTING rerooting,
609                                              final String outgroup,
610                                              final boolean produce_log,
611                                              final boolean verbose,
612                                              final boolean transfer_taxonomy ) throws IOException, SDIException,
613                                              RIOException {
614         return new RIO( parseGeneTrees( gene_trees_file ),
615                         species_tree,
616                         algorithm,
617                         rerooting,
618                         outgroup,
619                         DEFAULT_RANGE,
620                         DEFAULT_RANGE,
621                         produce_log,
622                         verbose,
623                         transfer_taxonomy );
624     }
625
626     public final static RIO executeAnalysis( final File gene_trees_file,
627                                              final Phylogeny species_tree,
628                                              final ALGORITHM algorithm,
629                                              final REROOTING rerooting,
630                                              final String outgroup,
631                                              final int first,
632                                              final int last,
633                                              final boolean produce_log,
634                                              final boolean verbose,
635                                              final boolean transfer_taxonomy ) throws IOException, SDIException,
636                                              RIOException {
637         return new RIO( parseGeneTrees( gene_trees_file ),
638                         species_tree,
639                         algorithm,
640                         rerooting,
641                         outgroup,
642                         first,
643                         last,
644                         produce_log,
645                         verbose,
646                         transfer_taxonomy );
647     }
648
649     public final static RIO executeAnalysis( final IteratingPhylogenyParser p,
650                                              final File species_tree_file,
651                                              final ALGORITHM algorithm,
652                                              final REROOTING rerooting,
653                                              final String outgroup,
654                                              final int first,
655                                              final int last,
656                                              final boolean produce_log,
657                                              final boolean verbose,
658                                              final boolean transfer_taxonomy ) throws IOException, SDIException,
659                                              RIOException {
660         final Phylogeny g0 = p.next();
661         if ( ( g0 == null ) || g0.isEmpty() || ( g0.getNumberOfExternalNodes() < 2 ) ) {
662             throw new RIOException( "input file does not seem to contain any gene trees" );
663         }
664         final Phylogeny species_tree = SDIutil.parseSpeciesTree( g0,
665                                                                  species_tree_file,
666                                                                  false,
667                                                                  true,
668                                                                  TAXONOMY_EXTRACTION.NO );
669         p.reset();
670         return new RIO( p,
671                         species_tree,
672                         algorithm,
673                         rerooting,
674                         outgroup,
675                         first,
676                         last,
677                         produce_log,
678                         verbose,
679                         transfer_taxonomy );
680     }
681
682     public final static RIO executeAnalysis( final IteratingPhylogenyParser p,
683                                              final Phylogeny species_tree,
684                                              final ALGORITHM algorithm,
685                                              final REROOTING rerooting,
686                                              final String outgroup,
687                                              final boolean produce_log,
688                                              final boolean verbose,
689                                              final boolean transfer_taxonomy ) throws IOException, SDIException,
690                                              RIOException {
691         return new RIO( p,
692                         species_tree,
693                         algorithm,
694                         rerooting,
695                         outgroup,
696                         DEFAULT_RANGE,
697                         DEFAULT_RANGE,
698                         produce_log,
699                         verbose,
700                         transfer_taxonomy );
701     }
702
703     public final static RIO executeAnalysis( final IteratingPhylogenyParser p,
704                                              final Phylogeny species_tree,
705                                              final ALGORITHM algorithm,
706                                              final REROOTING rerooting,
707                                              final String outgroup,
708                                              final int first,
709                                              final int last,
710                                              final boolean produce_log,
711                                              final boolean verbose,
712                                              final boolean transfer_taxonomy ) throws IOException, SDIException,
713                                              RIOException {
714         return new RIO( p,
715                         species_tree,
716                         algorithm,
717                         rerooting,
718                         outgroup,
719                         first,
720                         last,
721                         produce_log,
722                         verbose,
723                         transfer_taxonomy );
724     }
725
726     public final static RIO executeAnalysis( final Phylogeny[] gene_trees, final Phylogeny species_tree )
727             throws IOException, SDIException, RIOException {
728         return new RIO( gene_trees,
729                         species_tree,
730                         ALGORITHM.GSDIR,
731                         REROOTING.BY_ALGORITHM,
732                         null,
733                         DEFAULT_RANGE,
734                         DEFAULT_RANGE,
735                         false,
736                         false,
737                         false );
738     }
739
740     public final static RIO executeAnalysis( final Phylogeny[] gene_trees,
741                                              final Phylogeny species_tree,
742                                              final ALGORITHM algorithm,
743                                              final REROOTING rerooting,
744                                              final String outgroup,
745                                              final boolean produce_log,
746                                              final boolean verbose,
747                                              final boolean transfer_taxonomy ) throws IOException, SDIException,
748                                              RIOException {
749         return new RIO( gene_trees,
750                         species_tree,
751                         algorithm,
752                         rerooting,
753                         outgroup,
754                         DEFAULT_RANGE,
755                         DEFAULT_RANGE,
756                         produce_log,
757                         verbose,
758                         transfer_taxonomy );
759     }
760
761     public final static RIO executeAnalysis( final Phylogeny[] gene_trees,
762                                              final Phylogeny species_tree,
763                                              final ALGORITHM algorithm,
764                                              final REROOTING rerooting,
765                                              final String outgroup,
766                                              final int first,
767                                              final int last,
768                                              final boolean produce_log,
769                                              final boolean verbose,
770                                              final boolean transfer_taxonomy ) throws IOException, SDIException,
771                                              RIOException {
772         return new RIO( gene_trees,
773                         species_tree,
774                         algorithm,
775                         rerooting,
776                         outgroup,
777                         first,
778                         last,
779                         produce_log,
780                         verbose,
781                         transfer_taxonomy );
782     }
783
784     private final static void calculateOrthologTable( final Phylogeny g, final boolean sort, final int counter )
785             throws RIOException {
786         if ( counter == 0 ) {
787             final List<String> labels = new ArrayList<String>();
788             final Set<String> labels_set = new HashSet<String>();
789             for( final PhylogenyNode n : g.getExternalNodes() ) {
790                 final String label = obtainLabel( labels_set, n );
791                 labels_set.add( label );
792                 labels.add( label );
793             }
794             if ( sort ) {
795                 Collections.sort( labels );
796             }
797             _m = new IntMatrix( labels );
798         }
799         updateCounts( _m, counter, g );
800     }
801
802     private final static void checkPreconditions( final IteratingPhylogenyParser p,
803                                                   final Phylogeny species_tree,
804                                                   final REROOTING rerooting,
805                                                   final String outgroup,
806                                                   final int first,
807                                                   final int last ) throws RIOException, IOException {
808         final Phylogeny g0 = p.next();
809         if ( ( g0 == null ) || g0.isEmpty() ) {
810             throw new RIOException( "input file does not seem to contain any gene trees" );
811         }
812         if ( g0.getNumberOfExternalNodes() < 2 ) {
813             throw new RIOException( "input file does not seem to contain any useable gene trees" );
814         }
815         if ( !species_tree.isRooted() ) {
816             throw new RIOException( "species tree is not rooted" );
817         }
818         if ( !( ( last == DEFAULT_RANGE ) && ( first == DEFAULT_RANGE ) )
819                 && ( ( last < first ) || ( last < 0 ) || ( first < 0 ) ) ) {
820             throw new RIOException( "attempt to set range (0-based) of gene to analyze to: from " + first + " to "
821                     + last );
822         }
823         if ( ( rerooting == REROOTING.OUTGROUP ) && ForesterUtil.isEmpty( outgroup ) ) {
824             throw new RIOException( "outgroup not set for midpoint rooting" );
825         }
826         if ( ( rerooting != REROOTING.OUTGROUP ) && !ForesterUtil.isEmpty( outgroup ) ) {
827             throw new RIOException( "outgroup only used for midpoint rooting" );
828         }
829         if ( ( rerooting == REROOTING.MIDPOINT ) && ( PhylogenyMethods.calculateMaxDistanceToRoot( g0 ) <= 0 ) ) {
830             throw new RIOException( "attempt to use midpoint rooting on gene trees which seem to have no (positive) branch lengths (cladograms)" );
831         }
832         if ( rerooting == REROOTING.OUTGROUP ) {
833             try {
834                 g0.getNode( outgroup );
835             }
836             catch ( final IllegalArgumentException e ) {
837                 throw new RIOException( "cannot perform re-rooting by outgroup: " + e.getLocalizedMessage() );
838             }
839         }
840     }
841
842     private final static void checkPreconditions( final Phylogeny[] gene_trees,
843                                                   final Phylogeny species_tree,
844                                                   final REROOTING rerooting,
845                                                   final String outgroup,
846                                                   final int first,
847                                                   final int last ) throws RIOException {
848         if ( !species_tree.isRooted() ) {
849             throw new RIOException( "species tree is not rooted" );
850         }
851         if ( !( ( last == DEFAULT_RANGE ) && ( first == DEFAULT_RANGE ) )
852                 && ( ( last < first ) || ( last >= gene_trees.length ) || ( last < 0 ) || ( first < 0 ) ) ) {
853             throw new RIOException( "attempt to set range (0-based) of gene to analyze to: from " + first + " to "
854                     + last + " (out of " + gene_trees.length + ")" );
855         }
856         if ( ( rerooting == REROOTING.OUTGROUP ) && ForesterUtil.isEmpty( outgroup ) ) {
857             throw new RIOException( "outgroup not set for midpoint rooting" );
858         }
859         if ( ( rerooting != REROOTING.OUTGROUP ) && !ForesterUtil.isEmpty( outgroup ) ) {
860             throw new RIOException( "outgroup only used for midpoint rooting" );
861         }
862         if ( ( rerooting == REROOTING.MIDPOINT )
863                 && ( PhylogenyMethods.calculateMaxDistanceToRoot( gene_trees[ 0 ] ) <= 0 ) ) {
864             throw new RIOException( "attempt to use midpoint rooting on gene trees which seem to have no (positive) branch lengths (cladograms)" );
865         }
866         if ( rerooting == REROOTING.OUTGROUP ) {
867             try {
868                 gene_trees[ 0 ].getNode( outgroup );
869             }
870             catch ( final IllegalArgumentException e ) {
871                 throw new RIOException( "cannot perform re-rooting by outgroup: " + e.getLocalizedMessage() );
872             }
873         }
874     }
875
876     private final static String obtainLabel( final Set<String> labels_set, final PhylogenyNode n ) throws RIOException {
877         String label;
878         if ( n.getNodeData().isHasSequence() && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getName() ) ) {
879             label = n.getNodeData().getSequence().getName();
880         }
881         else if ( n.getNodeData().isHasSequence() && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getSymbol() ) ) {
882             label = n.getNodeData().getSequence().getSymbol();
883         }
884         else if ( n.getNodeData().isHasSequence()
885                 && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getGeneName() ) ) {
886             label = n.getNodeData().getSequence().getGeneName();
887         }
888         else if ( !ForesterUtil.isEmpty( n.getName() ) ) {
889             label = n.getName();
890         }
891         else {
892             throw new RIOException( "node " + n + " has no appropriate label" );
893         }
894         if ( labels_set.contains( label ) ) {
895             throw new RIOException( "label " + label + " is not unique" );
896         }
897         return label;
898     }
899
900     private final static Phylogeny[] parseGeneTrees( final File gene_trees_file ) throws FileNotFoundException,
901     IOException {
902         final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
903         final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
904         if ( p instanceof NHXParser ) {
905             final NHXParser nhx = ( NHXParser ) p;
906             nhx.setReplaceUnderscores( false );
907             nhx.setIgnoreQuotes( true );
908             nhx.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
909         }
910         else if ( p instanceof NexusPhylogeniesParser ) {
911             final NexusPhylogeniesParser nex = ( NexusPhylogeniesParser ) p;
912             nex.setReplaceUnderscores( false );
913             nex.setIgnoreQuotes( true );
914             nex.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
915         }
916         return factory.create( gene_trees_file, p );
917     }
918
919     private final static void removeSingleDescendentsNodes( final Phylogeny species_tree, final boolean verbose ) {
920         final int o = PhylogenyMethods.countNumberOfOneDescendantNodes( species_tree );
921         if ( o > 0 ) {
922             if ( verbose ) {
923                 System.out.println( "warning: species tree has " + o
924                                     + " internal nodes with only one descendent which are therefore going to be removed" );
925             }
926             PhylogenyMethods.deleteInternalNodesWithOnlyOneDescendent( species_tree );
927         }
928     }
929
930     private final static void updateCounts( final IntMatrix m, final int counter, final Phylogeny g )
931             throws RIOException {
932         PhylogenyMethods.preOrderReId( g );
933         final HashMap<String, PhylogenyNode> map = PhylogenyMethods.createNameToExtNodeMap( g );
934         for( int x = 0; x < m.size(); ++x ) {
935             final String mx = m.getLabel( x );
936             final PhylogenyNode nx = map.get( mx );
937             if ( nx == null ) {
938                 throw new RIOException( "node \"" + mx + "\" not present in gene tree #" + counter );
939             }
940             String my;
941             PhylogenyNode ny;
942             for( int y = 0; y < m.size(); ++y ) {
943                 my = m.getLabel( y );
944                 ny = map.get( my );
945                 if ( ny == null ) {
946                     throw new RIOException( "node \"" + my + "\" not present in gene tree #" + counter );
947                 }
948                 if ( !PhylogenyMethods.calculateLCAonTreeWithIdsInPreOrder( nx, ny ).isDuplication() ) {
949                     m.inreaseByOne( x, y );
950                 }
951             }
952         }
953     }
954
955     public enum REROOTING {
956         NONE, BY_ALGORITHM, MIDPOINT, OUTGROUP;
957     }
958 }