e094f69ed4313b6948f5cd4fdfb8e8968a7243c5
[jalview.git] / forester / java / src / org / forester / application / count_support.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // All rights reserved
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 //
23 // Contact: phylosoft @ gmail . com
24 // WWW: www.phylosoft.org/forester
25
26 package org.forester.application;
27
28 import java.io.File;
29 import java.io.IOException;
30 import java.util.ArrayList;
31 import java.util.Arrays;
32 import java.util.List;
33
34 import org.forester.io.parsers.PhylogenyParser;
35 import org.forester.io.writers.PhylogenyWriter;
36 import org.forester.phylogeny.Phylogeny;
37 import org.forester.phylogeny.PhylogenyMethods;
38 import org.forester.phylogeny.PhylogenyNode;
39 import org.forester.phylogeny.data.Confidence;
40 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
41 import org.forester.phylogeny.factories.PhylogenyFactory;
42 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
43 import org.forester.tools.SupportCount;
44 import org.forester.util.CommandLineArguments;
45 import org.forester.util.ForesterUtil;
46
47 public class count_support {
48
49     final static private String  PRG_NAME                = "count_support";
50     final static private String  PRG_VERSION             = "1.0";
51     final static private String  PRG_DATE                = "2008.03.04";
52     private final static boolean WRITE_EVALUATORS_AS_NHX = false;
53
54     public static void main( final String args[] ) {
55         ForesterUtil
56                 .printProgramInformation( count_support.PRG_NAME, count_support.PRG_VERSION, count_support.PRG_DATE );
57         if ( ( args.length < 3 ) || ( args.length > 7 ) ) {
58             System.out.println();
59             System.out.println( count_support.PRG_NAME + ": wrong number of arguments" );
60             System.out.println();
61             System.out
62                     .println( "Usage: \"count_support [options] <file containing phylogeny to be evaluated> <file with phylogenies to be used for evaluation> <outfile> [outfile for evaluator phylogenies, "
63                             + "always unstripped if -t=<d> option is used, otherwise strippedness is dependent on -s option]\"\n" );
64             System.out
65                     .println( " Options: -s strip external nodes from evaluator phylogenies not found in phylogeny to be evaluated" );
66             System.out.println( "        : -t=<d> threshold for similarity (0.0 to 1.0)" );
67             System.out.println( "        : -n no branch lengths in outfile for evaluator phylogenies" );
68             System.out.println();
69             System.exit( -1 );
70         }
71         CommandLineArguments cla = null;
72         try {
73             cla = new CommandLineArguments( args );
74         }
75         catch ( final Exception e ) {
76             ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
77         }
78         final List<String> allowed_options = new ArrayList<String>();
79         allowed_options.add( "s" );
80         allowed_options.add( "t" );
81         allowed_options.add( "n" );
82         final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
83         if ( dissallowed_options.length() > 0 ) {
84             ForesterUtil.fatalError( count_support.PRG_NAME, "Unknown option(s): " + dissallowed_options );
85         }
86         final File phylogeny_infile = cla.getFile( 0 );
87         final File evaluators_infile = cla.getFile( 1 );
88         final File phylogeny_outfile = cla.getFile( 2 );
89         File evaluators_outfile = null;
90         boolean branch_lengths_in_ev_out = true;
91         if ( cla.isOptionSet( "n" ) ) {
92             branch_lengths_in_ev_out = false;
93         }
94         if ( cla.getNumberOfNames() == 4 ) {
95             evaluators_outfile = cla.getFile( 3 );
96         }
97         else {
98             if ( !branch_lengths_in_ev_out ) {
99                 ForesterUtil.fatalError( count_support.PRG_NAME,
100                                          "Cannot use -n option if no outfile for evaluators specified" );
101             }
102         }
103         Phylogeny p = null;
104         Phylogeny[] ev = null;
105         try {
106             final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
107             final PhylogenyParser pp = ForesterUtil.createParserDependingOnFileType( phylogeny_infile, true );
108             p = factory.create( phylogeny_infile, pp )[ 0 ];
109         }
110         catch ( final Exception e ) {
111             ForesterUtil.fatalError( count_support.PRG_NAME,
112                                      "Could not read \"" + phylogeny_infile + "\" [" + e.getMessage() + "]" );
113         }
114         try {
115             final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
116             final PhylogenyParser pp = ForesterUtil.createParserDependingOnFileType( evaluators_infile, true );
117             ev = factory.create( evaluators_infile, pp );
118         }
119         catch ( final Exception e ) {
120             ForesterUtil.fatalError( count_support.PRG_NAME,
121                                      "Could not read \"" + evaluators_infile + "\" [" + e.getMessage() + "]" );
122         }
123         boolean strip = false;
124         if ( cla.isOptionSet( "s" ) ) {
125             strip = true;
126         }
127         double threshhold = -1.0;
128         if ( cla.isOptionSet( "t" ) ) {
129             try {
130                 threshhold = cla.getOptionValueAsDouble( "t" );
131             }
132             catch ( final Exception e ) {
133                 ForesterUtil.fatalError( count_support.PRG_NAME, "error in command line arguments: " + e.getMessage() );
134             }
135             if ( ( threshhold < 0 ) || ( threshhold > 1.0 ) ) {
136                 ForesterUtil.fatalError( count_support.PRG_NAME,
137                                          "support threshold has to be between 0.0 and 1.0 (inclusive)" );
138             }
139         }
140         List<Phylogeny> evaluator_phylogenies_above_threshold = null;
141         try {
142             if ( threshhold >= 0 ) {
143                 evaluator_phylogenies_above_threshold = SupportCount.count( p, ev, strip, threshhold, true );
144                 if ( evaluator_phylogenies_above_threshold.size() < 1 ) {
145                     ForesterUtil.fatalError( "count_support", "appears like threshold for similarity is set too high" );
146                 }
147             }
148             else {
149                 SupportCount.count( p, ev, strip, true );
150             }
151         }
152         catch ( final Exception e ) {
153             ForesterUtil.fatalError( count_support.PRG_NAME, "Failure during support counting: " + e.getMessage() );
154         }
155         if ( threshhold >= 0 ) {
156             count_support.normalizeSupport( p, 100, evaluator_phylogenies_above_threshold.size() );
157             System.out.println( evaluator_phylogenies_above_threshold.size() + " out of " + ev.length
158                     + " evaluator phylogenies are above threshold of " + threshhold );
159         }
160         try {
161             final PhylogenyWriter w = new PhylogenyWriter();
162             w.toPhyloXML( phylogeny_outfile, p, 1 );
163         }
164         catch ( final IOException e ) {
165             ForesterUtil.fatalError( count_support.PRG_NAME, "Failure to write output [" + e.getMessage() + "]" );
166         }
167         System.out.println();
168         System.out.println( "Wrote phylogeny with support values to: " + phylogeny_outfile );
169         if ( evaluators_outfile != null ) {
170             try {
171                 final PhylogenyWriter w = new PhylogenyWriter();
172                 if ( evaluator_phylogenies_above_threshold != null ) {
173                     System.out.println( "Writing " + evaluator_phylogenies_above_threshold.size()
174                             + " evaluator phylogenies above threshold of " + threshhold + " to: " + evaluators_outfile );
175                     if ( count_support.WRITE_EVALUATORS_AS_NHX ) {
176                         w.toNewHampshireX( evaluator_phylogenies_above_threshold, evaluators_outfile, ";"
177                                 + ForesterUtil.getLineSeparator() );
178                     }
179                     else {
180                         w.toNewHampshire( evaluator_phylogenies_above_threshold,
181                                           true,
182                                           branch_lengths_in_ev_out,
183                                           evaluators_outfile,
184                                           ";" + ForesterUtil.getLineSeparator() );
185                     }
186                 }
187                 else {
188                     System.out.println( "Writing " + ev.length + " evaluator phylogenies to :" + evaluators_outfile );
189                     if ( count_support.WRITE_EVALUATORS_AS_NHX ) {
190                         w.toNewHampshireX( Arrays.asList( ev ),
191                                            evaluators_outfile,
192                                            ";" + ForesterUtil.getLineSeparator() );
193                     }
194                     else {
195                         w.toNewHampshire( Arrays.asList( ev ), true, branch_lengths_in_ev_out, evaluators_outfile, ";"
196                                 + ForesterUtil.getLineSeparator() );
197                     }
198                 }
199             }
200             catch ( final IOException e ) {
201                 ForesterUtil.fatalError( count_support.PRG_NAME, "Failure to write output [" + e.getMessage() + "]" );
202             }
203         }
204         System.out.println();
205         System.out.println( "Done." );
206         System.out.println();
207     }
208
209     private static void normalizeSupport( final Phylogeny p, final double normalized_max, final int number_phylos ) {
210         double min = Double.MAX_VALUE;
211         double max = -Double.MAX_VALUE;
212         double sum = 0.0;
213         int n = 0;
214         for( final PhylogenyNodeIterator iter = p.iteratorPostorder(); iter.hasNext(); ) {
215             final PhylogenyNode node = iter.next();
216             if ( !node.isRoot() && !node.isExternal() ) {
217                 final double b = PhylogenyMethods.getConfidenceValue( node );
218                 if ( b > max ) {
219                     max = b;
220                 }
221                 if ( ( b >= 0 ) && ( b < min ) ) {
222                     min = b;
223                 }
224                 sum += b;
225                 ++n;
226             }
227         }
228         double av = sum / n;
229         System.out.println( "Max support before normalization is    : " + max );
230         System.out.println( "Min support before normalization is    : " + min );
231         System.out.println( "Average support before normalization is: " + av + " (=" + sum + "/" + n + ")" );
232         System.out.println( "Normalizing so that theoretical maximum support value is: " + normalized_max );
233         System.out.println( "Number of phylogenies used in support analysis: " + number_phylos );
234         final double f = normalized_max / number_phylos;
235         min = Double.MAX_VALUE;
236         max = -Double.MAX_VALUE;
237         sum = 0.0;
238         n = 0;
239         for( final PhylogenyNodeIterator iter = p.iteratorPostorder(); iter.hasNext(); ) {
240             final PhylogenyNode node = iter.next();
241             if ( node.isRoot() || node.isExternal() ) {
242                 PhylogenyMethods.setBootstrapConfidence( node, Confidence.CONFIDENCE_DEFAULT_VALUE );
243             }
244             else {
245                 double b = PhylogenyMethods.getConfidenceValue( node );
246                 b = f * b;
247                 PhylogenyMethods.setBootstrapConfidence( node, b );
248                 if ( b > max ) {
249                     max = b;
250                 }
251                 if ( ( b >= 0 ) && ( b < min ) ) {
252                     min = b;
253                 }
254                 sum += b;
255                 ++n;
256             }
257         }
258         av = sum / n;
259         System.out.println( "Max support after normalization is    : " + max );
260         System.out.println( "Min support after normalization is    : " + min );
261         System.out.println( "Average support after normalization is: " + av + " (=" + sum + "/" + n + ")" );
262     }
263 }