0c1a387ce0967c01664748b59bf3675b43457840
[jalview.git] / forester / java / src / org / forester / application / gsdi.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // All rights reserved
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 //
23 // Contact: phylosoft @ gmail . com
24 // WWW: www.phylosoft.org/forester
25
26 package org.forester.application;
27
28 import java.io.File;
29 import java.io.IOException;
30 import java.io.Writer;
31 import java.text.SimpleDateFormat;
32 import java.util.ArrayList;
33 import java.util.Date;
34 import java.util.List;
35
36 import org.forester.io.parsers.PhylogenyParser;
37 import org.forester.io.parsers.nhx.NHXParser;
38 import org.forester.io.parsers.phyloxml.PhyloXmlParser;
39 import org.forester.io.parsers.util.ParserUtils;
40 import org.forester.io.writers.PhylogenyWriter;
41 import org.forester.phylogeny.Phylogeny;
42 import org.forester.phylogeny.PhylogenyMethods;
43 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
44 import org.forester.phylogeny.factories.PhylogenyFactory;
45 import org.forester.sdi.GSDI;
46 import org.forester.sdi.SDI;
47 import org.forester.sdi.SDI.TaxonomyComparisonBase;
48 import org.forester.sdi.SDIse;
49 import org.forester.util.CommandLineArguments;
50 import org.forester.util.ForesterUtil;
51
52 public final class gsdi {
53
54     private enum BASE_ALGORITHM {
55         GSDI, SDI
56     }
57     final static public boolean REPLACE_UNDERSCORES_IN_NH_SPECIES_TREE     = true;
58     final static private String STRIP_OPTION                               = "s";
59     final static private String ALLOW_STRIPPING_OF_GENE_TREE_OPTION        = "g";
60     final static private String SDI_OPTION                                 = "b";
61     final static private String MOST_PARSIMONIOUS_OPTION                   = "m";
62     final static private String GUESS_FORMAT_OF_SPECIES_TREE               = "q";
63     final static private String HELP_OPTION_1                              = "help";
64     final static private String HELP_OPTION_2                              = "h";
65     final static private String DEFAULT_OUTFILE_SUFFIX                     = "_gsdi_out.phylo.xml";
66     final static private String SUFFIX_FOR_LIST_OF_STIPPED_GENE_TREE_NODES = "_stripped_gene_tree_nodes.txt";
67     final static private String LOGFILE_SUFFIX                             = "_gsdi_log.txt";
68     final static private String PRG_NAME                                   = "gsdi";
69     final static private String PRG_VERSION                                = "0.901";
70     final static private String PRG_DATE                                   = "120608";
71     final static private String PRG_DESC                                   = "general speciation duplication inference";
72     final static private String E_MAIL                                     = "phylosoft@gmail.com";
73     final static private String WWW                                        = "www.phylosoft.org/forester";
74
75     public static void main( final String args[] ) {
76         try {
77             ForesterUtil.printProgramInformation( PRG_NAME,
78                                                   PRG_DESC,
79                                                   PRG_VERSION,
80                                                   PRG_DATE,
81                                                   E_MAIL,
82                                                   WWW,
83                                                   ForesterUtil.getForesterLibraryInformation() );
84             CommandLineArguments cla = null;
85             try {
86                 cla = new CommandLineArguments( args );
87             }
88             catch ( final Exception e ) {
89                 ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
90             }
91             if ( cla.isOptionSet( gsdi.HELP_OPTION_1 ) || cla.isOptionSet( gsdi.HELP_OPTION_2 ) ) {
92                 System.out.println();
93                 gsdi.print_help();
94                 System.exit( 0 );
95             }
96             else if ( ( args.length < 2 ) || ( cla.getNumberOfNames() < 2 ) || ( cla.getNumberOfNames() > 3 ) ) {
97                 System.out.println();
98                 System.out.println( "Wrong number of arguments." );
99                 System.out.println();
100                 gsdi.print_help();
101                 System.exit( -1 );
102             }
103             final List<String> allowed_options = new ArrayList<String>();
104             allowed_options.add( gsdi.STRIP_OPTION );
105             allowed_options.add( gsdi.SDI_OPTION );
106             allowed_options.add( gsdi.GUESS_FORMAT_OF_SPECIES_TREE );
107             allowed_options.add( gsdi.MOST_PARSIMONIOUS_OPTION );
108             allowed_options.add( gsdi.ALLOW_STRIPPING_OF_GENE_TREE_OPTION );
109             final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
110             if ( dissallowed_options.length() > 0 ) {
111                 ForesterUtil.fatalError( gsdi.PRG_NAME, "unknown option(s): " + dissallowed_options );
112             }
113             execute( cla );
114         }
115         catch ( final IOException e ) {
116             ForesterUtil.fatalError( gsdi.PRG_NAME, e.getMessage() );
117         }
118     }
119
120     private static void execute( final CommandLineArguments cla ) throws IOException {
121         BASE_ALGORITHM base_algorithm = BASE_ALGORITHM.GSDI;
122         boolean strip = false;
123         boolean most_parsimonous_duplication_model = false;
124         boolean species_tree_in_phyloxml = true;
125         boolean allow_stripping_of_gene_tree = false;
126         if ( cla.isOptionSet( gsdi.STRIP_OPTION ) ) {
127             strip = true;
128         }
129         if ( cla.isOptionSet( gsdi.SDI_OPTION ) ) {
130             base_algorithm = BASE_ALGORITHM.SDI;
131         }
132         if ( cla.isOptionSet( gsdi.MOST_PARSIMONIOUS_OPTION ) ) {
133             if ( base_algorithm != BASE_ALGORITHM.GSDI ) {
134                 ForesterUtil.fatalError( gsdi.PRG_NAME, "Can only use most parsimonious duplication mode with GSDI" );
135             }
136             most_parsimonous_duplication_model = true;
137         }
138         if ( cla.isOptionSet( gsdi.GUESS_FORMAT_OF_SPECIES_TREE ) ) {
139             species_tree_in_phyloxml = false;
140         }
141         if ( cla.isOptionSet( gsdi.ALLOW_STRIPPING_OF_GENE_TREE_OPTION ) ) {
142             if ( base_algorithm != BASE_ALGORITHM.GSDI ) {
143                 ForesterUtil.fatalError( gsdi.PRG_NAME, "Can only allow stripping of gene tree with GSDI" );
144             }
145             allow_stripping_of_gene_tree = true;
146         }
147         Phylogeny species_tree = null;
148         Phylogeny gene_tree = null;
149         File gene_tree_file = null;
150         File species_tree_file = null;
151         File out_file = null;
152         File log_file = null;
153         Writer log_writer = null;
154         try {
155             gene_tree_file = cla.getFile( 0 );
156             species_tree_file = cla.getFile( 1 );
157             if ( cla.getNumberOfNames() == 3 ) {
158                 out_file = cla.getFile( 2 );
159             }
160             else {
161                 out_file = new File( ForesterUtil.removeSuffix( gene_tree_file.toString() ) + DEFAULT_OUTFILE_SUFFIX );
162             }
163             log_file = new File( ForesterUtil.removeSuffix( out_file.toString() ) + LOGFILE_SUFFIX );
164         }
165         catch ( final IllegalArgumentException e ) {
166             ForesterUtil.fatalError( gsdi.PRG_NAME, "error in command line: " + e.getMessage() );
167         }
168         if ( ForesterUtil.isReadableFile( gene_tree_file ) != null ) {
169             ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isReadableFile( gene_tree_file ) );
170         }
171         if ( ForesterUtil.isReadableFile( species_tree_file ) != null ) {
172             ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isReadableFile( species_tree_file ) );
173         }
174         if ( ForesterUtil.isWritableFile( out_file ) != null ) {
175             ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isWritableFile( out_file ) );
176         }
177         if ( ForesterUtil.isWritableFile( log_file ) != null ) {
178             ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isWritableFile( log_file ) );
179         }
180         try {
181             log_writer = ForesterUtil.createBufferedWriter( log_file );
182         }
183         catch ( final IOException e ) {
184             ForesterUtil.fatalError( gsdi.PRG_NAME, "Failed to create [" + log_file + "]: " + e.getMessage() );
185         }
186         try {
187             final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
188             gene_tree = factory.create( gene_tree_file, new PhyloXmlParser() )[ 0 ];
189         }
190         catch ( final IOException e ) {
191             ForesterUtil.fatalError( gsdi.PRG_NAME,
192                                      "Failed to read gene tree from [" + gene_tree_file + "]: " + e.getMessage() );
193         }
194         try {
195             final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
196             if ( species_tree_in_phyloxml ) {
197                 species_tree = factory.create( species_tree_file, new PhyloXmlParser() )[ 0 ];
198             }
199             else {
200                 final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( species_tree_file, true );
201                 if ( REPLACE_UNDERSCORES_IN_NH_SPECIES_TREE && ( p instanceof NHXParser ) ) {
202                     ( ( NHXParser ) p ).setReplaceUnderscores( true );
203                 }
204                 species_tree = factory.create( species_tree_file, p )[ 0 ];
205                 final TaxonomyComparisonBase comp_base = GSDI.determineTaxonomyComparisonBase( gene_tree );
206                 switch ( comp_base ) {
207                     case SCIENTIFIC_NAME:
208                         PhylogenyMethods
209                                 .transferNodeNameToField( species_tree,
210                                                           PhylogenyMethods.PhylogenyNodeField.TAXONOMY_ID_UNIPROT_1 );
211                         break;
212                     case CODE:
213                         PhylogenyMethods.transferNodeNameToField( species_tree,
214                                                                   PhylogenyMethods.PhylogenyNodeField.TAXONOMY_CODE );
215                         break;
216                     case ID:
217                         PhylogenyMethods.transferNodeNameToField( species_tree,
218                                                                   PhylogenyMethods.PhylogenyNodeField.TAXONOMY_ID );
219                         break;
220                     default:
221                         ForesterUtil.fatalError( gsdi.PRG_NAME, "unable to determine comparison base" );
222                 }
223             }
224         }
225         catch ( final IOException e ) {
226             ForesterUtil.fatalError( gsdi.PRG_NAME,
227                                      "Failed to read species tree from [" + gene_tree_file + "]: " + e.getMessage() );
228         }
229         gene_tree.setRooted( true );
230         species_tree.setRooted( true );
231         if ( !gene_tree.isCompletelyBinary() ) {
232             ForesterUtil.fatalError( gsdi.PRG_NAME, "gene tree (\"" + gene_tree_file + "\") is not completely binary" );
233         }
234         if ( base_algorithm != BASE_ALGORITHM.GSDI ) {
235             if ( !species_tree.isCompletelyBinary() ) {
236                 ForesterUtil.fatalError( gsdi.PRG_NAME, "species tree (\"" + species_tree_file
237                         + "\") is not completely binary, use GSDI instead" );
238             }
239         }
240         // For timing.
241         // gene_tree = Helper.createBalancedTree( 10 );
242         // species_tree = Helper.createBalancedTree( 13 );
243         // species_tree = Helper.createUnbalancedTree( 1024 );
244         // gene_tree = Helper.createUnbalancedTree( 8192 );
245         // species_tree = gene_tree.copyTree();
246         // gene_tree = species_tree.copyTree();
247         // Helper.numberSpeciesInOrder( species_tree );
248         // Helper.numberSpeciesInOrder( gene_tree );
249         // Helper.randomizeSpecies( 1, 8192, gene_tree );
250         // Helper.intervalNumberSpecies( gene_tree, 4096 );
251         // Helper.numberSpeciesInDescOrder( gene_tree );
252         log_writer.write( PRG_NAME + " " + PRG_VERSION + " " + PRG_DATE );
253         log_writer.write( ForesterUtil.LINE_SEPARATOR );
254         log_writer.write( PRG_DESC );
255         log_writer.write( ForesterUtil.LINE_SEPARATOR );
256         log_writer.write( PRG_DESC );
257         log_writer.write( ForesterUtil.LINE_SEPARATOR );
258         log_writer.write( ForesterUtil.LINE_SEPARATOR );
259         log_writer.write( new SimpleDateFormat( "yyyyMMdd HH:mm:ss" ).format( new Date() ) );
260         log_writer.write( ForesterUtil.LINE_SEPARATOR );
261         System.out.println();
262         System.out.println( "Strip species tree: " + strip );
263         log_writer.write( "Strip species tree: " + strip );
264         log_writer.write( ForesterUtil.LINE_SEPARATOR );
265         SDI sdi = null;
266         final long start_time = new Date().getTime();
267         try {
268             if ( base_algorithm == BASE_ALGORITHM.GSDI ) {
269                 System.out.println();
270                 System.out.println( "Use most parsimonous duplication model: " + most_parsimonous_duplication_model );
271                 System.out.println( "Allow stripping of gene tree nodes    : " + allow_stripping_of_gene_tree );
272                 log_writer.write( "Use most parsimonous duplication model: " + most_parsimonous_duplication_model );
273                 log_writer.write( ForesterUtil.LINE_SEPARATOR );
274                 log_writer.write( "Allow stripping of gene tree nodes    : " + allow_stripping_of_gene_tree );
275                 log_writer.write( ForesterUtil.LINE_SEPARATOR );
276                 sdi = new GSDI( gene_tree,
277                                 species_tree,
278                                 most_parsimonous_duplication_model,
279                                 allow_stripping_of_gene_tree );
280             }
281             else {
282                 System.out.println();
283                 System.out.println( "Using SDIse algorithm" );
284                 log_writer.write( "Using SDIse algorithm" );
285                 log_writer.write( ForesterUtil.LINE_SEPARATOR );
286                 sdi = new SDIse( gene_tree, species_tree );
287             }
288         }
289         catch ( final Exception e ) {
290             ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() );
291         }
292         System.out.println();
293         System.out.println( "Running time (excluding I/O): " + ( new Date().getTime() - start_time ) + "ms" );
294         log_writer.write( "Running time (excluding I/O): " + ( new Date().getTime() - start_time ) + "ms" );
295         log_writer.write( ForesterUtil.LINE_SEPARATOR );
296         try {
297             final PhylogenyWriter writer = new PhylogenyWriter();
298             writer.toPhyloXML( out_file, gene_tree, 0 );
299         }
300         catch ( final IOException e ) {
301             ForesterUtil.fatalError( PRG_NAME, "Failed to write to [" + out_file + "]: " + e.getMessage() );
302         }
303         System.out.println();
304         System.out.println( "Successfully wrote resulting gene tree to: " + out_file );
305         System.out.println();
306         log_writer.write( "Wrote resulting gene tree to: " + out_file );
307         log_writer.write( ForesterUtil.LINE_SEPARATOR );
308         if ( base_algorithm == BASE_ALGORITHM.SDI ) {
309             sdi.computeMappingCostL();
310             System.out.println( "Mapping cost                    : " + sdi.computeMappingCostL() );
311             log_writer.write( "Mapping cost                    : " + sdi.computeMappingCostL() );
312             log_writer.write( ForesterUtil.LINE_SEPARATOR );
313         }
314         System.out.println( "Number of duplications          : " + sdi.getDuplicationsSum() );
315         log_writer.write( "Number of duplications          : " + sdi.getDuplicationsSum() );
316         log_writer.write( ForesterUtil.LINE_SEPARATOR );
317         if ( ( base_algorithm == BASE_ALGORITHM.GSDI ) && !most_parsimonous_duplication_model ) {
318             final int duplications = ( ( GSDI ) sdi ).getSpeciationOrDuplicationEventsSum();
319             System.out.println( "Number of potential duplications: " + duplications );
320             log_writer.write( "Number of potential duplications: " + duplications );
321             log_writer.write( ForesterUtil.LINE_SEPARATOR );
322         }
323         if ( base_algorithm == BASE_ALGORITHM.GSDI ) {
324             final int spec = ( ( GSDI ) sdi ).getSpeciationsSum();
325             System.out.println( "Number of speciations            : " + spec );
326             log_writer.write( "Number of speciations            : " + spec );
327             log_writer.write( ForesterUtil.LINE_SEPARATOR );
328         }
329         System.out.println();
330         //  some stat on gene tree:
331         //      filename, name
332         //      number of external nodes, strppided nodes
333         //  some stats on sepcies tree, external nodes,
334         //  filename, name
335         //  internal nodes
336         //  how many of which are polytomies
337         //wrote log file to
338         //  if ( allow_stripping_of_gene_tree ) {
339         //      stripped x nodes, y external nodes remain
340         //  }
341     }
342
343     private static void print_help() {
344         System.out.println( "Usage: " + gsdi.PRG_NAME
345                 + " [-options] <gene tree in phyloXML format> <species tree in phyloXML format> [outfile]" );
346         System.out.println();
347         System.out.println( "Options:" );
348         //    System.out.println( " -" + gsdi.ALLOW_STRIPPING_OF_GENE_TREE_OPTION + ": to allow stripping of gene tree nodes without a matching species in the species tree (writes list of stripped nodes to " + );
349         System.out.println( " -" + gsdi.STRIP_OPTION
350                 + ": to strip the species tree of unneeded species prior to duplication inference" );
351         System.out.println( " -" + gsdi.SDI_OPTION + ": to use SDI algorithm instead of GSDI algorithm" );//TODO gsdi.ALLOW_STRIPPING_OF_GENE_TREE_OPTION not allowed
352         System.out.println( " -" + gsdi.MOST_PARSIMONIOUS_OPTION
353                 + ": use most parimonious duplication model for GSDI: " );
354         System.out.println( "     assign nodes as speciations which would otherwise be assiged" );
355         System.out.println( "     as unknown because of polytomies in the species tree" );
356         System.out.println( " -" + gsdi.GUESS_FORMAT_OF_SPECIES_TREE
357                 + ": to allow species tree in other formats than phyloXML (i.e. Newick, NHX, Nexus)" );
358         System.out.println();
359         System.out.println( "Species tree:" );
360         System.out
361                 .println( " In phyloXML format (unless option -"
362                         + gsdi.GUESS_FORMAT_OF_SPECIES_TREE
363                         + " is used, in which case the species matching between gene tree and species tree must be via scientific name), with taxonomy data in appropriate fields" );
364         System.out.println();
365         System.out.println( "Gene tree:" );
366         System.out.println( " In phyloXM format, with taxonomy and sequence data in appropriate fields" );
367         System.out.println();
368         System.out.println( "Example:" );
369         //  System.out.println( "gsdi 
370         //    System.out.println();
371         System.out.println( "Note -- GSDI algorithm is under development" );
372         System.out.println();
373     }
374 }