2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26 package org.forester.application;
29 import java.io.FileInputStream;
30 import java.io.IOException;
31 import java.util.ArrayList;
32 import java.util.HashMap;
33 import java.util.Iterator;
34 import java.util.List;
36 import java.util.Map.Entry;
38 import org.forester.io.parsers.FastaParser;
39 import org.forester.io.parsers.PhylogenyParser;
40 import org.forester.io.parsers.util.ParserUtils;
41 import org.forester.io.writers.PhylogenyWriter;
42 import org.forester.phylogeny.Phylogeny;
43 import org.forester.phylogeny.PhylogenyMethods;
44 import org.forester.phylogeny.PhylogenyMethods.DESCENDANT_SORT_PRIORITY;
45 import org.forester.phylogeny.data.Identifier;
46 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
47 import org.forester.phylogeny.factories.PhylogenyFactory;
48 import org.forester.sequence.MolecularSequence;
49 import org.forester.tools.PhylogenyDecorator;
50 import org.forester.tools.PhylogenyDecorator.FIELD;
51 import org.forester.util.BasicTable;
52 import org.forester.util.BasicTableParser;
53 import org.forester.util.CommandLineArguments;
54 import org.forester.util.ForesterUtil;
/**
 * Command line application that, as implemented in main(), reads
 * phylogenies from an input file, decorates them with data taken from a
 * mapping table or a FASTA file, and writes the result as phyloXML.
 *
 * The constants below fall into three groups: values accepted by the -f
 * option, names of command line options, and program identification.
 */
56 public final class decorator {
// Values accepted by the -f option. main() compares the option value
// against each of these to select the PhylogenyDecorator.FIELD to decorate.
// (DS_FILED selects FIELD.DOMAIN_STRUCTURE.)
58 private static final String SEQUENCE_NAME_FIELD = "s";
59 private static final String MOL_SEQ = "m";
60 private static final String TAXONOMY_CODE_FIELD = "c";
61 private static final String TAXONOMY_SCIENTIFIC_NAME_FIELD = "sn";
62 private static final String DS_FILED = "d";
63 private static final String SEQUENCE_ANNOTATION_DESC = "a";
64 private static final String NODE_NAME_FIELD = "n";
// Names of command line options (all of them are registered as allowed
// options in main()). Note that the strings s, c and sn are used both
// as an -f value above and as an option name here; the two uses are
// independent.
65 final static private String PICKY_OPTION = "p";
66 final static private String FIELD_OPTION = "f";
67 final static private String TRIM_AFTER_TILDE_OPTION = "t";
68 final static private String VERBOSE_OPTION = "ve";
69 final static private String TREE_NAME_OPTION = "pn";
70 final static private String TREE_ID_OPTION = "pi";
71 final static private String TREE_DESC_OPTION = "pd";
72 final static private String MIDPOINT_ROOT_OPTION = "mp";
73 final static private String ORDER_TREE_OPTION = "or";
74 final static private String EXTRACT_BRACKETED_SCIENTIC_NAME_OPTION = "sn";
75 final static private String EXTRACT_BRACKETED_TAXONOMIC_CODE_OPTION = "tc";
76 final static private String CUT_NAME_AFTER_FIRST_SPACE_OPTION = "c";
77 final static private String ADVANCED_TABLE_OPTION = "table";
// Options selecting the key and value columns of the mapping table.
78 final static private String KEY_COLUMN = "k";
79 final static private String VALUE_COLUMN = "v";
// Option for the mapping file column separator, and the separator used
// when that option is not given (tab).
80 final static private String MAPPING_FILE_SEPARATOR_OPTION = "s";
81 final static private char MAPPING_FILE_SEPARATOR_DEFAULT = '\t';
// Program identification, passed to ForesterUtil.printProgramInformation
// and used as the prefix of program and error messages.
82 final static private String PRG_NAME = "decorator";
83 final static private String PRG_VERSION = "1.16";
84 final static private String PRG_DATE = "131113";
/**
 * Program entry point: validates the command line, reads the input
 * phylogenies and the mapping data, decorates every phylogeny, optionally
 * midpoint-roots and orders it, and writes the result as phyloXML.
 *
 * The three file arguments read below are, in order: phylogenies infile,
 * mapping table or fasta file, phylogenies outfile.
 *
 * Problems are reported through ForesterUtil.fatalError and
 * ForesterUtil.unexpectedFatalError.
 * NOTE(review): presumably those helpers terminate the program; the code
 * following each call relies on that (for example, cla is dereferenced
 * after its construction may have failed) -- confirm against ForesterUtil.
 *
 * NOTE(review): this listing appears to have lost lines (block
 * delimiters, try and else keywords, the declarations of key_column and
 * tree_id, and the statements guarded by several advanced_table checks).
 * The comments below describe only the statements that are present.
 *
 * @param args the command line arguments
 */
86 public static void main( final String args[] ) {
87 ForesterUtil.printProgramInformation( decorator.PRG_NAME, decorator.PRG_VERSION, decorator.PRG_DATE );
// Between 4 and 13 raw arguments are accepted; otherwise the usage text is printed.
89 if ( ( args.length < 4 ) || ( args.length > 13 ) ) {
90 decorator.argumentsError();
92 CommandLineArguments cla = null;
94 cla = new CommandLineArguments( args );
96 catch ( final Exception e ) {
97 ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
// Three or four names are accepted, but only indices 0 to 2 are read below.
99 if ( ( cla.getNumberOfNames() < 3 ) || ( cla.getNumberOfNames() > 4 ) ) {
100 decorator.argumentsError();
102 final File phylogenies_infile = cla.getFile( 0 );
103 final File mapping_infile = cla.getFile( 1 );
104 final File phylogenies_outfile = cla.getFile( 2 );
// An existing output file is never overwritten.
105 if ( phylogenies_outfile.exists() ) {
106 ForesterUtil.fatalError( PRG_NAME, "[" + phylogenies_outfile + "] already exists" );
// Both input files must be readable; isReadableFile returns an error text that is empty on success.
108 String err = ForesterUtil.isReadableFile( phylogenies_infile );
109 if ( !ForesterUtil.isEmpty( err ) ) {
110 ForesterUtil.fatalError( PRG_NAME, err );
112 err = ForesterUtil.isReadableFile( mapping_infile );
113 if ( !ForesterUtil.isEmpty( err ) ) {
114 ForesterUtil.fatalError( PRG_NAME, err );
// Any option that is not in this list is reported as unknown.
116 final List<String> allowed_options = new ArrayList<String>();
117 allowed_options.add( decorator.ADVANCED_TABLE_OPTION );
118 allowed_options.add( decorator.PICKY_OPTION );
119 allowed_options.add( decorator.FIELD_OPTION );
120 allowed_options.add( decorator.CUT_NAME_AFTER_FIRST_SPACE_OPTION );
121 allowed_options.add( decorator.KEY_COLUMN );
122 allowed_options.add( decorator.VALUE_COLUMN );
123 allowed_options.add( decorator.MAPPING_FILE_SEPARATOR_OPTION );
124 allowed_options.add( decorator.EXTRACT_BRACKETED_SCIENTIC_NAME_OPTION );
125 allowed_options.add( decorator.EXTRACT_BRACKETED_TAXONOMIC_CODE_OPTION );
126 allowed_options.add( decorator.TREE_NAME_OPTION );
127 allowed_options.add( decorator.TREE_ID_OPTION );
128 allowed_options.add( decorator.TREE_DESC_OPTION );
129 allowed_options.add( decorator.TRIM_AFTER_TILDE_OPTION );
130 allowed_options.add( decorator.ORDER_TREE_OPTION );
131 allowed_options.add( decorator.MIDPOINT_ROOT_OPTION );
132 allowed_options.add( decorator.VERBOSE_OPTION );
133 final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
134 if ( dissallowed_options.length() > 0 ) {
135 ForesterUtil.fatalError( decorator.PRG_NAME, "unknown option(s): " + dissallowed_options );
// The -f option is mandatory unless -table is used.
137 final boolean advanced_table = cla.isOptionSet( decorator.ADVANCED_TABLE_OPTION );
138 if ( !advanced_table ) {
139 final List<String> mandatory_options = new ArrayList<String>();
140 mandatory_options.add( decorator.FIELD_OPTION );
141 final String missing_options = cla.validateMandatoryOptionsAsString( mandatory_options );
142 if ( missing_options.length() > 0 ) {
143 ForesterUtil.fatalError( decorator.PRG_NAME, "missing option(s): " + missing_options );
146 final boolean picky = cla.isOptionSet( decorator.PICKY_OPTION );
// Column separator of the mapping file: tab unless the separator option is given.
147 char separator = decorator.MAPPING_FILE_SEPARATOR_DEFAULT;
148 if ( cla.isOptionSet( decorator.MAPPING_FILE_SEPARATOR_OPTION ) ) {
149 if ( advanced_table ) {
152 separator = cla.getOptionValueAsChar( decorator.MAPPING_FILE_SEPARATOR_OPTION );
// Defaults; the option handling below overrides them.
155 int value_column = 1;
156 String field_str = "";
157 FIELD field = FIELD.NODE_NAME;
158 boolean cut_name_after_space = false;
159 boolean extract_bracketed_scientific_name = false;
160 boolean extract_bracketed_tax_code = false;
161 boolean trim_after_tilde = false;
162 boolean order_tree = false;
163 boolean midpoint_root = false;
164 boolean verbose = false;
165 String tree_name = "";
167 String tree_desc = "";
// Optional name, identifier and description for the phylogeny.
169 if ( cla.isOptionSet( decorator.TREE_NAME_OPTION ) ) {
170 tree_name = cla.getOptionValueAsCleanString( decorator.TREE_NAME_OPTION );
172 if ( cla.isOptionSet( decorator.TREE_ID_OPTION ) ) {
173 tree_id = cla.getOptionValueAsCleanString( decorator.TREE_ID_OPTION );
175 if ( cla.isOptionSet( decorator.TREE_DESC_OPTION ) ) {
176 tree_desc = cla.getOptionValueAsCleanString( decorator.TREE_DESC_OPTION );
// Each of the following options is checked against advanced_table before
// it takes effect.
// NOTE(review): the statements guarded by those checks are not visible
// here; the usage text says these options are only available without -table.
178 if ( cla.isOptionSet( decorator.EXTRACT_BRACKETED_SCIENTIC_NAME_OPTION ) ) {
179 if ( advanced_table ) {
182 extract_bracketed_scientific_name = true;
184 if ( cla.isOptionSet( decorator.EXTRACT_BRACKETED_TAXONOMIC_CODE_OPTION ) ) {
185 if ( advanced_table ) {
188 extract_bracketed_tax_code = true;
190 if ( cla.isOptionSet( decorator.KEY_COLUMN ) ) {
191 if ( advanced_table ) {
194 key_column = cla.getOptionValueAsInt( decorator.KEY_COLUMN );
196 if ( cla.isOptionSet( decorator.VALUE_COLUMN ) ) {
197 if ( advanced_table ) {
200 value_column = cla.getOptionValueAsInt( decorator.VALUE_COLUMN );
202 if ( cla.isOptionSet( decorator.CUT_NAME_AFTER_FIRST_SPACE_OPTION ) ) {
203 if ( advanced_table ) {
206 cut_name_after_space = true;
208 if ( cla.isOptionSet( decorator.TRIM_AFTER_TILDE_OPTION ) ) {
209 if ( advanced_table ) {
212 trim_after_tilde = true;
214 if ( cla.isOptionSet( decorator.MIDPOINT_ROOT_OPTION ) ) {
215 midpoint_root = true;
// NOTE(review): the statements executed for the next two options are not
// visible here; presumably they set order_tree and verbose -- confirm.
217 if ( cla.isOptionSet( decorator.ORDER_TREE_OPTION ) ) {
220 if ( cla.isOptionSet( decorator.VERBOSE_OPTION ) ) {
// Translate the value of the -f option into the FIELD to decorate.
223 if ( cla.isOptionSet( decorator.FIELD_OPTION ) ) {
224 field_str = cla.getOptionValue( decorator.FIELD_OPTION );
225 if ( field_str.equals( NODE_NAME_FIELD ) ) {
226 field = FIELD.NODE_NAME;
228 else if ( field_str.equals( SEQUENCE_ANNOTATION_DESC ) ) {
229 field = FIELD.SEQUENCE_ANNOTATION_DESC;
231 else if ( field_str.equals( DS_FILED ) ) {
232 field = FIELD.DOMAIN_STRUCTURE;
// Bracket extraction is switched off for this field.
233 extract_bracketed_scientific_name = false;
234 extract_bracketed_tax_code = false;
236 else if ( field_str.equals( TAXONOMY_CODE_FIELD ) ) {
237 field = FIELD.TAXONOMY_CODE;
239 else if ( field_str.equals( SEQUENCE_NAME_FIELD ) ) {
240 field = FIELD.SEQUENCE_NAME;
242 else if ( field_str.equals( MOL_SEQ ) ) {
243 field = FIELD.MOL_SEQ;
245 else if ( field_str.equals( TAXONOMY_SCIENTIFIC_NAME_FIELD ) ) {
246 field = FIELD.TAXONOMY_SCIENTIFIC_NAME;
// Bracket extraction is switched off for this field as well.
247 extract_bracketed_scientific_name = false;
248 extract_bracketed_tax_code = false;
// Reports a value of the -f option that matched none of the cases above.
251 ForesterUtil.fatalError( decorator.PRG_NAME, "unknown value for \"" + decorator.FIELD_OPTION
252 + "\" option: \"" + field_str + "\"" );
256 catch ( final Exception e ) {
257 ForesterUtil.fatalError( decorator.PRG_NAME, "error in command line: " + e.getMessage() );
// NOTE(review): the statement executed when both extraction flags are set
// is not visible here.
259 if ( extract_bracketed_scientific_name && extract_bracketed_tax_code ) {
262 ForesterUtil.programMessage( PRG_NAME, "input tree(s) : " + phylogenies_infile );
263 ForesterUtil.programMessage( PRG_NAME, "map : " + mapping_infile );
264 ForesterUtil.programMessage( PRG_NAME, "output tree(s): " + phylogenies_outfile );
265 System.out.println();
// Read all phylogenies from the input file, using the parser returned by
// ParserUtils.createParserDependingOnFileType.
266 Phylogeny[] phylogenies = null;
268 final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
269 final PhylogenyParser pp = ParserUtils.createParserDependingOnFileType( phylogenies_infile, true );
270 phylogenies = factory.create( phylogenies_infile, pp );
272 catch ( final Exception e ) {
273 ForesterUtil.fatalError( decorator.PRG_NAME, "failed to read phylgenies from [" + phylogenies_infile
274 + "] [" + e.getMessage() + "]" );
// Without -table a one to one map is built, either from two columns of the
// mapping table or from a FASTA file (readFastaFileIntoMap).
// NOTE(review): the FASTA call presumably belongs to the case in which
// field is FIELD.MOL_SEQ; the else keyword is not visible here -- confirm.
276 Map<String, String> map = null;
277 if ( !advanced_table ) {
278 if ( field != FIELD.MOL_SEQ ) {
279 BasicTable<String> mapping_table = null;
281 mapping_table = BasicTableParser.parse( mapping_infile, separator, true, false );
283 catch ( final Exception e ) {
284 ForesterUtil.fatalError( decorator.PRG_NAME,
285 "failed to read [" + mapping_infile + "] [" + e.getMessage() + "]" );
// NOTE(review): the column range checks run before the empty table check,
// so a table without columns is likely reported as an illegal column value
// rather than as an empty table -- consider reordering.
287 if ( ( key_column < 0 ) || ( key_column >= mapping_table.getNumberOfColumns() ) ) {
288 ForesterUtil.fatalError( decorator.PRG_NAME, "illegal value for key column" );
290 if ( ( value_column < 0 ) || ( value_column >= mapping_table.getNumberOfColumns() ) ) {
291 ForesterUtil.fatalError( decorator.PRG_NAME, "illegal value for value column" );
293 if ( mapping_table.isEmpty() || ( mapping_table.getNumberOfColumns() < 1 ) ) {
294 ForesterUtil.fatalError( decorator.PRG_NAME, "mapping table is empty" );
296 if ( mapping_table.getNumberOfColumns() == 1 ) {
297 ForesterUtil.fatalError( decorator.PRG_NAME, "mapping table has only one column" );
299 map = mapping_table.getColumnsAsMap( key_column, value_column );
// Every entry must have a non-empty key and a non-empty value; entries are
// also printed to standard output.
300 final Iterator<Entry<String, String>> iter = map.entrySet().iterator();
302 System.out.println();
304 while ( iter.hasNext() ) {
305 final Entry<String, String> e = iter.next();
306 if ( ForesterUtil.isEmpty( e.getKey() ) ) {
307 ForesterUtil.fatalError( decorator.PRG_NAME, "mapping table contains empty key" );
309 if ( ForesterUtil.isEmpty( e.getValue() ) ) {
310 ForesterUtil.fatalError( decorator.PRG_NAME, "mapping table contains empty value" );
313 System.out.println( e.getKey() + " => " + e.getValue() );
317 System.out.println();
321 map = readFastaFileIntoMap( mapping_infile, verbose );
// Apply the optional name, identifier and description.
324 if ( !ForesterUtil.isEmpty( tree_name ) || !ForesterUtil.isEmpty( tree_id )
325 || !ForesterUtil.isEmpty( tree_desc ) ) {
// A name or an identifier may only be set when there is a single phylogeny.
326 if ( ( phylogenies.length > 1 )
327 && ( !ForesterUtil.isEmpty( tree_name ) || !ForesterUtil.isEmpty( tree_id ) ) ) {
328 ForesterUtil.fatalError( decorator.PRG_NAME,
329 "attempt to set same name or id on more than one phylogeny" );
331 if ( !ForesterUtil.isEmpty( tree_name ) ) {
332 phylogenies[ 0 ].setName( tree_name );
// The identifier is given as provider:value (see the usage text); the part
// after the colon is passed to Identifier first, the part before it second.
// NOTE(review): there is no check that tree_id contains a colon followed by
// text; without one, s_ary[ 1 ] is out of bounds.
334 if ( !ForesterUtil.isEmpty( tree_id ) ) {
335 final String[] s_ary = tree_id.split( ":" );
336 phylogenies[ 0 ].setIdentifier( new Identifier( s_ary[ 1 ], s_ary[ 0 ] ) );
// The description, unlike name and identifier, is set on every phylogeny.
338 if ( !ForesterUtil.isEmpty( tree_desc ) ) {
339 for( final Phylogeny phylogenie : phylogenies ) {
340 phylogenie.setDescription( tree_desc );
// Decoration: with -table each phylogeny is decorated from the table parsed
// by PhylogenyDecorator.parseMappingTable; the second loop below calls the
// other decorate variant and prints the message it returns.
345 if ( advanced_table ) {
346 Map<String, Map<String, String>> table = null;
348 table = PhylogenyDecorator.parseMappingTable( mapping_infile );
350 catch ( final IOException e ) {
351 ForesterUtil.fatalError( decorator.PRG_NAME,
352 "failed to read \"" + mapping_infile + "\" [" + e.getMessage() + "]" );
354 for( final Phylogeny phylogenie : phylogenies ) {
355 PhylogenyDecorator.decorate( phylogenie, table, picky );
// NOTE(review): only part of the argument list of the following call is
// visible here.
359 for( final Phylogeny phylogenie : phylogenies ) {
360 final String msg = PhylogenyDecorator.decorate( phylogenie,
363 extract_bracketed_scientific_name,
364 extract_bracketed_tax_code,
366 cut_name_after_space,
369 ForesterUtil.programMessage( PRG_NAME, msg );
// A NullPointerException is reported as an unexpected error; any other
// exception is reported with its localized message.
373 catch ( final NullPointerException e ) {
374 ForesterUtil.unexpectedFatalError( decorator.PRG_NAME, e );
376 catch ( final Exception e ) {
377 ForesterUtil.fatalError( decorator.PRG_NAME, e.getLocalizedMessage() );
// Optional post-processing of every phylogeny: midpoint rooting and ordering.
379 if ( midpoint_root || order_tree ) {
380 for( final Phylogeny phy : phylogenies ) {
381 if ( midpoint_root ) {
382 PhylogenyMethods.midpointRoot( phy );
385 PhylogenyMethods.orderAppearance( phy.getRoot(), true, true, DESCENDANT_SORT_PRIORITY.TAXONOMY );
// Write all phylogenies to the output file as phyloXML.
390 final PhylogenyWriter w = new PhylogenyWriter();
391 w.toPhyloXML( phylogenies, 0, phylogenies_outfile, ForesterUtil.getLineSeparator() );
393 catch ( final IOException e ) {
394 ForesterUtil.fatalError( decorator.PRG_NAME, "failed to write output [" + e.getMessage() + "]" );
396 System.out.println();
397 ForesterUtil.programMessage( PRG_NAME, "wrote: " + phylogenies_outfile );
398 ForesterUtil.programMessage( PRG_NAME, "OK." );
/**
 * Parses the given FASTA file and collects its sequences in a map from
 * sequence identifier to molecular sequence string.
 *
 * A fatal error is reported through ForesterUtil.fatalError when the file
 * cannot be read, when it yields no sequences, or when a sequence has an
 * empty identifier, an identifier that was already seen, or a length
 * smaller than 1.
 *
 * @param mapping_infile the FASTA file to read
 * @param verbose NOTE(review): not referenced in the visible lines;
 *        presumably it controls the printing of each entry -- confirm
 * @return NOTE(review): the return statement is not visible in this
 *         listing; presumably the map built below -- confirm
 */
401 private static Map<String, String> readFastaFileIntoMap( final File mapping_infile, final boolean verbose ) {
402 List<MolecularSequence> seqs = null;
// NOTE(review): the FileInputStream opened here is not closed in the
// visible code; unless FastaParser.parse closes it, the stream leaks --
// consider try-with-resources.
404 seqs = FastaParser.parse( new FileInputStream( mapping_infile ) );
406 catch ( final IOException e ) {
407 ForesterUtil.fatalError( decorator.PRG_NAME, "failed to read fasta-file from [" + mapping_infile + "] ["
408 + e.getMessage() + "]" );
410 if ( ForesterUtil.isEmpty( seqs ) ) {
411 ForesterUtil.fatalError( decorator.PRG_NAME, "fasta-file [" + mapping_infile
412 + "] is devoid of fasta-formatted sequences" );
414 final Map<String, String> map = new HashMap<String, String>();
// Validate each sequence before it is added: identifier present,
// identifier not yet in the map, sequence not empty.
415 for( final MolecularSequence seq : seqs ) {
416 if ( ForesterUtil.isEmpty( seq.getIdentifier() ) ) {
417 ForesterUtil.fatalError( decorator.PRG_NAME, "fasta-file [" + mapping_infile
418 + "] contains sequence with empty identifier" );
420 if ( map.containsKey( seq.getIdentifier() ) ) {
421 ForesterUtil.fatalError( decorator.PRG_NAME, "sequence identifier [" + seq.getIdentifier()
422 + "] is not unique" );
424 if ( seq.getLength() < 1 ) {
425 ForesterUtil.fatalError( decorator.PRG_NAME, "sequence [" + seq.getIdentifier() + "] is empty" );
427 map.put( seq.getIdentifier(), seq.getMolecularSequenceAsString() );
429 System.out.println( seq.getIdentifier() + " => " + seq.getMolecularSequenceAsString() );
/**
 * Prints the command line usage of this program to standard output: the
 * general call syntax, the options that are always available, and the
 * options that are only available when the table option is not used.
 *
 * NOTE(review): the end of this method is not visible in this listing.
 * The callers in main() continue with code that presumably must not run
 * after a usage error, so this method presumably does not return
 * normally -- confirm.
 */
435 private static void argumentsError() {
436 System.out.println();
// General call syntax.
437 System.out.println( decorator.PRG_NAME + " -" + ADVANCED_TABLE_OPTION + " | -f=<c> <phylogenies infile> "
438 + "<mapping table file|fasta-file> <phylogenies outfile>" );
439 System.out.println();
440 System.out.println( "options:" );
441 System.out.println();
442 System.out.println( " -" + ADVANCED_TABLE_OPTION + " : table instead of one to one map (-f=<c>)" );
443 System.out.println( " -p : picky, fails if node name not found in mapping table" );
444 System.out.println( " -" + TREE_NAME_OPTION + "=<s>: name for the phylogeny" );
445 System.out.println( " -" + TREE_ID_OPTION + "=<s>: identifier for the phylogeny (in the form provider:value)" );
446 System.out.println( " -" + TREE_DESC_OPTION + "=<s>: description for phylogenies" );
447 System.out.println();
448 System.out.println();
449 System.out.println( "advanced options, only available if -" + ADVANCED_TABLE_OPTION + " is not used:" );
450 System.out.println();
// One line per value accepted by the -f option.
451 System.out.println( " -f=<c> : field to be replaced: " + NODE_NAME_FIELD + " : node name" );
452 System.out.println( " " + SEQUENCE_ANNOTATION_DESC
453 + " : sequence annotation description" );
454 System.out.println( " " + DS_FILED + " : domain structure" );
455 System.out.println( " " + TAXONOMY_CODE_FIELD + " : taxonomy code" );
456 System.out.println( " " + TAXONOMY_SCIENTIFIC_NAME_FIELD
457 + ": taxonomy scientific name" );
458 System.out.println( " " + SEQUENCE_NAME_FIELD + " : sequence name" );
459 System.out.println( " " + MOL_SEQ + " : molecular sequence" );
// Some option names below are written literally (-k, -v, -s, -c, -p above)
// instead of being built from the corresponding constants.
460 System.out.println( " -k=<n> : key column in mapping table (0 based)," );
461 System.out.println( " names of the node to be decorated - default is 0" );
462 System.out.println( " -v=<n> : value column in mapping table (0 based)," );
463 System.out.println( " data which with to decorate - default is 1" );
464 System.out.println( " -" + EXTRACT_BRACKETED_SCIENTIC_NAME_OPTION
465 + " : to extract bracketed scientific names, e.g. [Nematostella vectensis]" );
466 System.out.println( " -" + EXTRACT_BRACKETED_TAXONOMIC_CODE_OPTION
467 + " : to extract bracketed taxonomic codes, e.g. [NEMVE]" );
468 System.out.println( " -s=<c> : column separator in mapping file, default is tab" );
469 System.out.println( " -c : cut name after first space (only for -f=n)" );
470 System.out.println( " -" + decorator.TRIM_AFTER_TILDE_OPTION
471 + " : trim node name to be replaced after tilde" );
472 System.out.println( " -" + decorator.MIDPOINT_ROOT_OPTION + " : to midpoint-root the tree" );
473 System.out.println( " -" + decorator.ORDER_TREE_OPTION + " : to order tree branches" );
474 System.out.println( " -" + decorator.VERBOSE_OPTION + " : verbose" );
475 System.out.println();