inprogress
[jalview.git] / forester / java / src / org / forester / application / decorator.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // All rights reserved
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 //
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
25
26 package org.forester.application;
27
28 import java.io.File;
29 import java.io.IOException;
30 import java.util.ArrayList;
31 import java.util.Iterator;
32 import java.util.List;
33 import java.util.Map;
34 import java.util.Map.Entry;
35
36 import org.forester.io.parsers.PhylogenyParser;
37 import org.forester.io.parsers.util.ParserUtils;
38 import org.forester.io.writers.PhylogenyWriter;
39 import org.forester.phylogeny.Phylogeny;
40 import org.forester.phylogeny.PhylogenyMethods;
41 import org.forester.phylogeny.PhylogenyMethods.DESCENDANT_SORT_PRIORITY;
42 import org.forester.phylogeny.data.Identifier;
43 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
44 import org.forester.phylogeny.factories.PhylogenyFactory;
45 import org.forester.tools.PhylogenyDecorator;
46 import org.forester.tools.PhylogenyDecorator.FIELD;
47 import org.forester.util.BasicTable;
48 import org.forester.util.BasicTableParser;
49 import org.forester.util.CommandLineArguments;
50 import org.forester.util.ForesterUtil;
51
52 public final class decorator {
53
54     private static final String SEQUENCE_NAME_FIELD                     = "s";
55     private static final String TAXONOMY_CODE_FIELD                     = "c";
56     private static final String TAXONOMY_SCIENTIFIC_NAME_FIELD          = "sn";
57     private static final String DS_FILED                                = "d";
58     private static final String SEQUENCE_ANNOTATION_DESC                = "a";
59     private static final String NODE_NAME_FIELD                         = "n";
60     final static private String PICKY_OPTION                            = "p";
61     final static private String FIELD_OPTION                            = "f";
62     final static private String TRIM_AFTER_TILDE_OPTION                 = "t";
63     final static private String TREE_NAME_OPTION                        = "pn";
64     final static private String TREE_ID_OPTION                          = "pi";
65     final static private String TREE_DESC_OPTION                        = "pd";
66     final static private String MIDPOINT_ROOT_OPTION                    = "mp";
67     final static private String ORDER_TREE_OPTION                       = "or";
68     final static private String EXTRACT_BRACKETED_SCIENTIC_NAME_OPTION  = "sn";
69     final static private String EXTRACT_BRACKETED_TAXONOMIC_CODE_OPTION = "tc";
70     final static private String PROCESS_NAME_INTELLIGENTLY_OPTION       = "x";
71     final static private String PROCESS_SIMILAR_TO_OPTION               = "xs";
72     final static private String CUT_NAME_AFTER_FIRST_SPACE_OPTION       = "c";
73     final static private String ALLOW_REMOVAL_OF_CHARS_OPTION           = "r";
74     final static private String ADVANCED_TABLE_OPTION                   = "table";
75     final static private String KEY_COLUMN                              = "k";
76     final static private String VALUE_COLUMN                            = "v";
77     final static private String MAPPING_FILE_SEPARATOR_OPTION           = "s";
78     final static private char   MAPPING_FILE_SEPARATOR_DEFAULT          = '\t';
79     final static private String PRG_NAME                                = "decorator";
80     final static private String PRG_VERSION                             = "1.14";
81     final static private String PRG_DATE                                = "130426";
82
83     public static void main( final String args[] ) {
84         ForesterUtil.printProgramInformation( decorator.PRG_NAME, decorator.PRG_VERSION, decorator.PRG_DATE );
85         if ( ( args.length < 4 ) || ( args.length > 12 ) ) {
86             decorator.argumentsError();
87         }
88         CommandLineArguments cla = null;
89         try {
90             cla = new CommandLineArguments( args );
91         }
92         catch ( final Exception e ) {
93             ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
94         }
95         if ( ( cla.getNumberOfNames() < 3 ) || ( cla.getNumberOfNames() > 4 ) ) {
96             decorator.argumentsError();
97         }
98         final File phylogenies_infile = cla.getFile( 0 );
99         final File mapping_infile = cla.getFile( 1 );
100         final File phylogenies_outfile = cla.getFile( 2 );
101         if ( phylogenies_outfile.exists() ) {
102             ForesterUtil.fatalError( PRG_NAME, "[" + phylogenies_outfile + "] already exists" );
103         }
104         final List<String> allowed_options = new ArrayList<String>();
105         allowed_options.add( decorator.ADVANCED_TABLE_OPTION );
106         allowed_options.add( decorator.PICKY_OPTION );
107         allowed_options.add( decorator.FIELD_OPTION );
108         allowed_options.add( decorator.PROCESS_NAME_INTELLIGENTLY_OPTION );
109         allowed_options.add( decorator.PROCESS_SIMILAR_TO_OPTION );
110         allowed_options.add( decorator.CUT_NAME_AFTER_FIRST_SPACE_OPTION );
111         allowed_options.add( decorator.ALLOW_REMOVAL_OF_CHARS_OPTION );
112         allowed_options.add( decorator.KEY_COLUMN );
113         allowed_options.add( decorator.VALUE_COLUMN );
114         allowed_options.add( decorator.MAPPING_FILE_SEPARATOR_OPTION );
115         allowed_options.add( decorator.EXTRACT_BRACKETED_SCIENTIC_NAME_OPTION );
116         allowed_options.add( decorator.EXTRACT_BRACKETED_TAXONOMIC_CODE_OPTION );
117         allowed_options.add( decorator.TREE_NAME_OPTION );
118         allowed_options.add( decorator.TREE_ID_OPTION );
119         allowed_options.add( decorator.TREE_DESC_OPTION );
120         allowed_options.add( decorator.TRIM_AFTER_TILDE_OPTION );
121         allowed_options.add( decorator.ORDER_TREE_OPTION );
122         allowed_options.add( decorator.MIDPOINT_ROOT_OPTION );
123         final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
124         if ( dissallowed_options.length() > 0 ) {
125             ForesterUtil.fatalError( decorator.PRG_NAME, "unknown option(s): " + dissallowed_options );
126         }
127         final boolean advanced_table = cla.isOptionSet( decorator.ADVANCED_TABLE_OPTION );
128         if ( !advanced_table ) {
129             final List<String> mandatory_options = new ArrayList<String>();
130             mandatory_options.add( decorator.FIELD_OPTION );
131             final String missing_options = cla.validateMandatoryOptionsAsString( mandatory_options );
132             if ( missing_options.length() > 0 ) {
133                 ForesterUtil.fatalError( decorator.PRG_NAME, "missing option(s): " + missing_options );
134             }
135         }
136         final boolean picky = cla.isOptionSet( decorator.PICKY_OPTION );
137         char separator = decorator.MAPPING_FILE_SEPARATOR_DEFAULT;
138         if ( cla.isOptionSet( decorator.MAPPING_FILE_SEPARATOR_OPTION ) ) {
139             if ( advanced_table ) {
140                 argumentsError();
141             }
142             separator = cla.getOptionValueAsChar( decorator.MAPPING_FILE_SEPARATOR_OPTION );
143         }
144         int key_column = 0;
145         int value_column = 1;
146         String field_str = "";
147         FIELD field = FIELD.NODE_NAME;
148         int numbers_of_chars_allowed_to_remove_if_not_found_in_map = -1;
149         boolean cut_name_after_space = false;
150         boolean process_name_intelligently = false;
151         boolean process_similar_to = false;
152         boolean extract_bracketed_scientific_name = false;
153         boolean extract_bracketed_tax_code = false;
154         boolean trim_after_tilde = false;
155         boolean order_tree = false;
156         boolean midpoint_root = false;
157         String tree_name = "";
158         String tree_id = "";
159         String tree_desc = "";
160         try {
161             if ( cla.isOptionSet( decorator.TREE_NAME_OPTION ) ) {
162                 tree_name = cla.getOptionValueAsCleanString( decorator.TREE_NAME_OPTION );
163             }
164             if ( cla.isOptionSet( decorator.TREE_ID_OPTION ) ) {
165                 tree_id = cla.getOptionValueAsCleanString( decorator.TREE_ID_OPTION );
166             }
167             if ( cla.isOptionSet( decorator.TREE_DESC_OPTION ) ) {
168                 tree_desc = cla.getOptionValueAsCleanString( decorator.TREE_DESC_OPTION );
169             }
170             if ( cla.isOptionSet( decorator.EXTRACT_BRACKETED_SCIENTIC_NAME_OPTION ) ) {
171                 if ( advanced_table ) {
172                     argumentsError();
173                 }
174                 extract_bracketed_scientific_name = true;
175             }
176             if ( cla.isOptionSet( decorator.EXTRACT_BRACKETED_TAXONOMIC_CODE_OPTION ) ) {
177                 if ( advanced_table ) {
178                     argumentsError();
179                 }
180                 extract_bracketed_tax_code = true;
181             }
182             if ( cla.isOptionSet( decorator.KEY_COLUMN ) ) {
183                 if ( advanced_table ) {
184                     argumentsError();
185                 }
186                 key_column = cla.getOptionValueAsInt( decorator.KEY_COLUMN );
187             }
188             if ( cla.isOptionSet( decorator.VALUE_COLUMN ) ) {
189                 if ( advanced_table ) {
190                     argumentsError();
191                 }
192                 value_column = cla.getOptionValueAsInt( decorator.VALUE_COLUMN );
193             }
194             if ( cla.isOptionSet( decorator.CUT_NAME_AFTER_FIRST_SPACE_OPTION ) ) {
195                 if ( advanced_table ) {
196                     argumentsError();
197                 }
198                 cut_name_after_space = true;
199             }
200             if ( cla.isOptionSet( decorator.PROCESS_NAME_INTELLIGENTLY_OPTION ) ) {
201                 if ( advanced_table ) {
202                     argumentsError();
203                 }
204                 process_name_intelligently = true;
205             }
206             if ( cla.isOptionSet( decorator.PROCESS_SIMILAR_TO_OPTION ) ) {
207                 if ( advanced_table ) {
208                     argumentsError();
209                 }
210                 process_similar_to = true;
211             }
212             if ( cla.isOptionSet( decorator.TRIM_AFTER_TILDE_OPTION ) ) {
213                 if ( advanced_table ) {
214                     argumentsError();
215                 }
216                 trim_after_tilde = true;
217             }
218             if ( cla.isOptionSet( decorator.ALLOW_REMOVAL_OF_CHARS_OPTION ) ) {
219                 numbers_of_chars_allowed_to_remove_if_not_found_in_map = cla
220                         .getOptionValueAsInt( decorator.ALLOW_REMOVAL_OF_CHARS_OPTION );
221             }
222             if ( cla.isOptionSet( decorator.MIDPOINT_ROOT_OPTION ) ) {
223                 midpoint_root = true;
224             }
225             if ( cla.isOptionSet( decorator.ORDER_TREE_OPTION ) ) {
226                 order_tree = true;
227             }
228             if ( cla.isOptionSet( decorator.FIELD_OPTION ) ) {
229                 field_str = cla.getOptionValue( decorator.FIELD_OPTION );
230                 if ( field_str.equals( NODE_NAME_FIELD ) ) {
231                     field = FIELD.NODE_NAME;
232                 }
233                 else if ( field_str.equals( SEQUENCE_ANNOTATION_DESC ) ) {
234                     field = FIELD.SEQUENCE_ANNOTATION_DESC;
235                 }
236                 else if ( field_str.equals( DS_FILED ) ) {
237                     field = FIELD.DOMAIN_STRUCTURE;
238                     extract_bracketed_scientific_name = false;
239                     extract_bracketed_tax_code = false;
240                 }
241                 else if ( field_str.equals( TAXONOMY_CODE_FIELD ) ) {
242                     field = FIELD.TAXONOMY_CODE;
243                 }
244                 else if ( field_str.equals( SEQUENCE_NAME_FIELD ) ) {
245                     field = FIELD.SEQUENCE_NAME;
246                 }
247                 else if ( field_str.equals( TAXONOMY_SCIENTIFIC_NAME_FIELD ) ) {
248                     field = FIELD.TAXONOMY_SCIENTIFIC_NAME;
249                     extract_bracketed_scientific_name = false;
250                     extract_bracketed_tax_code = false;
251                 }
252                 else {
253                     ForesterUtil.fatalError( decorator.PRG_NAME, "unknown value for \"" + decorator.FIELD_OPTION
254                             + "\" option: \"" + field_str + "\"" );
255                 }
256             }
257         }
258         catch ( final Exception e ) {
259             ForesterUtil.fatalError( decorator.PRG_NAME, "error in command line: " + e.getMessage() );
260         }
261         if ( ( field != FIELD.NODE_NAME ) && ( cut_name_after_space || process_name_intelligently ) ) {
262             ForesterUtil.fatalError( decorator.PRG_NAME, "attempt to use -x or -c option without -f=n" );
263         }
264         if ( ( field != FIELD.NODE_NAME ) && process_similar_to ) {
265             ForesterUtil.fatalError( decorator.PRG_NAME, "attempt to use -" + decorator.PROCESS_SIMILAR_TO_OPTION
266                     + " option without -f=n" );
267         }
268         if ( cut_name_after_space && process_name_intelligently ) {
269             ForesterUtil.fatalError( decorator.PRG_NAME, "attempt to use -x and -c option together" );
270         }
271         if ( process_similar_to && process_name_intelligently ) {
272             ForesterUtil.fatalError( decorator.PRG_NAME, "attempt to use -" + decorator.PROCESS_SIMILAR_TO_OPTION
273                     + " and -x option together" );
274         }
275         if ( process_similar_to && cut_name_after_space ) {
276             ForesterUtil.fatalError( decorator.PRG_NAME, "attempt to use -" + decorator.PROCESS_SIMILAR_TO_OPTION
277                     + " and -c option together" );
278         }
279         if ( extract_bracketed_scientific_name && extract_bracketed_tax_code ) {
280             argumentsError();
281         }
282         Phylogeny[] phylogenies = null;
283         try {
284             final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
285             final PhylogenyParser pp = ParserUtils.createParserDependingOnFileType( phylogenies_infile, true );
286             phylogenies = factory.create( phylogenies_infile, pp );
287         }
288         catch ( final Exception e ) {
289             ForesterUtil.fatalError( decorator.PRG_NAME, "failed to read phylgenies from [" + phylogenies_infile
290                     + "] [" + e.getMessage() + "]" );
291         }
292         Map<String, String> map = null;
293         if ( !advanced_table ) {
294             BasicTable<String> mapping_table = null;
295             try {
296                 mapping_table = BasicTableParser.parse( mapping_infile, separator, true, false );
297             }
298             catch ( final Exception e ) {
299                 ForesterUtil.fatalError( decorator.PRG_NAME,
300                                          "failed to read [" + mapping_infile + "] [" + e.getMessage() + "]" );
301             }
302             if ( ( key_column < 0 ) || ( key_column >= mapping_table.getNumberOfColumns() ) ) {
303                 ForesterUtil.fatalError( decorator.PRG_NAME, "illegal value for key column" );
304             }
305             if ( ( value_column < 0 ) || ( value_column >= mapping_table.getNumberOfColumns() ) ) {
306                 ForesterUtil.fatalError( decorator.PRG_NAME, "illegal value for value column" );
307             }
308             if ( mapping_table.isEmpty() || ( mapping_table.getNumberOfColumns() < 1 ) ) {
309                 ForesterUtil.fatalError( decorator.PRG_NAME, "mapping table is empty" );
310             }
311             if ( mapping_table.getNumberOfColumns() == 1 ) {
312                 ForesterUtil.fatalError( decorator.PRG_NAME, "mapping table has only one column" );
313             }
314             map = mapping_table.getColumnsAsMap( key_column, value_column );
315             final Iterator<Entry<String, String>> iter = map.entrySet().iterator();
316             System.out.println();
317             while ( iter.hasNext() ) {
318                 final Entry<String, String> e = iter.next();
319                 System.out.println( e.getKey() + " => " + e.getValue() );
320             }
321             System.out.println();
322         }
323         if ( !ForesterUtil.isEmpty( tree_name ) || !ForesterUtil.isEmpty( tree_id )
324                 || !ForesterUtil.isEmpty( tree_desc ) ) {
325             if ( ( phylogenies.length > 1 )
326                     && ( !ForesterUtil.isEmpty( tree_name ) || !ForesterUtil.isEmpty( tree_id ) ) ) {
327                 ForesterUtil.fatalError( decorator.PRG_NAME,
328                                          "attempt to set same name or id on more than one phylogeny" );
329             }
330             if ( !ForesterUtil.isEmpty( tree_name ) ) {
331                 phylogenies[ 0 ].setName( tree_name );
332             }
333             if ( !ForesterUtil.isEmpty( tree_id ) ) {
334                 final String[] s_ary = tree_id.split( ":" );
335                 phylogenies[ 0 ].setIdentifier( new Identifier( s_ary[ 1 ], s_ary[ 0 ] ) );
336             }
337             if ( !ForesterUtil.isEmpty( tree_desc ) ) {
338                 for( final Phylogeny phylogenie : phylogenies ) {
339                     phylogenie.setDescription( tree_desc );
340                 }
341             }
342         }
343         try {
344             if ( advanced_table ) {
345                 Map<String, Map<String, String>> table = null;
346                 try {
347                     table = PhylogenyDecorator.parseMappingTable( mapping_infile );
348                 }
349                 catch ( final IOException e ) {
350                     ForesterUtil.fatalError( decorator.PRG_NAME,
351                                              "failed to read \"" + mapping_infile + "\" [" + e.getMessage() + "]" );
352                 }
353                 PhylogenyDecorator.decorate( phylogenies,
354                                              table,
355                                              picky,
356                                              numbers_of_chars_allowed_to_remove_if_not_found_in_map );
357             }
358             else {
359                 PhylogenyDecorator.decorate( phylogenies,
360                                              map,
361                                              field,
362                                              extract_bracketed_scientific_name,
363                                              extract_bracketed_tax_code,
364                                              picky,
365                                              cut_name_after_space,
366                                              process_name_intelligently,
367                                              process_similar_to,
368                                              numbers_of_chars_allowed_to_remove_if_not_found_in_map,
369                                              trim_after_tilde );
370             }
371         }
372         catch ( final NullPointerException e ) {
373             ForesterUtil.unexpectedFatalError( decorator.PRG_NAME, e );
374         }
375         catch ( final Exception e ) {
376             ForesterUtil.fatalError( decorator.PRG_NAME, e.getLocalizedMessage() );
377         }
378         if ( midpoint_root || order_tree ) {
379             for( final Phylogeny phy : phylogenies ) {
380                 if ( midpoint_root ) {
381                     PhylogenyMethods.midpointRoot( phy );
382                 }
383                 if ( order_tree ) {
384                     PhylogenyMethods.orderAppearance( phy.getRoot(), true, true, DESCENDANT_SORT_PRIORITY.TAXONOMY );
385                 }
386             }
387         }
388         try {
389             final PhylogenyWriter w = new PhylogenyWriter();
390             w.toPhyloXML( phylogenies, 0, phylogenies_outfile, ForesterUtil.getLineSeparator() );
391         }
392         catch ( final IOException e ) {
393             ForesterUtil.fatalError( decorator.PRG_NAME, "failed to write output [" + e.getMessage() + "]" );
394         }
395         System.out.println();
396         ForesterUtil.programMessage( PRG_NAME, "wrote: " + phylogenies_outfile );
397         ForesterUtil.programMessage( PRG_NAME, "OK." );
398     }
399
400     private static void argumentsError() {
401         System.out.println();
402         System.out.println( decorator.PRG_NAME + " -" + ADVANCED_TABLE_OPTION + " | -f=<c> <phylogenies infile> "
403                 + "[mapping table file] <phylogenies outfile>" );
404         System.out.println();
405         System.out.println( "options:" );
406         System.out.println();
407         System.out.println( " -" + ADVANCED_TABLE_OPTION + " : table instead of one to one map (-f=<c>)" );
408         System.out.println( " -r=<n> : allow to remove up to n characters from the end of the names" );
409         System.out.println( "          in phylogenies infile if not found (in map) otherwise" );
410         System.out.println( " -p     : picky, fails if node name not found in mapping table" );
411         System.out.println( " -" + TREE_NAME_OPTION + "=<s>: name for the phylogeny" );
412         System.out.println( " -" + TREE_ID_OPTION + "=<s>: identifier for the phylogeny (in the form provider:value)" );
413         System.out.println( " -" + TREE_DESC_OPTION + "=<s>: description for phylogenies" );
414         System.out.println();
415         System.out.println();
416         System.out.println( "advanced options, only available if -" + ADVANCED_TABLE_OPTION + " is not used:" );
417         System.out.println();
418         System.out.println( " -f=<c> : field to be replaced: " + NODE_NAME_FIELD + " : node name" );
419         System.out.println( "                                " + SEQUENCE_ANNOTATION_DESC
420                 + " : sequence annotation description" );
421         System.out.println( "                                " + DS_FILED + " : domain structure" );
422         System.out.println( "                                " + TAXONOMY_CODE_FIELD + " : taxonomy code" );
423         System.out.println( "                                " + TAXONOMY_SCIENTIFIC_NAME_FIELD
424                 + ": taxonomy scientific name" );
425         System.out.println( "                                " + SEQUENCE_NAME_FIELD + " : sequence name" );
426         System.out.println( " -k=<n> : key column in mapping table (0 based)," );
427         System.out.println( "          names of the node to be decorated - default is 0" );
428         System.out.println( " -v=<n> : value column in mapping table (0 based)," );
429         System.out.println( "          data which with to decorate - default is 1" );
430         System.out.println( " -" + EXTRACT_BRACKETED_SCIENTIC_NAME_OPTION
431                 + "    : to extract bracketed scientific names, e.g. [Nematostella vectensis]" );
432         System.out.println( " -" + EXTRACT_BRACKETED_TAXONOMIC_CODE_OPTION
433                 + "    : to extract bracketed taxonomic codes, e.g. [NEMVE]" );
434         System.out.println( " -s=<c> : column separator in mapping file, default is \""
435                 + decorator.MAPPING_FILE_SEPARATOR_DEFAULT + "\"" );
436         System.out.println( " -x     : process name \"intelligently\" (only for -f=n)" );
437         System.out.println( " -" + decorator.PROCESS_SIMILAR_TO_OPTION
438                 + "    : process name \"intelligently\" and process information after \"similar to\" (only for -f=n)" );
439         System.out.println( " -c     : cut name after first space (only for -f=n)" );
440         System.out.println( " -" + decorator.TRIM_AFTER_TILDE_OPTION
441                 + "     : trim node name to be replaced after tilde" );
442         System.out.println( " -" + decorator.MIDPOINT_ROOT_OPTION + "     : to midpoint-root the tree" );
443         System.out.println( " -" + decorator.ORDER_TREE_OPTION + "     : to order tree branches" );
444         System.out.println();
445         System.exit( -1 );
446     }
447 }