in progress...
[jalview.git] / forester / java / src / org / forester / application / serin.java
1
2 package org.forester.application;
3
4 import java.io.BufferedWriter;
5 import java.io.File;
6 import java.io.FileInputStream;
7 import java.io.IOException;
8 import java.io.Writer;
9 import java.util.ArrayList;
10 import java.util.List;
11
12 import org.forester.io.parsers.FastaParser;
13 import org.forester.io.parsers.GeneralMsaParser;
14 import org.forester.io.writers.SequenceWriter;
15 import org.forester.io.writers.SequenceWriter.SEQ_FORMAT;
16 import org.forester.msa.BasicMsa;
17 import org.forester.msa.Msa;
18 import org.forester.msa.Msa.MSA_FORMAT;
19 import org.forester.sequence.BasicSequence;
20 import org.forester.sequence.MolecularSequence;
21 import org.forester.util.BasicDescriptiveStatistics;
22 import org.forester.util.CommandLineArguments;
23 import org.forester.util.ForesterConstants;
24 import org.forester.util.ForesterUtil;
25
26 public class serin {
27
28     final static private String PRG_NAME               = "serin";
29     final static private String PRG_DATE               = "170830";
30     final static private String PRG_DESC               = "sequence file reformatting and identifier normalization";
31     final static private String PRG_VERSION            = "1.00";
32     final static private String WWW                    = "https://sites.google.com/site/cmzmasek/home/software/forester";
33     final static private String E_MAIL                 = "phyloxml@gmail.com";
34     final static private String OUTPUT_FORMAT_OPTION   = "o";
35     final static private String ID_NORM_OPTION         = "i";
36     final static private String HELP_OPTION_1          = "help";
37     final static private String HELP_OPTION_2          = "h";
38     private static final String OUTPUT_FORMAT_FASTA    = "f";
39     private static final String OUTPUT_FORMAT_PHYLIP   = "p";
40     private static final String OUTPUT_FORMAT_NEXUS    = "n";
41     private static final String OUTPUT_FORMAT_FASTA_L  = "fasta";
42     private static final String OUTPUT_FORMAT_PHYLIP_L = "phylip";
43     private static final String OUTPUT_FORMAT_NEXUS_L  = "nexus";
44
45     public static void main( final String args[] ) {
46         try {
47             ForesterUtil.printProgramInformation( PRG_NAME,
48                                                   PRG_DESC,
49                                                   PRG_VERSION,
50                                                   PRG_DATE,
51                                                   E_MAIL,
52                                                   WWW,
53                                                   ForesterUtil.getForesterLibraryInformation() );
54             CommandLineArguments cla = null;
55             try {
56                 cla = new CommandLineArguments( args );
57             }
58             catch ( final Exception e ) {
59                 ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
60             }
61             if ( ( cla.getNumberOfNames() == 0 ) || cla.isOptionSet( HELP_OPTION_1 )
62                     || cla.isOptionSet( HELP_OPTION_2 ) ) {
63                 System.out.println();
64                 print_help();
65                 System.exit( 0 );
66             }
67             String input_seqs_file_str = null;
68             String output_seqs_file_str = null;
69             String output_map_file_str = null;
70             String input_seqs_name_wo_suffix = null;
71             if ( ( cla.getNumberOfNames() == 2 ) || ( cla.getNumberOfNames() == 3 ) ) {
72                 input_seqs_file_str = cla.getName( 0 );
73                 output_seqs_file_str = cla.getName( 1 );
74                 if ( cla.getNumberOfNames() == 3 ) {
75                     output_map_file_str = cla.getName( 2 );
76                 }
77             }
78             else if ( cla.getNumberOfNames() == 1 ) {
79                 input_seqs_file_str = cla.getName( 0 );
80                 input_seqs_name_wo_suffix = null;
81                 if ( input_seqs_file_str.toLowerCase().endsWith( ".fasta" ) ) {
82                     input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 6 );
83                 }
84                 else if ( input_seqs_file_str.toLowerCase().endsWith( ".fsa" ) ) {
85                     input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 4 );
86                 }
87                 else if ( input_seqs_file_str.toLowerCase().endsWith( ".phy" ) ) {
88                     input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 4 );
89                 }
90                 else if ( input_seqs_file_str.toLowerCase().endsWith( ".aln" ) ) {
91                     input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 4 );
92                 }
93                 else if ( input_seqs_file_str.toLowerCase().endsWith( ".phylip" ) ) {
94                     input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 7 );
95                 }
96                 else if ( input_seqs_file_str.toLowerCase().endsWith( ".nex" ) ) {
97                     input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 4 );
98                 }
99                 else if ( input_seqs_file_str.toLowerCase().endsWith( ".nexus" ) ) {
100                     input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 5 );
101                 }
102                 else {
103                     input_seqs_name_wo_suffix = input_seqs_file_str;
104                 }
105                 output_map_file_str = input_seqs_name_wo_suffix + ForesterConstants.ID_MAP_FILE_SUFFIX;
106             }
107             else {
108                 print_help();
109                 System.exit( -1 );
110             }
111             final List<String> allowed_options = new ArrayList<>();
112             allowed_options.add( OUTPUT_FORMAT_OPTION );
113             allowed_options.add( ID_NORM_OPTION );
114             final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
115             if ( dissallowed_options.length() > 0 ) {
116                 ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options );
117             }
118             final File input_seqs_file = new File( input_seqs_file_str );
119             final String error0 = ForesterUtil.isReadableFile( input_seqs_file );
120             if ( !ForesterUtil.isEmpty( error0 ) ) {
121                 ForesterUtil.fatalError( PRG_NAME, error0 );
122             }
123             final boolean input_seqs_fasta_like = ForesterUtil.isLooksLikeFasta( input_seqs_file );
124             Msa.MSA_FORMAT output_format = MSA_FORMAT.FASTA;
125             if ( cla.isOptionSet( OUTPUT_FORMAT_OPTION ) ) {
126                 if ( cla.isOptionValueSet( OUTPUT_FORMAT_OPTION ) ) {
127                     final String output_format_str = cla.getOptionValue( OUTPUT_FORMAT_OPTION );
128                     if ( output_format_str.equals( OUTPUT_FORMAT_FASTA )
129                             || output_format_str.equalsIgnoreCase( OUTPUT_FORMAT_FASTA_L ) ) {
130                         output_format = MSA_FORMAT.FASTA;
131                     }
132                     else if ( output_format_str.equals( OUTPUT_FORMAT_PHYLIP )
133                             || output_format_str.equalsIgnoreCase( OUTPUT_FORMAT_PHYLIP_L ) ) {
134                         output_format = MSA_FORMAT.PHYLIP;
135                     }
136                     else if ( output_format_str.equals( OUTPUT_FORMAT_NEXUS )
137                             || output_format_str.equalsIgnoreCase( OUTPUT_FORMAT_NEXUS_L ) ) {
138                         output_format = MSA_FORMAT.NEXUS;
139                     }
140                     else {
141                         ForesterUtil.fatalError( PRG_NAME, "unknown format option: " + output_format_str );
142                     }
143                 }
144             }
145             final boolean normalize_identifiers;
146             if ( cla.isOptionSet( ID_NORM_OPTION ) || ( cla.getNumberOfNames() == 3 ) ) {
147                 normalize_identifiers = true;
148             }
149             else {
150                 normalize_identifiers = false;
151             }
152             if ( normalize_identifiers && ForesterUtil.isEmpty( output_map_file_str ) ) {
153                 ForesterUtil.fatalError( PRG_NAME, "need to indicate name for output map file" );
154             }
155             final File output_map_file;
156             if ( normalize_identifiers ) {
157                 output_map_file = new File( output_map_file_str );
158                 final String error = ForesterUtil.isWritableFile( output_map_file );
159                 if ( !ForesterUtil.isEmpty( error ) ) {
160                     ForesterUtil.fatalError( PRG_NAME, error );
161                 }
162             }
163             else {
164                 output_map_file = null;
165             }
166             if ( cla.getNumberOfNames() == 1 ) {
167                 if ( normalize_identifiers ) {
168                     if ( output_format == MSA_FORMAT.FASTA ) {
169                         output_seqs_file_str = input_seqs_name_wo_suffix
170                                 + ForesterConstants.ID_NORMALIZED_FASTA_FILE_SUFFIX;
171                     }
172                     else if ( output_format == MSA_FORMAT.NEXUS ) {
173                         output_seqs_file_str = input_seqs_name_wo_suffix
174                                 + ForesterConstants.ID_NORMALIZED_NEXUS_FILE_SUFFIX;
175                     }
176                     else if ( output_format == MSA_FORMAT.PHYLIP ) {
177                         output_seqs_file_str = input_seqs_name_wo_suffix
178                                 + ForesterConstants.ID_NORMALIZED_PHYLIP_FILE_SUFFIX;
179                     }
180                 }
181                 else {
182                     if ( output_format == MSA_FORMAT.FASTA ) {
183                         output_seqs_file_str = input_seqs_name_wo_suffix + ForesterConstants.FASTA_FILE_SUFFIX;
184                         if ( ForesterUtil.isWritableFile( output_seqs_file_str ) != null ) {
185                             output_seqs_file_str = input_seqs_name_wo_suffix + "_"
186                                     + ForesterConstants.FASTA_FILE_SUFFIX;
187                         }
188                     }
189                     else if ( output_format == MSA_FORMAT.NEXUS ) {
190                         output_seqs_file_str = input_seqs_name_wo_suffix + ForesterConstants.NEXUS_FILE_SUFFIX;
191                         if ( ForesterUtil.isWritableFile( output_seqs_file_str ) != null ) {
192                             output_seqs_file_str = input_seqs_name_wo_suffix + "_"
193                                     + ForesterConstants.NEXUS_FILE_SUFFIX;
194                         }
195                     }
196                     else if ( output_format == MSA_FORMAT.PHYLIP ) {
197                         output_seqs_file_str = input_seqs_name_wo_suffix + ForesterConstants.PHYLIP_FILE_SUFFIX;
198                         if ( ForesterUtil.isWritableFile( output_seqs_file_str ) != null ) {
199                             output_seqs_file_str = input_seqs_name_wo_suffix + "_"
200                                     + ForesterConstants.PHYLIP_FILE_SUFFIX;
201                         }
202                     }
203                 }
204             }
205             final File outfile_seqs_file = new File( output_seqs_file_str );
206             final String error1 = ForesterUtil.isWritableFile( outfile_seqs_file );
207             if ( !ForesterUtil.isEmpty( error1 ) ) {
208                 ForesterUtil.fatalError( PRG_NAME, error1 );
209             }
210             System.out.println();
211             if ( input_seqs_fasta_like ) {
212                 System.out.println( "Input format          : Fasta" );
213             }
214             else {
215                 System.out.println( "Input format          : Phylip like" );
216             }
217             System.out.println( "Input file            : " + input_seqs_file_str );
218             if ( output_format == MSA_FORMAT.FASTA ) {
219                 System.out.println( "Output format         : Fasta" );
220             }
221             else if ( output_format == MSA_FORMAT.NEXUS ) {
222                 System.out.println( "Output format         : Nexus" );
223             }
224             else if ( output_format == MSA_FORMAT.PHYLIP ) {
225                 System.out.println( "Output format         : Phylip" );
226             }
227             System.out.println( "Output file           : " + output_seqs_file_str );
228             System.out.println( "Shorten names         : " + normalize_identifiers );
229             if ( normalize_identifiers ) {
230                 System.out.println( "Identifier map        : " + output_map_file_str );
231             }
232             final List<MolecularSequence> input_seqs;
233             final FileInputStream is = new FileInputStream( input_seqs_file );
234             if ( FastaParser.isLikelyFasta( input_seqs_file ) ) {
235                 input_seqs = FastaParser.parse( is );
236             }
237             else {
238                 input_seqs = GeneralMsaParser.parseSeqs( is );
239             }
240             if ( input_seqs == null ) {
241                 ForesterUtil.fatalError( PRG_NAME, "failed to read input sequences" );
242             }
243             if ( input_seqs.size() < 1 ) {
244                 ForesterUtil.fatalError( PRG_NAME, "input seems to be devoid of sequences" );
245             }
246             final BasicDescriptiveStatistics stats = new BasicDescriptiveStatistics();
247             for( final MolecularSequence seq : input_seqs ) {
248                 stats.addValue( seq.getLength() );
249             }
250             System.out.println( "Number of sequences   : " + input_seqs.size() );
251             if ( !ForesterUtil.isEqual( stats.getMin(), stats.getMax() ) ) {
252                 System.out.println( "Sequence lenght min   : " + ( int ) stats.getMin() );
253                 System.out.println( "Sequence lenght max   : " + ( int ) stats.getMax() );
254                 if ( input_seqs.size() > 2 ) {
255                     System.out.println( "Sequence lenght median: " + ( int ) stats.median() );
256                 }
257                 if ( ( output_format == MSA_FORMAT.NEXUS ) || ( output_format == MSA_FORMAT.PHYLIP ) ) {
258                     ForesterUtil.fatalError( PRG_NAME,
259                                              "Input is not an alignment, cannot write in Nexus or Phylip format" );
260                 }
261             }
262             else {
263                 System.out.println( "Alignment length      : " + ( int ) stats.getMax() );
264             }
265             final List<MolecularSequence> output_seqs = new ArrayList<>();
266             int counter = 0;
267             final BufferedWriter output_map_writer;
268             if ( normalize_identifiers ) {
269                 output_map_writer = ForesterUtil.createBufferedWriter( output_map_file_str );
270             }
271             else {
272                 output_map_writer = null;
273             }
274             for( final MolecularSequence seq : input_seqs ) {
275                 final String new_name;
276                 if ( normalize_identifiers ) {
277                     new_name = modify_name( seq.getIdentifier(), counter++, output_map_writer );
278                 }
279                 else {
280                     new_name = seq.getIdentifier();
281                 }
282                 final MolecularSequence ns = BasicSequence.createGeneralSequence( new_name,
283                                                                                   seq.getMolecularSequenceAsString() );
284                 output_seqs.add( ns );
285             }
286             if ( normalize_identifiers ) {
287                 output_map_writer.flush();
288                 output_map_writer.close();
289                 System.out.println();
290                 System.out.println( "Wrote                 : " + output_map_file );
291             }
292             final BufferedWriter seq_writer = ForesterUtil.createBufferedWriter( outfile_seqs_file );
293             if ( ( output_format == MSA_FORMAT.NEXUS ) || ( output_format == MSA_FORMAT.PHYLIP ) ) {
294                 final Msa m = BasicMsa.createInstance( output_seqs );
295                 m.write( seq_writer, output_format );
296             }
297             else if ( output_format == MSA_FORMAT.FASTA ) {
298                 SequenceWriter.writeSeqs( output_seqs, seq_writer, SEQ_FORMAT.FASTA, 60 );
299             }
300             seq_writer.flush();
301             seq_writer.close();
302             System.out.println( "Wrote                 : " + outfile_seqs_file );
303             System.out.println();
304         }
305         catch ( final IllegalArgumentException e ) {
306             ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
307         }
308         catch ( final Exception e ) {
309             e.printStackTrace();
310             ForesterUtil.fatalError( PRG_NAME, "Unexpected errror!" );
311         }
312     }
313
314     final static String modify_name( final String desc, final int counter, final Writer writer ) throws IOException {
315         desc.replaceAll( "\\s+", " " );
316         final String new_desc = Integer.toHexString( counter );
317         if ( new_desc.length() > 9 ) {
318             ForesterUtil.fatalError( PRG_NAME,
319                                      "shortened identifier [" + new_desc + "] is too long (" + new_desc.length()
320                                              + " characters)" );
321         }
322         writer.write( new_desc + "\t" + desc + "\n" );
323         return new_desc;
324     }
325
326     private final static void print_help() {
327         System.out.println( "Usage:" );
328         System.out.println();
329         System.out.println( PRG_NAME + " [options] <input sequences file> [output sequences file] [output map file]" );
330         System.out.println();
331         System.out.println( " options:" );
332         System.out.println( "  -" + OUTPUT_FORMAT_OPTION + "=<format>: output format: " + OUTPUT_FORMAT_FASTA_L + " or "
333                 + OUTPUT_FORMAT_FASTA + " for Fasta (default), " + OUTPUT_FORMAT_PHYLIP_L + " or "
334                 + OUTPUT_FORMAT_PHYLIP + " for Phylip, " + OUTPUT_FORMAT_NEXUS_L + " or " + OUTPUT_FORMAT_NEXUS
335                 + " for Nexus" );
336         System.out.println( "  -" + ID_NORM_OPTION + ": to replace sequence names with short(er) identifiers" );
337         System.out.println();
338         System.out.println( "Example:" );
339         System.out.println();
340         System.out.println( " " + PRG_NAME + " -i -o=p my_seqs.fasta" );
341         System.out.println();
342     }
343 }