2 package org.forester.application;
4 import java.io.BufferedWriter;
6 import java.io.FileInputStream;
7 import java.io.IOException;
9 import java.util.ArrayList;
10 import java.util.List;
12 import org.forester.io.parsers.FastaParser;
13 import org.forester.io.parsers.GeneralMsaParser;
14 import org.forester.io.writers.SequenceWriter;
15 import org.forester.io.writers.SequenceWriter.SEQ_FORMAT;
16 import org.forester.msa.BasicMsa;
17 import org.forester.msa.Msa;
18 import org.forester.msa.Msa.MSA_FORMAT;
19 import org.forester.sequence.BasicSequence;
20 import org.forester.sequence.MolecularSequence;
21 import org.forester.util.BasicDescriptiveStatistics;
22 import org.forester.util.CommandLineArguments;
23 import org.forester.util.ForesterConstants;
24 import org.forester.util.ForesterUtil;
28 final static private String PRG_NAME = "serin";
29 final static private String PRG_DATE = "170830";
30 final static private String PRG_DESC = "sequence file reformatting and identifier normalization";
31 final static private String PRG_VERSION = "1.00";
32 final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester";
33 final static private String E_MAIL = "phyloxml@gmail.com";
34 final static private String OUTPUT_FORMAT_OPTION = "o";
35 final static private String ID_NORM_OPTION = "i";
36 final static private String HELP_OPTION_1 = "help";
37 final static private String HELP_OPTION_2 = "h";
38 private static final String OUTPUT_FORMAT_FASTA = "f";
39 private static final String OUTPUT_FORMAT_PHYLIP = "p";
40 private static final String OUTPUT_FORMAT_NEXUS = "n";
41 private static final String OUTPUT_FORMAT_FASTA_L = "fasta";
42 private static final String OUTPUT_FORMAT_PHYLIP_L = "phylip";
43 private static final String OUTPUT_FORMAT_NEXUS_L = "nexus";
/**
 * Command-line entry point: reads a sequence file, optionally replaces the
 * sequence names with short identifiers (writing the old/new pairs to a map
 * file), and writes the sequences in Fasta, Phylip or Nexus format.
 * <p>
 * NOTE(review): this listing carries embedded line numbers and does not show
 * every line of the method (braces, else/try headers and a few statements are
 * absent); the comments below describe only what is visible.
 *
 * @param args options followed by one to three file names
 */
45 public static void main( final String args[] ) {
// Announce the program via ForesterUtil.printProgramInformation.
47 ForesterUtil.printProgramInformation( PRG_NAME,
53 ForesterUtil.getForesterLibraryInformation() );
// Parse the command line; any exception raised while doing so is reported
// through ForesterUtil.fatalError with the exception message.
54 CommandLineArguments cla = null;
56 cla = new CommandLineArguments( args );
58 catch ( final Exception e ) {
59 ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
// No file names at all, or -help / -h given.
// NOTE(review): the body of this branch is not visible here.
61 if ( ( cla.getNumberOfNames() == 0 ) || cla.isOptionSet( HELP_OPTION_1 )
62 || cla.isOptionSet( HELP_OPTION_2 ) ) {
// File names are taken from the positional arguments (the names).
67 String input_seqs_file_str = null;
68 String output_seqs_file_str = null;
69 String output_map_file_str = null;
70 String input_seqs_name_wo_suffix = null;
// Two or three names: input file, output file and, as third name, the map file.
71 if ( ( cla.getNumberOfNames() == 2 ) || ( cla.getNumberOfNames() == 3 ) ) {
72 input_seqs_file_str = cla.getName( 0 );
73 output_seqs_file_str = cla.getName( 1 );
74 if ( cla.getNumberOfNames() == 3 ) {
75 output_map_file_str = cla.getName( 2 );
// One name: only the input file is given. A base name is derived by cutting
// off a recognised suffix (matched case-insensitively); output names are
// built from that base name further down.
78 else if ( cla.getNumberOfNames() == 1 ) {
79 input_seqs_file_str = cla.getName( 0 );
80 input_seqs_name_wo_suffix = null;
81 if ( input_seqs_file_str.toLowerCase().endsWith( ".fasta" ) ) {
82 input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 6 );
84 else if ( input_seqs_file_str.toLowerCase().endsWith( ".fsa" ) ) {
85 input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 4 );
87 else if ( input_seqs_file_str.toLowerCase().endsWith( ".phy" ) ) {
88 input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 4 );
90 else if ( input_seqs_file_str.toLowerCase().endsWith( ".aln" ) ) {
91 input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 4 );
93 else if ( input_seqs_file_str.toLowerCase().endsWith( ".phylip" ) ) {
94 input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 7 );
96 else if ( input_seqs_file_str.toLowerCase().endsWith( ".nex" ) ) {
97 input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 4 );
// NOTE(review): the suffix .nexus is six characters long but only five are
// removed here, so the base name keeps a trailing dot; every other branch
// removes the full suffix length. Looks like an off-by-one - confirm and fix.
99 else if ( input_seqs_file_str.toLowerCase().endsWith( ".nexus" ) ) {
100 input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 5 );
// Presumably the fallback when no suffix matched (the enclosing else is not
// visible): the file name is used unchanged as base name.
103 input_seqs_name_wo_suffix = input_seqs_file_str;
// The map file name is derived from the base name.
105 output_map_file_str = input_seqs_name_wo_suffix + ForesterConstants.ID_MAP_FILE_SUFFIX;
// Only the -o and -i options are accepted; anything else is a fatal error.
111 final List<String> allowed_options = new ArrayList<>();
112 allowed_options.add( OUTPUT_FORMAT_OPTION );
113 allowed_options.add( ID_NORM_OPTION );
114 final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
115 if ( dissallowed_options.length() > 0 ) {
116 ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options );
// The input file must pass ForesterUtil.isReadableFile (a non-empty result is
// treated as the error text).
118 final File input_seqs_file = new File( input_seqs_file_str );
119 final String error0 = ForesterUtil.isReadableFile( input_seqs_file );
120 if ( !ForesterUtil.isEmpty( error0 ) ) {
121 ForesterUtil.fatalError( PRG_NAME, error0 );
// Used only for the input format report printed below; the parser itself is
// selected later with FastaParser.isLikelyFasta.
123 final boolean input_seqs_fasta_like = ForesterUtil.isLooksLikeFasta( input_seqs_file );
// Output format defaults to Fasta. The one-letter codes are compared
// case-sensitively, the long names case-insensitively; any other value is a
// fatal error.
124 Msa.MSA_FORMAT output_format = MSA_FORMAT.FASTA;
125 if ( cla.isOptionSet( OUTPUT_FORMAT_OPTION ) ) {
126 if ( cla.isOptionValueSet( OUTPUT_FORMAT_OPTION ) ) {
127 final String output_format_str = cla.getOptionValue( OUTPUT_FORMAT_OPTION );
128 if ( output_format_str.equals( OUTPUT_FORMAT_FASTA )
129 || output_format_str.equalsIgnoreCase( OUTPUT_FORMAT_FASTA_L ) ) {
130 output_format = MSA_FORMAT.FASTA;
132 else if ( output_format_str.equals( OUTPUT_FORMAT_PHYLIP )
133 || output_format_str.equalsIgnoreCase( OUTPUT_FORMAT_PHYLIP_L ) ) {
134 output_format = MSA_FORMAT.PHYLIP;
136 else if ( output_format_str.equals( OUTPUT_FORMAT_NEXUS )
137 || output_format_str.equalsIgnoreCase( OUTPUT_FORMAT_NEXUS_L ) ) {
138 output_format = MSA_FORMAT.NEXUS;
141 ForesterUtil.fatalError( PRG_NAME, "unknown format option: " + output_format_str );
// Identifiers are normalized when -i is given or when a third name (the map
// file) was supplied.
145 final boolean normalize_identifiers;
146 if ( cla.isOptionSet( ID_NORM_OPTION ) || ( cla.getNumberOfNames() == 3 ) ) {
147 normalize_identifiers = true;
150 normalize_identifiers = false;
// Normalization needs a map file name; none has been set, for example, when
// -i is combined with exactly two names.
152 if ( normalize_identifiers && ForesterUtil.isEmpty( output_map_file_str ) ) {
153 ForesterUtil.fatalError( PRG_NAME, "need to indicate name for output map file" );
// The map file is only created for normalization and must then pass
// ForesterUtil.isWritableFile.
155 final File output_map_file;
156 if ( normalize_identifiers ) {
157 output_map_file = new File( output_map_file_str );
158 final String error = ForesterUtil.isWritableFile( output_map_file );
159 if ( !ForesterUtil.isEmpty( error ) ) {
160 ForesterUtil.fatalError( PRG_NAME, error );
164 output_map_file = null;
// With a single name the output file name is built from the base name and a
// suffix that depends on the output format.
166 if ( cla.getNumberOfNames() == 1 ) {
167 if ( normalize_identifiers ) {
168 if ( output_format == MSA_FORMAT.FASTA ) {
169 output_seqs_file_str = input_seqs_name_wo_suffix
170 + ForesterConstants.ID_NORMALIZED_FASTA_FILE_SUFFIX;
172 else if ( output_format == MSA_FORMAT.NEXUS ) {
173 output_seqs_file_str = input_seqs_name_wo_suffix
174 + ForesterConstants.ID_NORMALIZED_NEXUS_FILE_SUFFIX;
176 else if ( output_format == MSA_FORMAT.PHYLIP ) {
177 output_seqs_file_str = input_seqs_name_wo_suffix
178 + ForesterConstants.ID_NORMALIZED_PHYLIP_FILE_SUFFIX;
// Without normalization the plain format suffix is used; if isWritableFile
// returns a non-null result for that name, an underscore is inserted between
// base name and suffix instead.
182 if ( output_format == MSA_FORMAT.FASTA ) {
183 output_seqs_file_str = input_seqs_name_wo_suffix + ForesterConstants.FASTA_FILE_SUFFIX;
184 if ( ForesterUtil.isWritableFile( output_seqs_file_str ) != null ) {
185 output_seqs_file_str = input_seqs_name_wo_suffix + "_"
186 + ForesterConstants.FASTA_FILE_SUFFIX;
189 else if ( output_format == MSA_FORMAT.NEXUS ) {
190 output_seqs_file_str = input_seqs_name_wo_suffix + ForesterConstants.NEXUS_FILE_SUFFIX;
191 if ( ForesterUtil.isWritableFile( output_seqs_file_str ) != null ) {
192 output_seqs_file_str = input_seqs_name_wo_suffix + "_"
193 + ForesterConstants.NEXUS_FILE_SUFFIX;
196 else if ( output_format == MSA_FORMAT.PHYLIP ) {
197 output_seqs_file_str = input_seqs_name_wo_suffix + ForesterConstants.PHYLIP_FILE_SUFFIX;
198 if ( ForesterUtil.isWritableFile( output_seqs_file_str ) != null ) {
199 output_seqs_file_str = input_seqs_name_wo_suffix + "_"
200 + ForesterConstants.PHYLIP_FILE_SUFFIX;
// The final output file name must pass the same writability check.
205 final File outfile_seqs_file = new File( output_seqs_file_str );
206 final String error1 = ForesterUtil.isWritableFile( outfile_seqs_file );
207 if ( !ForesterUtil.isEmpty( error1 ) ) {
208 ForesterUtil.fatalError( PRG_NAME, error1 );
// Report the settings of this run on standard output.
210 System.out.println();
211 if ( input_seqs_fasta_like ) {
212 System.out.println( "Input format : Fasta" );
215 System.out.println( "Input format : Phylip like" );
217 System.out.println( "Input file : " + input_seqs_file_str );
218 if ( output_format == MSA_FORMAT.FASTA ) {
219 System.out.println( "Output format : Fasta" );
221 else if ( output_format == MSA_FORMAT.NEXUS ) {
222 System.out.println( "Output format : Nexus" );
224 else if ( output_format == MSA_FORMAT.PHYLIP ) {
225 System.out.println( "Output format : Phylip" );
227 System.out.println( "Output file : " + output_seqs_file_str );
228 System.out.println( "Shorten names : " + normalize_identifiers );
229 if ( normalize_identifiers ) {
230 System.out.println( "Identifier map : " + output_map_file_str );
// Read the input with FastaParser when FastaParser.isLikelyFasta accepts the
// file, otherwise with GeneralMsaParser. A null or empty result is fatal.
// NOTE(review): no close() of this stream is visible in this listing -
// confirm that it is released.
232 final List<MolecularSequence> input_seqs;
233 final FileInputStream is = new FileInputStream( input_seqs_file );
234 if ( FastaParser.isLikelyFasta( input_seqs_file ) ) {
235 input_seqs = FastaParser.parse( is );
238 input_seqs = GeneralMsaParser.parseSeqs( is );
240 if ( input_seqs == null ) {
241 ForesterUtil.fatalError( PRG_NAME, "failed to read input sequences" );
243 if ( input_seqs.size() < 1 ) {
244 ForesterUtil.fatalError( PRG_NAME, "input seems to be devoid of sequences" );
// Collect all sequence lengths; equal minimum and maximum is what this method
// treats as an alignment.
246 final BasicDescriptiveStatistics stats = new BasicDescriptiveStatistics();
247 for( final MolecularSequence seq : input_seqs ) {
248 stats.addValue( seq.getLength() );
250 System.out.println( "Number of sequences : " + input_seqs.size() );
// Differing lengths: report minimum and maximum (and the median when there are
// more than two sequences); such input cannot be written as Nexus or Phylip.
251 if ( !ForesterUtil.isEqual( stats.getMin(), stats.getMax() ) ) {
252 System.out.println( "Sequence lenght min : " + ( int ) stats.getMin() );
253 System.out.println( "Sequence lenght max : " + ( int ) stats.getMax() );
254 if ( input_seqs.size() > 2 ) {
255 System.out.println( "Sequence lenght median: " + ( int ) stats.median() );
257 if ( ( output_format == MSA_FORMAT.NEXUS ) || ( output_format == MSA_FORMAT.PHYLIP ) ) {
258 ForesterUtil.fatalError( PRG_NAME,
259 "Input is not an alignment, cannot write in Nexus or Phylip format" );
// Presumably the equal-length case (the enclosing else is not visible).
263 System.out.println( "Alignment length : " + ( int ) stats.getMax() );
// Build the output sequences; the map writer exists only when identifiers are
// normalized.
265 final List<MolecularSequence> output_seqs = new ArrayList<>();
267 final BufferedWriter output_map_writer;
268 if ( normalize_identifiers ) {
269 output_map_writer = ForesterUtil.createBufferedWriter( output_map_file_str );
272 output_map_writer = null;
// Each sequence is copied with either its original identifier or the one
// returned by modify_name, which also writes the map entry.
274 for( final MolecularSequence seq : input_seqs ) {
275 final String new_name;
276 if ( normalize_identifiers ) {
277 new_name = modify_name( seq.getIdentifier(), counter++, output_map_writer );
280 new_name = seq.getIdentifier();
282 final MolecularSequence ns = BasicSequence.createGeneralSequence( new_name,
283 seq.getMolecularSequenceAsString() );
284 output_seqs.add( ns );
// Finish the map file before the sequences are written.
286 if ( normalize_identifiers ) {
287 output_map_writer.flush();
288 output_map_writer.close();
289 System.out.println();
290 System.out.println( "Wrote : " + output_map_file );
// Nexus and Phylip output goes through Msa.write; Fasta output goes through
// SequenceWriter.writeSeqs (the argument 60 is presumably the line width -
// TODO confirm).
292 final BufferedWriter seq_writer = ForesterUtil.createBufferedWriter( outfile_seqs_file );
293 if ( ( output_format == MSA_FORMAT.NEXUS ) || ( output_format == MSA_FORMAT.PHYLIP ) ) {
294 final Msa m = BasicMsa.createInstance( output_seqs );
295 m.write( seq_writer, output_format );
297 else if ( output_format == MSA_FORMAT.FASTA ) {
298 SequenceWriter.writeSeqs( output_seqs, seq_writer, SEQ_FORMAT.FASTA, 60 );
302 System.out.println( "Wrote : " + outfile_seqs_file );
303 System.out.println();
// An IllegalArgumentException is reported with its own message.
305 catch ( final IllegalArgumentException e ) {
306 ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
// Any other exception is reported with a fixed text that does not include the
// exception message.
308 catch ( final Exception e ) {
310 ForesterUtil.fatalError( PRG_NAME, "Unexpected errror!" );
314 final static String modify_name( final String desc, final int counter, final Writer writer ) throws IOException {
315 desc.replaceAll( "\\s+", " " );
316 final String new_desc = Integer.toHexString( counter );
317 if ( new_desc.length() > 9 ) {
318 ForesterUtil.fatalError( PRG_NAME,
319 "shortened identifier [" + new_desc + "] is too long (" + new_desc.length()
322 writer.write( new_desc + "\t" + desc + "\n" );
/**
 * Prints the usage text to standard output: the command line synopsis, the
 * output format option (-o) with its accepted values, the identifier
 * normalization option (-i), and one example invocation.
 */
326 private final static void print_help() {
327 System.out.println( "Usage:" );
328 System.out.println();
329 System.out.println( PRG_NAME + " [options] <input sequences file> [output sequences file] [output map file]" );
330 System.out.println();
331 System.out.println( " options:" );
// The accepted format values are taken from the OUTPUT_FORMAT_* constants so
// that the help text follows them.
332 System.out.println( " -" + OUTPUT_FORMAT_OPTION + "=<format>: output format: " + OUTPUT_FORMAT_FASTA_L + " or "
333 + OUTPUT_FORMAT_FASTA + " for Fasta (default), " + OUTPUT_FORMAT_PHYLIP_L + " or "
334 + OUTPUT_FORMAT_PHYLIP + " for Phylip, " + OUTPUT_FORMAT_NEXUS_L + " or " + OUTPUT_FORMAT_NEXUS
336 System.out.println( " -" + ID_NORM_OPTION + ": to replace sequence names with short(er) identifiers" );
337 System.out.println();
338 System.out.println( "Example:" );
339 System.out.println();
340 System.out.println( " " + PRG_NAME + " -i -o=p my_seqs.fasta" );
341 System.out.println();