2 package org.forester.application;
4 import java.io.BufferedWriter;
6 import java.io.FileInputStream;
7 import java.io.IOException;
9 import java.util.ArrayList;
10 import java.util.List;
12 import org.forester.io.parsers.FastaParser;
13 import org.forester.io.parsers.GeneralMsaParser;
14 import org.forester.io.writers.SequenceWriter;
15 import org.forester.io.writers.SequenceWriter.SEQ_FORMAT;
16 import org.forester.msa.BasicMsa;
17 import org.forester.msa.Msa;
18 import org.forester.msa.Msa.MSA_FORMAT;
19 import org.forester.sequence.BasicSequence;
20 import org.forester.sequence.MolecularSequence;
21 import org.forester.util.BasicDescriptiveStatistics;
22 import org.forester.util.CommandLineArguments;
23 import org.forester.util.ForesterConstants;
24 import org.forester.util.ForesterUtil;
// Program identity. PRG_NAME is the label passed to ForesterUtil.fatalError and
// printed in the usage text; the remaining values are presumably passed to
// ForesterUtil.printProgramInformation (those argument lines are not visible
// in this listing — confirm).
28 final static private String PRG_NAME = "rid";
29 final static private String PRG_DATE = "170902";
30 final static private String PRG_DESC = "sequence file reformatting and identifier normalization";
31 final static private String PRG_VERSION = "1.00";
32 final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester";
33 final static private String E_MAIL = "phyloxml@gmail.com";
// Command-line option names: "o" selects the output format, "s" switches on
// identifier normalization, "help"/"h" request the usage text.
34 final static private String OUTPUT_FORMAT_OPTION = "o";
35 final static private String ID_NORM_OPTION = "s";
36 final static private String HELP_OPTION_1 = "help";
37 final static private String HELP_OPTION_2 = "h";
// Accepted values of the output format option. main() compares the one-letter
// codes with equals() (case-sensitive) and the long names with
// equalsIgnoreCase().
// NOTE(review): modifier order differs from the constants above
// ("private static final" vs. "final static private") — harmless, but inconsistent.
38 private static final String OUTPUT_FORMAT_FASTA = "f";
39 private static final String OUTPUT_FORMAT_PHYLIP = "p";
40 private static final String OUTPUT_FORMAT_NEXUS = "n";
41 private static final String OUTPUT_FORMAT_FASTA_L = "fasta";
42 private static final String OUTPUT_FORMAT_PHYLIP_L = "phylip";
43 private static final String OUTPUT_FORMAT_NEXUS_L = "nexus";
/**
 * Command-line entry point: reads a sequence file, optionally replaces the
 * sequence identifiers with short hexadecimal ones (recording the new and old
 * names in a map file via modify_name), and writes the sequences as Fasta,
 * Phylip or Nexus.
 *
 * Positional names: input file, optional output file, optional map file.
 * Errors are reported through ForesterUtil.fatalError.
 *
 * NOTE(review): this listing is incomplete — a number of original lines
 * (for example "try {", "else {", closing braces and the declaration of
 * counter) are not visible. The comments below describe only the statements
 * that are shown.
 *
 * @param args command-line arguments, handed to CommandLineArguments
 */
45 public static void main( final String args[] ) {
// Program banner (the arguments between these two lines are not visible here).
47 ForesterUtil.printProgramInformation( PRG_NAME,
53 ForesterUtil.getForesterLibraryInformation() );
// Parse the command line; a parsing failure is reported with its message.
54 CommandLineArguments cla = null;
56 cla = new CommandLineArguments( args );
58 catch ( final Exception e ) {
59 ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
// No file names at all, or help requested (-help / -h). The body of this
// branch is not visible here; presumably it calls print_help() — confirm.
61 if ( ( cla.getNumberOfNames() == 0 ) || cla.isOptionSet( HELP_OPTION_1 )
62 || cla.isOptionSet( HELP_OPTION_2 ) ) {
67 String input_seqs_file_str = null;
68 String output_seqs_file_str = null;
69 String output_map_file_str = null;
70 String input_seqs_name_wo_suffix = null;
// Two or three names: explicit input and output files; a third name is the
// identifier map file.
71 if ( ( cla.getNumberOfNames() == 2 ) || ( cla.getNumberOfNames() == 3 ) ) {
72 input_seqs_file_str = cla.getName( 0 );
73 output_seqs_file_str = cla.getName( 1 );
74 if ( cla.getNumberOfNames() == 3 ) {
75 output_map_file_str = cla.getName( 2 );
// One name only: derive the output names from the input name by stripping a
// known sequence-file suffix (matched case-insensitively).
78 else if ( cla.getNumberOfNames() == 1 ) {
79 input_seqs_file_str = cla.getName( 0 );
80 input_seqs_name_wo_suffix = null;
81 if ( input_seqs_file_str.toLowerCase().endsWith( ".fasta" ) ) {
82 input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 6 );
84 else if ( input_seqs_file_str.toLowerCase().endsWith( ".fsa" ) ) {
85 input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 4 );
87 else if ( input_seqs_file_str.toLowerCase().endsWith( ".phy" ) ) {
88 input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 4 );
90 else if ( input_seqs_file_str.toLowerCase().endsWith( ".aln" ) ) {
91 input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 4 );
93 else if ( input_seqs_file_str.toLowerCase().endsWith( ".phylip" ) ) {
94 input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 7 );
96 else if ( input_seqs_file_str.toLowerCase().endsWith( ".nex" ) ) {
97 input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 4 );
// NOTE(review): ".nexus" is 6 characters long but only 5 are stripped below,
// which leaves a trailing "." in the base name ("x.nexus" -> "x."). Looks like
// an off-by-one; the other suffixes strip their full length. Confirm and fix.
99 else if ( input_seqs_file_str.toLowerCase().endsWith( ".nexus" ) ) {
100 input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 5 );
// Fallback when no suffix matched (the enclosing "else" is not visible here):
// use the file name unchanged.
103 input_seqs_name_wo_suffix = input_seqs_file_str;
// Default map file name: base name plus the ID-map suffix.
105 output_map_file_str = input_seqs_name_wo_suffix + ForesterConstants.ID_MAP_FILE_SUFFIX;
// Only the output-format and identifier-normalization options are allowed;
// anything else is reported as unknown.
111 final List<String> allowed_options = new ArrayList<String>();
112 allowed_options.add( OUTPUT_FORMAT_OPTION );
113 allowed_options.add( ID_NORM_OPTION );
114 final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
115 if ( dissallowed_options.length() > 0 ) {
116 ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options );
// The input file must be readable: a non-empty result from isReadableFile is
// treated as the error message.
118 final File input_seqs_file = new File( input_seqs_file_str );
119 final String error0 = ForesterUtil.isReadableFile( input_seqs_file );
120 if ( !ForesterUtil.isEmpty( error0 ) ) {
121 ForesterUtil.fatalError( PRG_NAME, error0 );
// Format sniffing; this flag is only used for the "Input format" report below.
123 final boolean input_seqs_fasta_like = ForesterUtil.isLooksLikeFasta( input_seqs_file );
// Output format defaults to Fasta. The option value may be a one-letter code
// (compared case-sensitively) or the full format name (case-insensitive).
124 Msa.MSA_FORMAT output_format = MSA_FORMAT.FASTA;
125 if ( cla.isOptionSet( OUTPUT_FORMAT_OPTION ) ) {
126 if ( cla.isOptionValueSet( OUTPUT_FORMAT_OPTION ) ) {
127 final String output_format_str = cla.getOptionValue( OUTPUT_FORMAT_OPTION );
128 if ( output_format_str.equals( OUTPUT_FORMAT_FASTA )
129 || output_format_str.equalsIgnoreCase( OUTPUT_FORMAT_FASTA_L ) ) {
130 output_format = MSA_FORMAT.FASTA;
132 else if ( output_format_str.equals( OUTPUT_FORMAT_PHYLIP )
133 || output_format_str.equalsIgnoreCase( OUTPUT_FORMAT_PHYLIP_L ) ) {
134 output_format = MSA_FORMAT.PHYLIP;
136 else if ( output_format_str.equals( OUTPUT_FORMAT_NEXUS )
137 || output_format_str.equalsIgnoreCase( OUTPUT_FORMAT_NEXUS_L ) ) {
138 output_format = MSA_FORMAT.NEXUS;
// Value matched none of the known formats.
141 ForesterUtil.fatalError( PRG_NAME, "unknown output format option: " + output_format_str );
// Option present but without a value.
145 ForesterUtil.fatalError( PRG_NAME, "no value for output format option" );
// Identifiers are normalized when the normalization option is set or when a
// map file was given as third name.
148 final boolean normalize_identifiers;
149 if ( cla.isOptionSet( ID_NORM_OPTION ) || ( cla.getNumberOfNames() == 3 ) ) {
150 normalize_identifiers = true;
153 normalize_identifiers = false;
// Normalizing requires a map file name (it is unset when exactly two names
// were given together with the normalization option).
155 if ( normalize_identifiers && ForesterUtil.isEmpty( output_map_file_str ) ) {
156 ForesterUtil.fatalError( PRG_NAME, "need to indicate name for output map file" );
// The map file is created and checked for writability only when normalizing;
// otherwise it stays null.
158 final File output_map_file;
159 if ( normalize_identifiers ) {
160 output_map_file = new File( output_map_file_str );
161 final String error = ForesterUtil.isWritableFile( output_map_file );
162 if ( !ForesterUtil.isEmpty( error ) ) {
163 ForesterUtil.fatalError( PRG_NAME, error );
167 output_map_file = null;
// One-name mode: pick the output file name from the base name and the
// output format.
169 if ( cla.getNumberOfNames() == 1 ) {
// Normalizing: use the format-specific "normalized" suffix.
170 if ( normalize_identifiers ) {
171 if ( output_format == MSA_FORMAT.FASTA ) {
172 output_seqs_file_str = input_seqs_name_wo_suffix
173 + ForesterConstants.ID_NORMALIZED_FASTA_FILE_SUFFIX;
175 else if ( output_format == MSA_FORMAT.NEXUS ) {
176 output_seqs_file_str = input_seqs_name_wo_suffix
177 + ForesterConstants.ID_NORMALIZED_NEXUS_FILE_SUFFIX;
179 else if ( output_format == MSA_FORMAT.PHYLIP ) {
180 output_seqs_file_str = input_seqs_name_wo_suffix
181 + ForesterConstants.ID_NORMALIZED_PHYLIP_FILE_SUFFIX;
// Not normalizing (the "else" is not visible here): use the plain format
// suffix; when the writability check on that name returns non-null, "_" is
// inserted before the suffix instead.
// NOTE(review): here the result of isWritableFile is compared with null and a
// String is passed, whereas elsewhere in this method a File is passed and the
// result is tested with ForesterUtil.isEmpty(). Confirm the overload and its
// return contract.
185 if ( output_format == MSA_FORMAT.FASTA ) {
186 output_seqs_file_str = input_seqs_name_wo_suffix + ForesterConstants.FASTA_FILE_SUFFIX;
187 if ( ForesterUtil.isWritableFile( output_seqs_file_str ) != null ) {
188 output_seqs_file_str = input_seqs_name_wo_suffix + "_"
189 + ForesterConstants.FASTA_FILE_SUFFIX;
192 else if ( output_format == MSA_FORMAT.NEXUS ) {
193 output_seqs_file_str = input_seqs_name_wo_suffix + ForesterConstants.NEXUS_FILE_SUFFIX;
194 if ( ForesterUtil.isWritableFile( output_seqs_file_str ) != null ) {
195 output_seqs_file_str = input_seqs_name_wo_suffix + "_"
196 + ForesterConstants.NEXUS_FILE_SUFFIX;
199 else if ( output_format == MSA_FORMAT.PHYLIP ) {
200 output_seqs_file_str = input_seqs_name_wo_suffix + ForesterConstants.PHYLIP_FILE_SUFFIX;
201 if ( ForesterUtil.isWritableFile( output_seqs_file_str ) != null ) {
202 output_seqs_file_str = input_seqs_name_wo_suffix + "_"
203 + ForesterConstants.PHYLIP_FILE_SUFFIX;
// The output sequence file must be writable.
208 final File outfile_seqs_file = new File( output_seqs_file_str );
209 final String error1 = ForesterUtil.isWritableFile( outfile_seqs_file );
210 if ( !ForesterUtil.isEmpty( error1 ) ) {
211 ForesterUtil.fatalError( PRG_NAME, error1 );
// Report the settings of this run on standard output.
213 System.out.println();
214 if ( input_seqs_fasta_like ) {
215 System.out.println( "Input format : Fasta" );
218 System.out.println( "Input format : Phylip like" );
220 System.out.println( "Input file : " + input_seqs_file_str );
221 if ( output_format == MSA_FORMAT.FASTA ) {
222 System.out.println( "Output format : Fasta" );
224 else if ( output_format == MSA_FORMAT.NEXUS ) {
225 System.out.println( "Output format : Nexus" );
227 else if ( output_format == MSA_FORMAT.PHYLIP ) {
228 System.out.println( "Output format : Phylip" );
230 System.out.println( "Output file : " + output_seqs_file_str );
231 System.out.println( "Shorten names : " + normalize_identifiers );
232 if ( normalize_identifiers ) {
233 System.out.println( "Identifier map : " + output_map_file_str );
// Read the input: the Fasta parser when the file is likely Fasta, the general
// MSA parser otherwise.
// NOTE(review): the parser is chosen with FastaParser.isLikelyFasta while the
// "Input format" report above used ForesterUtil.isLooksLikeFasta — confirm the
// two always agree. Also, no close() of this FileInputStream is visible in
// this listing — confirm it is released.
235 final List<MolecularSequence> input_seqs;
236 final FileInputStream is = new FileInputStream( input_seqs_file );
237 if ( FastaParser.isLikelyFasta( input_seqs_file ) ) {
238 input_seqs = FastaParser.parse( is );
241 input_seqs = GeneralMsaParser.parseSeqs( is );
243 if ( input_seqs == null ) {
244 ForesterUtil.fatalError( PRG_NAME, "failed to read input sequences" );
246 if ( input_seqs.size() < 1 ) {
247 ForesterUtil.fatalError( PRG_NAME, "input seems to be devoid of sequences" );
// Gather sequence lengths to tell aligned from unaligned input.
249 final BasicDescriptiveStatistics stats = new BasicDescriptiveStatistics();
250 for( final MolecularSequence seq : input_seqs ) {
251 stats.addValue( seq.getLength() );
253 System.out.println( "Number of sequences : " + input_seqs.size() );
// Lengths differ: the input is not an alignment. Min and max are reported
// (median only for more than two sequences), and Nexus/Phylip output is refused.
// NOTE(review): "lenght" in the two messages below is a typo in user-facing text.
254 if ( !ForesterUtil.isEqual( stats.getMin(), stats.getMax() ) ) {
255 System.out.println( "Sequence lenght min : " + ( int ) stats.getMin() );
256 System.out.println( "Sequence lenght max : " + ( int ) stats.getMax() );
257 if ( input_seqs.size() > 2 ) {
258 System.out.println( "Sequence length median: " + ( int ) stats.median() );
260 if ( ( output_format == MSA_FORMAT.NEXUS ) || ( output_format == MSA_FORMAT.PHYLIP ) ) {
261 ForesterUtil.fatalError( PRG_NAME,
262 "Input is not an alignment, cannot write in Nexus or Phylip format" );
// All lengths equal (the "else" is not visible here): report the alignment length.
266 System.out.println( "Alignment length : " + ( int ) stats.getMax() );
// Build the output list, renaming each sequence when normalizing.
268 final List<MolecularSequence> output_seqs = new ArrayList<MolecularSequence>();
// The map writer exists only when normalizing; otherwise it is null.
270 final BufferedWriter output_map_writer;
271 if ( normalize_identifiers ) {
272 output_map_writer = ForesterUtil.createBufferedWriter( output_map_file_str );
275 output_map_writer = null;
// "counter" (declared on a line not shown here) is incremented once per
// renamed sequence and becomes the new hexadecimal identifier.
277 for( final MolecularSequence seq : input_seqs ) {
278 final String new_name;
279 if ( normalize_identifiers ) {
280 new_name = modify_name( seq.getIdentifier(), counter++, output_map_writer );
283 new_name = seq.getIdentifier();
// Each output sequence is a new general sequence with the same residues.
285 final MolecularSequence ns = BasicSequence.createGeneralSequence( new_name,
286 seq.getMolecularSequenceAsString() );
287 output_seqs.add( ns );
289 System.out.println();
// Finish the identifier map file.
290 if ( normalize_identifiers ) {
291 output_map_writer.flush();
292 output_map_writer.close();
293 System.out.println( "Wrote : " + output_map_file );
// Write the sequences: through an Msa for Nexus/Phylip, through SequenceWriter
// for Fasta (the trailing 60 is presumably the line width — confirm).
295 final BufferedWriter seq_writer = ForesterUtil.createBufferedWriter( outfile_seqs_file );
296 if ( ( output_format == MSA_FORMAT.NEXUS ) || ( output_format == MSA_FORMAT.PHYLIP ) ) {
297 final Msa m = BasicMsa.createInstance( output_seqs );
298 m.write( seq_writer, output_format );
300 else if ( output_format == MSA_FORMAT.FASTA ) {
301 SequenceWriter.writeSeqs( output_seqs, seq_writer, SEQ_FORMAT.FASTA, 60 );
305 System.out.println( "Wrote : " + outfile_seqs_file );
306 System.out.println();
// IllegalArgumentException: report its own message.
308 catch ( final IllegalArgumentException e ) {
309 ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
// Any other exception: a generic message only.
// NOTE(review): in the visible lines the exception detail is not part of the
// message, which makes failures hard to diagnose; "errror" is also a typo.
311 catch ( final Exception e ) {
313 ForesterUtil.fatalError( PRG_NAME, "Unexpected errror!" );
317 final static String modify_name( final String desc, final int counter, final Writer writer ) throws IOException {
318 desc.replaceAll( "\\s+", " " );
319 final String new_desc = Integer.toHexString( counter );
320 if ( new_desc.length() > 9 ) {
321 ForesterUtil.fatalError( PRG_NAME,
322 "shortened identifier [" + new_desc + "] is too long (" + new_desc.length()
325 writer.write( new_desc + "\t" + desc + "\n" );
/**
 * Prints the usage text to standard output: the command synopsis, the two
 * options (output format and identifier shortening) and one example call.
 *
 * NOTE(review): the end of the output-format line and the end of this method
 * are not visible in this listing.
 */
329 private final static void print_help() {
330 System.out.println( "Usage:" );
331 System.out.println();
// Synopsis: only the input file is mandatory; output and map file are optional.
332 System.out.println( PRG_NAME + " [options] <input sequences file> [output sequences file] [output map file]" );
333 System.out.println();
334 System.out.println( " options:" );
// Output format option: lists the long name and the one-letter code of each format.
335 System.out.println( " -" + OUTPUT_FORMAT_OPTION + "=<format>: output format: " + OUTPUT_FORMAT_FASTA_L + " or "
336 + OUTPUT_FORMAT_FASTA + " for Fasta (default), " + OUTPUT_FORMAT_PHYLIP_L + " or "
337 + OUTPUT_FORMAT_PHYLIP + " for Phylip, " + OUTPUT_FORMAT_NEXUS_L + " or " + OUTPUT_FORMAT_NEXUS
// Identifier normalization option.
339 System.out.println( " -" + ID_NORM_OPTION + " : to replace sequence names with short(er) identifiers" );
340 System.out.println();
// Example: shorten identifiers and write Phylip, with only the input file named.
341 System.out.println( "Example:" );
342 System.out.println();
343 System.out.println( " " + PRG_NAME + " -s -o=p my_seqs.fasta" );
344 System.out.println();