9d25d8d1dc5a6a5c6df97ac03e8fc223c3ce3a34
[jalview.git] / forester / java / src / org / forester / application / rid.java
1
2 package org.forester.application;
3
4 import java.io.BufferedWriter;
5 import java.io.File;
6 import java.io.FileInputStream;
7 import java.io.IOException;
8 import java.io.Writer;
9 import java.util.ArrayList;
10 import java.util.List;
11
12 import org.forester.io.parsers.FastaParser;
13 import org.forester.io.parsers.GeneralMsaParser;
14 import org.forester.io.writers.SequenceWriter;
15 import org.forester.io.writers.SequenceWriter.SEQ_FORMAT;
16 import org.forester.msa.BasicMsa;
17 import org.forester.msa.Msa;
18 import org.forester.msa.Msa.MSA_FORMAT;
19 import org.forester.sequence.BasicSequence;
20 import org.forester.sequence.MolecularSequence;
21 import org.forester.util.BasicDescriptiveStatistics;
22 import org.forester.util.CommandLineArguments;
23 import org.forester.util.ForesterConstants;
24 import org.forester.util.ForesterUtil;
25
26 public class rid {
27
28     final static private String PRG_NAME               = "rid";
29     final static private String PRG_DATE               = "170902";
30     final static private String PRG_DESC               = "sequence file reformatting and identifier normalization";
31     final static private String PRG_VERSION            = "1.00";
32     final static private String WWW                    = "https://sites.google.com/site/cmzmasek/home/software/forester";
33     final static private String E_MAIL                 = "phyloxml@gmail.com";
34     final static private String OUTPUT_FORMAT_OPTION   = "o";
35     final static private String ID_NORM_OPTION         = "s";
36     final static private String HELP_OPTION_1          = "help";
37     final static private String HELP_OPTION_2          = "h";
38     private static final String OUTPUT_FORMAT_FASTA    = "f";
39     private static final String OUTPUT_FORMAT_PHYLIP   = "p";
40     private static final String OUTPUT_FORMAT_NEXUS    = "n";
41     private static final String OUTPUT_FORMAT_FASTA_L  = "fasta";
42     private static final String OUTPUT_FORMAT_PHYLIP_L = "phylip";
43     private static final String OUTPUT_FORMAT_NEXUS_L  = "nexus";
44
45     public static void main( final String args[] ) {
46         try {
47             ForesterUtil.printProgramInformation( PRG_NAME,
48                                                   PRG_DESC,
49                                                   PRG_VERSION,
50                                                   PRG_DATE,
51                                                   E_MAIL,
52                                                   WWW,
53                                                   ForesterUtil.getForesterLibraryInformation() );
54             CommandLineArguments cla = null;
55             try {
56                 cla = new CommandLineArguments( args );
57             }
58             catch ( final Exception e ) {
59                 ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
60             }
61             if ( ( cla.getNumberOfNames() == 0 ) || cla.isOptionSet( HELP_OPTION_1 )
62                     || cla.isOptionSet( HELP_OPTION_2 ) ) {
63                 System.out.println();
64                 print_help();
65                 System.exit( 0 );
66             }
67             String input_seqs_file_str = null;
68             String output_seqs_file_str = null;
69             String output_map_file_str = null;
70             String input_seqs_name_wo_suffix = null;
71             if ( ( cla.getNumberOfNames() == 2 ) || ( cla.getNumberOfNames() == 3 ) ) {
72                 input_seqs_file_str = cla.getName( 0 );
73                 output_seqs_file_str = cla.getName( 1 );
74                 if ( cla.getNumberOfNames() == 3 ) {
75                     output_map_file_str = cla.getName( 2 );
76                 }
77             }
78             else if ( cla.getNumberOfNames() == 1 ) {
79                 input_seqs_file_str = cla.getName( 0 );
80                 input_seqs_name_wo_suffix = null;
81                 if ( input_seqs_file_str.toLowerCase().endsWith( ".fasta" ) ) {
82                     input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 6 );
83                 }
84                 else if ( input_seqs_file_str.toLowerCase().endsWith( ".fsa" ) ) {
85                     input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 4 );
86                 }
87                 else if ( input_seqs_file_str.toLowerCase().endsWith( ".phy" ) ) {
88                     input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 4 );
89                 }
90                 else if ( input_seqs_file_str.toLowerCase().endsWith( ".aln" ) ) {
91                     input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 4 );
92                 }
93                 else if ( input_seqs_file_str.toLowerCase().endsWith( ".phylip" ) ) {
94                     input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 7 );
95                 }
96                 else if ( input_seqs_file_str.toLowerCase().endsWith( ".nex" ) ) {
97                     input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 4 );
98                 }
99                 else if ( input_seqs_file_str.toLowerCase().endsWith( ".nexus" ) ) {
100                     input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 5 );
101                 }
102                 else {
103                     input_seqs_name_wo_suffix = input_seqs_file_str;
104                 }
105                 output_map_file_str = input_seqs_name_wo_suffix + ForesterConstants.ID_MAP_FILE_SUFFIX;
106             }
107             else {
108                 print_help();
109                 System.exit( -1 );
110             }
111             final List<String> allowed_options = new ArrayList<>();
112             allowed_options.add( OUTPUT_FORMAT_OPTION );
113             allowed_options.add( ID_NORM_OPTION );
114             final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
115             if ( dissallowed_options.length() > 0 ) {
116                 ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options );
117             }
118             final File input_seqs_file = new File( input_seqs_file_str );
119             final String error0 = ForesterUtil.isReadableFile( input_seqs_file );
120             if ( !ForesterUtil.isEmpty( error0 ) ) {
121                 ForesterUtil.fatalError( PRG_NAME, error0 );
122             }
123             final boolean input_seqs_fasta_like = ForesterUtil.isLooksLikeFasta( input_seqs_file );
124             Msa.MSA_FORMAT output_format = MSA_FORMAT.FASTA;
125             if ( cla.isOptionSet( OUTPUT_FORMAT_OPTION ) ) {
126                 if ( cla.isOptionValueSet( OUTPUT_FORMAT_OPTION ) ) {
127                     final String output_format_str = cla.getOptionValue( OUTPUT_FORMAT_OPTION );
128                     if ( output_format_str.equals( OUTPUT_FORMAT_FASTA )
129                             || output_format_str.equalsIgnoreCase( OUTPUT_FORMAT_FASTA_L ) ) {
130                         output_format = MSA_FORMAT.FASTA;
131                     }
132                     else if ( output_format_str.equals( OUTPUT_FORMAT_PHYLIP )
133                             || output_format_str.equalsIgnoreCase( OUTPUT_FORMAT_PHYLIP_L ) ) {
134                         output_format = MSA_FORMAT.PHYLIP;
135                     }
136                     else if ( output_format_str.equals( OUTPUT_FORMAT_NEXUS )
137                             || output_format_str.equalsIgnoreCase( OUTPUT_FORMAT_NEXUS_L ) ) {
138                         output_format = MSA_FORMAT.NEXUS;
139                     }
140                     else {
141                         ForesterUtil.fatalError( PRG_NAME, "unknown output format option: " + output_format_str );
142                     }
143                 }
144                 else {
145                     ForesterUtil.fatalError( PRG_NAME, "no value for output format option"  );
146                 }
147             }
148             final boolean normalize_identifiers;
149             if ( cla.isOptionSet( ID_NORM_OPTION ) || ( cla.getNumberOfNames() == 3 ) ) {
150                 normalize_identifiers = true;
151             }
152             else {
153                 normalize_identifiers = false;
154             }
155             if ( normalize_identifiers && ForesterUtil.isEmpty( output_map_file_str ) ) {
156                 ForesterUtil.fatalError( PRG_NAME, "need to indicate name for output map file" );
157             }
158             final File output_map_file;
159             if ( normalize_identifiers ) {
160                 output_map_file = new File( output_map_file_str );
161                 final String error = ForesterUtil.isWritableFile( output_map_file );
162                 if ( !ForesterUtil.isEmpty( error ) ) {
163                     ForesterUtil.fatalError( PRG_NAME, error );
164                 }
165             }
166             else {
167                 output_map_file = null;
168             }
169             if ( cla.getNumberOfNames() == 1 ) {
170                 if ( normalize_identifiers ) {
171                     if ( output_format == MSA_FORMAT.FASTA ) {
172                         output_seqs_file_str = input_seqs_name_wo_suffix
173                                 + ForesterConstants.ID_NORMALIZED_FASTA_FILE_SUFFIX;
174                     }
175                     else if ( output_format == MSA_FORMAT.NEXUS ) {
176                         output_seqs_file_str = input_seqs_name_wo_suffix
177                                 + ForesterConstants.ID_NORMALIZED_NEXUS_FILE_SUFFIX;
178                     }
179                     else if ( output_format == MSA_FORMAT.PHYLIP ) {
180                         output_seqs_file_str = input_seqs_name_wo_suffix
181                                 + ForesterConstants.ID_NORMALIZED_PHYLIP_FILE_SUFFIX;
182                     }
183                 }
184                 else {
185                     if ( output_format == MSA_FORMAT.FASTA ) {
186                         output_seqs_file_str = input_seqs_name_wo_suffix + ForesterConstants.FASTA_FILE_SUFFIX;
187                         if ( ForesterUtil.isWritableFile( output_seqs_file_str ) != null ) {
188                             output_seqs_file_str = input_seqs_name_wo_suffix + "_"
189                                     + ForesterConstants.FASTA_FILE_SUFFIX;
190                         }
191                     }
192                     else if ( output_format == MSA_FORMAT.NEXUS ) {
193                         output_seqs_file_str = input_seqs_name_wo_suffix + ForesterConstants.NEXUS_FILE_SUFFIX;
194                         if ( ForesterUtil.isWritableFile( output_seqs_file_str ) != null ) {
195                             output_seqs_file_str = input_seqs_name_wo_suffix + "_"
196                                     + ForesterConstants.NEXUS_FILE_SUFFIX;
197                         }
198                     }
199                     else if ( output_format == MSA_FORMAT.PHYLIP ) {
200                         output_seqs_file_str = input_seqs_name_wo_suffix + ForesterConstants.PHYLIP_FILE_SUFFIX;
201                         if ( ForesterUtil.isWritableFile( output_seqs_file_str ) != null ) {
202                             output_seqs_file_str = input_seqs_name_wo_suffix + "_"
203                                     + ForesterConstants.PHYLIP_FILE_SUFFIX;
204                         }
205                     }
206                 }
207             }
208             final File outfile_seqs_file = new File( output_seqs_file_str );
209             final String error1 = ForesterUtil.isWritableFile( outfile_seqs_file );
210             if ( !ForesterUtil.isEmpty( error1 ) ) {
211                 ForesterUtil.fatalError( PRG_NAME, error1 );
212             }
213             System.out.println();
214             if ( input_seqs_fasta_like ) {
215                 System.out.println( "Input format          : Fasta" );
216             }
217             else {
218                 System.out.println( "Input format          : Phylip like" );
219             }
220             System.out.println( "Input file            : " + input_seqs_file_str );
221             if ( output_format == MSA_FORMAT.FASTA ) {
222                 System.out.println( "Output format         : Fasta" );
223             }
224             else if ( output_format == MSA_FORMAT.NEXUS ) {
225                 System.out.println( "Output format         : Nexus" );
226             }
227             else if ( output_format == MSA_FORMAT.PHYLIP ) {
228                 System.out.println( "Output format         : Phylip" );
229             }
230             System.out.println( "Output file           : " + output_seqs_file_str );
231             System.out.println( "Shorten names         : " + normalize_identifiers );
232             if ( normalize_identifiers ) {
233                 System.out.println( "Identifier map        : " + output_map_file_str );
234             }
235             final List<MolecularSequence> input_seqs;
236             final FileInputStream is = new FileInputStream( input_seqs_file );
237             if ( FastaParser.isLikelyFasta( input_seqs_file ) ) {
238                 input_seqs = FastaParser.parse( is );
239             }
240             else {
241                 input_seqs = GeneralMsaParser.parseSeqs( is );
242             }
243             if ( input_seqs == null ) {
244                 ForesterUtil.fatalError( PRG_NAME, "failed to read input sequences" );
245             }
246             if ( input_seqs.size() < 1 ) {
247                 ForesterUtil.fatalError( PRG_NAME, "input seems to be devoid of sequences" );
248             }
249             final BasicDescriptiveStatistics stats = new BasicDescriptiveStatistics();
250             for( final MolecularSequence seq : input_seqs ) {
251                 stats.addValue( seq.getLength() );
252             }
253             System.out.println( "Number of sequences   : " + input_seqs.size() );
254             if ( !ForesterUtil.isEqual( stats.getMin(), stats.getMax() ) ) {
255                 System.out.println( "Sequence lenght min   : " + ( int ) stats.getMin() );
256                 System.out.println( "Sequence lenght max   : " + ( int ) stats.getMax() );
257                 if ( input_seqs.size() > 2 ) {
258                     System.out.println( "Sequence length median: " + ( int ) stats.median() );
259                 }
260                 if ( ( output_format == MSA_FORMAT.NEXUS ) || ( output_format == MSA_FORMAT.PHYLIP ) ) {
261                     ForesterUtil.fatalError( PRG_NAME,
262                                              "Input is not an alignment, cannot write in Nexus or Phylip format" );
263                 }
264             }
265             else {
266                 System.out.println( "Alignment length      : " + ( int ) stats.getMax() );
267             }
268             final List<MolecularSequence> output_seqs = new ArrayList<>();
269             int counter = 0;
270             final BufferedWriter output_map_writer;
271             if ( normalize_identifiers ) {
272                 output_map_writer = ForesterUtil.createBufferedWriter( output_map_file_str );
273             }
274             else {
275                 output_map_writer = null;
276             }
277             for( final MolecularSequence seq : input_seqs ) {
278                 final String new_name;
279                 if ( normalize_identifiers ) {
280                     new_name = modify_name( seq.getIdentifier(), counter++, output_map_writer );
281                 }
282                 else {
283                     new_name = seq.getIdentifier();
284                 }
285                 final MolecularSequence ns = BasicSequence.createGeneralSequence( new_name,
286                                                                                   seq.getMolecularSequenceAsString() );
287                 output_seqs.add( ns );
288             }
289             System.out.println();
290             if ( normalize_identifiers ) {
291                 output_map_writer.flush();
292                 output_map_writer.close();
293                 System.out.println( "Wrote                 : " + output_map_file );
294             }
295             final BufferedWriter seq_writer = ForesterUtil.createBufferedWriter( outfile_seqs_file );
296             if ( ( output_format == MSA_FORMAT.NEXUS ) || ( output_format == MSA_FORMAT.PHYLIP ) ) {
297                 final Msa m = BasicMsa.createInstance( output_seqs );
298                 m.write( seq_writer, output_format );
299             }
300             else if ( output_format == MSA_FORMAT.FASTA ) {
301                 SequenceWriter.writeSeqs( output_seqs, seq_writer, SEQ_FORMAT.FASTA, 60 );
302             }
303             seq_writer.flush();
304             seq_writer.close();
305             System.out.println( "Wrote                 : " + outfile_seqs_file );
306             System.out.println();
307         }
308         catch ( final IllegalArgumentException e ) {
309             ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
310         }
311         catch ( final Exception e ) {
312             e.printStackTrace();
313             ForesterUtil.fatalError( PRG_NAME, "Unexpected errror!" );
314         }
315     }
316
317     final static String modify_name( final String desc, final int counter, final Writer writer ) throws IOException {
318         desc.replaceAll( "\\s+", " " );
319         final String new_desc = Integer.toHexString( counter );
320         if ( new_desc.length() > 9 ) {
321             ForesterUtil.fatalError( PRG_NAME,
322                                      "shortened identifier [" + new_desc + "] is too long (" + new_desc.length()
323                                              + " characters)" );
324         }
325         writer.write( new_desc + "\t" + desc + "\n" );
326         return new_desc;
327     }
328
329     private final static void print_help() {
330         System.out.println( "Usage:" );
331         System.out.println();
332         System.out.println( PRG_NAME + " [options] <input sequences file> [output sequences file] [output map file]" );
333         System.out.println();
334         System.out.println( " options:" );
335         System.out.println( "  -" + OUTPUT_FORMAT_OPTION + "=<format>: output format: " + OUTPUT_FORMAT_FASTA_L + " or "
336                 + OUTPUT_FORMAT_FASTA + " for Fasta (default), " + OUTPUT_FORMAT_PHYLIP_L + " or "
337                 + OUTPUT_FORMAT_PHYLIP + " for Phylip, " + OUTPUT_FORMAT_NEXUS_L + " or " + OUTPUT_FORMAT_NEXUS
338                 + " for Nexus" );
339         System.out.println( "  -" + ID_NORM_OPTION + "         : to replace sequence names with short(er) identifiers" );
340         System.out.println();
341         System.out.println( "Example:" );
342         System.out.println();
343         System.out.println( " " + PRG_NAME + " -s -o=p my_seqs.fasta" );
344         System.out.println();
345     }
346 }