2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 // "java -Xmx1024m -cp path\to\forester.jar org.forester.application.fasta_split
28 package org.forester.application;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.forester.io.parsers.FastaParser;
import org.forester.io.writers.SequenceWriter;
import org.forester.io.writers.SequenceWriter.SEQ_FORMAT;
import org.forester.sequence.MolecularSequence;
import org.forester.util.CommandLineArguments;
import org.forester.util.ForesterUtil;
47 public final class fasta_split {
49 final static private String PRG_NAME = "fasta_split";
50 final static private String PRG_VERSION = "1.01";
51 final static private String PRG_DATE = "170718";
53 public static void main( final String args[] ) {
54 ForesterUtil.printProgramInformation( fasta_split.PRG_NAME, fasta_split.PRG_VERSION, fasta_split.PRG_DATE );
56 if ( ( args.length != 3 ) ) {
57 fasta_split.argumentsError();
59 CommandLineArguments cla = null;
61 cla = new CommandLineArguments( args );
63 catch ( final Exception e ) {
64 ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
66 final String pattern_str = cla.getName( 0 );
67 final File infile = cla.getFile( 1 );
68 final File outdir = cla.getFile( 2 );
71 pa = Pattern.compile( pattern_str );
73 catch ( final Exception ex ) {
74 ForesterUtil.fatalError( PRG_NAME, ex.getMessage() );
76 final String error = ForesterUtil.isReadableFile( infile );
77 if ( !ForesterUtil.isEmpty( error ) ) {
78 ForesterUtil.fatalError( PRG_NAME, error );
80 if ( !outdir.exists() ) {
81 new File( outdir.toString() ).mkdir();
83 if ( !outdir.isDirectory() ) {
84 ForesterUtil.fatalError( PRG_NAME, outdir + " is not a directory" );
86 List<MolecularSequence> seqs = null;
88 seqs = FastaParser.parse( new FileInputStream( infile ) );
90 catch ( final IOException e ) {
91 ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
93 if ( ( seqs == null ) || seqs.isEmpty() ) {
94 ForesterUtil.fatalError( PRG_NAME, infile + " appears empty" );
96 System.out.println( "Read " + seqs.size() + " sequences" );
97 final Map<String, List<MolecularSequence>> output = new HashMap<>();
98 for( final MolecularSequence seq : seqs ) {
99 final Matcher m = pa.matcher( seq.getIdentifier() );
101 final String key = m.group( 1 );
102 if ( !output.containsKey( key ) ) {
103 output.put( key, new ArrayList<MolecularSequence>() );
105 output.get( key ).add( seq );
108 System.out.println( "warning: " + pattern_str + " not found in sequence \"" + seq.getIdentifier()
110 final String key = "unknown";
111 if ( !output.containsKey( key ) ) {
112 output.put( key, new ArrayList<MolecularSequence>() );
114 output.get( key ).add( seq );
118 int seqs_written = 0;
119 for( final Map.Entry<String, List<MolecularSequence>> entry : output.entrySet() ) {
120 String s = entry.getKey().trim();
121 s = s.replaceAll( "[\\./\\*\\s]+", "_" );
122 s = s.replaceAll( "\\(", "~" );
123 s = s.replaceAll( "\\)", "~" );
124 final File of = new File( outdir.getAbsolutePath().toString() + "/" + s + ".fasta" );
126 ForesterUtil.fatalError( PRG_NAME, of + " already exists" );
128 System.out.println( ++c + ": writing " + of + " [" + entry.getValue().size() + " seqs]" );
131 SequenceWriter.writeSeqs( entry.getValue(), of, SEQ_FORMAT.FASTA, 60 );
133 catch ( final IOException e ) {
134 ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
136 seqs_written += entry.getValue().size();
138 System.out.println( "Wrote " + seqs_written + " sequences" );
141 private static void argumentsError() {
142 System.out.println( PRG_NAME + " <pattern> <infile> <outdir>" );
143 System.out.println();
144 System.out.println( "Examples: " );
145 System.out.println( " " + PRG_NAME + " \"v-germ=(\\S+)\" tt.fasta outdir" );
146 System.out.println( " " + PRG_NAME + " \"(\\S+?)\\|\" seqs.fasta outdir" );
147 System.out.println( " " + PRG_NAME + " \"OS=(.+?)[A-Z]{2}=\" seqs.fasta outdir" );
148 System.out.println();