From e8890012b315b30af8bf6c90db6e800b7d046147 Mon Sep 17 00:00:00 2001
From: "cmzmasek@gmail.com"
Date: Tue, 3 Dec 2013 00:46:27 +0000
Subject: [PATCH] inprogress

---
 .../src/org/forester/application/check_fasta.java  |  176 ++++++++++++++++++++
 .../org/forester/io/writers/SequenceWriter.java    |   21 ++
 .../src/org/forester/sequence/BasicSequence.java   |   11 +-
 3 files changed, 207 insertions(+), 1 deletion(-)
 create mode 100644 forester/java/src/org/forester/application/check_fasta.java

diff --git a/forester/java/src/org/forester/application/check_fasta.java b/forester/java/src/org/forester/application/check_fasta.java
new file mode 100644
index 0000000..2bac934
--- /dev/null
+++ b/forester/java/src/org/forester/application/check_fasta.java
@@ -0,0 +1,176 @@
+// $Id:
+// FORESTER -- software libraries and applications
+// for evolutionary biology research and applications.
+//
+// Copyright (C) 2008-2009 Christian M. Zmasek
+// Copyright (C) 2008-2009 Burnham Institute for Medical Research
+// All rights reserved
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+//
+// Contact: phylosoft @ gmail . com
+// WWW: https://sites.google.com/site/cmzmasek/home/software/forester
+
+package org.forester.application;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.forester.io.parsers.FastaParser;
+import org.forester.io.writers.SequenceWriter;
+import org.forester.io.writers.SequenceWriter.SEQ_FORMAT;
+import org.forester.sequence.BasicSequence;
+import org.forester.sequence.Sequence;
+import org.forester.util.CommandLineArguments;
+import org.forester.util.ForesterUtil;
+
+/**
+ * Command line tool which copies the ".fasta" and ".fas" files of an input
+ * directory into an output directory, renaming sequences whose identifier
+ * occurs more than once within a file.
+ */
+public final class check_fasta {
+
+    final static private String PRG_NAME    = "check_fasta";
+    final static private String PRG_VERSION = "1.00";
+    final static private String PRG_DATE    = "131202";
+
+    /**
+     * Entry point; expects exactly two arguments: the input directory and the
+     * output directory.
+     *
+     * @param args command line arguments
+     */
+    public static void main( final String args[] ) {
+        ForesterUtil.printProgramInformation( check_fasta.PRG_NAME, check_fasta.PRG_VERSION, check_fasta.PRG_DATE );
+        System.out.println();
+        if ( ( args.length != 2 ) ) {
+            check_fasta.argumentsError();
+        }
+        CommandLineArguments cla = null;
+        try {
+            cla = new CommandLineArguments( args );
+        }
+        catch ( final Exception e ) {
+            ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
+        }
+        final File indir = cla.getFile( 0 );
+        final File outdir = cla.getFile( 1 );
+        if ( !indir.isDirectory() ) {
+            ForesterUtil.fatalError( PRG_NAME, indir + " is not a directory" );
+        }
+        if ( !outdir.isDirectory() ) {
+            ForesterUtil.fatalError( PRG_NAME, outdir + " is not a directory" );
+        }
+        final File[] list_of_files = indir.listFiles();
+        if ( list_of_files == null ) {
+            // File.listFiles() returns null if an I/O error occurs.
+            ForesterUtil.fatalError( PRG_NAME, "could not list files in " + indir );
+        }
+        final List<File> infiles = new ArrayList<File>();
+        for( final File file : list_of_files ) {
+            final String lc_name = file.getName().toLowerCase();
+            if ( file.isFile() && file.canRead() && ( lc_name.endsWith( ".fasta" ) || lc_name.endsWith( ".fas" ) ) ) {
+                infiles.add( file );
+            }
+        }
+        Collections.sort( infiles );
+        int c = 0;
+        for( final File infile : infiles ) {
+            System.out.println( ++c + "/" + infiles.size() + ": " + infile );
+            execute( outdir, infile );
+        }
+    }
+
+    /**
+     * Reads the sequences of infile, de-duplicates their identifiers, and writes
+     * them in FASTA format to a file of the same name in outdir. An output file
+     * which already exists is reported and left untouched.
+     *
+     * @param outdir directory to write to
+     * @param infile FASTA file to read
+     */
+    private static void execute( final File outdir, final File infile ) {
+        final File outfile = new File( outdir.getAbsoluteFile(), infile.getName() );
+        if ( outfile.exists() ) {
+            System.out.println( outfile + " already exists" );
+        }
+        else {
+            try {
+                final List<Sequence> seqs;
+                final FileInputStream in = new FileInputStream( infile );
+                try {
+                    seqs = FastaParser.parse( in );
+                }
+                finally {
+                    // Release the file handle even if parsing fails.
+                    in.close();
+                }
+                final Map<String, Short> names = new HashMap<String, Short>();
+                for( final Sequence seq : seqs ) {
+                    procSeq( infile.toString(), names, seq );
+                }
+                SequenceWriter.writeSeqs( seqs, outfile, SEQ_FORMAT.FASTA, 60 );
+            }
+            catch ( final IOException e ) {
+                ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
+            }
+        }
+    }
+
+    /**
+     * Makes the identifier of seq unique among the identifiers recorded in names.
+     * The first occurrence of an identifier is kept; a later occurrence gets "_"
+     * plus a counter appended, skipping suffixed names which are already taken.
+     *
+     * @param infile name of the input file, used for reporting only
+     * @param names  identifiers seen so far, mapped to the next suffix to try
+     * @param seq    sequence to check; renamed in place if it is a duplicate
+     */
+    private static void procSeq( final String infile, final Map<String, Short> names, final Sequence seq ) {
+        final String name = seq.getIdentifier();
+        if ( !names.containsKey( name ) ) {
+            names.put( name, ( short ) 1 );
+        }
+        else {
+            short i = names.get( name );
+            String new_name = name + "_" + i;
+            // The suffixed name may itself already be in use (e.g. "a", "a_1", "a").
+            while ( names.containsKey( new_name ) ) {
+                ++i;
+                new_name = name + "_" + i;
+            }
+            ( ( BasicSequence ) seq ).setIdentifier( new_name );
+            names.put( name, ( short ) ( i + 1 ) );
+            names.put( new_name, ( short ) 1 );
+            System.out.println( " " + infile + i + ": " + seq.getIdentifier() );
+        }
+    }
+
+    /**
+     * Prints the usage message and terminates the program with exit code -1.
+     */
+    private static void argumentsError() {
+        System.out.println( PRG_NAME + " <indir> <outdir>" );
+        System.out.println();
+        System.exit( -1 );
+    }
+}
diff --git a/forester/java/src/org/forester/io/writers/SequenceWriter.java b/forester/java/src/org/forester/io/writers/SequenceWriter.java
index 8e6ef95..ddcb378 100644
--- a/forester/java/src/org/forester/io/writers/SequenceWriter.java
+++ b/forester/java/src/org/forester/io/writers/SequenceWriter.java
@@ -1,6 +1,7 @@
 
 package org.forester.io.writers;
 
+import java.io.File;
 import java.io.IOException;
 import java.io.Writer;
 import java.util.List;
@@ -66,6 +67,26 @@ public class SequenceWriter {
         }
     }
 
+    /**
+     * Writes sequences to a file; the underlying writer is closed even if writing fails.
+     *
+     * @param seqs   the sequences to write
+     * @param file   the file to write to
+     * @param format the output format
+     * @param width  passed on unchanged to the Writer based writeSeqs
+     * @throws IOException if creating the writer or writing the sequences fails
+     */
+    public static void writeSeqs( final List<Sequence> seqs, final File file, final SEQ_FORMAT format, final int width )
+            throws IOException {
+        final Writer w = ForesterUtil.createBufferedWriter( file );
+        try {
+            SequenceWriter.writeSeqs( seqs, w, format, width );
+        }
+        finally {
+            w.close();
+        }
+    }
+
     public static void writeSeqs( final List<Sequence> seqs,
                                   final Writer writer,
                                   final SEQ_FORMAT format,
diff --git a/forester/java/src/org/forester/sequence/BasicSequence.java b/forester/java/src/org/forester/sequence/BasicSequence.java
index 4a4662a..f850205 100644
--- a/forester/java/src/org/forester/sequence/BasicSequence.java
+++ b/forester/java/src/org/forester/sequence/BasicSequence.java
@@ -31,7 +31,7 @@ import org.forester.util.ForesterUtil;
 public class BasicSequence implements Sequence {
 
     private final char[] _mol_sequence;
-    private final String _identifier;
+    private String _identifier;
     private final TYPE _type;
 
     private BasicSequence( final String identifier, final String mol_sequence, final TYPE type ) {
@@ -59,6 +59,15 @@ public class BasicSequence implements Sequence {
         _type = type;
     }
 
+    /**
+     * Replaces the identifier of this sequence.
+     *
+     * @param id the new identifier
+     */
+    public void setIdentifier( final String id ) {
+        _identifier = id;
+    }
+
     @Override
     public String getIdentifier() {
         return _identifier;
-- 
1.7.10.2