/* Copyright (c) 2011 Peter Troshin
*
* JAva Bioinformatics Analysis Web Services (JABAWS) @version: 2.0
*
* This library is free software; you can redistribute it and/or modify it under the terms of the
* Apache License version 2 as published by the Apache Software Foundation
*
* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache
* License for more details.
*
* A copy of the license is in apache_license.txt. It is also available here:
* @see: http://www.apache.org/licenses/LICENSE-2.0.txt
*
* Any republication or derived work distributed in source code form
* must include this copyright and license notice.
*/
package compbio.data.sequence;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.Scanner;
import java.util.regex.MatchResult;
import javax.vecmath.MismatchedSizeException;
import compbio.util.Util;
/**
* Reads files with FASTA formatted sequences. All the information in the FASTA
* header is preserved including trailing white spaces. All the white spaces are
* removed from the sequence.
*
* Examples of the correct input:
*
*
*
* >zedpshvyzg
* GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD
*
* >xovkactesa
* CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM
* FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG
* FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH
* DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC
*
* >ntazzewyvv
* CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD
* EASINM AQQWRSLPPSRIMKLNG HGCDCMHSHMEAD DTKQSGIKGTFWNG HDAQWLCRWG
* EFITEA WWGRWGAITFFHAH ENKNEIQECSDQNLKE SRTTCEIID TCHLFTRHLDGW
* RCEKCQANATHMTW ACTKSCAEQW FCAKELMMN
* W KQMGWRCKIFRKLFRDNCWID FELPWWPICFCCKGLSTKSHSAHDGDQCRRW WPDCARDWLGPGIRGEF
* FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI
*
* > 12 d t y wi k jbke
* KLSHHDCD
* N
* H
* HSKCTEPHCGNSHQMLHRDP
* CCDQCQSWEAENWCASMRKAILF
*
*
*
* @author Peter Troshin
* @version 1.0 April 2011
*
*/
public class FastaReader implements Iterator<FastaSequence> {

    /**
     * Delimiter for the scanner: every FASTA record starts with this symbol.
     */
    private static final String DELIM = ">";

    /**
     * Source of the FASTA records, tokenised on {@link #DELIM}.
     */
    private final Scanner input;

    /**
     * JVM shutdown hook that closes {@link #input}; {@code null} when the
     * reader was created from a stream. Kept so that {@link #close()} can
     * deregister it, otherwise every reader ever created would stay reachable
     * (through the hook) until the JVM exits.
     */
    private final Thread shutdownHook;

    /**
     * Opens the given file for reading. Header data can contain non-ASCII
     * symbols, so the file is read as UTF8.
     * <p>
     * A JVM shutdown hook is registered to close the file in case
     * {@link #close()} is never called.
     *
     * @param inputFile
     *            the file containing the list of FASTA formatted sequences to
     *            read from
     * @throws FileNotFoundException
     *             if the input file is not found
     */
    public FastaReader(final String inputFile) throws FileNotFoundException {
        input = new Scanner(new File(inputFile), "UTF8");
        input.useDelimiter(DELIM);
        shutdownHook = new Thread() {
            @Override
            public void run() {
                input.close();
            }
        };
        Runtime.getRuntime().addShutdownHook(shutdownHook);
    }

    /**
     * Reads FASTA records from the given stream. The bytes are decoded with
     * the platform default charset.
     * <p>
     * This class will not close the incoming stream on its own (no shutdown
     * hook is registered), so the client should do so. Note that an explicit
     * call to {@link #close()} closes the scanner and, with it, the stream.
     *
     * @param inputStream
     *            the stream with FASTA formatted sequences
     * @throws FileNotFoundException
     *             never thrown by this implementation; declared for
     *             compatibility with existing callers
     */
    public FastaReader(final InputStream inputStream)
            throws FileNotFoundException {
        input = new Scanner(inputStream);
        input.useDelimiter(DELIM);
        shutdownHook = null;
    }

    /**
     * {@inheritDoc}
     *
     * @throws IllegalStateException
     *             if the close method was called on this instance
     */
    @Override
    public boolean hasNext() {
        return input.hasNext();
    }

    /**
     * Reads the next FastaSequence from the input
     *
     * @return the next record parsed into a FastaSequence
     * @throws AssertionError
     *             if the header or the sequence is missing
     * @throws IllegalStateException
     *             if the close method was called on this instance
     * @throws java.util.NoSuchElementException
     *             if there were no more FastaSequence's.
     */
    @Override
    public FastaSequence next() {
        final StringBuilder entry = new StringBuilder(input.next());
        // The scanner also splits on any '>' found inside a header line. A
        // token without a line break is therefore only a part of the header:
        // put the delimiter back and keep appending tokens until the end of
        // the header line (or the end of the input) is reached.
        while (entry.indexOf("\n") < 0 && input.hasNext()) {
            entry.append(DELIM).append(input.next());
        }
        return toFastaSequence(entry.toString());
    }

    /**
     * Not implemented
     *
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }

    /**
     * Call this method to close the connection to the input if you want to
     * free up the resources. For a reader created from a file name the
     * connection will be closed on the JVM shutdown if this method was not
     * called explicitly. No further reading on this instance of the
     * FastaReader will be possible after calling this method.
     */
    public void close() {
        input.close();
        if (shutdownHook != null) {
            try {
                // The hook has nothing left to do; removing it lets this
                // reader be garbage collected before the JVM exits.
                Runtime.getRuntime().removeShutdownHook(shutdownHook);
            } catch (IllegalStateException ignored) {
                // The JVM is already shutting down, hooks can no longer be
                // removed. The hook only closes the (already closed) scanner.
            }
        }
    }

    /**
     * Splits a single FASTA record (without the leading '>') into the header
     * and the sequence and wraps them into a FastaSequence.
     *
     * @param singleFastaEntry
     *            the header line followed by a line break and the sequence
     * @return the FastaSequence built from the header and the sequence text
     * @throws AssertionError
     *             if the record has no line break separating the header from
     *             the sequence, or if the sequence part is empty
     */
    private static FastaSequence toFastaSequence(final String singleFastaEntry) {
        assert !Util.isEmpty(singleFastaEntry) : "Empty String where FASTA sequence is expected!";
        final int nlineidx = singleFastaEntry.indexOf('\n');
        if (nlineidx < 0) {
            throw new AssertionError(
                    "The FASTA sequence must contain the header information"
                            + " separated by the new line from the sequence. Given sequence does not appear to "
                            + "contain the header! Given data:\n "
                            + singleFastaEntry);
        }
        // Get rid of the carriage returns left by Windows style line ends
        // (should cover common cases)
        final String header = singleFastaEntry.substring(0, nlineidx).replace("\r", "");
        // The sequence part still starts with the line break itself.
        // NOTE(review): presumably FastaSequence strips the white space from
        // the sequence - confirm against its constructor.
        final String sequence = singleFastaEntry.substring(nlineidx);
        if (Util.isEmpty(sequence)) {
            throw new AssertionError(
                    "Empty sequences are not allowed! Please make sure the "
                            + " data is in the FASTA format! Given data:\n "
                            + singleFastaEntry);
        }
        return new FastaSequence(header, sequence);
    }
}