1 package compbio.cassandra;
4 import java.io.FileNotFoundException;
5 import java.io.InputStream;
6 import java.util.Iterator;
7 import java.util.Scanner;
9 //import compbio.util.Util;
12 * Reads files with FASTA formatted sequences. All the information in the FASTA
13 * header is preserved including trailing white spaces. All the white spaces are
14 * removed from the sequence.
16 * Examples of the correct input:
21 * GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD
24 * CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM
25 * FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG
26 * FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH
27 * DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC
30 * CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD
31 * EASINM AQQWRSLPPSRIMKLNG HGCDCMHSHMEAD DTKQSGIKGTFWNG HDAQWLCRWG
32 * EFITEA WWGRWGAITFFHAH ENKNEIQECSDQNLKE SRTTCEIID TCHLFTRHLDGW
33 * RCEKCQANATHMTW ACTKSCAEQW FCAKELMMN
34 * W KQMGWRCKIFRKLFRDNCWID FELPWWPICFCCKGLSTKSHSAHDGDQCRRW WPDCARDWLGPGIRGEF
35 * FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI
37 * > 12 d t y wi k jbke
41 * HSKCTEPHCGNSHQMLHRDP
42 * CCDQCQSWEAENWCASMRKAILF
46 * @author Peter Troshin
47 * @version 1.0 April 2011
50 public class FastaReader implements Iterator<FastaSequence> {
52 private final Scanner input;
54 * Delimiter for the scanner
56 private final String DELIM = ">";
59 * Header data can contain non-ASCII symbols and is read as UTF8
62 * the file containing the list of FASTA formatted sequences to
64 * @throws FileNotFoundException
65 * if the input file is not found
66 * @throws IllegalStateException
67 * if the close method was called on this instance
70 public FastaReader(final String inputFile) throws FileNotFoundException {
71 input = new Scanner(new File(inputFile), "UTF8");
72 input.useDelimiter(DELIM);
73 Runtime.getRuntime().addShutdownHook(new Thread() {
/**
 * Creates a reader over an already opened stream of FASTA formatted
 * sequences. The stream is decoded as UTF8, the same encoding the
 * file based constructor uses, so header data with non-ASCII symbols is
 * read the same way whatever the platform default charset is.
 * <p>
 * This class will not close the incoming stream! So the client should do
 * so.
 *
 * @param inputStream
 *            the stream with the FASTA formatted sequences to read from
 * @throws FileNotFoundException
 *             never thrown by this constructor; the declaration is kept
 *             so that existing callers keep compiling
 */
public FastaReader(final InputStream inputStream)
        throws FileNotFoundException {
    // Pin the charset explicitly: Scanner(InputStream) alone falls back to
    // the platform default encoding, which differs between machines.
    input = new Scanner(inputStream, "UTF8");
    input.useDelimiter(DELIM);
}
100 * @throws IllegalStateException
101 * if the close method was called on this instance
104 public boolean hasNext() {
105 return input.hasNext();
109 * Reads the next FastaSequence from the input
111 * @throws AssertionError
112 * if the header or the sequence is missing
113 * @throws IllegalStateException
114 * if the close method was called on this instance
115 * @throws MismatchException
116 * - if there were no more FastaSequence's.
119 public FastaSequence next() {
120 String fastaHeader = input.next();
121 while (fastaHeader.indexOf("\n") < 0 && input.hasNext()) {
122 fastaHeader = fastaHeader.concat(">");
123 fastaHeader = fastaHeader.concat(input.next());
125 return FastaReader.toFastaSequence(fastaHeader);
132 public void remove() {
133 throw new UnsupportedOperationException();
137 * Call this method to close the connection to the input file if you want to
138 * free up the resources. The connection will be closed on the JVM shutdown
139 * if this method was not called explicitly. No further reading on this
140 * instance of the FastaReader will be possible after calling this method.
142 public void close() {
146 private static FastaSequence toFastaSequence(final String singleFastaEntry) {
148 // assert !Util.isEmpty(singleFastaEntry) :
149 // "Empty String where FASTA sequence is expected!";
151 int nlineidx = singleFastaEntry.indexOf("\n");
153 throw new AssertionError(
154 "The FASTA sequence must contain the header information"
155 + " separated by the new line from the sequence. Given sequence does not appear to "
156 + "contain the header! Given data:\n "
159 String header = singleFastaEntry.substring(0, nlineidx);
161 // Get rid of the new line chars (should cover common cases)
162 header = header.replaceAll("\r", "");
164 String sequence = singleFastaEntry.substring(nlineidx);
167 * if (Util.isEmpty(sequence)) { throw new AssertionError(
168 * "Empty sequences are not allowed! Please make sure the " +
169 * " data is in the FASTA format! Given data:\n " + singleFastaEntry); }
171 return new FastaSequence(header, sequence);