1 /* Copyright (c) 2011 Peter Troshin
\r
3 * JAva Bioinformatics Analysis Web Services (JABAWS) @version: 2.0
\r
5 * This library is free software; you can redistribute it and/or modify it under the terms of the
\r
6 * Apache License version 2 as published by the Apache Software Foundation
\r
8 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
\r
9 * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache
\r
10 * License for more details.
\r
12 * A copy of the license is in apache_license.txt. It is also available here:
\r
13 * @see: http://www.apache.org/licenses/LICENSE-2.0.txt
\r
15 * Any republication or derived work distributed in source code form
\r
16 * must include this copyright and license notice.
\r
18 package compbio.data.sequence;
\r
20 import java.io.File;
\r
21 import java.io.FileNotFoundException;
\r
22 import java.io.InputStream;
\r
23 import java.util.Iterator;
\r
24 import java.util.Scanner;
\r
25 import java.util.regex.MatchResult;
\r
27 import javax.vecmath.MismatchedSizeException;
\r
29 import compbio.util.Util;
\r
32 * Reads files with FASTA formatted sequences. All the information in the FASTA
\r
33 * header is preserved including trailing white spaces. All the white spaces are
\r
34 * removed from the sequence.
\r
36 * Examples of the correct input:
\r
41 * GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD
\r
44 * CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM
\r
45 * FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG
\r
46 * FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH
\r
47 * DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC
\r
50 * CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD
\r
51 * EASINM AQQWRSLPPSRIMKLNG HGCDCMHSHMEAD DTKQSGIKGTFWNG HDAQWLCRWG
\r
52 * EFITEA WWGRWGAITFFHAH ENKNEIQECSDQNLKE SRTTCEIID TCHLFTRHLDGW
\r
53 * RCEKCQANATHMTW ACTKSCAEQW FCAKELMMN
\r
54 * W KQMGWRCKIFRKLFRDNCWID FELPWWPICFCCKGLSTKSHSAHDGDQCRRW WPDCARDWLGPGIRGEF
\r
55 * FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI
\r
57 * > 12 d t y wi k jbke
\r
61 * HSKCTEPHCGNSHQMLHRDP
\r
62 * CCDQCQSWEAENWCASMRKAILF
\r
66 * @author Peter Troshin
\r
67 * @version 1.0 April 2011
\r
70 public class FastaReader implements Iterator<FastaSequence> {
\r
72 private final Scanner input;
\r
74 * Delimiter for the scanner
\r
76 private final String DELIM=">";
\r
78 * Header data can contain non-ASCII symbols and read in UTF8
\r
81 * the file containing the list of FASTA formatted sequences to
\r
83 * @throws FileNotFoundException
\r
84 * if the input file is not found
\r
85 * @throws IllegalStateException
\r
86 * if the close method was called on this instance
\r
89 public FastaReader(final String inputFile) throws FileNotFoundException {
\r
90 input = new Scanner(new File(inputFile), "UTF8");
\r
91 input.useDelimiter(DELIM);
\r
92 Runtime.getRuntime().addShutdownHook(new Thread() {
\r
96 if (input != null) {
\r
/**
 * Reads FASTA records from an already opened stream.
 * <p>
 * The stream is decoded as UTF8, the same encoding the file-based
 * constructor uses, so that non-ASCII header data is read identically on
 * every platform instead of depending on the platform default charset.
 * <p>
 * This constructor does not arrange for the incoming stream to be closed
 * (no shutdown hook is registered here), so the client should do so.
 *
 * @param inputStream
 *            the stream supplying FASTA formatted sequences
 * @throws FileNotFoundException
 *             never thrown by this constructor; the declaration is kept
 *             only so that existing callers which catch it keep compiling
 */
public FastaReader(final InputStream inputStream)
        throws FileNotFoundException {
    // Explicit charset: Scanner(InputStream) alone would fall back to the
    // platform default encoding.
    input = new Scanner(inputStream, "UTF8");
    input.useDelimiter(DELIM);
}
\r
118 * @throws IllegalStateException
\r
119 * if the close method was called on this instance
\r
122 public boolean hasNext() {
\r
123 return input.hasNext();
\r
127 * Reads the next FastaSequence from the input
\r
129 * @throws AssertionError
\r
130 * if the header or the sequence is missing
\r
131 * @throws IllegalStateException
\r
132 * if the close method was called on this instance
\r
133 * @throws MismatchException - if there were no more FastaSequence's.
\r
136 public FastaSequence next() {
\r
137 String fastaHeader=input.next();
\r
138 while (fastaHeader.indexOf("\n")<0 && input.hasNext())
\r
140 fastaHeader = fastaHeader.concat(">");
\r
141 fastaHeader = fastaHeader.concat(input.next());
\r
143 return FastaReader.toFastaSequence(fastaHeader);
\r
150 public void remove() {
\r
151 throw new UnsupportedOperationException();
\r
155 * Call this method to close the connection to the input file if you want to
\r
156 * free up the resources. The connection will be closed on the JVM shutdown
\r
157 * if this method was not called explicitly. No further reading on this
\r
158 * instance of the FastaReader will be possible after calling this method.
\r
160 public void close() {
\r
164 private static FastaSequence toFastaSequence(final String singleFastaEntry) {
\r
166 assert !Util.isEmpty(singleFastaEntry) : "Empty String where FASTA sequence is expected!";
\r
168 int nlineidx = singleFastaEntry.indexOf("\n");
\r
169 if (nlineidx < 0) {
\r
170 throw new AssertionError(
\r
171 "The FASTA sequence must contain the header information"
\r
172 + " separated by the new line from the sequence. Given sequence does not appear to "
\r
173 + "contain the header! Given data:\n "
\r
174 + singleFastaEntry);
\r
176 String header = singleFastaEntry.substring(0, nlineidx);
\r
178 // Get rid of the new line chars (should cover common cases)
\r
179 header = header.replaceAll("\r", "");
\r
181 String sequence = singleFastaEntry.substring(nlineidx);
\r
183 if (Util.isEmpty(sequence)) {
\r
184 throw new AssertionError(
\r
185 "Empty sequences are not allowed! Please make sure the "
\r
186 + " data is in the FASTA format! Given data:\n "
\r
187 + singleFastaEntry);
\r
189 return new FastaSequence(header, sequence);
\r