1 package compbio.data.sequence;
\r
4 import java.io.FileNotFoundException;
\r
5 import java.util.Iterator;
\r
6 import java.util.Scanner;
\r
8 import compbio.util.Util;
\r
11 * Reads files with FASTA formatted sequences. All the information in the FASTA
\r
12 * header is preserved including trailing white spaces. All the white spaces are
\r
13 * removed from the sequence.
\r
15 * Examples of the correct input:
\r
20 * GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD
\r
23 * CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM
\r
24 * FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG
\r
25 * FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH
\r
26 * DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC
\r
29 * CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD
\r
30 * EASINM AQQWRSLPPSRIMKLNG HGCDCMHSHMEAD DTKQSGIKGTFWNG HDAQWLCRWG
\r
31 * EFITEA WWGRWGAITFFHAH ENKNEIQECSDQNLKE SRTTCEIID TCHLFTRHLDGW
\r
32 * RCEKCQANATHMTW ACTKSCAEQW FCAKELMMN
\r
33 * W KQMGWRCKIFRKLFRDNCWID FELPWWPICFCCKGLSTKSHSAHDGDQCRRW WPDCARDWLGPGIRGEF
\r
34 * FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI
\r
36 * > 12 d t y wi k jbke
\r
40 * HSKCTEPHCGNSHQMLHRDP
\r
41 * CCDQCQSWEAENWCASMRKAILF
\r
45 * @author Peter Troshin
\r
46 * @version 1.0 April 2011
\r
49 public class FastaReader implements Iterator<FastaSequence> {
\r
51 private final Scanner input;
\r
54 * Header data can contain non-ASCII symbols and read in UTF8
\r
57 * the file containing the list of FASTA formatted sequences to
\r
59 * @throws FileNotFoundException
\r
60 * if the input file is not found
\r
62 public FastaReader(final String input) throws FileNotFoundException {
\r
63 this.input = new Scanner(new File(input), "UTF8");
\r
64 this.input.useDelimiter("\\s*>");
\r
71 public boolean hasNext() {
\r
72 return input.hasNext();
\r
76 * Reads the next FastaSequence from the input
\r
78 * @throws AssertionError
\r
79 * if the header or the sequence is missing
\r
82 public FastaSequence next() {
\r
83 return FastaReader.toFastaSequence(input.next());
\r
90 public void remove() {
\r
91 throw new UnsupportedOperationException();
\r
94 private static FastaSequence toFastaSequence(final String singleFastaEntry) {
\r
95 final Scanner sc = new Scanner(singleFastaEntry);
\r
96 // Use new line delimiter
\r
97 sc.useDelimiter("\n");
\r
98 if (!sc.hasNext()) {
\r
99 throw new AssertionError(
\r
100 "The FASTA sequence must contain the header information"
\r
101 + " separated by the new line from the sequence. Given sequence does not appear to "
\r
102 + "contain the header! Given data:\n "
\r
103 + singleFastaEntry);
\r
105 String header = sc.next();
\r
106 // Get rid of the new line chars (should cover common cases)
\r
107 header = header.replaceAll("\n", "").replaceAll("\r", "");
\r
109 sc.useDelimiter("\\s*");
\r
110 final StringBuilder sb = new StringBuilder();
\r
111 while (sc.hasNext()) {
\r
112 sb.append(sc.next().trim());
\r
114 final String sequence = sb.toString();
\r
115 if (Util.isEmpty(sequence)) {
\r
116 throw new AssertionError(
\r
117 "Empty sequences are not allowed! Please make sure the "
\r
118 + " data is in the FASTA format! Given data:\n "
\r
119 + singleFastaEntry);
\r
121 return new FastaSequence(header, sequence);
\r