First working code
[proteocache.git] / datadb / compbio / cassandra / FastaReader.java
1 package compbio.cassandra;
2
3 import java.io.File;
4 import java.io.FileNotFoundException;
5 import java.io.InputStream;
6 import java.util.Iterator;
7 import java.util.Scanner;
8
9 //import compbio.util.Util;
10
11 /**
12  * Reads files with FASTA formatted sequences. All the information in the FASTA
13  * header is preserved including trailing white spaces. All the white spaces are
14  * removed from the sequence.
15  * 
16  * Examples of the correct input:
17  * 
18  * <pre>
19  * 
20  * >zedpshvyzg
21  * GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD
22  * 
23  * >xovkactesa
24  * CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM
25  * FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG
26  * FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH
27  * DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC
28  * 
29  * >ntazzewyvv
30  * CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD
31  * EASINM       AQQWRSLPPSRIMKLNG       HGCDCMHSHMEAD   DTKQSGIKGTFWNG  HDAQWLCRWG      
32  * EFITEA       WWGRWGAITFFHAH  ENKNEIQECSDQNLKE        SRTTCEIID   TCHLFTRHLDGW 
33  *   RCEKCQANATHMTW ACTKSCAEQW  FCAKELMMN    
34  *   W        KQMGWRCKIFRKLFRDNCWID  FELPWWPICFCCKGLSTKSHSAHDGDQCRRW    WPDCARDWLGPGIRGEF   
35  *   FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI
36  * 
37  *    > 12 d t y wi             k       jbke    
38  *   KLSHHDCD
39  *    N
40  *     H
41  *     HSKCTEPHCGNSHQMLHRDP
42  *     CCDQCQSWEAENWCASMRKAILF
43  * 
44  * </pre>
45  * 
46  * @author Peter Troshin
47  * @version 1.0 April 2011
48  * 
49  */
50 public class FastaReader implements Iterator<FastaSequence> {
51
52         private final Scanner input;
53         /**
54          * Delimiter for the scanner
55          */
56         private final String DELIM = ">";
57
58         /**
59          * Header data can contain non-ASCII symbols and read in UTF8
60          * 
61          * @param inputFile
62          *            the file containing the list of FASTA formatted sequences to
63          *            read from
64          * @throws FileNotFoundException
65          *             if the input file is not found
66          * @throws IllegalStateException
67          *             if the close method was called on this instance
68          * 
69          */
70         public FastaReader(final String inputFile) throws FileNotFoundException {
71                 input = new Scanner(new File(inputFile), "UTF8");
72                 input.useDelimiter(DELIM);
73                 Runtime.getRuntime().addShutdownHook(new Thread() {
74
75                         @Override
76                         public void run() {
77                                 if (input != null) {
78                                         input.close();
79                                 }
80                         }
81                 });
82         }
83
84         /**
85          * This class will not close the incoming stream! So the client should do
86          * so.
87          * 
88          * @param inputStream
89          * @throws FileNotFoundException
90          */
91         public FastaReader(final InputStream inputStream)
92                         throws FileNotFoundException {
93                 input = new Scanner(inputStream);
94                 input.useDelimiter(DELIM);
95         }
96
97         /**
98          * {@inheritDoc}
99          * 
100          * @throws IllegalStateException
101          *             if the close method was called on this instance
102          */
103         @Override
104         public boolean hasNext() {
105                 return input.hasNext();
106         }
107
108         /**
109          * Reads the next FastaSequence from the input
110          * 
111          * @throws AssertionError
112          *             if the header or the sequence is missing
113          * @throws IllegalStateException
114          *             if the close method was called on this instance
115          * @throws MismatchException
116          *             - if there were no more FastaSequence's.
117          */
118         @Override
119         public FastaSequence next() {
120                 String fastaHeader = input.next();
121                 while (fastaHeader.indexOf("\n") < 0 && input.hasNext()) {
122                         fastaHeader = fastaHeader.concat(">");
123                         fastaHeader = fastaHeader.concat(input.next());
124                 }
125                 return FastaReader.toFastaSequence(fastaHeader);
126         }
127
128         /**
129          * Not implemented
130          */
131         @Override
132         public void remove() {
133                 throw new UnsupportedOperationException();
134         }
135
136         /**
137          * Call this method to close the connection to the input file if you want to
138          * free up the resources. The connection will be closed on the JVM shutdown
139          * if this method was not called explicitly. No further reading on this
140          * instance of the FastaReader will be possible after calling this method.
141          */
142         public void close() {
143                 input.close();
144         }
145
146         private static FastaSequence toFastaSequence(final String singleFastaEntry) {
147
148                 // assert !Util.isEmpty(singleFastaEntry) :
149                 // "Empty String where FASTA sequence is expected!";
150
151                 int nlineidx = singleFastaEntry.indexOf("\n");
152                 if (nlineidx < 0) {
153                         throw new AssertionError(
154                                         "The FASTA sequence must contain the header information"
155                                                         + " separated by the new line from the sequence. Given sequence does not appear to "
156                                                         + "contain the header! Given data:\n "
157                                                         + singleFastaEntry);
158                 }
159                 String header = singleFastaEntry.substring(0, nlineidx);
160
161                 // Get rid of the new line chars (should cover common cases)
162                 header = header.replaceAll("\r", "");
163
164                 String sequence = singleFastaEntry.substring(nlineidx);
165
166                 /*
167                  * if (Util.isEmpty(sequence)) { throw new AssertionError(
168                  * "Empty sequences are not allowed! Please make sure the " +
169                  * " data is in the FASTA format! Given data:\n " + singleFastaEntry); }
170                  */
171                 return new FastaSequence(header, sequence);
172         }
173 }