JWS-29 patch fixes bug - weird workaround since "^\\s*>" caused scanner to advance...
[jabaws.git] / datamodel / compbio / data / sequence / FastaReader.java
1 /* Copyright (c) 2011 Peter Troshin\r
2  *  \r
3  *  JAva Bioinformatics Analysis Web Services (JABAWS) @version: 2.0\r
4  * \r
5  *  This library is free software; you can redistribute it and/or modify it under the terms of the\r
6  *  Apache License version 2 as published by the Apache Software Foundation\r
7  * \r
8  *  This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without\r
9  *  even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache \r
10  *  License for more details.\r
11  * \r
12  *  A copy of the license is in apache_license.txt. It is also available here:\r
13  * @see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
14  * \r
15  * Any republication or derived work distributed in source code form\r
16  * must include this copyright and license notice.\r
17  */\r
18 package compbio.data.sequence;\r
19 \r
20 import java.io.File;\r
21 import java.io.FileNotFoundException;\r
22 import java.io.InputStream;\r
23 import java.util.Iterator;\r
24 import java.util.Scanner;\r
25 import java.util.regex.MatchResult;\r
26 \r
27 import javax.vecmath.MismatchedSizeException;\r
28 \r
29 import compbio.util.Util;\r
30 \r
31 /**\r
32  * Reads files with FASTA formatted sequences. All the information in the FASTA\r
33  * header is preserved including trailing white spaces. All the white spaces are\r
34  * removed from the sequence.\r
35  * \r
36  * Examples of the correct input:\r
37  * \r
38  * <pre>\r
39  * \r
40  * >zedpshvyzg\r
41  * GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD\r
42  * \r
43  * >xovkactesa\r
44  * CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM\r
45  * FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG\r
46  * FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH\r
47  * DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC\r
48  * \r
49  * >ntazzewyvv\r
50  * CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD\r
51  * EASINM       AQQWRSLPPSRIMKLNG       HGCDCMHSHMEAD   DTKQSGIKGTFWNG  HDAQWLCRWG      \r
52  * EFITEA       WWGRWGAITFFHAH  ENKNEIQECSDQNLKE        SRTTCEIID   TCHLFTRHLDGW \r
53  *   RCEKCQANATHMTW ACTKSCAEQW  FCAKELMMN    \r
54  *   W        KQMGWRCKIFRKLFRDNCWID  FELPWWPICFCCKGLSTKSHSAHDGDQCRRW    WPDCARDWLGPGIRGEF   \r
55  *   FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI\r
56  * \r
57  *    > 12 d t y wi             k       jbke    \r
58  *   KLSHHDCD\r
59  *    N\r
60  *     H\r
61  *     HSKCTEPHCGNSHQMLHRDP\r
62  *     CCDQCQSWEAENWCASMRKAILF\r
63  * \r
64  * </pre>\r
65  * \r
66  * @author Peter Troshin\r
67  * @version 1.0 April 2011\r
68  * \r
69  */\r
70 public class FastaReader implements Iterator<FastaSequence> {\r
71 \r
72         private final Scanner input;\r
73         /**\r
74          * Delimiter for the scanner\r
75          */\r
76         private final String DELIM=">";\r
77         /**\r
78          * Header data can contain non-ASCII symbols and read in UTF8\r
79          * \r
80          * @param inputFile\r
81          *            the file containing the list of FASTA formatted sequences to\r
82          *            read from\r
83          * @throws FileNotFoundException\r
84          *             if the input file is not found\r
85          * @throws IllegalStateException\r
86          *             if the close method was called on this instance\r
87          * \r
88          */\r
89         public FastaReader(final String inputFile) throws FileNotFoundException {\r
90                 input = new Scanner(new File(inputFile), "UTF8");\r
91                 input.useDelimiter(DELIM);\r
92                 Runtime.getRuntime().addShutdownHook(new Thread() {\r
93 \r
94                         @Override\r
95                         public void run() {\r
96                                 if (input != null) {\r
97                                         input.close();\r
98                                 }\r
99                         }\r
100                 });\r
101         }\r
102 \r
103         /**\r
104          * This class will not close the incoming stream! So the client should do\r
105          * so.\r
106          * \r
107          * @param inputStream\r
108          * @throws FileNotFoundException\r
109          */\r
110         public FastaReader(final InputStream inputStream)\r
111                         throws FileNotFoundException {\r
112                 input = new Scanner(inputStream);\r
113                 input.useDelimiter(DELIM);\r
114         }\r
115         /**\r
116          * {@inheritDoc}\r
117          * \r
118          * @throws IllegalStateException\r
119          *             if the close method was called on this instance\r
120          */\r
121         @Override\r
122         public boolean hasNext() {\r
123                 return input.hasNext();\r
124         }\r
125 \r
126         /**\r
127          * Reads the next FastaSequence from the input\r
128          * \r
129          * @throws AssertionError\r
130          *             if the header or the sequence is missing\r
131          * @throws IllegalStateException\r
132          *             if the close method was called on this instance\r
133          *             @throws MismatchException - if there were no more FastaSequence's.\r
134          */\r
135         @Override\r
136         public FastaSequence next() {\r
137                 String fastaHeader=input.next();\r
138                 while (fastaHeader.indexOf("\n")<0 && input.hasNext())\r
139                 {\r
140                         fastaHeader = fastaHeader.concat(">");\r
141                         fastaHeader = fastaHeader.concat(input.next());\r
142                 }\r
143                 return FastaReader.toFastaSequence(fastaHeader);\r
144         }\r
145 \r
146         /**\r
147          * Not implemented\r
148          */\r
149         @Override\r
150         public void remove() {\r
151                 throw new UnsupportedOperationException();\r
152         }\r
153 \r
154         /**\r
155          * Call this method to close the connection to the input file if you want to\r
156          * free up the resources. The connection will be closed on the JVM shutdown\r
157          * if this method was not called explicitly. No further reading on this\r
158          * instance of the FastaReader will be possible after calling this method.\r
159          */\r
160         public void close() {\r
161                 input.close();\r
162         }\r
163 \r
164         private static FastaSequence toFastaSequence(final String singleFastaEntry) {\r
165 \r
166                 assert !Util.isEmpty(singleFastaEntry) : "Empty String where FASTA sequence is expected!";\r
167 \r
168                 int nlineidx = singleFastaEntry.indexOf("\n");\r
169                 if (nlineidx < 0) {\r
170                         throw new AssertionError(\r
171                                         "The FASTA sequence must contain the header information"\r
172                                                         + " separated by the new line from the sequence. Given sequence does not appear to "\r
173                                                         + "contain the header! Given data:\n "\r
174                                                         + singleFastaEntry);\r
175                 }\r
176                 String header = singleFastaEntry.substring(0, nlineidx);\r
177 \r
178                 // Get rid of the new line chars (should cover common cases)\r
179                 header = header.replaceAll("\r", "");\r
180 \r
181                 String sequence = singleFastaEntry.substring(nlineidx);\r
182 \r
183                 if (Util.isEmpty(sequence)) {\r
184                         throw new AssertionError(\r
185                                         "Empty sequences are not allowed! Please make sure the "\r
186                                                         + " data is in the FASTA format! Given data:\n "\r
187                                                         + singleFastaEntry);\r
188                 }\r
189                 return new FastaSequence(header, sequence);\r
190         }\r
191 }\r