different corrections
[jabaws.git] / datamodel / compbio / data / sequence / FastaReader.java
1 /* Copyright (c) 2011 Peter Troshin\r
2  *  \r
3  *  JAva Bioinformatics Analysis Web Services (JABAWS) @version: 2.0\r
4  * \r
5  *  This library is free software; you can redistribute it and/or modify it under the terms of the\r
6  *  Apache License version 2 as published by the Apache Software Foundation\r
7  * \r
8  *  This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without\r
9  *  even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache \r
10  *  License for more details.\r
11  * \r
12  *  A copy of the license is in apache_license.txt. It is also available here:\r
13  * @see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
14  * \r
15  * Any republication or derived work distributed in source code form\r
16  * must include this copyright and license notice.\r
17  */\r
18 package compbio.data.sequence;\r
19 \r
20 import java.io.File;\r
21 import java.io.FileNotFoundException;\r
22 import java.io.InputStream;\r
23 import java.util.Iterator;\r
24 import java.util.Scanner;\r
25 import java.util.regex.MatchResult;\r
26 \r
27 import compbio.util.Util;\r
28 \r
29 /**\r
30  * Reads files with FASTA formatted sequences. All the information in the FASTA\r
31  * header is preserved including trailing white spaces. All the white spaces are\r
32  * removed from the sequence.\r
33  * \r
34  * Examples of the correct input:\r
35  * \r
36  * <pre>\r
37  * \r
38  * >zedpshvyzg\r
39  * GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD\r
40  * \r
41  * >xovkactesa\r
42  * CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM\r
43  * FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG\r
44  * FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH\r
45  * DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC\r
46  * \r
47  * >ntazzewyvv\r
48  * CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD\r
49  * EASINM       AQQWRSLPPSRIMKLNG       HGCDCMHSHMEAD   DTKQSGIKGTFWNG  HDAQWLCRWG      \r
50  * EFITEA       WWGRWGAITFFHAH  ENKNEIQECSDQNLKE        SRTTCEIID   TCHLFTRHLDGW \r
51  *   RCEKCQANATHMTW ACTKSCAEQW  FCAKELMMN    \r
52  *   W        KQMGWRCKIFRKLFRDNCWID  FELPWWPICFCCKGLSTKSHSAHDGDQCRRW    WPDCARDWLGPGIRGEF   \r
53  *   FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI\r
54  * \r
55  *    > 12 d t y wi             k       jbke    \r
56  *   KLSHHDCD\r
57  *    N\r
58  *     H\r
59  *     HSKCTEPHCGNSHQMLHRDP\r
60  *     CCDQCQSWEAENWCASMRKAILF\r
61  * \r
62  * </pre>\r
63  * \r
64  * @author Peter Troshin\r
65  * @version 1.0 April 2011\r
66  * \r
67  */\r
68 public class FastaReader implements Iterator<FastaSequence> {\r
69 \r
70         private final Scanner input;\r
71         /**\r
72          * Delimiter for the scanner\r
73          */\r
74         private final String DELIM=">";\r
75         /**\r
76          * Header data can contain non-ASCII symbols and read in UTF8\r
77          * \r
78          * @param inputFile\r
79          *            the file containing the list of FASTA formatted sequences to\r
80          *            read from\r
81          * @throws FileNotFoundException\r
82          *             if the input file is not found\r
83          * @throws IllegalStateException\r
84          *             if the close method was called on this instance\r
85          * \r
86          */\r
87         public FastaReader(final String inputFile) throws FileNotFoundException {\r
88                 input = new Scanner(new File(inputFile), "UTF8");\r
89                 input.useDelimiter(DELIM);\r
90                 Runtime.getRuntime().addShutdownHook(new Thread() {\r
91 \r
92                         @Override\r
93                         public void run() {\r
94                                 if (input != null) {\r
95                                         input.close();\r
96                                 }\r
97                         }\r
98                 });\r
99         }\r
100 \r
101         /**\r
102          * This class will not close the incoming stream! So the client should do\r
103          * so.\r
104          * \r
105          * @param inputStream\r
106          * @throws FileNotFoundException\r
107          */\r
108         public FastaReader(final InputStream inputStream)\r
109                         throws FileNotFoundException {\r
110                 input = new Scanner(inputStream);\r
111                 input.useDelimiter(DELIM);\r
112         }\r
113         /**\r
114          * {@inheritDoc}\r
115          * \r
116          * @throws IllegalStateException\r
117          *             if the close method was called on this instance\r
118          */\r
119         @Override\r
120         public boolean hasNext() {\r
121                 return input.hasNext();\r
122         }\r
123 \r
124         /**\r
125          * Reads the next FastaSequence from the input\r
126          * \r
127          * @throws AssertionError\r
128          *             if the header or the sequence is missing\r
129          * @throws IllegalStateException\r
130          *             if the close method was called on this instance\r
131          *             @throws MismatchException - if there were no more FastaSequence's.\r
132          */\r
133         @Override\r
134         public FastaSequence next() {\r
135                 String fastaHeader=input.next();\r
136                 while (fastaHeader.indexOf("\n")<0 && input.hasNext())\r
137                 {\r
138                         fastaHeader = fastaHeader.concat(">");\r
139                         fastaHeader = fastaHeader.concat(input.next());\r
140                 }\r
141                 return FastaReader.toFastaSequence(fastaHeader);\r
142         }\r
143 \r
144         /**\r
145          * Not implemented\r
146          */\r
147         @Override\r
148         public void remove() {\r
149                 throw new UnsupportedOperationException();\r
150         }\r
151 \r
152         /**\r
153          * Call this method to close the connection to the input file if you want to\r
154          * free up the resources. The connection will be closed on the JVM shutdown\r
155          * if this method was not called explicitly. No further reading on this\r
156          * instance of the FastaReader will be possible after calling this method.\r
157          */\r
158         public void close() {\r
159                 input.close();\r
160         }\r
161 \r
162         private static FastaSequence toFastaSequence(final String singleFastaEntry) {\r
163 \r
164                 assert !Util.isEmpty(singleFastaEntry) : "Empty String where FASTA sequence is expected!";\r
165 \r
166                 int nlineidx = singleFastaEntry.indexOf("\n");\r
167                 if (nlineidx < 0) {\r
168                         throw new AssertionError(\r
169                                         "The FASTA sequence must contain the header information"\r
170                                                         + " separated by the new line from the sequence. Given sequence does not appear to "\r
171                                                         + "contain the header! Given data:\n "\r
172                                                         + singleFastaEntry);\r
173                 }\r
174                 String header = singleFastaEntry.substring(0, nlineidx);\r
175 \r
176                 // Get rid of the new line chars (should cover common cases)\r
177                 header = header.replaceAll("\r", "");\r
178 \r
179                 String sequence = singleFastaEntry.substring(nlineidx);\r
180 \r
181                 if (Util.isEmpty(sequence)) {\r
182                         throw new AssertionError(\r
183                                         "Empty sequences are not allowed! Please make sure the "\r
184                                                         + " data is in the FASTA format! Given data:\n "\r
185                                                         + singleFastaEntry);\r
186                 }\r
187                 return new FastaSequence(header, sequence);\r
188         }\r
189 }\r