38f1ccbe83c7057a553fd557bb0ad9f512dbe645
[jabaws.git] / datamodel / compbio / data / sequence / FastaReader.java
1 /* Copyright (c) 2011 Peter Troshin\r
2  *  \r
3  *  JAva Bioinformatics Analysis Web Services (JABAWS) @version: 2.0\r
4  * \r
5  *  This library is free software; you can redistribute it and/or modify it under the terms of the\r
6  *  Apache License version 2 as published by the Apache Software Foundation\r
7  * \r
8  *  This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without\r
9  *  even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache \r
10  *  License for more details.\r
11  * \r
12  *  A copy of the license is in apache_license.txt. It is also available here:\r
13  * @see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
14  * \r
15  * Any republication or derived work distributed in source code form\r
16  * must include this copyright and license notice.\r
17  */\r
18 package compbio.data.sequence;\r
19 \r
20 import java.io.File;\r
21 import java.io.FileNotFoundException;\r
22 import java.io.InputStream;\r
23 import java.util.Iterator;\r
24 import java.util.Scanner;\r
25 \r
26 import compbio.util.Util;\r
27 \r
28 /**\r
29  * Reads files with FASTA formatted sequences. All the information in the FASTA\r
30  * header is preserved including trailing white spaces. All the white spaces are\r
31  * removed from the sequence.\r
32  * \r
33  * Examples of the correct input:\r
34  * \r
35  * <pre>\r
36  * \r
37  * >zedpshvyzg\r
38  * GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD\r
39  * \r
40  * >xovkactesa\r
41  * CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM\r
42  * FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG\r
43  * FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH\r
44  * DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC\r
45  * \r
46  * >ntazzewyvv\r
47  * CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD\r
48  * EASINM       AQQWRSLPPSRIMKLNG       HGCDCMHSHMEAD   DTKQSGIKGTFWNG  HDAQWLCRWG      \r
49  * EFITEA       WWGRWGAITFFHAH  ENKNEIQECSDQNLKE        SRTTCEIID   TCHLFTRHLDGW \r
50  *   RCEKCQANATHMTW ACTKSCAEQW  FCAKELMMN    \r
51  *   W        KQMGWRCKIFRKLFRDNCWID  FELPWWPICFCCKGLSTKSHSAHDGDQCRRW    WPDCARDWLGPGIRGEF   \r
52  *   FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI\r
53  * \r
54  *    > 12 d t y wi             k       jbke    \r
55  *   KLSHHDCD\r
56  *    N\r
57  *     H\r
58  *     HSKCTEPHCGNSHQMLHRDP\r
59  *     CCDQCQSWEAENWCASMRKAILF\r
60  * \r
61  * </pre>\r
62  * \r
63  * @author Peter Troshin\r
64  * @version 1.0 April 2011\r
65  * \r
66  */\r
67 public class FastaReader implements Iterator<FastaSequence> {\r
68 \r
69         private final Scanner input;\r
70 \r
71         /**\r
72          * Header data can contain non-ASCII symbols and read in UTF8\r
73          * \r
74          * @param input\r
75          *            the file containing the list of FASTA formatted sequences to\r
76          *            read from\r
77          * @throws FileNotFoundException\r
78          *             if the input file is not found\r
79          * @throws IllegalStateException\r
80          *             if the close method was called on this instance\r
81          * \r
82          */\r
83         public FastaReader(final String inputFile) throws FileNotFoundException {\r
84                 input = new Scanner(new File(inputFile), "UTF8");\r
85                 input.useDelimiter("\\s*>");\r
86                 Runtime.getRuntime().addShutdownHook(new Thread() {\r
87 \r
88                         @Override\r
89                         public void run() {\r
90                                 if (input != null) {\r
91                                         input.close();\r
92                                 }\r
93                         }\r
94                 });\r
95         }\r
96 \r
97         /**\r
98          * This class will not close the incoming stream! So the client should do\r
99          * so.\r
100          * \r
101          * @param inputStream\r
102          * @throws FileNotFoundException\r
103          */\r
104         public FastaReader(final InputStream inputStream)\r
105                         throws FileNotFoundException {\r
106                 input = new Scanner(inputStream);\r
107                 input.useDelimiter("\\s*>");\r
108         }\r
109         /**\r
110          * {@inheritDoc}\r
111          * \r
112          * @throws IllegalStateException\r
113          *             if the close method was called on this instance\r
114          */\r
115         @Override\r
116         public boolean hasNext() {\r
117                 return input.hasNext();\r
118         }\r
119 \r
120         /**\r
121          * Reads the next FastaSequence from the input\r
122          * \r
123          * @throws AssertionError\r
124          *             if the header or the sequence is missing\r
125          * @throws IllegalStateException\r
126          *             if the close method was called on this instance\r
127          */\r
128         @Override\r
129         public FastaSequence next() {\r
130                 return FastaReader.toFastaSequence(input.next());\r
131         }\r
132 \r
133         /**\r
134          * Not implemented\r
135          */\r
136         @Override\r
137         public void remove() {\r
138                 throw new UnsupportedOperationException();\r
139         }\r
140 \r
141         /**\r
142          * Call this method to close the connection to the input file if you want to\r
143          * free up the resources. The connection will be closed on the JVM shutdown\r
144          * if this method was not called explicitly. No further reading on this\r
145          * instance of the FastaReader will be possible after calling this method.\r
146          */\r
147         public void close() {\r
148                 input.close();\r
149         }\r
150 \r
151         private static FastaSequence toFastaSequence(final String singleFastaEntry) {\r
152 \r
153                 assert !Util.isEmpty(singleFastaEntry) : "Empty String where FASTA sequence is expected!";\r
154 \r
155                 int nlineidx = singleFastaEntry.indexOf("\n");\r
156                 if (nlineidx < 0) {\r
157                         throw new AssertionError(\r
158                                         "The FASTA sequence must contain the header information"\r
159                                                         + " separated by the new line from the sequence. Given sequence does not appear to "\r
160                                                         + "contain the header! Given data:\n "\r
161                                                         + singleFastaEntry);\r
162                 }\r
163                 String header = singleFastaEntry.substring(0, nlineidx);\r
164 \r
165                 // Get rid of the new line chars (should cover common cases)\r
166                 header = header.replaceAll("\r", "");\r
167 \r
168                 String sequence = singleFastaEntry.substring(nlineidx);\r
169 \r
170                 if (Util.isEmpty(sequence)) {\r
171                         throw new AssertionError(\r
172                                         "Empty sequences are not allowed! Please make sure the "\r
173                                                         + " data is in the FASTA format! Given data:\n "\r
174                                                         + singleFastaEntry);\r
175                 }\r
176                 return new FastaSequence(header, sequence);\r
177         }\r
178 }\r