A wee improvement
[jabaws.git] / datamodel / compbio / data / sequence / FastaReader.java
1 /* Copyright (c) 2011 Peter Troshin\r
2  *  \r
3  *  JAva Bioinformatics Analysis Web Services (JABAWS) @version: 2.0\r
4  * \r
5  *  This library is free software; you can redistribute it and/or modify it under the terms of the\r
6  *  Apache License version 2 as published by the Apache Software Foundation\r
7  * \r
8  *  This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without\r
9  *  even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache \r
10  *  License for more details.\r
11  * \r
12  *  A copy of the license is in apache_license.txt. It is also available here:\r
13  * @see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
14  * \r
15  * Any republication or derived work distributed in source code form\r
16  * must include this copyright and license notice.\r
17  */\r
18 package compbio.data.sequence;\r
19 \r
20 import java.io.File;\r
21 import java.io.FileNotFoundException;\r
22 import java.util.Iterator;\r
23 import java.util.Scanner;\r
24 \r
25 import compbio.util.Util;\r
26 \r
27 /**\r
28  * Reads files with FASTA formatted sequences. All the information in the FASTA\r
29  * header is preserved including trailing white spaces. All the white spaces are\r
30  * removed from the sequence.\r
31  * \r
32  * Examples of the correct input:\r
33  * \r
34  * <pre>\r
35  * \r
36  * >zedpshvyzg\r
37  * GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD\r
38  * \r
39  * >xovkactesa\r
40  * CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM\r
41  * FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG\r
42  * FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH\r
43  * DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC\r
44  * \r
45  * >ntazzewyvv\r
46  * CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD\r
47  * EASINM       AQQWRSLPPSRIMKLNG       HGCDCMHSHMEAD   DTKQSGIKGTFWNG  HDAQWLCRWG      \r
48  * EFITEA       WWGRWGAITFFHAH  ENKNEIQECSDQNLKE        SRTTCEIID   TCHLFTRHLDGW \r
49  *   RCEKCQANATHMTW ACTKSCAEQW  FCAKELMMN    \r
50  *   W        KQMGWRCKIFRKLFRDNCWID  FELPWWPICFCCKGLSTKSHSAHDGDQCRRW    WPDCARDWLGPGIRGEF   \r
51  *   FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI\r
52  * \r
53  *    > 12 d t y wi             k       jbke    \r
54  *   KLSHHDCD\r
55  *    N\r
56  *     H\r
57  *     HSKCTEPHCGNSHQMLHRDP\r
58  *     CCDQCQSWEAENWCASMRKAILF\r
59  * \r
60  * </pre>\r
61  * \r
62  * @author Peter Troshin\r
63  * @version 1.0 April 2011\r
64  * \r
65  */\r
66 public class FastaReader implements Iterator<FastaSequence> {\r
67 \r
68         private final Scanner input;\r
69 \r
70         /**\r
71          * Header data can contain non-ASCII symbols and read in UTF8\r
72          * \r
73          * @param input\r
74          *            the file containing the list of FASTA formatted sequences to\r
75          *            read from\r
76          * @throws FileNotFoundException\r
77          *             if the input file is not found\r
78          * @throws IllegalStateException\r
79          *             if the close method was called on this instance\r
80          * \r
81          */\r
82         public FastaReader(final String inputFile) throws FileNotFoundException {\r
83                 input = new Scanner(new File(inputFile), "UTF8");\r
84                 input.useDelimiter("\\s*>");\r
85                 Runtime.getRuntime().addShutdownHook(new Thread() {\r
86 \r
87                         @Override\r
88                         public void run() {\r
89                                 if (input != null) {\r
90                                         input.close();\r
91                                 }\r
92                         }\r
93                 });\r
94         }\r
95         /**\r
96          * {@inheritDoc}\r
97          * \r
98          * @throws IllegalStateException\r
99          *             if the close method was called on this instance\r
100          */\r
101         @Override\r
102         public boolean hasNext() {\r
103                 return input.hasNext();\r
104         }\r
105 \r
106         /**\r
107          * Reads the next FastaSequence from the input\r
108          * \r
109          * @throws AssertionError\r
110          *             if the header or the sequence is missing\r
111          * @throws IllegalStateException\r
112          *             if the close method was called on this instance\r
113          */\r
114         @Override\r
115         public FastaSequence next() {\r
116                 return FastaReader.toFastaSequence(input.next());\r
117         }\r
118 \r
119         /**\r
120          * Not implemented\r
121          */\r
122         @Override\r
123         public void remove() {\r
124                 throw new UnsupportedOperationException();\r
125         }\r
126 \r
127         /**\r
128          * Call this method to close the connection to the input file if you want to\r
129          * free up the resources. The connection will be closed on the JVM shutdown\r
130          * if this method was not called explicitly. No further reading on this\r
131          * instance of the FastaReader will be possible after calling this method.\r
132          */\r
133         public void close() {\r
134                 input.close();\r
135         }\r
136 \r
137         private static FastaSequence toFastaSequence(final String singleFastaEntry) {\r
138                 final Scanner sc = new Scanner(singleFastaEntry);\r
139                 // Use new line delimiter\r
140                 sc.useDelimiter("\n");\r
141                 if (!sc.hasNext()) {\r
142                         throw new AssertionError(\r
143                                         "The FASTA sequence must contain the header information"\r
144                                                         + " separated by the new line from the sequence. Given sequence does not appear to "\r
145                                                         + "contain the header! Given data:\n "\r
146                                                         + singleFastaEntry);\r
147                 }\r
148                 String header = sc.next();\r
149                 // Get rid of the new line chars (should cover common cases)\r
150                 header = header.replaceAll("\r", "");\r
151 \r
152                 sc.useDelimiter("\\s*");\r
153                 final StringBuilder sb = new StringBuilder();\r
154                 while (sc.hasNext()) {\r
155                         sb.append(sc.next());\r
156                 }\r
157                 final String sequence = sb.toString();\r
158                 if (Util.isEmpty(sequence)) {\r
159                         throw new AssertionError(\r
160                                         "Empty sequences are not allowed! Please make sure the "\r
161                                                         + " data is in the FASTA format! Given data:\n "\r
162                                                         + singleFastaEntry);\r
163                 }\r
164                 return new FastaSequence(header, sequence);\r
165         }\r
166 }\r