A wee improvement
[jabaws.git] / datamodel / compbio / data / sequence / FastaReader.java
1 package compbio.data.sequence;\r
2 \r
3 import java.io.File;\r
4 import java.io.FileNotFoundException;\r
5 import java.util.Iterator;\r
6 import java.util.Scanner;\r
7 \r
8 import compbio.util.Util;\r
9 \r
10 /**\r
11  * Reads files with FASTA formatted sequences. All the information in the FASTA\r
12  * header is preserved including trailing white spaces. All the white spaces are\r
13  * removed from the sequence.\r
14  * \r
15  * Examples of the correct input:\r
16  * \r
17  * <pre>\r
18  * \r
19  * >zedpshvyzg\r
20  * GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD\r
21  * \r
22  * >xovkactesa\r
23  * CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM\r
24  * FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG\r
25  * FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH\r
26  * DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC\r
27  * \r
28  * >ntazzewyvv\r
29  * CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD\r
30  * EASINM       AQQWRSLPPSRIMKLNG       HGCDCMHSHMEAD   DTKQSGIKGTFWNG  HDAQWLCRWG      \r
31  * EFITEA       WWGRWGAITFFHAH  ENKNEIQECSDQNLKE        SRTTCEIID   TCHLFTRHLDGW \r
32  *   RCEKCQANATHMTW ACTKSCAEQW  FCAKELMMN    \r
33  *   W        KQMGWRCKIFRKLFRDNCWID  FELPWWPICFCCKGLSTKSHSAHDGDQCRRW    WPDCARDWLGPGIRGEF   \r
34  *   FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI\r
35  * \r
36  *    > 12 d t y wi             k       jbke    \r
37  *   KLSHHDCD\r
38  *    N\r
39  *     H\r
40  *     HSKCTEPHCGNSHQMLHRDP\r
41  *     CCDQCQSWEAENWCASMRKAILF\r
42  * \r
43  * </pre>\r
44  * \r
45  * @author Peter Troshin\r
46  * @version 1.0 April 2011\r
47  * \r
48  */\r
49 public class FastaReader implements Iterator<FastaSequence> {\r
50 \r
51         private final Scanner input;\r
52 \r
53         /**\r
54          * Header data can contain non-ASCII symbols and read in UTF8\r
55          * \r
56          * @param input\r
57          *            the file containing the list of FASTA formatted sequences to\r
58          *            read from\r
59          * @throws FileNotFoundException\r
60          *             if the input file is not found\r
61          * @throws IllegalStateException\r
62          *             if the close method was called on this instance\r
63          * \r
64          */\r
65         public FastaReader(final String inputFile) throws FileNotFoundException {\r
66                 input = new Scanner(new File(inputFile), "UTF8");\r
67                 input.useDelimiter("\\s*>");\r
68                 Runtime.getRuntime().addShutdownHook(new Thread() {\r
69 \r
70                         @Override\r
71                         public void run() {\r
72                                 if (input != null) {\r
73                                         input.close();\r
74                                 }\r
75                         }\r
76                 });\r
77         }\r
78         /**\r
79          * {@inheritDoc}\r
80          * \r
81          * @throws IllegalStateException\r
82          *             if the close method was called on this instance\r
83          */\r
84         @Override\r
85         public boolean hasNext() {\r
86                 return input.hasNext();\r
87         }\r
88 \r
89         /**\r
90          * Reads the next FastaSequence from the input\r
91          * \r
92          * @throws AssertionError\r
93          *             if the header or the sequence is missing\r
94          * @throws IllegalStateException\r
95          *             if the close method was called on this instance\r
96          */\r
97         @Override\r
98         public FastaSequence next() {\r
99                 return FastaReader.toFastaSequence(input.next());\r
100         }\r
101 \r
102         /**\r
103          * Not implemented\r
104          */\r
105         @Override\r
106         public void remove() {\r
107                 throw new UnsupportedOperationException();\r
108         }\r
109 \r
110         /**\r
111          * Call this method to close the connection to the input file if you want to\r
112          * free up the resources. The connection will be closed on the JVM shutdown\r
113          * if this method was not called explicitly. No further reading on this\r
114          * instance of the FastaReader will be possible after calling this method.\r
115          */\r
116         public void close() {\r
117                 input.close();\r
118         }\r
119 \r
120         private static FastaSequence toFastaSequence(final String singleFastaEntry) {\r
121                 final Scanner sc = new Scanner(singleFastaEntry);\r
122                 // Use new line delimiter\r
123                 sc.useDelimiter("\n");\r
124                 if (!sc.hasNext()) {\r
125                         throw new AssertionError(\r
126                                         "The FASTA sequence must contain the header information"\r
127                                                         + " separated by the new line from the sequence. Given sequence does not appear to "\r
128                                                         + "contain the header! Given data:\n "\r
129                                                         + singleFastaEntry);\r
130                 }\r
131                 String header = sc.next();\r
132                 // Get rid of the new line chars (should cover common cases)\r
133                 header = header.replaceAll("\n", "").replaceAll("\r", "");\r
134 \r
135                 sc.useDelimiter("\\s*");\r
136                 final StringBuilder sb = new StringBuilder();\r
137                 while (sc.hasNext()) {\r
138                         sb.append(sc.next().trim());\r
139                 }\r
140                 final String sequence = sb.toString();\r
141                 if (Util.isEmpty(sequence)) {\r
142                         throw new AssertionError(\r
143                                         "Empty sequences are not allowed! Please make sure the "\r
144                                                         + " data is in the FASTA format! Given data:\n "\r
145                                                         + singleFastaEntry);\r
146                 }\r
147                 return new FastaSequence(header, sequence);\r
148         }\r
149 }\r