New FastaReader implementing iterator & test case for it
[jabaws.git] / datamodel / compbio / data / sequence / FastaReader.java
1 package compbio.data.sequence;\r
2 \r
import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Scanner;

import compbio.util.Util;
9 \r
10 /**\r
11  * Reads files with FASTA formatted sequences. All the information in the FASTA\r
12  * header is preserved including trailing white spaces. All the white spaces are\r
13  * removed from the sequence.\r
14  * \r
15  * Examples of the correct input:\r
16  * \r
17  * <pre>\r
18  * \r
19  * >zedpshvyzg\r
20  * GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD\r
21  * \r
22  * >xovkactesa\r
23  * CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM\r
24  * FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG\r
25  * FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH\r
26  * DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC\r
27  * \r
28  * >ntazzewyvv\r
29  * CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD\r
30  * EASINM       AQQWRSLPPSRIMKLNG       HGCDCMHSHMEAD   DTKQSGIKGTFWNG  HDAQWLCRWG      \r
31  * EFITEA       WWGRWGAITFFHAH  ENKNEIQECSDQNLKE        SRTTCEIID   TCHLFTRHLDGW \r
32  *   RCEKCQANATHMTW ACTKSCAEQW  FCAKELMMN    \r
33  *   W        KQMGWRCKIFRKLFRDNCWID  FELPWWPICFCCKGLSTKSHSAHDGDQCRRW    WPDCARDWLGPGIRGEF   \r
34  *   FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI\r
35  * \r
36  *    > 12 d t y wi             k       jbke    \r
37  *   KLSHHDCD\r
38  *    N\r
39  *     H\r
40  *     HSKCTEPHCGNSHQMLHRDP\r
41  *     CCDQCQSWEAENWCASMRKAILF\r
42  * \r
43  * </pre>\r
44  * \r
45  * @author Peter Troshin\r
46  * @version 1.0 April 2011\r
47  * \r
48  */\r
49 public class FastaReader implements Iterator<FastaSequence> {\r
50 \r
51         private final Scanner input;\r
52 \r
53         /**\r
54          * Header data can contain non-ASCII symbols and read in UTF8\r
55          * \r
56          * @param input\r
57          *            the file containing the list of FASTA formatted sequences to\r
58          *            read from\r
59          * @throws FileNotFoundException\r
60          *             if the input file is not found\r
61          */\r
62         public FastaReader(final String input) throws FileNotFoundException {\r
63                 this.input = new Scanner(new File(input), "UTF8");\r
64                 this.input.useDelimiter("\\s*>");\r
65         }\r
66 \r
67         /**\r
68          * {@inheritDoc}\r
69          */\r
70         @Override\r
71         public boolean hasNext() {\r
72                 return input.hasNext();\r
73         }\r
74 \r
75         /**\r
76          * Reads the next FastaSequence from the input\r
77          * \r
78          * @throws AssertionError\r
79          *             if the header or the sequence is missing\r
80          */\r
81         @Override\r
82         public FastaSequence next() {\r
83                 return FastaReader.toFastaSequence(input.next());\r
84         }\r
85 \r
86         /**\r
87          * Not implemented\r
88          */\r
89         @Override\r
90         public void remove() {\r
91                 throw new UnsupportedOperationException();\r
92         }\r
93 \r
94         private static FastaSequence toFastaSequence(final String singleFastaEntry) {\r
95                 final Scanner sc = new Scanner(singleFastaEntry);\r
96                 // Use new line delimiter\r
97                 sc.useDelimiter("\n");\r
98                 if (!sc.hasNext()) {\r
99                         throw new AssertionError(\r
100                                         "The FASTA sequence must contain the header information"\r
101                                                         + " separated by the new line from the sequence. Given sequence does not appear to "\r
102                                                         + "contain the header! Given data:\n "\r
103                                                         + singleFastaEntry);\r
104                 }\r
105                 String header = sc.next();\r
106                 // Get rid of the new line chars (should cover common cases)\r
107                 header = header.replaceAll("\n", "").replaceAll("\r", "");\r
108 \r
109                 sc.useDelimiter("\\s*");\r
110                 final StringBuilder sb = new StringBuilder();\r
111                 while (sc.hasNext()) {\r
112                         sb.append(sc.next().trim());\r
113                 }\r
114                 final String sequence = sb.toString();\r
115                 if (Util.isEmpty(sequence)) {\r
116                         throw new AssertionError(\r
117                                         "Empty sequences are not allowed! Please make sure the "\r
118                                                         + " data is in the FASTA format! Given data:\n "\r
119                                                         + singleFastaEntry);\r
120                 }\r
121                 return new FastaSequence(header, sequence);\r
122         }\r
123 }\r