--- /dev/null
+package compbio.data.sequence;\r
+\r
+import java.io.Closeable;\r
+import java.io.File;\r
+import java.io.FileNotFoundException;\r
+import java.util.Iterator;\r
+import java.util.NoSuchElementException;\r
+import java.util.Scanner;\r
+\r
+import compbio.util.Util;\r
+\r
+/**\r
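+ * Reads files containing FASTA formatted sequences. The header line is kept\r
+ * as is, including trailing spaces; only line break characters are stripped\r
+ * from it. All whitespace is removed from the sequence. Records are split on\r
+ * every '>' character, so a header must not contain '>' itself.\r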
+ * \r
+ * Examples of the correct input:\r
+ * \r
+ * <pre>\r
+ * \r
+ * >zedpshvyzg\r
+ * GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD\r
+ * \r
+ * >xovkactesa\r
+ * CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM\r
+ * FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG\r
+ * FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH\r
+ * DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC\r
+ * \r
+ * >ntazzewyvv\r
+ * CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD\r
+ * EASINM AQQWRSLPPSRIMKLNG HGCDCMHSHMEAD DTKQSGIKGTFWNG HDAQWLCRWG \r
+ * EFITEA WWGRWGAITFFHAH ENKNEIQECSDQNLKE SRTTCEIID TCHLFTRHLDGW \r
+ * RCEKCQANATHMTW ACTKSCAEQW FCAKELMMN \r
+ * W KQMGWRCKIFRKLFRDNCWID FELPWWPICFCCKGLSTKSHSAHDGDQCRRW WPDCARDWLGPGIRGEF \r
+ * FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI\r
+ * \r
+ * > 12 d t y wi k jbke \r
+ * KLSHHDCD\r
+ * N\r
+ * H\r
+ * HSKCTEPHCGNSHQMLHRDP\r
+ * CCDQCQSWEAENWCASMRKAILF\r
+ * \r
+ * </pre>\r
+ * \r
+ * @author Peter Troshin\r
+ * @version 1.0 April 2011\r
+ * \r
+ */\r
+public class FastaReader implements Iterator<FastaSequence>, Closeable {\r
+\r
+    /**\r
+     * Scanner over the input file. Each token it yields is one FASTA record\r
+     * without the leading '>'.\r
+     */\r
+    private final Scanner input;\r
+\r
+    /**\r
+     * Set once the scanner has been closed so that it is never used again\r
+     * (a closed Scanner throws IllegalStateException on any further call).\r
+     */\r
+    private boolean closed;\r
+\r
+    /**\r
+     * Opens the given file for reading. Header data can contain non-ASCII\r
+     * symbols, so the file is read as UTF8.\r
+     * <p>\r
+     * The file stays open until all records have been read or {@link #close()}\r
+     * is called, whichever happens first.\r
+     * \r
+     * @param input\r
+     *            the file containing the list of FASTA formatted sequences to\r
+     *            read from\r
+     * @throws FileNotFoundException\r
+     *             if the input file is not found\r
+     */\r
+    public FastaReader(final String input) throws FileNotFoundException {\r
+        this.input = new Scanner(new File(input), "UTF8");\r
+        // A record ends at the next '>' together with any whitespace in front\r
+        // of it. Note that this matches every '>', not only those which start\r
+        // a line.\r
+        this.input.useDelimiter("\\s*>");\r
+    }\r
+\r
+    /**\r
+     * Tells whether another record is available. The underlying file is closed\r
+     * as soon as this method finds that the input is exhausted.\r
+     * \r
+     * @return true if there is at least one more record to read, false if the\r
+     *         input is exhausted or the reader has been closed\r
+     */\r
+    @Override\r
+    public boolean hasNext() {\r
+        if (closed) {\r
+            return false;\r
+        }\r
+        if (input.hasNext()) {\r
+            return true;\r
+        }\r
+        // Nothing left to read, release the file handle straight away\r
+        close();\r
+        return false;\r
+    }\r
+\r
+    /**\r
+     * Reads the next FastaSequence from the input\r
+     * \r
+     * @return the next sequence from the input\r
+     * @throws AssertionError\r
+     *             if the header or the sequence is missing\r
+     * @throws NoSuchElementException\r
+     *             if there are no more records or the reader has been closed\r
+     */\r
+    @Override\r
+    public FastaSequence next() {\r
+        if (!hasNext()) {\r
+            throw new NoSuchElementException();\r
+        }\r
+        return FastaReader.toFastaSequence(input.next());\r
+    }\r
+\r
+    /**\r
+     * Not implemented\r
+     * \r
+     * @throws UnsupportedOperationException\r
+     *             always\r
+     */\r
+    @Override\r
+    public void remove() {\r
+        throw new UnsupportedOperationException();\r
+    }\r
+\r
+    /**\r
+     * Closes the underlying file. Calling this method more than once has no\r
+     * further effect. After the reader is closed {@link #hasNext()} returns\r
+     * false.\r
+     */\r
+    @Override\r
+    public void close() {\r
+        if (!closed) {\r
+            closed = true;\r
+            input.close();\r
+        }\r
+    }\r
+\r
+    /**\r
+     * Converts a single FASTA record (the text between two '>' separators)\r
+     * into a FastaSequence. The first non empty line is the header, carriage\r
+     * returns are removed from it. Everything after the header line is the\r
+     * sequence, from which whitespace and any other character up to and\r
+     * including U+0020 is removed.\r
+     * \r
+     * @param singleFastaEntry\r
+     *            the record text without the leading '>'\r
+     * @return the parsed sequence\r
+     * @throws AssertionError\r
+     *             if the header or the sequence is missing\r
+     */\r
+    private static FastaSequence toFastaSequence(final String singleFastaEntry) {\r
+        final int length = singleFastaEntry.length();\r
+\r
+        // Skip the new line characters in front of the header line\r
+        int headerStart = 0;\r
+        while (headerStart < length\r
+                && singleFastaEntry.charAt(headerStart) == '\n') {\r
+            headerStart++;\r
+        }\r
+        if (headerStart == length) {\r
+            throw new AssertionError(\r
+                    "The FASTA sequence must contain the header information"\r
+                            + " separated by the new line from the sequence. Given sequence does not appear to "\r
+                            + "contain the header! Given data:\n "\r
+                            + singleFastaEntry);\r
+        }\r
+\r
+        // The header runs up to the next new line or to the end of the record\r
+        int headerEnd = singleFastaEntry.indexOf('\n', headerStart);\r
+        if (headerEnd < 0) {\r
+            headerEnd = length;\r
+        }\r
+        // Get rid of the carriage returns (covers Windows style line ends)\r
+        final String header = singleFastaEntry.substring(headerStart, headerEnd)\r
+                .replace("\r", "");\r
+\r
+        // Single pass over the rest of the record, only the characters above\r
+        // the space character belong to the sequence\r
+        final StringBuilder sb = new StringBuilder(length - headerEnd);\r
+        for (int i = headerEnd; i < length; i++) {\r
+            final char c = singleFastaEntry.charAt(i);\r
+            if (c > ' ') {\r
+                sb.append(c);\r
+            }\r
+        }\r
+        final String sequence = sb.toString();\r
+        if (Util.isEmpty(sequence)) {\r
+            throw new AssertionError(\r
+                    "Empty sequences are not allowed! Please make sure the "\r
+                            + " data is in the FASTA format! Given data:\n "\r
+                            + singleFastaEntry);\r
+        }\r
+        return new FastaSequence(header, sequence);\r
+    }\r
+}\r