--- /dev/null
+package compbio.data.sequence;\r
+\r
+import java.io.Closeable;\r
+import java.io.File;\r
+import java.io.FileNotFoundException;\r
+import java.util.Iterator;\r
+import java.util.NoSuchElementException;\r
+import java.util.Scanner;\r
+\r
+import compbio.util.Util;\r
+\r
+/**\r
+ * Reads files with FASTA formatted sequences. All the information in the FASTA\r
+ * header is preserved including trailing white spaces. All the white spaces are\r
+ * removed from the sequence.\r
+ * \r
+ * Examples of the correct input:\r
+ * \r
+ * <pre>\r
+ * \r
+ * >zedpshvyzg\r
+ * GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD\r
+ * \r
+ * >xovkactesa\r
+ * CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM\r
+ * FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG\r
+ * FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH\r
+ * DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC\r
+ * \r
+ * >ntazzewyvv\r
+ * CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD\r
+ * EASINM AQQWRSLPPSRIMKLNG HGCDCMHSHMEAD DTKQSGIKGTFWNG HDAQWLCRWG \r
+ * EFITEA WWGRWGAITFFHAH ENKNEIQECSDQNLKE SRTTCEIID TCHLFTRHLDGW \r
+ * RCEKCQANATHMTW ACTKSCAEQW FCAKELMMN \r
+ * W KQMGWRCKIFRKLFRDNCWID FELPWWPICFCCKGLSTKSHSAHDGDQCRRW WPDCARDWLGPGIRGEF \r
+ * FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI\r
+ * \r
+ * > 12 d t y wi k jbke \r
+ * KLSHHDCD\r
+ * N\r
+ * H\r
+ * HSKCTEPHCGNSHQMLHRDP\r
+ * CCDQCQSWEAENWCASMRKAILF\r
+ * \r
+ * </pre>\r
+ * \r
+ * @author Peter Troshin\r
+ * @version 1.0 April 2011\r
+ * \r
+ */\r
+public class FastaReader implements Iterator<FastaSequence>, Closeable {\r
+\r
+    /**\r
+     * Source of the raw FASTA records. Each token it returns is the text found\r
+     * between two '>' markers.\r
+     */\r
+    private final Scanner input;\r
+\r
+    /**\r
+     * Set once the underlying file has been released. Guards against calling\r
+     * into a closed Scanner, which would throw IllegalStateException.\r
+     */\r
+    private boolean closed = false;\r
+\r
+    /**\r
+     * Opens the given file for reading. The file is decoded as UTF-8, so the\r
+     * header data can contain non-ASCII symbols.\r
+     * <p>\r
+     * The file is released automatically as soon as {@link #hasNext()} finds\r
+     * that there is nothing left to read. Call {@link #close()} to release it\r
+     * earlier, e.g. when the iteration is abandoned half way through.\r
+     * \r
+     * @param input\r
+     *            the file containing the list of FASTA formatted sequences to\r
+     *            read from\r
+     * @throws FileNotFoundException\r
+     *             if the input file is not found\r
+     */\r
+    public FastaReader(final String input) throws FileNotFoundException {\r
+        this.input = new Scanner(new File(input), "UTF8");\r
+        // Every '>' starts a new record; any white space directly in front of\r
+        // it is part of the delimiter and thus never reaches the record text\r
+        this.input.useDelimiter("\\s*>");\r
+    }\r
+\r
+    /**\r
+     * {@inheritDoc}\r
+     * <p>\r
+     * Returns false once the reader has been closed. When the input is found\r
+     * to be exhausted the underlying file is closed as a side effect.\r
+     */\r
+    @Override\r
+    public boolean hasNext() {\r
+        if (closed) {\r
+            return false;\r
+        }\r
+        final boolean hasMore = input.hasNext();\r
+        if (!hasMore) {\r
+            // Nothing left to read, so there is no reason to keep the file open\r
+            close();\r
+        }\r
+        return hasMore;\r
+    }\r
+\r
+    /**\r
+     * Reads the next FastaSequence from the input\r
+     * \r
+     * @return the next sequence found in the input\r
+     * @throws AssertionError\r
+     *             if the header or the sequence is missing\r
+     * @throws NoSuchElementException\r
+     *             if there are no more sequences to read or the reader has\r
+     *             been closed\r
+     */\r
+    @Override\r
+    public FastaSequence next() {\r
+        // Checked up front because a closed Scanner would otherwise answer\r
+        // with IllegalStateException instead of the exception Iterator promises\r
+        if (!hasNext()) {\r
+            throw new NoSuchElementException("No more FASTA sequences to read");\r
+        }\r
+        return FastaReader.toFastaSequence(input.next());\r
+    }\r
+\r
+    /**\r
+     * Not implemented\r
+     * \r
+     * @throws UnsupportedOperationException\r
+     *             always\r
+     */\r
+    @Override\r
+    public void remove() {\r
+        throw new UnsupportedOperationException();\r
+    }\r
+\r
+    /**\r
+     * Releases the underlying file. Calling this method more than once has no\r
+     * further effect. After the call {@link #hasNext()} returns false.\r
+     */\r
+    @Override\r
+    public void close() {\r
+        if (!closed) {\r
+            closed = true;\r
+            input.close();\r
+        }\r
+    }\r
+\r
+    /**\r
+     * Converts the text of a single FASTA record (without the leading '>')\r
+     * into a FastaSequence.\r
+     * \r
+     * @param singleFastaEntry\r
+     *            the header line followed by the sequence lines\r
+     * @return the sequence with the header stripped of new line characters and\r
+     *         the sequence data stripped of white space\r
+     * @throws AssertionError\r
+     *             if no header can be read or no sequence data follows it\r
+     */\r
+    private static FastaSequence toFastaSequence(final String singleFastaEntry) {\r
+        final Scanner sc = new Scanner(singleFastaEntry);\r
+        // Use new line delimiter: the first token is the header\r
+        sc.useDelimiter("\n");\r
+        if (!sc.hasNext()) {\r
+            throw new AssertionError(\r
+                    "The FASTA sequence must contain the header information"\r
+                            + " separated by the new line from the sequence. Given sequence does not appear to "\r
+                            + "contain the header! Given data:\n "\r
+                            + singleFastaEntry);\r
+        }\r
+        String header = sc.next();\r
+        // Get rid of the new line chars (should cover common cases), e.g. the\r
+        // '\r' left behind by Windows style line endings\r
+        header = header.replaceAll("\n", "").replaceAll("\r", "");\r
+\r
+        // From here on any run of white space separates the tokens, so gluing\r
+        // the tokens together gives the sequence without any white space\r
+        sc.useDelimiter("\\s*");\r
+        final StringBuilder sb = new StringBuilder();\r
+        while (sc.hasNext()) {\r
+            sb.append(sc.next().trim());\r
+        }\r
+        final String sequence = sb.toString();\r
+        if (Util.isEmpty(sequence)) {\r
+            throw new AssertionError(\r
+                    "Empty sequences are not allowed! Please make sure the "\r
+                            + " data is in the FASTA format! Given data:\n "\r
+                            + singleFastaEntry);\r
+        }\r
+        return new FastaSequence(header, sequence);\r
+    }\r
+}\r
--- /dev/null
+package compbio.data.sequence;\r
+\r
+import java.io.FileInputStream;\r
+import java.io.FileNotFoundException;\r
+import java.io.IOException;\r
+import java.util.ArrayList;\r
+import java.util.List;\r
+\r
+import org.testng.Assert;\r
+import org.testng.annotations.Test;\r
+\r
+import compbio.metadata.AllTestSuit;\r
+\r
+/**\r
+ * Checks FastaReader against the hand written expectations below and against\r
+ * the sequences SequenceUtil.readFasta returns for the same file.\r
+ */\r
+public class FastaReaderTester {\r
+\r
+    // Expected content of complicated.fasta, one constant per record, in file\r
+    // order\r
+    static final FastaSequence s0 = new FastaSequence(\r
+            "zedpshvyzg",\r
+            "GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD");\r
+\r
+    static final FastaSequence s1 = new FastaSequence(\r
+            "xovkactesa",\r
+            "CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM"\r
+                    + "FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG"\r
+                    + "FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH"\r
+                    + "DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC");\r
+\r
+    static final FastaSequence s2 = new FastaSequence(\r
+            "ntazzewyvv",\r
+            "CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD"\r
+                    + "EASINM AQQWRSLPPSRIMKLNG HGCDCMHSHMEAD DTKQSGIKGTFWNG HDAQWLCRWG"\r
+                    + "EFITEA WWGRWGAITFFHAH ENKNEIQECSDQNLKE SRTTCEIID TCHLFTRHLDGW"\r
+                    + " RCEKCQANATHMTW ACTKSCAEQW FCAKELMMN "\r
+                    + "W KQMGWRCKIFRKLFRDNCWID FELPWWPICFCCKGLSTKSHSAHDGDQCRRW WPDCARDWLGPGIRGEF "\r
+                    + "FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI");\r
+\r
+    static final FastaSequence s3 = new FastaSequence(" 12 d t y wi k jbke ",\r
+            " KLSHHDCD" + " N" + " H" + " HSKCTEPHCGNSHQML\n\rHRDP"\r
+                    + " CCDQCQSWEAENWCASMRKAILF");\r
+\r
+    /**\r
+     * Reads complicated.fasta with both readers and compares the results.\r
+     * \r
+     * @throws IOException\r
+     *             if the test data cannot be read; reported by TestNG as a\r
+     *             test failure together with the full stack trace\r
+     */\r
+    @Test()\r
+    public void test() throws IOException {\r
+        final String fastaFile = AllTestSuit.TEST_DATA_PATH\r
+                + "complicated.fasta";\r
+\r
+        final List<FastaSequence> old_seqs;\r
+        final FileInputStream in = new FileInputStream(fastaFile);\r
+        try {\r
+            old_seqs = SequenceUtil.readFasta(in);\r
+        } finally {\r
+            // Harmless if readFasta has already closed the stream itself\r
+            in.close();\r
+        }\r
+\r
+        final List<FastaSequence> list = new ArrayList<FastaSequence>();\r
+        final FastaReader fr = new FastaReader(fastaFile);\r
+        while (fr.hasNext()) {\r
+            list.add(fr.next());\r
+        }\r
+\r
+        System.out.println("OLD: " + old_seqs);\r
+        System.out.println("NEW: " + list);\r
+\r
+        // TestNG expects the arguments as (actual, expected).\r
+        // The old reader is expected to come back with one record less than\r
+        // the new one - presumably it cannot cope with all the records of this\r
+        // file (TODO confirm), hence only the first two records are compared.\r
+        Assert.assertEquals(list.size(), old_seqs.size() + 1);\r
+        Assert.assertEquals(list.get(0), old_seqs.get(0));\r
+        Assert.assertEquals(list.get(1), old_seqs.get(1));\r
+\r
+        Assert.assertEquals(list.get(0), FastaReaderTester.s0);\r
+        Assert.assertEquals(list.get(1), FastaReaderTester.s1);\r
+        Assert.assertEquals(list.get(2), FastaReaderTester.s2);\r
+        Assert.assertEquals(list.get(3), FastaReaderTester.s3);\r
+    }\r
+}\r
--- /dev/null
+>zedpshvyzg\r
+GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD\r
+\r
+>xovkactesa\r
+CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM\r
+FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG\r
+FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH\r
+DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC\r
+\r
+>ntazzewyvv\r
+CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD\r
+EASINM AQQWRSLPPSRIMKLNG HGCDCMHSHMEAD DTKQSGIKGTFWNG HDAQWLCRWG \r
+EFITEA WWGRWGAITFFHAH ENKNEIQECSDQNLKE SRTTCEIID TCHLFTRHLDGW \r
+ RCEKCQANATHMTW ACTKSCAEQW FCAKELMMN \r
+ W KQMGWRCKIFRKLFRDNCWID FELPWWPICFCCKGLSTKSHSAHDGDQCRRW WPDCARDWLGPGIRGEF \r
+ FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI\r
+\r
+ > 12 d t y wi k jbke \r
+ KLSHHDCD\r
+ N\r
+ H\r
+ HSKCTEPHCGNSHQMLHRDP\r
+ CCDQCQSWEAENWCASMRKAILF
\ No newline at end of file