New FastaReader implementing iterator & test case for it
author pvtroshin <pvtroshin@e3abac25-378b-4346-85de-24260fe3988d>
Wed, 27 Apr 2011 23:10:25 +0000 (23:10 +0000)
committer pvtroshin <pvtroshin@e3abac25-378b-4346-85de-24260fe3988d>
Wed, 27 Apr 2011 23:10:25 +0000 (23:10 +0000)
git-svn-id: link to svn.lifesci.dundee.ac.uk/svn/barton/ptroshin/JABA2@4041 e3abac25-378b-4346-85de-24260fe3988d

datamodel/compbio/data/sequence/FastaReader.java [new file with mode: 0644]
testsrc/compbio/data/sequence/FastaReaderTester.java [new file with mode: 0644]
testsrc/testdata/complicated.fasta [new file with mode: 0644]

diff --git a/datamodel/compbio/data/sequence/FastaReader.java b/datamodel/compbio/data/sequence/FastaReader.java
new file mode 100644 (file)
index 0000000..10eec8f
--- /dev/null
@@ -0,0 +1,123 @@
+package compbio.data.sequence;\r
+\r
+import java.io.Closeable;\r
+import java.io.File;\r
+import java.io.FileNotFoundException;\r
+import java.util.Iterator;\r
+import java.util.NoSuchElementException;\r
+import java.util.Scanner;\r
+\r
+import compbio.util.Util;\r
+\r
+/**\r
+ * Reads files with FASTA formatted sequences. All the information in the FASTA\r
+ * header is preserved, including trailing white space. All white space is\r
+ * removed from the sequence.\r
+ * \r
+ * Examples of the correct input:\r
+ * \r
+ * <pre>\r
+ * \r
+ * >zedpshvyzg\r
+ * GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD\r
+ * \r
+ * >xovkactesa\r
+ * CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM\r
+ * FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG\r
+ * FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH\r
+ * DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC\r
+ * \r
+ * >ntazzewyvv\r
+ * CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD\r
+ * EASINM      AQQWRSLPPSRIMKLNG       HGCDCMHSHMEAD   DTKQSGIKGTFWNG  HDAQWLCRWG      \r
+ * EFITEA      WWGRWGAITFFHAH  ENKNEIQECSDQNLKE        SRTTCEIID   TCHLFTRHLDGW \r
+ *   RCEKCQANATHMTW ACTKSCAEQW  FCAKELMMN    \r
+ *   W        KQMGWRCKIFRKLFRDNCWID  FELPWWPICFCCKGLSTKSHSAHDGDQCRRW    WPDCARDWLGPGIRGEF   \r
+ *   FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI\r
+ * \r
+ *    > 12 d t y wi            k       jbke    \r
+ *   KLSHHDCD\r
+ *    N\r
+ *     H\r
+ *     HSKCTEPHCGNSHQMLHRDP\r
+ *     CCDQCQSWEAENWCASMRKAILF\r
+ * \r
+ * </pre>\r
+ * \r
+ * @author Peter Troshin\r
+ * @version 1.0 April 2011\r
+ * \r
+ */\r
+public class FastaReader implements Iterator<FastaSequence> {\r
+\r
+       private final Scanner input;\r
+\r
+       /**\r
+        * Header data can contain non-ASCII symbols and read in UTF8\r
+        * \r
+        * @param input\r
+        *            the file containing the list of FASTA formatted sequences to\r
+        *            read from\r
+        * @throws FileNotFoundException\r
+        *             if the input file is not found\r
+        */\r
+       public FastaReader(final String input) throws FileNotFoundException {\r
+               this.input = new Scanner(new File(input), "UTF8");\r
+               this.input.useDelimiter("\\s*>");\r
+       }\r
+\r
+       /**\r
+        * {@inheritDoc}\r
+        */\r
+       @Override\r
+       public boolean hasNext() {\r
+               return input.hasNext();\r
+       }\r
+\r
+       /**\r
+        * Reads the next FastaSequence from the input\r
+        * \r
+        * @throws AssertionError\r
+        *             if the header or the sequence is missing\r
+        */\r
+       @Override\r
+       public FastaSequence next() {\r
+               return FastaReader.toFastaSequence(input.next());\r
+       }\r
+\r
+       /**\r
+        * Not implemented\r
+        */\r
+       @Override\r
+       public void remove() {\r
+               throw new UnsupportedOperationException();\r
+       }\r
+\r
+       private static FastaSequence toFastaSequence(final String singleFastaEntry) {\r
+               final Scanner sc = new Scanner(singleFastaEntry);\r
+               // Use new line delimiter\r
+               sc.useDelimiter("\n");\r
+               if (!sc.hasNext()) {\r
+                       throw new AssertionError(\r
+                                       "The FASTA sequence must contain the header information"\r
+                                                       + " separated by the new line from the sequence. Given sequence does not appear to "\r
+                                                       + "contain the header! Given data:\n "\r
+                                                       + singleFastaEntry);\r
+               }\r
+               String header = sc.next();\r
+               // Get rid of the new line chars (should cover common cases)\r
+               header = header.replaceAll("\n", "").replaceAll("\r", "");\r
+\r
+               sc.useDelimiter("\\s*");\r
+               final StringBuilder sb = new StringBuilder();\r
+               while (sc.hasNext()) {\r
+                       sb.append(sc.next().trim());\r
+               }\r
+               final String sequence = sb.toString();\r
+               if (Util.isEmpty(sequence)) {\r
+                       throw new AssertionError(\r
+                                       "Empty sequences are not allowed! Please make sure the "\r
+                                                       + " data is in the FASTA format! Given data:\n "\r
+                                                       + singleFastaEntry);\r
+               }\r
+               return new FastaSequence(header, sequence);\r
+       }\r
+}\r
diff --git a/testsrc/compbio/data/sequence/FastaReaderTester.java b/testsrc/compbio/data/sequence/FastaReaderTester.java
new file mode 100644 (file)
index 0000000..77a7fd5
--- /dev/null
@@ -0,0 +1,73 @@
+package compbio.data.sequence;\r
+\r
+import java.io.FileInputStream;\r
+import java.io.FileNotFoundException;\r
+import java.io.IOException;\r
+import java.util.ArrayList;\r
+import java.util.List;\r
+\r
+import org.testng.Assert;\r
+import org.testng.annotations.Test;\r
+\r
+import compbio.metadata.AllTestSuit;\r
+\r
+public class FastaReaderTester {\r
+\r
+       static FastaSequence s0 = new FastaSequence(\r
+                       "zedpshvyzg",\r
+                       "GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD");\r
+\r
+       static FastaSequence s1 = new FastaSequence(\r
+                       "xovkactesa",\r
+                       "CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM"\r
+                                       + "FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG"\r
+                                       + "FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH"\r
+                                       + "DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC");\r
+       static FastaSequence s2 = new FastaSequence(\r
+                       "ntazzewyvv",\r
+                       "CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD"\r
+                                       + "EASINM       AQQWRSLPPSRIMKLNG       HGCDCMHSHMEAD   DTKQSGIKGTFWNG  HDAQWLCRWG"\r
+                                       + "EFITEA       WWGRWGAITFFHAH  ENKNEIQECSDQNLKE        SRTTCEIID   TCHLFTRHLDGW"\r
+                                       + " RCEKCQANATHMTW ACTKSCAEQW  FCAKELMMN    "\r
+                                       + "W        KQMGWRCKIFRKLFRDNCWID  FELPWWPICFCCKGLSTKSHSAHDGDQCRRW    WPDCARDWLGPGIRGEF   "\r
+                                       + "FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI");\r
+       static FastaSequence s3 = new FastaSequence(" 12 d t y wi               k       jbke    ",\r
+                       "  KLSHHDCD" + "   N" + "    H" + "    HSKCTEPHCGNSHQML\n\rHRDP"\r
+                                       + "    CCDQCQSWEAENWCASMRKAILF");\r
+       @Test()\r
+       public void test() {\r
+\r
+               List<FastaSequence> old_seqs = null;\r
+               final List<FastaSequence> list = new ArrayList<FastaSequence>();\r
+               try {\r
+                       old_seqs = SequenceUtil.readFasta(new FileInputStream(\r
+                                       AllTestSuit.TEST_DATA_PATH + "complicated.fasta"));\r
+                       final FastaReader fr = new FastaReader(AllTestSuit.TEST_DATA_PATH\r
+                                       + "complicated.fasta");\r
+\r
+                       while (fr.hasNext()) {\r
+                               final FastaSequence fs = fr.next();\r
+                               list.add(fs);\r
+                       }\r
+               } catch (final FileNotFoundException e) {\r
+                       e.printStackTrace();\r
+                       Assert.fail(e.getLocalizedMessage());\r
+               } catch (final IOException e) {\r
+                       e.printStackTrace();\r
+                       Assert.fail(e.getLocalizedMessage());\r
+               }\r
+               System.out.println("OLD: " + old_seqs);\r
+               System.out.println("NEW: " + list);\r
+               Assert.assertEquals(old_seqs.size() + 1, list.size());\r
+               Assert.assertEquals(old_seqs.get(0), list.get(0));\r
+               Assert.assertEquals(old_seqs.get(1), list.get(1));\r
+               // Assert.assertEquals(old_seqs.get(2), list.get(2));\r
+               // Assert.assertEquals(seqs.get(3), list.get(3));\r
+\r
+               Assert.assertEquals(FastaReaderTester.s0, list.get(0));\r
+               Assert.assertEquals(FastaReaderTester.s1, list.get(1));\r
+               Assert.assertEquals(FastaReaderTester.s2, list.get(2));\r
+               Assert.assertEquals(FastaReaderTester.s3, list.get(3));\r
+\r
+       }\r
+}\r
diff --git a/testsrc/testdata/complicated.fasta b/testsrc/testdata/complicated.fasta
new file mode 100644 (file)
index 0000000..3c891d6
--- /dev/null
@@ -0,0 +1,23 @@
+>zedpshvyzg\r
+GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD\r
+\r
+>xovkactesa\r
+CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM\r
+FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG\r
+FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH\r
+DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC\r
+\r
+>ntazzewyvv\r
+CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD\r
+EASINM AQQWRSLPPSRIMKLNG       HGCDCMHSHMEAD   DTKQSGIKGTFWNG  HDAQWLCRWG      \r
+EFITEA WWGRWGAITFFHAH  ENKNEIQECSDQNLKE        SRTTCEIID   TCHLFTRHLDGW \r
+  RCEKCQANATHMTW ACTKSCAEQW  FCAKELMMN    \r
+  W        KQMGWRCKIFRKLFRDNCWID  FELPWWPICFCCKGLSTKSHSAHDGDQCRRW    WPDCARDWLGPGIRGEF   \r
+  FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI\r
+\r
+   > 12 d t y wi               k       jbke    \r
+  KLSHHDCD\r
+   N\r
+    H\r
+    HSKCTEPHCGNSHQMLHRDP\r
+    CCDQCQSWEAENWCASMRKAILF
\ No newline at end of file