New FastaReader implementing iterator & test case for it
[jabaws.git] / datamodel / compbio / data / sequence / FastaReader.java
diff --git a/datamodel/compbio/data/sequence/FastaReader.java b/datamodel/compbio/data/sequence/FastaReader.java
new file mode 100644 (file)
index 0000000..10eec8f
--- /dev/null
@@ -0,0 +1,123 @@
+package compbio.data.sequence;\r
+\r
+import java.io.File;\r
+import java.io.FileNotFoundException;\r
+import java.util.Iterator;\r
+import java.util.Scanner;\r
+\r
+import compbio.util.Util;\r
+\r
+/**\r
+ * Reads files with FASTA formatted sequences. All the information in the FASTA\r
+ * header is preserved including trailing white spaces. All the white spaces are\r
+ * removed from the sequence.\r
+ * \r
+ * Examples of the correct input:\r
+ * \r
+ * <pre>\r
+ * \r
+ * >zedpshvyzg\r
+ * GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD\r
+ * \r
+ * >xovkactesa\r
+ * CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM\r
+ * FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG\r
+ * FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH\r
+ * DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC\r
+ * \r
+ * >ntazzewyvv\r
+ * CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD\r
+ * EASINM      AQQWRSLPPSRIMKLNG       HGCDCMHSHMEAD   DTKQSGIKGTFWNG  HDAQWLCRWG      \r
+ * EFITEA      WWGRWGAITFFHAH  ENKNEIQECSDQNLKE        SRTTCEIID   TCHLFTRHLDGW \r
+ *   RCEKCQANATHMTW ACTKSCAEQW  FCAKELMMN    \r
+ *   W        KQMGWRCKIFRKLFRDNCWID  FELPWWPICFCCKGLSTKSHSAHDGDQCRRW    WPDCARDWLGPGIRGEF   \r
+ *   FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI\r
+ * \r
+ *    > 12 d t y wi            k       jbke    \r
+ *   KLSHHDCD\r
+ *    N\r
+ *     H\r
+ *     HSKCTEPHCGNSHQMLHRDP\r
+ *     CCDQCQSWEAENWCASMRKAILF\r
+ * \r
+ * </pre>\r
+ * \r
+ * @author Peter Troshin\r
+ * @version 1.0 April 2011\r
+ * \r
+ */\r
+public class FastaReader implements Iterator<FastaSequence> {\r
+\r
+       private final Scanner input;\r
+\r
+       /**\r
+        * Header data can contain non-ASCII symbols and read in UTF8\r
+        * \r
+        * @param input\r
+        *            the file containing the list of FASTA formatted sequences to\r
+        *            read from\r
+        * @throws FileNotFoundException\r
+        *             if the input file is not found\r
+        */\r
+       public FastaReader(final String input) throws FileNotFoundException {\r
+               this.input = new Scanner(new File(input), "UTF8");\r
+               this.input.useDelimiter("\\s*>");\r
+       }\r
+\r
+       /**\r
+        * {@inheritDoc}\r
+        */\r
+       @Override\r
+       public boolean hasNext() {\r
+               return input.hasNext();\r
+       }\r
+\r
+       /**\r
+        * Reads the next FastaSequence from the input\r
+        * \r
+        * @throws AssertionError\r
+        *             if the header or the sequence is missing\r
+        */\r
+       @Override\r
+       public FastaSequence next() {\r
+               return FastaReader.toFastaSequence(input.next());\r
+       }\r
+\r
+       /**\r
+        * Not implemented\r
+        */\r
+       @Override\r
+       public void remove() {\r
+               throw new UnsupportedOperationException();\r
+       }\r
+\r
+       private static FastaSequence toFastaSequence(final String singleFastaEntry) {\r
+               final Scanner sc = new Scanner(singleFastaEntry);\r
+               // Use new line delimiter\r
+               sc.useDelimiter("\n");\r
+               if (!sc.hasNext()) {\r
+                       throw new AssertionError(\r
+                                       "The FASTA sequence must contain the header information"\r
+                                                       + " separated by the new line from the sequence. Given sequence does not appear to "\r
+                                                       + "contain the header! Given data:\n "\r
+                                                       + singleFastaEntry);\r
+               }\r
+               String header = sc.next();\r
+               // Get rid of the new line chars (should cover common cases)\r
+               header = header.replaceAll("\n", "").replaceAll("\r", "");\r
+\r
+               sc.useDelimiter("\\s*");\r
+               final StringBuilder sb = new StringBuilder();\r
+               while (sc.hasNext()) {\r
+                       sb.append(sc.next().trim());\r
+               }\r
+               final String sequence = sb.toString();\r
+               if (Util.isEmpty(sequence)) {\r
+                       throw new AssertionError(\r
+                                       "Empty sequences are not allowed! Please make sure the "\r
+                                                       + " data is in the FASTA format! Given data:\n "\r
+                                                       + singleFastaEntry);\r
+               }\r
+               return new FastaSequence(header, sequence);\r
+       }\r
+}\r