JWS-29 patch fixes bug - weird workaround since "^\\s*>" caused scanner to advance...
[jabaws.git] / datamodel / compbio / data / sequence / FastaReader.java
index ed91e93..6c3e943 100644 (file)
@@ -19,8 +19,12 @@ package compbio.data.sequence;
 \r
 import java.io.File;\r
 import java.io.FileNotFoundException;\r
+import java.io.InputStream;\r
 import java.util.Iterator;\r
 import java.util.Scanner;\r
+import java.util.regex.MatchResult;\r
+\r
+import javax.vecmath.MismatchedSizeException;\r
 \r
 import compbio.util.Util;\r
 \r
@@ -66,11 +70,14 @@ import compbio.util.Util;
 public class FastaReader implements Iterator<FastaSequence> {\r
 \r
        private final Scanner input;\r
-\r
+       /**\r
+        * Delimiter for the scanner\r
+        */\r
+       private final String DELIM=">";\r
        /**\r
         * Header data can contain non-ASCII symbols and read in UTF8\r
         * \r
-        * @param input\r
+        * @param inputFile\r
         *            the file containing the list of FASTA formatted sequences to\r
         *            read from\r
         * @throws FileNotFoundException\r
@@ -81,7 +88,7 @@ public class FastaReader implements Iterator<FastaSequence> {
         */\r
        public FastaReader(final String inputFile) throws FileNotFoundException {\r
                input = new Scanner(new File(inputFile), "UTF8");\r
-               input.useDelimiter("\\s*>");\r
+               input.useDelimiter(DELIM);\r
                Runtime.getRuntime().addShutdownHook(new Thread() {\r
 \r
                        @Override\r
@@ -92,6 +99,19 @@ public class FastaReader implements Iterator<FastaSequence> {
                        }\r
                });\r
        }\r
+\r
+       /**\r
+        * Reads FASTA entries from a stream, decoded as UTF8 like the file based\r
+        * constructor. No shutdown hook is set up here: the client closes the stream.\r
+        * \r
+        * @param inputStream the stream of FASTA formatted sequences to read from\r
+        * @throws FileNotFoundException kept in the signature; not raised by this body\r
+        */\r
+       public FastaReader(final InputStream inputStream)\r
+                       throws FileNotFoundException {\r
+               input = new Scanner(inputStream, "UTF8");\r
+               input.useDelimiter(DELIM);\r
+       }\r
        /**\r
         * {@inheritDoc}\r
         * \r
@@ -110,10 +130,17 @@ public class FastaReader implements Iterator<FastaSequence> {
         *             if the header or the sequence is missing\r
         * @throws IllegalStateException\r
         *             if the close method was called on this instance\r
+        * @throws NoSuchElementException if there are no more FastaSequences\r
         */\r
        @Override\r
        public FastaSequence next() { /* builds one FASTA entry from scanner tokens and converts it */\r
-               return FastaReader.toFastaSequence(input.next());\r
+               String fastaHeader=input.next(); /* text up to the next '>' delimiter */\r
+               while (fastaHeader.indexOf("\n")<0 && input.hasNext()) /* no newline yet: the split fell inside the first (header) line */\r
+               {\r
+                       fastaHeader = fastaHeader.concat(">"); /* put back the '>' the scanner consumed as a delimiter */\r
+                       fastaHeader = fastaHeader.concat(input.next()); /* append the following token and re-check for a newline */\r
+               }\r
+               return FastaReader.toFastaSequence(fastaHeader); /* text before the first newline is treated as the header there */\r
        }\r
 \r
        /**\r
@@ -135,26 +162,24 @@ public class FastaReader implements Iterator<FastaSequence> {
        }\r
 \r
        private static FastaSequence toFastaSequence(final String singleFastaEntry) {\r
-               final Scanner sc = new Scanner(singleFastaEntry);\r
-               // Use new line delimiter\r
-               sc.useDelimiter("\n");\r
-               if (!sc.hasNext()) {\r
+\r
+               assert !Util.isEmpty(singleFastaEntry) : "Empty String where FASTA sequence is expected!";\r
+\r
+               int nlineidx = singleFastaEntry.indexOf("\n");\r
+               if (nlineidx < 0) {\r
                        throw new AssertionError(\r
                                        "The FASTA sequence must contain the header information"\r
                                                        + " separated by the new line from the sequence. Given sequence does not appear to "\r
                                                        + "contain the header! Given data:\n "\r
                                                        + singleFastaEntry);\r
                }\r
-               String header = sc.next();\r
+               String header = singleFastaEntry.substring(0, nlineidx);\r
+\r
                // Get rid of the new line chars (should cover common cases)\r
                header = header.replaceAll("\r", "");\r
 \r
-               sc.useDelimiter("\\s*");\r
-               final StringBuilder sb = new StringBuilder();\r
-               while (sc.hasNext()) {\r
-                       sb.append(sc.next());\r
-               }\r
-               final String sequence = sb.toString();\r
+               String sequence = singleFastaEntry.substring(nlineidx);\r
+\r
                if (Util.isEmpty(sequence)) {\r
                        throw new AssertionError(\r
                                        "Empty sequences are not allowed! Please make sure the "\r