+/* Copyright (c) 2011 Peter Troshin\r
+ * \r
+ * JAva Bioinformatics Analysis Web Services (JABAWS) @version: 2.0\r
+ * \r
+ * This library is free software; you can redistribute it and/or modify it under the terms of the\r
+ * Apache License version 2 as published by the Apache Software Foundation\r
+ * \r
+ * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without\r
+ * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache \r
+ * License for more details.\r
+ * \r
+ * A copy of the license is in apache_license.txt. It is also available here:\r
+ * @see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
+ * \r
+ * Any republication or derived work distributed in source code form\r
+ * must include this copyright and license notice.\r
+ */\r
package compbio.data.sequence;\r
\r
import java.io.File;\r
import java.io.FileNotFoundException;\r
+import java.io.InputStream;\r
import java.util.Iterator;\r
import java.util.Scanner;\r
+import java.util.regex.MatchResult;\r
+\r
+import javax.vecmath.MismatchedSizeException;\r
\r
import compbio.util.Util;\r
\r
public class FastaReader implements Iterator<FastaSequence> {\r
\r
private final Scanner input;\r
-\r
+ /**\r
+ * Delimiter for the scanner\r
+ */\r
+ private final String DELIM=">";\r
/**
 * Opens the given file for reading FASTA formatted sequences. Header data
 * can contain non-ASCII symbols, so the file is read in UTF8.
 * <p>
 * The file stays open until {@link #close()} is called, and closing it is
 * the caller's responsibility. No JVM shutdown hook is registered here:
 * the runtime keeps every registered hook until exit, so a hook per reader
 * would keep the scanner and its buffer reachable for the whole lifetime of
 * the process even after the reader itself is no longer used.
 * 
 * @param inputFile
 *            the file containing the list of FASTA formatted sequences to
 *            read from
 * @throws FileNotFoundException
 *             if the input file is not found
 */
public FastaReader(final String inputFile) throws FileNotFoundException {
    input = new Scanner(new File(inputFile), "UTF8");
    input.useDelimiter(DELIM);
}
\r
/**
 * Reads FASTA formatted sequences from an already opened stream. The
 * stream is decoded as UTF8, the same encoding the file based constructor
 * uses, so non-ASCII header data is read the same way from either source.
 * <p>
 * This class will not close the incoming stream on its own, so the client
 * should do so. Note that an explicit call to {@link #close()} closes the
 * scanner and, through it, the supplied stream.
 * 
 * @param inputStream
 *            the stream to read the FASTA formatted sequences from
 * @throws FileNotFoundException
 *             not thrown by this constructor; the declaration is kept so
 *             that existing callers which catch it continue to compile
 */
public FastaReader(final InputStream inputStream)
        throws FileNotFoundException {
    input = new Scanner(inputStream, "UTF8");
    input.useDelimiter(DELIM);
}
+ /**\r
* {@inheritDoc}\r
+ * \r
+ * @throws IllegalStateException\r
+ * if the close method was called on this instance\r
*/\r
@Override\r
public boolean hasNext() {\r
* \r
* @throws AssertionError\r
* if the header or the sequence is missing\r
+ * @throws IllegalStateException\r
+ * if the close method was called on this instance\r
+ * @throws java.util.NoSuchElementException if there are no more FastaSequences to read.
*/\r
@Override\r
public FastaSequence next() {\r
- return FastaReader.toFastaSequence(input.next());\r
+ String fastaHeader=input.next();\r
+ while (fastaHeader.indexOf("\n")<0 && input.hasNext())\r
+ {\r
+ fastaHeader = fastaHeader.concat(">");\r
+ fastaHeader = fastaHeader.concat(input.next());\r
+ }\r
+ return FastaReader.toFastaSequence(fastaHeader);\r
}\r
\r
/**\r
throw new UnsupportedOperationException();\r
}\r
\r
+ /**\r
+ * Call this method to close the connection to the input file if you want to\r
+ * free up the resources. The connection will be closed on the JVM shutdown\r
+ * if this method was not called explicitly. No further reading on this\r
+ * instance of the FastaReader will be possible after calling this method.\r
+ */\r
+ public void close() {\r
+ input.close();\r
+ }\r
+\r
private static FastaSequence toFastaSequence(final String singleFastaEntry) {\r
- final Scanner sc = new Scanner(singleFastaEntry);\r
- // Use new line delimiter\r
- sc.useDelimiter("\n");\r
- if (!sc.hasNext()) {\r
+\r
+ assert !Util.isEmpty(singleFastaEntry) : "Empty String where FASTA sequence is expected!";\r
+\r
+ int nlineidx = singleFastaEntry.indexOf("\n");\r
+ if (nlineidx < 0) {\r
throw new AssertionError(\r
"The FASTA sequence must contain the header information"\r
+ " separated by the new line from the sequence. Given sequence does not appear to "\r
+ "contain the header! Given data:\n "\r
+ singleFastaEntry);\r
}\r
- String header = sc.next();\r
+ String header = singleFastaEntry.substring(0, nlineidx);\r
+\r
// Get rid of the new line chars (should cover common cases)\r
- header = header.replaceAll("\n", "").replaceAll("\r", "");\r
+ header = header.replaceAll("\r", "");\r
+\r
+ String sequence = singleFastaEntry.substring(nlineidx);\r
\r
- sc.useDelimiter("\\s*");\r
- final StringBuilder sb = new StringBuilder();\r
- while (sc.hasNext()) {\r
- sb.append(sc.next().trim());\r
- }\r
- final String sequence = sb.toString();\r
if (Util.isEmpty(sequence)) {\r
throw new AssertionError(\r
"Empty sequences are not allowed! Please make sure the "\r