From: pvtroshin Date: Wed, 27 Apr 2011 23:10:25 +0000 (+0000) Subject: New FastaReader implementing iterator & test case for it X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=26166c1ca1dd3761305024cef6ee6ae67c6c135f;p=jabaws.git New FastaReader implementing iterator & test case for it git-svn-id: link to svn.lifesci.dundee.ac.uk/svn/barton/ptroshin/JABA2@4041 e3abac25-378b-4346-85de-24260fe3988d --- diff --git a/datamodel/compbio/data/sequence/FastaReader.java b/datamodel/compbio/data/sequence/FastaReader.java new file mode 100644 index 0000000..10eec8f --- /dev/null +++ b/datamodel/compbio/data/sequence/FastaReader.java @@ -0,0 +1,123 @@ +package compbio.data.sequence; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.Iterator; +import java.util.Scanner; + +import compbio.util.Util; + +/** + * Reads files with FASTA formatted sequences. All the information in the FASTA + * header is preserved including trailing white spaces. All the white spaces are + * removed from the sequence. + * + * Examples of the correct input: + * + *
+ * 
+ * >zedpshvyzg
+ * GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD
+ * 
+ * >xovkactesa
+ * CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM
+ * FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG
+ * FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH
+ * DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC
+ * 
+ * >ntazzewyvv
+ * CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD
+ * EASINM	AQQWRSLPPSRIMKLNG	HGCDCMHSHMEAD	DTKQSGIKGTFWNG	HDAQWLCRWG	
+ * EFITEA	WWGRWGAITFFHAH	ENKNEIQECSDQNLKE	SRTTCEIID   TCHLFTRHLDGW 
+ *   RCEKCQANATHMTW ACTKSCAEQW  FCAKELMMN    
+ *   W        KQMGWRCKIFRKLFRDNCWID  FELPWWPICFCCKGLSTKSHSAHDGDQCRRW    WPDCARDWLGPGIRGEF   
+ *   FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI
+ * 
+ *    > 12 d t y wi 		k	jbke  	
+ *   KLSHHDCD
+ *    N
+ *     H
+ *     HSKCTEPHCGNSHQMLHRDP
+ *     CCDQCQSWEAENWCASMRKAILF
+ * 
+ * 
+ * + * @author Peter Troshin + * @version 1.0 April 2011 + * + */ +public class FastaReader implements Iterator { + + private final Scanner input; + + /** + * Header data can contain non-ASCII symbols and read in UTF8 + * + * @param input + * the file containing the list of FASTA formatted sequences to + * read from + * @throws FileNotFoundException + * if the input file is not found + */ + public FastaReader(final String input) throws FileNotFoundException { + this.input = new Scanner(new File(input), "UTF8"); + this.input.useDelimiter("\\s*>"); + } + + /** + * {@inheritDoc} + */ + @Override + public boolean hasNext() { + return input.hasNext(); + } + + /** + * Reads the next FastaSequence from the input + * + * @throws AssertionError + * if the header or the sequence is missing + */ + @Override + public FastaSequence next() { + return FastaReader.toFastaSequence(input.next()); + } + + /** + * Not implemented + */ + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + + private static FastaSequence toFastaSequence(final String singleFastaEntry) { + final Scanner sc = new Scanner(singleFastaEntry); + // Use new line delimiter + sc.useDelimiter("\n"); + if (!sc.hasNext()) { + throw new AssertionError( + "The FASTA sequence must contain the header information" + + " separated by the new line from the sequence. Given sequence does not appear to " + + "contain the header! Given data:\n " + + singleFastaEntry); + } + String header = sc.next(); + // Get rid of the new line chars (should cover common cases) + header = header.replaceAll("\n", "").replaceAll("\r", ""); + + sc.useDelimiter("\\s*"); + final StringBuilder sb = new StringBuilder(); + while (sc.hasNext()) { + sb.append(sc.next().trim()); + } + final String sequence = sb.toString(); + if (Util.isEmpty(sequence)) { + throw new AssertionError( + "Empty sequences are not allowed! Please make sure the " + + " data is in the FASTA format! Given data:\n " + + singleFastaEntry); + } + return new FastaSequence(header, sequence); + } +} diff --git a/testsrc/compbio/data/sequence/FastaReaderTester.java b/testsrc/compbio/data/sequence/FastaReaderTester.java new file mode 100644 index 0000000..77a7fd5 --- /dev/null +++ b/testsrc/compbio/data/sequence/FastaReaderTester.java @@ -0,0 +1,73 @@ +package compbio.data.sequence; + +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.testng.Assert; +import org.testng.annotations.Test; + +import compbio.metadata.AllTestSuit; + +public class FastaReaderTester { + + static FastaSequence s0 = new FastaSequence( + "zedpshvyzg", + "GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD"); + + static FastaSequence s1 = new FastaSequence( + "xovkactesa", + "CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM" + + "FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG" + + "FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH" + + "DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC"); + static FastaSequence s2 = new FastaSequence( + "ntazzewyvv", + "CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD" + + "EASINM AQQWRSLPPSRIMKLNG HGCDCMHSHMEAD DTKQSGIKGTFWNG HDAQWLCRWG" + + "EFITEA WWGRWGAITFFHAH ENKNEIQECSDQNLKE SRTTCEIID TCHLFTRHLDGW" + + " RCEKCQANATHMTW ACTKSCAEQW FCAKELMMN " + + "W KQMGWRCKIFRKLFRDNCWID FELPWWPICFCCKGLSTKSHSAHDGDQCRRW WPDCARDWLGPGIRGEF " + + "FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI"); + static FastaSequence s3 = new FastaSequence(" 12 d t y wi k jbke ", + " KLSHHDCD" + " N" + " H" + " HSKCTEPHCGNSHQML\n\rHRDP" + + " CCDQCQSWEAENWCASMRKAILF"); + @Test() + public void test() { + + List old_seqs = null; + final List list = new ArrayList(); + try { + old_seqs = SequenceUtil.readFasta(new FileInputStream( + AllTestSuit.TEST_DATA_PATH + "complicated.fasta")); + final FastaReader fr = new FastaReader(AllTestSuit.TEST_DATA_PATH + + "complicated.fasta"); + + while (fr.hasNext()) { + final FastaSequence fs = fr.next(); + list.add(fs); + } + } catch (final FileNotFoundException e) { + e.printStackTrace(); + Assert.fail(e.getLocalizedMessage()); + } catch (final IOException e) { + e.printStackTrace(); + Assert.fail(e.getLocalizedMessage()); + } + System.out.println("OLD: " + old_seqs); + System.out.println("NEW: " + list); + Assert.assertEquals(old_seqs.size() + 1, list.size()); + Assert.assertEquals(old_seqs.get(0), list.get(0)); + Assert.assertEquals(old_seqs.get(1), list.get(1)); + // Assert.assertEquals(old_seqs.get(2), list.get(2)); + // Assert.assertEquals(seqs.get(3), list.get(3)); + + Assert.assertEquals(FastaReaderTester.s0, list.get(0)); + Assert.assertEquals(FastaReaderTester.s1, list.get(1)); + Assert.assertEquals(FastaReaderTester.s2, list.get(2)); + Assert.assertEquals(FastaReaderTester.s3, list.get(3)); + + } +} diff --git a/testsrc/testdata/complicated.fasta b/testsrc/testdata/complicated.fasta new file mode 100644 index 0000000..3c891d6 --- /dev/null +++ b/testsrc/testdata/complicated.fasta @@ -0,0 +1,23 @@ +>zedpshvyzg +GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD + +>xovkactesa +CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM +FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG +FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH +DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC + +>ntazzewyvv +CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD +EASINM AQQWRSLPPSRIMKLNG HGCDCMHSHMEAD DTKQSGIKGTFWNG HDAQWLCRWG +EFITEA WWGRWGAITFFHAH ENKNEIQECSDQNLKE SRTTCEIID TCHLFTRHLDGW + RCEKCQANATHMTW ACTKSCAEQW FCAKELMMN + W KQMGWRCKIFRKLFRDNCWID FELPWWPICFCCKGLSTKSHSAHDGDQCRRW WPDCARDWLGPGIRGEF + FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI + + > 12 d t y wi k jbke + KLSHHDCD + N + H + HSKCTEPHCGNSHQMLHRDP + CCDQCQSWEAENWCASMRKAILF \ No newline at end of file