package compbio.data.sequence; import java.io.File; import java.io.FileNotFoundException; import java.util.Iterator; import java.util.Scanner; import compbio.util.Util; /** * Reads files with FASTA formatted sequences. All the information in the FASTA * header is preserved including trailing white spaces. All the white spaces are * removed from the sequence. * * Examples of the correct input: * *
 * 
 * >zedpshvyzg
 * GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD
 * 
 * >xovkactesa
 * CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM
 * FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG
 * FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH
 * DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC
 * 
 * >ntazzewyvv
 * CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD
 * EASINM	AQQWRSLPPSRIMKLNG	HGCDCMHSHMEAD	DTKQSGIKGTFWNG	HDAQWLCRWG	
 * EFITEA	WWGRWGAITFFHAH	ENKNEIQECSDQNLKE	SRTTCEIID   TCHLFTRHLDGW 
 *   RCEKCQANATHMTW ACTKSCAEQW  FCAKELMMN    
 *   W        KQMGWRCKIFRKLFRDNCWID  FELPWWPICFCCKGLSTKSHSAHDGDQCRRW    WPDCARDWLGPGIRGEF   
 *   FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI
 * 
 *    > 12 d t y wi 		k	jbke  	
 *   KLSHHDCD
 *    N
 *     H
 *     HSKCTEPHCGNSHQMLHRDP
 *     CCDQCQSWEAENWCASMRKAILF
 * 
 * 
*
 * @author Peter Troshin
 * @version 1.0 April 2011
 */
// java.io.Closeable is written fully qualified so that no new import is needed.
public class FastaReader implements Iterator<FastaSequence>, java.io.Closeable {

    /**
     * Record separator: an optional run of horizontal white space, a line
     * break, any further white space, then the '>' marker. Requiring the line
     * break means a '>' inside a header line no longer splits the record.
     * Possessive quantifiers keep matching linear on long blank runs.
     */
    private static final String ENTRY_DELIMITER = "[^\\S\\r\\n]*+[\\r\\n]\\s*+>";

    /** Matches the '>' marker (and preceding blanks) at the very start of the file. */
    private static final String LEADING_MARKER = "^\\s*>";

    private final Scanner input;

    /** Hook closing the input on JVM shutdown; deregistered by {@link #close()}. */
    private final Thread shutdownHook;

    /**
     * True until the first entry has been returned. Only the first entry can
     * still carry its '>' marker, because no line break precedes it.
     */
    private boolean firstEntry = true;

    /**
     * Opens the given file for reading. Header data can contain non-ASCII
     * symbols and is read as UTF8.
     *
     * @param inputFile
     *            the file containing the list of FASTA formatted sequences to
     *            read from
     * @throws FileNotFoundException
     *             if the input file is not found
     */
    public FastaReader(final String inputFile) throws FileNotFoundException {
        input = new Scanner(new File(inputFile), "UTF8");
        input.useDelimiter(ENTRY_DELIMITER);
        shutdownHook = new CloseOnShutdown(input);
        Runtime.getRuntime().addShutdownHook(shutdownHook);
    }

    /**
     * {@inheritDoc}
     *
     * @throws IllegalStateException
     *             if the close method was called on this instance
     */
    @Override
    public boolean hasNext() {
        return input.hasNext();
    }

    /**
     * Reads the next FastaSequence from the input
     *
     * @throws AssertionError
     *             if the header or the sequence is missing
     * @throws java.util.NoSuchElementException
     *             if there are no more entries to read
     * @throws IllegalStateException
     *             if the close method was called on this instance
     */
    @Override
    public FastaSequence next() {
        String entry = input.next();
        if (firstEntry) {
            firstEntry = false;
            // The delimiter needs a preceding line break, which the very first
            // record of a file does not have, so drop its marker here.
            entry = entry.replaceFirst(LEADING_MARKER, "");
        }
        return FastaReader.toFastaSequence(entry);
    }

    /**
     * Not implemented
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }

    /**
     * Call this method to close the connection to the input file if you want to
     * free up the resources. The connection will be closed on the JVM shutdown
     * if this method was not called explicitly. No further reading on this
     * instance of the FastaReader will be possible after calling this method.
     * Calling it more than once is harmless.
     */
    @Override
    public void close() {
        input.close();
        try {
            // Without this the hook would keep the scanner reachable until the
            // JVM exits, leaking one thread object per reader ever created.
            Runtime.getRuntime().removeShutdownHook(shutdownHook);
        } catch (final IllegalStateException ignored) {
            // The JVM is already shutting down; the hook is running or has
            // run, so there is nothing left to deregister.
        }
    }

    /**
     * Splits one raw FASTA entry (without its leading '>') into header and
     * sequence. The header is the first line with all carriage returns
     * removed; the sequence is everything after it with every character up to
     * and including the space (U+0020) removed.
     *
     * @param singleFastaEntry
     *            the text of a single record
     * @return the parsed record
     * @throws AssertionError
     *             if the header or the sequence is missing
     */
    private static FastaSequence toFastaSequence(final String singleFastaEntry) {
        final int length = singleFastaEntry.length();
        // A single leading line feed is skipped before the header.
        final int headerStart = singleFastaEntry.startsWith("\n") ? 1 : 0;
        if (headerStart == length) {
            throw new AssertionError(
                    "The FASTA sequence must contain the header information"
                            + " separated by the new line from the sequence. Given sequence does not appear to "
                            + "contain the header! Given data:\n "
                            + singleFastaEntry);
        }
        final int lineFeed = singleFastaEntry.indexOf('\n', headerStart);
        final int headerEnd = lineFeed < 0 ? length : lineFeed;
        // Get rid of the new line chars (should cover common cases)
        final String header = singleFastaEntry.substring(headerStart, headerEnd)
                .replace("\r", "");

        final String sequence = removeBlanks(singleFastaEntry, headerEnd);
        if (Util.isEmpty(sequence)) {
            throw new AssertionError(
                    "Empty sequences are not allowed! Please make sure the "
                            + " data is in the FASTA format! Given data:\n "
                            + singleFastaEntry);
        }
        return new FastaSequence(header, sequence);
    }

    /**
     * Returns the characters of {@code text} from index {@code from} onwards,
     * dropping every character less than or equal to the space (U+0020).
     */
    private static String removeBlanks(final String text, final int from) {
        final StringBuilder kept = new StringBuilder(text.length() - from);
        for (int i = from; i < text.length(); i++) {
            final char c = text.charAt(i);
            if (c > ' ') {
                kept.append(c);
            }
        }
        return kept.toString();
    }

    /**
     * Shutdown hook that closes a scanner. Static so that it references only
     * the scanner and not the enclosing reader.
     */
    private static final class CloseOnShutdown extends Thread {

        private final Scanner source;

        CloseOnShutdown(final Scanner source) {
            this.source = source;
        }

        @Override
        public void run() {
            source.close();
        }
    }
}