+++ /dev/null
-package compbio.cassandra;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.InputStream;
-import java.util.Iterator;
-import java.util.Scanner;
-
-//import compbio.util.Util;
-
-/**
- * Reads files with FASTA formatted sequences. All the information in the FASTA
- * header is preserved including trailing white spaces. All the white spaces are
- * removed from the sequence.
- *
- * Examples of the correct input:
- *
- * <pre>
- *
- * >zedpshvyzg
- * GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD
- *
- * >xovkactesa
- * CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM
- * FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG
- * FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH
- * DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC
- *
- * >ntazzewyvv
- * CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD
- * EASINM AQQWRSLPPSRIMKLNG HGCDCMHSHMEAD DTKQSGIKGTFWNG HDAQWLCRWG
- * EFITEA WWGRWGAITFFHAH ENKNEIQECSDQNLKE SRTTCEIID TCHLFTRHLDGW
- * RCEKCQANATHMTW ACTKSCAEQW FCAKELMMN
- * W KQMGWRCKIFRKLFRDNCWID FELPWWPICFCCKGLSTKSHSAHDGDQCRRW WPDCARDWLGPGIRGEF
- * FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI
- *
- * > 12 d t y wi k jbke
- * KLSHHDCD
- * N
- * H
- * HSKCTEPHCGNSHQMLHRDP
- * CCDQCQSWEAENWCASMRKAILF
- *
- * </pre>
- *
- * @author Peter Troshin
- * @version 1.0 April 2011
- *
- */
-public class FastaReader implements Iterator<FastaSequence> {
-
- private final Scanner input;
- /**
- * Delimiter for the scanner
- */
- private final String DELIM = ">";
-
- /**
- * Header data can contain non-ASCII symbols and read in UTF8
- *
- * @param inputFile
- * the file containing the list of FASTA formatted sequences to
- * read from
- * @throws FileNotFoundException
- * if the input file is not found
- * @throws IllegalStateException
- * if the close method was called on this instance
- *
- */
- public FastaReader(final String inputFile) throws FileNotFoundException {
- input = new Scanner(new File(inputFile), "UTF8");
- input.useDelimiter(DELIM);
- Runtime.getRuntime().addShutdownHook(new Thread() {
-
- @Override
- public void run() {
- if (input != null) {
- input.close();
- }
- }
- });
- }
-
- /**
- * This class will not close the incoming stream! So the client should do
- * so.
- *
- * @param inputStream
- * @throws FileNotFoundException
- */
- public FastaReader(final InputStream inputStream)
- throws FileNotFoundException {
- input = new Scanner(inputStream);
- input.useDelimiter(DELIM);
- }
-
- /**
- * {@inheritDoc}
- *
- * @throws IllegalStateException
- * if the close method was called on this instance
- */
- @Override
- public boolean hasNext() {
- return input.hasNext();
- }
-
- /**
- * Reads the next FastaSequence from the input
- *
- * @throws AssertionError
- * if the header or the sequence is missing
- * @throws IllegalStateException
- * if the close method was called on this instance
- * @throws MismatchException
- * - if there were no more FastaSequence's.
- */
- @Override
- public FastaSequence next() {
- String fastaHeader = input.next();
- while (fastaHeader.indexOf("\n") < 0 && input.hasNext()) {
- fastaHeader = fastaHeader.concat(">");
- fastaHeader = fastaHeader.concat(input.next());
- }
- return FastaReader.toFastaSequence(fastaHeader);
- }
-
- /**
- * Not implemented
- */
- @Override
- public void remove() {
- throw new UnsupportedOperationException();
- }
-
- /**
- * Call this method to close the connection to the input file if you want to
- * free up the resources. The connection will be closed on the JVM shutdown
- * if this method was not called explicitly. No further reading on this
- * instance of the FastaReader will be possible after calling this method.
- */
- public void close() {
- input.close();
- }
-
- private static FastaSequence toFastaSequence(final String singleFastaEntry) {
-
- // assert !Util.isEmpty(singleFastaEntry) :
- // "Empty String where FASTA sequence is expected!";
-
- int nlineidx = singleFastaEntry.indexOf("\n");
- if (nlineidx < 0) {
- throw new AssertionError(
- "The FASTA sequence must contain the header information"
- + " separated by the new line from the sequence. Given sequence does not appear to "
- + "contain the header! Given data:\n "
- + singleFastaEntry);
- }
- String header = singleFastaEntry.substring(0, nlineidx);
-
- // Get rid of the new line chars (should cover common cases)
- header = header.replaceAll("\r", "");
-
- String sequence = singleFastaEntry.substring(nlineidx);
-
- /*
- * if (Util.isEmpty(sequence)) { throw new AssertionError(
- * "Empty sequences are not allowed! Please make sure the " +
- * " data is in the FASTA format! Given data:\n " + singleFastaEntry); }
- */
- return new FastaSequence(header, sequence);
- }
-}
+++ /dev/null
-package compbio.cassandra;
-
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import javax.xml.bind.annotation.XmlAccessType;
-import javax.xml.bind.annotation.XmlAccessorType;
-
-//import compbio.util.SysPrefs;
-//import compbio.util.annotation.Immutable;
-
-/**
- * A FASTA formatted sequence. Please note that this class does not make any
- * assumptions as to what sequence it stores e.g. it could be nucleotide,
- * protein or even gapped alignment sequence! The only guarantee it makes is
- * that the sequence does not contain white space characters e.g. spaces, new
- * lines etc
- *
- * @author pvtroshin
- *
- * @version 1.0 September 2009
- */
-
-@XmlAccessorType(XmlAccessType.FIELD)
-//@Immutable
public class FastaSequence {

    /**
     * Matches runs of white space that must be stripped from sequence data.
     */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /**
     * Line width used by {@link #getFormattedFasta()}.
     */
    private static final int DEFAULT_WIDTH = 80;

    /**
     * Sequence id
     */
    private String id;

    // TODO what about gapped sequence here! should be indicated
    /**
     * The sequence itself, free of white space when set through the public
     * constructor. May be null when the instance was created through the
     * default constructor.
     */
    private String sequence;

    FastaSequence() {
        // Default constructor for JaxB
    }

    /**
     * Upon construction any white space characters (spaces, tabs, line
     * breaks) are removed from the sequence. The id is stored unchanged.
     *
     * @param id
     *            the sequence id, usually the FASTA header line
     * @param sequence
     *            the sequence, possibly spanning several lines; null is
     *            stored as null
     */
    public FastaSequence(String id, String sequence) {
        this.id = id;
        this.sequence = (sequence == null) ? null : WHITESPACE.matcher(
                sequence).replaceAll("");
    }

    /**
     * Gets the value of id
     *
     * @return the value of id
     */
    public String getId() {
        return this.id;
    }

    /**
     * Gets the value of sequence
     *
     * @return the value of sequence
     */
    public String getSequence() {
        return this.sequence;
    }

    /**
     * Counts non-overlapping matches of a regular expression in a string.
     *
     * @param theString
     *            the text to search
     * @param theRegExp
     *            the regular expression to look for
     * @return the number of matches found
     * @throws java.util.regex.PatternSyntaxException
     *             if the expression is not a valid regular expression
     */
    public static int countMatchesInSequence(final String theString,
            final String theRegExp) {
        final Pattern p = Pattern.compile(theRegExp);
        final Matcher m = p.matcher(theString);
        int cnt = 0;
        while (m.find()) {
            cnt++;
        }
        return cnt;
    }

    /**
     * Returns the sequence wrapped at 80 letters per line. Note that, despite
     * the name, the id line is not part of the result.
     *
     * @return the sequence wrapped at 80 letters per line, or an empty string
     *         if there is no sequence
     */
    public String getFormattedFasta() {
        return getFormatedSequence(DEFAULT_WIDTH);
    }

    /**
     * Format sequence per width letter in one string. Without spaces.
     *
     * @param width
     *            the number of letters per line, must be positive
     * @return multiple line formated sequence, one line width letters length;
     *         lines are separated by "\n" and there is no trailing new line.
     *         An empty string is returned if there is no sequence
     * @throws IllegalArgumentException
     *             if there is a sequence and width is zero or negative
     */
    public String getFormatedSequence(final int width) {
        if (sequence == null) {
            return "";
        }
        if (width <= 0) {
            throw new IllegalArgumentException(
                    "Width must be positive but was " + width);
        }

        final int length = sequence.length();
        final StringBuilder sb = new StringBuilder(length + length / width);
        int start = 0;
        // Emit full lines while more than one line worth of letters is left.
        // The comparison is written as a subtraction so that it cannot
        // overflow for very large widths.
        while (length - start > width) {
            sb.append(sequence, start, start + width).append('\n');
            start += width;
        }
        // The last (possibly full) line is written without a new line.
        sb.append(sequence, start, length);
        return sb.toString();
    }

    /**
     *
     * @return sequence length, 0 if there is no sequence
     */
    public int getLength() {
        return (this.sequence == null) ? 0 : this.sequence.length();
    }

    /**
     * Hash code consistent with {@link #equals(Object)}: the id is hashed as
     * is, the sequence is hashed ignoring the letter case.
     */
    @Override
    public int hashCode() {
        final int prime = 17;
        int result = 1;
        result = prime * result + ((id == null) ? 0 : id.hashCode());
        result = prime * result + caseInsensitiveHash(sequence);
        return result;
    }

    /**
     * Two sequences are equal if their ids are equal and their sequences are
     * equal ignoring the letter case. Null ids and null sequences are only
     * equal to null.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof FastaSequence)) {
            return false;
        }
        final FastaSequence other = (FastaSequence) obj;
        final boolean sameId = (id == null) ? other.id == null : id
                .equals(other.id);
        if (!sameId) {
            return false;
        }
        return (sequence == null) ? other.sequence == null : sequence
                .equalsIgnoreCase(other.sequence);
    }

    /**
     * Hashes a string so that any two strings considered equal by
     * {@link String#equalsIgnoreCase(String)} get the same value. Each
     * character is folded to upper and then to lower case, which are the
     * foldings that comparison applies.
     */
    private static int caseInsensitiveHash(final String text) {
        if (text == null) {
            return 0;
        }
        int hash = 0;
        for (int i = 0; i < text.length(); i++) {
            hash = 31 * hash
                    + Character.toLowerCase(Character.toUpperCase(text
                            .charAt(i)));
        }
        return hash;
    }

}