From ca1259daa626e97eb62ee994addb6bd619360b64 Mon Sep 17 00:00:00 2001 From: Sasha Sherstnev Date: Tue, 19 Nov 2013 15:28:15 +0000 Subject: [PATCH] Remove internal FastaReader/FastaSequence and replace this with classes from JABAWS --- datadb/compbio/cassandra/CassandraWriter.java | 1 + datadb/compbio/cassandra/FastaReader.java | 173 ------------------- datadb/compbio/cassandra/FastaSequence.java | 179 -------------------- datadb/compbio/cassandra/JpredParserHTTP.java | 2 + datadb/compbio/cassandra/JpredParserLocalFile.java | 3 + engine/compbio/engine/JpredJob.java | 2 +- 6 files changed, 7 insertions(+), 353 deletions(-) delete mode 100644 datadb/compbio/cassandra/FastaReader.java delete mode 100644 datadb/compbio/cassandra/FastaSequence.java diff --git a/datadb/compbio/cassandra/CassandraWriter.java b/datadb/compbio/cassandra/CassandraWriter.java index 48bbda7..24abd4b 100644 --- a/datadb/compbio/cassandra/CassandraWriter.java +++ b/datadb/compbio/cassandra/CassandraWriter.java @@ -8,6 +8,7 @@ import com.datastax.driver.core.Row; import com.datastax.driver.core.Session; import com.datastax.driver.core.ResultSet; +import compbio.data.sequence.FastaSequence; import compbio.engine.JpredJob; import compbio.engine.ProteoCachePropertyHelperManager; import compbio.util.PropertyHelper; diff --git a/datadb/compbio/cassandra/FastaReader.java b/datadb/compbio/cassandra/FastaReader.java deleted file mode 100644 index 4783b14..0000000 --- a/datadb/compbio/cassandra/FastaReader.java +++ /dev/null @@ -1,173 +0,0 @@ -package compbio.cassandra; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.InputStream; -import java.util.Iterator; -import java.util.Scanner; - -//import compbio.util.Util; - -/** - * Reads files with FASTA formatted sequences. All the information in the FASTA - * header is preserved including trailing white spaces. All the white spaces are - * removed from the sequence. - * - * Examples of the correct input: - * - *
- * 
- * >zedpshvyzg
- * GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD
- * 
- * >xovkactesa
- * CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM
- * FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG
- * FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH
- * DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC
- * 
- * >ntazzewyvv
- * CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD
- * EASINM	AQQWRSLPPSRIMKLNG	HGCDCMHSHMEAD	DTKQSGIKGTFWNG	HDAQWLCRWG	
- * EFITEA	WWGRWGAITFFHAH	ENKNEIQECSDQNLKE	SRTTCEIID   TCHLFTRHLDGW 
- *   RCEKCQANATHMTW ACTKSCAEQW  FCAKELMMN    
- *   W        KQMGWRCKIFRKLFRDNCWID  FELPWWPICFCCKGLSTKSHSAHDGDQCRRW    WPDCARDWLGPGIRGEF   
- *   FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI
- * 
- *    > 12 d t y wi 		k	jbke  	
- *   KLSHHDCD
- *    N
- *     H
- *     HSKCTEPHCGNSHQMLHRDP
- *     CCDQCQSWEAENWCASMRKAILF
- * 
- * 
- * - * @author Peter Troshin - * @version 1.0 April 2011 - * - */ -public class FastaReader implements Iterator { - - private final Scanner input; - /** - * Delimiter for the scanner - */ - private final String DELIM = ">"; - - /** - * Header data can contain non-ASCII symbols and read in UTF8 - * - * @param inputFile - * the file containing the list of FASTA formatted sequences to - * read from - * @throws FileNotFoundException - * if the input file is not found - * @throws IllegalStateException - * if the close method was called on this instance - * - */ - public FastaReader(final String inputFile) throws FileNotFoundException { - input = new Scanner(new File(inputFile), "UTF8"); - input.useDelimiter(DELIM); - Runtime.getRuntime().addShutdownHook(new Thread() { - - @Override - public void run() { - if (input != null) { - input.close(); - } - } - }); - } - - /** - * This class will not close the incoming stream! So the client should do - * so. - * - * @param inputStream - * @throws FileNotFoundException - */ - public FastaReader(final InputStream inputStream) - throws FileNotFoundException { - input = new Scanner(inputStream); - input.useDelimiter(DELIM); - } - - /** - * {@inheritDoc} - * - * @throws IllegalStateException - * if the close method was called on this instance - */ - @Override - public boolean hasNext() { - return input.hasNext(); - } - - /** - * Reads the next FastaSequence from the input - * - * @throws AssertionError - * if the header or the sequence is missing - * @throws IllegalStateException - * if the close method was called on this instance - * @throws MismatchException - * - if there were no more FastaSequence's. - */ - @Override - public FastaSequence next() { - String fastaHeader = input.next(); - while (fastaHeader.indexOf("\n") < 0 && input.hasNext()) { - fastaHeader = fastaHeader.concat(">"); - fastaHeader = fastaHeader.concat(input.next()); - } - return FastaReader.toFastaSequence(fastaHeader); - } - - /** - * Not implemented - */ - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - - /** - * Call this method to close the connection to the input file if you want to - * free up the resources. The connection will be closed on the JVM shutdown - * if this method was not called explicitly. No further reading on this - * instance of the FastaReader will be possible after calling this method. - */ - public void close() { - input.close(); - } - - private static FastaSequence toFastaSequence(final String singleFastaEntry) { - - // assert !Util.isEmpty(singleFastaEntry) : - // "Empty String where FASTA sequence is expected!"; - - int nlineidx = singleFastaEntry.indexOf("\n"); - if (nlineidx < 0) { - throw new AssertionError( - "The FASTA sequence must contain the header information" - + " separated by the new line from the sequence. Given sequence does not appear to " - + "contain the header! Given data:\n " - + singleFastaEntry); - } - String header = singleFastaEntry.substring(0, nlineidx); - - // Get rid of the new line chars (should cover common cases) - header = header.replaceAll("\r", ""); - - String sequence = singleFastaEntry.substring(nlineidx); - - /* - * if (Util.isEmpty(sequence)) { throw new AssertionError( - * "Empty sequences are not allowed! Please make sure the " + - * " data is in the FASTA format! Given data:\n " + singleFastaEntry); } - */ - return new FastaSequence(header, sequence); - } -} diff --git a/datadb/compbio/cassandra/FastaSequence.java b/datadb/compbio/cassandra/FastaSequence.java deleted file mode 100644 index 61f49c7..0000000 --- a/datadb/compbio/cassandra/FastaSequence.java +++ /dev/null @@ -1,179 +0,0 @@ -package compbio.cassandra; - -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import javax.xml.bind.annotation.XmlAccessType; -import javax.xml.bind.annotation.XmlAccessorType; - -//import compbio.util.SysPrefs; -//import compbio.util.annotation.Immutable; - -/** - * A FASTA formatted sequence. Please note that this class does not make any - * assumptions as to what sequence it stores e.g. it could be nucleotide, - * protein or even gapped alignment sequence! The only guarantee it makes is - * that the sequence does not contain white space characters e.g. spaces, new - * lines etc - * - * @author pvtroshin - * - * @version 1.0 September 2009 - */ - -@XmlAccessorType(XmlAccessType.FIELD) -//@Immutable -public class FastaSequence { - - /** - * Sequence id - */ - private String id; - - // TODO what about gapped sequence here! should be indicated - /** - * Returns the string representation of sequence - */ - private String sequence; - - FastaSequence() { - // Default constructor for JaxB - } - - /** - * Upon construction the any whitespace characters are removed from the - * sequence - * - * @param id - * @param sequence - */ - public FastaSequence(String id, String sequence) { - this.id = id; - this.sequence = sequence; - } - - /** - * Gets the value of id - * - * @return the value of id - */ - public String getId() { - return this.id; - } - - /** - * Gets the value of sequence - * - * @return the value of sequence - */ - public String getSequence() { - return this.sequence; - } - - public static int countMatchesInSequence(final String theString, - final String theRegExp) { - final Pattern p = Pattern.compile(theRegExp); - final Matcher m = p.matcher(theString); - int cnt = 0; - while (m.find()) { - cnt++; - } - return cnt; - } - - public String getFormattedFasta() { - return getFormatedSequence(80); - } - - /** - * - * @return one line name, next line sequence, no matter what the sequence - * length is - */ -/* public String getOnelineFasta() { - String fasta = ">" + getId() + SysPrefs.newlinechar; - fasta += getSequence() + SysPrefs.newlinechar; - return fasta; - } - - /** - * Format sequence per width letter in one string. Without spaces. - * - * @return multiple line formated sequence, one line width letters length - * - */ - public String getFormatedSequence(final int width) { - if (sequence == null) { - return ""; - } - - assert width >= 0 : "Wrong width parameter "; - - final StringBuilder sb = new StringBuilder(sequence); - // int tail = nrOfWindows % WIN_SIZE; - // final int turns = (nrOfWindows - tail) / WIN_SIZE; - - int tailLen = sequence.length() % width; - // add up inserted new line chars - int nchunks = (sequence.length() - tailLen) / width; - int nlineCharcounter = 0; - int insPos = 0; - for (int i = 1; i <= nchunks; i++) { - insPos = width * i + nlineCharcounter; - // to prevent inserting new line in the very end of a sequence then - // it would have failed. - if (sb.length() <= insPos) { - break; - } - sb.insert(insPos, "\n"); - nlineCharcounter++; - } - // sb.insert(insPos + tailLen, "\n"); - return sb.toString(); - } - - /** - * - * @return sequence length - */ - public int getLength() { - return this.sequence.length(); - } - - /** - * Same as oneLineFasta - */ -// @Override -// public String toString() { -// return this.getOnelineFasta(); - // } - - @Override - public int hashCode() { - final int prime = 17; - int result = 1; - result = prime * result + ((id == null) ? 0 : id.hashCode()); - result = prime * result - + ((sequence == null) ? 0 : sequence.hashCode()); - return result; - } - - @Override - public boolean equals(Object obj) { - if (obj == null) { - return false; - } - if (!(obj instanceof FastaSequence)) { - return false; - } - FastaSequence fs = (FastaSequence) obj; - if (!fs.getId().equals(this.getId())) { - return false; - } - if (!fs.getSequence().equalsIgnoreCase(this.getSequence())) { - return false; - } - return true; - } - -} diff --git a/datadb/compbio/cassandra/JpredParserHTTP.java b/datadb/compbio/cassandra/JpredParserHTTP.java index e53ddd8..2167a2d 100644 --- a/datadb/compbio/cassandra/JpredParserHTTP.java +++ b/datadb/compbio/cassandra/JpredParserHTTP.java @@ -15,6 +15,8 @@ import java.util.Date; import java.util.List; import compbio.cassandra.JpredParser; +import compbio.data.sequence.FastaReader; +import compbio.data.sequence.FastaSequence; import compbio.engine.JpredJob; public class JpredParserHTTP implements JpredParser { diff --git a/datadb/compbio/cassandra/JpredParserLocalFile.java b/datadb/compbio/cassandra/JpredParserLocalFile.java index d4a22e6..f48178c 100644 --- a/datadb/compbio/cassandra/JpredParserLocalFile.java +++ b/datadb/compbio/cassandra/JpredParserLocalFile.java @@ -15,6 +15,9 @@ import java.util.Calendar; import java.util.Date; import java.util.List; +import compbio.data.sequence.FastaReader; +import compbio.data.sequence.FastaSequence; + public class JpredParserLocalFile implements JpredParser { private CassandraWriter cw = new CassandraWriter(); private String dirprefix; diff --git a/engine/compbio/engine/JpredJob.java b/engine/compbio/engine/JpredJob.java index a606746..db4a7fe 100644 --- a/engine/compbio/engine/JpredJob.java +++ b/engine/compbio/engine/JpredJob.java @@ -3,7 +3,7 @@ package compbio.engine; import java.util.ArrayList; import java.util.List; -import compbio.cassandra.FastaSequence; +import compbio.data.sequence.FastaSequence; public class JpredJob extends Job { private String protein; -- 1.7.10.2