Remove the internal FastaReader/FastaSequence classes and replace them with the classes from JABAWS
author Sasha Sherstnev <a.sherstnev@dundee.ac.uk>
Tue, 19 Nov 2013 15:28:15 +0000 (15:28 +0000)
committer Sasha Sherstnev <a.sherstnev@dundee.ac.uk>
Tue, 19 Nov 2013 15:28:15 +0000 (15:28 +0000)
datadb/compbio/cassandra/CassandraWriter.java
datadb/compbio/cassandra/FastaReader.java [deleted file]
datadb/compbio/cassandra/FastaSequence.java [deleted file]
datadb/compbio/cassandra/JpredParserHTTP.java
datadb/compbio/cassandra/JpredParserLocalFile.java
engine/compbio/engine/JpredJob.java

index 48bbda7..24abd4b 100644 (file)
@@ -8,6 +8,7 @@ import com.datastax.driver.core.Row;
 import com.datastax.driver.core.Session;
 import com.datastax.driver.core.ResultSet;
 
+import compbio.data.sequence.FastaSequence;
 import compbio.engine.JpredJob;
 import compbio.engine.ProteoCachePropertyHelperManager;
 import compbio.util.PropertyHelper;
diff --git a/datadb/compbio/cassandra/FastaReader.java b/datadb/compbio/cassandra/FastaReader.java
deleted file mode 100644 (file)
index 4783b14..0000000
+++ /dev/null
@@ -1,173 +0,0 @@
-package compbio.cassandra;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.InputStream;
-import java.util.Iterator;
-import java.util.Scanner;
-
-//import compbio.util.Util;
-
-/**
- * Reads files with FASTA formatted sequences. All the information in the FASTA
- * header is preserved including trailing white spaces. All the white spaces are
- * removed from the sequence.
- * 
- * Examples of the correct input:
- * 
- * <pre>
- * 
- * >zedpshvyzg
- * GCQDKNNIAELNEIMGTTRSPSDWQHMKGASPRAEIGLTGKKDSWWRHCCSKEFNKTPPPIHPDMKRWGWMWNRENFEKFLIDNFLNPPCPRLMLTKGTWWRHEDLCHEIFWSTLRWLCLGNQSFSAMIWGHLCECHRMIWWESNEHMFWLKFRRALKKMNSNGPCMGPDNREWMITNRMGKEFCGPAFAGDCQSCWRKCHKTNKICFNEKKGTPTKIDHEQKDIMDILKDIDNHRNWKQCQLWLLTSKSTDQESTTMLTWSTWRDFFIIIKQPFDHKCRGALDANGDFQIAAELKWPAPMIILRQNQKTMHDKSCHHFFTNRCPLMHTTRANDKQCSWHTRKQFICQQDFTTWQHRPDTHRILPSWCMSTRRKNHIKNTPALAFSTCEMGDLPNGWAPGTIILQRQFTQAIKLPQETTGWPRCDPKFDHWNMSKWLRQLLGRDDEMIPPQCD
- * 
- * >xovkactesa
- * CPLSKWWNRRAFLSHTANHWMILMTWEGPHDGESKMRIAMMKWSPCKPTMSHFRCGLDAWAEPIRQIACESTFRM
- * FCTTPRPIHKLTEMWGHMNGWTGAFCRQLECEWMMPPRHPHPCTSTFNNNKKRLIGQIPNEGKQLFINFQKPQHG
- * FSESDIWIWKDNPTAWHEGLTIAGIGDGQHCWNWMPMPWSGAPTSNALIEFWTWLGMIGTRCKTQGMWWDAMNHH
- * DQFELSANAHIAAHHMEKKMILKPDDRNLGDDTWMPPGKIWMRMFAKNTNACWPEGCRDDNEEDDCGTHNLHRMC
- * 
- * >ntazzewyvv
- * CGCKIF D D NMKDNNRHG TDIKKHGFMH IRHPE KRDDC FDNHCIMPKHRRWGLWD
- * EASINM      AQQWRSLPPSRIMKLNG       HGCDCMHSHMEAD   DTKQSGIKGTFWNG  HDAQWLCRWG      
- * EFITEA      WWGRWGAITFFHAH  ENKNEIQECSDQNLKE        SRTTCEIID   TCHLFTRHLDGW 
- *   RCEKCQANATHMTW ACTKSCAEQW  FCAKELMMN    
- *   W        KQMGWRCKIFRKLFRDNCWID  FELPWWPICFCCKGLSTKSHSAHDGDQCRRW    WPDCARDWLGPGIRGEF   
- *   FCTHICQQLQRNFWCGCFRWNIEKRMFEIFDDNMAAHWKKCMHFKFLIRIHRHGPITMKMTWCRSGCCFGKTRRLPDSSFISAFLDPKHHRDGSGMMMWSSEMRSCAIPDPQQAWNQGKWIGQIKDWNICFAWPIRENQQCWATPHEMPSGFHFILEKWDALAHPHMHIRQKKCWAWAFLSLMSSTHSDMATFQWAIPGHNIWSNWDNIICGWPRI
- * 
- *    > 12 d t y wi            k       jbke    
- *   KLSHHDCD
- *    N
- *     H
- *     HSKCTEPHCGNSHQMLHRDP
- *     CCDQCQSWEAENWCASMRKAILF
- * 
- * </pre>
- * 
- * @author Peter Troshin
- * @version 1.0 April 2011
- * 
- */
-public class FastaReader implements Iterator<FastaSequence> {
-
-       private final Scanner input;
-       /**
-        * Delimiter for the scanner
-        */
-       private final String DELIM = ">";
-
-       /**
-        * Header data can contain non-ASCII symbols and read in UTF8
-        * 
-        * @param inputFile
-        *            the file containing the list of FASTA formatted sequences to
-        *            read from
-        * @throws FileNotFoundException
-        *             if the input file is not found
-        * @throws IllegalStateException
-        *             if the close method was called on this instance
-        * 
-        */
-       public FastaReader(final String inputFile) throws FileNotFoundException {
-               input = new Scanner(new File(inputFile), "UTF8");
-               input.useDelimiter(DELIM);
-               Runtime.getRuntime().addShutdownHook(new Thread() {
-
-                       @Override
-                       public void run() {
-                               if (input != null) {
-                                       input.close();
-                               }
-                       }
-               });
-       }
-
-       /**
-        * This class will not close the incoming stream! So the client should do
-        * so.
-        * 
-        * @param inputStream
-        * @throws FileNotFoundException
-        */
-       public FastaReader(final InputStream inputStream)
-                       throws FileNotFoundException {
-               input = new Scanner(inputStream);
-               input.useDelimiter(DELIM);
-       }
-
-       /**
-        * {@inheritDoc}
-        * 
-        * @throws IllegalStateException
-        *             if the close method was called on this instance
-        */
-       @Override
-       public boolean hasNext() {
-               return input.hasNext();
-       }
-
-       /**
-        * Reads the next FastaSequence from the input
-        * 
-        * @throws AssertionError
-        *             if the header or the sequence is missing
-        * @throws IllegalStateException
-        *             if the close method was called on this instance
-        * @throws MismatchException
-        *             - if there were no more FastaSequence's.
-        */
-       @Override
-       public FastaSequence next() {
-               String fastaHeader = input.next();
-               while (fastaHeader.indexOf("\n") < 0 && input.hasNext()) {
-                       fastaHeader = fastaHeader.concat(">");
-                       fastaHeader = fastaHeader.concat(input.next());
-               }
-               return FastaReader.toFastaSequence(fastaHeader);
-       }
-
-       /**
-        * Not implemented
-        */
-       @Override
-       public void remove() {
-               throw new UnsupportedOperationException();
-       }
-
-       /**
-        * Call this method to close the connection to the input file if you want to
-        * free up the resources. The connection will be closed on the JVM shutdown
-        * if this method was not called explicitly. No further reading on this
-        * instance of the FastaReader will be possible after calling this method.
-        */
-       public void close() {
-               input.close();
-       }
-
-       private static FastaSequence toFastaSequence(final String singleFastaEntry) {
-
-               // assert !Util.isEmpty(singleFastaEntry) :
-               // "Empty String where FASTA sequence is expected!";
-
-               int nlineidx = singleFastaEntry.indexOf("\n");
-               if (nlineidx < 0) {
-                       throw new AssertionError(
-                                       "The FASTA sequence must contain the header information"
-                                                       + " separated by the new line from the sequence. Given sequence does not appear to "
-                                                       + "contain the header! Given data:\n "
-                                                       + singleFastaEntry);
-               }
-               String header = singleFastaEntry.substring(0, nlineidx);
-
-               // Get rid of the new line chars (should cover common cases)
-               header = header.replaceAll("\r", "");
-
-               String sequence = singleFastaEntry.substring(nlineidx);
-
-               /*
-                * if (Util.isEmpty(sequence)) { throw new AssertionError(
-                * "Empty sequences are not allowed! Please make sure the " +
-                * " data is in the FASTA format! Given data:\n " + singleFastaEntry); }
-                */
-               return new FastaSequence(header, sequence);
-       }
-}
diff --git a/datadb/compbio/cassandra/FastaSequence.java b/datadb/compbio/cassandra/FastaSequence.java
deleted file mode 100644 (file)
index 61f49c7..0000000
+++ /dev/null
@@ -1,179 +0,0 @@
-package compbio.cassandra;
-
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import javax.xml.bind.annotation.XmlAccessType;
-import javax.xml.bind.annotation.XmlAccessorType;
-
-//import compbio.util.SysPrefs;
-//import compbio.util.annotation.Immutable;
-
-/**
- * A FASTA formatted sequence. Please note that this class does not make any
- * assumptions as to what sequence it stores e.g. it could be nucleotide,
- * protein or even gapped alignment sequence! The only guarantee it makes is
- * that the sequence does not contain white space characters e.g. spaces, new
- * lines etc
- * 
- * @author pvtroshin
- * 
- * @version 1.0 September 2009
- */
-
-@XmlAccessorType(XmlAccessType.FIELD)
-//@Immutable
-public class FastaSequence {
-
-       /**
-        * Sequence id
-        */
-       private String id;
-
-       // TODO what about gapped sequence here! should be indicated
-       /**
-        * Returns the string representation of sequence
-        */
-       private String sequence;
-
-       FastaSequence() {
-               // Default constructor for JaxB
-       }
-
-       /**
-        * Upon construction the any whitespace characters are removed from the
-        * sequence
-        * 
-        * @param id
-        * @param sequence
-        */
-       public FastaSequence(String id, String sequence) {
-               this.id = id;
-               this.sequence = sequence;
-       }
-
-       /**
-        * Gets the value of id
-        * 
-        * @return the value of id
-        */
-       public String getId() {
-               return this.id;
-       }
-
-       /**
-        * Gets the value of sequence
-        * 
-        * @return the value of sequence
-        */
-       public String getSequence() {
-               return this.sequence;
-       }
-
-       public static int countMatchesInSequence(final String theString,
-                       final String theRegExp) {
-               final Pattern p = Pattern.compile(theRegExp);
-               final Matcher m = p.matcher(theString);
-               int cnt = 0;
-               while (m.find()) {
-                       cnt++;
-               }
-               return cnt;
-       }
-
-       public String getFormattedFasta() {
-               return getFormatedSequence(80);
-       }
-
-       /**
-        * 
-        * @return one line name, next line sequence, no matter what the sequence
-        *         length is
-        */
-/*     public String getOnelineFasta() {
-               String fasta = ">" + getId() + SysPrefs.newlinechar;
-               fasta += getSequence() + SysPrefs.newlinechar;
-               return fasta;
-       }
-
-       /**
-        * Format sequence per width letter in one string. Without spaces.
-        * 
-        * @return multiple line formated sequence, one line width letters length
-        * 
-        */
-       public String getFormatedSequence(final int width) {
-               if (sequence == null) {
-                       return "";
-               }
-
-               assert width >= 0 : "Wrong width parameter ";
-
-               final StringBuilder sb = new StringBuilder(sequence);
-               // int tail = nrOfWindows % WIN_SIZE;
-               // final int turns = (nrOfWindows - tail) / WIN_SIZE;
-
-               int tailLen = sequence.length() % width;
-               // add up inserted new line chars
-               int nchunks = (sequence.length() - tailLen) / width;
-               int nlineCharcounter = 0;
-               int insPos = 0;
-               for (int i = 1; i <= nchunks; i++) {
-                       insPos = width * i + nlineCharcounter;
-                       // to prevent inserting new line in the very end of a sequence then
-                       // it would have failed.
-                       if (sb.length() <= insPos) {
-                               break;
-                       }
-                       sb.insert(insPos, "\n");
-                       nlineCharcounter++;
-               }
-               // sb.insert(insPos + tailLen, "\n");
-               return sb.toString();
-       }
-
-       /**
-        * 
-        * @return sequence length
-        */
-       public int getLength() {
-               return this.sequence.length();
-       }
-
-       /**
-        * Same as oneLineFasta
-        */
-//     @Override
-//     public String toString() {
-//             return this.getOnelineFasta();
-       // }
-
-       @Override
-       public int hashCode() {
-               final int prime = 17;
-               int result = 1;
-               result = prime * result + ((id == null) ? 0 : id.hashCode());
-               result = prime * result
-                               + ((sequence == null) ? 0 : sequence.hashCode());
-               return result;
-       }
-
-       @Override
-       public boolean equals(Object obj) {
-               if (obj == null) {
-                       return false;
-               }
-               if (!(obj instanceof FastaSequence)) {
-                       return false;
-               }
-               FastaSequence fs = (FastaSequence) obj;
-               if (!fs.getId().equals(this.getId())) {
-                       return false;
-               }
-               if (!fs.getSequence().equalsIgnoreCase(this.getSequence())) {
-                       return false;
-               }
-               return true;
-       }
-
-}
index e53ddd8..2167a2d 100644 (file)
@@ -15,6 +15,8 @@ import java.util.Date;
 import java.util.List;
 
 import compbio.cassandra.JpredParser;
+import compbio.data.sequence.FastaReader;
+import compbio.data.sequence.FastaSequence;
 import compbio.engine.JpredJob;
 
 public class JpredParserHTTP implements JpredParser {
index d4a22e6..f48178c 100644 (file)
@@ -15,6 +15,9 @@ import java.util.Calendar;
 import java.util.Date;
 import java.util.List;
 
+import compbio.data.sequence.FastaReader;
+import compbio.data.sequence.FastaSequence;
+
 public class JpredParserLocalFile implements JpredParser {
        private CassandraWriter cw = new CassandraWriter();
        private String dirprefix;
index a606746..db4a7fe 100644 (file)
@@ -3,7 +3,7 @@ package compbio.engine;
 import java.util.ArrayList;
 import java.util.List;
 
-import compbio.cassandra.FastaSequence;
+import compbio.data.sequence.FastaSequence;
 
 public class JpredJob extends Job {
        private String protein;