StringSerializer ss = StringSerializer.get();
LongSerializer ls = LongSerializer.get();
- // connect to the cluster
+ /*
+ * connect to the cluster and check whether the database has any data inside
+ */
public void Connection() {
cluster = HFactory.getOrCreateCluster("Protein Cluster", "127.0.0.1:9160");
KeyspaceDefinition keyspaceDef = cluster.describeKeyspace("ProteinKeyspace");
}
/*
- * parsing data from
- * http://www.compbio.dundee.ac.uk/www-jpred/results/usage-new/alljobs.dat
+ * parsing data source and filling the database
*/
- public void Parsing() {
- /* CF ProteinRow store protein and prediction */
+ public void Parsing(String source) {
+ /*
+ * CF ProteinRow store protein and prediction
+ */
mutatorString = HFactory.createMutator(ksp, ss);
/*
*/
mutatorLog = HFactory.createMutator(ksp, ss);
- /* CF ProteinData store id and protein per data */
+ /*
+ * CF ProteinData store id and protein per data
+ */
mutatorLong = HFactory.createMutator(ksp, ls);
- System.out.println("Parsing......");
- String in = "http://www.compbio.dundee.ac.uk/www-jpred/results/usage-new/alljobs.dat";
- DataParsing datParsing = new DataParsing();
- datParsing.Parsing(in, 4);
- flushData();
+ if (source.equals("http")) {
+ // get data from real Jpred production server
+ System.out.println("Parsing web data source......");
+ String datasrc = "http://www.compbio.dundee.ac.uk/www-jpred/results/usage-new/alljobs.dat";
+ String prefix = "http://www.compbio.dundee.ac.uk/www-jpred/results";
+ JpredParserHTTP parser = new JpredParserHTTP(prefix);
+ parser.setSource("");
+ parser.Parsing(datasrc, 4);
+ flushData();
+ } else if (source.equals("file")) {
+ // get artificial data generated for the DB stress tests
+ System.out.println("Parsing local file data source......");
+ String datasrc = "/home/asherstnev/Projects/Java.projects/proteocache/data_stress_test/data.dat";
+ String prefix = "/home/asherstnev/Projects/Java.projects/proteocache/data_stress_test/Jpreddata";
+ JpredParserLocalFile parser = new JpredParserLocalFile(prefix);
+ parser.Parsing(datasrc, 365);
+ flushData();
+ } else {
+ System.out.println("Unknown data source......");
+ }
}
public void flushData() {
mutatorString.execute();
mutatorLong.execute();
mutatorLog.execute();
- //System.out.println("Flush new data...");
+ // System.out.println("Flush new data...");
}
public void Closing() {
System.out.println("Cassandra has been shut down");
}
- // check whether the job id exists in the DB
+ /*
+ * check whether the job id exists in the DB
+ */
public boolean CheckID(String jobid) {
SliceQuery<String, String, String> sliceQuery = HFactory.createSliceQuery(ksp, ss, ss, ss);
sliceQuery.setColumnFamily("ProteinLog").setKey(jobid).setRange("", "", false, 100);
return false;
}
+ /*
+ * prepare data for insertion into the db
+ */
public void InsertData(long dataWork, String dataBegin, String dataEnd, String ip, String id, String statusEx, String statusFinal,
String protein, List<FastaSequence> jnetpred) {
mutatorLog.addInsertion(id, "ProteinLog", HFactory.createColumn("ip", ip, ss, ss))
--- /dev/null
+package compbio.cassandra;
+
+/*
+ * Parser contract for Jpred job metadata: implementations read a job-list
+ * source (e.g. over HTTP or from a local file) and insert the jobs into
+ * the database.
+ */
+public interface JpredParser {
+
+	/*
+	 * Defines the source file with meta-information on Jpred jobs.
+	 */
+	void setSource (String newsourceprefix);
+
+	/*
+	 * Parses the source file, covering the jobs of the last nDays days.
+	 */
+	void Parsing(String source, int nDays);
+}
import java.util.Date;
import java.util.List;
-public class DataParsing {
+import compbio.cassandra.JpredParser;
+
+public class JpredParserHTTP implements JpredParser {
private CassandraCreate cc = new CassandraCreate();
- private String dirprefix = "http://www.compbio.dundee.ac.uk/www-jpred/results";
+ private String dirprefix;
+
+ JpredParserHTTP() {
+ this.dirprefix = "http://www.compbio.dundee.ac.uk/www-jpred/results";
+ }
- public void setDirPrefix (String newprefix) {
- this.dirprefix = newprefix;
+ JpredParserHTTP(String sourceurl) {
+ this.dirprefix = sourceurl;
+ }
+
+ public void setSource (String newsourceprefix) {
+ this.dirprefix = newsourceprefix;
}
- public void Parsing(String input, int nDays) {
+ public void Parsing(String source, int nDays) {
Calendar cal = Calendar.getInstance();
cal.add(Calendar.DATE, -nDays);
for (int i = 0; i < nDays; ++i) {
int year = cal.get(Calendar.YEAR);
int day = cal.get(Calendar.DATE);
String date = year + "/" + month + "/" + day;
- ParsingForDate(input, date);
+ ParsingForDate(source, date);
}
}
--- /dev/null
+package compbio.cassandra;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.HttpURLConnection;
+import java.net.MalformedURLException;
+import java.io.FileInputStream;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Calendar;
+import java.util.Date;
+import java.util.List;
+
+/*
+ * Parses a local (on-disk) copy of the Jpred job list and inserts new jobs
+ * into the database via CassandraCreate. Local-file counterpart of the
+ * HTTP parser.
+ *
+ * NOTE(review): the class matches the JpredParser interface (setSource,
+ * Parsing) but does not declare "implements JpredParser" — consider adding it.
+ */
+public class JpredParserLocalFile {
+ private CassandraCreate cc = new CassandraCreate();
+ // filesystem prefix under which per-job result directories are looked up
+ private String dirprefix;
+
+ // Defines the source prefix for job result directories.
+ public void setSource (String newsourceprefix) {
+ this.dirprefix = newsourceprefix;
+ }
+
+ JpredParserLocalFile() {
+ this.dirprefix = "/home/asherstnev/Projects/Java.projects/proteocache/data_stress_test/data.dat";
+ }
+
+ JpredParserLocalFile(String sourceurl) {
+ this.dirprefix = sourceurl;
+ }
+
+ /*
+ * Walks the last nDays calendar dates (oldest first) and parses the
+ * source job list once per date.
+ */
+ public void Parsing(String source, int nDays) {
+ Calendar cal = Calendar.getInstance();
+ cal.add(Calendar.DATE, -nDays);
+ for (int i = 0; i < nDays; ++i) {
+ cal.add(Calendar.DATE, 1);
+ int month = cal.get(Calendar.MONTH) + 1;
+ int year = cal.get(Calendar.YEAR);
+ int day = cal.get(Calendar.DATE);
+ String date = year + "/" + month + "/" + day;
+ ParsingForDate(source, date);
+ }
+ }
+
+ /*
+ * Scans the job-list file "input" for lines matching the given date
+ * (format "yyyy/M/d"), reads each job's *.concise FASTA file from
+ * dirprefix/<id>/, and inserts unseen jobs into the DB. Prints a
+ * per-date summary of the counters below.
+ */
+ private void ParsingForDate(String input, String date) {
+ int totalcount = 0; // jobs matching the date in the list
+ int countNoData = 0; // jobs whose *.concise file is missing
+ int countUnclearFASTAid = 0; // jobs with no QUERY/<id> sequence in the FASTA
+ int countinsertions = 0; // jobs newly inserted this run
+ int countinserted = 0; // jobs already present in the DB
+ int counAlignments = 0; // jalview alignment jobs (Sequence0 lines)
+ int countStrange = 0; // lines matching neither pattern
+
+ System.out.println("Inserting jobs for " + date);
+ try {
+ File file = new File(input);
+ BufferedReader alljobs = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
+ String line;
+
+ while ((line = alljobs.readLine()) != null) {
+ // a real job line ends in a "jp_..." job id
+ if (line.matches(date + "(.*)jp_[^\\s]+")) {
+ String[] table = line.split("\\s+");
+ String id = table[table.length - 1];
+ totalcount++;
+ if (!cc.CheckID(id)) {
+ String confilename = dirprefix + "/" + id + "/" + id + ".concise";
+ File confile = new File(confilename);
+ if (confile.exists()) {
+ try {
+ final FastaReader fr = new FastaReader(confilename);
+ final List<FastaSequence> seqs = new ArrayList<FastaSequence>();
+ String newprotein = "";
+ while (fr.hasNext()) {
+ final FastaSequence fs = fr.next();
+ // the query protein is stored under "QUERY" or the job id
+ if (fs.getId().equals("QUERY") || fs.getId().equals(id))
+ newprotein = fs.getSequence().replaceAll("\n", "");
+ else
+ seqs.add(fs);
+ }
+ if (newprotein.equals("")) {
+ countUnclearFASTAid++;
+ } else {
+ // table[0] looks like "yyyy/MM/dd:..." — take the date part
+ SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd");
+ String dateInString1 = table[0].substring(0, table[0].indexOf(":"));
+ long dateWork1 = 0;
+ try {
+ Date dat1 = formatter.parse(dateInString1);
+ dateWork1 = dat1.getTime();
+ } catch (ParseException e) {
+ e.printStackTrace();
+ }
+ cc.InsertData(dateWork1, table[0], table[1], table[2], id, "OK", "OK", newprotein, seqs);
+ ++countinsertions;
+ // flush every 100 insertions
+ if (0 == countinsertions % 100) {
+ cc.flushData();
+ }
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ } else {
+ countNoData++;
+ }
+ } else {
+ ++countinserted;
+ }
+ } else {
+ if (line.matches(date + "(.*)Sequence0/(.*)")) {
+ ++counAlignments;
+ } else {
+ ++countStrange;
+ }
+ }
+ }
+ // NOTE(review): close() is not in a finally block — the reader leaks
+ // if an exception is thrown mid-loop.
+ alljobs.close();
+ System.out.println("Total number of jobs = " + totalcount);
+ System.out.println(" " + countinserted + " jobs inserted already");
+ System.out.println(" " + counAlignments + " jalview jobs");
+ System.out.println(" " + countStrange + " not analysed jobs");
+ System.out.println(" " + countNoData + " jobs without *.concise.fasta file");
+ System.out.println(" " + countUnclearFASTAid + " jobs with unclear FASTA protein id in *.concise.fasta");
+ System.out.println(" " + countinsertions + " new job insertions\n");
+ // NOTE(review): MalformedURLException is leftover from the HTTP parser —
+ // local-file reading never throws it.
+ } catch (MalformedURLException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+}