StringSerializer ss = StringSerializer.get();
LongSerializer ls = LongSerializer.get();
- // connect to the cluster
+ /*
+ * connect to the cluster and check whether the database has any data inside
+ */
public void Connection() {
cluster = HFactory.getOrCreateCluster("Protein Cluster", "127.0.0.1:9160");
KeyspaceDefinition keyspaceDef = cluster.describeKeyspace("ProteinKeyspace");
}
/*
- * parsing data from
- * http://www.compbio.dundee.ac.uk/www-jpred/results/usage-new/alljobs.dat
+ * parsing data source and filling the database
*/
- public void Parsing() {
- /* CF ProteinRow store protein and prediction */
+ public void Parsing(String source) {
+ /*
+ * CF ProteinRow store protein and prediction
+ */
mutatorString = HFactory.createMutator(ksp, ss);
/*
*/
mutatorLog = HFactory.createMutator(ksp, ss);
- /* CF ProteinData store id and protein per data */
+ /*
+ * CF ProteinData store id and protein per data
+ */
mutatorLong = HFactory.createMutator(ksp, ls);
- System.out.println("Parsing......");
- String in = "http://www.compbio.dundee.ac.uk/www-jpred/results/usage-new/alljobs.dat";
- DataParsing datParsing = new DataParsing();
- datParsing.Parsing(in, 4);
- flushData();
+ if (source.equals("http")) {
+ // get data from real Jpred production server
+ System.out.println("Parsing web data source......");
+ String datasrc = "http://www.compbio.dundee.ac.uk/www-jpred/results/usage-new/alljobs.dat";
+ String prefix = "http://www.compbio.dundee.ac.uk/www-jpred/results";
+ JpredParserHTTP parser = new JpredParserHTTP(prefix);
+ parser.setSource("");
+ parser.Parsing(datasrc, 4);
+ flushData();
+ } else if (source.equals("file")) {
+ // get artificial data generated for the DB stress tests
+ System.out.println("Parsing local file data source......");
+ String datasrc = "/home/asherstnev/Projects/Java.projects/proteocache/data_stress_test/data.dat";
+ String prefix = "/home/asherstnev/Projects/Java.projects/proteocache/data_stress_test/Jpreddata";
+ JpredParserLocalFile parser = new JpredParserLocalFile(prefix);
+ parser.Parsing(datasrc, 365);
+ flushData();
+ } else {
+ System.out.println("Unknown data source......");
+ }
}
public void flushData() {
mutatorString.execute();
mutatorLong.execute();
mutatorLog.execute();
- //System.out.println("Flush new data...");
+ // System.out.println("Flush new data...");
}
public void Closing() {
System.out.println("Cassandra has been shut down");
}
- // check whether the job id exists in the DB
+ /*
+ * check whether the job id exists in the DB
+ */
public boolean CheckID(String jobid) {
SliceQuery<String, String, String> sliceQuery = HFactory.createSliceQuery(ksp, ss, ss, ss);
sliceQuery.setColumnFamily("ProteinLog").setKey(jobid).setRange("", "", false, 100);
return false;
}
+ /*
+ * prepare data for insertion into the db
+ */
public void InsertData(long dataWork, String dataBegin, String dataEnd, String ip, String id, String statusEx, String statusFinal,
String protein, List<FastaSequence> jnetpred) {
mutatorLog.addInsertion(id, "ProteinLog", HFactory.createColumn("ip", ip, ss, ss))
--- /dev/null
+package compbio.cassandra;
+
+/*
+ * Parser contract for Jpred job metadata: implementations read a job-list
+ * source (e.g. over HTTP or from a local file) and insert the jobs into
+ * the database.
+ */
+public interface JpredParser {
+
+	/*
+	 * Defines the source file with meta-information on Jpred jobs.
+	 */
+	void setSource (String newsourceprefix);
+
+	/*
+	 * Parses the source file, covering the jobs of the last nDays days.
+	 */
+	void Parsing(String source, int nDays);
+}
import java.util.Date;
import java.util.List;
-public class DataParsing {
+import compbio.cassandra.JpredParser;
+
+public class JpredParserHTTP implements JpredParser {
private CassandraCreate cc = new CassandraCreate();
- private String dirprefix = "http://www.compbio.dundee.ac.uk/www-jpred/results";
+ private String dirprefix;
+
+ JpredParserHTTP() {
+ this.dirprefix = "http://www.compbio.dundee.ac.uk/www-jpred/results";
+ }
- public void setDirPrefix (String newprefix) {
- this.dirprefix = newprefix;
+ JpredParserHTTP(String sourceurl) {
+ this.dirprefix = sourceurl;
+ }
+
+ public void setSource (String newsourceprefix) {
+ this.dirprefix = newsourceprefix;
}
- public void Parsing(String input, int nDays) {
+ public void Parsing(String source, int nDays) {
Calendar cal = Calendar.getInstance();
cal.add(Calendar.DATE, -nDays);
for (int i = 0; i < nDays; ++i) {
int year = cal.get(Calendar.YEAR);
int day = cal.get(Calendar.DATE);
String date = year + "/" + month + "/" + day;
- ParsingForDate(input, date);
+ ParsingForDate(source, date);
}
}
--- /dev/null
+package compbio.cassandra;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.HttpURLConnection;
+import java.net.MalformedURLException;
+import java.io.FileInputStream;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Calendar;
+import java.util.Date;
+import java.util.List;
+
+/*
+ * Parses a local (on-disk) copy of the Jpred job list and inserts new jobs
+ * into the database via CassandraCreate. Local-file counterpart of the
+ * HTTP parser.
+ *
+ * NOTE(review): the class matches the JpredParser interface (setSource,
+ * Parsing) but does not declare "implements JpredParser" — consider adding it.
+ */
+public class JpredParserLocalFile {
+ private CassandraCreate cc = new CassandraCreate();
+ // filesystem prefix under which per-job result directories are looked up
+ private String dirprefix;
+
+ // Defines the source prefix for job result directories.
+ public void setSource (String newsourceprefix) {
+ this.dirprefix = newsourceprefix;
+ }
+
+ JpredParserLocalFile() {
+ this.dirprefix = "/home/asherstnev/Projects/Java.projects/proteocache/data_stress_test/data.dat";
+ }
+
+ JpredParserLocalFile(String sourceurl) {
+ this.dirprefix = sourceurl;
+ }
+
+ /*
+ * Walks the last nDays calendar dates (oldest first) and parses the
+ * source job list once per date.
+ */
+ public void Parsing(String source, int nDays) {
+ Calendar cal = Calendar.getInstance();
+ cal.add(Calendar.DATE, -nDays);
+ for (int i = 0; i < nDays; ++i) {
+ cal.add(Calendar.DATE, 1);
+ int month = cal.get(Calendar.MONTH) + 1;
+ int year = cal.get(Calendar.YEAR);
+ int day = cal.get(Calendar.DATE);
+ String date = year + "/" + month + "/" + day;
+ ParsingForDate(source, date);
+ }
+ }
+
+ /*
+ * Scans the job-list file "input" for lines matching the given date
+ * (format "yyyy/M/d"), reads each job's *.concise FASTA file from
+ * dirprefix/<id>/, and inserts unseen jobs into the DB. Prints a
+ * per-date summary of the counters below.
+ */
+ private void ParsingForDate(String input, String date) {
+ int totalcount = 0; // jobs matching the date in the list
+ int countNoData = 0; // jobs whose *.concise file is missing
+ int countUnclearFASTAid = 0; // jobs with no QUERY/<id> sequence in the FASTA
+ int countinsertions = 0; // jobs newly inserted this run
+ int countinserted = 0; // jobs already present in the DB
+ int counAlignments = 0; // jalview alignment jobs (Sequence0 lines)
+ int countStrange = 0; // lines matching neither pattern
+
+ System.out.println("Inserting jobs for " + date);
+ try {
+ File file = new File(input);
+ BufferedReader alljobs = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
+ String line;
+
+ while ((line = alljobs.readLine()) != null) {
+ // a real job line ends in a "jp_..." job id
+ if (line.matches(date + "(.*)jp_[^\\s]+")) {
+ String[] table = line.split("\\s+");
+ String id = table[table.length - 1];
+ totalcount++;
+ if (!cc.CheckID(id)) {
+ String confilename = dirprefix + "/" + id + "/" + id + ".concise";
+ File confile = new File(confilename);
+ if (confile.exists()) {
+ try {
+ final FastaReader fr = new FastaReader(confilename);
+ final List<FastaSequence> seqs = new ArrayList<FastaSequence>();
+ String newprotein = "";
+ while (fr.hasNext()) {
+ final FastaSequence fs = fr.next();
+ // the query protein is stored under "QUERY" or the job id
+ if (fs.getId().equals("QUERY") || fs.getId().equals(id))
+ newprotein = fs.getSequence().replaceAll("\n", "");
+ else
+ seqs.add(fs);
+ }
+ if (newprotein.equals("")) {
+ countUnclearFASTAid++;
+ } else {
+ // table[0] looks like "yyyy/MM/dd:..." — take the date part
+ SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd");
+ String dateInString1 = table[0].substring(0, table[0].indexOf(":"));
+ long dateWork1 = 0;
+ try {
+ Date dat1 = formatter.parse(dateInString1);
+ dateWork1 = dat1.getTime();
+ } catch (ParseException e) {
+ e.printStackTrace();
+ }
+ cc.InsertData(dateWork1, table[0], table[1], table[2], id, "OK", "OK", newprotein, seqs);
+ ++countinsertions;
+ // flush every 100 insertions
+ if (0 == countinsertions % 100) {
+ cc.flushData();
+ }
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ } else {
+ countNoData++;
+ }
+ } else {
+ ++countinserted;
+ }
+ } else {
+ if (line.matches(date + "(.*)Sequence0/(.*)")) {
+ ++counAlignments;
+ } else {
+ ++countStrange;
+ }
+ }
+ }
+ // NOTE(review): close() is not in a finally block — the reader leaks
+ // if an exception is thrown mid-loop.
+ alljobs.close();
+ System.out.println("Total number of jobs = " + totalcount);
+ System.out.println(" " + countinserted + " jobs inserted already");
+ System.out.println(" " + counAlignments + " jalview jobs");
+ System.out.println(" " + countStrange + " not analysed jobs");
+ System.out.println(" " + countNoData + " jobs without *.concise.fasta file");
+ System.out.println(" " + countUnclearFASTAid + " jobs with unclear FASTA protein id in *.concise.fasta");
+ System.out.println(" " + countinsertions + " new job insertions\n");
+ // NOTE(review): MalformedURLException is leftover from the HTTP parser —
+ // local-file reading never throws it.
+ } catch (MalformedURLException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+}