From: Sasha Sherstnev Date: Sat, 26 Oct 2013 09:43:23 +0000 (+0100) Subject: Add another parser for local files with metainformation X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=6a81b75c020845f9bb94c307a66347e4362da85f;p=proteocache.git Add another parser for local files with metainformation --- diff --git a/datadb/compbio/cassandra/CassandraCreate.java b/datadb/compbio/cassandra/CassandraCreate.java index 54fffac..ad1fcbe 100644 --- a/datadb/compbio/cassandra/CassandraCreate.java +++ b/datadb/compbio/cassandra/CassandraCreate.java @@ -26,7 +26,9 @@ public class CassandraCreate { StringSerializer ss = StringSerializer.get(); LongSerializer ls = LongSerializer.get(); - // connect to the cluster + /* + * connect to the cluster and check whether the database has any data inside + */ public void Connection() { cluster = HFactory.getOrCreateCluster("Protein Cluster", "127.0.0.1:9160"); KeyspaceDefinition keyspaceDef = cluster.describeKeyspace("ProteinKeyspace"); @@ -60,11 +62,12 @@ public class CassandraCreate { } /* - * parsing data from - * http://www.compbio.dundee.ac.uk/www-jpred/results/usage-new/alljobs.dat + * parsing data source and filling the database */ - public void Parsing() { - /* CF ProteinRow store protein and prediction */ + public void Parsing(String source) { + /* + * CF ProteinRow store protein and prediction + */ mutatorString = HFactory.createMutator(ksp, ss); /* @@ -72,21 +75,38 @@ public class CassandraCreate { */ mutatorLog = HFactory.createMutator(ksp, ss); - /* CF ProteinData store id and protein per data */ + /* + * CF ProteinData store id and protein per data + */ mutatorLong = HFactory.createMutator(ksp, ls); - System.out.println("Parsing......"); - String in = "http://www.compbio.dundee.ac.uk/www-jpred/results/usage-new/alljobs.dat"; - DataParsing datParsing = new DataParsing(); - datParsing.Parsing(in, 4); - flushData(); + if (source.equals("http")) { + // get data from real Jpred production server + 
System.out.println("Parsing web data source......"); + String datasrc = "http://www.compbio.dundee.ac.uk/www-jpred/results/usage-new/alljobs.dat"; + String prefix = "http://www.compbio.dundee.ac.uk/www-jpred/results"; + JpredParserHTTP parser = new JpredParserHTTP(prefix); + parser.setSource(""); + parser.Parsing(datasrc, 4); + flushData(); + } else if (source.equals("file")) { + // get artificial data generated for the DB stress tests + System.out.println("Parsing local file data source......"); + String datasrc = "/home/asherstnev/Projects/Java.projects/proteocache/data_stress_test/data.dat"; + String prefix = "/home/asherstnev/Projects/Java.projects/proteocache/data_stress_test/Jpreddata"; + JpredParserLocalFile parser = new JpredParserLocalFile(prefix); + parser.Parsing(datasrc, 365); + flushData(); + } else { + System.out.println("Unknown data source......"); + } } public void flushData() { mutatorString.execute(); mutatorLong.execute(); mutatorLog.execute(); - //System.out.println("Flush new data..."); + // System.out.println("Flush new data..."); } public void Closing() { @@ -94,7 +114,9 @@ public class CassandraCreate { System.out.println("Cassandra has been shut down"); } - // check whether the job id exists in the DB + /* + * check whether the job id exists in the DB + */ public boolean CheckID(String jobid) { SliceQuery sliceQuery = HFactory.createSliceQuery(ksp, ss, ss, ss); sliceQuery.setColumnFamily("ProteinLog").setKey(jobid).setRange("", "", false, 100); @@ -105,6 +127,9 @@ return false; } + /* + * prepare data for insertion into the db + */ public void InsertData(long dataWork, String dataBegin, String dataEnd, String ip, String id, String statusEx, String statusFinal, String protein, List jnetpred) { mutatorLog.addInsertion(id, "ProteinLog", HFactory.createColumn("ip", ip, ss, ss)) diff --git a/datadb/compbio/cassandra/JpredParser.java b/datadb/compbio/cassandra/JpredParser.java new file mode 100644 index 
0000000..56f3770 --- /dev/null +++ b/datadb/compbio/cassandra/JpredParser.java @@ -0,0 +1,14 @@ +package compbio.cassandra; + +public interface JpredParser { + + /* + * Defines a source file with metainformation of Jpred Jobs + **/ + void setSource (String newsourceprefix); + + /* + * Makes real parsing of the source file + **/ + void Parsing(String source, int nDays); +} diff --git a/datadb/compbio/cassandra/DataParsing.java b/datadb/compbio/cassandra/JpredParserHTTP.java similarity index 89% rename from datadb/compbio/cassandra/DataParsing.java rename to datadb/compbio/cassandra/JpredParserHTTP.java index d548f56..3616818 100644 --- a/datadb/compbio/cassandra/DataParsing.java +++ b/datadb/compbio/cassandra/JpredParserHTTP.java @@ -14,15 +14,25 @@ import java.util.Calendar; import java.util.Date; import java.util.List; -public class DataParsing { +import compbio.cassandra.JpredParser; + +public class JpredParserHTTP implements JpredParser { private CassandraCreate cc = new CassandraCreate(); - private String dirprefix = "http://www.compbio.dundee.ac.uk/www-jpred/results"; + private String dirprefix; + + JpredParserHTTP() { + this.dirprefix = "http://www.compbio.dundee.ac.uk/www-jpred/results"; + } - public void setDirPrefix (String newprefix) { - this.dirprefix = newprefix; + JpredParserHTTP(String sourceurl) { + this.dirprefix = sourceurl; + } + + public void setSource (String newsourceprefix) { + this.dirprefix = newsourceprefix; } - public void Parsing(String input, int nDays) { + public void Parsing(String source, int nDays) { Calendar cal = Calendar.getInstance(); cal.add(Calendar.DATE, -nDays); for (int i = 0; i < nDays; ++i) { @@ -31,7 +41,7 @@ public class DataParsing { int year = cal.get(Calendar.YEAR); int day = cal.get(Calendar.DATE); String date = year + "/" + month + "/" + day; - ParsingForDate(input, date); + ParsingForDate(source, date); } } diff --git a/datadb/compbio/cassandra/JpredParserLocalFile.java 
b/datadb/compbio/cassandra/JpredParserLocalFile.java new file mode 100644 index 0000000..c37ec7a --- /dev/null +++ b/datadb/compbio/cassandra/JpredParserLocalFile.java @@ -0,0 +1,131 @@ +package compbio.cassandra; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.io.FileInputStream; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.Date; +import java.util.List; + +public class JpredParserLocalFile { + private CassandraCreate cc = new CassandraCreate(); + private String dirprefix; + + public void setSource (String newsourceprefix) { + this.dirprefix = newsourceprefix; + } + + JpredParserLocalFile() { + this.dirprefix = "/home/asherstnev/Projects/Java.projects/proteocache/data_stress_test/data.dat"; + } + + JpredParserLocalFile(String sourceurl) { + this.dirprefix = sourceurl; + } + + public void Parsing(String source, int nDays) { + Calendar cal = Calendar.getInstance(); + cal.add(Calendar.DATE, -nDays); + for (int i = 0; i < nDays; ++i) { + cal.add(Calendar.DATE, 1); + int month = cal.get(Calendar.MONTH) + 1; + int year = cal.get(Calendar.YEAR); + int day = cal.get(Calendar.DATE); + String date = year + "/" + month + "/" + day; + ParsingForDate(source, date); + } + } + + private void ParsingForDate(String input, String date) { + int totalcount = 0; + int countNoData = 0; + int countUnclearFASTAid = 0; + int countinsertions = 0; + int countinserted = 0; + int counAlignments = 0; + int countStrange = 0; + + System.out.println("Inserting jobs for " + date); + try { + File file = new File(input); + BufferedReader alljobs = new BufferedReader(new InputStreamReader(new FileInputStream(file))); + String line; + + while ((line = alljobs.readLine()) != null) { + if (line.matches(date + "(.*)jp_[^\\s]+")) { + String[] table = 
line.split("\\s+"); + String id = table[table.length - 1]; + totalcount++; + if (!cc.CheckID(id)) { + String confilename = dirprefix + "/" + id + "/" + id + ".concise"; + File confile = new File(confilename); + if (confile.exists()) { + try { + final FastaReader fr = new FastaReader(confilename); + final List seqs = new ArrayList(); + String newprotein = ""; + while (fr.hasNext()) { + final FastaSequence fs = fr.next(); + if (fs.getId().equals("QUERY") || fs.getId().equals(id)) + newprotein = fs.getSequence().replaceAll("\n", ""); + else + seqs.add(fs); + } + if (newprotein.equals("")) { + countUnclearFASTAid++; + } else { + SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd"); + String dateInString1 = table[0].substring(0, table[0].indexOf(":")); + long dateWork1 = 0; + try { + Date dat1 = formatter.parse(dateInString1); + dateWork1 = dat1.getTime(); + } catch (ParseException e) { + e.printStackTrace(); + } + cc.InsertData(dateWork1, table[0], table[1], table[2], id, "OK", "OK", newprotein, seqs); + ++countinsertions; + // flush every 100 insertions + if (0 == countinsertions % 100) { + cc.flushData(); + } + } + } catch (IOException e) { + e.printStackTrace(); + } + } else { + countNoData++; + } + } else { + ++countinserted; + } + } else { + if (line.matches(date + "(.*)Sequence0/(.*)")) { + ++counAlignments; + } else { + ++countStrange; + } + } + } + alljobs.close(); + System.out.println("Total number of jobs = " + totalcount); + System.out.println(" " + countinserted + " jobs inserted already"); + System.out.println(" " + counAlignments + " jalview jobs"); + System.out.println(" " + countStrange + " not analysed jobs"); + System.out.println(" " + countNoData + " jobs without *.concise.fasta file"); + System.out.println(" " + countUnclearFASTAid + " jobs with unclear FASTA protein id in *.concise.fasta"); + System.out.println(" " + countinsertions + " new job insertions\n"); + } catch (MalformedURLException e) { + e.printStackTrace(); + } catch 
(IOException e) { + e.printStackTrace(); + } + } +}