1 package compbio.cassandra;
3 import java.io.BufferedReader;
5 import java.io.FileNotFoundException;
6 import java.io.IOException;
7 import java.io.InputStreamReader;
8 import java.net.HttpURLConnection;
9 import java.net.MalformedURLException;
10 import java.io.FileInputStream;
11 import java.text.ParseException;
12 import java.text.SimpleDateFormat;
13 import java.util.ArrayList;
14 import java.util.Calendar;
15 import java.util.Date;
16 import java.util.List;
/**
 * JpredParser implementation that reads a list of job records from a local file (see Parsing)
 * and, for each matching record, looks for a per-job ".concise" file under a configurable
 * prefix (see ParsingForDate) before handing the parsed data to cc.InsertData(...).
 *
 * NOTE(review): only part of the class body is visible in this excerpt; several lines between
 * the visible ones are missing, so this description covers the visible statements only.
 */
18 public class JpredParserLocalFile implements JpredParser {
// Object whose InsertData(...) method is called from ParsingForDate for every parsed job.
19 private CassandraNativeConnector cc = new CassandraNativeConnector();
// Prefix from which ParsingForDate builds the path dirprefix + "/" + id + "/" + id + ".concise".
20 private String dirprefix;
/**
 * Replaces the prefix stored in dirprefix.
 *
 * @param newsourceprefix new value for dirprefix; ParsingForDate appends
 *        "/" + id + "/" + id + ".concise" to it to locate a job's result file
 */
22 public void setSource(String newsourceprefix) {
23 this.dirprefix = newsourceprefix;
/**
 * Package-private default constructor: initialises dirprefix with a hard-coded absolute path.
 *
 * NOTE(review): the default value ends in "data.dat" (a file name), whereas ParsingForDate uses
 * dirprefix as a directory prefix (dirprefix + "/" + id + ...) - confirm this default is intended.
 */
26 JpredParserLocalFile() {
27 this.dirprefix = "/home/asherstnev/Projects/Java.projects/proteocache/data_stress_test/data.dat";
/**
 * Package-private constructor that initialises dirprefix with the supplied value.
 *
 * @param sourceurl value stored in dirprefix; despite the parameter name, the visible code only
 *        ever uses dirprefix to build local file paths (see ParsingForDate)
 */
30 JpredParserLocalFile(String sourceurl) {
31 this.dirprefix = sourceurl;
/**
 * Reads the job list file and calls ParsingForDate once per day for the last nDays days,
 * then prints the elapsed wall-clock time.
 *
 * @param source path of the job list file, opened with FileInputStream
 * @param nDays  number of days to process; the calendar is first moved nDays back from now and
 *               then advanced one day per iteration, so the last processed date is today
 * @throws IOException declared by the method; the visible code can raise it from the
 *                     FileInputStream constructor and from readLine()
 */
34 public void Parsing(String source, int nDays) throws IOException {
// Start from the current date/time and step back nDays days.
35 Calendar cal = Calendar.getInstance();
36 cal.add(Calendar.DATE, -nDays);
37 List<String> alljobs = new ArrayList<String>();
38 File file = new File(source);
// InputStreamReader is created without a charset, so the platform default charset is used.
// NOTE(review): no close() of this reader is visible in this excerpt - confirm it is closed.
39 BufferedReader alljobsfile = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
// Read the file line by line until end of file. The declaration of 'line' and the loop body
// are not visible here; presumably each line is appended to alljobs - TODO confirm.
42 while ((line = alljobsfile.readLine()) != null) {
47 System.out.println("Inserting jobs for " + nDays + " days, " + alljobs.size() + " jobs in total");
48 final long startTime = System.currentTimeMillis();
49 for (int i = 0; i < nDays; ++i) {
// Advance first, so the dates processed are (today - nDays + 1) .. today.
50 cal.add(Calendar.DATE, 1);
// Calendar.MONTH is zero-based, hence the + 1.
51 int month = cal.get(Calendar.MONTH) + 1;
52 int year = cal.get(Calendar.YEAR);
53 int day = cal.get(Calendar.DATE);
// The date is built without zero padding (e.g. "2013/1/5"); ParsingForDate uses it as the
// start of a regular expression, so job lines must use the same unpadded form to match.
54 String date = year + "/" + month + "/" + day;
// The int returned by ParsingForDate is ignored.
55 ParsingForDate(alljobs, date);
57 final long execTime = System.currentTimeMillis() - startTime;
58 System.out.println("Execution Time = " + execTime + " ms");
/**
 * Scans the supplied job lines for entries of one date. For every line that matches
 * date + "(.*)jp_[^\\s]+" it looks for the job's ".concise" file under dirprefix, reads it
 * with FastaReader and passes the result to cc.InsertData(...). A summary of the counters is
 * printed at the end.
 *
 * NOTE(review): several lines of this method are missing from the excerpt (declarations of
 * totalcount, countStrange, countNoData, ip and dateWork1, the try block opening, the counter
 * updates and the return statement), so the comments below describe only the visible statements.
 *
 * @param input job lines to scan
 * @param date  date prefix in the form produced by Parsing (year/month/day, not zero padded);
 *              it is inserted into the regular expressions unescaped
 * @return an int; the return statement is not visible in this excerpt
 */
61 private int ParsingForDate(List<String> input, String date) {
64 int countUnclearFASTAid = 0;
65 int countinsertions = 0;
66 int countinserted = 0;
67 int counAlignments = 0;
71 System.out.println("Inserting jobs for " + date);
72 for (String in : input) {
// String.matches must match the whole line: it has to start with the date and end with a
// "jp_" token that contains no whitespace.
73 if (in.matches(date + "(.*)jp_[^\\s]+")) {
// Whitespace-separated fields: first = start time, second = finish time, last = job id.
74 String[] table = in.split("\\s+");
75 String starttime = table[0];
76 String finishtime = table[1];
78 String id = table[table.length - 1];
80 //if (!cc.CheckID(id)) {
// Expected location of the job's result file: <dirprefix>/<id>/<id>.concise
82 String confilename = dirprefix + "/" + id + "/" + id + ".concise";
83 File confile = new File(confilename);
84 if (confile.exists()) {
86 final FastaReader fr = new FastaReader(confilename);
87 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();
88 String newprotein = "";
89 while (fr.hasNext()) {
90 final FastaSequence fs = fr.next();
// The record named "QUERY" or named after the job id supplies the protein sequence;
// newline characters are removed from it.
91 if (fs.getId().equals("QUERY") || fs.getId().equals(id))
92 newprotein = fs.getSequence().replaceAll("\n", "");
// Handling of the "jnetpred"/"JNETPRED" record is not visible in this excerpt.
93 else if (fs.getId().equals("jnetpred") || fs.getId().equals("JNETPRED")) {
// No QUERY/id record was found in the file: count it as an unclear FASTA id.
97 if (newprotein.equals("")) {
98 countUnclearFASTAid++;
// Parse the part of starttime before its first ':' as a yyyy/MM/dd date and keep it as
// epoch milliseconds in dateWork1.
// NOTE(review): if starttime contains no ':', indexOf returns -1 and substring(0, -1)
// throws StringIndexOutOfBoundsException.
100 SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd");
101 String dateInString1 = starttime.substring(0, starttime.indexOf(":"));
104 Date dat = formatter.parse(dateInString1);
105 dateWork1 = dat.getTime();
// The handling of a ParseException is not visible in this excerpt.
106 } catch (ParseException e) {
// Hand the parsed job over; the two "OK" arguments are fixed literals ('ip' is declared in
// a line that is not visible here).
109 cc.InsertData(dateWork1, starttime, finishtime, ip, id, "OK", "OK", newprotein, seqs);
112 // flush every 50 insertions
113 //if (0 == countinsertions % 50) {
// The handling of an IOException is not visible in this excerpt.
119 } catch (IOException e) {
// Lines of this date that contain "Sequence0/" get separate treatment (body not visible;
// presumably these are the "jalview jobs" counted in counAlignments - TODO confirm).
129 if (in.matches(date + "(.*)Sequence0/(.*)")) {
// Summary of the counters for this date.
// NOTE(review): the messages below mention "*.concise.fasta" while the code above checks
// for a file ending in ".concise" - confirm which name is correct.
137 System.out.println("Total number of jobs = " + totalcount);
138 System.out.println(" " + countinserted + " jobs inserted already");
139 System.out.println(" " + counAlignments + " jalview jobs");
140 System.out.println(" " + countStrange + " not analysed jobs");
141 System.out.println(" " + countNoData + " jobs without *.concise.fasta file");
142 System.out.println(" " + countUnclearFASTAid + " jobs with unclear FASTA protein id in *.concise.fasta");
143 System.out.println(" " + countinsertions + " new job insertions\n");