27d4252c19d5e9594de6742f1dda011d7901e7d7
[proteocache.git] / datadb / compbio / cassandra / JpredParserLocalFile.java
1 package compbio.cassandra;
2
3 import java.io.BufferedReader;
4 import java.io.File;
5 import java.io.FileNotFoundException;
6 import java.io.IOException;
7 import java.io.InputStreamReader;
8 import java.net.HttpURLConnection;
9 import java.net.MalformedURLException;
10 import java.io.FileInputStream;
11 import java.text.ParseException;
12 import java.text.SimpleDateFormat;
13 import java.util.ArrayList;
14 import java.util.Calendar;
15 import java.util.Date;
16 import java.util.List;
17
18 public class JpredParserLocalFile implements JpredParser {
19         private CassandraNativeConnector cc = new CassandraNativeConnector();
20         private String dirprefix;
21
22         public void setSource(String newsourceprefix) {
23                 this.dirprefix = newsourceprefix;
24         }
25
26         JpredParserLocalFile() {
27                 this.dirprefix = "/home/asherstnev/Projects/Java.projects/proteocache/data_stress_test/data.dat";
28         }
29
30         JpredParserLocalFile(String sourceurl) {
31                 this.dirprefix = sourceurl;
32         }
33
34         public void Parsing(String source, int nDays) throws IOException {
35                 Calendar cal = Calendar.getInstance();
36                 cal.add(Calendar.DATE, -nDays);
37                 List<String> alljobs = new ArrayList<String>();
38                 File file = new File(source);
39                 BufferedReader alljobsfile = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
40                 String line;
41
42                 while ((line = alljobsfile.readLine()) != null) {
43                         alljobs.add(line);
44                 }
45                 alljobsfile.close();
46
47                 System.out.println("Inserting jobs for " + nDays + " days, " + alljobs.size() + " jobs in total");
48                 final long startTime = System.currentTimeMillis();
49                 for (int i = 0; i < nDays; ++i) {
50                         cal.add(Calendar.DATE, 1);
51                         int month = cal.get(Calendar.MONTH) + 1;
52                         int year = cal.get(Calendar.YEAR);
53                         int day = cal.get(Calendar.DATE);
54                         String date = year + "/" + month + "/" + day;
55                         ParsingForDate(alljobs, date);
56                 }
57                 final long execTime = System.currentTimeMillis() - startTime;
58                 System.out.println("Execution Time = " + execTime + " ms");
59         }
60
61         private int ParsingForDate(List<String> input, String date) {
62                 int totalcount = 0;
63                 int countNoData = 0;
64                 int countUnclearFASTAid = 0;
65                 int countinsertions = 0;
66                 int countinserted = 0;
67                 int counAlignments = 0;
68                 int countStrange = 0;
69                 int njobs = 0;
70
71                 System.out.println("Inserting jobs for " + date);
72                 for (String in : input) {
73                         if (in.matches(date + "(.*)jp_[^\\s]+")) {
74                                 String[] table = in.split("\\s+");
75                                 String starttime = table[0];
76                                 String finishtime = table[1];
77                                 String ip = table[2];
78                                 String id = table[table.length - 1];
79                                 totalcount++;
80                                 //if (!cc.CheckID(id)) {
81                                         if (true) {
82                                         String confilename = dirprefix + "/" + id + "/" + id + ".concise";
83                                         File confile = new File(confilename);
84                                         if (confile.exists()) {
85                                                 try {
86                                                         final FastaReader fr = new FastaReader(confilename);
87                                                         final List<FastaSequence> seqs = new ArrayList<FastaSequence>();
88                                                         String newprotein = "";
89                                                         while (fr.hasNext()) {
90                                                                 final FastaSequence fs = fr.next();
91                                                                 if (fs.getId().equals("QUERY") || fs.getId().equals(id))
92                                                                         newprotein = fs.getSequence().replaceAll("\n", "");
93                                                                 else if (fs.getId().equals("jnetpred") || fs.getId().equals("JNETPRED")) {
94                                                                         seqs.add(fs);
95                                                                 }
96                                                         }
97                                                         if (newprotein.equals("")) {
98                                                                 countUnclearFASTAid++;
99                                                         } else {
100                                                                 SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd");
101                                                                 String dateInString1 = starttime.substring(0, starttime.indexOf(":"));
102                                                                 long dateWork1 = 0;
103                                                                 try {
104                                                                         Date dat = formatter.parse(dateInString1);
105                                                                         dateWork1 = dat.getTime();
106                                                                 } catch (ParseException e) {
107                                                                         e.printStackTrace();
108                                                                 }
109                                                                 cc.InsertData(dateWork1, starttime, finishtime, ip, id, "OK", "OK", newprotein, seqs);
110                                                                 ++countinsertions;
111                                                                 ++njobs;
112                                                                 // flush every 50 insertions
113                                                                 //if (0 == countinsertions % 50) {
114                                                                 //      cc.flushData();
115                                                                 //      njobs -= 50;
116                                                                 //}
117                                                         }
118                                                         fr.close();
119                                                 } catch (IOException e) {
120                                                         e.printStackTrace();
121                                                 }
122                                         } else {
123                                                 countNoData++;
124                                         }
125                                 } else {
126                                         ++countinserted;
127                                 }
128                         } else {
129                                 if (in.matches(date + "(.*)Sequence0/(.*)")) {
130                                         ++counAlignments;
131                                 } else {
132                                         ++countStrange;
133                                 }
134                         }
135                 }
136                 if (true) {
137                         System.out.println("Total number of jobs = " + totalcount);
138                         System.out.println("   " + countinserted + " jobs inserted already");
139                         System.out.println("   " + counAlignments + " jalview jobs");
140                         System.out.println("   " + countStrange + " not analysed jobs");
141                         System.out.println("   " + countNoData + " jobs without *.concise.fasta file");
142                         System.out.println("   " + countUnclearFASTAid + " jobs with unclear FASTA protein id in *.concise.fasta");
143                         System.out.println("   " + countinsertions + " new job insertions\n");
144                 }
145                 return njobs;
146         }
147
148 }