Apply formatting
[proteocache.git] / datadb / compbio / cassandra / JpredParserLocalFile.java
1 package compbio.cassandra;
2
3 import java.io.BufferedReader;
4 import java.io.File;
5 import java.io.FileNotFoundException;
6 import java.io.IOException;
7 import java.io.InputStreamReader;
8 import java.net.HttpURLConnection;
9 import java.net.MalformedURLException;
10 import java.io.FileInputStream;
11 import java.text.ParseException;
12 import java.text.SimpleDateFormat;
13 import java.util.ArrayList;
14 import java.util.Calendar;
15 import java.util.Date;
16 import java.util.List;
17
18 import compbio.data.sequence.FastaReader;
19 import compbio.data.sequence.FastaSequence;
20
21 public class JpredParserLocalFile implements JpredParser {
22         private CassandraWriter cw = new CassandraWriter();
23         private String dirprefix;
24
25         public void setSource(String newsourceprefix) {
26                 this.dirprefix = newsourceprefix;
27         }
28
29         public JpredParserLocalFile() {
30                 this.dirprefix = "/home/asherstnev/Projects/Java.projects/proteocache/data_stress_test/data.dat";
31         }
32
33         public JpredParserLocalFile(String sourceurl) {
34                 this.dirprefix = sourceurl;
35         }
36
37         public void Parsing(String source, int nDays) throws IOException {
38                 Calendar cal = Calendar.getInstance();
39                 cal.add(Calendar.DATE, -nDays);
40                 List<String> alljobs = new ArrayList<String>();
41                 File file = new File(source);
42                 BufferedReader alljobsfile = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
43                 String line;
44
45                 while ((line = alljobsfile.readLine()) != null) {
46                         alljobs.add(line);
47                 }
48                 alljobsfile.close();
49
50                 System.out.println("Inserting jobs for " + nDays + " days, " + alljobs.size() + " jobs in total");
51                 final long startTime = System.currentTimeMillis();
52                 for (int i = 0; i < nDays; ++i) {
53                         cal.add(Calendar.DATE, 1);
54                         int month = cal.get(Calendar.MONTH) + 1;
55                         int year = cal.get(Calendar.YEAR);
56                         int day = cal.get(Calendar.DATE);
57                         String date = year + "/" + month + "/" + day;
58                         ParsingForDate(alljobs, date);
59                 }
60                 final long execTime = System.currentTimeMillis() - startTime;
61                 System.out.println("Execution Time = " + execTime + " ms");
62         }
63
64         private void ParsingForDate(List<String> input, String date) {
65                 int totalcount = 0;
66                 int countNoData = 0;
67                 int countUnclearFASTAid = 0;
68                 int countinsertions = 0;
69                 int countinserted = 0;
70                 int counAlignments = 0;
71                 int countStrange = 0;
72
73                 System.out.println("Inserting jobs for " + date);
74                 for (String in : input) {
75                         if (in.matches(date + ":(.*)jp_[^\\s]+")) {
76                                 String[] table = in.split("\\s+");
77                                 String starttime = table[0];
78                                 String finishtime = table[1];
79                                 String ip = table[2];
80                                 String id = table[table.length - 1];
81                                 totalcount++;
82                                 String confilename = dirprefix + "/" + id + "/" + id + ".concise";
83                                 File confile = new File(confilename);
84                                 if (confile.exists()) {
85                                         try {
86                                                 final FastaReader fr = new FastaReader(confilename);
87                                                 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();
88                                                 String newprotein = "";
89                                                 while (fr.hasNext()) {
90                                                         final FastaSequence fs = fr.next();
91                                                         if (fs.getId().equals("QUERY") || fs.getId().equals(id))
92                                                                 newprotein = fs.getSequence().replaceAll("\n", "");
93                                                         else if (fs.getId().equals("jnetpred") || fs.getId().equals("JNETPRED")) {
94                                                                 seqs.add(fs);
95                                                         }
96                                                 }
97                                                 if (newprotein.equals("")) {
98                                                         countUnclearFASTAid++;
99                                                 } else {
100                                                         SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd");
101                                                         String dateInString1 = starttime.substring(0, starttime.indexOf(":"));
102                                                         long insertdate = 0;
103                                                         try {
104                                                                 Date dat = formatter.parse(dateInString1);
105                                                                 insertdate = dat.getTime();
106                                                         } catch (ParseException e) {
107                                                                 e.printStackTrace();
108                                                         }
109                                                         // countinsertions += cw.FormQueryTables(insertdate,
110                                                         // starttime, finishtime, ip, id, "OK", "OK",
111                                                         // newprotein, seqs);
112                                                 }
113                                                 fr.close();
114                                         } catch (IOException e) {
115                                                 e.printStackTrace();
116                                         }
117                                 } else {
118                                         countNoData++;
119                                 }
120                         } else {
121                                 if (in.matches(date + "(.*)Sequence0/(.*)")) {
122                                         ++counAlignments;
123                                 } else {
124                                         ++countStrange;
125                                 }
126                         }
127                 }
128                 if (true) {
129                         System.out.println("Total number of jobs = " + totalcount);
130                         System.out.println("   " + countinserted + " jobs inserted already");
131                         System.out.println("   " + counAlignments + " jalview jobs");
132                         System.out.println("   " + countStrange + " not analysed jobs");
133                         System.out.println("   " + countNoData + " jobs without *.concise.fasta file");
134                         System.out.println("   " + countUnclearFASTAid + " jobs with unclear FASTA protein id in *.concise.fasta");
135                         System.out.println("   " + countinsertions + " new job insertions\n");
136                 }
137         }
138
139 }