Clean up the code
[proteocache.git] / datadb / compbio / cassandra / JpredParserLocalFile.java
1 package compbio.cassandra;
2
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.regex.Pattern;

import compbio.data.sequence.FastaReader;
import compbio.data.sequence.FastaSequence;
17
18 public class JpredParserLocalFile implements JpredParser {
19         private String dirprefix;
20
21         public void setSource(String newsourceprefix) {
22                 this.dirprefix = newsourceprefix;
23         }
24
25         public JpredParserLocalFile() {
26                 this.dirprefix = "/home/asherstnev/Projects/Java.projects/proteocache/data_stress_test/data.dat";
27         }
28
29         public JpredParserLocalFile(String sourceurl) {
30                 this.dirprefix = sourceurl;
31         }
32
33         public void Parsing(String source, int nDays) throws IOException {
34                 Calendar cal = Calendar.getInstance();
35                 cal.add(Calendar.DATE, -nDays);
36                 List<String> alljobs = new ArrayList<String>();
37                 File file = new File(source);
38                 BufferedReader alljobsfile = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
39                 String line;
40
41                 while ((line = alljobsfile.readLine()) != null) {
42                         alljobs.add(line);
43                 }
44                 alljobsfile.close();
45
46                 System.out.println("Inserting jobs for " + nDays + " days, " + alljobs.size() + " jobs in total");
47                 final long startTime = System.currentTimeMillis();
48                 for (int i = 0; i < nDays; ++i) {
49                         cal.add(Calendar.DATE, 1);
50                         int month = cal.get(Calendar.MONTH) + 1;
51                         int year = cal.get(Calendar.YEAR);
52                         int day = cal.get(Calendar.DATE);
53                         String date = year + "/" + month + "/" + day;
54                         ParsingForDate(alljobs, date);
55                 }
56                 final long execTime = System.currentTimeMillis() - startTime;
57                 System.out.println("Execution Time = " + execTime + " ms");
58         }
59
60         private void ParsingForDate(List<String> input, String date) {
61                 int totalcount = 0;
62                 int countNoData = 0;
63                 int countUnclearFASTAid = 0;
64                 int countinsertions = 0;
65                 int countinserted = 0;
66                 int counAlignments = 0;
67                 int countStrange = 0;
68
69                 System.out.println("Inserting jobs for " + date);
70                 for (String in : input) {
71                         if (in.matches(date + ":(.*)jp_[^\\s]+")) {
72                                 String[] table = in.split("\\s+");
73                                 String starttime = table[0];
74                                 String id = table[table.length - 1];
75                                 totalcount++;
76                                 String confilename = dirprefix + "/" + id + "/" + id + ".concise";
77                                 File confile = new File(confilename);
78                                 if (confile.exists()) {
79                                         try {
80                                                 final FastaReader fr = new FastaReader(confilename);
81                                                 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();
82                                                 String newprotein = "";
83                                                 while (fr.hasNext()) {
84                                                         final FastaSequence fs = fr.next();
85                                                         if (fs.getId().equals("QUERY") || fs.getId().equals(id))
86                                                                 newprotein = fs.getSequence().replaceAll("\n", "");
87                                                         else if (fs.getId().equals("jnetpred") || fs.getId().equals("JNETPRED")) {
88                                                                 seqs.add(fs);
89                                                         }
90                                                 }
91                                                 if (newprotein.equals("")) {
92                                                         countUnclearFASTAid++;
93                                                 } else {
94                                                         SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd");
95                                                         String dateInString1 = starttime.substring(0, starttime.indexOf(":"));
96                                                         try {
97                                                                 Date dat = formatter.parse(dateInString1);
98                                                         } catch (ParseException e) {
99                                                                 e.printStackTrace();
100                                                         }
101                                                         // countinsertions += cw.FormQueryTables(insertdate,
102                                                         // starttime, finishtime, ip, id, "OK", "OK",
103                                                         // newprotein, seqs);
104                                                 }
105                                                 fr.close();
106                                         } catch (IOException e) {
107                                                 e.printStackTrace();
108                                         }
109                                 } else {
110                                         countNoData++;
111                                 }
112                         } else {
113                                 if (in.matches(date + "(.*)Sequence0/(.*)")) {
114                                         ++counAlignments;
115                                 } else {
116                                         ++countStrange;
117                                 }
118                         }
119                 }
120                 if (true) {
121                         System.out.println("Total number of jobs = " + totalcount);
122                         System.out.println("   " + countinserted + " jobs inserted already");
123                         System.out.println("   " + counAlignments + " jalview jobs");
124                         System.out.println("   " + countStrange + " not analysed jobs");
125                         System.out.println("   " + countNoData + " jobs without *.concise.fasta file");
126                         System.out.println("   " + countUnclearFASTAid + " jobs with unclear FASTA protein id in *.concise.fasta");
127                         System.out.println("   " + countinsertions + " new job insertions\n");
128                 }
129         }
130
131 }