a3e152062d54bfedc5a5e8376af9d0c4435401a7
[proteocache.git] / datadb / compbio / cassandra / JpredParserLocalFile.java
1 package compbio.cassandra;
2
3 import java.io.BufferedReader;
4 import java.io.File;
5 import java.io.IOException;
6 import java.io.InputStreamReader;
7 import java.net.HttpURLConnection;
8 import java.net.MalformedURLException;
9 import java.io.FileInputStream;
10 import java.text.ParseException;
11 import java.text.SimpleDateFormat;
12 import java.util.ArrayList;
13 import java.util.Calendar;
14 import java.util.Date;
15 import java.util.List;
16
17 public class JpredParserLocalFile implements JpredParser {
18         private CassandraCreate cc = new CassandraCreate();
19         private String dirprefix;
20
21         public void setSource (String newsourceprefix) {
22                 this.dirprefix = newsourceprefix;
23         }
24
25         JpredParserLocalFile() {
26                 this.dirprefix = "/home/asherstnev/Projects/Java.projects/proteocache/data_stress_test/data.dat";
27         }
28         
29         JpredParserLocalFile(String sourceurl) {
30                 this.dirprefix = sourceurl;
31         }
32
33         public void Parsing(String source, int nDays) {
34                 Calendar cal = Calendar.getInstance();
35                 cal.add(Calendar.DATE, -nDays);
36                 for (int i = 0; i < nDays; ++i) {
37                         cal.add(Calendar.DATE, 1);
38                         int month = cal.get(Calendar.MONTH) + 1;
39                         int year = cal.get(Calendar.YEAR);
40                         int day = cal.get(Calendar.DATE);
41                         String date = year + "/" + month + "/" + day;
42                         if (0 < ParsingForDate(source, date)) {
43                                 cc.flushData();
44                         }
45                 }
46         }
47
48         private int ParsingForDate(String input, String date) {
49                 int totalcount = 0;
50                 int countNoData = 0;
51                 int countUnclearFASTAid = 0;
52                 int countinsertions = 0;
53                 int countinserted = 0;
54                 int counAlignments = 0;
55                 int countStrange = 0;
56                 int njobs = 0;
57
58                 System.out.println("Inserting jobs for " + date);
59                 try {
60                         File file = new File(input);
61                         BufferedReader alljobs = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
62                         String line;
63
64                         while ((line = alljobs.readLine()) != null) {
65                                 if (line.matches(date + "(.*)jp_[^\\s]+")) {
66                                         String[] table = line.split("\\s+");
67                                         String id = table[table.length - 1];
68                                         totalcount++;
69                                         if (!cc.CheckID(id)) {
70                                                 String confilename = dirprefix + "/" + id + "/" + id + ".concise";
71                                                 File confile = new File(confilename);
72                                                 if (confile.exists()) {
73                                                         try {
74                                                                 final FastaReader fr = new FastaReader(confilename);
75                                                                 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();
76                                                                 String newprotein = "";
77                                                                 while (fr.hasNext()) {
78                                                                         final FastaSequence fs = fr.next();
79                                                                         if (fs.getId().equals("QUERY") || fs.getId().equals(id))
80                                                                                 newprotein = fs.getSequence().replaceAll("\n", "");
81                                                                         else
82                                                                                 seqs.add(fs);
83                                                                 }
84                                                                 if (newprotein.equals("")) {
85                                                                         countUnclearFASTAid++;
86                                                                 } else {
87                                                                         SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd");
88                                                                         String dateInString1 = table[0].substring(0, table[0].indexOf(":"));
89                                                                         long dateWork1 = 0;
90                                                                         try {
91                                                                                 Date dat1 = formatter.parse(dateInString1);
92                                                                                 dateWork1 = dat1.getTime();
93                                                                         } catch (ParseException e) {
94                                                                                 e.printStackTrace();
95                                                                         }
96                                                                         cc.InsertData(dateWork1, table[0], table[1], table[2], id, "OK", "OK", newprotein, seqs);
97                                                                         ++countinsertions;
98                                                                         ++njobs;
99                                                                         // flush every 50 insertions
100                                                                         if (0 == countinsertions % 50) {
101                                                                                 cc.flushData();
102                                                                                 njobs -= 50;
103                                                                         }
104                                                                 }
105                                                                 fr.close();
106                                                         } catch (IOException e) {
107                                                                 e.printStackTrace();
108                                                         }
109                                                 } else {
110                                                         countNoData++;
111                                                 }
112                                         } else {
113                                                 ++countinserted;
114                                         }
115                                 } else {
116                                         if (line.matches(date + "(.*)Sequence0/(.*)")) {
117                                                 ++counAlignments;
118                                         } else {
119                                                 ++countStrange;
120                                         }
121                                 }
122                         }
123                         alljobs.close();
124                         System.out.println("Total number of jobs = " + totalcount);
125                         System.out.println("   " + countinserted + " jobs inserted already");
126                         System.out.println("   " + counAlignments + " jalview jobs");
127                         System.out.println("   " + countStrange + " not analysed jobs");
128                         System.out.println("   " + countNoData + " jobs without *.concise.fasta file");
129                         System.out.println("   " + countUnclearFASTAid + " jobs with unclear FASTA protein id in *.concise.fasta");
130                         System.out.println("   " + countinsertions + " new job insertions\n");
131                 } catch (MalformedURLException e) {
132                         e.printStackTrace();
133                 } catch (IOException e) {
134                         e.printStackTrace();
135                 }
136                 return njobs;
137         }
138 }