Fix problem with too many open files and problem with not-flushed
[proteocache.git] / datadb / compbio / cassandra / JpredParserHTTP.java
1 package compbio.cassandra;
2
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;

import compbio.cassandra.JpredParser;
18
19 public class JpredParserHTTP implements JpredParser {
20         private CassandraCreate cc = new CassandraCreate();
21         private String dirprefix;
22
23         JpredParserHTTP() {
24                 dirprefix = "http://www.compbio.dundee.ac.uk/www-jpred/results";
25         }
26
27         JpredParserHTTP(String sourceurl) {
28                 dirprefix = sourceurl;
29         }
30
31         public void setSource(String newsourceprefix) {
32                 dirprefix = newsourceprefix;
33         }
34
35         public void Parsing(String source, int nDays) {
36                 Calendar cal = Calendar.getInstance();
37                 cal.add(Calendar.DATE, -nDays);
38                 for (int i = 0; i < nDays; ++i) {
39                         cal.add(Calendar.DATE, 1);
40                         int month = cal.get(Calendar.MONTH) + 1;
41                         int year = cal.get(Calendar.YEAR);
42                         int day = cal.get(Calendar.DATE);
43                         String date = year + "/" + month + "/" + day;
44                         if (0 < ParsingForDate(source, date)) {
45                                 cc.flushData();
46                         }
47                 }
48         }
49
50         private int ParsingForDate(String input, String date) {
51                 int totalcount = 0;
52                 int countNoData = 0;
53                 int countUnclearFASTAid = 0;
54                 int countinsertions = 0;
55                 int countinserted = 0;
56                 int counAlignments = 0;
57                 int countStrange = 0;
58                 int njobs = 0;
59
60                 System.out.println("Inserting jobs for " + date);
61                 try {
62                         URL url = new URL(input);
63                         URLConnection conn = url.openConnection();
64                         BufferedReader alljobs = new BufferedReader(new InputStreamReader(conn.getInputStream()));
65                         String line;
66
67                         while ((line = alljobs.readLine()) != null) {
68                                 if (line.matches(date + "(.*)jp_[^\\s]+")) {
69                                         String[] table = line.split("\\s+");
70                                         // Format of a record:
71                                         // starttime endtime ip email jobid (directory)
72                                         // 013/10/25:21:55:7 2013/10/25:21:59:13 201.239.98.172 unknown_email jp_J9HBCBT
73                                         String id = table[table.length - 1];
74                                         totalcount++;
75                                         if (!cc.CheckID(id)) {
76                                                 String datalink = dirprefix + "/" + id + "/" + id + ".concise.fasta";
77                                                 URL urltable = new URL(datalink);
78                                                 HttpURLConnection httpConnection = (HttpURLConnection) urltable.openConnection();
79                                                 int responsecode = httpConnection.getResponseCode();
80                                                 if (199 < responsecode && responsecode < 300) {
81                                                         try {
82                                                                 final FastaReader fr = new FastaReader(urltable.openStream());
83                                                                 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();
84                                                                 String newprotein = "";
85                                                                 while (fr.hasNext()) {
86                                                                         final FastaSequence fs = fr.next();
87                                                                         if (fs.getId().equals("QUERY") || fs.getId().equals(id))
88                                                                                 newprotein = fs.getSequence().replaceAll("\n", "");
89                                                                         else
90                                                                                 seqs.add(fs);
91                                                                 }
92                                                                 if (newprotein.equals("")) {
93                                                                         countUnclearFASTAid++;
94                                                                 } else {
95                                                                         SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd");
96                                                                         String dateInString1 = table[0].substring(0, table[0].indexOf(":"));
97                                                                         long dateWork1 = 0;
98                                                                         try {
99                                                                                 Date dat1 = formatter.parse(dateInString1);
100                                                                                 dateWork1 = dat1.getTime();
101                                                                         } catch (ParseException e) {
102                                                                                 e.printStackTrace();
103                                                                         }
104                                                                         cc.InsertData(dateWork1, table[0], table[1], table[2], id, "OK", "OK", newprotein, seqs);
105                                                                         ++countinsertions;
106                                                                         ++njobs;
107                                                                         // flush every 50 insertions
108                                                                         if (0 == countinsertions % 50) {
109                                                                                 cc.flushData();
110                                                                                 njobs -= 50;
111                                                                         }
112                                                                 }
113                                                         } catch (IOException e) {
114                                                                 e.printStackTrace();
115                                                         }
116                                                 } else {
117                                                         countNoData++;
118                                                 }
119                                         } else {
120                                                 ++countinserted;
121                                         }
122                                 } else {
123                                         if (line.matches(date + "(.*)Sequence0/(.*)")) {
124                                                 ++counAlignments;
125                                         } else {
126                                                 ++countStrange;
127                                         }
128                                 }
129                         }
130                         alljobs.close();
131                         System.out.println("Total number of jobs = " + totalcount);
132                         System.out.println("   " + countinserted + " jobs inserted already");
133                         System.out.println("   " + counAlignments + " jalview jobs");
134                         System.out.println("   " + countStrange + " not analysed jobs");
135                         System.out.println("   " + countNoData + " jobs without *.concise.fasta file");
136                         System.out.println("   " + countUnclearFASTAid + " jobs with unclear FASTA protein id in *.concise.fasta");
137                         System.out.println("   " + countinsertions + " new job insertions\n");
138                 } catch (MalformedURLException e) {
139                         e.printStackTrace();
140                 } catch (IOException e) {
141                         e.printStackTrace();
142                 }
143                 return njobs;
144         }
145 }