Re-design the parser for inserting alignment-only and failed jobs
[proteocache.git] / datadb / compbio / cassandra / JpredParserHTTP.java
1 package compbio.cassandra;
2
3 import java.io.BufferedReader;
4 import java.io.DataInputStream;
5 import java.io.EOFException;
6 import java.io.FileNotFoundException;
7 import java.io.IOException;
8 import java.io.InputStream;
9 import java.io.InputStreamReader;
10 import java.net.HttpURLConnection;
11 import java.net.MalformedURLException;
12 import java.net.URL;
13 import java.net.URLConnection;
14 import java.text.ParseException;
15 import java.text.SimpleDateFormat;
16 import java.util.ArrayList;
17 import java.util.Calendar;
18 import java.util.Date;
19 import java.util.List;
20
21 import compbio.cassandra.JpredParser;
22
23 public class JpredParserHTTP implements JpredParser {
24         private CassandraWriter cw = new CassandraWriter();
25         private String dirprefix;
26         private List<FastaSequence> alignment;
27         private List<FastaSequence> predictions;
28         private int countNoData;
29
30         public JpredParserHTTP() {
31                 dirprefix = "http://www.compbio.dundee.ac.uk/www-jpred/results";
32         }
33
34         public JpredParserHTTP(String sourceurl) {
35                 dirprefix = sourceurl;
36         }
37
38         public void setSource(String newsourceprefix) {
39                 dirprefix = newsourceprefix;
40         }
41
42         public void Parsing(String source, int nDays) throws IOException {
43                 Calendar cal = Calendar.getInstance();
44                 cal.add(Calendar.DATE, -nDays);
45                 for (int i = 0; i < nDays; ++i) {
46                         cal.add(Calendar.DATE, 1);
47                         int month = cal.get(Calendar.MONTH) + 1;
48                         int year = cal.get(Calendar.YEAR);
49                         int day = cal.get(Calendar.DATE);
50                         String date = year + "/" + month + "/" + day;
51                         ParsingForDate(source, date);
52                 }
53         }
54
55         /*
56          * The method parses the Jpred output concise file in the FASTA format If
57          * there is a record with ID = QUERY or jobid, this a "one protein" job
58          * otherwise this is an alignment job
59          */
60         private String parsePredictions(final InputStream stream, String jobid) throws FileNotFoundException {
61                 final FastaReader fr = new FastaReader(stream);
62                 String protein = "";
63                 alignment = new ArrayList<FastaSequence>();
64                 predictions = new ArrayList<FastaSequence>();
65                 while (fr.hasNext()) {
66                         final FastaSequence fs = fr.next();
67                         String seqid = fs.getId();
68                         String seq = fs.getSequence().replaceAll("\n", "");
69                         if (seqid.equals("jnetpred") || seqid.equals("Lupas_21") || seqid.equals("Lupas_14") || seqid.equals("Lupas_28")
70                                         || seqid.equals("JNETSOL25") || seqid.equals("JNETSOL5") || seqid.equals("JNETSOL0") || seqid.equals("JNETCONF")
71                                         || seqid.equals("JNETHMM") || seqid.equals("JNETPSSM") || seqid.equals("JNETCONF")) {
72                                 predictions.add(fs);
73                         } else {
74                                 alignment.add(fs);
75                                 if (seqid.equals("QUERY") || seqid.equals(jobid))
76                                         protein = seq;
77                         }
78                 }
79                 return protein;
80         }
81
82         private String parseLogFile(final InputStream stream) throws IOException {
83                 String out = "";
84                 BufferedReader buffer = new BufferedReader(new InputStreamReader(stream));
85                 String line;
86                 while (null != (line = buffer.readLine())) {
87                         out += line;
88                 }
89                 return out;
90         }
91
92         private List<Byte> parseArchiveFile(final InputStream stream) throws IOException {
93                 DataInputStream data_in = new DataInputStream(stream);
94                 List<Byte> out = new ArrayList<Byte>();
95                 while (true) {
96                         try {
97                                 out.add(data_in.readByte());
98                         } catch (EOFException eof) {
99                                 break;
100                         }
101                 }
102                 return out;
103         }
104
105         private int analyseJob(String[] job) throws IOException {
106                 boolean running = true;
107                 boolean ConcisefileExists = false;
108                 boolean LogfileExists = false;
109                 String id = job[job.length - 1];
110                 String startdatestring = job[0].substring(0, job[0].indexOf(":"));
111                 Date startdate = new Date(0);
112                 Date starttime = new Date(0);
113                 Date endtime = new Date(0);
114                 Date currDate = new Date();
115                 String ip = job[2];
116                 String execstatus = "OK";
117                 String finalstatus = "OK";
118                 String protein = "";
119                 long exectime = 0;
120                 String log = "";
121                 String maindir = dirprefix + "/" + id + "/";
122                 String concisefile = dirprefix + "/" + id + "/" + id + ".concise.fasta";
123                 String archivefile = dirprefix + "/" + id + "/" + id + ".tar.gz";
124                 String logfile = dirprefix + "/" + id + "/LOG";
125                 SimpleDateFormat dateformatter = new SimpleDateFormat("yyyy/MM/dd");
126                 SimpleDateFormat timeformatter = new SimpleDateFormat("yyyy/MM/dd:H:m:s");
127                 try {
128                         startdate = dateformatter.parse(startdatestring);
129                         starttime = timeformatter.parse(job[0]);
130                         endtime = timeformatter.parse(job[1]);
131                         exectime = (endtime.getTime() - starttime.getTime()) / 1000;
132                 } catch (ParseException e) {
133                         e.printStackTrace();
134                 }
135
136                 try {
137                         URL dirurl = new URL(maindir);
138                         HttpURLConnection httpConnection_dirurl = (HttpURLConnection) dirurl.openConnection();
139                         if (httpConnection_dirurl.getResponseCode() < 199 || 300 <= httpConnection_dirurl.getResponseCode()) {
140                                 return 0;
141                         }
142                         URL conciseurl = new URL(concisefile);
143                         URL archiveurl = new URL(archivefile);
144                         URL logurl = new URL(logfile);
145                         HttpURLConnection httpConnection_conciseurl = (HttpURLConnection) conciseurl.openConnection();
146                         HttpURLConnection httpConnection_logurl = (HttpURLConnection) logurl.openConnection();
147                         HttpURLConnection httpConnection_archiveurl = (HttpURLConnection) archiveurl.openConnection();
148                         if (199 < httpConnection_conciseurl.getResponseCode() && httpConnection_conciseurl.getResponseCode() < 300) {
149                                 ConcisefileExists = true;
150                                 running = false;
151                                 try {
152                                         protein = parsePredictions(conciseurl.openStream(), id);
153                                 } catch (IOException e) {
154                                         e.printStackTrace();
155                                 }
156                         } else {
157                                 // The job still can be running of failed...
158                                 ++countNoData;
159                                 alignment = new ArrayList<FastaSequence>();
160                                 predictions = new ArrayList<FastaSequence>();
161                         }
162                         if (199 < httpConnection_logurl.getResponseCode() && httpConnection_logurl.getResponseCode() < 300) {
163                                 LogfileExists = true;
164                                 log = parseLogFile(logurl.openStream());
165                         } else {
166                                 // The job has not been started at all...
167                                 execstatus = "FAIL";
168                                 finalstatus = "STOPPED";
169                                 running = false;
170                         }
171                         if (log.matches("(.*)TIMEOUT\\syour\\sjob\\stimed\\sout(.*)")) {
172                                 // blast job was too long (more than 3600 secs by default)...
173                                 execstatus = "FAIL";
174                                 finalstatus = "TIMEDOUT";
175                                 running = false;
176                         } else if (log.matches("(.*)Jpred\\serror:\\sDied\\sat(.*)")) {
177                                 // an internal Jpred error...
178                                 execstatus = "FAIL";
179                                 finalstatus = "JPREDERROR";
180                                 running = false;
181                         } else if ((currDate.getTime() - endtime.getTime()) / 1000 > 3601 && LogfileExists && !ConcisefileExists) {
182                                 // the job was stopped with unknown reason...
183                                 execstatus = "FAIL";
184                                 finalstatus = "STOPPED";
185                                 running = false;
186                         }
187
188                         httpConnection_conciseurl.disconnect();
189                         httpConnection_logurl.disconnect();
190                         httpConnection_archiveurl.disconnect();
191                 } catch (MalformedURLException e) {
192                         e.printStackTrace();
193                 }
194
195                 if (!running) {
196                         long t = startdate.getTime();
197                         cw.FormQueryTables(t, job[0], job[1], ip, id, execstatus, finalstatus, protein, predictions);
198                         cw.ArchiveData(t, exectime, ip, id, execstatus, finalstatus, protein, predictions, alignment, log, archivefile);
199                         return 1;
200                 } else
201                         System.out.println("job " + id + " is running");
202
203                 return 0;
204         }
205
206         private void ParsingForDate(String input, String date) {
207                 int totalcount = 0;
208                 int countinsertions = 0;
209                 int countinserted = 0;
210                 int countNotanalyzed = 0;
211                 countNoData = 0;
212
213                 System.out.println("Inserting jobs for " + date);
214                 try {
215                         URL url = new URL(input);
216                         URLConnection conn = url.openConnection();
217                         BufferedReader alljobs = new BufferedReader(new InputStreamReader(conn.getInputStream()));
218                         String line;
219
220                         while ((line = alljobs.readLine()) != null) {
221                                 if (line.matches(date + ":(.*)jp_[^\\s]+")) {
222                                         totalcount++;
223                                         String[] job = line.split("\\s+");
224                                         String jobid = job[job.length - 1];
225                                         if (cw.JobisNotInsterted(jobid)) {
226                                                 countinsertions += analyseJob(job);
227                                         } else {
228                                                 ++countinserted;
229                                         }
230                                 } else {
231                                         ++countNotanalyzed;
232                                 }
233                         }
234                         alljobs.close();
235                         System.out.println("Total number of jobs = " + totalcount);
236                         System.out.println("   " + countinserted + " jobs inserted already");
237                         System.out.println("   " + countNotanalyzed + " not analysed jobs");
238                         System.out.println("   " + countNoData + " jobs without *.concise.fasta file (RUNNING or FAILED)");
239                         System.out.println("   " + countinsertions + " new job insertions\n");
240                 } catch (MalformedURLException e) {
241                         e.printStackTrace();
242                 } catch (IOException e) {
243                         e.printStackTrace();
244                 }
245                 ;
246         }
247 }