Merge branch 'master' into servlets
[proteocache.git] / datadb / compbio / cassandra / JpredParserHTTP.java
1 package compbio.cassandra;
2
3 import java.io.BufferedReader;
4 import java.io.DataInputStream;
5 import java.io.EOFException;
6 import java.io.FileNotFoundException;
7 import java.io.IOException;
8 import java.io.InputStream;
9 import java.io.InputStreamReader;
10 import java.net.HttpURLConnection;
11 import java.net.MalformedURLException;
12 import java.net.URL;
13 import java.net.URLConnection;
14 import java.text.ParseException;
15 import java.text.SimpleDateFormat;
16 import java.util.ArrayList;
17 import java.util.Calendar;
18 import java.util.Date;
19 import java.util.List;
20
21 import compbio.cassandra.JpredParser;
22
23 public class JpredParserHTTP implements JpredParser {
24         private CassandraNativeConnector cc = new CassandraNativeConnector();
25         private String dirprefix;
26         private List<FastaSequence> alignment;
27         private List<FastaSequence> predictions;
28         private String jnetpred;
29
30         JpredParserHTTP() {
31                 dirprefix = "http://www.compbio.dundee.ac.uk/www-jpred/results";
32         }
33
34         JpredParserHTTP(String sourceurl) {
35                 dirprefix = sourceurl;
36         }
37
38         public void setSource(String newsourceprefix) {
39                 dirprefix = newsourceprefix;
40         }
41
42         public void Parsing(String source, int nDays) throws IOException {
43                 Calendar cal = Calendar.getInstance();
44                 cal.add(Calendar.DATE, -nDays);
45                 for (int i = 0; i < nDays; ++i) {
46                         cal.add(Calendar.DATE, 1);
47                         int month = cal.get(Calendar.MONTH) + 1;
48                         int year = cal.get(Calendar.YEAR);
49                         int day = cal.get(Calendar.DATE);
50                         String date = year + "/" + month + "/" + day;
51                         ParsingForDate(source, date);
52                 }
53         }
54
55         private String parsePredictions(final InputStream stream, String jobid) throws FileNotFoundException {
56                 final FastaReader fr = new FastaReader(stream);
57                 String query = "";
58                 alignment = new ArrayList<FastaSequence>();
59                 predictions = new ArrayList<FastaSequence>();
60                 while (fr.hasNext()) {
61                         final FastaSequence fs = fr.next();
62                         String seqid = fs.getId();
63                         String seq = fs.getSequence().replaceAll("\n", "");
64                         if (seqid.equals("QUERY") || seqid.equals(jobid)) {
65                                 query = seq;
66                                 alignment.add(fs);
67                         } else if (seqid.equals("jnetpred") || seqid.equals("Lupas_21") || seqid.equals("Lupas_14") || seqid.equals("Lupas_28")
68                                         || seqid.equals("JNETSOL25") || seqid.equals("JNETSOL5") || seqid.equals("JNETSOL0") || seqid.equals("JNETCONF")
69                                         || seqid.equals("JNETHMM") || seqid.equals("JNETPSSM")) {
70                                 predictions.add(fs);
71                                 if (seqid.equals("jnetpred"))
72                                         jnetpred = seq;
73                         } else {
74                                 alignment.add(fs);
75                         }
76                 }
77                 return query;
78         }
79
80         private String parseLogFile(final InputStream stream) throws IOException {
81                 String out = "";
82                 BufferedReader buffer = new BufferedReader(new InputStreamReader(stream));
83                 String line;
84                 while (null != (line = buffer.readLine())) {
85                         out += line;
86                 }
87                 return out;
88         }
89
90         private List<Byte> parseArchiveFile(final InputStream stream) throws IOException {
91                 DataInputStream data_in = new DataInputStream(stream);
92                 List<Byte> out = new ArrayList<Byte>();
93                 while (true) {
94                         try {
95                                 out.add(data_in.readByte());
96                         } catch (EOFException eof) {
97                                 break;
98                         }
99                 }
100                 return out;
101         }
102
103         private void ParsingForDate(String input, String date) {
104                 int totalcount = 0;
105                 int countNoData = 0;
106                 int countUnclearFASTAid = 0;
107                 int countinsertions = 0;
108                 int countinserted = 0;
109                 int counAlignments = 0;
110                 int countStrange = 0;
111
112                 System.out.println("Inserting jobs for " + date);
113                 try {
114                         URL url = new URL(input);
115                         URLConnection conn = url.openConnection();
116                         BufferedReader alljobs = new BufferedReader(new InputStreamReader(conn.getInputStream()));
117                         String line;
118
119                         while ((line = alljobs.readLine()) != null) {
120                                 if (line.matches(date + ":(.*)jp_[^\\s]+")) {
121                                         String[] table = line.split("\\s+");
122                                         // Format of a record:
123                                         // starttime endtime ip email jobid (directory)
124                                         // 013/10/25:21:55:7 2013/10/25:21:59:13 201.239.98.172
125                                         // unknown_email jp_J9HBCBT
126                                         String id = table[table.length - 1];
127                                         totalcount++;
128                                         if (cc.JobisNotInsterted(id)) {
129                                                 URL dataurl = new URL(dirprefix + "/" + id + "/" + id + ".concise.fasta");
130                                                 URL archiveurl = new URL(dirprefix + "/" + id + "/" + id + ".tar.gz");
131                                                 URL logurl = new URL(dirprefix + "/" + id + "/LOG");
132                                                 HttpURLConnection httpConnection1 = (HttpURLConnection) dataurl.openConnection();
133                                                 HttpURLConnection httpConnection2 = (HttpURLConnection) logurl.openConnection();
134                                                 HttpURLConnection httpConnection3 = (HttpURLConnection) archiveurl.openConnection();
135                                                 int response1 = httpConnection1.getResponseCode();
136                                                 int response2 = httpConnection2.getResponseCode();
137                                                 if (199 < response1 && response1 < 300) {
138                                                         try {
139                                                                 String protein = parsePredictions(dataurl.openStream(), id);
140                                                                 if (protein.equals("")) {
141                                                                         countUnclearFASTAid++;
142                                                                 } else {
143                                                                         SimpleDateFormat dateformatter = new SimpleDateFormat("yyyy/MM/dd");
144                                                                         SimpleDateFormat timeformatter = new SimpleDateFormat("yyyy/MM/dd:H:m:s");
145                                                                         String startdatestring = table[0].substring(0, table[0].indexOf(":"));
146                                                                         try {
147                                                                                 Date startdate = dateformatter.parse(startdatestring);
148                                                                                 Date starttime = timeformatter.parse(table[0]);
149                                                                                 Date endtime = timeformatter.parse(table[1]);
150                                                                                 String ip = table[2];
151                                                                                 String execstatus = "OK";
152                                                                                 String finalstatus = "OK";
153                                                                                 countinsertions += cc.FormQueryTables(startdate.getTime(), table[0], table[1], ip, id, execstatus,
154                                                                                                 finalstatus, protein, predictions);
155
156                                                                                 long exectime = (endtime.getTime() - starttime.getTime()) / 1000;
157                                                                                 String log = "";
158                                                                                 if (199 < response2 && response2 < 300) {
159                                                                                         log = parseLogFile(logurl.openStream());
160                                                                                 }
161                                                                                 cc.ArchiveData(startdate.getTime(), exectime, ip, id, execstatus, finalstatus, protein,
162                                                                                                 predictions, alignment, log, archiveurl.toString());
163                                                                         } catch (ParseException e) {
164                                                                                 e.printStackTrace();
165                                                                         }
166                                                                 }
167                                                         } catch (IOException e) {
168                                                                 e.printStackTrace();
169                                                         }
170                                                 } else {
171                                                         countNoData++;
172                                                 }
173                                                 httpConnection1.disconnect();
174                                                 httpConnection2.disconnect();
175                                                 httpConnection3.disconnect();
176                                         } else {
177                                                 ++countinserted;
178                                         }
179                                 } else {
180                                         if (line.matches(date + "(.*)Sequence0/(.*)")) {
181                                                 ++counAlignments;
182                                         } else {
183                                                 ++countStrange;
184                                         }
185                                 }
186                         }
187                         alljobs.close();
188                         System.out.println("Total number of jobs = " + totalcount);
189                         System.out.println("   " + countinserted + " jobs inserted already");
190                         System.out.println("   " + counAlignments + " jalview jobs");
191                         System.out.println("   " + countStrange + " not analysed jobs");
192                         System.out.println("   " + countNoData + " jobs without *.concise.fasta file");
193                         System.out.println("   " + countUnclearFASTAid + " jobs with unclear FASTA protein id in *.concise.fasta");
194                         System.out.println("   " + countinsertions + " new job insertions\n");
195                 } catch (MalformedURLException e) {
196                         e.printStackTrace();
197                 } catch (IOException e) {
198                         e.printStackTrace();
199                 }
200                 ;
201         }
202 }