1 package compbio.cassandra;
3 import java.io.BufferedReader;
4 import java.io.FileNotFoundException;
5 import java.io.IOException;
6 import java.io.InputStreamReader;
7 import java.net.HttpURLConnection;
8 import java.net.MalformedURLException;
10 import java.net.URLConnection;
11 import java.text.ParseException;
12 import java.text.SimpleDateFormat;
13 import java.util.ArrayList;
14 import java.util.Calendar;
15 import java.util.Date;
16 import java.util.List;
18 import compbio.cassandra.JpredParser;
20 public class JpredParserHTTP implements JpredParser {
// Connector to the job store; this class calls CheckID(...) and InsertData(...) on it.
21 private CassandraNativeConnector cc = new CassandraNativeConnector();
// URL prefix of the job result directories: "/<id>/<id>.concise.fasta" is appended to it
// when the data file of a job is requested.
22 private String dirprefix;
// NOTE(review): the declaration that encloses this assignment is not visible in this listing --
// presumably a no-argument constructor selecting the default source; confirm against the full file.
25 dirprefix = "http://www.compbio.dundee.ac.uk/www-jpred/results";
/**
 * Creates a parser whose job result directories are located under the given prefix.
 *
 * @param sourceurl prefix stored as the source of job result directories; it is kept as
 *                  passed, without validation
 */
JpredParserHTTP(String sourceurl) {
    this.dirprefix = sourceurl;
}
32 public void setSource(String newsourceprefix) {
33 dirprefix = newsourceprefix;
36 public void Parsing(String source, int nDays) throws IOException {
37 Calendar cal = Calendar.getInstance();
38 cal.add(Calendar.DATE, -nDays);
39 for (int i = 0; i < nDays; ++i) {
40 cal.add(Calendar.DATE, 1);
41 int month = cal.get(Calendar.MONTH) + 1;
42 int year = cal.get(Calendar.YEAR);
43 int day = cal.get(Calendar.DATE);
44 String date = year + "/" + month + "/" + day;
45 ParsingForDate(source, date);
49 private int ParsingForDate(String input, String date) {
52 int countUnclearFASTAid = 0;
53 int countinsertions = 0;
54 int countinserted = 0;
55 int counAlignments = 0;
59 System.out.println("Inserting jobs for " + date);
61 URL url = new URL(input);
62 URLConnection conn = url.openConnection();
63 BufferedReader alljobs = new BufferedReader(new InputStreamReader(conn.getInputStream()));
66 while ((line = alljobs.readLine()) != null) {
67 if (line.matches(date + "(.*)jp_[^\\s]+")) {
68 String[] table = line.split("\\s+");
69 // Format of a record:
70 // starttime endtime ip email jobid (directory)
71 // 2013/10/25:21:55:7 2013/10/25:21:59:13 201.239.98.172 unknown_email jp_J9HBCBT
72 String id = table[table.length - 1];
74 if (!cc.CheckID(id)) {
75 String datalink = dirprefix + "/" + id + "/" + id + ".concise.fasta";
76 URL urltable = new URL(datalink);
77 HttpURLConnection httpConnection = (HttpURLConnection) urltable.openConnection();
78 int responsecode = httpConnection.getResponseCode();
79 if (199 < responsecode && responsecode < 300) {
81 final FastaReader fr = new FastaReader(urltable.openStream());
82 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();
83 String newprotein = "";
84 while (fr.hasNext()) {
85 final FastaSequence fs = fr.next();
86 if (fs.getId().equals("QUERY") || fs.getId().equals(id))
87 newprotein = fs.getSequence().replaceAll("\n", "");
88 else if (fs.getId().equals("jnetpred") || fs.getId().equals("JNETPRED")) {
92 if (newprotein.equals("")) {
93 countUnclearFASTAid++;
95 SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd");
96 String dateInString1 = table[0].substring(0, table[0].indexOf(":"));
99 Date dat1 = formatter.parse(dateInString1);
100 dateWork1 = dat1.getTime();
101 } catch (ParseException e) {
104 cc.InsertData(dateWork1, table[0], table[1], table[2], id, "OK", "OK", newprotein, seqs);
107 // flush every 50 insertions
108 //if (0 == countinsertions % 50) {
113 } catch (IOException e) {
123 if (line.matches(date + "(.*)Sequence0/(.*)")) {
131 System.out.println("Total number of jobs = " + totalcount);
132 System.out.println(" " + countinserted + " jobs inserted already");
133 System.out.println(" " + counAlignments + " jalview jobs");
134 System.out.println(" " + countStrange + " not analysed jobs");
135 System.out.println(" " + countNoData + " jobs without *.concise.fasta file");
136 System.out.println(" " + countUnclearFASTAid + " jobs with unclear FASTA protein id in *.concise.fasta");
137 System.out.println(" " + countinsertions + " new job insertions\n");
138 } catch (MalformedURLException e) {
140 } catch (IOException e) {