Add Jpred archive table
[proteocache.git] / datadb / compbio / cassandra / JpredParserHTTP.java
1 package compbio.cassandra;
2
3 import java.io.BufferedReader;
4 import java.io.FileNotFoundException;
5 import java.io.IOException;
6 import java.io.InputStreamReader;
7 import java.net.HttpURLConnection;
8 import java.net.MalformedURLException;
9 import java.net.URL;
10 import java.net.URLConnection;
11 import java.text.ParseException;
12 import java.text.SimpleDateFormat;
13 import java.util.ArrayList;
14 import java.util.Calendar;
15 import java.util.Date;
16 import java.util.List;
17
18 import compbio.cassandra.JpredParser;
19
20 public class JpredParserHTTP implements JpredParser {
21         private CassandraNativeConnector cc = new CassandraNativeConnector();
22         private String dirprefix;
23
24         JpredParserHTTP() {
25                 dirprefix = "http://www.compbio.dundee.ac.uk/www-jpred/results";
26         }
27
28         JpredParserHTTP(String sourceurl) {
29                 dirprefix = sourceurl;
30         }
31
32         public void setSource(String newsourceprefix) {
33                 dirprefix = newsourceprefix;
34         }
35
36         public void Parsing(String source, int nDays) throws IOException {
37                 Calendar cal = Calendar.getInstance();
38                 cal.add(Calendar.DATE, -nDays);
39                 for (int i = 0; i < nDays; ++i) {
40                         cal.add(Calendar.DATE, 1);
41                         int month = cal.get(Calendar.MONTH) + 1;
42                         int year = cal.get(Calendar.YEAR);
43                         int day = cal.get(Calendar.DATE);
44                         String date = year + "/" + month + "/" + day;
45                         ParsingForDate(source, date);
46                 }
47         }
48
49         private int ParsingForDate(String input, String date) {
50                 int totalcount = 0;
51                 int countNoData = 0;
52                 int countUnclearFASTAid = 0;
53                 int countinsertions = 0;
54                 int countinserted = 0;
55                 int counAlignments = 0;
56                 int countStrange = 0;
57                 int njobs = 0;
58
59                 System.out.println("Inserting jobs for " + date);
60                 try {
61                         URL url = new URL(input);
62                         URLConnection conn = url.openConnection();
63                         BufferedReader alljobs = new BufferedReader(new InputStreamReader(conn.getInputStream()));
64                         String line;
65
66                         while ((line = alljobs.readLine()) != null) {
67                                 if (line.matches(date + "(.*)jp_[^\\s]+")) {
68                                         String[] table = line.split("\\s+");
69                                         // Format of a record:
70                                         // starttime endtime ip email jobid (directory)
71                                         // 013/10/25:21:55:7 2013/10/25:21:59:13 201.239.98.172 unknown_email jp_J9HBCBT
72                                         String id = table[table.length - 1];
73                                         totalcount++;
74                                         String datalink = dirprefix + "/" + id + "/" + id + ".concise.fasta";
75                                         URL urltable = new URL(datalink);
76                                         HttpURLConnection httpConnection = (HttpURLConnection) urltable.openConnection();
77                                         int responsecode = httpConnection.getResponseCode();
78                                         if (199 < responsecode && responsecode < 300) {
79                                                 try {
80                                                         final FastaReader fr = new FastaReader(urltable.openStream());
81                                                         final List<FastaSequence> seqs = new ArrayList<FastaSequence>();
82                                                         String newprotein = "";
83                                                         while (fr.hasNext()) {
84                                                                 final FastaSequence fs = fr.next();
85                                                                 if (fs.getId().equals("QUERY") || fs.getId().equals(id))
86                                                                         newprotein = fs.getSequence().replaceAll("\n", "");
87                                                                 else if (fs.getId().equals("jnetpred") || fs.getId().equals("JNETPRED")) {
88                                                                         seqs.add(fs);
89                                                                 }
90                                                         }
91                                                         if (newprotein.equals("")) {
92                                                                 countUnclearFASTAid++;
93                                                         } else {
94                                                                 SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd");
95                                                                 String dateInString1 = table[0].substring(0, table[0].indexOf(":"));
96                                                                 long dateWork1 = 0;
97                                                                 try {
98                                                                         Date dat1 = formatter.parse(dateInString1);
99                                                                         dateWork1 = dat1.getTime();
100                                                                 } catch (ParseException e) {
101                                                                         e.printStackTrace();
102                                                                 }
103                                                                 cc.FormQueryTables(dateWork1, table[0], table[1], table[2], id, "OK", "OK", newprotein, seqs);
104                                                                 ++countinsertions;
105                                                                 ++njobs;
106                                                         }
107                                                 } catch (IOException e) {
108                                                         e.printStackTrace();
109                                                 }
110                                         } else {
111                                                 countNoData++;
112                                         }
113                                 } else {
114                                         if (line.matches(date + "(.*)Sequence0/(.*)")) {
115                                                 ++counAlignments;
116                                         } else {
117                                                 ++countStrange;
118                                         }
119                                 }
120                         }
121                         alljobs.close();
122                         System.out.println("Total number of jobs = " + totalcount);
123                         System.out.println("   " + countinserted + " jobs inserted already");
124                         System.out.println("   " + counAlignments + " jalview jobs");
125                         System.out.println("   " + countStrange + " not analysed jobs");
126                         System.out.println("   " + countNoData + " jobs without *.concise.fasta file");
127                         System.out.println("   " + countUnclearFASTAid + " jobs with unclear FASTA protein id in *.concise.fasta");
128                         System.out.println("   " + countinsertions + " new job insertions\n");
129                 } catch (MalformedURLException e) {
130                         e.printStackTrace();
131                 } catch (IOException e) {
132                         e.printStackTrace();
133                 }
134                 return njobs;
135         }
136 }