1 package compbio.cassandra;
3 import java.io.BufferedReader;
4 import java.io.IOException;
5 import java.io.InputStreamReader;
6 import java.net.HttpURLConnection;
7 import java.net.MalformedURLException;
9 import java.net.URLConnection;
10 import java.text.ParseException;
11 import java.text.SimpleDateFormat;
12 import java.util.ArrayList;
13 import java.util.Calendar;
14 import java.util.Date;
15 import java.util.List;
/**
 * Reads a remote job list and, for each job not yet known to the store,
 * downloads that job's "*.concise.fasta" result file and inserts the job
 * through {@link CassandraCreate}.
 */
17 public class DataParsing {
// Store accessor; in this class it is used for CheckID(...) and InsertData(...).
18 private CassandraCreate cc = new CassandraCreate();
// Base URL under which per-job results are looked up as
// <dirprefix>/<id>/<id>.concise.fasta. Can be replaced via setDirPrefix.
19 private String dirprefix = "http://www.compbio.dundee.ac.uk/www-jpred/results";
/**
 * Replaces the base URL prefix used when building the per-job
 * "*.concise.fasta" URLs. The value is stored as given; no validation or
 * normalisation is performed here.
 *
 * @param newprefix new base URL (joined to the job id with "/")
 */
21 public void setDirPrefix (String newprefix) {
22 this.dirprefix = newprefix;
/**
 * Runs ParsingForDate once per day for the last nDays days, ending with the
 * current day. If nDays is zero or negative the loop body never executes.
 *
 * @param input URL of the job list, passed unchanged to ParsingForDate
 * @param nDays number of days to process
 */
25 public void Parsing(String input, int nDays) {
26 Calendar cal = Calendar.getInstance();
// Step back nDays; the loop advances one day BEFORE each use, so the first
// processed date is (today - nDays + 1) and the last one is today.
27 cal.add(Calendar.DATE, -nDays);
28 for (int i = 0; i < nDays; ++i) {
29 cal.add(Calendar.DATE, 1);
// Calendar.MONTH is zero-based, hence the + 1.
30 int month = cal.get(Calendar.MONTH) + 1;
31 int year = cal.get(Calendar.YEAR);
32 int day = cal.get(Calendar.DATE);
// NOTE(review): month and day are concatenated as plain ints, so they are
// not zero-padded (e.g. "2013/9/5"). ParsingForDate uses this string as a
// regex prefix and also parses dates with the pattern "yyyy/MM/dd" --
// confirm that the job list really uses unpadded dates. An unpadded prefix
// such as "2013/1/1" followed by "(.*)" would also match "2013/1/10...".
33 String date = year + "/" + month + "/" + day;
34 ParsingForDate(input, date);
/**
 * Reads the job list at the given URL line by line and handles the lines
 * that start with the given date string. For a matching job line the job id
 * is the last whitespace-separated token; if the store does not know the id
 * yet, the job's "*.concise.fasta" file is fetched and the job is inserted.
 * A summary of the counters is printed at the end.
 *
 * NOTE(review): several statements of this method (declarations of line,
 * totalcount, countStrange, countNoData, dateWork1, some branches and the
 * closing braces) are not visible here; the comments below describe only
 * the visible statements.
 *
 * @param input URL of the job list
 * @param date  date prefix used to select lines, as built by Parsing
 */
38 private void ParsingForDate(String input, String date) {
// Counters reported in the summary printed at the end of the method.
41 int countUnclearFASTAid = 0;
42 int countinsertions = 0;
43 int countinserted = 0;
44 int counAlignments = 0;
47 System.out.println("Inserting jobs for " + date);
// Open the job list and read it as text.
49 URL url = new URL(input);
50 URLConnection conn = url.openConnection();
51 BufferedReader alljobs = new BufferedReader(new InputStreamReader(conn.getInputStream()));
54 while ((line = alljobs.readLine()) != null) {
// String.matches must match the WHOLE line: it has to start with the date
// and end with a token of the form jp_<non-whitespace>.
55 if (line.matches(date + "(.*)jp_[^\\s]+")) {
56 String[] table = line.split("\\s+");
// The job id is the last whitespace-separated column.
57 String id = table[table.length - 1];
// Only fetch results for ids for which CheckID returns false.
59 if (!cc.CheckID(id)) {
60 URL urltable = new URL(dirprefix + "/" + id + "/" + id + ".concise.fasta");
61 HttpURLConnection httpConnection = (HttpURLConnection) urltable.openConnection();
62 int responsecode = httpConnection.getResponseCode();
// Proceed only on a 2xx HTTP status (200..299).
63 if (199 < responsecode && responsecode < 300) {
// NOTE(review): urltable.openStream() opens a new connection instead of
// reusing httpConnection, so the file is requested a second time.
65 final FastaReader fr = new FastaReader(urltable.openStream());
66 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();
67 String newprotein = "";
68 while (fr.hasNext()) {
69 final FastaSequence fs = fr.next();
// The record named "QUERY" or named like the job id supplies the protein
// sequence; newlines are stripped from it.
70 if (fs.getId().equals("QUERY") || fs.getId().equals(id))
71 newprotein = fs.getSequence().replaceAll("\n", "");
// No record matched either name: count the job as having an unclear id.
75 if (newprotein.equals("")) {
76 countUnclearFASTAid++;
78 SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd");
// Date part of the first column: the text before the first ':'.
// NOTE(review): if the column has no ':', indexOf returns -1 and substring
// throws StringIndexOutOfBoundsException -- confirm the column format.
79 String dateInString1 = table[0].substring(0, table[0].indexOf(":"));
// Convert the date to milliseconds since the epoch.
82 Date dat1 = formatter.parse(dateInString1);
83 dateWork1 = dat1.getTime();
84 } catch (ParseException e) {
// Insert the job: timestamp, the first three columns, the id, two literal
// "OK" values, the protein sequence and the seqs list.
87 cc.InsertData(dateWork1, table[0], table[1], table[2], id, "OK", "OK", newprotein, seqs);
89 // flush every 100 insertions
90 if (0 == countinsertions % 100) {
94 } catch (IOException e) {
// Second kind of line handled for this date: it contains "Sequence0/".
// NOTE(review): the handling is not visible here; presumably these are the
// jobs reported as "jalview jobs" in the summary -- confirm.
104 if (line.matches(date + "(.*)Sequence0/(.*)")) {
// Summary of the counters for this date.
112 System.out.println("Total number of jobs = " + totalcount);
113 System.out.println(" " + countinserted + " jobs inserted already");
114 System.out.println(" " + counAlignments + " jalview jobs");
115 System.out.println(" " + countStrange + " not analysed jobs");
116 System.out.println(" " + countNoData + " jobs without *.concise.fasta file");
117 System.out.println(" " + countUnclearFASTAid + " jobs with unclear FASTA protein id in *.concise.fasta");
118 System.out.println(" " + countinsertions + " new job insertions\n");
// Failures while building the job list URL or reading it are caught here.
119 } catch (MalformedURLException e) {
121 } catch (IOException e) {