d548f56670b3b87df975447da59f98c2dbae381f
[proteocache.git] / datadb / compbio / cassandra / DataParsing.java
1 package compbio.cassandra;
2
3 import java.io.BufferedReader;
4 import java.io.IOException;
5 import java.io.InputStreamReader;
6 import java.net.HttpURLConnection;
7 import java.net.MalformedURLException;
8 import java.net.URL;
9 import java.net.URLConnection;
10 import java.text.ParseException;
11 import java.text.SimpleDateFormat;
12 import java.util.ArrayList;
13 import java.util.Calendar;
14 import java.util.Date;
15 import java.util.List;
16
17 public class DataParsing {
18         private CassandraCreate cc = new CassandraCreate();
19         private String dirprefix = "http://www.compbio.dundee.ac.uk/www-jpred/results";
20         
21         public void setDirPrefix (String newprefix) {
22                 this.dirprefix = newprefix;
23         }
24
25         public void Parsing(String input, int nDays) {
26                 Calendar cal = Calendar.getInstance();
27                 cal.add(Calendar.DATE, -nDays);
28                 for (int i = 0; i < nDays; ++i) {
29                         cal.add(Calendar.DATE, 1);
30                         int month = cal.get(Calendar.MONTH) + 1;
31                         int year = cal.get(Calendar.YEAR);
32                         int day = cal.get(Calendar.DATE);
33                         String date = year + "/" + month + "/" + day;
34                         ParsingForDate(input, date);
35                 }
36         }
37
38         private void ParsingForDate(String input, String date) {
39                 int totalcount = 0;
40                 int countNoData = 0;
41                 int countUnclearFASTAid = 0;
42                 int countinsertions = 0;
43                 int countinserted = 0;
44                 int counAlignments = 0;
45                 int countStrange = 0;
46
47                 System.out.println("Inserting jobs for " + date);
48                 try {
49                         URL url = new URL(input);
50                         URLConnection conn = url.openConnection();
51                         BufferedReader alljobs = new BufferedReader(new InputStreamReader(conn.getInputStream()));
52                         String line;
53
54                         while ((line = alljobs.readLine()) != null) {
55                                 if (line.matches(date + "(.*)jp_[^\\s]+")) {
56                                         String[] table = line.split("\\s+");
57                                         String id = table[table.length - 1];
58                                         totalcount++;
59                                         if (!cc.CheckID(id)) {
60                                                 URL urltable = new URL(dirprefix + "/" + id + "/" + id + ".concise.fasta");
61                                                 HttpURLConnection httpConnection = (HttpURLConnection) urltable.openConnection();
62                                                 int responsecode = httpConnection.getResponseCode();
63                                                 if (199 < responsecode && responsecode < 300) {
64                                                         try {
65                                                                 final FastaReader fr = new FastaReader(urltable.openStream());
66                                                                 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();
67                                                                 String newprotein = "";
68                                                                 while (fr.hasNext()) {
69                                                                         final FastaSequence fs = fr.next();
70                                                                         if (fs.getId().equals("QUERY") || fs.getId().equals(id))
71                                                                                 newprotein = fs.getSequence().replaceAll("\n", "");
72                                                                         else
73                                                                                 seqs.add(fs);
74                                                                 }
75                                                                 if (newprotein.equals("")) {
76                                                                         countUnclearFASTAid++;
77                                                                 } else {
78                                                                         SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd");
79                                                                         String dateInString1 = table[0].substring(0, table[0].indexOf(":"));
80                                                                         long dateWork1 = 0;
81                                                                         try {
82                                                                                 Date dat1 = formatter.parse(dateInString1);
83                                                                                 dateWork1 = dat1.getTime();
84                                                                         } catch (ParseException e) {
85                                                                                 e.printStackTrace();
86                                                                         }
87                                                                         cc.InsertData(dateWork1, table[0], table[1], table[2], id, "OK", "OK", newprotein, seqs);
88                                                                         ++countinsertions;
89                                                                         // flush every 100 insertions
90                                                                         if (0 == countinsertions % 100) {
91                                                                                 cc.flushData();
92                                                                         }
93                                                                 }
94                                                         } catch (IOException e) {
95                                                                 e.printStackTrace();
96                                                         }
97                                                 } else {
98                                                         countNoData++;
99                                                 }
100                                         } else {
101                                                 ++countinserted;
102                                         }
103                                 } else {
104                                         if (line.matches(date + "(.*)Sequence0/(.*)")) {
105                                                 ++counAlignments;
106                                         } else {
107                                                 ++countStrange;
108                                         }
109                                 }
110                         }
111                         alljobs.close();
112                         System.out.println("Total number of jobs = " + totalcount);
113                         System.out.println("   " + countinserted + " jobs inserted already");
114                         System.out.println("   " + counAlignments + " jalview jobs");
115                         System.out.println("   " + countStrange + " not analysed jobs");
116                         System.out.println("   " + countNoData + " jobs without *.concise.fasta file");
117                         System.out.println("   " + countUnclearFASTAid + " jobs with unclear FASTA protein id in *.concise.fasta");
118                         System.out.println("   " + countinsertions + " new job insertions\n");
119                 } catch (MalformedURLException e) {
120                         e.printStackTrace();
121                 } catch (IOException e) {
122                         e.printStackTrace();
123                 }
124         }
125 }