1 package compbio.cassandra;
3 import java.io.BufferedReader;
5 import java.io.IOException;
6 import java.io.InputStreamReader;
7 import java.net.HttpURLConnection;
8 import java.net.MalformedURLException;
9 import java.io.FileInputStream;
10 import java.text.ParseException;
11 import java.text.SimpleDateFormat;
12 import java.util.ArrayList;
13 import java.util.Calendar;
14 import java.util.Date;
15 import java.util.List;
17 public class JpredParserLocalFile {
// Project helper object; within this class its CheckID(...) and InsertData(...) methods are called.
18 private CassandraCreate cc = new CassandraCreate();
// Leading path component: files are looked up as dirprefix + "/" + id + "/" + id + ".concise".
19 private String dirprefix;
21 public void setSource (String newsourceprefix) {
22 this.dirprefix = newsourceprefix;
25 JpredParserLocalFile() {
26 this.dirprefix = "/home/asherstnev/Projects/Java.projects/proteocache/data_stress_test/data.dat";
/**
 * Creates a parser that uses the supplied directory prefix.
 *
 * @param sourceurl
 *            value stored as the directory prefix; it is taken as-is,
 *            without validation
 */
JpredParserLocalFile(String sourceurl) {
	dirprefix = sourceurl;
}
33 public void Parsing(String source, int nDays) {
34 Calendar cal = Calendar.getInstance();
35 cal.add(Calendar.DATE, -nDays);
36 for (int i = 0; i < nDays; ++i) {
37 cal.add(Calendar.DATE, 1);
38 int month = cal.get(Calendar.MONTH) + 1;
39 int year = cal.get(Calendar.YEAR);
40 int day = cal.get(Calendar.DATE);
41 String date = year + "/" + month + "/" + day;
42 ParsingForDate(source, date);
46 private void ParsingForDate(String input, String date) {
49 int countUnclearFASTAid = 0;
50 int countinsertions = 0;
51 int countinserted = 0;
52 int counAlignments = 0;
55 System.out.println("Inserting jobs for " + date);
57 File file = new File(input);
58 BufferedReader alljobs = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
61 while ((line = alljobs.readLine()) != null) {
62 if (line.matches(date + "(.*)jp_[^\\s]+")) {
63 String[] table = line.split("\\s+");
64 String id = table[table.length - 1];
66 if (!cc.CheckID(id)) {
67 String confilename = dirprefix + "/" + id + "/" + id + ".concise";
68 File confile = new File(confilename);
69 if (confile.exists()) {
71 final FastaReader fr = new FastaReader(confilename);
72 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();
73 String newprotein = "";
74 while (fr.hasNext()) {
75 final FastaSequence fs = fr.next();
76 if (fs.getId().equals("QUERY") || fs.getId().equals(id))
77 newprotein = fs.getSequence().replaceAll("\n", "");
81 if (newprotein.equals("")) {
82 countUnclearFASTAid++;
84 SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd");
85 String dateInString1 = table[0].substring(0, table[0].indexOf(":"));
88 Date dat1 = formatter.parse(dateInString1);
89 dateWork1 = dat1.getTime();
90 } catch (ParseException e) {
93 cc.InsertData(dateWork1, table[0], table[1], table[2], id, "OK", "OK", newprotein, seqs);
95 // flush every 100 insertions
96 if (0 == countinsertions % 100) {
100 } catch (IOException e) {
110 if (line.matches(date + "(.*)Sequence0/(.*)")) {
118 System.out.println("Total number of jobs = " + totalcount);
119 System.out.println(" " + countinserted + " jobs inserted already");
120 System.out.println(" " + counAlignments + " jalview jobs");
121 System.out.println(" " + countStrange + " not analysed jobs");
122 System.out.println(" " + countNoData + " jobs without *.concise.fasta file");
123 System.out.println(" " + countUnclearFASTAid + " jobs with unclear FASTA protein id in *.concise.fasta");
124 System.out.println(" " + countinsertions + " new job insertions\n");
125 } catch (MalformedURLException e) {
127 } catch (IOException e) {