import java.io.BufferedReader;
import java.io.File;
+import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.util.List;
public class JpredParserLocalFile implements JpredParser {
- private CassandraCreate cc = new CassandraCreate();
+ private CassandraNativeConnector cc = new CassandraNativeConnector();
private String dirprefix;
- public void setSource (String newsourceprefix) {
+ public void setSource(String newsourceprefix) {
this.dirprefix = newsourceprefix;
}
/**
 * Creates a parser whose {@code dirprefix} is a hard-coded developer path.
 *
 * NOTE(review): the default ends in "data.dat" and so looks like a file, while
 * {@code dirprefix} is used as a directory prefix when building
 * {@code <dirprefix>/<id>/<id>.concise} -- confirm this default is intended.
 */
JpredParserLocalFile() {
    this("/home/asherstnev/Projects/Java.projects/proteocache/data_stress_test/data.dat");
}
-
+
/**
 * Creates a parser with the given prefix.
 *
 * @param sourceurl initial value for {@code dirprefix}
 */
JpredParserLocalFile(String sourceurl) {
    dirprefix = sourceurl;
}
- public void Parsing(String source, int nDays) {
+ public void Parsing(String source, int nDays) throws IOException {
Calendar cal = Calendar.getInstance();
cal.add(Calendar.DATE, -nDays);
+ List<String> alljobs = new ArrayList<String>();
+ File file = new File(source);
+ BufferedReader alljobsfile = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
+ String line;
+
+ while ((line = alljobsfile.readLine()) != null) {
+ alljobs.add(line);
+ }
+ alljobsfile.close();
+
+ System.out.println("Inserting jobs for " + nDays + " days, " + alljobs.size() + " jobs in total");
+ final long startTime = System.currentTimeMillis();
for (int i = 0; i < nDays; ++i) {
cal.add(Calendar.DATE, 1);
int month = cal.get(Calendar.MONTH) + 1;
int year = cal.get(Calendar.YEAR);
int day = cal.get(Calendar.DATE);
String date = year + "/" + month + "/" + day;
- if (0 < ParsingForDate(source, date)) {
- cc.flushData();
- }
+ ParsingForDate(alljobs, date);
}
+ final long execTime = System.currentTimeMillis() - startTime;
+ System.out.println("Execution Time = " + execTime + " ms");
}
- private int ParsingForDate(String input, String date) {
+ private int ParsingForDate(List<String> input, String date) {
int totalcount = 0;
int countNoData = 0;
int countUnclearFASTAid = 0;
int njobs = 0;
System.out.println("Inserting jobs for " + date);
- try {
- File file = new File(input);
- BufferedReader alljobs = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
- String line;
-
- while ((line = alljobs.readLine()) != null) {
- if (line.matches(date + "(.*)jp_[^\\s]+")) {
- String[] table = line.split("\\s+");
- String id = table[table.length - 1];
- totalcount++;
- if (!cc.CheckID(id)) {
- String confilename = dirprefix + "/" + id + "/" + id + ".concise";
- File confile = new File(confilename);
- if (confile.exists()) {
- try {
- final FastaReader fr = new FastaReader(confilename);
- final List<FastaSequence> seqs = new ArrayList<FastaSequence>();
- String newprotein = "";
- while (fr.hasNext()) {
- final FastaSequence fs = fr.next();
- if (fs.getId().equals("QUERY") || fs.getId().equals(id))
- newprotein = fs.getSequence().replaceAll("\n", "");
- else
- seqs.add(fs);
+ for (String in : input) {
+ if (in.matches(date + "(.*)jp_[^\\s]+")) {
+ String[] table = in.split("\\s+");
+ String starttime = table[0];
+ String finishtime = table[1];
+ String ip = table[2];
+ String id = table[table.length - 1];
+ totalcount++;
+ //if (!cc.CheckID(id)) {
+ if (true) {
+ String confilename = dirprefix + "/" + id + "/" + id + ".concise";
+ File confile = new File(confilename);
+ if (confile.exists()) {
+ try {
+ final FastaReader fr = new FastaReader(confilename);
+ final List<FastaSequence> seqs = new ArrayList<FastaSequence>();
+ String newprotein = "";
+ while (fr.hasNext()) {
+ final FastaSequence fs = fr.next();
+ if (fs.getId().equals("QUERY") || fs.getId().equals(id))
+ newprotein = fs.getSequence().replaceAll("\n", "");
+ else if (fs.getId().equals("jnetpred") || fs.getId().equals("JNETPRED")) {
+ seqs.add(fs);
}
- if (newprotein.equals("")) {
- countUnclearFASTAid++;
- } else {
- SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd");
- String dateInString1 = table[0].substring(0, table[0].indexOf(":"));
- long dateWork1 = 0;
- try {
- Date dat1 = formatter.parse(dateInString1);
- dateWork1 = dat1.getTime();
- } catch (ParseException e) {
- e.printStackTrace();
- }
- cc.InsertData(dateWork1, table[0], table[1], table[2], id, "OK", "OK", newprotein, seqs);
- ++countinsertions;
- ++njobs;
- // flush every 50 insertions
- if (0 == countinsertions % 50) {
- cc.flushData();
- njobs -= 50;
- }
+ }
+ if (newprotein.equals("")) {
+ countUnclearFASTAid++;
+ } else {
+ SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd");
+ String dateInString1 = starttime.substring(0, starttime.indexOf(":"));
+ long dateWork1 = 0;
+ try {
+ Date dat = formatter.parse(dateInString1);
+ dateWork1 = dat.getTime();
+ } catch (ParseException e) {
+ e.printStackTrace();
}
- fr.close();
- } catch (IOException e) {
- e.printStackTrace();
+ cc.InsertData(dateWork1, starttime, finishtime, ip, id, "OK", "OK", newprotein, seqs);
+ ++countinsertions;
+ ++njobs;
+ // flush every 50 insertions
+ //if (0 == countinsertions % 50) {
+ // cc.flushData();
+ // njobs -= 50;
+ //}
}
- } else {
- countNoData++;
+ fr.close();
+ } catch (IOException e) {
+ e.printStackTrace();
}
} else {
- ++countinserted;
+ countNoData++;
}
} else {
- if (line.matches(date + "(.*)Sequence0/(.*)")) {
- ++counAlignments;
- } else {
- ++countStrange;
- }
+ ++countinserted;
+ }
+ } else {
+ if (in.matches(date + "(.*)Sequence0/(.*)")) {
+ ++counAlignments;
+ } else {
+ ++countStrange;
}
}
- alljobs.close();
+ }
+ if (true) {
System.out.println("Total number of jobs = " + totalcount);
System.out.println(" " + countinserted + " jobs inserted already");
System.out.println(" " + counAlignments + " jalview jobs");
System.out.println(" " + countNoData + " jobs without *.concise.fasta file");
System.out.println(" " + countUnclearFASTAid + " jobs with unclear FASTA protein id in *.concise.fasta");
System.out.println(" " + countinsertions + " new job insertions\n");
- } catch (MalformedURLException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
}
return njobs;
}
+
}