package compbio.cassandra;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Jpred log parser that works on local files: the job list is read from a
 * local text file and the per-job results are looked up under a local
 * directory prefix. Every job for which a query sequence is found is passed
 * to {@link CassandraWriter#FormQueryTables}.
 */
public class JpredParserLocalFile implements JpredParser {
    private CassandraWriter cw = new CassandraWriter();
    private String dirprefix;

    /**
     * Replaces the directory prefix under which job result directories are
     * looked up.
     *
     * @param newsourceprefix
     *            new directory prefix
     */
    public void setSource(String newsourceprefix) {
        this.dirprefix = newsourceprefix;
    }

    /** Creates a parser that uses the built-in default directory prefix. */
    public JpredParserLocalFile() {
        this.dirprefix = "/home/asherstnev/Projects/Java.projects/proteocache/data_stress_test/data.dat";
    }

    /**
     * Creates a parser that uses the given directory prefix.
     *
     * @param sourceurl
     *            directory prefix under which job result directories are
     *            looked up
     */
    public JpredParserLocalFile(String sourceurl) {
        this.dirprefix = sourceurl;
    }

    /**
     * Reads every line of the job list file and processes the jobs of the
     * last {@code nDays} days, one day at a time, ending with today. Progress
     * and the total execution time are printed to standard output.
     *
     * @param source
     *            path of the local job list file
     * @param nDays
     *            number of days to process; nothing is processed when it is
     *            not positive
     * @throws IOException
     *             if the job list file cannot be opened or read
     */
    public void Parsing(String source, int nDays) throws IOException {
        Calendar cal = Calendar.getInstance();
        cal.add(Calendar.DATE, -nDays);

        List<String> alljobs = readAllLines(source);
        System.out.println("Inserting jobs for " + nDays + " days, " + alljobs.size() + " jobs in total");

        final long startTime = System.currentTimeMillis();
        for (int i = 0; i < nDays; ++i) {
            cal.add(Calendar.DATE, 1);
            int month = cal.get(Calendar.MONTH) + 1;
            int year = cal.get(Calendar.YEAR);
            int day = cal.get(Calendar.DATE);
            // NOTE(review): month and day are not zero-padded here (e.g. 2013/9/5);
            // confirm that the job list file writes its dates the same way.
            String date = year + "/" + month + "/" + day;
            ParsingForDate(alljobs, date);
        }
        final long execTime = System.currentTimeMillis() - startTime;
        System.out.println("Execution Time = " + execTime + " ms");
    }

    /** Reads all lines of the given file, closing it even when reading fails. */
    private List<String> readAllLines(String source) throws IOException {
        List<String> lines = new ArrayList<String>();
        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(new File(source))));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        } finally {
            reader.close();
        }
        return lines;
    }

    /**
     * Processes all job list lines that belong to one date and prints a
     * summary of what was found.
     *
     * @param input
     *            all lines of the job list file
     * @param date
     *            date prefix the lines are matched against
     */
    private void ParsingForDate(List<String> input, String date) {
        int totalcount = 0;
        int countNoData = 0;
        int countUnclearFASTAid = 0;
        int countinsertions = 0;
        // NOTE(review): nothing in this class ever increments this counter, so the
        // "jobs inserted already" line always reports 0.
        int countinserted = 0;
        int countAlignments = 0;
        int countStrange = 0;

        // The patterns depend only on the date, so compile them once per date
        // instead of once per line.
        final Pattern jobLine = Pattern.compile(date + ":(.*)jp_[^\\s]+");
        final Pattern alignmentLine = Pattern.compile(date + "(.*)Sequence0/(.*)");
        final SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd");

        System.out.println("Inserting jobs for " + date);
        for (String in : input) {
            if (!jobLine.matcher(in).matches()) {
                if (alignmentLine.matcher(in).matches()) {
                    ++countAlignments;
                } else {
                    ++countStrange;
                }
                continue;
            }

            String[] table = in.split("\\s+");
            if (table.length < 3) {
                // Start time, finish time and IP are all required; a shorter
                // line cannot be analysed.
                ++countStrange;
                continue;
            }
            String starttime = table[0];
            String finishtime = table[1];
            String ip = table[2];
            String id = table[table.length - 1];
            totalcount++;

            String confilename = dirprefix + "/" + id + "/" + id + ".concise";
            File confile = new File(confilename);
            if (!confile.exists()) {
                countNoData++;
                continue;
            }

            try {
                final FastaReader fr = new FastaReader(confilename);
                try {
                    final List<FastaSequence> seqs = new ArrayList<FastaSequence>();
                    String newprotein = "";
                    while (fr.hasNext()) {
                        final FastaSequence fs = fr.next();
                        if (fs.getId().equals("QUERY") || fs.getId().equals(id)) {
                            newprotein = fs.getSequence().replaceAll("\n", "");
                        } else if (fs.getId().equals("jnetpred") || fs.getId().equals("JNETPRED")) {
                            seqs.add(fs);
                        }
                    }
                    if (newprotein.equals("")) {
                        countUnclearFASTAid++;
                    } else {
                        // The start time begins with "<date>:", so the text before
                        // the first colon is the date of the job.
                        String dateInString1 = starttime.substring(0, starttime.indexOf(":"));
                        long insertdate = 0;
                        try {
                            Date dat = formatter.parse(dateInString1);
                            insertdate = dat.getTime();
                        } catch (ParseException e) {
                            // The job is still stored, with insertdate left at 0.
                            e.printStackTrace();
                        }
                        countinsertions += cw.FormQueryTables(insertdate, starttime, finishtime, ip, id, "OK", "OK",
                                newprotein, seqs);
                    }
                } finally {
                    // Close the reader even when reading or inserting fails.
                    fr.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        System.out.println("Total number of jobs = " + totalcount);
        System.out.println(" " + countinserted + " jobs inserted already");
        System.out.println(" " + countAlignments + " jalview jobs");
        System.out.println(" " + countStrange + " not analysed jobs");
        System.out.println(" " + countNoData + " jobs without *.concise.fasta file");
        System.out.println(" " + countUnclearFASTAid + " jobs with unclear FASTA protein id in *.concise.fasta");
        System.out.println(" " + countinsertions + " new job insertions\n");
    }
}