From 3a1addd7da5dfe1337db2805b88429648ef4340b Mon Sep 17 00:00:00 2001 From: Sasha Sherstnev Date: Tue, 12 Nov 2013 15:23:04 +0000 Subject: [PATCH] Re-design the parser for inserting alignment-only and failed jobs --- datadb/compbio/cassandra/JpredParserHTTP.java | 199 +++++++++++++++---------- 1 file changed, 122 insertions(+), 77 deletions(-) diff --git a/datadb/compbio/cassandra/JpredParserHTTP.java b/datadb/compbio/cassandra/JpredParserHTTP.java index 5687a83..bf4c460 100644 --- a/datadb/compbio/cassandra/JpredParserHTTP.java +++ b/datadb/compbio/cassandra/JpredParserHTTP.java @@ -25,7 +25,7 @@ public class JpredParserHTTP implements JpredParser { private String dirprefix; private List alignment; private List predictions; - private String jnetpred; + private int countNoData; public JpredParserHTTP() { dirprefix = "http://www.compbio.dundee.ac.uk/www-jpred/results"; @@ -52,29 +52,31 @@ public class JpredParserHTTP implements JpredParser { } } + /* + * The method parses the Jpred output concise file in the FASTA format. If + * there is a record with ID = QUERY or jobid, this is a "one protein" job; + * otherwise this is an alignment job. + */ private String parsePredictions(final InputStream stream, String jobid) throws FileNotFoundException { final FastaReader fr = new FastaReader(stream); - String query = ""; + String protein = ""; alignment = new ArrayList(); predictions = new ArrayList(); while (fr.hasNext()) { final FastaSequence fs = fr.next(); String seqid = fs.getId(); String seq = fs.getSequence().replaceAll("\n", ""); - if (seqid.equals("QUERY") || seqid.equals(jobid)) { - query = seq; - alignment.add(fs); - } else if (seqid.equals("jnetpred") || seqid.equals("Lupas_21") || seqid.equals("Lupas_14") || seqid.equals("Lupas_28") + if (seqid.equals("jnetpred") || seqid.equals("Lupas_21") || seqid.equals("Lupas_14") || seqid.equals("Lupas_28") || seqid.equals("JNETSOL25") || seqid.equals("JNETSOL5") || seqid.equals("JNETSOL0") || seqid.equals("JNETCONF") - || 
seqid.equals("JNETHMM") || seqid.equals("JNETPSSM")) { + || seqid.equals("JNETHMM") || seqid.equals("JNETPSSM") || seqid.equals("JNETCONF")) { predictions.add(fs); - if (seqid.equals("jnetpred")) - jnetpred = seq; } else { alignment.add(fs); + if (seqid.equals("QUERY") || seqid.equals(jobid)) + protein = seq; } } - return query; + return protein; } private String parseLogFile(final InputStream stream) throws IOException { @@ -100,14 +102,113 @@ public class JpredParserHTTP implements JpredParser { return out; } + private int analyseJob(String[] job) throws IOException { + boolean running = true; + boolean ConcisefileExists = false; + boolean LogfileExists = false; + String id = job[job.length - 1]; + String startdatestring = job[0].substring(0, job[0].indexOf(":")); + Date startdate = new Date(0); + Date starttime = new Date(0); + Date endtime = new Date(0); + Date currDate = new Date(); + String ip = job[2]; + String execstatus = "OK"; + String finalstatus = "OK"; + String protein = ""; + long exectime = 0; + String log = ""; + String maindir = dirprefix + "/" + id + "/"; + String concisefile = dirprefix + "/" + id + "/" + id + ".concise.fasta"; + String archivefile = dirprefix + "/" + id + "/" + id + ".tar.gz"; + String logfile = dirprefix + "/" + id + "/LOG"; + SimpleDateFormat dateformatter = new SimpleDateFormat("yyyy/MM/dd"); + SimpleDateFormat timeformatter = new SimpleDateFormat("yyyy/MM/dd:H:m:s"); + try { + startdate = dateformatter.parse(startdatestring); + starttime = timeformatter.parse(job[0]); + endtime = timeformatter.parse(job[1]); + exectime = (endtime.getTime() - starttime.getTime()) / 1000; + } catch (ParseException e) { + e.printStackTrace(); + } + + try { + URL dirurl = new URL(maindir); + HttpURLConnection httpConnection_dirurl = (HttpURLConnection) dirurl.openConnection(); + if (httpConnection_dirurl.getResponseCode() < 199 || 300 <= httpConnection_dirurl.getResponseCode()) { + return 0; + } + URL conciseurl = new URL(concisefile); + URL 
archiveurl = new URL(archivefile); + URL logurl = new URL(logfile); + HttpURLConnection httpConnection_conciseurl = (HttpURLConnection) conciseurl.openConnection(); + HttpURLConnection httpConnection_logurl = (HttpURLConnection) logurl.openConnection(); + HttpURLConnection httpConnection_archiveurl = (HttpURLConnection) archiveurl.openConnection(); + if (199 < httpConnection_conciseurl.getResponseCode() && httpConnection_conciseurl.getResponseCode() < 300) { + ConcisefileExists = true; + running = false; + try { + protein = parsePredictions(conciseurl.openStream(), id); + } catch (IOException e) { + e.printStackTrace(); + } + } else { + // The job still can be running or failed... + ++countNoData; + alignment = new ArrayList(); + predictions = new ArrayList(); + } + if (199 < httpConnection_logurl.getResponseCode() && httpConnection_logurl.getResponseCode() < 300) { + LogfileExists = true; + log = parseLogFile(logurl.openStream()); + } else { + // The job has not been started at all... + execstatus = "FAIL"; + finalstatus = "STOPPED"; + running = false; + } + if (log.matches("(.*)TIMEOUT\\syour\\sjob\\stimed\\sout(.*)")) { + // blast job was too long (more than 3600 secs by default)... + execstatus = "FAIL"; + finalstatus = "TIMEDOUT"; + running = false; + } else if (log.matches("(.*)Jpred\\serror:\\sDied\\sat(.*)")) { + // an internal Jpred error... + execstatus = "FAIL"; + finalstatus = "JPREDERROR"; + running = false; + } else if ((currDate.getTime() - endtime.getTime()) / 1000 > 3601 && LogfileExists && !ConcisefileExists) { + // the job was stopped for an unknown reason... 
+ execstatus = "FAIL"; + finalstatus = "STOPPED"; + running = false; + } + + httpConnection_conciseurl.disconnect(); + httpConnection_logurl.disconnect(); + httpConnection_archiveurl.disconnect(); + } catch (MalformedURLException e) { + e.printStackTrace(); + } + + if (!running) { + long t = startdate.getTime(); + cw.FormQueryTables(t, job[0], job[1], ip, id, execstatus, finalstatus, protein, predictions); + cw.ArchiveData(t, exectime, ip, id, execstatus, finalstatus, protein, predictions, alignment, log, archivefile); + return 1; + } else + System.out.println("job " + id + " is running"); + + return 0; + } + private void ParsingForDate(String input, String date) { int totalcount = 0; - int countNoData = 0; - int countUnclearFASTAid = 0; int countinsertions = 0; int countinserted = 0; - int counAlignments = 0; - int countStrange = 0; + int countNotanalyzed = 0; + countNoData = 0; System.out.println("Inserting jobs for " + date); try { @@ -118,79 +219,23 @@ public class JpredParserHTTP implements JpredParser { while ((line = alljobs.readLine()) != null) { if (line.matches(date + ":(.*)jp_[^\\s]+")) { - String[] table = line.split("\\s+"); - // Format of a record: - // starttime endtime ip email jobid (directory) - // 013/10/25:21:55:7 2013/10/25:21:59:13 201.239.98.172 - // unknown_email jp_J9HBCBT - String id = table[table.length - 1]; totalcount++; - if (cw.JobisNotInsterted(id)) { - URL dataurl = new URL(dirprefix + "/" + id + "/" + id + ".concise.fasta"); - URL archiveurl = new URL(dirprefix + "/" + id + "/" + id + ".tar.gz"); - URL logurl = new URL(dirprefix + "/" + id + "/LOG"); - HttpURLConnection httpConnection1 = (HttpURLConnection) dataurl.openConnection(); - HttpURLConnection httpConnection2 = (HttpURLConnection) logurl.openConnection(); - HttpURLConnection httpConnection3 = (HttpURLConnection) archiveurl.openConnection(); - int response1 = httpConnection1.getResponseCode(); - int response2 = httpConnection2.getResponseCode(); - if (199 < response1 && 
response1 < 300) { - try { - String protein = parsePredictions(dataurl.openStream(), id); - if (protein.equals("")) { - countUnclearFASTAid++; - } else { - SimpleDateFormat dateformatter = new SimpleDateFormat("yyyy/MM/dd"); - SimpleDateFormat timeformatter = new SimpleDateFormat("yyyy/MM/dd:H:m:s"); - String startdatestring = table[0].substring(0, table[0].indexOf(":")); - try { - Date startdate = dateformatter.parse(startdatestring); - Date starttime = timeformatter.parse(table[0]); - Date endtime = timeformatter.parse(table[1]); - String ip = table[2]; - String execstatus = "OK"; - String finalstatus = "OK"; - countinsertions += cw.FormQueryTables(startdate.getTime(), table[0], table[1], ip, id, execstatus, - finalstatus, protein, predictions); - - long exectime = (endtime.getTime() - starttime.getTime()) / 1000; - String log = ""; - if (199 < response2 && response2 < 300) { - log = parseLogFile(logurl.openStream()); - } - cw.ArchiveData(startdate.getTime(), exectime, ip, id, execstatus, finalstatus, protein, - predictions, alignment, log, archiveurl.toString()); - } catch (ParseException e) { - e.printStackTrace(); - } - } - } catch (IOException e) { - e.printStackTrace(); - } - } else { - countNoData++; - } - httpConnection1.disconnect(); - httpConnection2.disconnect(); - httpConnection3.disconnect(); + String[] job = line.split("\\s+"); + String jobid = job[job.length - 1]; + if (cw.JobisNotInsterted(jobid)) { + countinsertions += analyseJob(job); } else { ++countinserted; } } else { - if (line.matches(date + "(.*)Sequence0/(.*)")) { - ++counAlignments; - } else { - ++countStrange; - } + ++countNotanalyzed; } } alljobs.close(); System.out.println("Total number of jobs = " + totalcount); System.out.println(" " + countinserted + " jobs inserted already"); - System.out.println(" " + counAlignments + " jalview jobs"); - System.out.println(" " + countStrange + " not analysed jobs"); - System.out.println(" " + countNoData + " jobs without *.concise.fasta file"); - 
System.out.println(" " + countUnclearFASTAid + " jobs with unclear FASTA protein id in *.concise.fasta"); + System.out.println(" " + countNotanalyzed + " not analysed jobs"); + System.out.println(" " + countNoData + " jobs without *.concise.fasta file (RUNNING or FAILED)"); System.out.println(" " + countinsertions + " new job insertions\n"); } catch (MalformedURLException e) { e.printStackTrace(); -- 1.7.10.2