Re-design the parser for inserting alignment-only and failed jobs
author Sasha Sherstnev <a.sherstnev@dundee.ac.uk>
Tue, 12 Nov 2013 15:23:04 +0000 (15:23 +0000)
committer Sasha Sherstnev <a.sherstnev@dundee.ac.uk>
Tue, 12 Nov 2013 15:23:04 +0000 (15:23 +0000)
datadb/compbio/cassandra/JpredParserHTTP.java

index 5687a83..bf4c460 100644 (file)
@@ -25,7 +25,7 @@ public class JpredParserHTTP implements JpredParser {
        private String dirprefix;
        private List<FastaSequence> alignment;
        private List<FastaSequence> predictions;
-       private String jnetpred;
+       private int countNoData;
 
        public JpredParserHTTP() {
                dirprefix = "http://www.compbio.dundee.ac.uk/www-jpred/results";
@@ -52,29 +52,31 @@ public class JpredParserHTTP implements JpredParser {
                }
        }
 
+       /*
+        * The method parses the Jpred output concise file in the FASTA format. If
+        * there is a record with ID = QUERY or jobid, this is a "one protein" job;
+        * otherwise this is an alignment job.
+        */
        private String parsePredictions(final InputStream stream, String jobid) throws FileNotFoundException {
                final FastaReader fr = new FastaReader(stream);
-               String query = "";
+               String protein = "";
                alignment = new ArrayList<FastaSequence>();
                predictions = new ArrayList<FastaSequence>();
                while (fr.hasNext()) {
                        final FastaSequence fs = fr.next();
                        String seqid = fs.getId();
                        String seq = fs.getSequence().replaceAll("\n", "");
-                       if (seqid.equals("QUERY") || seqid.equals(jobid)) {
-                               query = seq;
-                               alignment.add(fs);
-                       } else if (seqid.equals("jnetpred") || seqid.equals("Lupas_21") || seqid.equals("Lupas_14") || seqid.equals("Lupas_28")
+                       if (seqid.equals("jnetpred") || seqid.equals("Lupas_21") || seqid.equals("Lupas_14") || seqid.equals("Lupas_28")
                                        || seqid.equals("JNETSOL25") || seqid.equals("JNETSOL5") || seqid.equals("JNETSOL0") || seqid.equals("JNETCONF")
-                                       || seqid.equals("JNETHMM") || seqid.equals("JNETPSSM")) {
+                                       || seqid.equals("JNETHMM") || seqid.equals("JNETPSSM") || seqid.equals("JNETCONF")) {
                                predictions.add(fs);
-                               if (seqid.equals("jnetpred"))
-                                       jnetpred = seq;
                        } else {
                                alignment.add(fs);
+                               if (seqid.equals("QUERY") || seqid.equals(jobid))
+                                       protein = seq;
                        }
                }
-               return query;
+               return protein;
        }
 
        private String parseLogFile(final InputStream stream) throws IOException {
@@ -100,14 +102,113 @@ public class JpredParserHTTP implements JpredParser {
                return out;
        }
 
+       private int analyseJob(String[] job) throws IOException {
+               boolean running = true;
+               boolean ConcisefileExists = false;
+               boolean LogfileExists = false;
+               String id = job[job.length - 1];
+               String startdatestring = job[0].substring(0, job[0].indexOf(":"));
+               Date startdate = new Date(0);
+               Date starttime = new Date(0);
+               Date endtime = new Date(0);
+               Date currDate = new Date();
+               String ip = job[2];
+               String execstatus = "OK";
+               String finalstatus = "OK";
+               String protein = "";
+               long exectime = 0;
+               String log = "";
+               String maindir = dirprefix + "/" + id + "/";
+               String concisefile = dirprefix + "/" + id + "/" + id + ".concise.fasta";
+               String archivefile = dirprefix + "/" + id + "/" + id + ".tar.gz";
+               String logfile = dirprefix + "/" + id + "/LOG";
+               SimpleDateFormat dateformatter = new SimpleDateFormat("yyyy/MM/dd");
+               SimpleDateFormat timeformatter = new SimpleDateFormat("yyyy/MM/dd:H:m:s");
+               try {
+                       startdate = dateformatter.parse(startdatestring);
+                       starttime = timeformatter.parse(job[0]);
+                       endtime = timeformatter.parse(job[1]);
+                       exectime = (endtime.getTime() - starttime.getTime()) / 1000;
+               } catch (ParseException e) {
+                       e.printStackTrace();
+               }
+
+               try {
+                       URL dirurl = new URL(maindir);
+                       HttpURLConnection httpConnection_dirurl = (HttpURLConnection) dirurl.openConnection();
+                       if (httpConnection_dirurl.getResponseCode() < 199 || 300 <= httpConnection_dirurl.getResponseCode()) {
+                               return 0;
+                       }
+                       URL conciseurl = new URL(concisefile);
+                       URL archiveurl = new URL(archivefile);
+                       URL logurl = new URL(logfile);
+                       HttpURLConnection httpConnection_conciseurl = (HttpURLConnection) conciseurl.openConnection();
+                       HttpURLConnection httpConnection_logurl = (HttpURLConnection) logurl.openConnection();
+                       HttpURLConnection httpConnection_archiveurl = (HttpURLConnection) archiveurl.openConnection();
+                       if (199 < httpConnection_conciseurl.getResponseCode() && httpConnection_conciseurl.getResponseCode() < 300) {
+                               ConcisefileExists = true;
+                               running = false;
+                               try {
+                                       protein = parsePredictions(conciseurl.openStream(), id);
+                               } catch (IOException e) {
+                                       e.printStackTrace();
+                               }
+                       } else {
+                               // The job still can be running or failed...
+                               ++countNoData;
+                               alignment = new ArrayList<FastaSequence>();
+                               predictions = new ArrayList<FastaSequence>();
+                       }
+                       if (199 < httpConnection_logurl.getResponseCode() && httpConnection_logurl.getResponseCode() < 300) {
+                               LogfileExists = true;
+                               log = parseLogFile(logurl.openStream());
+                       } else {
+                               // The job has not been started at all...
+                               execstatus = "FAIL";
+                               finalstatus = "STOPPED";
+                               running = false;
+                       }
+                       if (log.matches("(.*)TIMEOUT\\syour\\sjob\\stimed\\sout(.*)")) {
+                               // blast job was too long (more than 3600 secs by default)...
+                               execstatus = "FAIL";
+                               finalstatus = "TIMEDOUT";
+                               running = false;
+                       } else if (log.matches("(.*)Jpred\\serror:\\sDied\\sat(.*)")) {
+                               // an internal Jpred error...
+                               execstatus = "FAIL";
+                               finalstatus = "JPREDERROR";
+                               running = false;
+                       } else if ((currDate.getTime() - endtime.getTime()) / 1000 > 3601 && LogfileExists && !ConcisefileExists) {
+                               // the job was stopped for an unknown reason...
+                               execstatus = "FAIL";
+                               finalstatus = "STOPPED";
+                               running = false;
+                       }
+
+                       httpConnection_conciseurl.disconnect();
+                       httpConnection_logurl.disconnect();
+                       httpConnection_archiveurl.disconnect();
+               } catch (MalformedURLException e) {
+                       e.printStackTrace();
+               }
+
+               if (!running) {
+                       long t = startdate.getTime();
+                       cw.FormQueryTables(t, job[0], job[1], ip, id, execstatus, finalstatus, protein, predictions);
+                       cw.ArchiveData(t, exectime, ip, id, execstatus, finalstatus, protein, predictions, alignment, log, archivefile);
+                       return 1;
+               } else
+                       System.out.println("job " + id + " is running");
+
+               return 0;
+       }
+
        private void ParsingForDate(String input, String date) {
                int totalcount = 0;
-               int countNoData = 0;
-               int countUnclearFASTAid = 0;
                int countinsertions = 0;
                int countinserted = 0;
-               int counAlignments = 0;
-               int countStrange = 0;
+               int countNotanalyzed = 0;
+               countNoData = 0;
 
                System.out.println("Inserting jobs for " + date);
                try {
@@ -118,79 +219,23 @@ public class JpredParserHTTP implements JpredParser {
 
                        while ((line = alljobs.readLine()) != null) {
                                if (line.matches(date + ":(.*)jp_[^\\s]+")) {
-                                       String[] table = line.split("\\s+");
-                                       // Format of a record:
-                                       // starttime endtime ip email jobid (directory)
-                                       // 013/10/25:21:55:7 2013/10/25:21:59:13 201.239.98.172
-                                       // unknown_email jp_J9HBCBT
-                                       String id = table[table.length - 1];
                                        totalcount++;
-                                       if (cw.JobisNotInsterted(id)) {
-                                               URL dataurl = new URL(dirprefix + "/" + id + "/" + id + ".concise.fasta");
-                                               URL archiveurl = new URL(dirprefix + "/" + id + "/" + id + ".tar.gz");
-                                               URL logurl = new URL(dirprefix + "/" + id + "/LOG");
-                                               HttpURLConnection httpConnection1 = (HttpURLConnection) dataurl.openConnection();
-                                               HttpURLConnection httpConnection2 = (HttpURLConnection) logurl.openConnection();
-                                               HttpURLConnection httpConnection3 = (HttpURLConnection) archiveurl.openConnection();
-                                               int response1 = httpConnection1.getResponseCode();
-                                               int response2 = httpConnection2.getResponseCode();
-                                               if (199 < response1 && response1 < 300) {
-                                                       try {
-                                                               String protein = parsePredictions(dataurl.openStream(), id);
-                                                               if (protein.equals("")) {
-                                                                       countUnclearFASTAid++;
-                                                               } else {
-                                                                       SimpleDateFormat dateformatter = new SimpleDateFormat("yyyy/MM/dd");
-                                                                       SimpleDateFormat timeformatter = new SimpleDateFormat("yyyy/MM/dd:H:m:s");
-                                                                       String startdatestring = table[0].substring(0, table[0].indexOf(":"));
-                                                                       try {
-                                                                               Date startdate = dateformatter.parse(startdatestring);
-                                                                               Date starttime = timeformatter.parse(table[0]);
-                                                                               Date endtime = timeformatter.parse(table[1]);
-                                                                               String ip = table[2];
-                                                                               String execstatus = "OK";
-                                                                               String finalstatus = "OK";
-                                                                               countinsertions += cw.FormQueryTables(startdate.getTime(), table[0], table[1], ip, id, execstatus,
-                                                                                               finalstatus, protein, predictions);
-
-                                                                               long exectime = (endtime.getTime() - starttime.getTime()) / 1000;
-                                                                               String log = "";
-                                                                               if (199 < response2 && response2 < 300) {
-                                                                                       log = parseLogFile(logurl.openStream());
-                                                                               }
-                                                                               cw.ArchiveData(startdate.getTime(), exectime, ip, id, execstatus, finalstatus, protein,
-                                                                                               predictions, alignment, log, archiveurl.toString());
-                                                                       } catch (ParseException e) {
-                                                                               e.printStackTrace();
-                                                                       }
-                                                               }
-                                                       } catch (IOException e) {
-                                                               e.printStackTrace();
-                                                       }
-                                               } else {
-                                                       countNoData++;
-                                               }
-                                               httpConnection1.disconnect();
-                                               httpConnection2.disconnect();
-                                               httpConnection3.disconnect();
+                                       String[] job = line.split("\\s+");
+                                       String jobid = job[job.length - 1];
+                                       if (cw.JobisNotInsterted(jobid)) {
+                                               countinsertions += analyseJob(job);
                                        } else {
                                                ++countinserted;
                                        }
                                } else {
-                                       if (line.matches(date + "(.*)Sequence0/(.*)")) {
-                                               ++counAlignments;
-                                       } else {
-                                               ++countStrange;
-                                       }
+                                       ++countNotanalyzed;
                                }
                        }
                        alljobs.close();
                        System.out.println("Total number of jobs = " + totalcount);
                        System.out.println("   " + countinserted + " jobs inserted already");
-                       System.out.println("   " + counAlignments + " jalview jobs");
-                       System.out.println("   " + countStrange + " not analysed jobs");
-                       System.out.println("   " + countNoData + " jobs without *.concise.fasta file");
-                       System.out.println("   " + countUnclearFASTAid + " jobs with unclear FASTA protein id in *.concise.fasta");
+                       System.out.println("   " + countNotanalyzed + " not analysed jobs");
+                       System.out.println("   " + countNoData + " jobs without *.concise.fasta file (RUNNING or FAILED)");
                        System.out.println("   " + countinsertions + " new job insertions\n");
                } catch (MalformedURLException e) {
                        e.printStackTrace();