Fix problem with new table column names: ProgrammeName -> ProgramName, Version -...
[proteocache.git] / datadb / compbio / cassandra / JpredParserHTTP.java
index 27f66cc..825689e 100644 (file)
@@ -1,8 +1,6 @@
 package compbio.cassandra;
 
 import java.io.BufferedReader;
-import java.io.DataInputStream;
-import java.io.EOFException;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
@@ -11,27 +9,38 @@ import java.net.HttpURLConnection;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.net.URLConnection;
-import java.text.ParseException;
-import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Calendar;
 import java.util.Date;
 import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import compbio.cassandra.JpredParser;
+import compbio.data.sequence.FastaReader;
+import compbio.data.sequence.FastaSequence;
+import compbio.engine.JpredJob;
+import compbio.engine.ProteoCachePropertyHelperManager;
+import compbio.engine.archive.Archive;
+import compbio.engine.archive.ArchivedJob;
+import compbio.util.PropertyHelper;
+import compbio.util.Util;
 
 public class JpredParserHTTP implements JpredParser {
-       private CassandraNativeConnector cc = new CassandraNativeConnector();
+       private CassandraWriter cw = new CassandraWriter();
+       private static Archive archive;
        private String dirprefix;
        private List<FastaSequence> alignment;
        private List<FastaSequence> predictions;
-       private String jnetpred;
+       private int countNoData;
+       private static boolean archiving = false;
+       private static final PropertyHelper ph = ProteoCachePropertyHelperManager.getPropertyHelper();
 
-       JpredParserHTTP() {
+       public JpredParserHTTP() {
                dirprefix = "http://www.compbio.dundee.ac.uk/www-jpred/results";
        }
 
-       JpredParserHTTP(String sourceurl) {
+       public JpredParserHTTP(String sourceurl) {
                dirprefix = sourceurl;
        }
 
@@ -39,75 +48,188 @@ public class JpredParserHTTP implements JpredParser {
                dirprefix = newsourceprefix;
        }
 
+       private boolean initBooleanValue(String key) {
+               assert key != null;
+               String status = ph.getProperty(key);
+               if (Util.isEmpty(status)) {
+                       return false;
+               }
+               return new Boolean(status.trim()).booleanValue();
+       }
+
        public void Parsing(String source, int nDays) throws IOException {
                Calendar cal = Calendar.getInstance();
                cal.add(Calendar.DATE, -nDays);
+               archiving = initBooleanValue("archive.enable");
+               if (archiving) {
+                       archive = new Archive();
+               }
                for (int i = 0; i < nDays; ++i) {
                        cal.add(Calendar.DATE, 1);
-                       int month = cal.get(Calendar.MONTH) + 1;
-                       int year = cal.get(Calendar.YEAR);
-                       int day = cal.get(Calendar.DATE);
-                       String date = year + "/" + month + "/" + day;
-                       ParsingForDate(source, date);
+                       String date = cal.get(Calendar.YEAR) + "/" + (cal.get(Calendar.MONTH) + 1) + "/" + cal.get(Calendar.DATE);
+                       ParsingOneDay(source, date);
                }
        }
 
+       /*
+        * The method parses the Jpred concise output file in the FASTA format. If
+        * there is a record with ID = QUERY or jobid, this is a "one protein" job;
+        * otherwise this is an alignment job.
+        */
        private String parsePredictions(final InputStream stream, String jobid) throws FileNotFoundException {
                final FastaReader fr = new FastaReader(stream);
-               String query = "";
-               alignment = new ArrayList<FastaSequence>();
-               predictions = new ArrayList<FastaSequence>();
+               String protein = "";
                while (fr.hasNext()) {
                        final FastaSequence fs = fr.next();
                        String seqid = fs.getId();
                        String seq = fs.getSequence().replaceAll("\n", "");
-                       if (seqid.equals("QUERY") || seqid.equals(jobid)) {
-                               query = seq;
-                               alignment.add(fs);
-                       } else if (seqid.equals("jnetpred") || seqid.equals("Lupas_21") || seqid.equals("Lupas_14") || seqid.equals("Lupas_28")
+                       if (seqid.equals("jnetpred") || seqid.equals("Lupas_21") || seqid.equals("Lupas_14") || seqid.equals("Lupas_28")
                                        || seqid.equals("JNETSOL25") || seqid.equals("JNETSOL5") || seqid.equals("JNETSOL0") || seqid.equals("JNETCONF")
-                                       || seqid.equals("JNETHMM") || seqid.equals("JNETPSSM")) {
+                                       || seqid.equals("JNETHMM") || seqid.equals("JNETPSSM") || seqid.equals("JNETCONF")) {
                                predictions.add(fs);
-                               if (seqid.equals("jnetpred"))
-                                       jnetpred = seq;
                        } else {
                                alignment.add(fs);
+                               if (seqid.equals("QUERY") || seqid.equals(jobid))
+                                       protein = seq;
                        }
                }
-               return query;
+               return protein;
        }
 
-       private String parseLogFile(final InputStream stream) throws IOException {
+       private String parseSeqFile(final InputStream stream, String jobid) throws FileNotFoundException {
+               final FastaReader fr = new FastaReader(stream);
+               String protein = "";
+               final FastaSequence fs = fr.next();
+               protein = fs.getSequence().replaceAll("\n", "");
+               if (fr.hasNext()) {
+                       // this is an alignment job...
+                       return "alignment";
+               }
+               return protein;
+       }
+
+       private String parseLogFile(final InputStream stream, JpredJob job) throws IOException {
                String out = "";
                BufferedReader buffer = new BufferedReader(new InputStreamReader(stream));
                String line;
+               if (null != (out = buffer.readLine()) && (out.contains("version"))) {
+                       Matcher matcher = Pattern.compile("((\\d|\\.)+)").matcher(out);
+                       if (matcher.find())
+                               job.setProgramVersion(matcher.group(0));
+               }
                while (null != (line = buffer.readLine())) {
-                       out += line;
+                       out += line;            
                }
                return out;
        }
 
-       private List<Byte> parseArchiveFile(final InputStream stream) throws IOException {
-               DataInputStream data_in = new DataInputStream(stream);
-               List<Byte> out = new ArrayList<Byte>();
-               while (true) {
-                       try {
-                               out.add(data_in.readByte());
-                       } catch (EOFException eof) {
-                               break;
+       private int analyseJob(String[] jobinfo) throws IOException {
+               alignment = new ArrayList<FastaSequence>();
+               predictions = new ArrayList<FastaSequence>();
+               boolean running = true;
+               boolean ConcisefileExists = false;
+               boolean LogfileExists = false;
+               JpredJob job = new JpredJob(jobinfo[jobinfo.length - 1], jobinfo[0], jobinfo[1]);
+               job.setIP(jobinfo[2]);
+               job.setProgramName("Jpred");
+               Date currDate = new Date();
+               String maindir = dirprefix + "/" + job.getJobID() + "/";
+
+               try {
+                       URL dirurl = new URL(maindir);
+                       HttpURLConnection httpConnection_dirurl = (HttpURLConnection) dirurl.openConnection();
+                       if (httpConnection_dirurl.getResponseCode() < 199 || 300 <= httpConnection_dirurl.getResponseCode()) {
+                               return 0;
+                       }
+                       URL conciseurl = new URL(maindir + job.getJobID() + ".concise.fasta");
+                       URL logurl = new URL(maindir + "LOG");
+                       HttpURLConnection httpConnection_conciseurl = (HttpURLConnection) conciseurl.openConnection();
+                       HttpURLConnection httpConnection_logurl = (HttpURLConnection) logurl.openConnection();
+                       if (199 < httpConnection_conciseurl.getResponseCode() && httpConnection_conciseurl.getResponseCode() < 300) {
+                               ConcisefileExists = true;
+                               running = false;
+                               try {                           
+                                       job.setProtein(parsePredictions(conciseurl.openStream(), job.getJobID()));
+                               } catch (IOException e) {
+                                       e.printStackTrace();
+                               }
+                       } else {
+                               // The job can still be running or failed...
+                               ++countNoData;
+                       }
+                       if (199 < httpConnection_logurl.getResponseCode() && httpConnection_logurl.getResponseCode() < 300) {
+                               LogfileExists = true;
+                               job.setLog(parseLogFile(logurl.openStream(), job));
+                       } else {
+                               // The job has not been started at all...
+                               job.setExecutionStatus("FAIL");
+                               job.setFinalStatus("STOPPED");
+                               running = false;
+                       }
+                       if (job.getLog().matches("(.*)TIMEOUT\\syour\\sjob\\stimed\\sout(.*)")) {
+                               // blast job was too long (more than 3600 secs by default)...
+                               job.setExecutionStatus("FAIL");
+                               job.setFinalStatus("TIMEDOUT");
+                               running = false;
+                       } else if (job.getLog().matches("(.*)Jpred\\serror:\\sDied\\sat(.*)")) {
+                               // an internal Jpred error...
+                               job.setExecutionStatus("FAIL");
+                               job.setFinalStatus("JPREDERROR");
+                               running = false;
+                       } else if ((currDate.getTime() - job.getEndTime()) / 1000 > 3601 && LogfileExists && !ConcisefileExists) {
+                               // the job was stopped for an unknown reason...
+                               job.setExecutionStatus("FAIL");
+                               job.setFinalStatus("STOPPED");
+                               running = false;
+                       }
+
+                       httpConnection_conciseurl.disconnect();
+                       httpConnection_logurl.disconnect();
+               } catch (MalformedURLException e) {
+                       e.printStackTrace();
+               }
+
+               if (!running) {
+                       // logging the job
+                       job.setAlignment(alignment);
+                       job.setPredictions(predictions);
+                       if (job.getExecutionStatus().equals("FAIL")) {
+                               URL sequrl = new URL(maindir + job.getJobID() + ".seq");
+                               HttpURLConnection httpConnection_sequrl = (HttpURLConnection) sequrl.openConnection();
+                               if (199 < httpConnection_sequrl.getResponseCode() && httpConnection_sequrl.getResponseCode() < 300) {
+                                       try {
+                                               job.setProtein(parseSeqFile(sequrl.openStream(), job.getJobID()));
+                                       } catch (IOException e) {
+                                               e.printStackTrace();
+                                       }
+                               }
+                       }
+                       cw.FormQueryTables(job);
+
+                       // archiving the job
+                       if (archiving) {
+                               ArchivedJob ajob = new ArchivedJob(job.getJobID());
+                               String arlink = archive.createJob(job.getJobID());
+                               if (job.getFinalStatus().equals("OK")) {
+                                       ajob.setArchivePath(arlink);
+                                       ajob.copyArchiveFromWeb(maindir + job.getJobID() + ".tar.gz");
+                                       cw.ArchiveData(job, arlink);
+                               } else {
+                                       cw.ArchiveData(job, "undefined");
+                               }
                        }
+                       return 1;
                }
-               return out;
+
+               return 0;
        }
 
-       private void ParsingForDate(String input, String date) {
+       private void ParsingOneDay(String input, String date) {
                int totalcount = 0;
-               int countNoData = 0;
-               int countUnclearFASTAid = 0;
                int countinsertions = 0;
                int countinserted = 0;
-               int counAlignments = 0;
-               int countStrange = 0;
+               int countNotanalyzed = 0;
+               countNoData = 0;
 
                System.out.println("Inserting jobs for " + date);
                try {
@@ -118,79 +240,23 @@ public class JpredParserHTTP implements JpredParser {
 
                        while ((line = alljobs.readLine()) != null) {
                                if (line.matches(date + ":(.*)jp_[^\\s]+")) {
-                                       String[] table = line.split("\\s+");
-                                       // Format of a record:
-                                       // starttime endtime ip email jobid (directory)
-                                       // 013/10/25:21:55:7 2013/10/25:21:59:13 201.239.98.172
-                                       // unknown_email jp_J9HBCBT
-                                       String id = table[table.length - 1];
                                        totalcount++;
-                                       if (cc.JobisNotInsterted(id)) {
-                                               URL dataurl = new URL(dirprefix + "/" + id + "/" + id + ".concise.fasta");
-                                               URL archiveurl = new URL(dirprefix + "/" + id + "/" + id + ".tar.gz");
-                                               URL logurl = new URL(dirprefix + "/" + id + "/LOG");
-                                               HttpURLConnection httpConnection1 = (HttpURLConnection) dataurl.openConnection();
-                                               HttpURLConnection httpConnection2 = (HttpURLConnection) logurl.openConnection();
-                                               HttpURLConnection httpConnection3 = (HttpURLConnection) archiveurl.openConnection();
-                                               int response1 = httpConnection1.getResponseCode();
-                                               int response2 = httpConnection2.getResponseCode();
-                                               if (199 < response1 && response1 < 300) {
-                                                       try {
-                                                               String protein = parsePredictions(dataurl.openStream(), id);
-                                                               if (protein.equals("")) {
-                                                                       countUnclearFASTAid++;
-                                                               } else {
-                                                                       SimpleDateFormat dateformatter = new SimpleDateFormat("yyyy/MM/dd");
-                                                                       SimpleDateFormat timeformatter = new SimpleDateFormat("yyyy/MM/dd:H:m:s");
-                                                                       String startdatestring = table[0].substring(0, table[0].indexOf(":"));
-                                                                       try {
-                                                                               Date startdate = dateformatter.parse(startdatestring);
-                                                                               Date starttime = timeformatter.parse(table[0]);
-                                                                               Date endtime = timeformatter.parse(table[1]);
-                                                                               String ip = table[2];
-                                                                               String execstatus = "OK";
-                                                                               String finalstatus = "OK";
-                                                                               countinsertions += cc.FormQueryTables(startdate.getTime(), table[0], table[1], ip, id, execstatus,
-                                                                                               finalstatus, protein, predictions);
-
-                                                                               long exectime = (endtime.getTime() - starttime.getTime()) / 1000;
-                                                                               String log = "";
-                                                                               if (199 < response2 && response2 < 300) {
-                                                                                       log = parseLogFile(logurl.openStream());
-                                                                               }
-                                                                               cc.ArchiveData(startdate.getTime(), exectime, ip, id, execstatus, finalstatus, protein,
-                                                                                               predictions, alignment, log, archiveurl.toString());
-                                                                       } catch (ParseException e) {
-                                                                               e.printStackTrace();
-                                                                       }
-                                                               }
-                                                       } catch (IOException e) {
-                                                               e.printStackTrace();
-                                                       }
-                                               } else {
-                                                       countNoData++;
-                                               }
-                                               httpConnection1.disconnect();
-                                               httpConnection2.disconnect();
-                                               httpConnection3.disconnect();
+                                       String[] job = line.split("\\s+");
+                                       String jobid = job[job.length - 1];
+                                       if (cw.JobisNotInsterted(jobid)) {
+                                               countinsertions += analyseJob(job);
                                        } else {
                                                ++countinserted;
                                        }
                                } else {
-                                       if (line.matches(date + "(.*)Sequence0/(.*)")) {
-                                               ++counAlignments;
-                                       } else {
-                                               ++countStrange;
-                                       }
+                                       ++countNotanalyzed;
                                }
                        }
                        alljobs.close();
                        System.out.println("Total number of jobs = " + totalcount);
                        System.out.println("   " + countinserted + " jobs inserted already");
-                       System.out.println("   " + counAlignments + " jalview jobs");
-                       System.out.println("   " + countStrange + " not analysed jobs");
-                       System.out.println("   " + countNoData + " jobs without *.concise.fasta file");
-                       System.out.println("   " + countUnclearFASTAid + " jobs with unclear FASTA protein id in *.concise.fasta");
+                       System.out.println("   " + countNotanalyzed + " not analysed jobs");
+                       System.out.println("   " + countNoData + " jobs without *.concise.fasta file (RUNNING or FAILED)");
                        System.out.println("   " + countinsertions + " new job insertions\n");
                } catch (MalformedURLException e) {
                        e.printStackTrace();
@@ -199,4 +265,4 @@ public class JpredParserHTTP implements JpredParser {
                }
                ;
        }
-}
+};