X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=datadb%2Fcompbio%2Fcassandra%2FJpredParserHTTP.java;h=825689eca19f36ce01f3a08aaf2949a6feaf2431;hb=5fb4cb600b34a9b33e1a96aae9d66cdd1c3201dc;hp=27f66cc0edc6f75ce3064f5d11ab045e8304d5e4;hpb=27060ede81676b3e84f13e2ea2f25836d8c3f6fa;p=proteocache.git diff --git a/datadb/compbio/cassandra/JpredParserHTTP.java b/datadb/compbio/cassandra/JpredParserHTTP.java index 27f66cc..825689e 100644 --- a/datadb/compbio/cassandra/JpredParserHTTP.java +++ b/datadb/compbio/cassandra/JpredParserHTTP.java @@ -1,8 +1,6 @@ package compbio.cassandra; import java.io.BufferedReader; -import java.io.DataInputStream; -import java.io.EOFException; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; @@ -11,27 +9,38 @@ import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; -import java.text.ParseException; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Calendar; import java.util.Date; import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import compbio.cassandra.JpredParser; +import compbio.data.sequence.FastaReader; +import compbio.data.sequence.FastaSequence; +import compbio.engine.JpredJob; +import compbio.engine.ProteoCachePropertyHelperManager; +import compbio.engine.archive.Archive; +import compbio.engine.archive.ArchivedJob; +import compbio.util.PropertyHelper; +import compbio.util.Util; public class JpredParserHTTP implements JpredParser { - private CassandraNativeConnector cc = new CassandraNativeConnector(); + private CassandraWriter cw = new CassandraWriter(); + private static Archive archive; private String dirprefix; private List alignment; private List predictions; - private String jnetpred; + private int countNoData; + private static boolean archiving = false; + private static final PropertyHelper ph = ProteoCachePropertyHelperManager.getPropertyHelper(); - JpredParserHTTP() { + public JpredParserHTTP() { dirprefix = "http://www.compbio.dundee.ac.uk/www-jpred/results"; } - JpredParserHTTP(String sourceurl) { + public JpredParserHTTP(String sourceurl) { dirprefix = sourceurl; } @@ -39,75 +48,188 @@ public class JpredParserHTTP implements JpredParser { dirprefix = newsourceprefix; } + private boolean initBooleanValue(String key) { + assert key != null; + String status = ph.getProperty(key); + if (Util.isEmpty(status)) { + return false; + } + return new Boolean(status.trim()).booleanValue(); + } + public void Parsing(String source, int nDays) throws IOException { Calendar cal = Calendar.getInstance(); cal.add(Calendar.DATE, -nDays); + archiving = initBooleanValue("archive.enable"); + if (archiving) { + archive = new Archive(); + } for (int i = 0; i < nDays; ++i) { cal.add(Calendar.DATE, 1); - int month = cal.get(Calendar.MONTH) + 1; - int year = cal.get(Calendar.YEAR); - int day = cal.get(Calendar.DATE); - String date = year + "/" + month + "/" + day; - ParsingForDate(source, date); + String date = cal.get(Calendar.YEAR) + "/" + (cal.get(Calendar.MONTH) + 1) + "/" + cal.get(Calendar.DATE); + ParsingOneDay(source, date); } } + /* + * The method parses the Jpred output concise file in the FASTA format If + * there is a record with ID = QUERY or jobid, this a "one protein" job + * otherwise this is an alignment job + */ private String parsePredictions(final InputStream stream, String jobid) throws FileNotFoundException { final FastaReader fr = new FastaReader(stream); - String query = ""; - alignment = new ArrayList(); - predictions = new ArrayList(); + String protein = ""; while (fr.hasNext()) { final FastaSequence fs = fr.next(); String seqid = fs.getId(); String seq = fs.getSequence().replaceAll("\n", ""); - if (seqid.equals("QUERY") || seqid.equals(jobid)) { - query = seq; - alignment.add(fs); - } else if (seqid.equals("jnetpred") || seqid.equals("Lupas_21") || seqid.equals("Lupas_14") || seqid.equals("Lupas_28") + if (seqid.equals("jnetpred") || seqid.equals("Lupas_21") || seqid.equals("Lupas_14") || seqid.equals("Lupas_28") || seqid.equals("JNETSOL25") || seqid.equals("JNETSOL5") || seqid.equals("JNETSOL0") || seqid.equals("JNETCONF") - || seqid.equals("JNETHMM") || seqid.equals("JNETPSSM")) { + || seqid.equals("JNETHMM") || seqid.equals("JNETPSSM") || seqid.equals("JNETCONF")) { predictions.add(fs); - if (seqid.equals("jnetpred")) - jnetpred = seq; } else { alignment.add(fs); + if (seqid.equals("QUERY") || seqid.equals(jobid)) + protein = seq; } } - return query; + return protein; } - private String parseLogFile(final InputStream stream) throws IOException { + private String parseSeqFile(final InputStream stream, String jobid) throws FileNotFoundException { + final FastaReader fr = new FastaReader(stream); + String protein = ""; + final FastaSequence fs = fr.next(); + protein = fs.getSequence().replaceAll("\n", ""); + if (fr.hasNext()) { + // this is an aligment job... + return "alignment"; + } + return protein; + } + + private String parseLogFile(final InputStream stream, JpredJob job) throws IOException { String out = ""; BufferedReader buffer = new BufferedReader(new InputStreamReader(stream)); String line; + if (null != (out = buffer.readLine()) && (out.contains("version"))) { + Matcher matcher = Pattern.compile("((\\d|\\.)+)").matcher(out); + if (matcher.find()) + job.setProgramVersion(matcher.group(0)); + } while (null != (line = buffer.readLine())) { - out += line; + out += line; } return out; } - private List parseArchiveFile(final InputStream stream) throws IOException { - DataInputStream data_in = new DataInputStream(stream); - List out = new ArrayList(); - while (true) { - try { - out.add(data_in.readByte()); - } catch (EOFException eof) { - break; + private int analyseJob(String[] jobinfo) throws IOException { + alignment = new ArrayList(); + predictions = new ArrayList(); + boolean running = true; + boolean ConcisefileExists = false; + boolean LogfileExists = false; + JpredJob job = new JpredJob(jobinfo[jobinfo.length - 1], jobinfo[0], jobinfo[1]); + job.setIP(jobinfo[2]); + job.setProgramName("Jpred"); + Date currDate = new Date(); + String maindir = dirprefix + "/" + job.getJobID() + "/"; + + try { + URL dirurl = new URL(maindir); + HttpURLConnection httpConnection_dirurl = (HttpURLConnection) dirurl.openConnection(); + if (httpConnection_dirurl.getResponseCode() < 199 || 300 <= httpConnection_dirurl.getResponseCode()) { + return 0; + } + URL conciseurl = new URL(maindir + job.getJobID() + ".concise.fasta"); + URL logurl = new URL(maindir + "LOG"); + HttpURLConnection httpConnection_conciseurl = (HttpURLConnection) conciseurl.openConnection(); + HttpURLConnection httpConnection_logurl = (HttpURLConnection) logurl.openConnection(); + if (199 < httpConnection_conciseurl.getResponseCode() && httpConnection_conciseurl.getResponseCode() < 300) { + ConcisefileExists = true; + running = false; + try { + job.setProtein(parsePredictions(conciseurl.openStream(), job.getJobID())); + } catch (IOException e) { + e.printStackTrace(); + } + } else { + // The job still can be running of failed... + ++countNoData; + } + if (199 < httpConnection_logurl.getResponseCode() && httpConnection_logurl.getResponseCode() < 300) { + LogfileExists = true; + job.setLog(parseLogFile(logurl.openStream(), job)); + } else { + // The job has not been started at all... + job.setExecutionStatus("FAIL"); + job.setFinalStatus("STOPPED"); + running = false; + } + if (job.getLog().matches("(.*)TIMEOUT\\syour\\sjob\\stimed\\sout(.*)")) { + // blast job was too long (more than 3600 secs by default)... + job.setExecutionStatus("FAIL"); + job.setFinalStatus("TIMEDOUT"); + running = false; + } else if (job.getLog().matches("(.*)Jpred\\serror:\\sDied\\sat(.*)")) { + // an internal Jpred error... + job.setExecutionStatus("FAIL"); + job.setFinalStatus("JPREDERROR"); + running = false; + } else if ((currDate.getTime() - job.getEndTime()) / 1000 > 3601 && LogfileExists && !ConcisefileExists) { + // the job was stopped with unknown reason... + job.setExecutionStatus("FAIL"); + job.setFinalStatus("STOPPED"); + running = false; + } + + httpConnection_conciseurl.disconnect(); + httpConnection_logurl.disconnect(); + } catch (MalformedURLException e) { + e.printStackTrace(); + } + + if (!running) { + // logging the job + job.setAlignment(alignment); + job.setPredictions(predictions); + if (job.getExecutionStatus().equals("FAIL")) { + URL sequrl = new URL(maindir + job.getJobID() + ".seq"); + HttpURLConnection httpConnection_sequrl = (HttpURLConnection) sequrl.openConnection(); + if (199 < httpConnection_sequrl.getResponseCode() && httpConnection_sequrl.getResponseCode() < 300) { + try { + job.setProtein(parseSeqFile(sequrl.openStream(), job.getJobID())); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + cw.FormQueryTables(job); + + // archiving the job + if (archiving) { + ArchivedJob ajob = new ArchivedJob(job.getJobID()); + String arlink = archive.createJob(job.getJobID()); + if (job.getFinalStatus().equals("OK")) { + ajob.setArchivePath(arlink); + ajob.copyArchiveFromWeb(maindir + job.getJobID() + ".tar.gz"); + cw.ArchiveData(job, arlink); + } else { + cw.ArchiveData(job, "undefined"); + } } + return 1; } - return out; + + return 0; } - private void ParsingForDate(String input, String date) { + private void ParsingOneDay(String input, String date) { int totalcount = 0; - int countNoData = 0; - int countUnclearFASTAid = 0; int countinsertions = 0; int countinserted = 0; - int counAlignments = 0; - int countStrange = 0; + int countNotanalyzed = 0; + countNoData = 0; System.out.println("Inserting jobs for " + date); try { @@ -118,79 +240,23 @@ public class JpredParserHTTP implements JpredParser { while ((line = alljobs.readLine()) != null) { if (line.matches(date + ":(.*)jp_[^\\s]+")) { - String[] table = line.split("\\s+"); - // Format of a record: - // starttime endtime ip email jobid (directory) - // 013/10/25:21:55:7 2013/10/25:21:59:13 201.239.98.172 - // unknown_email jp_J9HBCBT - String id = table[table.length - 1]; totalcount++; - if (cc.JobisNotInsterted(id)) { - URL dataurl = new URL(dirprefix + "/" + id + "/" + id + ".concise.fasta"); - URL archiveurl = new URL(dirprefix + "/" + id + "/" + id + ".tar.gz"); - URL logurl = new URL(dirprefix + "/" + id + "/LOG"); - HttpURLConnection httpConnection1 = (HttpURLConnection) dataurl.openConnection(); - HttpURLConnection httpConnection2 = (HttpURLConnection) logurl.openConnection(); - HttpURLConnection httpConnection3 = (HttpURLConnection) archiveurl.openConnection(); - int response1 = httpConnection1.getResponseCode(); - int response2 = httpConnection2.getResponseCode(); - if (199 < response1 && response1 < 300) { - try { - String protein = parsePredictions(dataurl.openStream(), id); - if (protein.equals("")) { - countUnclearFASTAid++; - } else { - SimpleDateFormat dateformatter = new SimpleDateFormat("yyyy/MM/dd"); - SimpleDateFormat timeformatter = new SimpleDateFormat("yyyy/MM/dd:H:m:s"); - String startdatestring = table[0].substring(0, table[0].indexOf(":")); - try { - Date startdate = dateformatter.parse(startdatestring); - Date starttime = timeformatter.parse(table[0]); - Date endtime = timeformatter.parse(table[1]); - String ip = table[2]; - String execstatus = "OK"; - String finalstatus = "OK"; - countinsertions += cc.FormQueryTables(startdate.getTime(), table[0], table[1], ip, id, execstatus, - finalstatus, protein, predictions); - - long exectime = (endtime.getTime() - starttime.getTime()) / 1000; - String log = ""; - if (199 < response2 && response2 < 300) { - log = parseLogFile(logurl.openStream()); - } - cc.ArchiveData(startdate.getTime(), exectime, ip, id, execstatus, finalstatus, protein, - predictions, alignment, log, archiveurl.toString()); - } catch (ParseException e) { - e.printStackTrace(); - } - } - } catch (IOException e) { - e.printStackTrace(); - } - } else { - countNoData++; - } - httpConnection1.disconnect(); - httpConnection2.disconnect(); - httpConnection3.disconnect(); + String[] job = line.split("\\s+"); + String jobid = job[job.length - 1]; + if (cw.JobisNotInsterted(jobid)) { + countinsertions += analyseJob(job); } else { ++countinserted; } } else { - if (line.matches(date + "(.*)Sequence0/(.*)")) { - ++counAlignments; - } else { - ++countStrange; - } + ++countNotanalyzed; } } alljobs.close(); System.out.println("Total number of jobs = " + totalcount); System.out.println(" " + countinserted + " jobs inserted already"); - System.out.println(" " + counAlignments + " jalview jobs"); - System.out.println(" " + countStrange + " not analysed jobs"); - System.out.println(" " + countNoData + " jobs without *.concise.fasta file"); - System.out.println(" " + countUnclearFASTAid + " jobs with unclear FASTA protein id in *.concise.fasta"); + System.out.println(" " + countNotanalyzed + " not analysed jobs"); + System.out.println(" " + countNoData + " jobs without *.concise.fasta file (RUNNING or FAILED)"); System.out.println(" " + countinsertions + " new job insertions\n"); } catch (MalformedURLException e) { e.printStackTrace(); @@ -199,4 +265,4 @@ public class JpredParserHTTP implements JpredParser { } ; } -} +};