package compbio.cassandra;
import java.io.BufferedReader;
+import java.io.FileNotFoundException;
import java.io.IOException;
+import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
-import java.text.ParseException;
-import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import compbio.cassandra.JpredParser;
+import compbio.engine.JpredJob;
public class JpredParserHTTP implements JpredParser {
- private CassandraCreate cc = new CassandraCreate();
+ private CassandraWriter cw = new CassandraWriter();
private String dirprefix;
+ private List<FastaSequence> alignment;
+ private List<FastaSequence> predictions;
+ private int countNoData;
- JpredParserHTTP() {
+ public JpredParserHTTP() {
dirprefix = "http://www.compbio.dundee.ac.uk/www-jpred/results";
}
- JpredParserHTTP(String sourceurl) {
+ public JpredParserHTTP(String sourceurl) {
dirprefix = sourceurl;
}
dirprefix = newsourceprefix;
}
- public void Parsing(String source, int nDays) {
+ public void Parsing(String source, int nDays) throws IOException {
Calendar cal = Calendar.getInstance();
cal.add(Calendar.DATE, -nDays);
for (int i = 0; i < nDays; ++i) {
cal.add(Calendar.DATE, 1);
- int month = cal.get(Calendar.MONTH) + 1;
- int year = cal.get(Calendar.YEAR);
- int day = cal.get(Calendar.DATE);
- String date = year + "/" + month + "/" + day;
+ String date = cal.get(Calendar.YEAR) + "/" + (cal.get(Calendar.MONTH) + 1) + "/" + cal.get(Calendar.DATE);
ParsingForDate(source, date);
}
}
+ /** Record IDs in a concise file that carry Jpred prediction tracks rather than aligned sequences. */
+ private static final String[] PREDICTION_IDS = { "jnetpred", "Lupas_21", "Lupas_14", "Lupas_28", "JNETSOL25", "JNETSOL5",
+         "JNETSOL0", "JNETCONF", "JNETHMM", "JNETPSSM" };
+
+ /** Tells whether a FASTA record ID names one of the prediction tracks listed in PREDICTION_IDS. */
+ private static boolean isPredictionId(String seqid) {
+     for (String id : PREDICTION_IDS) {
+         if (seqid.equals(id)) {
+             return true;
+         }
+     }
+     return false;
+ }
+
+ /**
+  * Parses the Jpred output concise file, which is in FASTA format.
+  *
+  * Records whose ID is a prediction track are collected in the predictions
+  * field; all other records are collected in the alignment field. If there
+  * is a record with ID = QUERY or ID = jobid, this is a "one protein" job;
+  * otherwise this is an alignment job.
+  *
+  * The stream is closed before the method returns, whether or not parsing
+  * succeeded.
+  *
+  * @param stream
+  *            the content of the concise FASTA file; consumed and closed here
+  * @param jobid
+  *            the job identifier, also accepted as the ID of the query record
+  * @return the sequence of the QUERY/jobid record with "\n" characters
+  *         removed (the last such record wins), or an empty string if the
+  *         file has no such record
+  */
+ private String parsePredictions(final InputStream stream, String jobid) throws FileNotFoundException {
+     // Reset first, so records of a previously parsed job are never left in
+     // place, even when the reader cannot be created.
+     alignment = new ArrayList<FastaSequence>();
+     predictions = new ArrayList<FastaSequence>();
+     String protein = "";
+     try {
+         final FastaReader fr = new FastaReader(stream);
+         while (fr.hasNext()) {
+             final FastaSequence fs = fr.next();
+             final String seqid = fs.getId();
+             if (isPredictionId(seqid)) {
+                 predictions.add(fs);
+             } else {
+                 alignment.add(fs);
+                 if (seqid.equals("QUERY") || seqid.equals(jobid)) {
+                     protein = fs.getSequence().replaceAll("\n", "");
+                 }
+             }
+         }
+     } finally {
+         try {
+             stream.close();
+         } catch (IOException e) {
+             // Everything needed has been read already; report and carry on.
+             e.printStackTrace();
+         }
+     }
+     return protein;
+ }
+
+ /**
+  * Reads a Jpred LOG file into one string.
+  *
+  * The lines are appended back to back: line terminators are dropped and
+  * nothing is inserted in their place. The status checks in analyseJob use
+  * String.matches with "(.*)", which does not cross line terminators, so the
+  * single-line form is what lets those patterns see the whole log.
+  *
+  * The bytes are decoded with the platform default charset. The reader, and
+  * with it the given stream, is closed before the method returns.
+  *
+  * @param stream
+  *            the content of the LOG file; consumed and closed here
+  * @return all lines of the log concatenated without separators; an empty
+  *         string for an empty log
+  * @throws IOException
+  *             if reading or closing the stream fails
+  */
+ private String parseLogFile(final InputStream stream) throws IOException {
+     final StringBuilder out = new StringBuilder();
+     final BufferedReader buffer = new BufferedReader(new InputStreamReader(stream));
+     try {
+         String line;
+         while (null != (line = buffer.readLine())) {
+             out.append(line);
+         }
+     } finally {
+         buffer.close();
+     }
+     return out.toString();
+ }
+
+ /** Tells whether an HTTP response code is in the 2xx (success) range. */
+ private static boolean isHttpSuccess(int code) {
+     return 199 < code && code < 300;
+ }
+
+ /**
+  * Inspects the result directory of one Jpred job over HTTP and, if the job
+  * is no longer running, stores it through the Cassandra writer.
+  *
+  * A job counts as finished when its concise FASTA file exists, when it has
+  * no LOG file at all, when the LOG reports a timeout or an internal Jpred
+  * error, or when a LOG exists without a concise file more than 3601 seconds
+  * after the job's end time.
+  *
+  * @param jobinfo
+  *            one whitespace-split record of the job list: elements 0 and 1
+  *            are passed to the JpredJob constructor, element 2 is the
+  *            client IP and the last element is the job id
+  *            (NOTE(review): elements 0 and 1 look like start and end time -
+  *            confirm against JpredJob)
+  * @return 1 if the job was written to the database, 0 if its directory is
+  *         not reachable, a URL is malformed, or the job is still running
+  * @throws IOException
+  *             if talking to the server or reading the LOG file fails
+  */
+ private int analyseJob(String[] jobinfo) throws IOException {
+     // Start from empty record lists: they are only refilled when a concise
+     // file is parsed, and a job without one must not inherit the records of
+     // the job analysed before it.
+     alignment = new ArrayList<FastaSequence>();
+     predictions = new ArrayList<FastaSequence>();
+
+     boolean running = true;
+     boolean conciseFileExists = false;
+     boolean logFileExists = false;
+     JpredJob job = new JpredJob(jobinfo[jobinfo.length - 1], jobinfo[0], jobinfo[1]);
+     job.setIP(jobinfo[2]);
+     Date currDate = new Date();
+     String maindir = dirprefix + "/" + job.getJobID() + "/";
+
+     HttpURLConnection dirConnection = null;
+     HttpURLConnection conciseConnection = null;
+     HttpURLConnection logConnection = null;
+     try {
+         URL dirurl = new URL(maindir);
+         dirConnection = (HttpURLConnection) dirurl.openConnection();
+         if (!isHttpSuccess(dirConnection.getResponseCode())) {
+             // No result directory: nothing to analyse for this job.
+             return 0;
+         }
+         URL conciseurl = new URL(maindir + job.getJobID() + ".concise.fasta");
+         URL logurl = new URL(maindir + "LOG");
+         conciseConnection = (HttpURLConnection) conciseurl.openConnection();
+         logConnection = (HttpURLConnection) logurl.openConnection();
+
+         if (isHttpSuccess(conciseConnection.getResponseCode())) {
+             conciseFileExists = true;
+             running = false;
+             try {
+                 job.setProtein(parsePredictions(conciseurl.openStream(), job.getJobID()));
+             } catch (IOException e) {
+                 // The job is still stored, just without a protein sequence.
+                 e.printStackTrace();
+             }
+         } else {
+             // The job can still be running, or it has failed...
+             ++countNoData;
+         }
+
+         if (isHttpSuccess(logConnection.getResponseCode())) {
+             logFileExists = true;
+             job.setLog(parseLogFile(logurl.openStream()));
+         } else {
+             // The job has not been started at all...
+             job.setExecutionStatus("FAIL");
+             job.setFinalStatus("STOPPED");
+             running = false;
+         }
+
+         // NOTE(review): what getLog() returns when no LOG was fetched depends
+         // on JpredJob; treat null as an empty log so the checks cannot fail
+         // with a NullPointerException.
+         String log = job.getLog();
+         if (log == null) {
+             log = "";
+         }
+         if (log.matches("(.*)TIMEOUT\\syour\\sjob\\stimed\\sout(.*)")) {
+             // blast job was too long (more than 3600 secs by default)...
+             job.setExecutionStatus("FAIL");
+             job.setFinalStatus("TIMEDOUT");
+             running = false;
+         } else if (log.matches("(.*)Jpred\\serror:\\sDied\\sat(.*)")) {
+             // an internal Jpred error...
+             job.setExecutionStatus("FAIL");
+             job.setFinalStatus("JPREDERROR");
+             running = false;
+         } else if ((currDate.getTime() - job.getEndTime()) / 1000 > 3601 && logFileExists && !conciseFileExists) {
+             // the job was stopped for an unknown reason...
+             job.setExecutionStatus("FAIL");
+             job.setFinalStatus("STOPPED");
+             running = false;
+         }
+     } catch (MalformedURLException e) {
+         e.printStackTrace();
+     } finally {
+         // Release the connections on every path: normal completion, the
+         // early return above, and an IOException on its way to the caller.
+         if (dirConnection != null) {
+             dirConnection.disconnect();
+         }
+         if (conciseConnection != null) {
+             conciseConnection.disconnect();
+         }
+         if (logConnection != null) {
+             logConnection.disconnect();
+         }
+     }
+
+     if (!running) {
+         job.setAlignment(alignment);
+         job.setPredictions(predictions);
+         cw.FormQueryTables(job);
+         cw.ArchiveData(job, "undefined");
+         return 1;
+     }
+
+     return 0;
+ }
+
+
private void ParsingForDate(String input, String date) {
int totalcount = 0;
- int countNoData = 0;
- int countUnclearFASTAid = 0;
int countinsertions = 0;
int countinserted = 0;
- int counAlignments = 0;
- int countStrange = 0;
+ int countNotanalyzed = 0;
+ countNoData = 0;
System.out.println("Inserting jobs for " + date);
try {
String line;
while ((line = alljobs.readLine()) != null) {
- if (line.matches(date + "(.*)jp_[^\\s]+")) {
- String[] table = line.split("\\s+");
- // Format of a record:
- // starttime endtime ip email jobid (directory)
- // 013/10/25:21:55:7 2013/10/25:21:59:13 201.239.98.172 unknown_email jp_J9HBCBT
- String id = table[table.length - 1];
+ if (line.matches(date + ":(.*)jp_[^\\s]+")) {
totalcount++;
- if (!cc.CheckID(id)) {
- String datalink = dirprefix + "/" + id + "/" + id + ".concise.fasta";
- URL urltable = new URL(datalink);
- HttpURLConnection httpConnection = (HttpURLConnection) urltable.openConnection();
- int responsecode = httpConnection.getResponseCode();
- if (199 < responsecode && responsecode < 300) {
- try {
- final FastaReader fr = new FastaReader(urltable.openStream());
- final List<FastaSequence> seqs = new ArrayList<FastaSequence>();
- String newprotein = "";
- while (fr.hasNext()) {
- final FastaSequence fs = fr.next();
- if (fs.getId().equals("QUERY") || fs.getId().equals(id))
- newprotein = fs.getSequence().replaceAll("\n", "");
- else
- seqs.add(fs);
- }
- if (newprotein.equals("")) {
- countUnclearFASTAid++;
- } else {
- SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd");
- String dateInString1 = table[0].substring(0, table[0].indexOf(":"));
- long dateWork1 = 0;
- try {
- Date dat1 = formatter.parse(dateInString1);
- dateWork1 = dat1.getTime();
- } catch (ParseException e) {
- e.printStackTrace();
- }
- cc.InsertData(dateWork1, table[0], table[1], table[2], id, "OK", "OK", newprotein, seqs);
- ++countinsertions;
- // flush every 100 insertions
- if (0 == countinsertions % 100) {
- cc.flushData();
- }
- }
- } catch (IOException e) {
- e.printStackTrace();
- }
- } else {
- countNoData++;
- }
+ String[] job = line.split("\\s+");
+ String jobid = job[job.length - 1];
+ if (cw.JobisNotInsterted(jobid)) {
+ countinsertions += analyseJob(job);
} else {
++countinserted;
}
} else {
- if (line.matches(date + "(.*)Sequence0/(.*)")) {
- ++counAlignments;
- } else {
- ++countStrange;
- }
+ ++countNotanalyzed;
}
}
alljobs.close();
System.out.println("Total number of jobs = " + totalcount);
System.out.println(" " + countinserted + " jobs inserted already");
- System.out.println(" " + counAlignments + " jalview jobs");
- System.out.println(" " + countStrange + " not analysed jobs");
- System.out.println(" " + countNoData + " jobs without *.concise.fasta file");
- System.out.println(" " + countUnclearFASTAid + " jobs with unclear FASTA protein id in *.concise.fasta");
+ System.out.println(" " + countNotanalyzed + " not analysed jobs");
+ System.out.println(" " + countNoData + " jobs without *.concise.fasta file (RUNNING or FAILED)");
System.out.println(" " + countinsertions + " new job insertions\n");
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
+ ;
}
}