package compbio.cassandra;
import java.io.BufferedReader;
+import java.io.FileNotFoundException;
import java.io.IOException;
+import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
-import java.text.ParseException;
-import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import compbio.cassandra.JpredParser;
+import compbio.data.sequence.FastaReader;
+import compbio.data.sequence.FastaSequence;
+import compbio.engine.JpredJob;
+import compbio.engine.ProteoCachePropertyHelperManager;
+import compbio.engine.archive.Archive;
+import compbio.engine.archive.ArchivedJob;
+import compbio.util.PropertyHelper;
+import compbio.util.Util;
public class JpredParserHTTP implements JpredParser {
+ // Writer that persists parsed jobs to Cassandra (replaces the old CassandraCreate helper).
- private CassandraCreate cc = new CassandraCreate();
+ private CassandraWriter cw = new CassandraWriter();
+ // Archive handle; only initialised in Parsing() when the "archive.enable" property is true.
+ private static Archive archive;
+ // Base URL under which the per-job result directories live.
private String dirprefix;
-
- JpredParserHTTP() {
- this.dirprefix = "http://www.compbio.dundee.ac.uk/www-jpred/results";
+ // Per-job accumulators filled by parsePredictions(); re-created at the start of analyseJob().
+ private List<FastaSequence> alignment;
+ private List<FastaSequence> predictions;
+ // Jobs with no *.concise.fasta yet (still running or failed); reset per day in ParsingOneDay().
+ private int countNoData;
+ private static boolean archiving = false;
+ private static final PropertyHelper ph = ProteoCachePropertyHelperManager.getPropertyHelper();
+
+ // Default source: the public Jpred results area.
+ public JpredParserHTTP() {
+ dirprefix = "http://www.compbio.dundee.ac.uk/www-jpred/results";
}
-
- JpredParserHTTP(String sourceurl) {
- this.dirprefix = sourceurl;
+
+ // Creates a parser reading results from the given base URL instead of the default.
+ public JpredParserHTTP(String sourceurl) {
+ dirprefix = sourceurl;
}
- public void setSource (String newsourceprefix) {
- this.dirprefix = newsourceprefix;
+ // Re-points the parser at a different results base URL.
+ public void setSource(String newsourceprefix) {
+ dirprefix = newsourceprefix;
+ }
+
+ // Reads a boolean configuration property via the ProteoCache property helper;
+ // a missing or empty value is treated as false.
+ private boolean initBooleanValue(String key) {
+ assert key != null;
+ String status = ph.getProperty(key);
+ if (Util.isEmpty(status)) {
+ return false;
+ }
+ // NOTE(review): prefer Boolean.parseBoolean(status.trim()) — the Boolean(String)
+ // constructor allocates a needless wrapper and is deprecated in newer JDKs.
+ return new Boolean(status.trim()).booleanValue();
}
+ // Parses the jobs of each of the last nDays days, one ParsingOneDay() call per
+ // day. "source" is handed through to ParsingOneDay(). Archiving of finished
+ // jobs is switched on by the "archive.enable" property.
- public void Parsing(String source, int nDays) {
+ public void Parsing(String source, int nDays) throws IOException {
Calendar cal = Calendar.getInstance();
cal.add(Calendar.DATE, -nDays);
+ archiving = initBooleanValue("archive.enable");
+ if (archiving) {
+ archive = new Archive();
+ }
for (int i = 0; i < nDays; ++i) {
cal.add(Calendar.DATE, 1);
- int month = cal.get(Calendar.MONTH) + 1;
- int year = cal.get(Calendar.YEAR);
- int day = cal.get(Calendar.DATE);
- String date = year + "/" + month + "/" + day;
- ParsingForDate(source, date);
+ // Date key in the "yyyy/M/d" form used by the job list (Calendar.MONTH is 0-based).
+ String date = cal.get(Calendar.YEAR) + "/" + (cal.get(Calendar.MONTH) + 1) + "/" + cal.get(Calendar.DATE);
+ ParsingOneDay(source, date);
+ }
+ }
+
+ /*
+ * Parses the Jpred output concise file in the FASTA format. If there is a
+ * record with ID = QUERY or jobid, this is a "one protein" job; otherwise
+ * this is an alignment job. Prediction tracks are collected into the
+ * "predictions" field, all other records into "alignment". Returns the query
+ * protein sequence, or "" when no QUERY/jobid record was found.
+ */
+ private String parsePredictions(final InputStream stream, String jobid) throws FileNotFoundException {
+ final FastaReader fr = new FastaReader(stream);
+ String protein = "";
+ while (fr.hasNext()) {
+ final FastaSequence fs = fr.next();
+ String seqid = fs.getId();
+ String seq = fs.getSequence().replaceAll("\n", "");
+ // NOTE(review): "JNETCONF" appears twice in this check — verify whether a
+ // different track id (e.g. JNETJURY) was intended for the duplicate.
+ if (seqid.equals("jnetpred") || seqid.equals("Lupas_21") || seqid.equals("Lupas_14") || seqid.equals("Lupas_28")
+ || seqid.equals("JNETSOL25") || seqid.equals("JNETSOL5") || seqid.equals("JNETSOL0") || seqid.equals("JNETCONF")
+ || seqid.equals("JNETHMM") || seqid.equals("JNETPSSM") || seqid.equals("JNETCONF")) {
+ predictions.add(fs);
+ } else {
+ alignment.add(fs);
+ if (seqid.equals("QUERY") || seqid.equals(jobid))
+ protein = seq;
+ }
+ }
+ return protein;
+ }
+
+ // Reads the whole LOG file into a single string. readLine() strips newlines,
+ // so lines are concatenated without separators — the regex checks in
+ // analyseJob() rely on word patterns only, not on line breaks.
+ private String parseLogFile(final InputStream stream) throws IOException {
+ String out = "";
+ // NOTE(review): string concatenation in a loop is O(n^2) — a StringBuilder
+ // would be cheaper. InputStreamReader also uses the platform default charset.
+ BufferedReader buffer = new BufferedReader(new InputStreamReader(stream));
+ String line;
+ while (null != (line = buffer.readLine())) {
+ out += line;
+ }
+ return out;
+ }
+
+ // Examines one job's result directory over HTTP, fills a JpredJob with its
+ // status, LOG text and FASTA predictions, and hands finished jobs to the
+ // Cassandra writer. Returns 1 if the job was inserted, 0 otherwise (directory
+ // unreachable, or the job is still running).
+ // jobinfo is one line of the day's job list split on whitespace; presumably
+ // [0]/[1] are start/end timestamps and [2] the client IP — confirm against
+ // the JpredJob constructor.
+ private int analyseJob(String[] jobinfo) throws IOException {
+ alignment = new ArrayList<FastaSequence>();
+ predictions = new ArrayList<FastaSequence>();
+ boolean running = true;
+ boolean ConcisefileExists = false;
+ boolean LogfileExists = false;
+ JpredJob job = new JpredJob(jobinfo[jobinfo.length - 1], jobinfo[0], jobinfo[1]);
+ job.setIP(jobinfo[2]);
+ Date currDate = new Date();
+ String maindir = dirprefix + "/" + job.getJobID() + "/";
+
+ try {
+ URL dirurl = new URL(maindir);
+ HttpURLConnection httpConnection_dirurl = (HttpURLConnection) dirurl.openConnection();
+ // Give up early when the job directory does not answer with a 2xx code.
+ if (httpConnection_dirurl.getResponseCode() < 199 || 300 <= httpConnection_dirurl.getResponseCode()) {
+ return 0;
+ }
+ URL conciseurl = new URL(maindir + job.getJobID() + ".concise.fasta");
+ URL logurl = new URL(maindir + "LOG");
+ HttpURLConnection httpConnection_conciseurl = (HttpURLConnection) conciseurl.openConnection();
+ HttpURLConnection httpConnection_logurl = (HttpURLConnection) logurl.openConnection();
+ // A readable concise file means the job produced its final output.
+ if (199 < httpConnection_conciseurl.getResponseCode() && httpConnection_conciseurl.getResponseCode() < 300) {
+ ConcisefileExists = true;
+ running = false;
+ try {
+ job.setProtein(parsePredictions(conciseurl.openStream(), job.getJobID()));
+ } catch (IOException e) {
+ // NOTE(review): a parse failure is only logged; the job is still
+ // treated as finished and inserted below.
+ e.printStackTrace();
+ }
+ } else {
+ // The job still can be running or failed...
+ ++countNoData;
+ }
+ if (199 < httpConnection_logurl.getResponseCode() && httpConnection_logurl.getResponseCode() < 300) {
+ LogfileExists = true;
+ job.setLog(parseLogFile(logurl.openStream()));
+ } else {
+ // The job has not been started at all...
+ job.setExecutionStatus("FAIL");
+ job.setFinalStatus("STOPPED");
+ running = false;
+ }
+ if (job.getLog().matches("(.*)TIMEOUT\\syour\\sjob\\stimed\\sout(.*)")) {
+ // blast job was too long (more than 3600 secs by default)...
+ job.setExecutionStatus("FAIL");
+ job.setFinalStatus("TIMEDOUT");
+ running = false;
+ } else if (job.getLog().matches("(.*)Jpred\\serror:\\sDied\\sat(.*)")) {
+ // an internal Jpred error...
+ job.setExecutionStatus("FAIL");
+ job.setFinalStatus("JPREDERROR");
+ running = false;
+ } else if ((currDate.getTime() - job.getEndTime()) / 1000 > 3601 && LogfileExists && !ConcisefileExists) {
+ // the job was stopped with unknown reason...
+ job.setExecutionStatus("FAIL");
+ job.setFinalStatus("STOPPED");
+ running = false;
+ }
+
+ // NOTE(review): disconnect() is skipped on the early "return 0" above and
+ // when an exception escapes — consider a finally block.
+ httpConnection_conciseurl.disconnect();
+ httpConnection_logurl.disconnect();
+ } catch (MalformedURLException e) {
+ e.printStackTrace();
}
+
+ // The job is complete (successfully or not): persist it and, when enabled,
+ // archive the result tarball.
+ if (!running) {
+ job.setAlignment(alignment);
+ job.setPredictions(predictions);
+ cw.FormQueryTables(job);
+ // archiving the job
+ if (archiving) {
+ ArchivedJob ajob = new ArchivedJob(job.getJobID());
+ String arlink = archive.createJob(job.getJobID());
+ if (job.getFinalStatus().equals("OK")) {
+ ajob.setArchivePath(arlink);
+ ajob.copyArchiveFromWeb(maindir + job.getJobID() + ".tar.gz");
+ cw.ArchiveData(job, arlink);
+ } else {
+ cw.ArchiveData(job, "undefined");
+ }
+ }
+ return 1;
+ }
+
+ return 0;
+ }
+ // Parses one day's worth of jobs: each job-list line matching the given date
+ // and a "jp_..." id is analysed and, if not yet stored, inserted. Prints
+ // per-day statistics when done.
+ // NOTE(review): "alljobs" is not declared in this method or in the visible
+ // part of the class — presumably a reader opened from "input" elsewhere in
+ // this file; verify it is initialised before this method runs.
- private void ParsingForDate(String input, String date) {
+ private void ParsingOneDay(String input, String date) {
int totalcount = 0;
- int countNoData = 0;
- int countUnclearFASTAid = 0;
int countinsertions = 0;
int countinserted = 0;
- int counAlignments = 0;
- int countStrange = 0;
+ int countNotanalyzed = 0;
+ countNoData = 0;
System.out.println("Inserting jobs for " + date);
try {
String line;
while ((line = alljobs.readLine()) != null) {
- if (line.matches(date + "(.*)jp_[^\\s]+")) {
- String[] table = line.split("\\s+");
- String id = table[table.length - 1];
+ // The date must be followed by ":" so a bare substring cannot match.
+ if (line.matches(date + ":(.*)jp_[^\\s]+")) {
totalcount++;
- if (!cc.CheckID(id)) {
- URL urltable = new URL(dirprefix + "/" + id + "/" + id + ".concise.fasta");
- HttpURLConnection httpConnection = (HttpURLConnection) urltable.openConnection();
- int responsecode = httpConnection.getResponseCode();
- if (199 < responsecode && responsecode < 300) {
- try {
- final FastaReader fr = new FastaReader(urltable.openStream());
- final List<FastaSequence> seqs = new ArrayList<FastaSequence>();
- String newprotein = "";
- while (fr.hasNext()) {
- final FastaSequence fs = fr.next();
- if (fs.getId().equals("QUERY") || fs.getId().equals(id))
- newprotein = fs.getSequence().replaceAll("\n", "");
- else
- seqs.add(fs);
- }
- if (newprotein.equals("")) {
- countUnclearFASTAid++;
- } else {
- SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd");
- String dateInString1 = table[0].substring(0, table[0].indexOf(":"));
- long dateWork1 = 0;
- try {
- Date dat1 = formatter.parse(dateInString1);
- dateWork1 = dat1.getTime();
- } catch (ParseException e) {
- e.printStackTrace();
- }
- cc.InsertData(dateWork1, table[0], table[1], table[2], id, "OK", "OK", newprotein, seqs);
- ++countinsertions;
- // flush every 100 insertions
- if (0 == countinsertions % 100) {
- cc.flushData();
- }
- }
- } catch (IOException e) {
- e.printStackTrace();
- }
- } else {
- countNoData++;
- }
+ String[] job = line.split("\\s+");
+ String jobid = job[job.length - 1];
+ // JobisNotInsterted [sic — name as declared on CassandraWriter] filters
+ // out jobs already present in the store.
+ if (cw.JobisNotInsterted(jobid)) {
+ countinsertions += analyseJob(job);
} else {
++countinserted;
}
} else {
- if (line.matches(date + "(.*)Sequence0/(.*)")) {
- ++counAlignments;
- } else {
- ++countStrange;
- }
+ // Jalview and other non-job lines are no longer split out separately.
+ ++countNotanalyzed;
}
}
alljobs.close();
System.out.println("Total number of jobs = " + totalcount);
System.out.println(" " + countinserted + " jobs inserted already");
- System.out.println(" " + counAlignments + " jalview jobs");
- System.out.println(" " + countStrange + " not analysed jobs");
- System.out.println(" " + countNoData + " jobs without *.concise.fasta file");
- System.out.println(" " + countUnclearFASTAid + " jobs with unclear FASTA protein id in *.concise.fasta");
+ System.out.println(" " + countNotanalyzed + " not analysed jobs");
+ System.out.println(" " + countNoData + " jobs without *.concise.fasta file (RUNNING or FAILED)");
System.out.println(" " + countinsertions + " new job insertions\n");
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
+ // NOTE(review): stray ";" below is an empty statement — harmless, but should
+ // be removed; likewise the ";" after the closing class brace.
+ ;
}
-}
+};