From: Sasha Sherstnev <a.sherstnev@dundee.ac.uk>
Date: Wed, 18 Jun 2014 07:14:53 +0000 (+0100)
Subject: JWS-96 - first working version of the crawler
X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=a03c9e00fd29ac49f1828d7a50b1da22966b0a8f;p=jabaws.git

JWS-96 - first working version of the crawler
---

diff --git a/build.xml b/build.xml
index fe7b271..5acf11a 100644
--- a/build.xml
+++ b/build.xml
@@ -353,6 +353,7 @@
 			</fileset>
 			<fileset refid="statupdater"/>
 			<zipgroupfileset excludes="META-INF/*" dir="" includes="WEB-INF/lib/log4j-1.2.15.jar" />
+			<zipgroupfileset excludes="META-INF/*" dir="" includes="WEB-INF/lib/derby-10.8.2.2.jar" />
 			<zipgroupfileset excludes="META-INF/*" dir="" includes="lib/jcommander-1.30.jar" />
 			<zipgroupfileset excludes="META-INF/*.SF" dir="${web.lib.path}" >
 				<include name="${compbio-util}"/>
diff --git a/log/log4j.properties.updater b/log/log4j.properties.updater
index e6f4a06..16c4d69 100644
--- a/log/log4j.properties.updater
+++ b/log/log4j.properties.updater
@@ -1,6 +1,6 @@
 
 ## CHANGE THIS (The root directory where to store all the log files)  
-#logDir = .
+logDir = .
 
 ## Uncomment to enable JWS2 activity logging to standard out (to the console if available)
 ## for possible log levels please refer to Log4j documentation http://logging.apache.org/log4j/1.2/manual.html 
@@ -13,7 +13,7 @@
 ## FATAL - log fatal events only
 
 ##################################################################################################################################
-log4j.rootLogger=DEBUG
+log4j.rootLogger=TRACE,R
 log4j.appender.R=org.apache.log4j.FileAppender
 log4j.appender.R.File=StatDBupdater.log
 log4j.appender.R.layout=org.apache.log4j.PatternLayout
diff --git a/webservices/compbio/stat/collector/ExecutionStatUpdater.java b/webservices/compbio/stat/collector/ExecutionStatUpdater.java
index a5d89d1..4f5d6de 100644
--- a/webservices/compbio/stat/collector/ExecutionStatUpdater.java
+++ b/webservices/compbio/stat/collector/ExecutionStatUpdater.java
@@ -1,7 +1,6 @@
-/* Copyright (c) 2013 Alexander Sherstnev
- * Copyright (c) 2011 Peter Troshin
+/* Copyright (c) 2014 Alexander Sherstnev
  *  
- *  JAva Bioinformatics Analysis Web Services (JABAWS) @version: 2.0     
+ *  JAva Bioinformatics Analysis Web Services (JABAWS) @version: 2.1
  * 
  *  This library is free software; you can redistribute it and/or modify it under the terms of the
  *  Apache License version 2 as published by the Apache Software Foundation
@@ -39,26 +38,11 @@ import compbio.engine.client.PathValidator;
 import compbio.engine.client.SkeletalExecutable;
 
 /**
- * Class assumptions: 1. Number of runs of each WS = number of folders with name
- * 2. Number of successful runs = all runs with no result file 3. Per period of
- * time = limit per file creating time 4. Runtime (avg/max) = finish time -
- * start time 5. Task & result size = result.size
- * 
- * Abandoned runs - not collected runs
- * 
- * Cancelled runs - cancelled
- * 
- * Cluster vs local runs
- * 
- * Reasons for failure = look in the err out?
- * 
- * 
  * Metadata required:
  * 
  * work directory for local and cluster tasks = from Helper or cmd parameter. WS
  * names - enumeration. Status file names and content.
  * 
- * @author Peter Troshin
  * @author Alexander Sherstnev
  * 
  */
@@ -67,8 +51,8 @@ class mainJCommander {
 	@Parameter
 	private List<String> parameters = new ArrayList<String>();
 
-	@Parameter(names = { "-log", "-verbose" }, description = "Level of verbosity")
-	public Integer verbose = 1;
+	@Parameter(names = { "-h", "-help", "--help" }, help = true, description = "Print help")
+	public boolean help;
 
 	@Parameter(names = "-start", description = "Start time")
 	public String starttime;
@@ -84,108 +68,57 @@ class mainJCommander {
 }
 
 public class ExecutionStatUpdater {
-	static SimpleDateFormat DF = new SimpleDateFormat("dd/MM/yyyy hh:mm:ss");
-	static SimpleDateFormat shortDF = new SimpleDateFormat("dd/MM/yyyy");
+	static SimpleDateFormat DF = new SimpleDateFormat("dd/MM/yyyy");
 	private static final Logger log = Logger.getLogger(ExecutionStatUpdater.class);
-
 	final private File workingDirectory;
 	final private List<JobStat> stats;
-	/**
-	 * Consider the job that has been working for longer than timeOutInHours
-	 * completed, whatever the outcome
-	 */
-	final private int timeOutInHours;
 
-	/**
-	 * List subdirectories in the job directory
-	 * 
-	 * @param workingDirectory
-	 * @param timeOutInHours
-	 */
-	public ExecutionStatUpdater(String workingDirectory, int timeOutInHours) {
-		log.info("Starting stat collector for directory: " + workingDirectory);
-		log.info("Maximum allowed runtime(h): " + timeOutInHours);
+	public ExecutionStatUpdater(String workingDirectory) {
+		log.info("Starting stat updater for directory: " + workingDirectory);
 		if (!PathValidator.isValidDirectory(workingDirectory)) {
 			throw new IllegalArgumentException("workingDirectory '" + workingDirectory + "' does not exist!");
 		}
 		this.workingDirectory = new File(workingDirectory);
 		stats = new ArrayList<JobStat>();
-		if (timeOutInHours <= 0) {
-			throw new IllegalArgumentException("Timeout value must be greater than 0! Given value: " + timeOutInHours);
-		}
-		this.timeOutInHours = timeOutInHours;
 	}
 
 	boolean hasCompleted(JobDirectory jd) {
 		JobStat jstat = jd.getJobStat();
-		if (jstat.hasResult() || jstat.getIsCancelled() || jstat.getIsFinished() || hasTimedOut(jd)) {
+		if (jstat.hasResult() || jstat.getIsCancelled() || jstat.getIsFinished()) {
 			return true;
 		}
 		return false;
 	}
 
-	boolean hasTimedOut(JobDirectory jd) {
-		return ((System.currentTimeMillis() - jd.jobdir.lastModified()) / (1000 * 60 * 60)) > timeOutInHours;
-	}
-
-	/*
-	 * Make sure that collectStatistics methods was called prior to calling
-	 * this! TODO consider running collectStatistics from here on the first call
-	 */
-	StatProcessor getStats() {
-		if (stats.isEmpty()) {
-			log.info("Please make sure collectStatistics method was called prior to calling getStats()!");
-		}
-		return new StatProcessor(stats);
-	}
-
 	void writeStatToDB(String dbname) throws SQLException {
 		Set<JobStat> rjobs = new HashSet<JobStat>(stats);
 		StatDB statdb = new StatDB(dbname);
-		log.debug("Removing records that has already been recorded");
+		log.debug("Filtering out records that has already been recorded. init Njobs = " + rjobs.size());
 		statdb.removeRecordedJobs(rjobs);
-		log.debug("New records left: " + rjobs.size());
+		log.debug("Njobs left: " + rjobs.size());
 		statdb.insertData(rjobs);
 	}
 
-	/**
-	 * main function
-	 * @throws ParseException 
-	 */
-	public static void main(String[] args) throws IOException, SQLException, ParseException {
-		mainJCommander jct = new mainJCommander();
-		new JCommander(jct, args);
-		String WorkingDir = "jobsout";
-		String DBname = "ExecutionStatistic";
+	public static void printHelp() {
+		System.out.println("\nUsage: <Class or Jar file name> -dir <JOBDIR> -db <StatDB> -start <DATE1> -end <DATE2> -h");
+		System.out.println("\n[OPTIONS]");
 
-		long StartTime = 0L;
-		Date currDate = new Date();
-		long EndTime = currDate.getTime();
-		if (null != jct.starttime) {
-			Date ST = shortDF.parse(jct.starttime);
-			if (null != ST) {
-				StartTime = ST.getTime();
-			}
-		}
-		if (null != jct.endtime) {
-			Date ET = shortDF.parse(jct.endtime);
-			if (null != ET) {
-				EndTime = ET.getTime();
-			}
-		}
-		if (null != jct.dbname) {
-			DBname = jct.dbname;
-		}
-		if (null != jct.workingdir) {
-			WorkingDir = jct.workingdir;
-		}
+		System.out.println("\n-dir <JOBDIR> - a directory with jabaws jobs");
 
-		System.out.println("Start time: " + jct.starttime + " = " + StartTime);
-		System.out.println("End time: " + jct.endtime + " = " + EndTime);
+		System.out.println("\n-db <StatDB> - a JABAWS statistics database. If information on");
+		System.out.println("jobs from the job directory have been stored in the database,");
+		System.out.println("the information is not going to the database [Default:");
+		System.out.println("ExecutionStatistic]");
 
-		ExecutionStatUpdater esu = new ExecutionStatUpdater(WorkingDir, 1);
-		esu.collectStatistics(StartTime, EndTime);
-		esu.writeStatToDB(DBname);
+		System.out.println("\n-start <DATE1> - start date for processing jobs. Timestamp of");
+		System.out.println("last modification of a job should be later than DATE1. The");
+		System.out.println("date format is dd/mm/yy. If the input date format is different");
+		System.out.println("the default value is used [Default: 00/00/00]");
+
+		System.out.println("\n-end <DATE2> - end date for processing jobs. Timestamp of");
+		System.out.println("last modification of a job should be earlier than DATE2. The");
+		System.out.println("date format is dd/mm/yy. If the input date format is different");
+		System.out.println("the default value is used [Default: current date]\n");
 	}
 
 	static FileFilter directories = new FileFilter() {
@@ -221,7 +154,76 @@ public class ExecutionStatUpdater {
 				log.trace("training input: " + dir.getName() + File.separator + SkeletalExecutable.INPUT);
 			}
 		}
-		log.debug("Statistics collected!");
+		log.debug("Statistics prepared...");
 	}
 
+	/**
+	 * Starts stat DB update program, the only mandatory parameter is a job
+	 * directory.
+	 * 
+	 * @param args
+	 *            Usage: java -jar <Jar file name> -dir <JOBDIR> -db <StatDB>
+	 *            -start <DATE1> -end <DATE2> -h
+	 * 
+	 *            ACTION [OPTIONS]
+	 * 
+	 *            -dir <JOBDIR> - a directory with jabaws jobs
+	 * 
+	 *            -db <StatDB> - a JABAWS statistics database. Jobs from the job
+	 *            directory that are already recorded in the database are
+	 *            filtered out and not inserted again [Default:
+	 *            ExecutionStatistic]
+	 * 
+	 *            -start <DATE1> - start date for processing jobs. Timestamp of
+	 *            last modification of a job should be later than DATE1. The
+	 *            date format is dd/MM/yyyy; a date that cannot be parsed makes
+	 *            main throw ParseException [Default: 0 ms, i.e. the epoch]
+	 * 
+	 *            -end <DATE2> - end date for processing jobs. Timestamp of last
+	 *            modification of a job should be earlier than DATE2. The date
+	 *            format is dd/MM/yyyy; a date that cannot be parsed makes main
+	 *            throw ParseException [Default: current date and time]
+	 * 
+	 * @throws ParseException if -start or -end cannot be parsed as dd/MM/yyyy
+	 * 
+	 */
+	public static void main(String[] args) throws IOException, SQLException, ParseException {
+		mainJCommander jct = new mainJCommander();
+		new JCommander(jct, args);
+		String WorkingDir = "";
+		String DBname = "ExecutionStatistic";
+
+		long StartTime = 0L;
+		Date currDate = new Date();
+		long EndTime = currDate.getTime();
+		if (null != jct.starttime) {
+			Date ST = DF.parse(jct.starttime);
+			if (null != ST) {
+				StartTime = ST.getTime();
+			}
+		}
+		if (null != jct.endtime) {
+			Date ET = DF.parse(jct.endtime);
+			if (null != ET) {
+				EndTime = ET.getTime();
+			}
+		}
+		if (null != jct.dbname) {
+			DBname = jct.dbname;
+		}
+		if (null == jct.workingdir || jct.help) {
+			printHelp();
+			return;
+		}
+
+		WorkingDir = jct.workingdir;
+		log.trace("Collect statistics from jobs at " + WorkingDir);
+		log.trace("Start time: " + jct.starttime + " = " + StartTime + " ms");
+		log.trace("End time: " + jct.endtime + " = " + EndTime + " ms");
+
+		ExecutionStatUpdater esu = new ExecutionStatUpdater(WorkingDir);
+		esu.collectStatistics(StartTime, EndTime);
+		esu.writeStatToDB(DBname);
+		return;
+	}
 }
diff --git a/webservices/compbio/stat/collector/StatDB.java b/webservices/compbio/stat/collector/StatDB.java
index 45947a2..9bfeb5d 100644
--- a/webservices/compbio/stat/collector/StatDB.java
+++ b/webservices/compbio/stat/collector/StatDB.java
@@ -154,6 +154,7 @@ public class StatDB {
 				+ "inputsize, resultsize, isCancelled, isCollected, isClusterJob) "
 				+ "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)";
 		PreparedStatement pstm = conn.prepareStatement(insert);
+		int i = 0;
 		for (JobStat js : jobstatus) {
 			// Has to present
 			pstm.setString(1, js.webService.toString());
@@ -185,10 +186,12 @@ public class StatDB {
 			pstm.setBoolean(9, js.isCollected);
 			pstm.setBoolean(10, js.isClusterJob());
 			pstm.executeUpdate();
+			++i;
 		}
 		conn.commit();
 		conn.setAutoCommit(true);
 		pstm.close();
+		log.debug(i + " jobs have been recorded...");
 	}
 
 	public Date getEarliestRecord() throws SQLException {
@@ -319,6 +322,10 @@ public class StatDB {
 
 		String query = "select job_id from exec_stat";
 
+		if (null == conn) {
+			System.out.println ("Something wrong with the DB...");
+			return;
+		}
 		Statement st = conn.createStatement();
 		ResultSet result = st.executeQuery(query);