From: Sasha Sherstnev Date: Wed, 18 Jun 2014 07:14:53 +0000 (+0100) Subject: JWS-96 - first working version of the crawler X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=refs%2Fheads%2FJWS-96;p=jabaws.git JWS-96 - first working version of the crawler --- diff --git a/build.xml b/build.xml index fe7b271..5acf11a 100644 --- a/build.xml +++ b/build.xml @@ -353,6 +353,7 @@ + diff --git a/log/log4j.properties.updater b/log/log4j.properties.updater index e6f4a06..16c4d69 100644 --- a/log/log4j.properties.updater +++ b/log/log4j.properties.updater @@ -1,6 +1,6 @@ ## CHANGE THIS (The root directory where to store all the log files) -#logDir = . +logDir = . ## Uncomment to enable JWS2 activity logging to standard out (to the console if available) ## for possible log levels please refer to Log4j documentation http://logging.apache.org/log4j/1.2/manual.html @@ -13,7 +13,7 @@ ## FATAL - log fatal events only ################################################################################################################################## -log4j.rootLogger=DEBUG +log4j.rootLogger=TRACE,R log4j.appender.R=org.apache.log4j.FileAppender log4j.appender.R.File=StatDBupdater.log log4j.appender.R.layout=org.apache.log4j.PatternLayout diff --git a/webservices/compbio/stat/collector/ExecutionStatUpdater.java b/webservices/compbio/stat/collector/ExecutionStatUpdater.java index a5d89d1..4f5d6de 100644 --- a/webservices/compbio/stat/collector/ExecutionStatUpdater.java +++ b/webservices/compbio/stat/collector/ExecutionStatUpdater.java @@ -1,7 +1,6 @@ -/* Copyright (c) 2013 Alexander Sherstnev - * Copyright (c) 2011 Peter Troshin +/* Copyright (c) 2014 Alexander Sherstnev * - * JAva Bioinformatics Analysis Web Services (JABAWS) @version: 2.0 + * JAva Bioinformatics Analysis Web Services (JABAWS) @version: 2.1 * * This library is free software; you can redistribute it and/or modify it under the terms of the * Apache License version 2 as published by the Apache Software Foundation @@ -39,26 +38,11 @@ import compbio.engine.client.PathValidator; import compbio.engine.client.SkeletalExecutable; /** - * Class assumptions: 1. Number of runs of each WS = number of folders with name - * 2. Number of successful runs = all runs with no result file 3. Per period of - * time = limit per file creating time 4. Runtime (avg/max) = finish time - - * start time 5. Task & result size = result.size - * - * Abandoned runs - not collected runs - * - * Cancelled runs - cancelled - * - * Cluster vs local runs - * - * Reasons for failure = look in the err out? - * - * * Metadata required: * * work directory for local and cluster tasks = from Helper or cmd parameter. WS * names - enumeration. Status file names and content. * - * @author Peter Troshin * @author Alexander Sherstnev * */ @@ -67,8 +51,8 @@ class mainJCommander { @Parameter private List parameters = new ArrayList(); - @Parameter(names = { "-log", "-verbose" }, description = "Level of verbosity") - public Integer verbose = 1; + @Parameter(names = { "-h", "-help", "--help" }, help = true, description = "Print help") + public boolean help; @Parameter(names = "-start", description = "Start time") public String starttime; @@ -84,108 +68,57 @@ class mainJCommander { } public class ExecutionStatUpdater { - static SimpleDateFormat DF = new SimpleDateFormat("dd/MM/yyyy hh:mm:ss"); - static SimpleDateFormat shortDF = new SimpleDateFormat("dd/MM/yyyy"); + static SimpleDateFormat DF = new SimpleDateFormat("dd/MM/yyyy"); private static final Logger log = Logger.getLogger(ExecutionStatUpdater.class); - final private File workingDirectory; final private List stats; - /** - * Consider the job that has been working for longer than timeOutInHours - * completed, whatever the outcome - */ - final private int timeOutInHours; - /** - * List subdirectories in the job directory - * - * @param workingDirectory - * @param timeOutInHours - */ - public ExecutionStatUpdater(String workingDirectory, int timeOutInHours) { - log.info("Starting stat collector for directory: " + workingDirectory); - log.info("Maximum allowed runtime(h): " + timeOutInHours); + public ExecutionStatUpdater(String workingDirectory) { + log.info("Starting stat updater for directory: " + workingDirectory); if (!PathValidator.isValidDirectory(workingDirectory)) { throw new IllegalArgumentException("workingDirectory '" + workingDirectory + "' does not exist!"); } this.workingDirectory = new File(workingDirectory); stats = new ArrayList(); - if (timeOutInHours <= 0) { - throw new IllegalArgumentException("Timeout value must be greater than 0! Given value: " + timeOutInHours); - } - this.timeOutInHours = timeOutInHours; } boolean hasCompleted(JobDirectory jd) { JobStat jstat = jd.getJobStat(); - if (jstat.hasResult() || jstat.getIsCancelled() || jstat.getIsFinished() || hasTimedOut(jd)) { + if (jstat.hasResult() || jstat.getIsCancelled() || jstat.getIsFinished()) { return true; } return false; } - boolean hasTimedOut(JobDirectory jd) { - return ((System.currentTimeMillis() - jd.jobdir.lastModified()) / (1000 * 60 * 60)) > timeOutInHours; - } - - /* - * Make sure that collectStatistics methods was called prior to calling - * this! TODO consider running collectStatistics from here on the first call - */ - StatProcessor getStats() { - if (stats.isEmpty()) { - log.info("Please make sure collectStatistics method was called prior to calling getStats()!"); - } - return new StatProcessor(stats); - } - void writeStatToDB(String dbname) throws SQLException { Set rjobs = new HashSet(stats); StatDB statdb = new StatDB(dbname); - log.debug("Removing records that has already been recorded"); + log.debug("Filtering out records that has already been recorded. init Njobs = " + rjobs.size()); statdb.removeRecordedJobs(rjobs); - log.debug("New records left: " + rjobs.size()); + log.debug("Njobs left: " + rjobs.size()); statdb.insertData(rjobs); } - /** - * main function - * @throws ParseException - */ - public static void main(String[] args) throws IOException, SQLException, ParseException { - mainJCommander jct = new mainJCommander(); - new JCommander(jct, args); - String WorkingDir = "jobsout"; - String DBname = "ExecutionStatistic"; + public static void printHelp() { + System.out.println("\nUsage: -dir -db -start -end -h"); + System.out.println("\n[OPTIONS]"); - long StartTime = 0L; - Date currDate = new Date(); - long EndTime = currDate.getTime(); - if (null != jct.starttime) { - Date ST = shortDF.parse(jct.starttime); - if (null != ST) { - StartTime = ST.getTime(); - } - } - if (null != jct.endtime) { - Date ET = shortDF.parse(jct.endtime); - if (null != ET) { - EndTime = ET.getTime(); - } - } - if (null != jct.dbname) { - DBname = jct.dbname; - } - if (null != jct.workingdir) { - WorkingDir = jct.workingdir; - } + System.out.println("\n-dir - a directory with jabaws jobs"); - System.out.println("Start time: " + jct.starttime + " = " + StartTime); - System.out.println("End time: " + jct.endtime + " = " + EndTime); + System.out.println("\n-db - a JABAWS statistics database. If information on"); + System.out.println("jobs from the job directory have been stored in the database,"); + System.out.println("the information is not going to the database [Default:"); + System.out.println("ExecutionStatistic]"); - ExecutionStatUpdater esu = new ExecutionStatUpdater(WorkingDir, 1); - esu.collectStatistics(StartTime, EndTime); - esu.writeStatToDB(DBname); + System.out.println("\n-start - start date for processing jobs. Timestamp of"); + System.out.println("last modification of a job should be later than DATE1. The"); + System.out.println("date format is dd/mm/yy. If the input date format is different"); + System.out.println("the default value is used [Default: 00/00/00]"); + + System.out.println("\n-end - end date for processing jobs. Timestamp of"); + System.out.println("last modification of a job should be earlier than DATE2. The"); + System.out.println("date format is dd/mm/yy. If the input date format is different"); + System.out.println("the default value is used [Default: current date]\n"); } static FileFilter directories = new FileFilter() { @@ -221,7 +154,76 @@ public class ExecutionStatUpdater { log.trace("training input: " + dir.getName() + File.separator + SkeletalExecutable.INPUT); } } - log.debug("Statistics collected!"); + log.debug("Statistics prepared..."); } + /** + * Starts stat DB update program, the only mandatory parameter is a job + * directory. + * + * @param args + * Usage: java -jar -dir -db + * -start -end -h + * + * ACTION [OPTIONS] + * + * -dir - a directory with jabaws jobs + * + * -db - a JABAWS statistics database. If information on + * jobs from the job directory have been stored in the database, + * the information is not going to the database [Default: + * ExecutionStatistic] + * + * -start - start date for processing jobs. Timestamp of + * last modification of a job should be later than DATE1. The + * date format is dd/mm/yy. If the input date format is different + * the default value is used [Default: 00/00/00] + * + * -end - end date for processing jobs. Timestamp of last + * modification of a job should be earlier than DATE2. The date + * format is dd/mm/yy. If the input date format is different the + * default value is used [Default: current date] + * + * @throws ParseException + * + */ + public static void main(String[] args) throws IOException, SQLException, ParseException { + mainJCommander jct = new mainJCommander(); + new JCommander(jct, args); + String WorkingDir = ""; + String DBname = "ExecutionStatistic"; + + long StartTime = 0L; + Date currDate = new Date(); + long EndTime = currDate.getTime(); + if (null != jct.starttime) { + Date ST = DF.parse(jct.starttime); + if (null != ST) { + StartTime = ST.getTime(); + } + } + if (null != jct.endtime) { + Date ET = DF.parse(jct.endtime); + if (null != ET) { + EndTime = ET.getTime(); + } + } + if (null != jct.dbname) { + DBname = jct.dbname; + } + if (null == jct.workingdir || jct.help) { + printHelp(); + return; + } + + WorkingDir = jct.workingdir; + log.trace("Collect statistics from jobs at " + WorkingDir); + log.trace("Start time: " + jct.starttime + " = " + StartTime + " ms"); + log.trace("End time: " + jct.endtime + " = " + EndTime + " ms"); + + ExecutionStatUpdater esu = new ExecutionStatUpdater(WorkingDir); + esu.collectStatistics(StartTime, EndTime); + esu.writeStatToDB(DBname); + return; + } } diff --git a/webservices/compbio/stat/collector/StatDB.java b/webservices/compbio/stat/collector/StatDB.java index 45947a2..9bfeb5d 100644 --- a/webservices/compbio/stat/collector/StatDB.java +++ b/webservices/compbio/stat/collector/StatDB.java @@ -154,6 +154,7 @@ public class StatDB { + "inputsize, resultsize, isCancelled, isCollected, isClusterJob) " + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"; PreparedStatement pstm = conn.prepareStatement(insert); + int i = 0; for (JobStat js : jobstatus) { // Has to present pstm.setString(1, js.webService.toString()); @@ -185,10 +186,12 @@ public class StatDB { pstm.setBoolean(9, js.isCollected); pstm.setBoolean(10, js.isClusterJob()); pstm.executeUpdate(); + ++i; } conn.commit(); conn.setAutoCommit(true); pstm.close(); + log.debug(i + " jobs have been recorded..."); } public Date getEarliestRecord() throws SQLException { @@ -319,6 +322,10 @@ public class StatDB { String query = "select job_id from exec_stat"; + if (null == conn) { + System.out.println ("Something wrong with the DB..."); + return; + } Statement st = conn.createStatement(); ResultSet result = st.executeQuery(query);