#!/usr/bin/perl

=head1 NAME

arco_stats.pl - script to collate SGE job stats from ARCo

=cut

use strict;
use warnings;

use Getopt::Long;
use Pod::Usage;
use File::Basename;
use GD::Graph::bars;
use DBI;

# path for nicer fonts for the graph labels
my $FONTPATH = "/homes/www-jpred/live/public_html/fonts/";

# month abbreviations, indexed by the 0-based month returned by localtime()
my @month = qw( Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec );

# command-line options and their defaults
my $user = 'www-jpred';
my $out;
my $xDim     = 700;
my $yDim     = 400;
my $showPlot = 0;
my $showCSV  = 1;
my $runStats = 0;
my $quiet    = 0;
my $help;
my $man;

GetOptions(
    'user=s'      => \$user,
    'x=i'         => \$xDim,
    'y=i'         => \$yDim,
    'plot!'       => \$showPlot,
    'csv!'        => \$showCSV,
    'run-stats=i' => \$runStats,
    'out=s'       => \$out,
    'quiet'       => \$quiet,
    'man'         => \$man,
    'help|?'      => \$help,
) or pod2usage();

pod2usage( -verbose => 2 ) if ($man);
pod2usage( -verbose => 1 ) if ($help);

# NOTE(review): the database credentials are hard-coded here; consider moving
# them to a protected config file or the environment.
my $dbh = DBI->connect( "dbi:Pg:host=postgres.compbio.dundee.ac.uk;dbname=arco", 'account', 'saffron' )
  or die "ERROR - can't connect to db: ", $DBI::errstr;

# generate output prefix name unless already specified
$out = "${user}_stats" unless ($out);

# find out the current month and year. Correct year to real 4-digit number.
# These three values are also read by print_run_stats().
my ( $currDate, $currMon, $currYr ) = ( localtime() )[ 3 .. 5 ];
$currYr += 1900;

my $monthly = get_monthly($dbh);
warn "Warning - no monthly data found\n" unless ( scalar @{$monthly} );

my $daily = get_daily( $dbh, $currMon, $currYr );
warn "Warning - no daily data found for $month[$currMon] $currYr\n" unless ( scalar @{$daily} );

print_run_stats( $dbh, $runStats ) if ($runStats);

$dbh->disconnect;

if ($showPlot) {
    print "Drawing plots...\n" unless ($quiet);
    draw_graph( $monthly, 'Month', "Monthly Totals",                    "${out}_monthly.png" );
    draw_graph( $daily,   'Date',  "Daily Totals for $month[$currMon]", "${out}_daily.png" );
}

if ($showCSV) {
    print "Writing CSV files...\n" unless ($quiet);
    print_data( $daily,   "${out}_daily.csv" );
    print_data( $monthly, "${out}_monthly.csv" );
}

print "Finished!\n" unless ($quiet);
exit;

#########################################################################################################
# count the number of jobs per month since records began
#
# Takes:   a connected DBI database handle.
# Reads:   the file-level $user and @month.
# Returns: a reference to a two-row array in the layout GD::Graph expects
#          ([labels], [counts]); the array is empty when no rows are found.
# Dies if the statement cannot be prepared or executed.
sub get_monthly {
    my $dbh = shift;

    # retrieve all jobs run and give the epoch time they started.
    # has a kludge to remove some extraneous run info for Aug 2008.
    # The username is bound as a placeholder rather than interpolated because
    # it comes straight from the command line.
    my $sth = $dbh->prepare(
        "SELECT EXTRACT(EPOCH FROM start_time) AS epoch FROM view_accounting WHERE username = ? AND submission_time > '2008-sep-01'::Date ORDER BY epoch ASC"
    ) or die "ERROR - unable to prepare SELECT statement: ", $dbh->errstr();
    $sth->execute($user)
      or die "ERROR - unable to execute SELECT statement: ", $sth->errstr();

    # foreach epoch time retrieve month and year
    # and count the number of jobs run per month
    my %data;
    while ( my @row = $sth->fetchrow_array ) {
        my ( $mnth, $year ) = ( localtime( $row[0] ) )[ 4 .. 5 ];
        $year += 1900;
        $data{$year}{$mnth}++;
    }

    # convert month counts into data structure readable by GD::Graph.
    # Rows are only created when there is data, so callers can test
    # emptiness with 'scalar @{$ref}'.
    my @sortedData;
    foreach my $year ( sort { $a <=> $b } keys %data ) {
        foreach my $mon ( sort { $a <=> $b } keys %{ $data{$year} } ) {

            # label uses a 2-digit year, e.g. "Sep 08"
            my $date = sprintf "%s %02d", $month[$mon], $year % 100;
            push @{ $sortedData[0] }, $date;
            push @{ $sortedData[1] }, $data{$year}{$mon};
        }
    }

    return ( \@sortedData );
}

#########################################################################################################
# count the number of jobs per day of current month
#
# Takes:   a connected DBI database handle, a 0-based month number and a
#          4-digit year.
# Reads:   the file-level $user.
# Returns: a reference to a two-row array ([day-of-month], [counts]);
#          the array is empty when no rows are found.
# Dies if the statement cannot be prepared or executed.
sub get_daily {
    my ( $dbh, $mnth, $year ) = @_;

    # first day of the requested month as an ISO date, bound as a parameter
    my $monthStart = sprintf "%04d-%02d-01", $year, $mnth + 1;

    # retrieve the number of jobs run per day during this month
    my $sth = $dbh->prepare(
        "SELECT DISTINCT(CAST(start_time AS DATE)) AS start_date, COUNT(CAST(start_time AS DATE)) FROM view_accounting WHERE username = ? AND start_time >= CAST(? AS DATE) GROUP BY start_date ORDER BY start_date ASC"
    ) or die "ERROR - unable to prepare SELECT statement: ", $dbh->errstr();
    $sth->execute( $user, $monthStart )
      or die "ERROR - unable to execute SELECT statement: ", $sth->errstr();

    # generate data structure for GD::Graph with day counts
    my @data;
    while ( my @row = $sth->fetchrow_array ) {

        # $row[0] is the date
        # $row[1] is the count
        my $date = ( split( /-/, $row[0] ) )[2];
        push @{ $data[0] }, $date;
        push @{ $data[1] }, $row[1];
    }
    $sth->finish();

    return ( \@data );
}

#########################################################################################################
# print out specific stats relating to run time, queuing time and exit status
#
# Takes:   a connected DBI database handle and the number of days to look back.
# Reads:   the file-level $user, @month, $currYr, $currMon and $currDate.
# Effect:  appends one line to 'run_stats.csv' in the current directory:
#            date,nJobs,mean wallclock,mean maxvmem,mean wait,timeouts,errors
#          When no jobs are found a warning is issued and the five statistic
#          fields are written as zeros.
# Dies if the query fails or the file cannot be opened or closed.
sub print_run_stats {
    my ( $dbh, $days ) = @_;

    # get the date n days ago
    my $secsInDays = 86400 * $days;    # num seconds in a day * number of days
    my $daysAgo    = ( time() - $secsInDays );
    my ( $date, $mnth, $year ) = ( localtime($daysAgo) )[ 3 .. 5 ];
    $year += 1900;
    my $since = sprintf "%04d-%02d-%02d", $year, $mnth + 1, $date;

    # retrieve run-specific stats for the user
    my $sth = $dbh->prepare(
        "SELECT wallclock_time, maxvmem, exit_status, EXTRACT(EPOCH FROM start_time - submission_time) AS wait_time FROM view_accounting WHERE username = ? AND submission_time >= CAST(? AS DATE)"
    ) or die "ERROR - unable to prepare SELECT statement: ", $dbh->errstr();
    $sth->execute( $user, $since )
      or die "ERROR - unable to execute SELECT statement: ", $sth->errstr();

    # collate useful data. Rows are counted while fetching because
    # $sth->rows() is not dependable for SELECT statements.
    my $nRows = 0;
    my %data  = ( runtime => 0, vmem => 0, waittime => 0, timeouts => 0, errors => 0 );
    while ( my @row = $sth->fetchrow_array ) {

        # treat NULL columns as zero
        my ( $wallclock, $maxvmem, $exitStatus, $waitTime ) = map { defined $_ ? $_ : 0 } @row;

        ++$nRows;
        $data{runtime}  += $wallclock;
        $data{vmem}     += $maxvmem;
        $data{waittime} += $waitTime;

        # an exit status of 4 is counted as a timeout, any other
        # positive status as an error
        if ( $exitStatus > 0 ) {
            if ( $exitStatus == 4 ) {
                $data{timeouts}++;
            }
            else {
                $data{errors}++;
            }
        }
    }

    # open stats file
    my $file = 'run_stats.csv';
    open( my $OUT, ">>", $file ) or die "ERROR - unable to open '$file' for write: ${!}\nDied";
    print {$OUT} "$currYr-$month[$currMon]-$currDate,$nRows,";

    if ( $nRows == 0 ) {

        # if no jobs run in time frame warn and set everything to zero
        warn "Warning - no jobs found for user '$user' in the last $days days\n";
        print {$OUT} "0,0,0,0,0\n";
    }
    else {

        # write out per-job means followed by the failure counts
        foreach my $k (qw(runtime vmem waittime)) {
            printf {$OUT} "%.0f,", $data{$k} / $nRows;
        }
        print {$OUT} "$data{timeouts},$data{errors}\n";
    }

    close($OUT) or die "ERROR - unable to close '$file': ${!}\nDied";
    return;
}

#########################################################################################################
# draw a bar chart of the given data and save it as a PNG file
#
# Takes:   a reference to a two-row data array ([labels], [counts]), the
#          x-axis label, the plot title and the output filename.
# Reads:   the file-level $FONTPATH, $xDim and $yDim (--x and --y options).
# Effect:  writes the PNG to $outFile. Warns and returns without writing
#          anything when the data array is empty.
# Dies on GD::Graph errors or if the file cannot be opened or closed.
sub draw_graph {
    my ( $dataref, $x_label, $title, $outFile ) = @_;

    if ( !scalar @{$dataref} ) {
        warn "Warning - no data to plot for '$outFile'. Nothing to do.\n";
        return;
    }

    my $graph = GD::Graph::bars->new( $xDim, $yDim );

    $graph->set_title_font( "$FONTPATH/VeraBd.ttf", 12 );
    $graph->set_x_label_font( "$FONTPATH/VeraBd.ttf", 8 );
    $graph->set_y_label_font( "$FONTPATH/VeraBd.ttf", 8 );
    $graph->set_x_axis_font( "$FONTPATH/Vera.ttf", 6 );
    $graph->set_y_axis_font( "$FONTPATH/Vera.ttf", 8 );

    $graph->set(
        x_label           => $x_label,
        y_label           => 'No. Jpred Submissions',
        title             => $title,
        shadow_depth      => -2,
        shadowclr         => 'lgray',
        x_labels_vertical => 1,

        # borderclrs => undef,
        bar_width   => 12,
        bar_spacing => 4
    ) or die $graph->error;

    my $gd = $graph->plot($dataref) or die $graph->error;

    open( my $IMG, ">", $outFile ) or die "ERROR - unable to open '$outFile' for write: ${!}\nDied";
    binmode $IMG;
    print {$IMG} $gd->png;
    close($IMG) or die "ERROR - unable to close '$outFile': ${!}\nDied";
    return;
}

#########################################################################################################
# write a two-row data array out as a two-column CSV file
#
# Takes:   a reference to a two-row data array ([labels], [counts]) and the
#          output filename.
# Effect:  writes a "Date,nRuns" header followed by one line per entry.
#          Warns and returns without creating the file when the array is empty.
# Dies if the file cannot be opened or closed.
sub print_data {
    my ( $data, $file ) = @_;

    if ( !scalar @{$data} ) {
        warn "Warning - no data to print out. Nothing to do.\n";
        return;
    }

    my $total = scalar @{ $data->[0] };

    open( my $OUT, ">", $file ) or die "ERROR - unable to open '$file' for write: ${!}\nDied";
    print {$OUT} "Date,nRuns\n";
    for my $i ( 0 .. $total - 1 ) {
        print {$OUT} "$data->[0][$i],$data->[1][$i]\n";
    }
    close($OUT) or die "ERROR - unable to close '$file': ${!}\nDied";
    return;
}

#########################################################################################################

=head1 SYNOPSIS

  arco_stats.pl --user <user> [--out <prefix> --x <pixels> --y <pixels>] [--csv] [--plot] [--run-stats <days>] [--quiet] [--man] [--help]

=head1 DESCRIPTION

Script to collate run statistics from the SGE ARCo system.

The script will retrieve all historical data for the specified user, count the number of jobs run and collate them by month. For the current month, data will be broken down by day.

With no options script will get stats for the www-jpred user and output CSV formatted data only. Filenames will take the form I<user>_stats_[daily|monthly].csv, unless specified with the --out switch.

=head1 OPTIONS

=over 5

=item B<--user>

Specify SGE user. [Default: www-jpred]

=item B<--out>

Output filename prefix.

=item B<--x>

Specific X-dimension of plot figure in pixels. [Default: 700]

=item B<--y>

Specific Y-dimension of plot figure in pixels. [Default: 400]

=item B<--csv>,B<--nocsv>

Toggle for CSV output. [Default: on]

=item B<--plot>,B<--noplot>

Toggle for plotting of data. [Default: off]

=item B<--run-stats>

Set the number of days for collating run statistics (e.g. mean run time, mean wait time). The statistics are appended to F<run_stats.csv> in the current directory. [Default: 0]

=item B<--quiet>

Switch off progress messages. Useful if running in cron.

=item B<--help>

Brief help.

=item B<--man>

Full manpage of program.

=back

=head1 BUGS

Script assumes it won't run in the past and therefore is not Y2K compliant.

=head1 AUTHOR

Chris Cole

=cut