#!/usr/bin/perl

#####################################################################
# Author:  KM Amada (kmamada@ifrec.osaka-u.ac.jp)
#
# Ver. Date      Changelog
#####################################################################
# 1.0  07.26.13  Initial release
# 2.0  09.03.13  Added extensive warnings and error messages
# 3.0  10.28.13  Fix for retrieving large files. Added STDERR logs
# 3.1  11.08.13  Added LWP failsafe. Made hat3 not a required output
# 3.2  12.08.14  Removed 5-char restriction for own structure files
#
#####################################################################
#
# Overview
#   1. Validate the command line (-p and/or -o, plus -d when -o is used).
#   2. Build a multipart form (PDB id list and/or own .pdb files) and
#      POST it to the MAFFTash "premafft" REST service.
#   3. Poll the service until the job status is "done".
#   4. Download the checksum list and every archive part, verifying each
#      part against its checksum (up to $MAXTRIES attempts per download).
#   5. Unpack the archive and move 'instr' / 'hat3' to their destinations.
#   6. Warn about own structures that do not appear in the instr file.
#
# All progress and error messages go to STDERR; any fatal error removes
# the temporary directory and exits with status 1.
#
#####################################################################

use strict;
use warnings;
use Getopt::Long;
use File::Path qw(make_path remove_tree);
use LWP::Simple;
use LWP::UserAgent;

# to prevent error 'Header line too long (limit is 8192)' [v3.1]
use LWP::Protocol::http;
push(@LWP::Protocol::http::EXTRA_SOCK_OPTS, MaxLineLength => 0);


my $BASEURL = "http://sysimm.ifrec.osaka-u.ac.jp/MAFFTash/REST/service.cgi/premafft";

my ( $WORKDIR, $PDBLIST, $OWNLIST, $HAT3FILE, $INSTRFILE );

GetOptions
(
    'd=s' => \$WORKDIR,
    'p=s' => \$PDBLIST,
    'o=s' => \$OWNLIST,
    'h=s' => \$HAT3FILE,
    'i=s' => \$INSTRFILE,
);

print STDERR "[MAFFTash-premafft]\n";

# set temp directory (per-process, removed again by cleanup())
my $TMP = "/tmp/mapremafft$$";
make_path($TMP) unless -d $TMP;


######
# validation
help("Required parameter : atleast one of either '-p' or '-o'")
    unless ( defined $PDBLIST || defined $OWNLIST );
help("Required parameter : '-d'")
    if defined $OWNLIST && !defined $WORKDIR;

$HAT3FILE  = "hat3"  unless defined $HAT3FILE;
$INSTRFILE = "instr" unless defined $INSTRFILE;

# drop a single trailing slash so "$WORKDIR/$id.pdb" has no doubled separator
$WORKDIR =~ s{/\z}{} if defined $WORKDIR;


######
# prepare inputs
print STDERR "Preparing inputs for service request...\n";

# flat key => value list used as the multipart form content of the POST
my @files = ();
push(@files, "strweight" => "0.5");
push(@files, "premafft"  => "1");

# pdb entries: copy every 5-character id into a temporary upload file
if ( defined $PDBLIST )
{
    print STDERR "PDB List defined!\n";
    bail("Error: Input file $PDBLIST does not exists!") unless -e $PDBLIST;

    my $listfile = "$TMP/pdblist.inp";

    open(my $pdb_in, '<', $PDBLIST)
        or bail("Error: Cannot open file $PDBLIST for reading!");
    open(my $pdb_out, '>', $listfile)
        or bail("Error: Cannot open temporary file $listfile for writing!");

    while ( my $line = <$pdb_in> )
    {
        chomp $line;

        # lines that are not exactly five word characters are skipped
        if ( $line =~ /^(\w{5})$/ )
        {
            print {$pdb_out} ">PDBID\n$1\n";
        }
    }

    close $pdb_out;
    close $pdb_in;

    push(@files, "inputfile" => ["$listfile"]);
}

# upload own structures: every id must have a matching $WORKDIR/<id>.pdb
my %ownids = ();

if ( defined $OWNLIST )
{
    print STDERR "OWN List defined!\n";
    bail("Error: Input file $OWNLIST does not exists!") unless -e $OWNLIST;

    open(my $own_in, '<', $OWNLIST)
        or bail("Error: Cannot open file $OWNLIST for reading!");

    while ( my $line = <$own_in> )
    {
        chomp $line;

        if ( $line =~ /^(\S+)$/ )
        {
            my $ownid   = $1;
            my $fileref = "$WORKDIR/$ownid.pdb";

            unless ( -e $fileref )
            {
                close $own_in;
                bail("Error: File $fileref does not exists!");
            }

            push(@files, "inputownfile[]" => ["$fileref"]);
            $ownids{$ownid} = 1;
        }
    }

    close $own_in;
}


######
# start rest service
print STDERR "Sending service request...\n";

my $browser = LWP::UserAgent->new;
$browser->timeout(0);

# post: running a mafftash job
my $postResponse = $browser->post( $BASEURL, \@files, 'Content_Type' => 'form-data' );
bail(sprintf("[%d] %s\n", $postResponse->code, parseError($postResponse->content)))
    unless $postResponse->is_success;

# get response from post request
my ($status, $mafftashid) = parseResponse($postResponse->content);

my $MAXTRIES  = 3;    # attempts per download before giving up
my $STIMER    = 4;    # base wait (seconds) between polls / retries
my $longtimer = 0;    # current polling interval

print STDERR "Request sent! Waiting for response...[$mafftashid]\n";

# wait for results until it becomes available
while (1)
{
    # polling interval cycles through 4, 8, 12, 16 seconds and starts over
    $longtimer = $longtimer <= ($STIMER*3) ? $longtimer+$STIMER : $STIMER;
    sleep $longtimer;

    # get: get results for mafftash job
    my $getResponse = $browser->get("$BASEURL/$mafftashid");

    unless ( $getResponse->is_success )
    {
        bail(sprintf("[%d] %s\n", $getResponse->code, parseError($getResponse->content)));
    }

    # get response from get request
    ($status, $mafftashid) = parseResponse($getResponse->content);
    next unless ( $status eq "done" );

    # if job is finished and ready
    print STDERR "Results found!\n";
    my $csfile = "$TMP/checksum.tar.gz";

    my $try1 = 1;

    while (1)
    {
        print STDERR "Fetching Results... [Trial $try1]\n";

        if (    is_success(getstore("$BASEURL/getmdlist/$mafftashid", $csfile))
             && -e $csfile
             && -s $csfile )
        {
            # checksum list maps each archive part name to its checksum
            my $checklist = extractchecksum($csfile);
            bail("Error retrieving list of compressed files!")
                unless ( scalar(keys %$checklist) > 0 );

            foreach my $id ( keys %$checklist )
            {
                my $checkfile = "$TMP/$id";
                my $checkid   = $checklist->{$id};

                my $try2 = 1;

                while (1)
                {
                    unlink $checkfile if -e $checkfile;

                    if (    is_success(getstore("$BASEURL/get/$mafftashid/$id", $checkfile))
                         && -e $checkfile
                         && -s $checkfile )
                    {
                        my $hashid = getchecksum($checkfile);
                        #print STDERR "[hashid]$hashid [checkid]$checkid\n";

                        # an empty hash means no md5 tool was available;
                        # the part is then accepted without verification
                        if ( $hashid ne "" && $hashid ne $checkid )
                        {
                            unlink $checkfile if -e $checkfile;
                            bail("Error retrieving compressed file from server! [Checksum Failed]")
                                if $try2 >= $MAXTRIES;
                            $try2++;
                            sleep $STIMER;
                        }
                        else
                        {
                            last;
                        }
                    }
                    else
                    {
                        bail("Error retrieving compressed file from server!")
                            if $try2 >= $MAXTRIES;
                        $try2++;
                        sleep $STIMER;
                    }
                }
            }

            last;
        }
        else
        {
            bail("Error retrieving list of compressed files from server!")
                if $try1 >= $MAXTRIES;
            $try1++;
            sleep $STIMER;
        }
    }

    last;
}


# make sure outputs were generated
# decompress
print STDERR "Assembling final results...\n";

# the archive may arrive in several parts; concatenate them before unpacking
backticks("cat $TMP/archive.tar.gz* | tar -zxf - -C $TMP/");

# list-form system: user-supplied destination paths never pass through a shell
system('mv', '-f', "$TMP/instr", $INSTRFILE) if -e "$TMP/instr";
system('mv', '-f', "$TMP/hat3",  $HAT3FILE)  if -e "$TMP/hat3";

# sometimes no hat3 file is generated [v3.1]
#bail("Error: Output file $HAT3FILE not found!") unless -e $HAT3FILE;
bail("Error: Output file $INSTRFILE not found!") unless -e $INSTRFILE;


# warn if some ownids were ommitted
if ( scalar keys(%ownids) > 0 )
{
    my %instrids = ();

    open(my $instr_in, '<', $INSTRFILE)
        or bail("Error: Cannot open file $INSTRFILE for reading!");

    while ( my $line = <$instr_in> )
    {
        chomp $line;

        # header lines look like ">NUMBER_ID"; collect the ID part
        if ( $line =~ /^>\d+_(\S+)$/ )
        {
            $instrids{$1} = 1;
        }
    }

    close $instr_in;

    foreach my $id ( keys %ownids )
    {
        warn "Warning: Own structure $id was excluded from instr/hat3.\n"
            unless $instrids{$id};
    }
}


cleanup();



####################
####################


# parseResponse(RESPONSE)
#   Splits a service response of the form "<token>:<token>" (no whitespace
#   or further colons) into its two parts.
#   Returns ($status, $mafftashid): the first token is the job id, the
#   second the status. Both are "" when the response does not match.
#   NOTE(review): the sample below is the original author's; it looks
#   JSON-like and would not match the pattern used here - confirm the
#   actual wire format against the service.
sub parseResponse
{
    my $response = shift;

    #"status":"wait","mafftashid":"Ma8211432R"

    my $status     = "";
    my $mafftashid = "";

    if ( $response =~ /^([^\s:]+):([^\s:]+)$/ )
    {
        $mafftashid = $1;
        $status     = $2;
    }

    return ($status, $mafftashid);
}


# extractchecksum(INFILE)
#   Streams the contents of the gzip'ed tar INFILE through "tar -O" and
#   parses lines of the form "<checksum> <name>".
#   Returns a hash reference mapping name => checksum (empty when tar
#   cannot be started or no line matches).
sub extractchecksum
{
    my $infile  = shift;
    my %dataset = ();

    # list-form pipe open: $infile is passed to tar as-is, without a shell
    open(my $tar_out, '-|', 'tar', '-zxf', $infile, '-O')
        or return \%dataset;

    while ( my $line = <$tar_out> )
    {
        chomp $line;

        if ( $line =~ /^(\S+)\s+(\S+)$/ )
        {
            $dataset{$2} = $1;
        }
    }

    close $tar_out;

    return \%dataset;
}


# parseError(RESPONSE)
#   Extracts the message from a response containing "error":"<message>".
#   Returns the message, or "" when no such field is present.
sub parseError
{
    my $response = shift;

    #"error":"Invalid number of inputs found."
    my $errorstr = ( $response =~ /\"error\"\s*:\s*\"([^\"]+)\"/ ) ? $1 : "";

    return $errorstr;
}


# getchecksum(INFILE)
#   Computes the MD5 checksum of INFILE with /usr/bin/md5sum or, failing
#   that, "/sbin/md5 -q".
#   Returns the checksum string, or "" when neither tool is executable or
#   the tool cannot be started (callers treat "" as "cannot verify").
sub getchecksum
{
    my $infile = shift;

    # md5 binary check
    my @md5cmd = ();

    if ( -x "/usr/bin/md5sum" )
    {
        @md5cmd = ("/usr/bin/md5sum");
    }
    elsif ( -x "/sbin/md5" )
    {
        @md5cmd = ("/sbin/md5", "-q");
    }

    return "" unless @md5cmd;

    my $checksum = "";

    # list-form pipe open: $infile is passed as a single argument, no shell
    open(my $md5_out, '-|', @md5cmd, $infile) or return "";

    while ( my $line = <$md5_out> )
    {
        # "md5sum" prints "<sum>  <file>", "md5 -q" prints "<sum>" only;
        # in both cases the first field is the checksum
        if ( $line =~ /^(\S+)\s+(\S+)$/ )
        {
            $checksum = $1;
            last;
        }
        elsif ( $line =~ /^(\S+)$/ )
        {
            $checksum = $1;
            last;
        }
    }

    close $md5_out;

    return $checksum;
}


# backticks(COMMAND)
#   Runs COMMAND through the shell, discarding its standard output.
#   Returns 0 when the command could not be executed at all ($? == -1)
#   and 1 otherwise; a non-zero exit status of the command still yields 1.
sub backticks
{
    my $command = shift;

    `$command`;
    return ($? == -1) ? 0 : 1;
}


# bail(MESSAGE)
#   Prints MESSAGE (when defined) to STDERR, removes the temporary
#   directory and terminates the script with exit status 1.
sub bail
{
    my $str = shift;
    print STDERR "$str\n" if defined $str;

    cleanup();
    exit(1);
}


# cleanup()
#   Removes the temporary directory $TMP together with its contents.
#   Does nothing when $TMP is empty or is not an existing directory.
sub cleanup
{
    return if ( $TMP eq "" || !-d $TMP );

    remove_tree($TMP);
}


# help(MESSAGE)
#   Prints the usage text to STDOUT, then hands MESSAGE to bail(), which
#   prints it to STDERR and exits with status 1.
sub help
{
    my $str = shift;

    print <<'HELPME';

USAGE
  ./mafftash_premafft.pl -p [FILE]
  ./mafftash_premafft.pl -o [FILE] -d [DIRECTORY]
  ./mafftash_premafft.pl -p [FILE] -o [FILE] -d [DIRECTORY]


PARAMETERS
  -p [FILE]
     FILE contains a list of PDBIDs (one entry per line); make sure that the PDBIDs are in the standard 5-character pdbid+chain naming format

  -o [FILE] -d [DIRECTORY]
     FILE contains a list of IDs from your own structure/pdb files (one entry per line)
     for each ID in the list make sure that a corresponding structure file (same ID with .pdb extension) is stored in DIRECTORY

  -h [HATFILE]
     save the output hat3 file in HATFILE; if not set, the output is written to a file named 'hat3' in your current directory

  -i [INSTRFILE]
     save the output instr file in INSTRFILE; if not set, the output is written to a file named 'instr' in your current directory

HELPME

    bail($str);
}