#!/usr/bin/perl # hack to pull the alignments and profiles from psiblast # -m6 switch is needed to get the alignments # in the BLAST report, also generates the # psiblast frequency profiles, as they # are no longer present in the output use Getopt::Std; getopt(hfgud); if ($opt_f eq"req"){$freq=1} elsif ($opt_g eq "ap"){$gap=1} elsif ($opt_u eq "ngap"){$ungap=1} elsif ($opt_d eq "egap"){$degap=1} elsif ($opt_h eq "elp"){ print " # Parse_PSI This little script allows parsing of psiblast report files to extract the embedded sequence alignment, and write out in different formats. parse_psi -help : help parse_psi -degap : remove all gaps in alignment and print as a fasta format file parse_psi -ungap : remove gaps in query sequence, and those below the gap in the query sequence, print as fasta format parse_psi -gap : print alignment with gaps in fasta format parse_psi -freq : print frequency profile needed for Jnet Usage: parse_psi -freq file.blast > file.freq \n";exit(0); } else {print "NO options given, try -help\n";exit} if (!-e "$ARGV[0]"){ print ("No such file as $ARGV[0] exists in this path...\n"); exit (0); } if ($ARGV[0] eq ""){ print "No filename given... \n"; exit; } open IN, "$ARGV[0]"; $num=$thisone=$c=0; while(){ if (/^Sequences producing significant alignments/){ $num++; } } print STDERR "$num alignments were found in PSIBlast file $ARGV[0]\n"; $num--; close IN; open IN, "$ARGV[0]"; while(){ if (/^Sequences producing significant alignments/){ if ($thisone == $num){ while (($ln2 = )){ if ($ln2 =~ /^QUERY/){ $alignout[$c]=$ln2; $c++; while (($align = ) !~ /^ Database/){ $alignout[$c]=$align; $c++; } last; } } } $thisone++; } } $c=$j=$z=$blocks=0; # get number of blocks and number of sequences in a block foreach (@alignout){ if ($_ =~ /^\n/){ $blocks++; } if ($_ !~ /^\n/){ $store[$blocks][$z]=$_; $z++; } } $npb=($z/($blocks+1)); for ($i=0; $i < ($blocks+1); $i++){ for ($j=0; $j < $npb ; $j++){ # get the alignment itself #print "$store[$i][$q]"; (@tokens)=split(" ",$store[$i][$q]); #print "$tokens[0] $tokens[1] $tokens[2] $tokens[3]\n"; if($#tokens == 3){ $id=$tokens[0]; $interest=$tokens[2]; } if($#tokens == 1){ $id=$tokens[0]; $interest=$tokens[1]; } if($#tokens == 2){ $id=$tokens[0]; $interest=$tokens[1]; if ($interest =~ /\d/g){ # get the next token, number # must be at this end $interest=$tokens[2]; } } # remove any digits that may creep in with the 3 token, # this would mean the coord is on the end not in the # middle. $interest =~ s/\d//g; undef @tokens; $namekeep[$j]=$id; $seqkeep[$j]=$seqkeep[$j].$interest; $q++; } } if ($gap==1 || $degap==1){ $c=0; foreach (@seqkeep){ if ($degap==1){ $seqkeep[$c] =~ s/-//g; } print ">$namekeep[$c]\n$seqkeep[$c]\n"; $c++; } } if ($ungap==1 || $freq == 1){ $c=0; @sequence=split("",$seqkeep[0]); foreach (@sequence){ if($sequence[$count] eq "\-"){ $printer[$count]=1; } else{$printer[$count]=0;} $count++; } $d=0; foreach $erm (@seqkeep){ undef @sequence; @sequence=split("",$erm); $count=0; foreach (@sequence){ if ($printer[$count]==0){ $ung[$d]=$ung[$d].$sequence[$count]; } $count++; } $d++; } if ($ungap == 1){ $c=0; foreach $re (@ung){ print ">$namekeep[$c]\n$re\n"; $c++; } } if ($freq == 1){ # do the counting.... $seqlen=length($ung[0]); #get a double array for ($i = 0; $i < $#ung; $i++){ undef @sequence; @sequence=split("",$ung[$i]); # convert sequence to an integer value @ints = seq2int(@sequence); $count=0; $j=0; foreach (@sequence) { $ar[$i][$j]=$ints[$j]; $j++; } } # zero countup for ($r=0; $r <20; $r++){ $countup[$r]=0; } for ($i = 0; $i < $seqlen; $i++) { # skip the query sequence for ($j = 1; $j < $#ung ; $j++){ # look at array $countup[$ar[$j][$i]]++; } # get total counts for this column $tot = 0; for ($r = 0; $r < 20; $r++){ $tot += $countup[$r]; } for ($r=0; $r <20; $r++){ # get percent and round up to 10 # this seems to be slightly different to how # psiblast works god knowns how.... # works for now. :-( $outnum= ($countup[$r] / $tot ) * 10; printf ("%2.0f ",$outnum); $countup[$r]=0; } print "\n"; } } } sub seq2int{ my(@s)=@_; $g=0; for ($g=0; $g <= $#s; $g++){ if ($s[$g] eq 'A'){ $intseq[$g] = "0"}; if ($s[$g] eq 'R'){ $intseq[$g] = "1"}; if ($s[$g] eq 'N'){ $intseq[$g] = "2"}; if ($s[$g] eq 'D'){ $intseq[$g] = "3"}; if ($s[$g] eq 'C'){ $intseq[$g] = "4"}; if ($s[$g] eq 'Q'){ $intseq[$g] = "5"}; if ($s[$g] eq 'E'){ $intseq[$g] = "6"}; if ($s[$g] eq 'G'){ $intseq[$g] = "7"}; if ($s[$g] eq 'H'){ $intseq[$g] = "8"}; if ($s[$g] eq 'I'){ $intseq[$g] = "9"}; if ($s[$g] eq 'L'){ $intseq[$g] = "10"}; if ($s[$g] eq 'K'){ $intseq[$g] = "11"}; if ($s[$g] eq 'M'){ $intseq[$g] = "12"}; if ($s[$g] eq 'F'){ $intseq[$g] = "13"}; if ($s[$g] eq 'P'){ $intseq[$g] = "14"}; if ($s[$g] eq 'S'){ $intseq[$g] = "15"}; if ($s[$g] eq 'T'){ $intseq[$g] = "16"}; if ($s[$g] eq 'W'){ $intseq[$g] = "17"}; if ($s[$g] eq 'Y'){ $intseq[$g] = "18"}; if ($s[$g] eq 'V'){ $intseq[$g] = "19"}; if ($s[$g] eq '-'){ $intseq[$g] = "25"}; } return (@intseq); }