3 # Usage: p7extract.pl <hmmsearch output file>
5 # Converts hmmsearch output to GLF or GDF. GLF is the default.
6 # Order is sorted by bit score
9 # -C : extract Pfam coverage statistics (NAR paper)
10 # -d : extract domains in GDF format
11 # -t <x> : report only hits better than evalue of <x>
12 # -s : include scores in output
13 # -e : include evalues in output
14 # -l : include negative log evalues in output for easy sorting
16 # Note: p7extract.pl -sel gives the extended GLF format expected by
17 # the PROFMARK benchmark scripts
24 if ($opt_C) { $coverage_mode = 1; $gdfmode = 1;}
25 if ($opt_d) { $gdfmode = 1; }
26 if ($opt_t) { $ethresh = $opt_t; }
27 if ($opt_s) { $do_scores = 1; }
28 if ($opt_e) { $do_eval = 1; }
29 if ($opt_l) { $do_log = 1; }
31 $status = 1; # -C only: assume we will fail, 'til proven otherwise
35 if (/^Query HMM:\s+(\S+)/) {$hmmname = $1;}
36 if (/^Scores for complete sequences/) {$indom = 0; $inseq = 1;}
37 if (/^Parsed for domains/) {$indom = 1; $inseq = 0;}
38 if (/^Histogram of all scores/) {$indom = 0; $inseq = 0;}
39 if (/^Total sequences searched/) {$status = 0;} # looks like we've seen output
42 (($id, $sc, $ev, $nd) = /(\S+).+\s(\S+)\s+(\S+)\s+(\d+)\s*$/))
44 if (($ethresh == 0 || $ev < $ethresh) && $show_key{$id} == 0)
47 $show_key{$id} = 1; # remember this name
55 if ($gdfmode && $indom &&
56 (($id, $sqfrom, $sqto, $sc, $ev) =
57 /(\S+)\s+\S+\s+(\d+)\s+(\d+).+\s(\S+)\s+(\S+)\s*$/))
59 if (($ethresh == 0 || $ev < $ethresh) && $show_key{$id} == 0)
61 $key = "$id/$sqfrom-$sqto";
64 $show_sqfrom{$key} = $sqfrom;
65 $show_sqto{$key} = $sqto;
71 $domsize = $sqto - $sqfrom + 1;
72 if ($domsize < 0) { $domsize *= -1; }
73 $numresidues += $domsize;
82 printf "%-20s %6d %6d %6d\n", $hmmname, $numseqs, $numdomains, $numresidues;
85 printf "%-20s [FAILED]\n", $hmmname;
93 foreach $key (sort byscore keys(%show_key))
97 printf("%-24s\t%6d\t%6d\t%15s",
98 $key, $show_sqfrom{$key}, $show_sqto{$key}, $show_id{$key})
100 printf("%-24s", $key);
102 # Optional extensions to GDF/GLF
103 if ($do_scores) { printf("\t%8s", $show_sc{$key}); }
104 if ($do_eval) { printf("\t%12s", $show_ev{$key}); }
105 if ($do_log) { printf("\t%12.1f", -log($show_ev{$key})); }
110 $show_sc{$b} <=> $show_sc{$a};
114 $show_ev{$a} <=> $show_ev{$b};