4 # database name from which homologues are collected
5 # by locally installed blast. Leave this if you do
6 # not use the '-l' option.
8 mafftpath = "_BINDIR/mafft"
9 # path of mafft. "/usr/local/bin/mafft"
10 # if mafft is in your command path, "mafft" is ok.
12 blastpath = "blastall"
14 # if blastall is in your command path, "blastall" is ok.
16 # mafft-homologs.rb v. 2.1 aligns sequences together with homologues
17 # automatically collected from SwissProt via NCBI BLAST.
19 # mafft > 5.58 is required
22 # mafft-homologs.rb [options] input > output
24 # -a # the number of collected sequences (default: 50)
25 # -e # threshold value (default: 1e-10)
26 # -o "xxx" options for mafft
27 # (default: " --op 1.53 --ep 0.123 --maxiterate 1000")
28 # -l locally carries out blast searches instead of NCBI blast
29 # (requires locally installed blast and a database)
30 # -f outputs collected homologues also (default: off)
31 # -w entire sequences are subjected to BLAST search
32 # (default: well-aligned region only)
39 temp_vf = Tempfile.new("_vf").path
40 temp_if = Tempfile.new("_if").path
41 temp_pf = Tempfile.new("_pf").path
42 temp_af = Tempfile.new("_af").path
43 temp_qf = Tempfile.new("_qf").path
44 temp_bf = Tempfile.new("_bf").path
45 temp_rid = Tempfile.new("_rid").path
46 temp_res = Tempfile.new("_res").path
49 system( mafftpath + " --help > #{temp_vf} 2>&1" )
50 pfp = File.open( "#{temp_vf}", 'r' )
52 break if $_ =~ /MAFFT v/
56 mafftversion = sub( /^\D*/, "" ).split(" ").slice(0).strip.to_s
60 if( mafftversion < "5.58" ) then
62 puts "======================================================"
63 puts "Install new mafft (v. >= 5.58)"
64 puts "======================================================"
71 def readfasta( fp, name, seq )
76 name.push( $_.sub(/>/,"").strip )
77 seq.push( tmpseq ) if nseq > 0
95 mafftopt = " --op 1.53 --ep 0.123 --localpair --maxiterate 1000 --reorder "
96 if getopts( "s", "f", "w", "l", "h", "e:", "a:", "o:", "c:", "d:" ) == nil || ARGV.length == 0 || $OPT_h then
97 puts "Usage: #{$0} [-h -l -e# -a# -o\"[options for mafft]\"] input_file"
102 corewin = $OPT_c.to_i
105 corethr = $OPT_d.to_f
126 mafftopt += " " + $OPT_o + " "
129 system "cat " + ARGV.to_s + " > #{temp_if}"
130 ar = mafftopt.split(" ")
133 if ar[i] == "--seed" then
134 system "cat #{ar[i+1]} >> #{temp_if}"
139 ifp = File.open( "#{temp_if}", 'r' )
141 nseq += 1 if $_ =~ /^>/
146 STDERR.puts "The number of input sequences must be <100."
149 system( "cp #{temp_if}" + " #{temp_pf}" )
151 STDERR.puts "Performing preliminary alignment .. "
152 if entiresearch == 1 then
153 # system( mafftpath + " --maxiterate 1000 --localpair #{temp_if} > #{temp_pf}" )
154 system( mafftpath + " --maxiterate 0 --retree 2 #{temp_if} > #{temp_pf}" )
156 system( mafftpath + " --maxiterate 1000 --localpair --core --coreext --corethr #{corethr.to_s} --corewin #{corewin.to_s} #{temp_if} > #{temp_pf}" )
160 pfp = File.open( "#{temp_pf}", 'r' )
166 nin = readfasta( pfp, inname, inseq )
168 slen.push( inseq[i].gsub(/-/,"").length )
173 pfp = File.open( "#{temp_if}", 'r' )
177 nin = readfasta( pfp, orname, orseq )
180 allen = inseq[0].length
182 for j in (i+1)..(nin-1)
187 for a in 0..(allen-1)
188 next if inseq[i][a,1] == "-" || inseq[j][a,1] == "-"
190 pid += 1.0 if inseq[i][a,1] == inseq[j][a,1]
193 # puts "#{i.to_s}, #{j.to_s}, #{pid.to_s}"
206 afp = File.open( "#{temp_af}", 'w' )
208 STDERR.puts "Searching .. \n"
213 inseq[i].gsub!(/-/,"")
214 afp.puts ">" + orname[i]
217 # afp.puts ">" + inname[i]
220 STDERR.puts "Query (#{i+1}/#{nin})\n" + inname[i]
222 STDERR.puts "Skip.\n\n"
227 command = "lynx -source 'http://www.ncbi.nlm.nih.gov/blast/Blast.cgi?QUERY=" + inseq[i] + "&DATABASE=swissprot&HITLIST_SIZE=" + nadd.to_s + "&FILTER=L&EXPECT='" + eval.to_s + "'&FORMAT_TYPE=TEXT&PROGRAM=blastp&SERVICE=plain&NCBI_GI=on&PAGE=Proteins&CMD=Put' > #{temp_rid}"
230 ridp = File.open( "#{temp_rid}", 'r' )
232 break if $_ =~ / RID = (.*)/
236 STDERR.puts "Submitted to NCBI. rid = " + rid
238 STDERR.printf "Waiting "
242 command = "lynx -source 'http://www.ncbi.nlm.nih.gov/blast/Blast.cgi?RID=" + rid + "&DESCRIPTIONS=500&ALIGNMENTS=" + nadd.to_s + "&ALIGNMENT_TYPE=Pairwise&OVERVIEW=no&CMD=Get&FORMAT_TYPE=XML' > #{temp_res}"
244 resp = File.open( "#{temp_res}", 'r' )
246 # if $_ =~ /WAITING/ then
251 break if $_ =~ /QBlastInfoBegin/
254 if $_ =~ /WAITING/ then
263 # puts "Not supported"
265 qfp = File.open( "#{temp_qf}", 'w' )
269 command = blastpath + " -p blastp -e #{eval} -b 1000 -m 7 -i #{temp_qf} -d #{localdb} > #{temp_res}"
271 resp = File.open( "#{temp_res}", 'r' )
273 STDERR.puts " Done.\n\n"
275 resp = File.open( "#{temp_res}", 'r' )
278 break if $_ =~ /<Hit_id>(.*)<\/Hit_id>/ || $_ =~ /(<Iteration_stat>)/
281 break if $_ =~ /<Iteration_stat>/
284 break if $_ =~ /<Hsp_bit-score>(.*)<\/Hsp_bit-score>/
289 known = ids.index( id )
291 if sco[known] >= score then
294 ids.delete_at( known )
295 add.delete_at( known )
296 sco.delete_at( known )
300 break if $_ =~ /<Hsp_hseq>(.*)<\/Hsp_hseq>/
303 target = $1.sub( /-/, "" )
305 # STDERR.puts "adding 1 seq"
316 while n > 0 && outnum < nadd
318 afp.puts ">_addedbymaffte_" + ids[m]
327 STDERR.puts "Performing alignment .. "
328 system( mafftpath + mafftopt + " #{temp_af} > #{temp_bf}" )
331 bfp = File.open( "#{temp_bf}", 'r' )
334 readfasta( bfp, outnam, outseq )
343 if fullout == 0 && outnam[i] =~ /^_addedbymaffte_/ then
346 outseq2.push( outseq[i] )
347 outnam2.push( outnam[i].sub( /^_addedbymaffte_/, "_ho_" ) )
350 nout = outseq2.length
351 len = outseq[0].length
357 if outseq2[j][p,1] != "-" then
369 puts ">" + outnam2[i]
370 puts outseq2[i].gsub( /.{1,60}/, "\\0\n" )
374 system( "rm -rf #{temp_if} #{temp_vf} #{temp_af} #{temp_bf} #{temp_pf} #{temp_qf} #{temp_res} #{temp_rid}" )