4 # database name from which homologues are collected
5 # by locally installed blast. Leave this if you do
6 # not use the '-l' option.
8 mafftpath = "_BINDIR/mafft"
9 # path of mafft. "/usr/local/bin/mafft"
10 # if mafft is in your command path, "mafft" is ok.
12 blastpath = "blastall"
14 # if blastall is in your command path, "blastall" is ok.
16 # mafft-homologs.rb v. 2.1 aligns sequences together with homologues
17 # automatically collected from SwissProt via NCBI BLAST.
19 # mafft > 5.58 is required
22 # mafft-homologs.rb [options] input > output
24 # -a # the number of collected sequences (default: 50)
25 # -e # threshold value (default: 1e-10)
26 # -o "xxx" options for mafft
27 # (default: " --op 1.53 --ep 0.123 --maxiterate 1000")
28 # -l locally carries out blast searches instead of NCBI blast
29 # (requires locally installed blast and a database)
30 # -f outputs collected homologues also (default: off)
31 # -w entire sequences are subjected to BLAST search
32 # (default: well-aligned region only)
40 temp_vf = Tempfile.new("_vf").path
41 temp_if = Tempfile.new("_if").path
42 temp_pf = Tempfile.new("_pf").path
43 temp_af = Tempfile.new("_af").path
44 temp_qf = Tempfile.new("_qf").path
45 temp_bf = Tempfile.new("_bf").path
46 temp_rid = Tempfile.new("_rid").path
47 temp_res = Tempfile.new("_res").path
50 system( mafftpath + " --help > #{temp_vf} 2>&1" )
51 pfp = File.open( "#{temp_vf}", 'r' )
53 break if $_ =~ /MAFFT v/
58 mafftversion = $_.sub( /^\D*/, "" ).split(" ").slice(0).strip.to_s
62 if( mafftversion < "5.58" ) then
64 STDERR.puts "======================================================"
65 STDERR.puts "Install new mafft (v. >= 5.58)"
66 STDERR.puts "======================================================"
73 def readfasta( fp, name, seq )
78 name.push( $_.sub(/>/,"").strip )
79 seq.push( tmpseq ) if nseq > 0
97 mafftopt = " --op 1.53 --ep 0.123 --localpair --maxiterate 1000 --reorder "
100 #if getopts( "s", "f", "w", "l", "h", "e:", "a:", "o:", "c:", "d:" ) == nil || ARGV.length == 0 || $OPT_h then
101 # puts "Usage: #{$0} [-h -l -e# -a# -o\"[options for mafft]\"] input_file"
104 params = ARGV.getopts( "sfwlhe:a:o:c:d:" )
108 if params["c"] != nil then
109 corewin = params["c"].to_i
113 if params["d"] != nil then
114 corethr = params["d"].to_f
118 if params["w"] == true then
123 if params["f"] == true then
128 if params["s"] == true then
133 if params["l"] == true then
138 if params["e"] != nil then
140 eval = params["e"].to_f
144 if params["a"] != nil then
145 nadd = params["a"].to_i
149 if params["o"] != nil then
150 mafftopt += " " + params["o"] + " "
153 infn = ARGV[0].to_s.strip
155 system "cat " + infn + " > #{temp_if}"
156 ar = mafftopt.split(" ")
159 if ar[i] == "--seed" then
160 system "cat #{ar[i+1]} >> #{temp_if}"
165 ifp = File.open( "#{temp_if}", 'r' )
167 nseq += 1 if $_ =~ /^>/
172 STDERR.puts "The number of input sequences must be <100."
175 system( "cp #{temp_if}" + " #{temp_pf}" )
177 STDERR.puts "Performing preliminary alignment .. "
178 if entiresearch == 1 then
179 # system( mafftpath + " --maxiterate 1000 --localpair #{temp_if} > #{temp_pf}" )
180 system( mafftpath + " --maxiterate 0 --retree 2 #{temp_if} > #{temp_pf}" )
182 system( mafftpath + " --maxiterate 1000 --localpair --core --coreext --corethr #{corethr.to_s} --corewin #{corewin.to_s} #{temp_if} > #{temp_pf}" )
186 pfp = File.open( "#{temp_pf}", 'r' )
192 nin = readfasta( pfp, inname, inseq )
194 slen.push( inseq[i].gsub(/-/,"").length )
199 pfp = File.open( "#{temp_if}", 'r' )
203 nin = readfasta( pfp, orname, orseq )
206 allen = inseq[0].length
208 for j in (i+1)..(nin-1)
213 for a in 0..(allen-1)
214 next if inseq[i][a,1] == "-" || inseq[j][a,1] == "-"
216 pid += 1.0 if inseq[i][a,1] == inseq[j][a,1]
219 # puts "#{i.to_s}, #{j.to_s}, #{pid.to_s}"
232 afp = File.open( "#{temp_af}", 'w' )
234 STDERR.puts "Searching .. \n"
239 inseq[i].gsub!(/-/,"")
240 afp.puts ">" + orname[i]
243 # afp.puts ">" + inname[i]
246 STDERR.puts "Query (#{i+1}/#{nin})\n" + inname[i]
248 STDERR.puts "Skip.\n\n"
253 command = "lynx -source 'https://www.ncbi.nlm.nih.gov/blast/Blast.cgi?QUERY=" + inseq[i] + "&DATABASE=swissprot&HITLIST_SIZE=" + nadd.to_s + "&FILTER=L&EXPECT='" + eval.to_s + "'&FORMAT_TYPE=TEXT&PROGRAM=blastp&SERVICE=plain&NCBI_GI=on&PAGE=Proteins&CMD=Put' > #{temp_rid}"
256 ridp = File.open( "#{temp_rid}", 'r' )
258 break if $_ =~ / RID = (.*)/
262 STDERR.puts "Submitted to NCBI. rid = " + rid
264 STDERR.printf "Waiting "
268 command = "lynx -source 'https://www.ncbi.nlm.nih.gov/blast/Blast.cgi?RID=" + rid + "&DESCRIPTIONS=500&ALIGNMENTS=" + nadd.to_s + "&ALIGNMENT_TYPE=Pairwise&OVERVIEW=no&CMD=Get&FORMAT_TYPE=XML' > #{temp_res}"
270 resp = File.open( "#{temp_res}", 'r' )
272 # if $_ =~ /WAITING/ then
277 break if $_ =~ /QBlastInfoBegin/
280 if $_ =~ /WAITING/ then
289 # puts "Not supported"
291 qfp = File.open( "#{temp_qf}", 'w' )
295 command = blastpath + " -p blastp -e #{eval} -b 1000 -m 7 -i #{temp_qf} -d #{localdb} > #{temp_res}"
297 resp = File.open( "#{temp_res}", 'r' )
299 STDERR.puts " Done.\n\n"
301 resp = File.open( "#{temp_res}", 'r' )
304 break if $_ =~ /<Hit_id>(.*)<\/Hit_id>/ || $_ =~ /(<Iteration_stat>)/
307 break if $_ =~ /<Iteration_stat>/
310 break if $_ =~ /<Hsp_bit-score>(.*)<\/Hsp_bit-score>/
315 known = ids.index( id )
317 if sco[known] >= score then
320 ids.delete_at( known )
321 add.delete_at( known )
322 sco.delete_at( known )
326 break if $_ =~ /<Hsp_hseq>(.*)<\/Hsp_hseq>/
329 target = $1.sub( /-/, "" ).sub( /U/, "X" )
331 # STDERR.puts "adding 1 seq"
342 while n > 0 && outnum < nadd
344 afp.puts ">_addedbymaffte_" + ids[m]
353 STDERR.puts "Performing alignment .. "
354 system( mafftpath + mafftopt + " #{temp_af} > #{temp_bf}" )
357 bfp = File.open( "#{temp_bf}", 'r' )
360 readfasta( bfp, outnam, outseq )
369 if fullout == 0 && outnam[i] =~ /_addedbymaffte_/ then
372 outseq2.push( outseq[i] )
373 outnam2.push( outnam[i].sub( /_addedbymaffte_/, "_ho_" ) )
376 nout = outseq2.length
377 len = outseq[0].length
383 if outseq2[j][p,1] != "-" then
395 puts ">" + outnam2[i]
396 puts outseq2[i].gsub( /.{1,60}/, "\\0\n" )
400 system( "rm -rf #{temp_if} #{temp_vf} #{temp_af} #{temp_bf} #{temp_pf} #{temp_qf} #{temp_res} #{temp_rid}" )
401 if File.exist?( "#{temp_af}.tree" ) then
402 system( "sed 's/_addedbymaffte_/_ho_/' #{temp_af}.tree > #{ARGV[0].to_s}.tree" )
403 system( "rm #{temp_af}.tree" )