+++ /dev/null
-#!/usr/bin/env ruby
-
-localdb = "sp"
-# database name from which homologues are collected
-# by locally installed blast. Leave this if you do
-# not use the '-l' option.
-
-mafftpath = "/usr/local/bin/mafft"
-# path of mafft. "/usr/local/bin/mafft"
-# if mafft is in your command path, "mafft" is ok.
-
-blastpath = "blastall"
-# path of blastall.
-# if blastall is in your command path, "blastall" is ok.
-
-# mafft-homologs.rb v. 2.1 aligns sequences together with homologues
-# automatically collected from SwissProt via NCBI BLAST.
-#
-# mafft > 5.58 is required
-#
-# Usage:
-# mafft-homologs.rb [options] input > output
-# Options:
-# -a # the number of collected sequences (default: 50)
-# -e # threshold value (default: 1e-10)
-# -o "xxx" options for mafft
-# (default: " --op 1.53 --ep 0.123 --maxiterate 1000")
-# -l locally carries out blast searches instead of NCBI blast
-# (requires locally installed blast and a database)
-# -f outputs collected homologues also (default: off)
-# -w entire sequences are subjected to BLAST search
-# (default: well-aligned region only)
-
-require 'getopts'
-require 'tempfile'
-
-# mktemp
-GC.disable
-temp_vf = Tempfile.new("_vf").path
-temp_if = Tempfile.new("_if").path
-temp_pf = Tempfile.new("_pf").path
-temp_af = Tempfile.new("_af").path
-temp_qf = Tempfile.new("_qf").path
-temp_bf = Tempfile.new("_bf").path
-temp_rid = Tempfile.new("_rid").path
-temp_res = Tempfile.new("_res").path
-
-
-system( mafftpath + " --help > #{temp_vf} 2>&1" )
-pfp = File.open( "#{temp_vf}", 'r' )
-while pfp.gets
- break if $_ =~ /MAFFT v/
-end
-pfp.close
-if( $_ ) then
- mafftversion = sub( /^\D*/, "" ).split(" ").slice(0).strip.to_s
-else
- mafftversion = "0"
-end
-if( mafftversion < "5.58" ) then
- puts ""
- puts "======================================================"
- puts "Install new mafft (v. >= 5.58)"
- puts "======================================================"
- puts ""
- exit
-end
-
-srand ( 0 )
-
-def readfasta( fp, name, seq )
- nseq = 0
- tmpseq = ""
- while fp.gets
- if $_ =~ /^>/ then
- name.push( $_.sub(/>/,"").strip )
- seq.push( tmpseq ) if nseq > 0
- nseq += 1
- tmpseq = ""
- else
- tmpseq += $_.strip
- end
- end
- seq.push( tmpseq )
- return nseq
-end
-
-nadd = 50
-eval = 1e-10
-local = 0
-fullout = 0
-entiresearch = 0
-corewin = 50
-corethr = 0.3
-mafftopt = " --op 1.53 --ep 0.123 --localpair --maxiterate 1000 --reorder "
-if getopts( "s", "f", "w", "l", "h", "e:", "a:", "o:", "c:", "d:" ) == nil || ARGV.length == 0 || $OPT_h then
- puts "Usage: #{$0} [-h -l -e# -a# -o\"[options for mafft]\"] input_file"
- exit
-end
-
-if $OPT_c then
- corewin = $OPT_c.to_i
-end
-if $OPT_d then
- corethr = $OPT_d.to_f
-end
-if $OPT_w
- entiresearch = 1
-end
-if $OPT_f
- fullout = 1
-end
-if $OPT_s
- fullout = 0
-end
-if $OPT_l
- local = 1
-end
-if $OPT_e then
- eval = $OPT_e.to_f
-end
-if $OPT_a then
- nadd = $OPT_a.to_i
-end
-if $OPT_o then
- mafftopt += " " + $OPT_o + " "
-end
-
-system "cat " + ARGV.to_s + " > #{temp_if}"
-ar = mafftopt.split(" ")
-nar = ar.length
-for i in 0..(nar-1)
- if ar[i] == "--seed" then
- system "cat #{ar[i+1]} >> #{temp_if}"
- end
-end
-
-nseq = 0
-ifp = File.open( "#{temp_if}", 'r' )
- while ifp.gets
- nseq += 1 if $_ =~ /^>/
- end
-ifp.close
-
-if nseq >= 100 then
- STDERR.puts "The number of input sequences must be <100."
- exit
-elsif nseq == 1 then
- system( "cp #{temp_if}" + " #{temp_pf}" )
-else
- STDERR.puts "Performing preliminary alignment .. "
- if entiresearch == 1 then
-# system( mafftpath + " --maxiterate 1000 --localpair #{temp_if} > #{temp_pf}" )
- system( mafftpath + " --maxiterate 0 --retree 2 #{temp_if} > #{temp_pf}" )
- else
- system( mafftpath + " --maxiterate 1000 --localpair --core --coreext --corethr #{corethr.to_s} --corewin #{corewin.to_s} #{temp_if} > #{temp_pf}" )
- end
-end
-
-pfp = File.open( "#{temp_pf}", 'r' )
-inname = []
-inseq = []
-slen = []
-act = []
-nin = 0
-nin = readfasta( pfp, inname, inseq )
-for i in 0..(nin-1)
- slen.push( inseq[i].gsub(/-/,"").length )
- act.push( 1 )
-end
-pfp.close
-
-pfp = File.open( "#{temp_if}", 'r' )
-orname = []
-orseq = []
-nin = 0
-nin = readfasta( pfp, orname, orseq )
-pfp.close
-
-allen = inseq[0].length
-for i in 0..(nin-2)
- for j in (i+1)..(nin-1)
- next if act[i] == 0
- next if act[j] == 0
- pid = 0.0
- total = 0
- for a in 0..(allen-1)
- next if inseq[i][a,1] == "-" || inseq[j][a,1] == "-"
- total += 1
- pid += 1.0 if inseq[i][a,1] == inseq[j][a,1]
- end
- pid /= total
-# puts "#{i.to_s}, #{j.to_s}, #{pid.to_s}"
- if pid > 0.5 then
- if slen[i] < slen[j]
- act[i] = 0
- else
- act[j] = 0
- end
- end
- end
-end
-#p act
-
-
-afp = File.open( "#{temp_af}", 'w' )
-
-STDERR.puts "Searching .. \n"
-ids = []
-add = []
-sco = []
-for i in 0..(nin-1)
- inseq[i].gsub!(/-/,"")
- afp.puts ">" + orname[i]
- afp.puts orseq[i]
-
-# afp.puts ">" + inname[i]
-# afp.puts inseq[i]
-
- STDERR.puts "Query (#{i+1}/#{nin})\n" + inname[i]
- if act[i] == 0 then
- STDERR.puts "Skip.\n\n"
- next
- end
-
- if local == 0 then
- command = "lynx -source 'http://www.ncbi.nlm.nih.gov/blast/Blast.cgi?QUERY=" + inseq[i] + "&DATABASE=swissprot&HITLIST_SIZE=" + nadd.to_s + "&FILTER=L&EXPECT='" + eval.to_s + "'&FORMAT_TYPE=TEXT&PROGRAM=blastp&SERVICE=plain&NCBI_GI=on&PAGE=Proteins&CMD=Put' > #{temp_rid}"
- system command
-
- ridp = File.open( "#{temp_rid}", 'r' )
- while ridp.gets
- break if $_ =~ / RID = (.*)/
- end
- ridp.close
- rid = $1.strip
- STDERR.puts "Submitted to NCBI. rid = " + rid
-
- STDERR.printf "Waiting "
- while 1
- STDERR.printf "."
- sleep 10
- command = "lynx -source 'http://www.ncbi.nlm.nih.gov/blast/Blast.cgi?RID=" + rid + "&DESCRIPTIONS=500&ALIGNMENTS=" + nadd.to_s + "&ALIGNMENT_TYPE=Pairwise&OVERVIEW=no&CMD=Get&FORMAT_TYPE=XML' > #{temp_res}"
- system command
- resp = File.open( "#{temp_res}", 'r' )
-# resp.gets
-# if $_ =~ /WAITING/ then
-# resp.close
-# next
-# end
- while( resp.gets )
- break if $_ =~ /QBlastInfoBegin/
- end
- resp.gets
- if $_ =~ /WAITING/ then
- resp.close
- next
- else
- resp.close
- break
- end
- end
- else
-# puts "Not supported"
-# exit
- qfp = File.open( "#{temp_qf}", 'w' )
- qfp.puts "> "
- qfp.puts inseq[i]
- qfp.close
- command = blastpath + " -p blastp -e #{eval} -b 1000 -m 7 -i #{temp_qf} -d #{localdb} > #{temp_res}"
- system command
- resp = File.open( "#{temp_res}", 'r' )
- end
- STDERR.puts " Done.\n\n"
-
- resp = File.open( "#{temp_res}", 'r' )
- while 1
- while resp.gets
- break if $_ =~ /<Hit_id>(.*)<\/Hit_id>/ || $_ =~ /(<Iteration_stat>)/
- end
- id = $1
- break if $_ =~ /<Iteration_stat>/
-# p id
- while resp.gets
- break if $_ =~ /<Hsp_bit-score>(.*)<\/Hsp_bit-score>/
- end
- score = $1.to_f
-# p score
-
- known = ids.index( id )
- if known != nil then
- if sco[known] >= score then
- next
- else
- ids.delete_at( known )
- add.delete_at( known )
- sco.delete_at( known )
- end
- end
- while resp.gets
- break if $_ =~ /<Hsp_hseq>(.*)<\/Hsp_hseq>/
- end
-# break if $1 == nil
- target = $1.sub( /-/, "" )
-# p target
-# STDERR.puts "adding 1 seq"
- ids.push( id )
- sco.push( score )
- add.push( target )
- end
- resp.close
-end
-
-n = ids.length
-
-outnum = 0
-while n > 0 && outnum < nadd
- m = rand( n )
- afp.puts ">_addedbymaffte_" + ids[m]
- afp.puts add[m]
- ids.delete_at( m )
- add.delete_at( m )
- n -= 1
- outnum += 1
-end
-afp.close
-
-STDERR.puts "Performing alignment .. "
-system( mafftpath + mafftopt + " #{temp_af} > #{temp_bf}" )
-STDERR.puts "done."
-
-bfp = File.open( "#{temp_bf}", 'r' )
-outseq = []
-outnam = []
-readfasta( bfp, outnam, outseq )
-bfp.close
-
-outseq2 = []
-outnam2 = []
-
-len = outseq.length
-for i in 0..(len-1)
-# p outnam[i]
- if fullout == 0 && outnam[i] =~ /^_addedbymaffte_/ then
- next
- end
- outseq2.push( outseq[i] )
- outnam2.push( outnam[i].sub( /^_addedbymaffte_/, "_ho_" ) )
-end
-
-nout = outseq2.length
-len = outseq[0].length
-p = len
-while p>0
- p -= 1
- allgap = 1
- for j in 0..(nout-1)
- if outseq2[j][p,1] != "-" then
- allgap = 0
- break
- end
- end
- if allgap == 1 then
- for j in 0..(nout-1)
- outseq2[j][p,1] = ""
- end
- end
-end
-for i in 0..(nout-1)
- puts ">" + outnam2[i]
- puts outseq2[i].gsub( /.{1,60}/, "\\0\n" )
-end
-
-
-system( "rm -rf #{temp_if} #{temp_vf} #{temp_af} #{temp_bf} #{temp_pf} #{temp_qf} #{temp_res} #{temp_rid}" )