X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=website%2Farchive%2Fbinaries%2Fmac%2Fsrc%2Fmafft%2Fbinaries%2Fmafft-homologs.rb;fp=website%2Farchive%2Fbinaries%2Fmac%2Fsrc%2Fmafft%2Fbinaries%2Fmafft-homologs.rb;h=c1a2468da3cf7ea373a8f55a7ca84581167c97e7;hb=dbde3fb6f00b9bb770343631a517c0e599db8528;hp=0000000000000000000000000000000000000000;hpb=85f830bbd51a7277994bd4233141016304e210c9;p=jabaws.git diff --git a/website/archive/binaries/mac/src/mafft/binaries/mafft-homologs.rb b/website/archive/binaries/mac/src/mafft/binaries/mafft-homologs.rb new file mode 100644 index 0000000..c1a2468 --- /dev/null +++ b/website/archive/binaries/mac/src/mafft/binaries/mafft-homologs.rb @@ -0,0 +1,374 @@ +#!/usr/bin/env ruby + +localdb = "sp" +# database name from which homologues are collected +# by locally installed blast. Leave this if you do +# not use the '-l' option. + +mafftpath = "/usr/bin/mafft" +# path of mafft. "/usr/local/bin/mafft" +# if mafft is in your command path, "mafft" is ok. + +blastpath = "blastall" +# path of blastall. +# if blastall is in your command path, "blastall" is ok. + +# mafft-homologs.rb v. 2.1 aligns sequences together with homologues +# automatically collected from SwissProt via NCBI BLAST. +# +# mafft > 5.58 is required +# +# Usage: +# mafft-homologs.rb [options] input > output +# Options: +# -a # the number of collected sequences (default: 50) +# -e # threshold value (default: 1e-10) +# -o "xxx" options for mafft +# (default: " --op 1.53 --ep 0.123 --maxiterate 1000") +# -l locally carries out blast searches instead of NCBI blast +# (requires locally installed blast and a database) +# -f outputs collected homologues also (default: off) +# -w entire sequences are subjected to BLAST search +# (default: well-aligned region only) + +require 'getopts' +require 'tempfile' + +# mktemp +GC.disable +temp_vf = Tempfile.new("_vf").path +temp_if = Tempfile.new("_if").path +temp_pf = Tempfile.new("_pf").path +temp_af = Tempfile.new("_af").path +temp_qf = Tempfile.new("_qf").path +temp_bf = Tempfile.new("_bf").path +temp_rid = Tempfile.new("_rid").path +temp_res = Tempfile.new("_res").path + + +system( mafftpath + " --help > #{temp_vf} 2>&1" ) +pfp = File.open( "#{temp_vf}", 'r' ) +while pfp.gets + break if $_ =~ /MAFFT v/ +end +pfp.close +if( $_ ) then + mafftversion = sub( /^\D*/, "" ).split(" ").slice(0).strip.to_s +else + mafftversion = "0" +end +if( mafftversion < "5.58" ) then + puts "" + puts "======================================================" + puts "Install new mafft (v. >= 5.58)" + puts "======================================================" + puts "" + exit +end + +srand ( 0 ) + +def readfasta( fp, name, seq ) + nseq = 0 + tmpseq = "" + while fp.gets + if $_ =~ /^>/ then + name.push( $_.sub(/>/,"").strip ) + seq.push( tmpseq ) if nseq > 0 + nseq += 1 + tmpseq = "" + else + tmpseq += $_.strip + end + end + seq.push( tmpseq ) + return nseq +end + +nadd = 50 +eval = 1e-10 +local = 0 +fullout = 0 +entiresearch = 0 +corewin = 50 +corethr = 0.3 +mafftopt = " --op 1.53 --ep 0.123 --localpair --maxiterate 1000 --reorder " +if getopts( "s", "f", "w", "l", "h", "e:", "a:", "o:", "c:", "d:" ) == nil || ARGV.length == 0 || $OPT_h then + puts "Usage: #{$0} [-h -l -e# -a# -o\"[options for mafft]\"] input_file" + exit +end + +if $OPT_c then + corewin = $OPT_c.to_i +end +if $OPT_d then + corethr = $OPT_d.to_f +end +if $OPT_w + entiresearch = 1 +end +if $OPT_f + fullout = 1 +end +if $OPT_s + fullout = 0 +end +if $OPT_l + local = 1 +end +if $OPT_e then + eval = $OPT_e.to_f +end +if $OPT_a then + nadd = $OPT_a.to_i +end +if $OPT_o then + mafftopt += " " + $OPT_o + " " +end + +system "cat " + ARGV.to_s + " > #{temp_if}" +ar = mafftopt.split(" ") +nar = ar.length +for i in 0..(nar-1) + if ar[i] == "--seed" then + system "cat #{ar[i+1]} >> #{temp_if}" + end +end + +nseq = 0 +ifp = File.open( "#{temp_if}", 'r' ) + while ifp.gets + nseq += 1 if $_ =~ /^>/ + end +ifp.close + +if nseq >= 100 then + STDERR.puts "The number of input sequences must be <100." + exit +elsif nseq == 1 then + system( "cp #{temp_if}" + " #{temp_pf}" ) +else + STDERR.puts "Performing preliminary alignment .. " + if entiresearch == 1 then +# system( mafftpath + " --maxiterate 1000 --localpair #{temp_if} > #{temp_pf}" ) + system( mafftpath + " --maxiterate 0 --retree 2 #{temp_if} > #{temp_pf}" ) + else + system( mafftpath + " --maxiterate 1000 --localpair --core --coreext --corethr #{corethr.to_s} --corewin #{corewin.to_s} #{temp_if} > #{temp_pf}" ) + end +end + +pfp = File.open( "#{temp_pf}", 'r' ) +inname = [] +inseq = [] +slen = [] +act = [] +nin = 0 +nin = readfasta( pfp, inname, inseq ) +for i in 0..(nin-1) + slen.push( inseq[i].gsub(/-/,"").length ) + act.push( 1 ) +end +pfp.close + +pfp = File.open( "#{temp_if}", 'r' ) +orname = [] +orseq = [] +nin = 0 +nin = readfasta( pfp, orname, orseq ) +pfp.close + +allen = inseq[0].length +for i in 0..(nin-2) + for j in (i+1)..(nin-1) + next if act[i] == 0 + next if act[j] == 0 + pid = 0.0 + total = 0 + for a in 0..(allen-1) + next if inseq[i][a,1] == "-" || inseq[j][a,1] == "-" + total += 1 + pid += 1.0 if inseq[i][a,1] == inseq[j][a,1] + end + pid /= total +# puts "#{i.to_s}, #{j.to_s}, #{pid.to_s}" + if pid > 0.5 then + if slen[i] < slen[j] + act[i] = 0 + else + act[j] = 0 + end + end + end +end +#p act + + +afp = File.open( "#{temp_af}", 'w' ) + +STDERR.puts "Searching .. \n" +ids = [] +add = [] +sco = [] +for i in 0..(nin-1) + inseq[i].gsub!(/-/,"") + afp.puts ">" + orname[i] + afp.puts orseq[i] + +# afp.puts ">" + inname[i] +# afp.puts inseq[i] + + STDERR.puts "Query (#{i+1}/#{nin})\n" + inname[i] + if act[i] == 0 then + STDERR.puts "Skip.\n\n" + next + end + + if local == 0 then + command = "lynx -source 'http://www.ncbi.nlm.nih.gov/blast/Blast.cgi?QUERY=" + inseq[i] + "&DATABASE=swissprot&HITLIST_SIZE=" + nadd.to_s + "&FILTER=L&EXPECT='" + eval.to_s + "'&FORMAT_TYPE=TEXT&PROGRAM=blastp&SERVICE=plain&NCBI_GI=on&PAGE=Proteins&CMD=Put' > #{temp_rid}" + system command + + ridp = File.open( "#{temp_rid}", 'r' ) + while ridp.gets + break if $_ =~ / RID = (.*)/ + end + ridp.close + rid = $1.strip + STDERR.puts "Submitted to NCBI. rid = " + rid + + STDERR.printf "Waiting " + while 1 + STDERR.printf "." + sleep 10 + command = "lynx -source 'http://www.ncbi.nlm.nih.gov/blast/Blast.cgi?RID=" + rid + "&DESCRIPTIONS=500&ALIGNMENTS=" + nadd.to_s + "&ALIGNMENT_TYPE=Pairwise&OVERVIEW=no&CMD=Get&FORMAT_TYPE=XML' > #{temp_res}" + system command + resp = File.open( "#{temp_res}", 'r' ) +# resp.gets +# if $_ =~ /WAITING/ then +# resp.close +# next +# end + while( resp.gets ) + break if $_ =~ /QBlastInfoBegin/ + end + resp.gets + if $_ =~ /WAITING/ then + resp.close + next + else + resp.close + break + end + end + else +# puts "Not supported" +# exit + qfp = File.open( "#{temp_qf}", 'w' ) + qfp.puts "> " + qfp.puts inseq[i] + qfp.close + command = blastpath + " -p blastp -e #{eval} -b 1000 -m 7 -i #{temp_qf} -d #{localdb} > #{temp_res}" + system command + resp = File.open( "#{temp_res}", 'r' ) + end + STDERR.puts " Done.\n\n" + + resp = File.open( "#{temp_res}", 'r' ) + while 1 + while resp.gets + break if $_ =~ /(.*)<\/Hit_id>/ || $_ =~ /()/ + end + id = $1 + break if $_ =~ // +# p id + while resp.gets + break if $_ =~ /(.*)<\/Hsp_bit-score>/ + end + score = $1.to_f +# p score + + known = ids.index( id ) + if known != nil then + if sco[known] >= score then + next + else + ids.delete_at( known ) + add.delete_at( known ) + sco.delete_at( known ) + end + end + while resp.gets + break if $_ =~ /(.*)<\/Hsp_hseq>/ + end +# break if $1 == nil + target = $1.sub( /-/, "" ).sub( /U/, "X" ) +# p target +# STDERR.puts "adding 1 seq" + ids.push( id ) + sco.push( score ) + add.push( target ) + end + resp.close +end + +n = ids.length + +outnum = 0 +while n > 0 && outnum < nadd + m = rand( n ) + afp.puts ">_addedbymaffte_" + ids[m] + afp.puts add[m] + ids.delete_at( m ) + add.delete_at( m ) + n -= 1 + outnum += 1 +end +afp.close + +STDERR.puts "Performing alignment .. " +system( mafftpath + mafftopt + " #{temp_af} > #{temp_bf}" ) +STDERR.puts "done." + +bfp = File.open( "#{temp_bf}", 'r' ) +outseq = [] +outnam = [] +readfasta( bfp, outnam, outseq ) +bfp.close + +outseq2 = [] +outnam2 = [] + +len = outseq.length +for i in 0..(len-1) +# p outnam[i] + if fullout == 0 && outnam[i] =~ /_addedbymaffte_/ then + next + end + outseq2.push( outseq[i] ) + outnam2.push( outnam[i].sub( /_addedbymaffte_/, "_ho_" ) ) +end + +nout = outseq2.length +len = outseq[0].length +p = len +while p>0 + p -= 1 + allgap = 1 + for j in 0..(nout-1) + if outseq2[j][p,1] != "-" then + allgap = 0 + break + end + end + if allgap == 1 then + for j in 0..(nout-1) + outseq2[j][p,1] = "" + end + end +end +for i in 0..(nout-1) + puts ">" + outnam2[i] + puts outseq2[i].gsub( /.{1,60}/, "\\0\n" ) +end + + +system( "rm -rf #{temp_if} #{temp_vf} #{temp_af} #{temp_bf} #{temp_pf} #{temp_qf} #{temp_res} #{temp_rid}" )