forester/ruby/scripts/bioruby_examples/msa_1.rb

   1 require 'rubygems'
   2 require 'bio'
   3
   4 #############
   5
   6 # Reads in a ClustalW formatted multiple sequence alignment
   7 # from a file named "infile_clustalw.aln" and stores it in 'report'.
   8 report = Bio::ClustalW::Report.new(File.read('infile_clustalw.aln'))
   9
  10 # Accesses the actual alignment.
  11 align = report.alignment
  12
  13 # Goes through all sequences in 'align' and prints the
  14 # actual molecular sequence.
  15 align.each do |entry|
  16   puts entry.seq
  17 end
  18
  19 ##############
  20
  21 DEFAULT_PARSER = Bio::Alignment::MultiFastaFormat
  22 puts DEFAULT_PARSER.to_s
  23
  24 #file = Bio::Alignment.readfiles('bcl2.fasta', Bio::Alignment::MultiFastaFormat)
  25 #file.each do |entry|
  26 #  puts entry.entry_id           # Gets the identifier, e.g. 'sp|O35147|BAD_RAT'.
  27 #  puts entry.definition         # Gets the complete fasta description line.
  28 #  puts entry.seq                # Gets the actual sequence.
  29   #puts entry.aaseq.composition  # Gets the amino acid composition.
  30 #end
  31 #puts 'OK'
  32 #puts
  33
  34 file = Bio::FastaFormat.open('bcl2.fasta')
  35 file.each do |entry|
  36    puts entry.entry_id           # Gets the identifier, e.g. 'sp|O35147|BAD_RAT'.
  37   puts entry.definition         # Gets the complete fasta description line.
  38   puts entry.seq                # Gets the actual sequence.
  39   # do something on each fasta sequence entry
  40 end
  41
  42 ##############
  43
  44 # Creates a new file named "outfile.fasta" and writes
  45 # multiple sequence alignment 'align' to it in fasta format.
  46 File.open('outfile.fasta', 'w') do |f|
  47   f.write(align.output(:fasta))
  48 end
  49
  50 # Creates a new file named "outfile.aln" and writes
  51 # multiple sequence alignment 'align' to it in clustal format.
  52 File.open('outfile.aln', 'w') do |f|
  53   f.write(align.output(:clustal))
  54 end
  55
  56 #############
  57
  58 seq1 = Bio::Sequence.auto("gggggg")
  59
  60
  61 puts seq1.output(:fasta)
  62 #seq2 = Bio::Sequence::AA.new("ggggt")
  63 #seq3 = Bio::Sequence::AA.new("ggt")
  64
  65
  66
  67 seqs = [ "MFQIPEFEPSEQEDSSSAER",
  68          "MGTPKQPSLAPAHALGLRKS",
  69          "PKQPSLAPAHALGLRKS",
  70          "MCSTSGCDLE" ]
  71
  72
  73 # MAFFT
  74 options = [ '--maxiterate', '1000', '--localpair' ]
  75 mafft = Bio::MAFFT.new('/home/zma/SOFTWARE/mafft-6.847-without-extensions/scripts/mafft', options )
  76 report = mafft.query_align( seqs)
  77
  78 # Accesses the actual alignment
  79 align = report.alignment
  80
  81 # Prints each sequence to the console.
  82 align.each { |s| puts s.to_s }
  83
  84
  85 puts 'MAFFT OK'
  86 puts
  87
  88 #clustalw
  89 clustalw = Bio::ClustalW.new('/home/zma/SOFTWARE/clustalw-2.1/src/clustalw2' )
  90 report = clustalw.query_align( seqs)
  91 #puts report.alignment.output_fasta.to_s
  92 report.alignment.each { |x| puts x.to_s }
  93 puts 'OK'
  94 puts
  95
  96 #muscle
  97 options = [ '-quiet', '-maxiters', '64' ]
  98 muscle = Bio::Muscle.new('/home/zma/SOFTWARE/muscle3.8.31/src/muscle', options )
  99 report = muscle.query_align( seqs)
 100 #puts report.alignment.output_fasta.to_s
 101 report.alignment.each { |x| puts x.to_s }
 102 puts 'OK'
 103 puts
 104
 105 file = Bio::FastaFormat.open('bcl2.fasta')
 106 file.each do |entry|
 107   puts entry.entry_id           # Gets the identifier, e.g. 'sp|O35147|BAD_RAT'.
 108   puts entry.definition         # Gets the complete fasta description line.
 109   puts entry.seq                # Gets the actual sequence.
 110   puts entry.aaseq.composition  # Gets the amino acid composition.
 111 end
 112 puts 'OK'
 113 puts
 114
 115 Bio::FlatFile.auto('bcl2.fasta') do |ff|
 116   ff.each do |entry|
 117     puts entry.entry_id           # Gets the identifier, e.g. 'sp|O35147|BAD_RAT'.
 118     puts entry.definition         # Gets the complete fasta description line.
 119     puts entry.seq                # Gets the actual sequence.
 120     puts entry.aaseq.composition  # Gets the amino acid composition.
 121   end
 122 end
 123 puts 'OK'
 124 puts
 125
 126
 127
 128
 129
 130