From: cmzmasek@gmail.com Date: Fri, 7 Sep 2012 04:35:25 +0000 (+0000) Subject: in progress X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=9758329082d556b3b41db956ceb80c335df49437;p=jalview.git in progress --- diff --git a/forester/ruby/evoruby/exe/select_same_gn.rb b/forester/ruby/evoruby/exe/select_same_gn.rb new file mode 100755 index 0000000..7ab2c91 --- /dev/null +++ b/forester/ruby/evoruby/exe/select_same_gn.rb @@ -0,0 +1,82 @@ +#!/usr/local/bin/ruby -w + + +require 'lib/evo/sequence/sequence' +require 'lib/evo/msa/msa' +require 'lib/evo/msa/msa_factory' +require 'lib/evo/io/writer/fasta_writer' +require 'lib/evo/io/parser/fasta_parser' + + +module Evoruby + + input = ARGV[ 0 ] + f = MsaFactory.new() + + msa = nil + + begin + msa = f.create_msa_from_file( input, FastaParser.new() ) + rescue Exception => e + puts "error: " + e.to_s + exit + end + + gn_to_seqs = Hash.new + unique_genes_msa = Msa.new + longest_non_unique_genes_msa = Msa.new + gn_re = /GN=(\S+)/ + fragment_re = /fragment/i + + frag_counter = 0 + + for i in 0 ... msa.get_number_of_seqs() + seq = msa.get_sequence( i ) + name = seq.get_name + + if fragment_re.match( name ) + frag_counter += 1 + next + end + gn_match = gn_re.match( name ) + unless gn_match + puts "no match in " + name + exit + end + gn = gn_match[1] + unless gn_to_seqs.has_key?(gn) + gn_to_seqs[gn] = Msa.new + end + gn_to_seqs[gn].add_sequence(seq) + end + + puts "Sequeunces ignored because \"fragment\" in desc: " + frag_counter.to_s + puts + puts + + counter = 1 + gn_to_seqs.each_pair do |gene,seqs| + if seqs.get_number_of_seqs > 1 + puts counter.to_s + ": " + gene + puts seqs.to_fasta + puts + puts + counter += 1 + longest = 0 + longest_seq = nil + for j in 0 ... seqs.get_number_of_seqs() + current = seqs.get_sequence( j ) + if current.get_length > longest + longest = current.get_length + longest_seq = current + end + end + longest_non_unique_genes_msa.add_sequence(longest_seq) + else + unique_genes_msa.add_sequence( seqs.get_sequence( 0 ) ) + end + end + w = FastaWriter.new + w.write(unique_genes_msa, "uniques.fasta") + w.write(longest_non_unique_genes_msa, "non_uniques_longest.fasta") +end diff --git a/forester/ruby/evoruby/lib/evo/msa/msa.rb b/forester/ruby/evoruby/lib/evo/msa/msa.rb index 98e9767..f338093 100644 --- a/forester/ruby/evoruby/lib/evo/msa/msa.rb +++ b/forester/ruby/evoruby/lib/evo/msa/msa.rb @@ -155,26 +155,31 @@ module Evoruby @sequences.length end - def get_length() - if ( !is_aligned() ) + def get_length + if !is_aligned() error_msg = "attempt to get length of unaligned msa" raise StandardError, error_msg, caller end - if ( get_number_of_seqs() < 1 ) + if get_number_of_seqs() < 1 -1 else @sequences[ 0 ].get_length() end end - def to_str() - s = String.new() - for i in 0...get_number_of_seqs() - s += @sequences[ i ].to_str + Constants::LINE_DELIMITER + def to_str + to_fasta + end + + def to_fasta + s = String.new + for i in 0...get_number_of_seqs + s += @sequences[ i ].to_fasta + Constants::LINE_DELIMITER end s end - + + def print_overlap_diagram( min_overlap = 1, io = STDOUT, max_name_length = 10 ) if ( !is_aligned() ) error_msg = "attempt to get overlap diagram of unaligned msa" diff --git a/forester/ruby/evoruby/lib/evo/sequence/sequence.rb b/forester/ruby/evoruby/lib/evo/sequence/sequence.rb index 77b8d7a..09f8428 100644 --- a/forester/ruby/evoruby/lib/evo/sequence/sequence.rb +++ b/forester/ruby/evoruby/lib/evo/sequence/sequence.rb @@ -156,9 +156,14 @@ module Evoruby @molecular_sequence.concat( molecular_sequence_str ) end - def to_str() + def to_str return "[" + @name + "] " + @molecular_sequence end + + def to_fasta + return ">" + @name + Constants::LINE_DELIMITER + @molecular_sequence + end + end # class Sequence