in progress
authorcmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Fri, 7 Sep 2012 04:35:25 +0000 (04:35 +0000)
committercmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Fri, 7 Sep 2012 04:35:25 +0000 (04:35 +0000)
forester/ruby/evoruby/exe/select_same_gn.rb [new file with mode: 0755]
forester/ruby/evoruby/lib/evo/msa/msa.rb
forester/ruby/evoruby/lib/evo/sequence/sequence.rb

diff --git a/forester/ruby/evoruby/exe/select_same_gn.rb b/forester/ruby/evoruby/exe/select_same_gn.rb
new file mode 100755 (executable)
index 0000000..7ab2c91
--- /dev/null
@@ -0,0 +1,82 @@
+#!/usr/local/bin/ruby -w
+
+
+require 'lib/evo/sequence/sequence'
+require 'lib/evo/msa/msa'
+require 'lib/evo/msa/msa_factory'
+require 'lib/evo/io/writer/fasta_writer'
+require 'lib/evo/io/parser/fasta_parser'
+
+
+module Evoruby
+
+  input = ARGV[ 0 ]
+  f = MsaFactory.new()
+
+  msa = nil
+
+  begin
+    msa = f.create_msa_from_file( input, FastaParser.new() )
+  rescue Exception => e
+    puts "error: " + e.to_s
+    exit
+  end
+
+  gn_to_seqs = Hash.new
+  unique_genes_msa = Msa.new
+  longest_non_unique_genes_msa = Msa.new
+  gn_re = /GN=(\S+)/
+  fragment_re = /fragment/i
+
+  frag_counter = 0
+
+  for i in 0 ... msa.get_number_of_seqs()
+    seq = msa.get_sequence( i )
+    name = seq.get_name
+
+    if fragment_re.match( name )
+      frag_counter += 1
+      next
+    end
+    gn_match = gn_re.match( name )
+    unless gn_match
+      puts "no match in " + name
+      exit
+    end
+    gn = gn_match[1]
+    unless gn_to_seqs.has_key?(gn)
+      gn_to_seqs[gn] = Msa.new
+    end
+    gn_to_seqs[gn].add_sequence(seq)
+  end
+
+  puts "Sequeunces ignored because \"fragment\" in desc: " + frag_counter.to_s
+  puts
+  puts
+
+  counter = 1
+  gn_to_seqs.each_pair do |gene,seqs|
+    if seqs.get_number_of_seqs > 1
+      puts counter.to_s + ": " + gene
+      puts seqs.to_fasta
+      puts
+      puts
+      counter += 1
+      longest = 0
+      longest_seq = nil
+      for j in 0 ... seqs.get_number_of_seqs()
+        current = seqs.get_sequence( j )
+        if current.get_length > longest
+          longest =  current.get_length
+          longest_seq = current
+        end
+      end
+      longest_non_unique_genes_msa.add_sequence(longest_seq)
+    else
+      unique_genes_msa.add_sequence( seqs.get_sequence( 0 ) )
+    end
+  end
+  w = FastaWriter.new
+  w.write(unique_genes_msa, "uniques.fasta")
+  w.write(longest_non_unique_genes_msa, "non_uniques_longest.fasta")
+end
index 98e9767..f338093 100644 (file)
@@ -155,26 +155,31 @@ module Evoruby
       @sequences.length
     end
 
-    def get_length()
-      if ( !is_aligned() )
+    def get_length
+      if !is_aligned()
         error_msg = "attempt to get length of unaligned msa"
         raise StandardError, error_msg, caller
       end
-      if ( get_number_of_seqs() < 1 )
+      if get_number_of_seqs() < 1
         -1
       else
         @sequences[ 0 ].get_length()
       end
     end
 
-    def to_str()
-      s = String.new()
-      for i in 0...get_number_of_seqs()
-        s += @sequences[ i ].to_str + Constants::LINE_DELIMITER
+    def to_str
+      to_fasta 
+    end
+
+    def to_fasta
+      s = String.new
+      for i in 0...get_number_of_seqs
+        s += @sequences[ i ].to_fasta + Constants::LINE_DELIMITER
       end
       s
     end
-
+    
+    
     def print_overlap_diagram( min_overlap = 1, io = STDOUT, max_name_length = 10 )
       if ( !is_aligned() )
         error_msg = "attempt to get overlap diagram of unaligned msa"
index 77b8d7a..09f8428 100644 (file)
@@ -156,9 +156,14 @@ module Evoruby
             @molecular_sequence.concat( molecular_sequence_str )
         end
 
-        def to_str()
+        def to_str
             return "[" + @name + "] " + @molecular_sequence
         end
+        
+        def to_fasta
+            return ">" + @name + Constants::LINE_DELIMITER  + @molecular_sequence
+        end
+        
 
     end # class Sequence