inprogress
[jalview.git] / forester / ruby / evoruby / exe / select_same_gn.rb
index 7ab2c91..b2c2410 100755 (executable)
@@ -13,6 +13,8 @@ module Evoruby
   input = ARGV[ 0 ]
   f = MsaFactory.new()
 
+  IGNORE_SEQS_LACKING_GN = false
+
   msa = nil
 
   begin
@@ -22,40 +24,99 @@ module Evoruby
     exit
   end
 
+  all_names = Set.new
+  all_seqs_per_species = Hash.new
+  all_msa_per_species = Hash.new
   gn_to_seqs = Hash.new
   unique_genes_msa = Msa.new
   longest_non_unique_genes_msa = Msa.new
   gn_re = /GN=(\S+)/
   fragment_re = /fragment/i
+  species_re = /\[([A-Z0-9]{3,6})\]$/
 
   frag_counter = 0
+  no_gn_counter = 0
+  same_seq_counter = 0
 
   for i in 0 ... msa.get_number_of_seqs()
     seq = msa.get_sequence( i )
     name = seq.get_name
+    if all_names.include?( name )
+      puts "error: sequence name \"" + name + "\" is not unique (#" + i.to_s + ")"
+      exit
+    else
+      all_names << name
+    end
 
     if fragment_re.match( name )
+      puts "ignored because fragment: " + name
       frag_counter += 1
       next
     end
-    gn_match = gn_re.match( name )
-    unless gn_match
-      puts "no match in " + name
+
+    species = nil
+    if species_re.match( name )
+      s_match = species_re.match( name )
+      species = s_match[1]
+
+      unless all_seqs_per_species.has_key?( species )
+        all_seqs_per_species[ species ] = Set.new
+      end
+      all_seqs = all_seqs_per_species[ species ]
+      mol_seq = seq.get_sequence_as_string.upcase
+      if all_seqs.include?( mol_seq )
+        puts "ignored because identical sequence in same species: " + name
+        same_seq_counter += 1
+        next
+      else
+        all_seqs << mol_seq
+      end
+    else
+      puts "error: no species for: " + name
       exit
     end
-    gn = gn_match[1]
+
+    gn_match = gn_re.match( name )
+    if IGNORE_SEQS_LACKING_GN
+      unless gn_match
+        puts "ignored because no GN=: " + name
+        no_gn_counter += 1
+        next
+      end
+    else
+      unless gn_match
+        puts "no GN=: " + name
+      end
+    end
+
+    gn =nil
+    if gn_match
+      gn = gn_match[1] + "_" + species
+    else
+      if IGNORE_SEQS_LACKING_GN
+        puts "cannot be"
+        exit
+      end
+      gn = name
+    end
+
     unless gn_to_seqs.has_key?(gn)
       gn_to_seqs[gn] = Msa.new
     end
     gn_to_seqs[gn].add_sequence(seq)
   end
 
-  puts "Sequeunces ignored because \"fragment\" in desc: " + frag_counter.to_s
+  puts "Sequences ignored because \"fragment\" in desc                : " + frag_counter.to_s
+  if IGNORE_SEQS_LACKING_GN
+    puts "Sequences ignored because no \"GN=\" in desc                  : " + no_gn_counter.to_s
+  end
+  puts "Sequences ignored because identical sequence in same species: " + same_seq_counter.to_s
   puts
   puts
 
   counter = 1
   gn_to_seqs.each_pair do |gene,seqs|
+    seq = nil
     if seqs.get_number_of_seqs > 1
       puts counter.to_s + ": " + gene
       puts seqs.to_fasta
@@ -71,12 +132,28 @@ module Evoruby
           longest_seq = current
         end
       end
-      longest_non_unique_genes_msa.add_sequence(longest_seq)
+      seq = longest_seq
+      longest_non_unique_genes_msa.add_sequence( seq )
     else
-      unique_genes_msa.add_sequence( seqs.get_sequence( 0 ) )
+      seq = seqs.get_sequence( 0 )
+      unique_genes_msa.add_sequence( seq )
     end
+
+    species = species_re.match( seq.get_name )[ 1 ]
+    unless all_msa_per_species.has_key?(species)
+      all_msa_per_species[species] = Msa.new
+    end
+    all_msa_per_species[species].add_sequence(seq)
+
   end
+
   w = FastaWriter.new
-  w.write(unique_genes_msa, "uniques.fasta")
-  w.write(longest_non_unique_genes_msa, "non_uniques_longest.fasta")
+  w.write(unique_genes_msa, "seqs_from_unique_genes.fasta")
+  w.write(longest_non_unique_genes_msa, "longest_seqs_from_nonunique_genes.fasta")
+
+  all_msa_per_species.each_pair do |s,m|
+    w = FastaWriter.new
+    w.write(m, s +".fasta")
+  end
+
 end