in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / apps / multi_sequence_extractor.rb
index f8838a7..b499c72 100644 (file)
@@ -23,21 +23,25 @@ module Evoruby
   class MultiSequenceExtractor
 
     PRG_NAME                           = "mse"
-    PRG_VERSION                        = "1.0.0"
+    PRG_VERSION                        = "1.02"
     PRG_DESC                           = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
-    PRG_DATE                           = "2008.08.13"
-    COPYRIGHT                          = "2008-2009 Christian M Zmasek"
+    PRG_DATE                           = "2012.07.20"
+    COPYRIGHT                          = "2008-2012 Christian M Zmasek"
     CONTACT                            = "phylosoft@gmail.com"
     WWW                                = "www.phylosoft.org"
     HELP_OPTION_1                      = 'help'
     HELP_OPTION_2                      = 'h'
 
+    EXT_OPTION                          = 'e'
+    EXTRACT_LINKERS_OPTION              = 'l'
     LOG_SUFFIX                          = ".mse_log"
+    LINKERS_SUFFIX                      = ".linkers"
     FASTA_SUFFIX                        = ".fasta"
     FASTA_WITH_NORMALIZED_IDS_SUFFIX    = ".ni.fasta"
     NORMALIZED_IDS_MAP_SUFFIX           = ".nim"
     PROTEINS_LIST_FILE_SEPARATOR        = "\t"
 
+
     def run()
 
       Util.print_program_information( PRG_NAME,
@@ -69,6 +73,8 @@ module Evoruby
       end
 
       allowed_opts = Array.new
+      allowed_opts.push(EXT_OPTION)
+      allowed_opts.push(EXTRACT_LINKERS_OPTION)
 
       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
       if ( disallowed.length > 0 )
@@ -92,6 +98,19 @@ module Evoruby
         end
       end
 
+      extension = 0
+      if cla.is_option_set?(EXT_OPTION)
+        extension = cla.get_option_value_as_int(EXT_OPTION)
+        if extension < 0
+          extension = 0
+        end
+      end
+
+      extract_linkers = false
+      if cla.is_option_set?(EXTRACT_LINKERS_OPTION)
+        extract_linkers = true
+      end
+
       if  !File.exist?( input_dir )
         Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" )
       end
@@ -131,6 +150,14 @@ module Evoruby
         puts( "Mapping file               : " + mapping_file )
         log << "Mapping file               : " + mapping_file + ld
       end
+      if extension > 0
+        puts( "Extension                  : " + extension.to_s )
+        log << "Extension                  : " + extension.to_s + ld
+      end
+      if extract_linkers
+        puts( "Extract linkers            : true" )
+        log << "Extract linkers            : true" + ld
+      end
       log << "Date                       : " + Time.now.to_s + ld
       puts
 
@@ -154,7 +181,9 @@ module Evoruby
           log,
           out_dir,
           out_dir_doms,
-          mapping_file )
+          mapping_file,
+          extension,
+          extract_linkers )
       }
       puts
       Util.print_message( PRG_NAME, "OK" )
@@ -170,7 +199,9 @@ module Evoruby
         log,
         out_dir,
         out_dir_doms,
-        mapping_file )
+        mapping_file,
+        extension,
+        extract_linkers )
 
       begin
         Util.check_file_for_readability( input_file )
@@ -182,6 +213,10 @@ module Evoruby
       out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX
       out_file_path_ids_map                   = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX
       doms_out_file_path_fasta_file           = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains" + FASTA_SUFFIX
+      doms_ext_out_file_path_fasta_file           = nil
+      if extension > 0
+        doms_ext_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains_ext_" + extension.to_s + FASTA_SUFFIX
+      end
       begin
         Util.check_file_for_writability( out_file_path_fasta_file )
         Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file )
@@ -203,16 +238,18 @@ module Evoruby
       new_msa                = Msa.new
       new_msa_normalized_ids = Msa.new
       new_msa_domains = Msa.new
+      new_msa_domains_extended = Msa.new
       per_species_counter = 0
+      linkers = ""
 
       puts basename
 
       File.open( input_file ) do | file |
         while line = file.gets
           line.strip!
-          if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
+          if !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
             values = line.split( PROTEINS_LIST_FILE_SEPARATOR )
-
+            mod_line = nil
             if ( values.length < 2 )
               Util.fatal_error( PRG_NAME, "unexpected format: " + line )
             end
@@ -280,7 +317,7 @@ module Evoruby
             ids_map_writer.write( normalized_id + ": " + seq.get_name + Constants::LINE_DELIMITER )
 
             orig_name = nil
-            if ( seq != nil )
+            if seq != nil
               orig_name = seq.get_name
               seq.set_name( seq.get_name + " [" + current_species + "]" )
               new_msa.add_sequence( seq )
@@ -288,21 +325,71 @@ module Evoruby
               Util.fatal_error( PRG_NAME, "unexected error: seq is nil" )
             end
 
-            if  domain_ranges != nil
+            if domain_ranges != nil
+              first = true
+              prev_to = -1
+
               domain_ranges.each { |range|
                 if range != nil && range.length > 0
-                  s= range.split("-")
-                  from = s[ 0 ]
-                  to = s[ 1 ]
-                  new_msa_domains.add_sequence( Sequence.new( orig_name + "/" + from + "-" + to + " [" + basename + "] [" + current_species + "]", seq.get_sequence_as_string[from.to_i..to.to_i] ) )
-                end
+                  s = range.split("-")
+                  from = s[ 0 ].to_i
+                  to = s[ 1 ].to_i
+                  new_msa_domains.add_sequence( Sequence.new( orig_name +
+                       " [" + from.to_s +
+                       "-" + to.to_s +
+                       "] [" + basename + "] [" +
+                       current_species + "]",
+                      seq.get_sequence_as_string[from..to] ) )
+                  if extension > 0
+                    from_e = from - extension
+                    if from_e < 0
+                      from_e = 0
+                    end
+                    to_e = to + extension
+                    if to_e > seq.get_sequence_as_string.length - 1
+                      to_e = seq.get_sequence_as_string.length - 1
+                    end
+                    new_msa_domains_extended.add_sequence( Sequence.new( orig_name +
+                         " [" + from.to_s +
+                         "-" + to.to_s  +
+                         "] [extended by " +
+                         extension.to_s +
+                         "] [" + basename + "] [" +
+                         current_species + "]",
+                        seq.get_sequence_as_string[ from_e..to_e ] ) )
+                  end # extension > 0
+                  if  extract_linkers
+                    if first
+                      first = false
+                      f = 0
+                      t = from - 1
+                      if extension > 0
+                        f = from - extension
+                      end
+                      mod_line = line + "\t[" + get_linker_sequence( f, t, seq ) + "|"
+                    else
+                      mod_line += get_linker_sequence( prev_to + 1, from - 1, seq ) + "|"
+                    end
+                    prev_to = to
+                  end
+                end # range != nil && range.length > 0
               }
+              if extract_linkers && prev_to > 0
+                f = prev_to + 1
+                t = seq.get_sequence_as_string.length - 1
+                if extension > 0
+                  t = prev_to + extension
+                end
+                mod_line += get_linker_sequence( f, t, seq ) + "]"
+              end
             end
 
             new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) )
-
-          end
-        end
+            if mod_line
+              linkers << mod_line + Constants::LINE_DELIMITER
+            end
+          end # !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
+        end # while line = file.gets
 
       end
 
@@ -324,7 +411,7 @@ module Evoruby
         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
       end
 
-      if  new_msa_domains != nil
+      if new_msa_domains != nil
         begin
           io.write_to_file( new_msa_domains, doms_out_file_path_fasta_file, fasta_writer )
         rescue Exception => e
@@ -332,6 +419,14 @@ module Evoruby
         end
       end
 
+      if extension > 0 && new_msa_domains_extended != nil
+        begin
+          io.write_to_file( new_msa_domains_extended, doms_ext_out_file_path_fasta_file, fasta_writer )
+        rescue Exception => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+        end
+      end
+
       begin
         io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer )
       rescue Exception => e
@@ -346,6 +441,32 @@ module Evoruby
         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
       end
 
+      if extract_linkers && linkers != nil
+        begin
+          f = File.open( out_dir + Constants::FILE_SEPARATOR + basename +  LINKERS_SUFFIX , 'a' )
+          f.print( linkers )
+          f.close
+        rescue Exception => e
+          Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+        end
+      end
+
+
+    end
+
+
+    def get_linker_sequence( from, to, seq )
+      if from < 0
+        from = 0
+      end
+      if to > seq.get_sequence_as_string.length - 1
+        to = seq.get_sequence_as_string.length - 1
+      end
+      if from > to
+        return ""
+      else
+        return from.to_s + "-" + to.to_s + ":" + seq.get_sequence_as_string[ from..to ]
+      end
     end
 
     def obtain_inputfiles( input_dir, seq_names_files_suffix )
@@ -419,7 +540,10 @@ module Evoruby
          "and possibly genome multiple-sequence ('fasta') files> <output directory for sequences> <output directory for domains> [mapping file for " +
          "genome multiple-sequence ('fasta') files not in input dir]" )
       puts()
-      puts( "  " + "Example: \"mse.rb .prot . seqs doms ../genome_locations.txt\"" )
+      puts( "  option: -" + EXT_OPTION  + "=<int>: to extend extracted domains" )
+      puts( "          -" + EXTRACT_LINKERS_OPTION  + "      : to extract linkers" )
+      puts()
+      puts( "  " + "Example: \"mse.rb .prot . protein_seqs domain_seqs ../genome_locations.txt\"" )
       puts()
     end