class MultiSequenceExtractor
PRG_NAME = "mse"
- PRG_VERSION = "1.0.0"
+ PRG_VERSION = "1.02"
PRG_DESC = "extraction of sequences by name from multiple multi-sequence ('fasta') files"
- PRG_DATE = "2008.08.13"
- COPYRIGHT = "2008-2009 Christian M Zmasek"
+ PRG_DATE = "2012.07.20"
+ COPYRIGHT = "2008-2012 Christian M Zmasek"
CONTACT = "phylosoft@gmail.com"
WWW = "www.phylosoft.org"
HELP_OPTION_1 = 'help'
HELP_OPTION_2 = 'h'
+ EXT_OPTION = 'e'
+ EXTRACT_LINKERS_OPTION = 'l'
LOG_SUFFIX = ".mse_log"
+ LINKERS_SUFFIX = ".linkers"
FASTA_SUFFIX = ".fasta"
FASTA_WITH_NORMALIZED_IDS_SUFFIX = ".ni.fasta"
NORMALIZED_IDS_MAP_SUFFIX = ".nim"
PROTEINS_LIST_FILE_SEPARATOR = "\t"
+
def run()
Util.print_program_information( PRG_NAME,
end
allowed_opts = Array.new
+ allowed_opts.push(EXT_OPTION)
+ allowed_opts.push(EXTRACT_LINKERS_OPTION)
disallowed = cla.validate_allowed_options_as_str( allowed_opts )
if ( disallowed.length > 0 )
end
end
+ extension = 0
+ if cla.is_option_set?(EXT_OPTION)
+ extension = cla.get_option_value_as_int(EXT_OPTION)
+ if extension < 0
+ extension = 0
+ end
+ end
+
+ extract_linkers = false
+ if cla.is_option_set?(EXTRACT_LINKERS_OPTION)
+ extract_linkers = true
+ end
+
if !File.exist?( input_dir )
Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" )
end
puts( "Mapping file : " + mapping_file )
log << "Mapping file : " + mapping_file + ld
end
+ if extension > 0
+ puts( "Extension : " + extension.to_s )
+ log << "Extension : " + extension.to_s + ld
+ end
+ if extract_linkers
+ puts( "Extract linkers : true" )
+ log << "Extract linkers : true" + ld
+ end
log << "Date : " + Time.now.to_s + ld
puts
log,
out_dir,
out_dir_doms,
- mapping_file )
+ mapping_file,
+ extension,
+ extract_linkers )
}
puts
Util.print_message( PRG_NAME, "OK" )
log,
out_dir,
out_dir_doms,
- mapping_file )
+ mapping_file,
+ extension,
+ extract_linkers )
begin
Util.check_file_for_readability( input_file )
out_file_path_normalized_ids_fasta_file = out_dir + Constants::FILE_SEPARATOR + basename + FASTA_WITH_NORMALIZED_IDS_SUFFIX
out_file_path_ids_map = out_dir + Constants::FILE_SEPARATOR + basename + NORMALIZED_IDS_MAP_SUFFIX
doms_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains" + FASTA_SUFFIX
+ doms_ext_out_file_path_fasta_file = nil
+ if extension > 0
+ doms_ext_out_file_path_fasta_file = out_dir_doms + Constants::FILE_SEPARATOR + basename + "_domains_ext_" + extension.to_s + FASTA_SUFFIX
+ end
begin
Util.check_file_for_writability( out_file_path_fasta_file )
Util.check_file_for_writability( out_file_path_normalized_ids_fasta_file )
new_msa = Msa.new
new_msa_normalized_ids = Msa.new
new_msa_domains = Msa.new
+ new_msa_domains_extended = Msa.new
per_species_counter = 0
+ linkers = ""
puts basename
File.open( input_file ) do | file |
while line = file.gets
- if ( !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) )
+ line.strip!
+ if !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
values = line.split( PROTEINS_LIST_FILE_SEPARATOR )
-
+ mod_line = nil
if ( values.length < 2 )
Util.fatal_error( PRG_NAME, "unexpected format: " + line )
end
ids_map_writer.write( normalized_id + ": " + seq.get_name + Constants::LINE_DELIMITER )
orig_name = nil
- if ( seq != nil )
+ if seq != nil
orig_name = seq.get_name
seq.set_name( seq.get_name + " [" + current_species + "]" )
new_msa.add_sequence( seq )
Util.fatal_error( PRG_NAME, "unexected error: seq is nil" )
end
- if domain_ranges != nil
+ if domain_ranges != nil
+ first = true
+ prev_to = -1
+
domain_ranges.each { |range|
if range != nil && range.length > 0
- s= range.split("-")
- from = s[ 0 ]
- to = s[ 1 ]
- new_msa_domains.add_sequence( Sequence.new( orig_name + "/" + from + "-" + to + " [" + basename + "] [" + current_species + "]", seq.get_sequence_as_string[from.to_i..to.to_i] ) )
- end
+ s = range.split("-")
+ from = s[ 0 ].to_i
+ to = s[ 1 ].to_i
+ new_msa_domains.add_sequence( Sequence.new( orig_name +
+ " [" + from.to_s +
+ "-" + to.to_s +
+ "] [" + basename + "] [" +
+ current_species + "]",
+ seq.get_sequence_as_string[from..to] ) )
+ if extension > 0
+ from_e = from - extension
+ if from_e < 0
+ from_e = 0
+ end
+ to_e = to + extension
+ if to_e > seq.get_sequence_as_string.length - 1
+ to_e = seq.get_sequence_as_string.length - 1
+ end
+ new_msa_domains_extended.add_sequence( Sequence.new( orig_name +
+ " [" + from.to_s +
+ "-" + to.to_s +
+ "] [extended by " +
+ extension.to_s +
+ "] [" + basename + "] [" +
+ current_species + "]",
+ seq.get_sequence_as_string[ from_e..to_e ] ) )
+ end # extension > 0
+ if extract_linkers
+ if first
+ first = false
+ f = 0
+ t = from - 1
+ if extension > 0
+ f = from - extension
+ end
+ mod_line = line + "\t[" + get_linker_sequence( f, t, seq ) + "|"
+ else
+ mod_line += get_linker_sequence( prev_to + 1, from - 1, seq ) + "|"
+ end
+ prev_to = to
+ end
+ end # range != nil && range.length > 0
}
+ if extract_linkers && prev_to > 0
+ f = prev_to + 1
+ t = seq.get_sequence_as_string.length - 1
+ if extension > 0
+ t = prev_to + extension
+ end
+ mod_line += get_linker_sequence( f, t, seq ) + "]"
+ end
end
new_msa_normalized_ids.add_sequence( Sequence.new( normalized_id, seq.get_sequence_as_string ) )
-
- end
- end
+ if mod_line
+ linkers << mod_line + Constants::LINE_DELIMITER
+ end
+ end # !Util.is_string_empty?( line ) && !(line =~ /\s*#/ )
+ end # while line = file.gets
end
Util.fatal_error( PRG_NAME, "error: " + e.to_s )
end
- if new_msa_domains != nil
+ if new_msa_domains != nil
begin
io.write_to_file( new_msa_domains, doms_out_file_path_fasta_file, fasta_writer )
rescue Exception => e
end
end
+ if extension > 0 && new_msa_domains_extended != nil
+ begin
+ io.write_to_file( new_msa_domains_extended, doms_ext_out_file_path_fasta_file, fasta_writer )
+ rescue Exception => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+ end
+ end
+
begin
io.write_to_file( new_msa_normalized_ids, out_file_path_normalized_ids_fasta_file, fasta_writer )
rescue Exception => e
Util.fatal_error( PRG_NAME, "error: " + e.to_s )
end
+ if extract_linkers && linkers != nil
+ begin
+ f = File.open( out_dir + Constants::FILE_SEPARATOR + basename + LINKERS_SUFFIX , 'a' )
+ f.print( linkers )
+ f.close
+ rescue Exception => e
+ Util.fatal_error( PRG_NAME, "error: " + e.to_s )
+ end
+ end
+
+
+ end
+
+
+ def get_linker_sequence( from, to, seq )
+ if from < 0
+ from = 0
+ end
+ if to > seq.get_sequence_as_string.length - 1
+ to = seq.get_sequence_as_string.length - 1
+ end
+ if from > to
+ return ""
+ else
+ return from.to_s + "-" + to.to_s + ":" + seq.get_sequence_as_string[ from..to ]
+ end
end
def obtain_inputfiles( input_dir, seq_names_files_suffix )
"and possibly genome multiple-sequence ('fasta') files> <output directory for sequences> <output directory for domains> [mapping file for " +
"genome multiple-sequence ('fasta') files not in input dir]" )
puts()
- puts( " " + "Example: \"mse.rb .prot . seqs doms ../genome_locations.txt\"" )
+ puts( " option: -" + EXT_OPTION + "=<int>: to extend extracted domains" )
+ puts( " -" + EXTRACT_LINKERS_OPTION + " : to extract linkers" )
+ puts()
+ puts( " " + "Example: \"mse.rb .prot . protein_seqs domain_seqs ../genome_locations.txt\"" )
puts()
end