X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=forester%2Fruby%2Fevoruby%2Flib%2Fevo%2Ftool%2Fmulti_sequence_extractor.rb;h=2bf5c65fcaaa1fa24cf878f36a92ed0ee9e5f63e;hb=1d0174b7a5bc38ee6ef5815ad89344609663ee41;hp=b499c7262fa555d38560d9656740d58e5bdad257;hpb=dbf5b588d65d1c62094dd5d339eca5056a5ade5f;p=jalview.git diff --git a/forester/ruby/evoruby/lib/evo/tool/multi_sequence_extractor.rb b/forester/ruby/evoruby/lib/evo/tool/multi_sequence_extractor.rb index b499c72..2bf5c65 100644 --- a/forester/ruby/evoruby/lib/evo/tool/multi_sequence_extractor.rb +++ b/forester/ruby/evoruby/lib/evo/tool/multi_sequence_extractor.rb @@ -1,10 +1,9 @@ # # = lib/evo/apps/multi_sequence_extractor.rb - MultiSequenceExtractor class # -# Copyright:: Copyright (C) 2006-2008 Christian M. Zmasek +# Copyright:: Copyright (C) 2014 Christian M. Zmasek # License:: GNU Lesser General Public License (LGPL) # -# $Id: multi_sequence_extractor.rb,v 1.10 2010/12/13 19:00:11 cmzmasek Exp $ require 'lib/evo/util/constants' @@ -16,19 +15,17 @@ require 'lib/evo/io/parser/fasta_parser' require 'lib/evo/io/writer/fasta_writer' require 'lib/evo/util/command_line_arguments' - - module Evoruby class MultiSequenceExtractor PRG_NAME = "mse" - PRG_VERSION = "1.02" + PRG_VERSION = "1.04" PRG_DESC = "extraction of sequences by name from multiple multi-sequence ('fasta') files" - PRG_DATE = "2012.07.20" - COPYRIGHT = "2008-2012 Christian M Zmasek" - CONTACT = "phylosoft@gmail.com" - WWW = "www.phylosoft.org" + PRG_DATE = "140318" + COPYRIGHT = "2014 Christian M Zmasek" + CONTACT = "phyloxml@gmail.com" + WWW = "https://sites.google.com/site/cmzmasek/home/software/forester" HELP_OPTION_1 = 'help' HELP_OPTION_2 = 'h' @@ -41,6 +38,10 @@ module Evoruby NORMALIZED_IDS_MAP_SUFFIX = ".nim" PROTEINS_LIST_FILE_SEPARATOR = "\t" + def initialize() + @file_to_msa = Hash.new + @seqs = 0 + end def run() @@ -67,7 +68,7 @@ module Evoruby exit( 0 ) end - if ( cla.get_number_of_files != 4 && cla.get_number_of_files != 5 ) + if ( cla.get_number_of_files != 5 ) print_help exit( -1 ) end @@ -87,15 +88,12 @@ module Evoruby input_dir = cla.get_file_name( 1 ) out_dir = cla.get_file_name( 2 ) out_dir_doms = cla.get_file_name( 3 ) - mapping_file = nil + mapping_file = cla.get_file_name( 4 ) - if ( cla.get_number_of_files == 5 ) - mapping_file = cla.get_file_name( 4 ) - begin - Util.check_file_for_readability( mapping_file ) - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end + begin + Util.check_file_for_readability( mapping_file ) + rescue ArgumentError => e + Util.fatal_error( PRG_NAME, "error: " + e.to_s ) end extension = 0 @@ -111,26 +109,32 @@ module Evoruby extract_linkers = true end - if !File.exist?( input_dir ) + unless File.exist?( out_dir ) + Dir.mkdir( out_dir ) + end + unless File.exist?( out_dir_doms ) + Dir.mkdir( out_dir_doms ) + end + + unless File.exist?( input_dir ) Util.fatal_error( PRG_NAME, "error: input directory [#{input_dir}] does not exist" ) end - if !File.exist?( out_dir ) + unless File.exist?( out_dir ) Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir}] does not exist" ) end - if !File.exist?( out_dir_doms ) + unless File.exist?( out_dir_doms ) Util.fatal_error( PRG_NAME, "error: output directory [#{out_dir_doms}] does not exist" ) end - if !File.directory?( input_dir ) + unless File.directory?( input_dir ) Util.fatal_error( PRG_NAME, "error: [#{input_dir}] is not a directory" ) end - if !File.directory?( out_dir ) + unless File.directory?( out_dir ) Util.fatal_error( PRG_NAME, "error: [#{out_dir}] is not a directory" ) end - if !File.directory?( out_dir_doms ) + unless File.directory?( out_dir_doms ) Util.fatal_error( PRG_NAME, "error: [#{out_dir_doms}] is not a directory" ) end - log = String.new log << "Program : " + PRG_NAME + ld @@ -146,10 +150,8 @@ module Evoruby log << "Output dir : " + out_dir + ld puts( "Output dir domains : " + out_dir_doms ) log << "Output dir domains : " + out_dir_doms + ld - if ( mapping_file != nil ) - puts( "Mapping file : " + mapping_file ) - log << "Mapping file : " + mapping_file + ld - end + puts( "Mapping file : " + mapping_file ) + log << "Mapping file : " + mapping_file + ld if extension > 0 puts( "Extension : " + extension.to_s ) log << "Extension : " + extension.to_s + ld @@ -161,9 +163,7 @@ module Evoruby log << "Date : " + Time.now.to_s + ld puts - if ( mapping_file != nil ) - species_codes_to_paths = extract_mappings( mapping_file ) - end + species_codes_to_paths = extract_mappings( mapping_file ) input_files = obtain_inputfiles( input_dir, seq_names_files_suffix ) @@ -245,6 +245,7 @@ module Evoruby puts basename File.open( input_file ) do | file | + species_counter = 1 while line = file.gets line.strip! if !Util.is_string_empty?( line ) && !(line =~ /\s*#/ ) @@ -254,9 +255,9 @@ module Evoruby Util.fatal_error( PRG_NAME, "unexpected format: " + line ) end species = values[ 0 ] - if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA" || species == "EIMTE" - next - end + #if species == "BRADI" || species == "ASPNG" || species == "SCLSC" || species == "PTEVA" || species == "EIMTE" + # next + #end seq_name = values[ 1 ] domain_ranges = nil if ( values.length > 3 ) @@ -286,27 +287,26 @@ module Evoruby print_counts( per_species_counter, log, Constants::LINE_DELIMITER ) per_species_counter = 0 end - puts " " + current_species + " [" + my_readlink + "]" - log << current_species + " [" + my_readlink + "]" + Constants::LINE_DELIMITER + puts " " + species_counter.to_s + ":" + current_species + " [" + my_readlink + "]" + log << species_counter.to_s << ": " << current_species << " [" + my_readlink + "]" << Constants::LINE_DELIMITER + species_counter += 1 end - puts " " + seq_name - log << " " + seq_name + Constants::LINE_DELIMITER + log << " " << seq_name << Constants::LINE_DELIMITER per_species_counter = per_species_counter + 1 seq = nil - if current_msa.find_by_name_start( seq_name, true ).size > 0 - begin - seq = current_msa.get_by_name_start( seq_name, true ).copy - rescue ArgumentError => e - Util.fatal_error( PRG_NAME, "error: " + e.to_s ) - end - else + indices = current_msa.find_by_name_start( seq_name, true ) + if indices.size == 1 + seq = current_msa.get_sequence( indices[ 0 ] ) + elsif indices.size == 0 # Not found, try finding by partial match. begin seq = current_msa.get_by_name( seq_name, true, true ) rescue ArgumentError => e Util.fatal_error( PRG_NAME, "error: " + e.to_s ) end + else + Util.fatal_error( PRG_NAME, "error: seq name \"" + seq_name + "\" not unique" ) end normalized_id = per_species_counter.to_s( 16 ).upcase + @@ -314,7 +314,7 @@ module Evoruby per_species_counter.to_i - ids_map_writer.write( normalized_id + ": " + seq.get_name + Constants::LINE_DELIMITER ) + ids_map_writer.write( normalized_id + "\t" + seq.get_name + Constants::LINE_DELIMITER ) orig_name = nil if seq != nil @@ -450,8 +450,6 @@ module Evoruby Util.fatal_error( PRG_NAME, "error: " + e.to_s ) end end - - end @@ -493,19 +491,19 @@ module Evoruby if ( species_code_to_path.has_value?( path ) ) Util.fatal_error( PRG_NAME, "error: path [#{path}] is not unique" ) end - if ( !File.exist?( path ) ) + unless ( File.exist?( path ) ) Util.fatal_error( PRG_NAME, "error: file [#{path}] does not exist" ) end - if ( !File.file?( path ) ) + unless ( File.file?( path ) ) Util.fatal_error( PRG_NAME, "error: [#{path}] is not a regular file" ) end - if ( !File.readable?( path ) ) + unless ( File.readable?( path ) ) Util.fatal_error( PRG_NAME, "error: file [#{path}] is not readable" ) end - if ( File.size( path ) < 10000 ) + if ( File.size( path ) < 1000 ) Util.fatal_error( PRG_NAME, "error: file [#{path}] appears too small" ) end - if ( !Util.looks_like_fasta?( path ) ) + unless ( Util.looks_like_fasta?( path ) ) Util.fatal_error( PRG_NAME, "error: file [#{path}] does not appear to be a fasta file" ) end species_code_to_path[ species ] = path @@ -518,11 +516,15 @@ module Evoruby end def print_counts( per_species_counter, log, ld ) - puts " [sum: " + per_species_counter.to_s + "]" - log << " [sum: " + per_species_counter.to_s + "]" + ld + puts " sum: " + per_species_counter.to_s + log << " sum: " + per_species_counter.to_s + ld end def read_fasta_file( input ) + if @file_to_msa.has_key?( input ) + return @file_to_msa[ input ] + end + f = MsaFactory.new() msa = nil begin @@ -530,15 +532,19 @@ module Evoruby rescue Exception => e Util.fatal_error( PRG_NAME, "error: " + e.to_s ) end + if @seqs <= 400000 + @file_to_msa[ input ] = msa + @seqs += msa.get_number_of_seqs + puts " total seqs in memory: " + @seqs.to_s + end msa end def print_help() puts( "Usage:" ) puts() - puts( " " + PRG_NAME + ".rb [mapping file for " + - "genome multiple-sequence ('fasta') files not in input dir]" ) + puts( " " + PRG_NAME + ".rb " + + " " ) puts() puts( " option: -" + EXT_OPTION + "=: to extend extracted domains" ) puts( " -" + EXTRACT_LINKERS_OPTION + " : to extract linkers" )