2 # = lib/evo/tool/multi_domain_seq_extractor - MultiDomainSeqExtractor class
4 # Copyright:: Copyright (C) 2017 Christian M. Zmasek
5 # License:: GNU Lesser General Public License (LGPL)
7 require 'lib/evo/util/constants'
8 require 'lib/evo/util/util'
9 require 'lib/evo/util/command_line_arguments'
10 require 'lib/evo/io/parser/hmmscan_multi_domain_extractor'
# Command-line driver that extracts multi domain sequences from hmmscan
# output (see PRG_DESC) by delegating to HmmscanMultiDomainExtractor.
#
# NOTE(review): this listing carries the original line numbers as a prefix
# and omits a number of lines (method definitions, begin/else/end lines and
# some call arguments). The comments below describe only what is visible.
13 class MultiDomainSeqExtractor
# Program metadata and file-name constants.
17 PRG_DESC = "Extraction of multi domain sequences from hmmscan output"
19 WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"
21 LOG_FILE_SUFFIX = '_MDSX.log'
22 HELP_OPTION_1 = 'help'
# Print the program banner (remaining arguments are not visible here).
26 Util.print_program_information( PRG_NAME,
# No command-line arguments at all; the handling is not visible here
# (presumably the help text is printed -- TODO confirm).
33 if ( ARGV == nil || ( ARGV.length < 1 ) )
# Parse the raw command line into options and file names.
39 cla = CommandLineArguments.new( ARGV )
# Reports the most recent exception ($!), presumably from a rescue around
# the parsing above.
# NOTE(review): $! is an exception object, not a String; the later handlers
# use e.to_s before concatenating -- confirm this concatenation works.
41 Util.fatal_error( PRG_NAME, "error: " + $!, STDOUT )
# Either help option was given on the command line.
44 if ( cla.is_option_set?( HELP_OPTION_1 ) ||
45 cla.is_option_set?( HELP_OPTION_2 ) )
# Exactly two or three file-name arguments are accepted.
50 unless ( cla.get_number_of_files == 2 || cla.get_number_of_files == 3 )
# Check the given options against the allowed list and abort on any
# unknown option (how allowed_opts is populated is not visible here).
55 allowed_opts = Array.new
57 disallowed = cla.validate_allowed_options_as_str( allowed_opts )
58 if ( disallowed.length > 0 )
59 Util.fatal_error( PRG_NAME,
60 "unknown option(s): " + disallowed,
# Positional arguments: 0 = domain id, 1 = hmmscan output file,
# 2 = (optional) fasta file with the complete sequences.
64 domain_id = cla.get_file_name( 0 )
65 hmmscan_output = cla.get_file_name( 1 )
66 fasta_sequence_file = ""
# An explicitly given third file name is used as the fasta sequence file.
69 if (cla.get_number_of_files == 3)
70 fasta_sequence_file = cla.get_file_name( 2 )
# Fallback (presumably the else-branch of the check above -- TODO confirm):
# derive the fasta file from the hmmscan output name. The part of the name
# before Constants::HMMSCAN is the prefix; exactly one file in the current
# directory must match prefix + Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX.
72 hmmscan_index = hmmscan_output.index(Constants::HMMSCAN)
73 if ( hmmscan_index != nil )
# NOTE(review): if hmmscan_index is 0 the range is 0 .. -1, which selects
# the whole file name rather than an empty prefix -- confirm this is intended.
74 prefix = hmmscan_output[0 .. hmmscan_index-1 ]
75 suffix = Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX
76 files = Dir.entries( "." )
77 matching_files = Util.get_matching_files( files, prefix, suffix)
# NOTE(review): both messages below say "second argument", but the fasta
# file is read from position 2 (the third file name) -- confirm the wording.
78 if matching_files.length < 1
79 Util.fatal_error( PRG_NAME, 'no file matching [' + prefix +
80 '...' + suffix + '] present in current directory: need to indicate <file containing complete sequences in fasta format> as second argument' )
82 if matching_files.length > 1
83 Util.fatal_error( PRG_NAME, 'more than one file matching [' +
84 prefix + '...' + suffix + '] present in current directory: need to indicate <file containing complete sequences in fasta format> as second argument' )
86 fasta_sequence_file = matching_files[ 0 ]
# The hmmscan output name does not contain Constants::HMMSCAN.
88 Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' )
# Derive the output base name: the first occurrence of Constants::HMMSCAN in
# the hmmscan output name is replaced by "_" and "_MDSX" is appended.
91 hmmscan_index = hmmscan_output.index(Constants::HMMSCAN)
92 if ( hmmscan_index != nil )
93 outfile = hmmscan_output.sub(Constants::HMMSCAN, "_") + "_MDSX"
95 Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' )
# Line delimiter; in the visible lines it is only used by the disabled
# logging statements below.
99 ld = Constants::LINE_DELIMITER
102 # puts( "Domain : " + domain_id )
103 # log << "Domain : " + domain_id + ld
104 # puts( "Hmmscan outputfile : " + hmmscan_output )
105 # log << "Hmmscan outputfile : " + hmmscan_output + ld
106 # puts( "Fasta sequencefile (complete sequences) : " + fasta_sequence_file )
107 # log << "Fasta sequencefile (complete sequences) : " + fasta_sequence_file + ld
108 # puts( "Outputfile : " + outfile + ".fasta" )
109 # log << "Outputfile : " + outfile + ld
110 # puts( "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX )
111 # log << "Passed sequences outfile (fasta) : " + outfile + PASSED_SEQS_SUFFIX + ld
112 # puts( "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX )
113 # log << "Failed sequences outfile (fasta) : " + outfile + FAILED_SEQS_SUFFIX + ld
114 # puts( "Logfile : " + outfile + LOG_FILE_SUFFIX )
115 # log << "Logfile : " + outfile + LOG_FILE_SUFFIX + ld
# Run the extraction; the value returned by parse is reported further down
# as the total number of extracted domains (remaining arguments not visible).
122 parser = HmmscanMultiDomainExtractor.new()
123 domain_count = parser.parse( domain_id,
# Argument and I/O problems are reported as ordinary errors.
128 rescue ArgumentError, IOError => e
129 Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
# Anything else is reported as unexpected.
# NOTE(review): rescuing Exception also catches SystemExit and signals.
131 rescue Exception => e
133 Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT )
138 Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" )
139 # Util.print_message( PRG_NAME, "wrote: " + outfile + ".fasta")
140 # Util.print_message( PRG_NAME, "wrote: " + outfile + LOG_FILE_SUFFIX )
141 # Util.print_message( PRG_NAME, "wrote: " + outfile + PASSED_SEQS_SUFFIX )
142 # Util.print_message( PRG_NAME, "wrote: " + outfile + FAILED_SEQS_SUFFIX )
# Open the log file in append mode; what is written and where the file is
# closed is not visible here.
# NOTE(review): outfile already ends in "_MDSX" and LOG_FILE_SUFFIX is
# '_MDSX.log', so the log name ends in "_MDSX_MDSX.log" -- confirm intended.
145 f = File.open( outfile + LOG_FILE_SUFFIX, 'a' )
148 rescue Exception => e
149 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
153 Util.print_message( PRG_NAME, "OK" )
# Usage text: <da> (read above as the domain id), the hmmscan output file
# and the optional fasta file.
162 puts( " " + PRG_NAME + ".rb <da> <hmmscan outputfile> [file containing complete sequences in fasta format]" )
164 puts( " options: -" )
168 puts( " " + PRG_NAME + ".rb " )
174 end # class MultiDomainSeqExtractor