cf6d40ed2faac27ab1b4eb30417595d7aa5cc19a
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / multi_domain_seq_extractor.rb
1 #
2 # = lib/evo/tool/multi_domain_seq_extractor - MultiDomainSeqExtractor class
3 #
4 # Copyright::    Copyright (C) 2017 Christian M. Zmasek
5 # License::      GNU Lesser General Public License (LGPL)
6
7 require 'lib/evo/util/constants'
8 require 'lib/evo/util/util'
9 require 'lib/evo/util/command_line_arguments'
10 require 'lib/evo/io/parser/hmmscan_multi_domain_extractor'
11
module Evoruby
  # Command line front end of the "mdsx" program.
  #
  # Reads its arguments from ARGV (see #print_help for the usage line), works
  # out the input and output file names and hands the actual work to
  # HmmscanMultiDomainExtractor#parse. The log string filled during parsing is
  # appended to "<outfile>" + LOG_FILE_SUFFIX.
  #
  # NOTE(review): the code relies on Util.fatal_error not returning (values
  # such as +cla+ are used right after a call to it) -- confirm against Util.
  class MultiDomainSeqExtractor

    PRG_NAME       = "mdsx"
    PRG_VERSION    = "1.000"
    PRG_DESC       = "Extraction of multi domain sequences from hmmscan output"
    PRG_DATE       = "20170220"
    WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"

    LOG_FILE_SUFFIX                    = '_MDSX.log'
    HELP_OPTION_1                      = 'help'
    HELP_OPTION_2                      = 'h'

    # Program entry point.
    #
    # Expected positional arguments (taken from ARGV):
    # 0:: the domain identifier / domain architecture passed on to the parser
    # 1:: the hmmscan output file; its name must contain Constants::HMMSCAN
    #     because the output file name is derived from it
    # 2:: (optional) the fasta file with the complete sequences; when omitted
    #     it is looked up in the current directory
    #
    # Prints the help text and exits when no arguments are given, when a help
    # option is set, or when the number of file arguments is not 2 or 3.
    # Every other problem is reported through Util.fatal_error.
    def run
      Util.print_program_information( PRG_NAME,
                                      PRG_VERSION,
                                      PRG_DESC,
                                      PRG_DATE,
                                      WWW,
                                      STDOUT )

      if ARGV.nil? || ARGV.empty?
        print_help
        exit( -1 )
      end

      begin
        cla = CommandLineArguments.new( ARGV )
      rescue ArgumentError => e
        # String#+ needs a String operand, hence the explicit to_s on the
        # exception (same form as the parser error handling below).
        Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
      end

      if cla.is_option_set?( HELP_OPTION_1 ) || cla.is_option_set?( HELP_OPTION_2 )
        print_help
        exit( 0 )
      end

      number_of_files = cla.get_number_of_files
      unless [ 2, 3 ].include?( number_of_files )
        print_help
        exit( -1 )
      end

      # No options besides help are defined, so the allow-list is empty and
      # any option still present is reported as unknown.
      disallowed = cla.validate_allowed_options_as_str( [] )
      if disallowed.length > 0
        Util.fatal_error( PRG_NAME,
                          "unknown option(s): " + disallowed,
                          STDOUT )
      end

      domain_id      = cla.get_file_name( 0 )
      hmmscan_output = cla.get_file_name( 1 )

      # Both the lookup of the fasta file and the output file name depend on
      # the position of the marker, so it is checked once, up front.
      hmmscan_index = hmmscan_output.index( Constants::HMMSCAN )
      if hmmscan_index.nil?
        Util.fatal_error( PRG_NAME, 'input files do not seem in format for standard analysis pipeline, need to explicitly indicate all' )
      end

      fasta_sequence_file =
        if number_of_files == 3
          cla.get_file_name( 2 )
        else
          # Exclusive range: yields "" (not the whole name) when the marker
          # sits at the very start of the file name.
          find_fasta_sequence_file( hmmscan_output[ 0...hmmscan_index ] )
        end

      outfile = hmmscan_output.sub( Constants::HMMSCAN, "_" ) + "_MDSX"

      log = String.new
      ld  = Constants::LINE_DELIMITER

      puts
      log << ld

      domain_count = 0
      begin
        parser = HmmscanMultiDomainExtractor.new
        domain_count = parser.parse( domain_id,
                                     hmmscan_output,
                                     fasta_sequence_file,
                                     outfile,
                                     log )
      rescue ArgumentError, IOError => e
        Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT )
      rescue StandardError => e
        # StandardError rather than Exception, so that SystemExit and
        # Interrupt are not reported as "unexpected exception".
        puts e.backtrace
        Util.fatal_error( PRG_NAME, "unexpected exception: " + e.to_s, STDOUT )
      end

      puts
      Util.print_message( PRG_NAME, "extracted a total of " + domain_count.to_s + " domains" )

      write_log( outfile, log )

      puts
      Util.print_message( PRG_NAME, "OK" )
      puts
    end

    # Prints the usage text to standard output.
    def print_help
      puts
      puts( "Usage:" )
      puts
      puts( "  " + PRG_NAME + ".rb <da> <hmmscan outputfile> [file containing complete sequences in fasta format]" )
      puts
      puts( "  options: -" )
      puts
      puts( "Examples:" )
      puts
      puts( "  " + PRG_NAME + ".rb " )
      puts
      puts
    end

    private

    # Returns the single entry of the current directory selected by
    # Util.get_matching_files for +prefix+ and
    # Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX. Zero or several candidates
    # are reported through Util.fatal_error.
    def find_fasta_sequence_file( prefix )
      suffix         = Constants::ID_NORMALIZED_FASTA_FILE_SUFFIX
      matching_files = Util.get_matching_files( Dir.entries( "." ), prefix, suffix )
      unless matching_files.length == 1
        found = matching_files.length < 1 ? 'no file' : 'more than one file'
        # The fasta file is the third positional argument (see print_help).
        Util.fatal_error( PRG_NAME, found + ' matching [' + prefix + '...' + suffix +
                          '] present in current directory: need to indicate <file containing complete sequences in fasta format> as third argument' )
      end
      matching_files[ 0 ]
    end

    # Appends +log+ to the file named +outfile+ + LOG_FILE_SUFFIX. The block
    # form of File.open closes the file even when writing fails; failures are
    # reported through Util.fatal_error.
    def write_log( outfile, log )
      File.open( outfile + LOG_FILE_SUFFIX, 'a' ) { |f| f.print( log ) }
    rescue StandardError => e
      Util.fatal_error( PRG_NAME, "error: " + e.to_s )
    end

  end # class MultiDomainSeqExtractor

end # module Evoruby