in progress...
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / fasta_extractor.rb
1 #
2 # = lib/evo/tool/fasta_extractor.rb - FastaExtractor class
3 #
4 # Copyright::    Copyright (C) 2017 Christian M. Zmasek
5 # License::      GNU Lesser General Public License (LGPL)
6
7 require 'lib/evo/util/constants'
8 require 'lib/evo/util/util'
9 require 'lib/evo/util/command_line_arguments'
10
module Evoruby
  # Command-line tool ("fastx") that copies every FASTA record whose header
  # line matches a query pattern from an input file into a new output file.
  #
  # Usage: fastx.rb <input fasta file> <query> <output file>
  class FastaExtractor

    PRG_NAME       = "fastx"
    PRG_VERSION    = "1.000"
    PRG_DESC       = "extraction of molecular sequences from a fasta file"
    PRG_DATE       = "170215"
    WWW            = "https://sites.google.com/site/cmzmasek/home/software/forester"

    HELP_OPTION_1                      = 'help'
    HELP_OPTION_2                      = 'h'

    # A FASTA header line: optional leading whitespace followed by '>'.
    HEADER_PATTERN    = /^\s*>/
    # A progress counter is written to STDOUT every this many header lines.
    PROGRESS_INTERVAL = 10000

    # Program entry point: prints the program banner, validates the command
    # line (exactly three file-name arguments and no options other than the
    # help options), checks that the input file exists and the output file
    # does not, runs the extraction and reports the result.
    #
    # Exits with status -1 after printing the help text on a malformed
    # command line, and with status 0 when help was requested explicitly.
    def run

      Util.print_program_information( PRG_NAME,
                                      PRG_VERSION,
                                      PRG_DESC,
                                      PRG_DATE,
                                      WWW,
                                      STDOUT )

      if ARGV.empty?
        print_help
        exit( -1 )
      end

      begin
        cla = CommandLineArguments.new( ARGV )
      rescue ArgumentError => e
        # NOTE(review): the code below uses cla unconditionally, so
        # Util.fatal_error is presumed to terminate the program -- confirm.
        Util.fatal_error( PRG_NAME, "error: " + e.to_s )
      end

      if cla.is_option_set?( HELP_OPTION_1 ) || cla.is_option_set?( HELP_OPTION_2 )
        print_help
        exit( 0 )
      end

      if cla.get_number_of_files != 3
        print_help
        exit( -1 )
      end

      # No options besides the help options are accepted.
      allowed_opts = []

      disallowed = cla.validate_allowed_options_as_str( allowed_opts )
      if disallowed.length > 0
        Util.fatal_error( PRG_NAME,
                          "unknown option(s): " + disallowed,
                          STDOUT )
      end

      input_file  = cla.get_file_name( 0 )
      query       = cla.get_file_name( 1 )
      output_file = cla.get_file_name( 2 )

      unless File.exist?( input_file )
        Util.fatal_error( PRG_NAME, "error: input file [#{input_file}] does not exist" )
      end
      if File.exist?( output_file )
        Util.fatal_error( PRG_NAME, "error: [#{output_file}] already exists" )
      end

      begin
        results = extract_sequences( query, input_file, output_file )
      rescue RegexpError => e
        # The query is used as a regular expression; report a malformed one
        # the same way as every other usage error instead of a stack trace.
        Util.fatal_error( PRG_NAME, "error: invalid query [#{query}]: " + e.to_s )
      end

      Util.print_message( PRG_NAME, "matched: " + results )
      Util.print_message( PRG_NAME, "wrote:   " + output_file )
      Util.print_message( PRG_NAME, "OK" )

    end

    # Copies every record of fasta_file whose header line matches query to
    # output_file. A record is a header line plus all following non-header
    # lines up to the next header; lines for which Util.is_string_empty?
    # is true are skipped.
    #
    # query::       regular expression source, matched against header lines
    # fasta_file::  path of the FASTA file to read
    # output_file:: path of the file to write (opened in append mode)
    #
    # Returns a String "<matching headers>/<total headers>".
    # Raises RegexpError if query is not a valid regular expression; this
    # happens before output_file is created, so no empty file is left behind.
    def extract_sequences( query, fasta_file, output_file )
      # Compile once instead of re-interpolating the pattern for every header.
      # query.to_s mirrors what the interpolation /#{query}/ would embed.
      pattern = Regexp.new( query.to_s )
      matching_state = false
      matches = 0
      total = 0
      # Block form guarantees the output handle is closed even when reading
      # the input raises.
      File.open( output_file, "a" ) do | output |
        File.foreach( fasta_file ) do | line |
          next if Util.is_string_empty?( line )
          if line =~ HEADER_PATTERN
            total += 1
            if total % PROGRESS_INTERVAL == 0
              # "\r" rewinds to the line start so the counter updates in place.
              STDOUT.write "\r#{matches}/#{total}"
              STDOUT.flush
            end
            matching_state = !( line =~ pattern ).nil?
            if matching_state
              matches += 1
              output.print( line )
            end
          elsif matching_state
            # Sequence line belonging to the most recent matching header.
            output.print( line )
          end
        end
      end
      matches.to_s + "/" + total.to_s
    end

    # Prints the usage text and an example invocation to standard output.
    def print_help
      puts( "Usage:" )
      puts
      puts( "  " + PRG_NAME + ".rb <input fasta file> <query> <output file>" )
      puts
      puts( "Examples:" )
      puts
      puts( "  " + PRG_NAME + ".rb Pfam-A.fasta kinase kinases" )
      puts
    end

  end # class FastaExtractor
end