in progress
[jalview.git] / forester / ruby / evoruby / lib / evo / tool / fasta_extractor.rb
1 #
2 # = lib/evo/tool/fasta_extractor.rb - FastaExtractor class
3 #
4 # Copyright::  Copyright (C) 2006-2008 Christian M. Zmasek
5 # License::    GNU Lesser General Public License (LGPL)
6 #
7 # $Id: fasta_extractor.rb,v 1.2 2010/12/13 19:00:11 cmzmasek Exp $
8
9
10 require 'lib/evo/util/util'
11 require 'lib/evo/util/constants'
12 require 'lib/evo/util/command_line_arguments'
13
14
module Evoruby

    # Command-line tool ("fae") that copies selected sequences from a fasta
    # file into a new file. The sequences to copy are listed, by name, in a
    # names file (described by PRG_DESC as coming from a wublast search).
    #
    # Usage: fae.rb <input fasta file> <names file> <output file>
    class FastaExtractor

        PRG_NAME                           = "fae"
        PRG_VERSION                        = "1.0.0"
        PRG_DESC                           = "extraction of nucleotide sequences from a fasta file by names from wublast search"
        PRG_DATE                           = "2008.08.09"
        COPYRIGHT                          = "2008-2009 Christian M Zmasek"
        CONTACT                            = "phylosoft@gmail.com"
        WWW                                = "www.phylosoft.org"
        HELP_OPTION_1                      = 'help'
        HELP_OPTION_2                      = 'h'

        # A names-file line that starts (after optional whitespace) with '#'.
        # Anchored so that a '#' inside a sequence name does not turn the
        # whole line into a comment.
        COMMENT_LINE_RE                    = /\A\s*#/

        # <name> <frame: sign plus one digit> <integer> <non-blank token>.
        # The character class is "[+-]"; a '|' inside a class would be a
        # literal bar, not an alternation.
        NAME_LINE_RE                       = /(\S+)\s+([+-]\d)\s+\d+\s+(\S+)/

        # A fasta header: '>' as the first non-blank character of the line.
        # Captures the rest of the line (possibly empty).
        FASTA_HEADER_RE                    = /\A\s*>\s*(.*)/


        # Program entry point: prints the program banner, parses ARGV,
        # validates the three file arguments and runs the extraction.
        #
        # Prints the usage text and exits with status 0 when a help option is
        # set, or with status -1 when the number of file arguments is not 3.
        # All other problems are reported through Util.fatal_error
        # (presumably terminates the program -- defined outside this file).
        def run()

            Util.print_program_information( PRG_NAME,
                PRG_VERSION,
                PRG_DESC ,
                PRG_DATE,
                COPYRIGHT,
                CONTACT,
                WWW,
                STDOUT )

            begin
                cla = CommandLineArguments.new( ARGV )
            rescue ArgumentError => e
                Util.fatal_error( PRG_NAME, "error: " + e.to_s )
            end

            if ( cla.is_option_set?( HELP_OPTION_1 ) ||
                     cla.is_option_set?( HELP_OPTION_2 ) )
                print_help
                exit( 0 )
            end

            if ( cla.get_number_of_files != 3 )
                print_help
                exit( -1 )
            end

            # No options besides help are supported, so the allowed list
            # is empty and anything else is reported as unknown.
            allowed_opts = Array.new

            disallowed = cla.validate_allowed_options_as_str( allowed_opts )
            if ( disallowed.length > 0 )
                Util.fatal_error( PRG_NAME,
                    "unknown option(s): " + disallowed,
                    STDOUT )
            end

            input_file  = cla.get_file_name( 0 )
            names_file  = cla.get_file_name( 1 )
            output_file = cla.get_file_name( 2 )

            if  !File.exist?( input_file )
                Util.fatal_error( PRG_NAME, "error: input file [#{input_file}] does not exist" )
            end
            if  !File.exist?( names_file )
                Util.fatal_error( PRG_NAME, "error: names file [#{names_file}] does not exist" )
            end
            # Refuse to touch an existing output file.
            if File.exist?( output_file   )
                Util.fatal_error( PRG_NAME, "error: [#{output_file }] already exists" )
            end

            names = extract_names_with_frames( names_file )

            extract_sequences( names, input_file, output_file )

            puts
            Util.print_message( PRG_NAME, "OK" )
            puts

        end


        # Reads the names file and collects one annotation per sequence name.
        #
        # Blank lines, comment lines (first non-blank character is '#') and
        # lines that do not match NAME_LINE_RE are ignored. If a name occurs
        # more than once, the last occurrence wins.
        #
        # names_file:: path of the names file
        #
        # Returns a Hash mapping name => "[<frame>] [<last token>]"
        # (the last token is presumably an e-value -- not verifiable here).
        def extract_names_with_frames( names_file )
            names = Hash.new()
            File.open( names_file ) do | file |
                while line = file.gets
                    next if Util.is_string_empty?( line )
                    next if line =~ COMMENT_LINE_RE
                    if ( line =~ NAME_LINE_RE )
                        name    = $1
                        frame   = $2
                        e_value = $3
                        names[ name ] = "[" + frame + "] [" + e_value + "]"
                    end
                end
            end
            names
        end

        # Copies every fasta record whose header is a key of +names+ from
        # +fasta_file+ to +output_file+ (opened in append mode). The header
        # written is ">" + name + " " + names[ name ]; sequence lines are
        # copied unchanged. One progress line per copied record goes to
        # standard output.
        #
        # The header text is compared after removing trailing whitespace, so
        # headers ending in blanks or a carriage return still match.
        #
        # names::       Hash of name => annotation string
        # fasta_file::  path of the fasta file to read
        # output_file:: path of the file to append to
        #
        # Returns the number of records written.
        def extract_sequences( names, fasta_file, output_file )
            counter = 0
            # Block form so that the output file is closed even when reading
            # the fasta file raises.
            File.open( output_file, "a" ) do | output |
                matching_state = false
                File.open( fasta_file ) do | file |
                    while line = file.gets
                        next if Util.is_string_empty?( line )
                        if ( line =~ FASTA_HEADER_RE )
                            name = $1.rstrip
                            if names.has_key?( name )
                                matching_state = true
                                counter += 1
                                puts counter.to_s + ". " + name + " " + names[ name ]
                                output.print( ">" + name + " " + names[ name ] )
                                output.print( Evoruby::Constants::LINE_DELIMITER )
                            else
                                matching_state = false
                            end
                        elsif matching_state
                            output.print( line )
                        end
                    end
                end
            end
            counter
        end

        # Prints the usage message to standard output.
        def print_help()
            puts( "Usage:" )
            puts()
            puts( "  " + PRG_NAME + ".rb <input fasta file> <names file based on blast output> <output file>" )
            puts()
        end

    end # class FastaExtractor
end