forester/ruby/evoruby/lib/evo/tool/fasta_taxonomy_processor.rb

   1 #
   2 # = lib/evo/apps/fasta_taxonomy_processor - FastaTaxonomyProcessor class
   3 #
   4 # Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
   5 # License::    GNU Lesser General Public License (LGPL)
   6
   7 require 'lib/evo/util/util'
   8 require 'lib/evo/msa/msa_factory'
   9 require 'lib/evo/msa/msa'
  10 require 'lib/evo/io/msa_io'
  11 require 'lib/evo/io/parser/sp_taxonomy_parser'
  12 require 'lib/evo/io/parser/fasta_parser'
  13 require 'lib/evo/io/writer/fasta_writer'
  14 require 'lib/evo/io/writer/phylip_sequential_writer'
  15 require 'lib/evo/util/command_line_arguments'
  16 require 'lib/evo/apps/tseq_taxonomy_processor'
  17
  18 module Evoruby
  19   class FastaTaxonomyProcessor
  20
  21     PRG_NAME       = "fasta_tap"
  22     PRG_DATE       = "2009.01.20"
  23     PRG_DESC       = "preprocessing of multiple sequence files in ncbi fasta format"
  24     PRG_VERSION    = "1.00"
  25     WWW            = "www.phylosoft.org"
  26     def initialize()
  27       @tax_ids_to_sp_taxonomies = Hash.new()
  28     end
  29
  30     def run()
  31
  32       Util.print_program_information( PRG_NAME,
  33       PRG_VERSION,
  34       PRG_DESC,
  35       PRG_DATE,
  36       COPYRIGHT,
  37       CONTACT,
  38       WWW,
  39       STDOUT )
  40
  41       if  ARGV == nil || ARGV.length != 4
  42         puts( "Usage: #{PRG_NAME}.rb <sp taxonomy file> <sequences in ncbi fasta format> <name for fasta outfile> <name for map outfile>" )
  43         puts()
  44         exit( -1 )
  45       end
  46
  47       begin
  48         cla = CommandLineArguments.new( ARGV )
  49       rescue ArgumentError => e
  50         Util.fatal_error( PRG_NAME, "error: " + e.to_s )
  51       end
  52       allowed_opts = Array.new
  53       disallowed = cla.validate_allowed_options_as_str( allowed_opts )
  54       if ( disallowed.length > 0 )
  55         Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
  56       end
  57
  58       sp_taxonomy_infile = cla.get_file_name( 0 )
  59       sequences_infile = cla.get_file_name( 1 )
  60       sequences_outfile = cla.get_file_name( 2 )
  61       mapping_outfile = cla.get_file_name( 3 )
  62
  63       Util.fatal_error_if_not_readable( PRG_NAME, sp_taxonomy_infile )
  64       Util.fatal_error_if_not_readable( PRG_NAME, sequences_infile )
  65       Util.fatal_error_if_not_writable( PRG_NAME, mapping_outfile )
  66       Util.fatal_error_if_not_writable( PRG_NAME, sequences_outfile )
  67
  68       sp_taxonomies = SpTaxonomyParser.parse( sp_taxonomy_infile )
  69
  70       Util.print_message( PRG_NAME, "read in taxonomic data for " + sp_taxonomies.size.to_s + " species from: " + sp_taxonomy_infile )
  71
  72       fasta_parser = FastaParser.new
  73       msa_fac = MsaFactory.new
  74
  75       seqs = msa_fac.create_msa_from_file( sequences_infile, fasta_parser )
  76
  77       Util.print_message( PRG_NAME, "read in " + seqs.get_number_of_seqs.to_s + " sequences from: " + sequences_infile )
  78
  79       removed = seqs.remove_redundant_sequences!( true, true )
  80
  81       if removed.size > 0
  82         Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
  83         removed.each { | seq_name |
  84           puts seq_name
  85         }
  86         Util.print_message( PRG_NAME, "will process " + seqs.get_number_of_seqs.to_s + " non-redundant sequences" )
  87       end
  88
  89       mapping_out = File.open( mapping_outfile, "a" )
  90
  91       for i in 0 ... seqs.get_number_of_seqs
  92         seq = seqs.get_sequence( i )
  93         seq.set_name( Util::normalize_seq_name( modify_name( seq, i, sp_taxonomies, mapping_out ), 10 ) )
  94       end
  95
  96       io = MsaIO.new()
  97
  98       w = FastaWriter.new()
  99
 100       w.set_max_name_length( 10 )
 101       w.clean( true )
 102       begin
 103         io.write_to_file( seqs, sequences_outfile, w )
 104       rescue Exception => e
 105         Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
 106       end
 107       mapping_out.close()
 108
 109       Util.print_message( PRG_NAME, "wrote: " + mapping_outfile )
 110       Util.print_message( PRG_NAME, "wrote: " + sequences_outfile )
 111       Util.print_message( PRG_NAME, "OK" )
 112
 113     end
 114
 115     private
 116
 117     def modify_name( seq, i, sp_taxonomies, mapping_outfile )
 118
 119       #i = i + 1792
 120
 121       seq_desc = seq.get_name
 122
 123       taxonomy_sn = nil
 124
 125       if seq_desc =~ /\[(.+)\]/
 126         taxonomy_sn = $1
 127       else
 128         Util.fatal_error( PRG_NAME, "no taxonomy in [" + seq_desc + "]"  )
 129       end
 130
 131       matching_sp_taxonomy = nil
 132
 133       sp_taxonomies.each { |sp_taxonomy|
 134         if ( sp_taxonomy.scientific_name == taxonomy_sn )
 135           matching_sp_taxonomy = sp_taxonomy
 136         end
 137       }
 138
 139       if  matching_sp_taxonomy == nil
 140         Util.fatal_error( PRG_NAME, "taxonomy [" + taxonomy_sn + "] for [" + seq_desc + "] not found" )
 141       end
 142
 143       new_name = i.to_s( 16 ) + "_" + matching_sp_taxonomy.code
 144
 145       gi = nil
 146       if seq_desc =~ /gi\|(.+?)\|/
 147         gi = $1
 148       else
 149         Util.fatal_error( PRG_NAME, "no gi in [" + seq_desc + "]"  )
 150       end
 151
 152       seq_name = ""
 153
 154       if seq_desc =~ /\|\s*([^|]+?)\s*\[/
 155         seq_name = $1
 156       end
 157
 158       if  seq_name =~ /\[.+\]$/
 159         # Redundant taxonomy information hides here.
 160         seq_name = seq_name.sub(/\[.+\]$/, '')
 161       end
 162       if  seq_name =~ /^\s*hypothetical\s+protein\s*/i
 163         # Pointless information.
 164         seq_name = seq_name.sub( /^\s*hypothetical\s+protein\s*/i, '' )
 165       end
 166       if  seq_name =~ /^\s*conserved\s+hypothetical\s+protein\s*/i
 167         # Pointless information.
 168         seq_name = seq_name.sub( /^\s*conserved\s+hypothetical\s+protein\s*/i, '' )
 169       end
 170
 171       if gi != nil
 172         mapping_outfile.print( new_name + "\t" +
 173         TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" +
 174         TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" +
 175         TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" +
 176         TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" +
 177         TseqTaxonomyProcessor::SEQ_ACCESSION + gi.to_s + "\t" +
 178         TseqTaxonomyProcessor::SEQ_ACCESSION_SOURCE + "gi" + "\t" +
 179         TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" +
 180         TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string +
 181         Constants::LINE_DELIMITER )
 182       else
 183         mapping_outfile.print( new_name + "\t" +
 184         TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" +
 185         TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" +
 186         TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" +
 187         TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" +
 188         TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" +
 189         TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string +
 190         Constants::LINE_DELIMITER )
 191
 192       end
 193       new_name
 194     end
 195
 196   end
 197
 198 end # module Evoruby