forester/ruby/evoruby/lib/evo/tool/fasta_taxonomy_processor.rb

   1 #
   2 # = lib/evo/apps/fasta_taxonomy_processor - FastaTaxonomyProcessor class
   3 #
   4 # Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
   5 # License::    GNU Lesser General Public License (LGPL)
   6 #
   7 # $Id: fasta_taxonomy_processor.rb,v 1.4 2010/12/13 19:00:11 cmzmasek Exp $
   8
   9
  10 require 'lib/evo/util/util'
  11 require 'lib/evo/msa/msa_factory'
  12 require 'lib/evo/msa/msa'
  13 require 'lib/evo/io/msa_io'
  14 require 'lib/evo/io/parser/sp_taxonomy_parser'
  15 require 'lib/evo/io/parser/fasta_parser'
  16 require 'lib/evo/io/writer/fasta_writer'
  17 require 'lib/evo/io/writer/phylip_sequential_writer'
  18 require 'lib/evo/util/command_line_arguments'
  19 require 'lib/evo/apps/tseq_taxonomy_processor'
  20
  21 module Evoruby
  22
  23     class FastaTaxonomyProcessor
  24
  25         PRG_NAME       = "fasta_tap"
  26         PRG_DATE       = "2009.01.20"
  27         PRG_DESC       = "preprocessing of multiple sequence files in ncbi fasta format"
  28         PRG_VERSION    = "1.00"
  29         COPYRIGHT      = "2009 Christian M Zmasek"
  30         CONTACT        = "phylosoft@gmail.com"
  31         WWW            = "www.phylosoft.org"
  32
  33         def initialize()
  34             @tax_ids_to_sp_taxonomies = Hash.new()
  35         end
  36
  37         def run()
  38
  39             Util.print_program_information( PRG_NAME,
  40                 PRG_VERSION,
  41                 PRG_DESC,
  42                 PRG_DATE,
  43                 COPYRIGHT,
  44                 CONTACT,
  45                 WWW,
  46                 STDOUT )
  47
  48             if  ARGV == nil || ARGV.length != 4
  49                 puts( "Usage: #{PRG_NAME}.rb <sp taxonomy file> <sequences in ncbi fasta format> <name for fasta outfile> <name for map outfile>" )
  50                 puts()
  51                 exit( -1 )
  52             end
  53
  54             begin
  55                 cla = CommandLineArguments.new( ARGV )
  56             rescue ArgumentError => e
  57                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
  58             end
  59             allowed_opts = Array.new
  60             disallowed = cla.validate_allowed_options_as_str( allowed_opts )
  61             if ( disallowed.length > 0 )
  62                 Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
  63             end
  64
  65             sp_taxonomy_infile = cla.get_file_name( 0 )
  66             sequences_infile = cla.get_file_name( 1 )
  67             sequences_outfile = cla.get_file_name( 2 )
  68             mapping_outfile = cla.get_file_name( 3 )
  69
  70             Util.fatal_error_if_not_readable( PRG_NAME, sp_taxonomy_infile )
  71             Util.fatal_error_if_not_readable( PRG_NAME, sequences_infile )
  72             Util.fatal_error_if_not_writable( PRG_NAME, mapping_outfile )
  73             Util.fatal_error_if_not_writable( PRG_NAME, sequences_outfile )
  74
  75             sp_taxonomies = SpTaxonomyParser.parse( sp_taxonomy_infile )
  76
  77             Util.print_message( PRG_NAME, "read in taxonomic data for " + sp_taxonomies.size.to_s + " species from: " + sp_taxonomy_infile )
  78
  79             fasta_parser = FastaParser.new
  80             msa_fac = MsaFactory.new
  81
  82             seqs = msa_fac.create_msa_from_file( sequences_infile, fasta_parser )
  83
  84             Util.print_message( PRG_NAME, "read in " + seqs.get_number_of_seqs.to_s + " sequences from: " + sequences_infile )
  85
  86             removed = seqs.remove_redundant_sequences!( true, true )
  87
  88             if removed.size > 0
  89                 Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
  90                 removed.each { | seq_name |
  91                     puts seq_name
  92                 }
  93                 Util.print_message( PRG_NAME, "will process " + seqs.get_number_of_seqs.to_s + " non-redundant sequences" )
  94             end
  95
  96             mapping_out = File.open( mapping_outfile, "a" )
  97
  98             for i in 0 ... seqs.get_number_of_seqs
  99                 seq = seqs.get_sequence( i )
 100                 seq.set_name( Util::normalize_seq_name( modify_name( seq, i, sp_taxonomies, mapping_out ), 10 ) )
 101             end
 102
 103             io = MsaIO.new()
 104
 105             w = FastaWriter.new()
 106
 107             w.set_max_name_length( 10 )
 108             w.clean( true )
 109             begin
 110                 io.write_to_file( seqs, sequences_outfile, w )
 111             rescue Exception => e
 112                 Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
 113             end
 114             mapping_out.close()
 115
 116             Util.print_message( PRG_NAME, "wrote: " + mapping_outfile )
 117             Util.print_message( PRG_NAME, "wrote: " + sequences_outfile )
 118             Util.print_message( PRG_NAME, "OK" )
 119
 120         end
 121
 122         private
 123
 124         def modify_name( seq, i, sp_taxonomies, mapping_outfile )
 125
 126             #i = i + 1792
 127
 128             seq_desc = seq.get_name
 129
 130             taxonomy_sn = nil
 131
 132             if seq_desc =~ /\[(.+)\]/
 133                 taxonomy_sn = $1
 134             else
 135                 Util.fatal_error( PRG_NAME, "no taxonomy in [" + seq_desc + "]"  )
 136             end
 137
 138             matching_sp_taxonomy = nil
 139
 140             sp_taxonomies.each { |sp_taxonomy|
 141                 if ( sp_taxonomy.scientific_name == taxonomy_sn )
 142                     matching_sp_taxonomy = sp_taxonomy
 143                 end
 144             }
 145
 146             if  matching_sp_taxonomy == nil
 147                 Util.fatal_error( PRG_NAME, "taxonomy [" + taxonomy_sn + "] for [" + seq_desc + "] not found" )
 148             end
 149
 150             new_name = i.to_s( 16 ) + "_" + matching_sp_taxonomy.code
 151
 152             gi = nil
 153             if seq_desc =~ /gi\|(.+?)\|/
 154                 gi = $1
 155             else
 156               Util.fatal_error( PRG_NAME, "no gi in [" + seq_desc + "]"  )
 157             end
 158
 159             seq_name = ""
 160
 161             if seq_desc =~ /\|\s*([^|]+?)\s*\[/
 162                 seq_name = $1
 163             end
 164
 165             if  seq_name =~ /\[.+\]$/
 166                 # Redundant taxonomy information hides here.
 167                 seq_name = seq_name.sub(/\[.+\]$/, '')
 168             end
 169             if  seq_name =~ /^\s*hypothetical\s+protein\s*/i
 170                 # Pointless information.
 171                 seq_name = seq_name.sub( /^\s*hypothetical\s+protein\s*/i, '' )
 172             end
 173             if  seq_name =~ /^\s*conserved\s+hypothetical\s+protein\s*/i
 174                 # Pointless information.
 175                 seq_name = seq_name.sub( /^\s*conserved\s+hypothetical\s+protein\s*/i, '' )
 176             end
 177
 178             if gi != nil
 179             mapping_outfile.print( new_name + "\t" +
 180                  TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" +
 181                  TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" +
 182                  TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" +
 183                  TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" +
 184                  TseqTaxonomyProcessor::SEQ_ACCESSION + gi.to_s + "\t" +
 185                  TseqTaxonomyProcessor::SEQ_ACCESSION_SOURCE + "gi" + "\t" +
 186                  TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" +
 187                  TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string +
 188                  Constants::LINE_DELIMITER )
 189             else
 190                  mapping_outfile.print( new_name + "\t" +
 191                  TseqTaxonomyProcessor::TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" +
 192                  TseqTaxonomyProcessor::TAXONOMY_ID + matching_sp_taxonomy.id + "\t" +
 193                  TseqTaxonomyProcessor::TAXONOMY_ID_TYPE + "ncbi" + "\t" +
 194                  TseqTaxonomyProcessor::TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" +
 195                  TseqTaxonomyProcessor::SEQ_NAME + seq_name + "\t" +
 196                  TseqTaxonomyProcessor::SEQ_MOL_SEQ + seq.get_sequence_as_string +
 197                  Constants::LINE_DELIMITER )
 198
 199             end
 200             new_name
 201         end
 202
 203     end
 204
 205 end # module Evoruby