forester/ruby/evoruby/lib/evo/apps/tseq_taxonomy_processor.rb

   1 #
   2 # = lib/evo/apps/tseq_taxonomy_processor - TseqTaxonomyProcessor class
   3 #
   4 # Copyright::  Copyright (C) 2006-2007 Christian M. Zmasek
   5 # License::    GNU Lesser General Public License (LGPL)
   6 #
   7 # $Id: tseq_taxonomy_processor.rb,v 1.6 2010/12/13 19:00:11 cmzmasek Exp $
   8
   9
  10 require 'lib/evo/util/util'
  11 require 'lib/evo/msa/msa_factory'
  12 require 'lib/evo/msa/msa'
  13 require 'lib/evo/io/msa_io'
  14 require 'lib/evo/io/parser/sp_taxonomy_parser'
  15 require 'lib/evo/io/parser/ncbi_tseq_parser'
  16 require 'lib/evo/io/writer/fasta_writer'
  17 require 'lib/evo/io/writer/phylip_sequential_writer'
  18 require 'lib/evo/util/command_line_arguments'
  19
  20 module Evoruby
  21
  22     class TseqTaxonomyProcessor
  23
  24         PRG_NAME       = "tseq_tap"
  25         PRG_DATE       = "2009.01.06"
  26         PRG_DESC       = "preprocessing of multiple sequence files in ncbi tseq xml format"
  27         PRG_VERSION    = "1.02"
  28         COPYRIGHT      = "2009 Christian M Zmasek"
  29         CONTACT        = "phylosoft@gmail.com"
  30         WWW            = "www.phylosoft.org"
  31
  32         TAXONOMY_CODE           = "TAXONOMY_CODE:"
  33         TAXONOMY_ID             = "TAXONOMY_ID:"
  34         TAXONOMY_ID_TYPE        = "TAXONOMY_ID_TYPE:"
  35         TAXONOMY_SN             = "TAXONOMY_SN:"
  36         TAXONOMY_CN             = "TAXONOMY_CN:"
  37         SEQ_ACCESSION           = "SEQ_ACCESSION:"
  38         SEQ_ACCESSION_SOURCE    = "SEQ_ACCESSION_SOURCE:"
  39         SEQ_SECONDARY_ACCESSION = "SEQ_SECONDARY_ACCESSION:"
  40         SEQ_SYMBOL              = "SEQ_SYMBOL:"
  41         SEQ_NAME                = "SEQ_NAME:"
  42         SEQ_MOL_SEQ             = "SEQ_MOL_SEQ:"
  43
  44         def initialize()
  45             @tax_ids_to_sp_taxonomies = Hash.new()
  46         end
  47
  48         def run()
  49
  50             Util.print_program_information( PRG_NAME,
  51                 PRG_VERSION,
  52                 PRG_DESC,
  53                 PRG_DATE,
  54                 COPYRIGHT,
  55                 CONTACT,
  56                 WWW,
  57                 STDOUT )
  58
  59             if  ARGV == nil || ARGV.length != 4
  60                 puts( "Usage: #{PRG_NAME}.rb <sp taxonomy file> <sequences in tseq xml format> <name for fasta outfile> <name for map outfile>" )
  61                 puts()
  62
  63                 exit( -1 )
  64             end
  65
  66             begin
  67                 cla = CommandLineArguments.new( ARGV )
  68             rescue ArgumentError => e
  69                 Util.fatal_error( PRG_NAME, "error: " + e.to_s )
  70             end
  71             allowed_opts = Array.new
  72             disallowed = cla.validate_allowed_options_as_str( allowed_opts )
  73             if ( disallowed.length > 0 )
  74                 Util.fatal_error( PRG_NAME, "unknown option(s): " + disallowed )
  75             end
  76
  77             sp_taxonomy_infile = cla.get_file_name( 0 )
  78             sequences_infile = cla.get_file_name( 1 )
  79             sequences_outfile = cla.get_file_name( 2 )
  80             mapping_outfile = cla.get_file_name( 3 )
  81
  82             Util.fatal_error_if_not_readable( PRG_NAME, sp_taxonomy_infile )
  83             Util.fatal_error_if_not_readable( PRG_NAME, sequences_infile )
  84             Util.fatal_error_if_not_writable( PRG_NAME, mapping_outfile )
  85             Util.fatal_error_if_not_writable( PRG_NAME, sequences_outfile )
  86
  87             sp_taxonomies = SpTaxonomyParser.parse( sp_taxonomy_infile )
  88
  89             Util.print_message( PRG_NAME, "read in taxonomic data for " + sp_taxonomies.size.to_s + " species from: " + sp_taxonomy_infile )
  90
  91             tseq_parser = NcbiTSeqParser.new
  92             msa_fac = MsaFactory.new
  93
  94             seqs = msa_fac.create_msa_from_file( sequences_infile, tseq_parser )
  95
  96             Util.print_message( PRG_NAME, "read in " + seqs.get_number_of_seqs.to_s + " sequences from: " + sequences_infile )
  97
  98             removed = seqs.remove_redundant_sequences!( true, true )
  99
 100             if removed.size > 0
 101                 Util.print_message( PRG_NAME, "going to ignore the following " + removed.size.to_s + " redundant sequences:" )
 102                 removed.each { | seq_name |
 103                     puts seq_name
 104                 }
 105                 Util.print_message( PRG_NAME, "will process " + seqs.get_number_of_seqs.to_s + " non-redundant sequences" )
 106             end
 107
 108             mapping_out = File.open( mapping_outfile, "a" )
 109
 110             for i in 0 ... seqs.get_number_of_seqs
 111                 seq = seqs.get_sequence( i )
 112                 if seq.get_taxonomy == nil
 113                     Util.fatal_error( PRG_NAME, "sequence [" + seq.get_name + "] has no taxonomy information" )
 114                 end
 115                 seq.set_name( Util::normalize_seq_name( modify_name( seq, i, sp_taxonomies, mapping_out ), 10 ) )
 116             end
 117
 118             io = MsaIO.new()
 119
 120             w = FastaWriter.new()
 121
 122             w.set_max_name_length( 10 )
 123             w.clean( true )
 124             begin
 125                 io.write_to_file( seqs, sequences_outfile, w )
 126             rescue Exception => e
 127                 Util.fatal_error( PRG_NAME, "failed to write file: " + e.to_s )
 128             end
 129             mapping_out.close()
 130
 131             Util.print_message( PRG_NAME, "wrote: " + mapping_outfile )
 132             Util.print_message( PRG_NAME, "wrote: " + sequences_outfile )
 133             Util.print_message( PRG_NAME, "OK" )
 134
 135         end
 136
 137         private
 138
 139         def modify_name( seq, i, sp_taxonomies, mapping_outfile )
 140
 141             tax_id = seq.get_taxonomy.get_id
 142             matching_sp_taxonomy = nil
 143
 144             if @tax_ids_to_sp_taxonomies.has_key?( tax_id )
 145                 # This is so that a second lookup will be much faster.
 146                 matching_sp_taxonomy = @tax_ids_to_sp_taxonomies[ tax_id ]
 147             else
 148                 sp_taxonomies.each { |sp_taxonomy|
 149                     if ( sp_taxonomy.id == tax_id )
 150                         if  matching_sp_taxonomy != nil
 151                             Util.fatal_error( PRG_NAME, "taxonomy id [" + tax_id.to_s + "] is not unique" )
 152                         end
 153                         matching_sp_taxonomy = sp_taxonomy
 154                         @tax_ids_to_sp_taxonomies[ tax_id ] = sp_taxonomy
 155                     end
 156                 }
 157             end
 158             if  matching_sp_taxonomy == nil
 159                 Util.fatal_error( PRG_NAME, "taxonomy id [" + tax_id.to_s + "] for [" +  seq.get_taxonomy.get_name + "] not found" )
 160             end
 161
 162             new_name = i.to_s( 16 ) + "_" + matching_sp_taxonomy.code
 163
 164             seq_name = seq.get_name
 165             if  seq_name =~ /\[.+\]$/
 166                 # Redundant taxonomy information hides here.
 167                 seq_name = seq_name.sub(/\[.+\]$/, '')
 168             end
 169             if  seq_name =~ /^\s*hypothetical\s+protein\s*/i
 170                 # Pointless information.
 171                 seq_name = seq_name.sub( /^\s*hypothetical\s+protein\s*/i, '' )
 172             end
 173
 174             mapping_outfile.print( new_name + "\t" +
 175                  TAXONOMY_CODE + matching_sp_taxonomy.code + "\t" +
 176                  TAXONOMY_ID + tax_id + "\t" +
 177                  TAXONOMY_ID_TYPE + seq.get_taxonomy.get_id_source + "\t" +
 178                  TAXONOMY_SN + matching_sp_taxonomy.scientific_name + "\t" +
 179                  SEQ_ACCESSION + seq.get_accession + "\t" +
 180                  SEQ_ACCESSION_SOURCE + seq.get_accession_source + "\t" +
 181                  SEQ_SYMBOL + seq.get_symbol + "\t" +
 182                  SEQ_NAME + seq_name + "\t" +
 183                  SEQ_MOL_SEQ + seq.get_sequence_as_string +
 184                  Constants::LINE_DELIMITER )
 185             new_name
 186         end
 187
 188     end
 189
 190 end # module Evoruby