From: cmzmasek@gmail.com Date: Thu, 25 Oct 2012 02:41:23 +0000 (+0000) Subject: in progress X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=d03733a9998b73a5880b9dab0f688e6a69661019;p=jalview.git in progress --- diff --git a/forester/ruby/evoruby/lib/evo/io/parser/uniprot_parser.rb b/forester/ruby/evoruby/lib/evo/io/parser/uniprot_parser.rb index ec2b9f0..a47a6b2 100644 --- a/forester/ruby/evoruby/lib/evo/io/parser/uniprot_parser.rb +++ b/forester/ruby/evoruby/lib/evo/io/parser/uniprot_parser.rb @@ -28,38 +28,37 @@ module Evoruby @file = file end - def parse( ids ) - #ic = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' ) - entries = [] + entries = Hash.new de = [] dr = [] - read = false + id = nil File.open( @file ).each do | line | if line.index( ID ) == 0 - puts line - ids.each do | id | - puts " " + id - if line.index( id ) == 0 - read = true + # puts line + ids.each do | i | + #puts " " + i + if line.include?( i ) && line.split[ 1 ] == i + id = i break end end end - if read - if line.index LAST == 0 - read = false + if id != nil + if line.include?( LAST ) && line.index( LAST ) == 0 e = UniprotEntry.new e.de = de e.dr = dr entries[ id ] = e + puts id + id = nil de = [] dr = [] else - if line.index DE == 0 + if line.include?( DE ) && line.index( DE ) == 0 add( line, de ) - elsif line.index DR == 0 + elsif line.include?( DR ) && line.index( DR ) == 0 add( line, dr ) end end diff --git a/forester/ruby/evoruby/lib/evo/io/web/uniprotkb.rb b/forester/ruby/evoruby/lib/evo/io/web/uniprotkb.rb new file mode 100644 index 0000000..81f0c76 --- /dev/null +++ b/forester/ruby/evoruby/lib/evo/io/web/uniprotkb.rb @@ -0,0 +1,25 @@ + +require 'net/http' +require 'uri' + +module Evoruby + + + class UniprotKB + def initialize + + end + + def get + require 'net/http' + require 'uri' + + uri = URI.parse("http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=uniprotkb;id=1433X_MAIZE;format=uniprot;style=raw") + response = Net::HTTP.get_response uri + puts response.body + + end + + end + +end diff --git a/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb b/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb index e6ba67d..a8f1590 100644 --- a/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb +++ b/forester/ruby/evoruby/lib/evo/tool/hmmscan_summary.rb @@ -8,11 +8,14 @@ # # last modified: 121003 +require 'set' + require 'lib/evo/util/constants' require 'lib/evo/util/util' require 'lib/evo/util/command_line_arguments' require 'lib/evo/io/parser/hmmscan_parser' require 'lib/evo/io/parser/uniprot_parser' +require 'lib/evo/io/web/uniprotkb' module Evoruby @@ -46,6 +49,9 @@ module Evoruby def run + ukb = UniprotKB.new + ukb.get + Util.print_program_information( PRG_NAME, PRG_VERSION, PRG_DESC, @@ -132,11 +138,11 @@ module Evoruby Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end end - + uniprot = "" - if ( cla.is_option_set?( UNIPROT ) ) + if ( cla.is_option_set?( UNIPROT ) ) begin - uniprot = cla.get_option_value( UNIPROT ) + uniprot = cla.get_option_value( UNIPROT ) rescue ArgumentError => e Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end @@ -175,15 +181,15 @@ module Evoruby else puts( "column delimiter : " + column_delimiter ) end - if fs_e_value_threshold >= 0.0 + if fs_e_value_threshold >= 0.0 puts( "E-value threshold : " + fs_e_value_threshold.to_s ) else puts( "E-value threshold : no threshold" ) end - if !hmm_for_protein_output.empty? + if !hmm_for_protein_output.empty? puts( "HMM for proteins : " + hmm_for_protein_output ) end - if !uniprot.empty? + if !uniprot.empty? puts( "Uniprot : " + uniprot ) end puts() @@ -198,7 +204,7 @@ module Evoruby fs_e_value_threshold, hmm_for_protein_output, uniprot ) - rescue ArgumentError, IOError => e + rescue IOError => e Util.fatal_error( PRG_NAME, "error: " + e.to_s, STDOUT ) end domain_counts = get_domain_counts() @@ -227,17 +233,20 @@ module Evoruby fs_e_value_threshold, hmm_for_protein_output, uniprot ) + + + Util.check_file_for_readability( inpath ) Util.check_file_for_writability( outpath ) hmmscan_parser = HmmscanParser.new( inpath ) results = hmmscan_parser.parse - + uniprot_entries = nil - if !uniprot.empty? + if !uniprot.empty? && !hmm_for_protein_output.empty? uniprot_entries = read_uniprot( results, uniprot ) end - + outfile = File.open( outpath, "a" ) query = "" @@ -249,7 +258,7 @@ module Evoruby hmmscan_results_per_protein = [] - + prev_query = "" @@ -285,7 +294,8 @@ module Evoruby process_hmmscan_results_per_protein( hmmscan_results_per_protein, fs_e_value_threshold, hmm_for_protein_output, - i_e_value_threshold ) + i_e_value_threshold, + uniprot_entries ) end hmmscan_results_per_protein.clear end @@ -300,31 +310,37 @@ module Evoruby end end end - if !hmm_for_protein_output.empty? - if !hmmscan_results_per_protein.empty? - process_hmmscan_results_per_protein( hmmscan_results_per_protein, - fs_e_value_threshold, - hmm_for_protein_output, - i_e_value_threshold, - uniprot_entries ) - end + if !hmm_for_protein_output.empty? && !hmmscan_results_per_protein.empty? + process_hmmscan_results_per_protein( hmmscan_results_per_protein, + fs_e_value_threshold, + hmm_for_protein_output, + i_e_value_threshold, + uniprot_entries ) end + outfile.flush() outfile.close() end # def parse - - def read_uniprot( hmmscan_results, uniprot ) - ids = [] - hmmscan_results.each do | r | - ids << r.query - end - uniprot_parser = UniprotParser.new uniprot - uniprot_entries = uniprot_parser.parse ids - uniprot_entries - end - + def process_id( id ) + if id =~ /(sp|tr)\|\S+\|(\S+)/ + id = $2 + end + id + end + + def read_uniprot( hmmscan_results, uniprot ) + ids = Set.new + hmmscan_results.each do | r | + + ids << process_id( r.query ) + end + uniprot_parser = UniprotParser.new uniprot + uniprot_entries = uniprot_parser.parse ids + uniprot_entries + end + def count_model( model ) if ( @domain_counts.has_key?( model ) ) count = @domain_counts[ model ].to_i @@ -339,7 +355,7 @@ module Evoruby fs_e_value_threshold, hmm_for_protein_output, i_e_value_threshold, - uniprot_entries ) + uniprot_entries ) dc = 0 # filter according to i-Evalue threshold @@ -385,8 +401,23 @@ module Evoruby s << r.model + " " end s << "\t" - s << uniprot_entries[ own.query ] - s << "\t" + e = uniprot_entries[ process_id( own.query ) ] + if e != nil && e.de != nil + e.de.each { |i| s << i + " " } + else + s << "-" + end + s << "\t" + + if e != nil && e.gn != nil + e.gn.each { |i| s << i + " " } + else + s << "-" + end + + + + s << "\t" overview = make_overview( hmmscan_results_per_protein_filtered, hmm_for_protein_output ) s << overview + "\t" diff --git a/forester/ruby/evoruby/lib/evo/util/util.rb b/forester/ruby/evoruby/lib/evo/util/util.rb index 234a625..79f92e0 100644 --- a/forester/ruby/evoruby/lib/evo/util/util.rb +++ b/forester/ruby/evoruby/lib/evo/util/util.rb @@ -12,229 +12,215 @@ require 'lib/evo/util/constants' module Evoruby - class Util - - def Util.normalize_seq_name( name, length ) - if name.length > length - name = name[ 0, length ] - elsif name.length < length - for i in 0 ... length - name.length - name = name + " " - end - end - name - end - - # def Util.normalize_mol_sequence( seq ) - # new_seq = String.new() - # for i in 0 ... seq.length - # c = seq.get_slice( i ) - # if is_aa_gap_character?( c ) - # new_seq = new_seq + "-" - # else - # new_seq = new_seq + c - # end - # end - # new_seq - # end - - - # Returns true if char_code corresponds to: space * - . _ - def Util.is_aa_gap_character?( char_code ) - return ( char_code <= 32 || char_code == 42 || char_code == 45 || char_code == 46 ||char_code == 95 ) - end - - # Deletes *, digits, and whitespace, replaces BJOUZ? with X, and replaces non-(letters, -) with - - def Util.clean_seq_str( seq_str ) - seq_str = seq_str.upcase - seq_str = seq_str.gsub( /\s+/, '' ) - seq_str = seq_str.gsub( /\d+/, '' ) - seq_str = seq_str.gsub( '*', '' ) - seq_str = seq_str.gsub( /[BJOUZ?]/, 'X' ) - seq_str = seq_str.gsub( /[^A-Z\-]/, '-' ) - seq_str - end - - # raises ArgumentError - def Util.check_file_for_readability( path ) - unless ( File.exist?( path ) ) - error_msg = "file [#{path}] does not exist" - raise ArgumentError, error_msg - end - unless ( File.file?( path ) ) - error_msg = "file [#{path}] is not a regular file" - raise ArgumentError, error_msg - end - unless ( File.readable?( path ) ) - error_msg = "file [#{path}] is not a readable file" - raise ArgumentError, error_msg - end - if ( File.zero?( path ) ) - error_msg = "file [#{path}] is empty" - raise ArgumentError, error_msg - end - end - - # raises ArgumentError - def Util.check_file_for_writability( path ) - if File.directory?( path ) - error_msg = "file [#{path}] is an existing directory" - raise ArgumentError, error_msg - elsif File.exist?( path ) - error_msg = "file [#{path}] already exists" - raise ArgumentError, error_msg - elsif File.writable?( path ) - error_msg = "file [#{path}] is not writeable" - raise ArgumentError, error_msg - end - end - - def Util.fatal_error_if_not_writable( prg_name, path ) - begin - Util.check_file_for_writability( path ) - rescue ArgumentError => e - Util.fatal_error( prg_name, e.to_s ) - end - end - - def Util.fatal_error_if_not_readable( prg_name, path ) - begin - Util.check_file_for_readability( path ) - rescue ArgumentError => e - Util.fatal_error( prg_name, e.to_s ) - end - end - - def Util.get_env_variable_value( env_variable ) - value = ENV[env_variable] - if value == nil || value.empty? - error_msg = "apparently environment variable #{env_variable} has not been set" - raise StandardError, error_msg - end - value - end - - - # raises ArgumentError - def Util.file2array( path, split_by_semicolon ) - Util.check_file_for_readability( path ) - a = Array.new() - c = 0 - File.open( path ) do | file | - while line = file.gets - if ( line =~ /^\s*(\S.*?)\s*$/ ) - s = $1 - if ( split_by_semicolon && s =~/;/ ) - sa = s.split( /;/ ) - for i in 0 ... sa.length() - a[ c ] = sa[ i ].strip! - end - else - a[ c ] = s - end - c += 1 - end - end - end - return a - end - - def Util.print_program_information( prg_name, - prg_version, - prg_desc, - date, - copyright, - contact, - www, - io = STDOUT ) - - if RUBY_VERSION !~ /1.9/ - puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " ) - exit( -1 ) - end - - ruby_version = RUBY_VERSION - l = prg_name.length + prg_version.length + date.length + ruby_version.length + 12 - io.print( Evoruby::Constants::LINE_DELIMITER ) - io.print( prg_name + " " + prg_version + " [" + date + "] [ruby " + ruby_version + "]") - io.print( Evoruby::Constants::LINE_DELIMITER ) - l.times { - io.print( "_" ) - } - io.print( Constants::LINE_DELIMITER ) - io.print( Constants::LINE_DELIMITER ) - io.print( prg_desc ) - io.print( Constants::LINE_DELIMITER ) - io.print( Constants::LINE_DELIMITER ) - io.print( "Copyright (C) " + copyright ) - io.print( Constants::LINE_DELIMITER ) - io.print( "Contact: " + contact ) - io.print( Constants::LINE_DELIMITER ) - io.print( " " + www ) - io.print( Constants::LINE_DELIMITER ) - io.print( Constants::LINE_DELIMITER ) - end - - def Util.fatal_error( prg_name, message, io = STDOUT ) - io.print( Constants::LINE_DELIMITER ) - if ( !Util.is_string_empty?( prg_name ) ) - io.print( "[" + prg_name + "] > " + message ) + class Util + + def Util.normalize_seq_name( name, length ) + if name.length > length + name = name[ 0, length ] + elsif name.length < length + for i in 0 ... length - name.length + name = name + " " + end + end + name + end + + # Returns true if char_code corresponds to: space * - . _ + def Util.is_aa_gap_character?( char_code ) + return ( char_code <= 32 || char_code == 42 || char_code == 45 || char_code == 46 ||char_code == 95 ) + end + + # Deletes *, digits, and whitespace, replaces BJOUZ? with X, and replaces non-(letters, -) with - + def Util.clean_seq_str( seq_str ) + seq_str = seq_str.upcase + seq_str = seq_str.gsub( /\s+/, '' ) + seq_str = seq_str.gsub( /\d+/, '' ) + seq_str = seq_str.gsub( '*', '' ) + seq_str = seq_str.gsub( /[BJOUZ?]/, 'X' ) + seq_str = seq_str.gsub( /[^A-Z\-]/, '-' ) + seq_str + end + + # raises ArgumentError + def Util.check_file_for_readability( path ) + unless ( File.exist?( path ) ) + error_msg = "file [#{path}] does not exist" + raise IOError, error_msg + end + unless ( File.file?( path ) ) + error_msg = "file [#{path}] is not a regular file" + raise IOError, error_msg + end + unless ( File.readable?( path ) ) + error_msg = "file [#{path}] is not a readable file" + raise IOError, error_msg + end + if ( File.zero?( path ) ) + error_msg = "file [#{path}] is empty" + raise IOError, error_msg + end + end + + # raises ArgumentError + def Util.check_file_for_writability( path ) + if File.directory?( path ) + error_msg = "file [#{path}] is an existing directory" + raise IOError, error_msg + elsif File.exist?( path ) + error_msg = "file [#{path}] already exists" + raise IOError, error_msg + elsif File.writable?( path ) + error_msg = "file [#{path}] is not writeable" + raise IOError, error_msg + end + end + + def Util.fatal_error_if_not_writable( prg_name, path ) + begin + Util.check_file_for_writability( path ) + rescue IOError => e + Util.fatal_error( prg_name, e.to_s ) + end + end + + def Util.fatal_error_if_not_readable( prg_name, path ) + begin + Util.check_file_for_readability( path ) + rescue IOError => e + Util.fatal_error( prg_name, e.to_s ) + end + end + + def Util.get_env_variable_value( env_variable ) + value = ENV[env_variable] + if value == nil || value.empty? + error_msg = "apparently environment variable #{env_variable} has not been set" + raise StandardError, error_msg + end + value + end + + + # raises ArgumentError + def Util.file2array( path, split_by_semicolon ) + Util.check_file_for_readability( path ) + a = Array.new() + c = 0 + File.open( path ) do | file | + while line = file.gets + if ( line =~ /^\s*(\S.*?)\s*$/ ) + s = $1 + if ( split_by_semicolon && s =~/;/ ) + sa = s.split( /;/ ) + for i in 0 ... sa.length() + a[ c ] = sa[ i ].strip! + end else - io.print( " > " + message ) - end - io.print( Constants::LINE_DELIMITER ) - io.print( Constants::LINE_DELIMITER ) - exit( -1 ) - end - - def Util.print_message( prg_name, message, io = STDOUT ) - if ( !Util.is_string_empty?( prg_name ) ) - io.print( "[" + prg_name + "] > " + message ) - else - io.print( " > " + message ) - end - io.print( Constants::LINE_DELIMITER ) - end - - def Util.print_warning_message( prg_name, message, io = STDOUT ) - if ( !Util.is_string_empty?( prg_name ) ) - io.print( "[" + prg_name + "] > WARNING: " + message ) - else - io.print( " > " + message ) - end - io.print( Constants::LINE_DELIMITER ) - end - - def Util.is_string_empty?( s ) - return ( s == nil || s.length < 1 ) - end - - # From "Ruby Cookbook" - # counts_hash: key is a "name", value is the count (integer) - def Util.draw_histogram( counts_hash, char = "#" ) - pairs = counts_hash.keys.collect { |x| [ x.to_s, counts_hash[ x ] ] }.sort - largest_key_size = pairs.max { |x, y| x[ 0 ].size <=> y[ 0 ].size }[ 0 ].size - pairs.inject( "" ) do | s, kv | - s << "#{ kv[ 0 ].ljust( largest_key_size ) } | #{ char*kv[ 1 ] }" + Constants::LINE_DELIMITER - end - end - - def Util.looks_like_fasta?( path ) - Util.check_file_for_readability( path ) - File.open( path ) do | file | - while line = file.gets - if ( line !~ /\S/ || line =~ /^\s*#/ ) - elsif line =~ /^\s*>\s*(.+)/ - return true - else - return false - end - end - end - error_msg = "unexpected format" - raise IOError, error_msg - end - - end # class Util + a[ c ] = s + end + c += 1 + end + end + end + return a + end + + def Util.print_program_information( prg_name, + prg_version, + prg_desc, + date, + copyright, + contact, + www, + io = STDOUT ) + + if RUBY_VERSION !~ /1.9/ + puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " ) + exit( -1 ) + end + + ruby_version = RUBY_VERSION + l = prg_name.length + prg_version.length + date.length + ruby_version.length + 12 + io.print( Evoruby::Constants::LINE_DELIMITER ) + io.print( prg_name + " " + prg_version + " [" + date + "] [ruby " + ruby_version + "]") + io.print( Evoruby::Constants::LINE_DELIMITER ) + l.times { + io.print( "_" ) + } + io.print( Constants::LINE_DELIMITER ) + io.print( Constants::LINE_DELIMITER ) + io.print( prg_desc ) + io.print( Constants::LINE_DELIMITER ) + io.print( Constants::LINE_DELIMITER ) + io.print( "Copyright (C) " + copyright ) + io.print( Constants::LINE_DELIMITER ) + io.print( "Contact: " + contact ) + io.print( Constants::LINE_DELIMITER ) + io.print( " " + www ) + io.print( Constants::LINE_DELIMITER ) + io.print( Constants::LINE_DELIMITER ) + end + + def Util.fatal_error( prg_name, message, io = STDOUT ) + io.print( Constants::LINE_DELIMITER ) + if ( !Util.is_string_empty?( prg_name ) ) + io.print( "[" + prg_name + "] > " + message ) + else + io.print( " > " + message ) + end + io.print( Constants::LINE_DELIMITER ) + io.print( Constants::LINE_DELIMITER ) + exit( -1 ) + end + + def Util.print_message( prg_name, message, io = STDOUT ) + if ( !Util.is_string_empty?( prg_name ) ) + io.print( "[" + prg_name + "] > " + message ) + else + io.print( " > " + message ) + end + io.print( Constants::LINE_DELIMITER ) + end + + def Util.print_warning_message( prg_name, message, io = STDOUT ) + if ( !Util.is_string_empty?( prg_name ) ) + io.print( "[" + prg_name + "] > WARNING: " + message ) + else + io.print( " > " + message ) + end + io.print( Constants::LINE_DELIMITER ) + end + + def Util.is_string_empty?( s ) + return ( s == nil || s.length < 1 ) + end + + # From "Ruby Cookbook" + # counts_hash: key is a "name", value is the count (integer) + def Util.draw_histogram( counts_hash, char = "#" ) + pairs = counts_hash.keys.collect { |x| [ x.to_s, counts_hash[ x ] ] }.sort + largest_key_size = pairs.max { |x, y| x[ 0 ].size <=> y[ 0 ].size }[ 0 ].size + pairs.inject( "" ) do | s, kv | + s << "#{ kv[ 0 ].ljust( largest_key_size ) } | #{ char*kv[ 1 ] }" + Constants::LINE_DELIMITER + end + end + + def Util.looks_like_fasta?( path ) + Util.check_file_for_readability( path ) + File.open( path ) do | file | + while line = file.gets + if ( line !~ /\S/ || line =~ /^\s*#/ ) + elsif line =~ /^\s*>\s*(.+)/ + return true + else + return false + end + end + end + error_msg = "unexpected format" + raise IOError, error_msg + end + + end # class Util end # module Evoruby