in progress
[jalview.git] / forester / ruby / scripts / scoptastic.rb
1 #!/usr/local/bin/ruby -w
2 #
3 # = scoptastic
4 #
5 # Copyright::  Copyright (C) 2008-2009 Christian M. Zmasek.
6 #              All rights reserved.
7 # License::    GNU Lesser General Public License (LGPL)
8 #
9 # $Id: scoptastic.rb,v 1.3 2008/08/28 17:09:07 cmzmasek Exp $
10 #
11 # To create Pfam id to SCOP mappings, one for each of four levels of SCOP
12 # classification.
13 #
14 # Created 2008-06-25 in San Diego, CA, USA by CMZ
15 #
16 # Usage: scoptastic.rb <Pfam id to ac map file, e.g.
17 # pfam_summarize.rb output> <Pfam ac to SCOP classification map file> <Pfam id
18 # to SCOP outfile root>
19
20
21 require 'iconv'
22
module ForesterScripts

    # Creates Pfam id to SCOP mappings, written to four outfiles: one for each
    # of the class, fold, superfamily and family levels of a SCOP
    # classification.
    #
    # Command line arguments (exactly three):
    #   1. Pfam id to ac map file: lines of "<Pfam id> <PFnnnnn>"
    #   2. Pfam ac to SCOP classification map file: lines of
    #      "<PFnnnnn[.version]> <x.n.n.n>"
    #   3. outfile root; the four level suffixes below are appended to it
    #
    # Lines starting with '#' and lines without any non-whitespace character
    # are skipped in both input files. Every error is reported on stdout and
    # terminates the script with exit status -1.

    # Iconv is used below, so only the 1.9 series is accepted. The pattern is
    # anchored and the dots are escaped so that versions which merely contain
    # the characters "1", any character, "9" (e.g. "2.1.9") are rejected too.
    if RUBY_VERSION !~ /\A1\.9(?:\.|\z)/
        puts( "Your ruby version is #{RUBY_VERSION}, expected 1.9.x " )
        exit( -1 )
    end

    CLASS_LEVEL_SUFFIX       = "_SCOP_2_CLASS"
    FOLD_LEVEL_SUFFIX        = "_SCOP_3_FOLD"
    SUPERFAMILY_LEVEL_SUFFIX = "_SCOP_4_SUPERFAMILY"
    FAMILY_LEVEL_SUFFIX      = "_SCOP_5_FAMILY"

    SEP = "\t"
    LINE_DELIMITER  = "\n"

    if ARGV.length != 3
        puts( "usage: scoptastic.rb <Pfam id to ac map file, e.g. pfam_summarize.rb output> <Pfam ac to SCOP classification map file> <Pfam id to SCOP outfile root>" )
        exit( -1 )
    end

    pfam_id_to_ac   = ARGV[ 0 ]
    pfam_ac_to_scop = ARGV[ 1 ]
    outfile         = ARGV[ 2 ]

    # Ordered from the coarsest to the finest level: the position of a suffix
    # in this list is also the index of the last SCOP component that is
    # written to the corresponding outfile.
    level_suffixes = [ CLASS_LEVEL_SUFFIX,
                       FOLD_LEVEL_SUFFIX,
                       SUPERFAMILY_LEVEL_SUFFIX,
                       FAMILY_LEVEL_SUFFIX ]

    unless File.exist?( pfam_id_to_ac )
        puts( "Pfam id to ac map file [" + pfam_id_to_ac + "] does not exist" )
        exit( -1 )
    end
    unless File.exist?( pfam_ac_to_scop )
        puts( "Pfam ac to SCOP classification map file [" + pfam_ac_to_scop + "] does not exist" )
        exit( -1 )
    end
    # Never overwrite the result of an earlier run.
    level_suffixes.each do | suffix |
        if File.exist?( outfile + suffix )
            puts( "Outfile [" + outfile + suffix + "] already exists" )
            exit( -1 )
        end
    end

    # UTF-8 to UTF-8 conversion with //IGNORE: each input line is passed
    # through it before any pattern matching is applied.
    ic = Iconv.new( 'UTF-8//IGNORE', 'UTF-8' )

    pfam_ac_to_id_map   = Hash.new
    pfam_ac_to_scop_map = Hash.new

    # Pass 1: Pfam id to ac map file. The file lists the id first and the
    # accession second; the map is keyed by accession.
    count = 0
    File.open( pfam_id_to_ac ) do | file |
        file.each_line do | raw_line |
            line = ic.iconv( raw_line )
            next if line =~ /^#/ || line !~ /\S/
            if line =~ /^(\S+)\s+(PF\d+)/
                pfam_ac_to_id_map[ $2 ] = $1
                count += 1
            else
                puts( "Pfam id to ac map file [" + pfam_id_to_ac + "] format error [line: " + line + "]" )
                exit( -1 )
            end
        end
    end
    puts()
    puts( "Extracted #{count} Pfam id to ac mappings from file [#{pfam_id_to_ac}]" )

    # Pass 2: Pfam ac to SCOP classification map file. An optional ".<digits>"
    # suffix on the accession is matched but not made part of the key.
    count = 0
    File.open( pfam_ac_to_scop ) do | file |
        file.each_line do | raw_line |
            line = ic.iconv( raw_line )
            next if line =~ /^#/ || line !~ /\S/
            if line =~ /^(PF\d+)\.?\d*\s+([a-z]\.\d+\.\d+\.\d+)/
                pfam_ac_to_scop_map[ $1 ] = $2
                count += 1
            else
                puts( "Pfam ac to SCOP classification map file [" + pfam_ac_to_scop + "] format error [line: " + line + "]" )
                exit( -1 )
            end
        end
    end

    puts( "Extracted #{count} Pfam ac to SCOP classification mappings from file [#{pfam_ac_to_scop}]" )

    # Check every accession BEFORE any outfile is created: a missing accession
    # must not leave partially written outfiles behind, since those would
    # make the next run stop with "already exists".
    pfam_ac_to_scop_map.each_key do | pfam_ac |
        unless pfam_ac_to_id_map.has_key?( pfam_ac )
            puts( "Pfam ac #{pfam_ac} not found in Pfam id to ac map file [" + pfam_id_to_ac + "]" )
            exit( -1 )
        end
    end

    count = 0
    outs  = []
    begin
        level_suffixes.each do | suffix |
            outs << File.open( outfile + suffix, 'w' )
        end

        pfam_ac_to_scop_map.each do | pfam_ac, scop |
            pfam_id    = pfam_ac_to_id_map[ pfam_ac ]
            scop_split = scop.split( "." )

            # Outfile number "level" (0-based) receives the first level + 1
            # components of the classification: "a", "a.1", "a.1.2", "a.1.2.3".
            outs.each_with_index do | out, level |
                out.write( pfam_id + SEP + scop_split[ 0..level ].join( "." ) + LINE_DELIMITER )
            end
            count += 1
        end
    ensure
        # Close whatever was opened, also when opening or writing failed.
        outs.each { | out | out.close }
    end

    puts()
    puts( "Wrote #{count} Pfam id to SCOP mappings to files '#{outfile + CLASS_LEVEL_SUFFIX}', '#{outfile + FOLD_LEVEL_SUFFIX}', '#{outfile + SUPERFAMILY_LEVEL_SUFFIX}', and '#{ outfile + FAMILY_LEVEL_SUFFIX}'" )
    puts( "OK" )
    puts()

end # module ForesterScripts
163