6 # Copyright (C) 2001 Washington University School of Medicine
7 # and Howard Hughes Medical Institute
11 # Author: Christian M. Zmasek
12 # zmasek@genetics.wustl.edu
13 # http://www.genetics.wustl.edu/eddy/people/zmasek/
15 # Last modified 05/23/02
17 # Purpose. Extracts AC, DE, and OS from a "trembl.dat" file.
18 # The output is used by "rio.pl".
19 # If a species list (format: SWISS-PROT-code=full name) is supplied:
20 # only sequences from species found in this list are written to
21 # outfile and their full species names replaced with their SWISS-PROT
24 # Usage. extractTrembl.pl <infile> <outfile> [species list]
26 # Remark. Need to re-run this if species in species tree or species list
27 # are added/changed or if a new version of Pfam is used!!
29 # Some "heuristic" is required for Synechococcus, Synechocystis, Anabaena:
# Script version; it is written into the output-file header and shown in the
# usage text printed by errorInCommandLine().
35 my $VERSION = "1.001";
44 my %Species_names = (); # full name -> SWISS-PROT name.
# Exactly two or three command-line arguments are accepted; any other count
# calls errorInCommandLine().
47 if ( @ARGV != 2 && @ARGV != 3 ) {
48 &errorInCommandLine();
# Second argument: path of the output file.
52 $outfile = $ARGV[ 1 ];
# Third argument: path of the species list.
# NOTE(review): presumably only reached when three arguments were given --
# the guarding condition is not visible here; confirm.
55 $speciesfile = $ARGV[ 2 ];
# -s: non-empty, -f: plain file, -T: looks like a text file.
56 unless ( ( -s $speciesfile ) && ( -f $speciesfile ) && ( -T $speciesfile ) ) {
57 die "\n$0: <<$speciesfile>> does not exist, is empty, or is not a plain textfile.\n\n";
# Fill %Species_names from the species list.
59 &readSpeciesNamesFile( $speciesfile );
# Abort rather than overwrite an existing output file.
# NOTE(review): the existence test guarding this die is not visible here.
63 die "\n$0: <<$outfile>> already exists.\n\n";
# The input must be a non-empty plain text file.
65 unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
66 die "\n$0: <<$infile>> does not exist, is empty, or is not a plain textfile.\n\n";
# Input is opened for reading; output is opened with ">" (create/truncate).
# NOTE(review): both are two-argument opens on bareword handles with the
# file name interpolated into the mode string; the three-argument form
# would keep unusual file names from being read as a mode.
69 open( IN, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n";
70 open( OUT, ">$outfile" ) || die "\n$0: Cannot create file <<$outfile>>: $!\n";
# Header lines, each starting with "#", recording how the output was made.
72 print OUT "# extractTrembl.pl version: $VERSION\n";
73 print OUT "# trembl.dat file: $infile\n";
74 print OUT "# output file : $outfile\n";
75 print OUT "# species file : $speciesfile\n";
# The backticks run the external "date" command and insert its output.
76 print OUT "# date : ".`date`."\n\n";
# Main loop: read the input one line at a time and dispatch on the line tag.
80 while ( $return_line = <IN> ) {
# AC line: captures a non-whitespace token that is followed by ";".
81 if ( $return_line =~ /^AC\s+(\S+);/ ) {
# DE line: captures the rest of the line; only considered while $read == 1.
# NOTE(review): where $read is set is not visible here -- presumably in the
# AC branch; confirm.
85 elsif ( $return_line =~ /^DE\s+(.+)/ && $read == 1 ) {
# OS line: must end in "." (optional trailing whitespace); the capture is
# the text before that final period. Only considered while $read == 1.
# NOTE(review): the assignment to $os is not visible -- presumably from $1.
94 elsif ( $return_line =~ /^OS\s+(.+)\.\s*$/ && $read == 1 ) {
# Special case, applied only when a species file was given: for species
# names containing one of these three genera, look for a "PCC <digits>"
# designation in the name. What is done with the captured number is not
# visible here.
96 if ( $speciesfile ne ""
97 && ( $os =~ /Synechococcus/
98 || $os =~ /Synechocystis/
99 || $os =~ /Anabaena/ ) ) {
100 if ( $os =~ /PCC\s*(\d+)/ ) {
# With a species file: test whether the full species name is a key of
# %Species_names, then replace $os with the mapped SWISS-PROT code.
# NOTE(review): the body of the "unless" is not visible -- presumably it
# drops entries from species missing in the list, as the file header says.
116 if ( $speciesfile ne "" ) {
117 unless ( exists( $Species_names{ $os } ) ) {
124 $os = $Species_names{ $os };
# "//" line while $read == 1: write one "ac;de;os" record.
127 elsif ( $return_line =~ /^\/\// && $read == 1 ) {
129 print OUT "$ac;$de;$os\n";
# Trailer line reporting $i as the number of entries.
# NOTE(review): where $i is incremented is not visible here.
139 print OUT "\n # $i entries.\n";
147 # Reads in species file.
148 # Format: SWISS-PROT=full name (e.g. "BACSU=Bacillus subtilis")
149 # Lines beginning with "#" are ignored.
150 # One argument: species file-name
151 # Last modified: 04/24/01
# Side effect: adds entries to the file-level hash %Species_names, keyed by
# $full with $sp as the value.
# Dies if the file is missing, empty, not a plain text file, or cannot be
# opened.
152 sub readSpeciesNamesFile {
153 my $infile = $_[ 0 ];
154 my $return_line = "";
# -s: non-empty, -f: plain file, -T: looks like a text file.
158 unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
159 die "\n$0: readSpeciesNamesFile: <<$infile>> does not exist, is empty, or is not a plain textfile.\n";
# Two-argument open with no mode prefix, i.e. opened for reading.
162 open( IN_RSNF, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n";
163 while ( $return_line = <IN_RSNF> ) {
# Skip lines whose first non-blank character is "#"; otherwise look for
# "<non-whitespace>=<rest of line>" anywhere in the line (the pattern is
# not anchored).
164 if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)=(.+)/ ) {
# NOTE(review): the assignments to $sp and $full are not visible here --
# presumably $sp is the first capture (code) and $full the second (name).
169 $Species_names{ $full } = $sp;
179 sub errorInCommandLine {
181 print " extractTrembl.pl $VERSION\n";
182 print " ----------------\n";
184 print " Christian Zmasek (zmasek\@genetics.wustl.edu)\n";
186 print " Purpose. Extracts AC, DE, and OS from a \"trembl.dat\" file.\n";
187 print " The resulting output is used by \"rio.pl\".\n";
188 print " If a species list (format: SWISS-PROT-code=full name) is supplied:\n";
189 print " only sequences from species found in this list are written to\n";
190 print " outfile and their full species names replaced with their SWISS-PROT\n";
191 print " code (recommended).\n";
193 print " Usage. extractTrembl.pl <infile> <outfile> [species list]\n";
195 print " Remark. Need to re-run this if species in species tree or species list\n";
196 print " are added/changed or if a new version of Pfam is used!!\n";