6 # Copyright (C) 2001 Washington University School of Medicine
7 # and Howard Hughes Medical Institute
11 # Author: Christian M. Zmasek
12 # zmasek@genetics.wustl.edu
13 # http://www.genetics.wustl.edu/eddy/people/zmasek/
15 # Last modified 05/23/02
17 # Purpose. Extracts ID, DE, and species from a "sprot.dat" file.
18 # The output is used by "rio.pl".
19 # If a species list (format: SWISS-PROT-code=full name) is supplied:
20 # only sequences from species found in this list are written to
21 # outfile (recommended).
23 # Usage. extractSWISS-PROT.pl <infile> <outfile> [species list]
25 # Remark. Need to re-run this if species in species tree or species list
26 # are added/changed or if a new version of Pfam is used!!
# Version string; written into the output header and shown in the usage banner.
32 my $VERSION = "1.001";
# Lookup set of accepted species codes, filled by readSpeciesNamesFile().
# Only the keys matter: every value stored is the empty string.
41 my %Species_names = (); # SWISS-PROT name -> "".
# Exactly two or three command-line arguments are accepted; any other count
# is routed to errorInCommandLine().
44 if ( @ARGV != 2 && @ARGV != 3 ) {
45 &errorInCommandLine();
# Second argument: path of the output file.
49 $outfile = $ARGV[ 1 ];
# Third argument: path of the species list.
# NOTE(review): presumably only assigned when three arguments were given --
# the guarding condition is not visible here; confirm.
52 $speciesfile = $ARGV[ 2 ];
# The species list must be non-empty (-s), a regular file (-f) and look like
# text (-T); otherwise the script aborts.
53 unless ( ( -s $speciesfile ) && ( -f $speciesfile ) && ( -T $speciesfile ) ) {
54 die "\n$0: <<$speciesfile>> does not exist, is empty, or is not a plain textfile.\n\n";
# Load the accepted species codes into %Species_names.
56 &readSpeciesNamesFile( $speciesfile );
# NOTE(review): judging by the message, this refuses to overwrite an existing
# output file; the test that triggers it is not visible here.
60 die "\n$0: <<$outfile>> already exists.\n\n";
# The input file must pass the same three checks as the species list.
62 unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
63 die "\n$0: <<$infile>> does not exist, is empty, or is not a plain textfile.\n\n";
# NOTE(review): both opens use the two-argument form with bareword handles,
# so the open mode is parsed out of the filename string itself. A
# three-argument open with lexical handles would be safer.
66 open( IN, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n";
67 open( OUT, ">$outfile" ) || die "\n$0: Cannot create file <<$outfile>>: $!\n";
# Header block of the output file; each header line starts with a hash sign.
# NOTE(review): the header names extractTrembl.pl and trembl.dat, whereas the
# usage text calls this script extractSWISS-PROT.pl reading sprot.dat. This
# looks like a copy-paste leftover; check whether rio.pl depends on the exact
# header text before changing it.
69 print OUT "# extractTrembl.pl version: $VERSION\n";
70 print OUT "# trembl.dat file: $infile\n";
71 print OUT "# output file : $outfile\n";
72 print OUT "# species file : $speciesfile\n";
# The date is taken from the external date command. Its output normally ends
# with a newline already, so the two newlines appended here add blank lines.
73 print OUT "# date : ".`date`."\n\n";
# Scan the input one line at a time.
77 while ( $return_line = <IN> ) {
# ID line: capture the first whitespace-delimited token after the ID tag.
# NOTE(review): presumably this capture becomes $ac -- assignment not visible.
78 if ( $return_line =~ /^ID\s+(\S+)/ ) {
# The identifier must contain an upper-case/digit run, an underscore and a
# second such run; the part after the underscore is captured.
# NOTE(review): presumably stored in $os as the species code -- confirm.
81 if ( $ac =~ /[A-Z0-9]+_([A-Z0-9]+)/ ) {
# NOTE(review): presumably the else branch of the pattern check above;
# aborts on an identifier in an unexpected format.
85 die "\n$0: Unexpected format: $ac.\n\n";
# A non-empty $speciesfile means species filtering is active: test whether
# this entry's species code is a key of %Species_names.
# NOTE(review): unlisted entries are presumably skipped (body not visible).
87 if ( $speciesfile ne "" ) {
88 unless ( exists( $Species_names{ $os } ) ) {
# DE line: capture the text after the DE tag, only while $read is 1.
97 elsif ( $return_line =~ /^DE\s+(.+)/ && $read == 1 ) {
# A line starting with two slashes while $read is 1: write one
# semicolon-separated record (ac;de;os) to the output file.
105 elsif ( $return_line =~ /^\/\// && $read == 1 ) {
107 print OUT "$ac;$de;$os\n";
# Trailer line reporting $i.
# NOTE(review): $i is presumably the number of records written -- its
# increment is not visible here.
117 print OUT "\n # $i entries.\n";
125 # Reads in species file.
126 # Format: SWISS-PROT-code=full name (e.g. "BACSU=Bacillus subtilis")
127 # Lines beginning with "#" are ignored.
128 # One argument: species file-name
129 # Last modified: 04/24/01
130 sub readSpeciesNamesFile {
# Registers species codes from the given file as keys of the file-level
# %Species_names hash. Dies if the file is unusable or cannot be opened.
# First (and only) argument: path of the species file.
131 my $infile = $_[ 0 ];
132 my $return_line = "";
# Refuse a file that is missing or empty (-s), not a regular file (-f),
# or not text (-T).
136 unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
137 die "\n$0: readSpeciesNamesFile: <<$infile>> does not exist, is empty, or is not a plain textfile.\n";
# NOTE(review): two-argument open on a bareword handle; a three-argument
# open with a lexical handle would be safer.
140 open( IN_RSNF, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n";
141 while ( $return_line = <IN_RSNF> ) {
# Ignore lines whose first non-blank character is a hash sign; on the rest,
# look for a non-whitespace token, an equals sign and at least one further
# character.
142 if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)=(.+)/ ) {
# Only the key is recorded; the stored value is always the empty string.
# NOTE(review): $sp is presumably the first capture of the pattern above --
# its assignment is not visible here; confirm.
147 $Species_names{ $sp } = "";
157 sub errorInCommandLine {
159 print " extractSWISS-PROT.pl $VERSION\n";
160 print " --------------------\n";
162 print " Christian Zmasek (zmasek\@genetics.wustl.edu)\n";
164 print " Purpose. Extracts ID, DE, and species from a \"sprot.dat\" file.\n";
165 print " The resulting output is used by \"rio.pl\".\n";
166 print " If a species list (format: SWISS-PROT-code=full name) is supplied:\n";
167 print " only sequences from species found in this list are written to\n";
168 print " outfile (recommended).\n";
170 print " Usage. extractSWISS-PROT.pl <infile> <outfile> [species list]\n";
172 print " Remark. Need to re-run this if species in species tree or species list\n";
173 print " are added/changed or if a new version of Pfam is used!!\n";