3 # countSpeciesSPTrEMBL.pl
4 # -----------------------
6 # Copyright (C) 2003 Christian M. Zmasek
10 # Last modified: 02/27/03
11 # Author: Christian M. Zmasek
12 # zmasek@genetics.wustl.edu
13 # http://www.genetics.wustl.edu/eddy/people/zmasek/
15 # Last modified 05/23/02
17 # Purpose. Counts species in SWISS-PROT and TrEMBL.
19 # Usage. countSpeciesSPTrEMBL.pl <path/to/trembl.dat> <path/to/sprot.dat> <outfile>
# Script version; interpolated into the usage banner printed by
# errorInCommandLine().
26 my $VERSION = "1.000";
# Per-species entry counter, filled by both parsing loops below and dumped
# to the output file at the end.
34 my %species_count = (); # full name -> count.
# Print the usage banner.
# NOTE(review): the condition guarding this call (presumably an
# argument-count check) is not visible in this listing -- confirm.
38 &errorInCommandLine();
# Positional arguments, in the order shown in the usage line:
# TrEMBL flat file, SWISS-PROT flat file, output file.
41 $infile_tr = $ARGV[ 0 ];
42 $infile_sp = $ARGV[ 1 ];
43 $outfile = $ARGV[ 2 ];
# Abort with an "already exists" message for the output file.
# NOTE(review): the condition guarding this die (presumably an existence
# test on $outfile) is not visible in this listing -- confirm.
48 die "\n$0: <<$outfile>> already exists.\n\n";
# Each input file must have non-zero size (-s), be a plain file (-f) and
# look like a text file (-T); otherwise the script aborts.
# The file name is wrapped in <<...>> in every message, matching the other
# diagnostics of this script.
50 unless ( ( -s $infile_tr ) && ( -f $infile_tr ) && ( -T $infile_tr ) ) {
51 die "\n$0: <<$infile_tr>> does not exist, is empty, or is not a plain textfile.\n\n";
53 unless ( ( -s $infile_sp ) && ( -f $infile_sp ) && ( -T $infile_sp ) ) {
54 die "\n$0: <<$infile_sp>> does not exist, is empty, or is not a plain textfile.\n\n";
# Open both inputs for reading and the output for writing; any failure is
# fatal and reports the system error ($!).
# The three-argument form of open keeps the mode separate from the file
# name, so a name containing leading/trailing whitespace or characters
# such as '>' or '|' is opened literally instead of being interpreted.
57 open( IN_TR, "<", $infile_tr ) || die "\n$0: Cannot open file <<$infile_tr>>: $!\n";
58 open( IN_SP, "<", $infile_sp ) || die "\n$0: Cannot open file <<$infile_sp>>: $!\n";
59 open( OUT, ">", $outfile ) || die "\n$0: Cannot create file <<$outfile>>: $!\n";
# Pass over the TrEMBL file, one line at a time.
64 while ( $return_line = <IN_TR> ) {
# "AC" line: captures the non-whitespace run that precedes a ';'.
# NOTE(review): the body of this branch is not visible in this listing;
# presumably it sets $read to 1 -- confirm.
65 if ( $return_line =~ /^AC\s+(\S+);/ ) {
# "OS" line, only honoured while $read == 1: captures the text after "OS"
# up to (not including) the final '.' before optional trailing whitespace.
68 elsif ( $return_line =~ /^OS\s+(.+)\.\s*$/ && $read == 1 ) {
# Count one more entry for species $os.
# NOTE(review): $os is assigned on lines not visible here, presumably from
# the capture above -- confirm.
74 if ( exists( $species_count{ $os } ) ) {
75 $species_count{ $os } = $species_count{ $os } + 1;
# First time this species is seen.
78 $species_count{ $os } = 1;
# Line starting with "//" while $read == 1 (presumably the end-of-entry
# marker of the flat-file format; the branch body is not visible here).
82 elsif ( $return_line =~ /^\/\// && $read == 1 ) {
# Pass over the SWISS-PROT file, one line at a time.
94 while ( $return_line = <IN_SP> ) {
# "ID" line: captures the first non-whitespace token after "ID".
# NOTE(review): the body of this branch is not visible in this listing;
# presumably it sets $read to 1 -- confirm.
95 if ( $return_line =~ /^ID\s+(\S+)/ ) {
# "OS" line, only honoured while $read == 1: captures the text after "OS".
# Unlike the TrEMBL pattern, this regex does not exclude a trailing '.';
# NOTE(review): any trimming must happen on lines not visible here -- confirm.
98 elsif ( $return_line =~ /^OS\s+(.+)\s*$/ && $read == 1 ) {
# Count one more entry for species $os (same counter as the TrEMBL pass).
# NOTE(review): $os is assigned on lines not visible here -- confirm.
105 if ( exists( $species_count{ $os } ) ) {
106 $species_count{ $os } = $species_count{ $os } + 1;
# First time this species is seen.
109 $species_count{ $os } = 1;
# Line starting with "//" while $read == 1 (presumably the end-of-entry
# marker of the flat-file format; the branch body is not visible here).
113 elsif ( $return_line =~ /^\/\// && $read == 1 ) {
# Write one "<species>: <count>" line per species to the output file,
# ordered by count from highest to lowest (numeric comparison with $b
# before $a gives the descending order).
122 foreach my $species ( sort { $species_count{ $b } <=> $species_count{ $a } } keys %species_count ) {
123 print OUT "$species: $species_count{$species}\n";
# Report completion on standard output.
127 print "\n\nDone!\n\n";
138 sub errorInCommandLine {
140 print " countSpeciesSPTrEMBL.pl $VERSION\n";
141 print " -----------------------\n";
143 print " Christian Zmasek (zmasek\@genetics.wustl.edu)\n";
145 print " Purpose. Counts species in SWISS-PROT and TrEMBL.\n";
147 print " Usage. countSpeciesSPTrEMBL.pl <path/to/trembl.dat> <path/to/sprot.dat> <outfile>\n";