3 # Unpack a pfam flatfile, containing many alignments,
4 # into separate SELEX-format alignment files.
6 # Assumes that ID is the first line in a record,
7 # that SQ is the last line before the alignment starts,
8 # and that there is one aligned sequence per line.
12 ################################################################
13 # PFAMSERVER - The Washington University/St. Louis Pfam web server
14 # Copyright (C) 1995-1999 Washington University School of Medicine
15 # Copyright (C) 1995-1999 Sanger Centre/Genome Research Ltd.
16 # Copyright (C) 1998-1999 Karolinska Institutet Center for Genomics Research
19 # This source code is distributed under the terms of the
20 # GNU General Public License. See the files COPYRIGHT and LICENSE
23 ################################################################
25 $cpl = 50; # 50 sequence characters per line
26 $/ = "\n//"; # paragraph mode on // separators
32 while ($line = shift(@lines)) {
34 if ($line =~ /^\#/) { next; }
35 elsif ($line =~ /^(\S+)\s+(\S+)/) {
41 elsif ($line =~ /^\#=GF ID (\S+)\s*$/) {
43 print "working on $root\n";
45 system ("mv $root $root.orig");
46 print "$root exists -- moved to $root.orig\n";
48 open(SELEX,">$root") || die;
49 print SELEX "#=ID $root\n";
51 elsif ($line =~ /^\#=GF AC (.+)$/) { print SELEX "#=AC $1\n"; }
52 elsif ($line =~ /^\#=GF DE (.+)$/) { print SELEX "#=DE $1\n"; }
54 elsif ($line =~ /^\#=GF GA (\S+)\s+(\S+)/)
55 { print SELEX "#=GA $1 $2\n"; }
57 elsif ($line =~ /^\#=GF TC (\S+) (\S+)/)
58 { print SELEX "#=TC $1 $2\n"; }
60 elsif ($line =~ /^\#=GF NC (\S+) (\S+)/)
61 { print SELEX "#=NC $1 $2\n"; }
63 elsif ($line =~ /^\#=GF SQ \d+/) {
64 print SELEX "# $line";
67 elsif ($line =~ /^\/\//) {
71 print SELEX "# $line";
75 # figure out maximum name length
77 for ($idx = 0; $idx < $nseq; $idx++) {
78 if (length($name[$idx]) > $maxnamelen) {
79 $maxnamelen = length($name[$idx]);
82 # break the alignment across
84 $alen = length($aseq[0]);
85 for ($pos = 0; $pos < $alen; $pos += $cpl) {
86 for ($idx = 0; $idx < $nseq; $idx++) {
87 printf(SELEX "%-${maxnamelen}s %s\n",
88 $name[$idx], substr($aseq[$idx], $pos, $cpl));