binaries/src/jpred/jpred.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 jpred - Secondary structure prediction program
   6
   7 =head1 SYNOPSIS
   8
   9 ./jpred.pl -in/-sequence <FILE1> [-outfile <FILE2>] [-logfile <FILE3>] [-output <FILEPREFIX>] [-dbname <DBNAME>] [-dbpath <PATH>] [-ncpu NNN] [-psi <psiblast output>] [-pred-nohits] [-no-final] [-jabaws] [-verbose] [-debug] [-help] [-man]
  10
  11 =head1 DESCRIPTION
  12
  13 This is a program which predicts the secondary structure of a protein sequence given a path to a
  14 FASTA sequence. It does all the PSI-BLAST searching and alignment 'meddling' as required by Jnet.
  15
  16 The program is primarily design for use by the Jpred server, but it can be used directly by any user.
  17 Some of the output may not be meaningful if used directly - it can be safely ignored.
  18
  19 =head1 OPTIONS
  20
  21 =over 5
  22
  23 =item -in/-sequence <FILE1>
  24
  25 The path to the sequence file (in FASTA format) you want to predict.
  26
  27 =item -output <FILEPREFIX>
  28
  29 A prefix to the filenames created by Jpred, defaults to the value set by -sequence/-in.
  30
  31 =item -outfile <FILE2>
  32
  33 An output file, by default it is undefined and the -output option is working
  34
  35 =item -logfile <FILE3>
  36
  37 Logging file, by default it is undefined and logging switched off
  38
  39 =item -dbpath PATH
  40
  41 Path to the uniref database used for PSI-BLAST querying. default value: .
  42
  43 =item -dbname database
  44
  45 Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code.
  46 Default value: uniref90
  47
  48 =item -psi path
  49
  50 Path to a PSIBLAST output file.
  51
  52 =item -ncpu NNN
  53
  54 Number of CPU used by jpred.pl. Maximum value is 8
  55 Default: NNN = 1
  56
  57 =item -pred-nohits
  58
  59 Toggle allowing Jpred to make predictions even when there are no PSI-BLAST hits.
  60
  61 =item -no-final
  62
  63 By default jpred generates a fasts file with sequences found with PSI-BLAST and predicted sequence features
  64 The option untoggles this step and stops jpred at the concise file generation
  65
  66 =item -jabaws
  67
  68 Redefined basic variable in order to use the script within JABAWS
  69
  70 =item -verbose
  71
  72 Verbose mode. Print more information while running jpred.
  73
  74 =item -debug
  75
  76 Debug mode. Print debugging information while running jpred.
  77
  78 =item -help
  79
  80 Gives help on the programs usage.
  81
  82 =item -man
  83
  84 Displays this man page.
  85
  86 =back
  87
  88 =head1 BUGS
  89
  90 Can't cope with gaps in the query sequence.
  91
  92 Doesn't accept alignments.
  93
  94 If you find any others please contact authors.
  95
  96 =head1 AUTHORS
  97
  98 Jonathan Barber <jon@compbio.dundee.ac.uk>
  99 Chris Cole <christian@cole.name>
 100 Alexander Sherstnev <a.sherstnev@dundee.ac.uk> (current maintainer)
 101
 102 =cut
 103
 104 ## TODO check for gaps in query sequence
 105 ##      check for disallowed chars in sequence
 106
 107 use strict;
 108 use warnings;
 109 use Getopt::Long;
 110 use Pod::Usage;
 111 use Carp;
 112 use Data::Dumper;
 113 use File::Temp;
 114
 115 #use UNIVERSAL;
 116
 117 # path to Jpred modules
 118 use FindBin qw($Bin);
 119 use lib "$Bin/lib";
 120
 121 # internal jpred modules
 122 use Jpred;
 123 use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin check_OS setup_env);
 124 use HMMER::Profile;
 125 use HMMER::Profile::Jnet;
 126 use PSIBLAST;
 127 use PSIBLAST::PSSM;
 128 use PSIBLAST::Run;
 129 use FASTA::File;
 130 use FASTA;
 131 use Index;
 132 use Pairwise;
 133 use OC;
 134 use Utils qw(profile);
 135 use Run qw(check);
 136
 137 our $VERSION = '3.0.2';
 138 my $MAX_ALIGN = 10000;    # maximum number of alignment sequences allowed
 139 my $NR_CUT    = 75;       # cut-off seqeunce ID value for defining non-redundancy
 140
 141 # Gap characters in sequences
 142 our $GAP = '-';
 143
 144 #####################################################################################################
 145 # Light object to hold sequence information, light to prevent memory overload
 146 # for big search results. Even so this is quite memory intensive, I can't see
 147 # how to reduce the size further.
 148 {
 149
 150   package PSISEQ;
 151
 152   sub new {
 153     my ( $class, %args ) = @_;
 154     my $self = [];
 155     bless $self, $class;
 156     for ( keys %args ) { $self->$_( $args{$_} ) }
 157     return $self;
 158   }
 159
 160   sub id {
 161     return defined $_[1]
 162       ? $_[0]->[0] = $_[1]
 163       : $_[0]->[0];
 164   }
 165
 166   sub start {
 167     return defined $_[1]
 168       ? $_[0]->[1] = $_[1]
 169       : $_[0]->[1];
 170   }
 171
 172   sub align {
 173     if ( defined $_[1] ) {
 174       ref $_[1] eq 'ARRAY' or die "Not passed ARRAY ref";
 175
 176       $_[0]->[2] = join( "", @{ $_[1] } );
 177     } else {
 178       return [ split( //, $_[0]->[2] ) ];
 179     }
 180   }
 181 }
 182
 183 #####################################################################################################
 184 my $infile;
 185 my $outfile;
 186 my $prefix;
 187 my $psiblast;
 188 my $jabaws;
 189 my $logfile;
 190 my $ncpu       = 1;           # number of CPUs for psiblast calculations
 191 my $predNoHits = 0;           # define whether to make predictions when no PSI-BLAST hits are found
 192 my $db_path    = "/homes/www-jpred/databases";
 193 my $db_entry   = "cluster";
 194 my $nofinal;
 195
 196 my ( $help, $man, $DEBUG, $VERBOSE );
 197
 198 GetOptions(
 199   "in|sequence=s" => \$infile,
 200   "output=s"      => \$prefix,
 201   "outfile=s"     => \$outfile,
 202   "logfile=s"     => \$logfile,
 203   "psi=s"         => \$psiblast,
 204   "dbname=s"      => \$db_entry,
 205   "dbpath=s"      => \$db_path,
 206   "ncpu=s"        => \$ncpu,
 207   "pred-nohits"   => \$predNoHits,
 208   "no-final"      => \$nofinal,
 209   "jabaws"        => \$jabaws,
 210   "help"          => \$help,
 211   "man"           => \$man,
 212   "debug"         => \$DEBUG,
 213   "verbose"       => \$VERBOSE
 214 ) or pod2usage(2);
 215 pod2usage(1) if $help;
 216 pod2usage( verbose => 2 ) if $man;
 217
 218 #####################################################################################################
 219 # Key to database information and information for accessing them
 220 my $database = {
 221
 222   ## default database used by Jpred
 223   uniref90 => {
 224     database   => $db_path . "/uniref90.filt",
 225     unfiltered => $db_path . "/uniref90",
 226   },
 227
 228   ## database used during Jnet training
 229   training => {
 230     database   => $db_path . "/training/uniref90.filt",
 231     unfiltered => $db_path . "/training/uniref90",
 232   },
 233
 234   ## database used for ported Jpred
 235   ported_db => {
 236     database   => $db_path . "/uniref90.filt",
 237     unfiltered => $db_path . "/uniref90",
 238   },
 239
 240   ## cluster-specific path for Jpred
 241   cluster => {
 242     database   => $db_path . "uniref90.filt",
 243     unfiltered => $db_path . "uniref90",
 244   },
 245
 246   ## these other DBs are experimental ones used during development.
 247   ## check they exist before using them.
 248   # generic entry for use with validate_jnet.pl
 249   # real db location defined by validate_jnet.pl
 250   swall => {
 251     database   => $db_path . "/swall/swall.filt",
 252     unfiltered => $db_path . "/swall/swall",
 253   },
 254
 255   # Path to PSIBLAST db
 256   uniprot => {
 257     database   => $db_path . "/3/swall.filt",
 258     unfiltered => $db_path . "/3/swall",
 259   },
 260
 261   uniref50 => {
 262     database   => $db_path . "/6/swall.filt",
 263     unfiltered => $db_path . "/6/swall",
 264   },
 265 };
 266
 267 pod2usage(' ERROR! Input file with a sequence should be provided with  -sequence/-in')                           unless $infile;
 268 pod2usage(' ERROR! Unknown database at ' . $db_path . '. Use -dbpath and -dbname for configuring the database' ) unless exists $database->{$db_entry};
 269 pod2usage(' ERROR! UNIREF database doesn\'t exist. Use -dbpath and -dbname for configuring the database')        unless -f $database->{$db_entry}{database};
 270
 271 if ( defined $prefix ) {
 272   unless (defined $outfile) {
 273     $outfile = $prefix. ".res.fasta";
 274   }
 275 } else {
 276   if (defined $outfile) {
 277     print "WARNING! file prefix is not defined. Jpred will use $outfile.tmp as the prefix\n";
 278     $prefix = $outfile . ".tmp";
 279   } else {
 280     print "WARNING! file prefix is not defined. Jpred will use $infile.res as the prefix\n";
 281     $prefix = $infile . ".res";
 282     $outfile = $prefix. ".jnet";
 283   }
 284 }
 285
 286 if ( 1 > $ncpu or 8 < $ncpu ) {
 287   print "WARNING! the nuber of CPUs should be between 1 and 8. Jpred will use 1 CPU\n";
 288   $ncpu = 1;
 289 }
 290
 291 $VERBOSE = 1 if $DEBUG;
 292
 293 my $LOG;
 294 if (defined $logfile) {
 295   open( $LOG, ">", $logfile ) or die "ERROR! unable to open '$logfile': ${!}\nDied";
 296 }
 297
 298 for ( [ "input file", $infile ], [ "out file", $outfile ], [ "outprefix", $prefix ], [ "psi", $psiblast ], [ "db name", $db_entry ], [ "db path", $db_path ] ) {
 299   my ( $key, $value ) = @{$_};
 300   defined $value or next;
 301   my $par = join( ": ", $key, $value );
 302   print "$par\n" if $DEBUG;
 303   print $LOG "$par\n" if $LOG;
 304 }
 305
 306 #####################################################################################################
 307 if (defined $jabaws) {
 308   setup_jpred_env($Bin);
 309   setup_env($Bin);
 310 }
 311 my $platform = check_OS();
 312 print "JPRED: checking platiform... $platform\n" if $DEBUG;
 313 print $LOG "JPRED: checking platiform... $platform\n" if $LOG;
 314
 315 #####################################################################################################
 316 my $query = FASTA::File->new( read_file => $infile );
 317
 318 my @seqs;
 319 my $psi = PSIBLAST->new;
 320 unless ( defined $psiblast && -e $psiblast ) {
 321   ## potentially a better set of parameters (esp. -b & -v) which avoid PSI-BLAST dying with large number of hits
 322   # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"'  # 'soft' filtering of the query sequence
 323   # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3'        # stricter searching criteria
 324   # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
 325   my $psi_run = PSIBLAST::Run->new(
 326     debug    => $DEBUG,
 327     args     => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3",
 328     path     => $psiblastbin,
 329     input    => $infile,
 330     output   => "$prefix.blast",
 331     matrix   => "$prefix.profile",
 332     database => $database->{$db_entry}{database},
 333   );
 334
 335   # For reduced databases, get the size of the orginal and use this as
 336   # the equivilent DB size
 337   if ( $db_entry =~ /^sp_red_/ ) {
 338     ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/;
 339     $psi_run->args(
 340       $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} )    # CC sub is below
 341     );
 342   }
 343   print "BLAST matrix: $ENV{BLASTMAT}\n" if $DEBUG;
 344   print "BLASTDB path: $ENV{BLASTDB}\n"  if $DEBUG;
 345   print $LOG "BLAST matrix: $ENV{BLASTMAT}\n" if $LOG;
 346   print $LOG "BLASTDB path: $ENV{BLASTDB}\n"  if $LOG;
 347
 348   print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
 349   print $LOG "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $LOG;
 350   $psi_run->run or die;                                                              # CC sub is from PSIBLAST::Run
 351
 352   $psi->read_file("$prefix.blast");                                                  # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
 353   system("gzip -9f $prefix.blast");
 354 } else {
 355   if   ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) }                   # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
 356   else                         { $psi->read_file($psiblast) }                        # CC ditto above
 357 }
 358
 359 #####################################################################################################
 360 # Convert the last itteration into a collection of PSISEQ objects
 361 for ( $psi->get_all ) {                                                              # CC sub is from PSIBLAST.pm
 362   my ( $id, $start, $align ) = @{$_};
 363   push @seqs,
 364     PSISEQ->new(
 365     id    => $id,
 366     start => $start,
 367     align => [ split( //, $align ) ]
 368     );
 369 }
 370
 371 #####################################################################################################
 372 ## When there are no PSI-BLAST hits generate an HMM from the query
 373 ## and run Jnet against that only.
 374 ## Accuracy won't be as good, but at least it's a prediction
 375 if ( @seqs == 0 ) {
 376   if ( $predNoHits == 0 ) {
 377     warn "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n";
 378     print ">>100% complete\n";
 379     print $LOG "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n" if $LOG;
 380     print $LOG ">>100% complete\n" if $LOG;
 381     close ($LOG) if $LOG;
 382     exit;
 383   } else {
 384     print ">>50% complete\n";
 385     warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
 386     print "Running HMMer on query...\n" if $VERBOSE;
 387     print $LOG ">>50% complete\n" if $LOG;
 388     print $LOG "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n" if $LOG;
 389     print $LOG "Running HMMer on query...\n" if $LOG;
 390
 391     # copy input query to alignment
 392     #system("cp $path $prefix.align") == 0 or croak "Error: unable to copy '$path'\n";
 393     open( my $ifh, "<", $infile )            or die "JPRED: cannot open file $infile: $!";
 394     open( my $ofh, ">", $prefix . ".align" ) or die "JPRED: cannot open file $prefix\.align: $!";
 395     while (<$ifh>) { print $ofh, $_ }
 396     close $ifh;
 397     close $ofh;
 398
 399     # Temp files required for HMMer
 400     my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
 401
 402     system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $infile");
 403     system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
 404
 405     # Read in the HMMER file
 406     my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
 407
 408     # Convert from HMMER::Profile to HMMER::Profile::Jnet
 409     my $hmmer_jnet = HMMER::Profile::Jnet->new;
 410     $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
 411     $hmmer_jnet->write_file("$prefix.hmm");    # write_file is in the Read.pm called from the HMMER::Profile module
 412     print ">>70% complete\n";
 413     print $LOG ">>70% complete\n" if $LOG;
 414
 415     # Run Jnet for the prediction
 416     print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
 417     print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
 418     my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) );
 419     print ">>90% complete\n";
 420     print $LOG ">>90% complete\n" if $LOG;
 421     print $jnetlog if $VERBOSE;
 422     print $LOG $jnetlog if $LOG;
 423   }
 424 } else {
 425   psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG;
 426   print ">>40% complete\n";
 427   print $LOG ">>40% complete\n" if $LOG;
 428
 429   #####################################################################################################
 430   # Make PSIBLAST truncated alignments the right length
 431   print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
 432   print $LOG "Untruncating the PSIBLAST alignments...\n" if $LOG;
 433   @seqs = extend( $query, @seqs );
 434   psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG;
 435
 436   #####################################################################################################
 437   # Remove masking from PSIBLAST alignment
 438   print "Unmasking the alignments...\n" if $VERBOSE;
 439   print $LOG "Unmasking the alignments...\n" if $LOG;
 440   my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found.";
 441   remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ];
 442   psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG;
 443
 444   #####################################################################################################
 445   # Convert the sequences to upper case
 446   print "Converting sequences to the same case...\n" if $VERBOSE;
 447   print $LOG "Converting sequences to the same case...\n" if $LOG;
 448   toupper($_) for @seqs;    # CC sub is below
 449   psiseq2fasta( "${prefix}_backup.fasta.gz", @seqs );
 450
 451   #####################################################################################################
 452   # Remove excessive sequences
 453   print "Remove excessive sequences...\n" if $VERBOSE;
 454   print $LOG "Remove excessive sequences...\n" if $LOG;
 455   @seqs = reduce( $MAX_ALIGN, @seqs );
 456   psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG;
 457
 458   #####################################################################################################
 459   # Remove sequences that are too long or too short
 460   print "Remove sequences which too long or short...\n" if $VERBOSE;
 461   print $LOG "Remove sequences which too long or short...\n" if $LOG;
 462   @seqs = del_long_seqs( 50, @seqs );
 463   psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG;
 464
 465   #####################################################################################################
 466   # Remove redundant sequences based upon pairwise identity and OC clustering
 467   print "Remove redundant sequences...\n" if $VERBOSE;
 468   print $LOG "Remove redundant sequences...\n" if $LOG;
 469   @seqs = nonred( $NR_CUT, @seqs );
 470   psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG;
 471
 472   #####################################################################################################
 473   # Check that we haven't got rid of all of the sequences
 474   if ( @seqs < 2 ) {
 475     warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
 476     print $LOG "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n" if $LOG;
 477     @seqs = fasta2psiseq("${prefix}_backup.fasta.gz");
 478   }
 479   unlink("${prefix}_backup.fasta.gz") unless $DEBUG;
 480
 481   # Remove gaps in the query sequence and same positions in the alignment
 482   print "Removing gaps in the query sequence...\n" if $VERBOSE;
 483   print $LOG "Removing gaps in the query sequence...\n" if $LOG;
 484   degap(@seqs);
 485   psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG;
 486
 487   # Output the alignment for the prediction
 488   print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
 489   print $LOG "Outputting cleaned-up PSI_BLAST alignment...\n" if $LOG;
 490   psiseq2fasta( "$prefix.align", @seqs );
 491
 492   #####################################################################################################
 493   # Equivilent to getpssm script
 494   print "Output the PSSM matrix from the PSI-BLAST profile...\n";
 495   print $LOG "Output the PSSM matrix from the PSI-BLAST profile...\n" if $LOG;
 496   my $pssm = PSIBLAST::PSSM->new( read_file => "$prefix.profile" );
 497   $pssm->write_file("$prefix.pssm");    # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
 498   print ">>50% complete\n";
 499   print $LOG ">>50% complete\n" if $LOG;
 500
 501   #####################################################################################################
 502   # Run HMMER on the sequences
 503   print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
 504   print $LOG "Running HMMer on sequences found by PSI-BLAST...\n" if $LOG;
 505   my $hmmer = hmmer(@seqs);             # CC sub is below
 506   $hmmer->write_file("$prefix.hmm");    # write_file is in the Read.pm called from the HMMER::Profile module
 507   print ">>70% complete\n";
 508   print $LOG ">>70% complete\n" if $LOG;
 509
 510   #####################################################################################################
 511   # Run Jnet for the prediction
 512   print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
 513   print $LOG "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $LOG;
 514   my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet pssm) );    # CC sub is below
 515   print ">>90% complete\n";
 516   print $LOG ">>90% complete\n" if $LOG;
 517   print $jnetlog if $VERBOSE;
 518   print $LOG $jnetlog if $LOG;
 519 }
 520
 521 unless ( defined $nofinal ) {
 522   my $aligfile      = $prefix . ".align";
 523   my $jnetfile      = $prefix . ".jnet";
 524   my $jnetfastafile = $prefix . ".jnet.fasta";
 525   concise2fasta( $jnetfile, $jnetfastafile );
 526   open( my $IN1, "<", $aligfile )      or die "ERROR! unable to open '$aligfile': ${!}\nDied";
 527   open( my $IN2, "<", $jnetfastafile ) or die "ERROR! unable to open '$jnetfastafile': ${!}\nDied";
 528   open( my $OUT, ">", $outfile )       or die "ERROR! unable to open '$outfile': ${!}\nDied";
 529   while (<$IN2>) { print $OUT $_; }
 530   while (<$IN1>) { print $OUT $_; }
 531   close($IN1);
 532   close($IN2);
 533   close($OUT);
 534   unlink $jnetfastafile;
 535 } else {
 536   if (defined $outfile and $prefix.".jnet" ne $outfile) {
 537     rename $prefix.".jnet", $outfile;
 538   }
 539 }
 540
 541 print ">>100% complete\n";
 542 print "Jpred Finished\n";
 543 print $LOG ">>100% complete\n" if $LOG;
 544 print $LOG "Jpred Finished\n" if $LOG;
 545 close ($LOG) if $LOG;
 546 exit;
 547
 548 #####################################################################################################
 549 # Functions
 550 #####################################################################################################
 551
 552 #####################################################################################################
 553
 554 =begin :private
 555
 556 =head2 fasta2concise ($infile, $outfile)
 557
 558 Convert a file with PSI-BLAST sequences and prediction in the fasta format into concise file
 559
 560 =cut
 561
 562 sub fasta2concise {
 563   my $infile  = shift;
 564   my $outfile = shift;
 565
 566   open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
 567   my ( $seq, @seqs, @title );
 568   while (<$IN>) {
 569     if (s/^>//) {
 570       if ($seq) {
 571         $seq =~ s/\n|\s//g;
 572         $seq =~ s/(.)/$1,/g;
 573         push @seqs, $seq;
 574         $seq = "";
 575       }
 576       chomp;
 577       push @title, $_;
 578     } else {
 579       chomp;
 580       $seq .= $_;
 581     }
 582   }
 583   $seq =~ s/\n|\s//g;
 584   $seq =~ s/(.)/$1,/g;
 585   push @seqs, $seq;
 586   close($IN);
 587
 588   if ( @title != @seqs ) {
 589     die("non matching number of titles and sequences!\n");
 590   }
 591
 592   open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
 593   foreach ( 0 .. $#title ) {
 594     print $OUT "align" . ( $_ + 1 ) . ";$title[$_]:$seqs[$_]\n";
 595   }
 596   close($OUT);
 597 }
 598
 599 =begin :private
 600
 601 =head2 concise2fasta ($infile, $outfile)
 602
 603 Convert a file with PSI-BLAST sequences and prediction in the concise format into fasta file
 604
 605 =cut
 606
 607 #####################################################################################################
 608 sub concise2fasta {
 609   my $infile  = shift;
 610   my $outfile = shift;
 611
 612   my ( @seqs, %seq, @pred, %pred );
 613
 614   my @var = (
 615     "Lupas_21",  "Lupas_14", "Lupas_28", "MULTCOIL",  "MULTCOIL_TRIMER", "MULTCOIL_DIMER", "JNETPSSM", "JNETFREQ",
 616     "JNETALIGN", "JNETHMM",  "JNETSOL5", "JNETSOL25", "JNETSOL0",        "jnetpred",       "jpred"
 617   );
 618
 619   # parse input concise file
 620   open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
 621   while (<$IN>) {
 622     unless (/^\n/) {
 623       my ( $id, $seq ) = split( ":", $_ );
 624       if ( defined $id and defined $seq ) {    # Check we have proper values
 625         $seq =~ s/,//g;
 626         chomp $seq;
 627         if ( $id =~ /;/ ) {                    # Then it's an alignment
 628           @_ = split( ";", $id );
 629           push @seqs, $_[1];
 630           $seq{ $_[1] } = $seq;
 631         }
 632         foreach my $v (@var) {
 633           if ( $v eq $id ) {
 634             push @pred, $v;
 635             $pred{$v} = $seq;
 636           }
 637         }
 638       }
 639     }
 640   }
 641   close($IN);
 642
 643   open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
 644
 645   # print out sequences found with PSI-BLAST
 646   foreach (@seqs) {
 647     $seq{$_} =~ s/(.{72})/$1\n/g;
 648     print $OUT ">$_\n$seq{$_}\n";
 649   }
 650
 651   # print out predictions
 652   foreach my $p (@pred) {
 653     $pred{$p} =~ s/[TCYWXZSI\?_]/-/g;
 654     $pred{$p} =~ s/B/E/g;
 655     $pred{$p} =~ s/G/H/g;
 656
 657     if ( $p =~ /SOL/ ) {
 658       $pred{$p} =~ s/E/B/g;
 659     }
 660     $pred{$p} =~ s/(.{72})/$1\n/g;
 661     print $OUT ">$p\n$pred{$p}\n";
 662   }
 663   close($OUT);
 664 }
 665
 666 =begin :private
 667
 668 =head2 $PSISEQ = remove_seq_masks($PSISEQ)
 669
 670 Returns a PSISEQ object which has any masked residues replaced by their entry from the unfiltered database.
 671 Does it in place.
 672
 673 =cut
 674
 675 #####################################################################################################
 676 sub remove_seq_masks {
 677   my $idx = shift;
 678   my $seq = shift;
 679
 680   # Only bother if we have a sequence which has been masked
 681   return unless grep { uc $_ eq 'X' } @{ $seq->align };
 682
 683   my ( $id, $start, @align ) = ( $seq->id, $seq->start, @{ $seq->align } );
 684   my $searchdb = $database->{$db_entry}{unfiltered};
 685   $start--;    # We need this to be zero based
 686
 687   my $f    = $idx->get_sequence( $id, $searchdb ) or croak "JPRED: No such entry for $id in $searchdb\n" and return;
 688   my @seqs = $f->get_entries;
 689   my $db   = shift @seqs;
 690
 691   unless ($db) {
 692     confess "JPRED: remove_seq_masks: Accession number $id returned no sequences";
 693   } elsif (@seqs) {
 694     confess "JPRED: remove_seq_masks: Accession number $id returned more than one sequence";
 695   }
 696
 697   my @db_seq = @{ $db->seq };
 698
 699   # $i is a pointer into the original, ungapped, unmasked sequence. $_ is for the aligned sequence
 700   my $i = 0;
 701   for ( 0 .. $#align ) {
 702     next if $align[$_] =~ /$GAP/;
 703
 704     if ( $align[$_] eq 'X' ) {
 705       unless ( defined $db_seq[ $i + $start ] ) {
 706         croak "JPRED: remove_seq_masks: the database sequence is not as long as the query sequence!";
 707       }
 708       $align[$_] = $db_seq[ $i + $start ];
 709     }
 710     $i++;
 711   }
 712   $seq->align( \@align );
 713 }
 714
 715 =head2 @PSISEQS_OUT = reduce ($max, @PSISEQS_IN);
 716
 717 Reduces @PSISEQS_IN to at most $max sequences by taking every n sequences in @PSISEQ_IN.
 718
 719 Equivilent to reduce script.
 720
 721 =cut
 722
 723 #####################################################################################################
 724 sub reduce {
 725   my ( $max, @seqs ) = @_;
 726
 727   my $num = int( @seqs / $max );
 728   my @res = shift @seqs;
 729
 730   # Don't bother if we have less than the required sequences
 731   if ( @seqs < $max or 0 == $num ) {
 732     return @res, @seqs;
 733   }
 734
 735   for ( 0 .. $#seqs ) {
 736     push @res, $seqs[$_] if ( 0 == $_ % $num );
 737   }
 738
 739   return @res;
 740 }
 741
 742 =head2 nonred($cutoff, @PSISEQS);
 743
 744 Removes redundant sequences at $cutoff level of sequence identity.
 745
 746 Equivilent to nonred script.
 747
 748 =cut
 749
 750 #####################################################################################################
 751 sub nonred {
 752   my ( $cutoff, @seqs ) = @_;
 753
 754   # Run pairwise and then OC and remove similar sequences
 755   unless ( defined $cutoff ) { croak "JPRED: nonred() not passed a cutoff" }
 756   unless (@seqs)             { croak "JPRED: No sequences passed to nonred()" }
 757   if     ( @seqs == 1 ) {
 758     warn "JPRED: Only one sequence passed to nonred(). Not reducing sequence redundancy\n";
 759     return @seqs;
 760   }
 761
 762   # Convert the sequences into a FASTA object, which is what pairwise expects (simpler version).
 763   my $fasta = FASTA::File->new;
 764   foreach (@seqs) {
 765     my $f = FASTA->new( id => $_->id );
 766     $f->seq( @{ $_->align } );
 767     $fasta->add_entries($f);
 768   }
 769
 770   # Run pairwise
 771   my $pair = Pairwise->new( path => $pairwise );
 772   my $foo = join "\n", $pair->run($fasta) or die $!;
 773
 774   # Run OC
 775   my $ocres = OC->new( path => "$oc sim complete cut $cutoff" );
 776   $ocres->read_string( join "", $ocres->run($foo) );
 777
 778   # Now found those entries that are unrelated
 779   my @keep;
 780   for ( $ocres->get_groups ) {
 781     my ( $group, $score, $size, @labels ) = @{$_};
 782
 783     # Keep the QUERY from the cluster, otherwise just get onelt
 784     if ( grep { /QUERY/ } @labels ) {
 785
 786       # Make sure the query stays at the start
 787       unshift @keep, "QUERY";
 788       next;
 789     } else {
 790       push @keep, shift @labels;
 791     }
 792   }
 793   push @keep, $ocres->get_unclustered;
 794
 795   # Return the matching entries.
 796   # We use the temporay to mimic the selection of sequences in the nonred
 797   # script, which took the sequences in the order they occured in the file.
 798   my @filtered_res;
 799   for (@seqs) {
 800     my $label = $_->id;
 801     if ( grep { $_ =~ /^$label$/ } @keep ) {
 802       push @filtered_res, $_;
 803     }
 804   }
 805
 806   return @filtered_res;
 807 }
 808
 809 =head2 @seqs = del_long_seqs($percentage, @PSISEQS)
 810
 811 Removes those sequences that are more than $percentage longer or shorter than the first
 812 sequence in @seqs. @seqs is an array of PSISEQ objects.
 813
 814 Equivilent to trim_seqs script.
 815
 816 =cut
 817
 818 #####################################################################################################
 819 sub del_long_seqs {
 820   my ( $factor, @seqs ) = @_;
 821
 822   # Keep the query sequence (we assume query is the 1st FASTA record)
 823   my $query = shift @seqs;
 824   my @res   = $query;
 825
 826   # Find the length of the query sequence without any gaps
 827   my $length = ( () = ( join '', @{ $query->align } ) =~ /[^$GAP]/g );
 828
 829   # Calculate the upper and lower allowed bounds
 830   my ( $upper, $lower );
 831   {
 832     my $bounds = ( $length / 100 ) * $factor;
 833     ( $upper, $lower ) = ( $length + $bounds, $length - $bounds );
 834   }
 835
 836   # Now try each sequence and see how long they are
 837   for (@seqs) {
 838     my $l = ( () = ( join '', @{ $_->align } ) =~ /[^$GAP]/g );
 839     if ( $l >= $lower && $l <= $upper ) { push @res, $_ }
 840   }
 841
 842   return @res;
 843 }
 844
 845 =head2 toupper ($PSISEQ)
 846
 847 Converts sequence held in PSISEQ object to uppercase.
 848
 849 =cut
 850
 851 #####################################################################################################
 852 sub toupper {
 853   $_[0]->align( [ split //, uc join '', @{ $_[0]->align } ] );
 854 }
 855
 856 =head2 degap($PSISEQ_QUERY, @PSISEQS)
 857
 858 Removes gaps in the query sequence ($PSISEQ_QUERY) and corresponding positions in @PSISEQS. Change made in place.
 859
 860 Equivilent to fasta2jnet script.
 861
 862 =cut
 863
 864 #####################################################################################################
 865 sub degap {
 866   my (@seqs) = @_;
 867
 868   # Find where the gaps in the query sequence are
 869   my @gaps;
 870   my $i = 0;
 871   for ( @{ $seqs[0]->align } ) {
 872     push @gaps, $i if $_ !~ /$GAP/;
 873     $i++;
 874   }
 875
 876   return unless @gaps;
 877
 878   # Remove the gaps in all the sequences.
 879   # Derefences the array reference and takes a slice inside the method call
 880   # arguments, then coerce back to an array ref.
 881   for (@seqs) {
 882     $_->align( [ @{ $_->align }[@gaps] ] );
 883   }
 884 }
 885
 886 =head2 getfreq($filename, @PSISEQS)
 887
 888 Creates a PSIBLAST like profile and writes it to the $filename.
 889
 890 Equivilant to profile or getfreq (older) script. This doesn't create the same answer as
 891 the old jpred, as it got the numbers from PSIBLAST (v2.0.3), whereas this a *similar* output,
 892 but it's *not* the same.
 893
 894 =cut
 895
 896 #####################################################################################################
 897 sub getfreq {
 898   my ( $fn, @seqs ) = @_;
 899
 900   #croak "JPRED: Passed non PSISEQ object" if grep { !isa( $_, 'PSISEQ' ) } @seqs;
 901
 902   # Create a PSIBLAST like profile from the alignment
 903   # This doesn't greate the same result as old Jpred, I think this is because
 904   # differences in the input between old and new
 905
 906   # For testing
 907   #   my $f = FASTA::File->new(read_file => "/homes/jon/jnet/test/1bk4/1bk4.align.fa");
 908   #   my @profile = profile(
 909   #      map { join "", @{ $_->seq } } $f->get_entries
 910   #   );
 911
 912   my @profile = profile( map { join "", @{ $_->align } } @seqs );
 913
 914   open my $fh, ">$fn" or die "JPRED: $fn: $!";
 915   print $fh join( " ", @{$_} ), "\n" for @profile;
 916 }
 917
 918 =head2 HMMER::Profile::Jnet = hmmer(@PSISEQS)
 919
 920 Creates a HMMER profile from the aligned sequences in @PSISEQS. Returns a HMMER::Profile::Jnet object.
 921
 922 Equivilent to gethmm script.
 923
 924 =cut
 925
 926 #####################################################################################################
 927 sub hmmer {
 928   my (@seqs) = @_;
 929
 930   # Temp files required
 931   my ( $hmmbuild_out, $hmmconvert_out, $tmp_fasta ) = map { File::Temp->new->filename } 1 .. 3;
 932
 933   # Create the FASTA file
 934   psiseq2fasta( $tmp_fasta, @seqs );
 935
 936   # Run HMMer
 937   system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $tmp_fasta");
 938   system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
 939
 940   # Read in the HMMER file
 941   my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
 942
 943   # Convert from HMMER::Profile to HMMER::Profile::Jnet
 944   my $hmmer_jnet = HMMER::Profile::Jnet->new;
 945   $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
 946
 947   return $hmmer_jnet;
 948 }
 949
 950 =head2 jnet($hmmer, $out, $pssm )
 951
 952 Runs Jnet. Pass the paths of the filenames to do the prediction on
 953
 954 =cut
 955
 956 #####################################################################################################
 957 sub jnet {
 958   my ( $hmmer, $outfile, $pssm ) = @_;
 959   if ($pssm) {
 960
 961     # run Jnet prediction with all the profile data
 962     system("$jnet --concise --hmmer $hmmer --pssm $pssm > $outfile");
 963   } else {
 964
 965     # run Jnet prediction with only HMMer and alignment data
 966     system("$jnet --concise --hmmer $hmmer > $outfile");
 967   }
 968   check( "jnet", $? ) or exit 1;
 969   my $res = "";
 970   my $logging = "";
 971   open( my $IN, "<", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
 972   while (<$IN>) {
 973     if (/^jnetpred:|^JNET[A-Z0-9]+:/) {
 974       $res .= $_;
 975     } else {
 976       $logging .= $_;
 977     }
 978   }
 979   close $IN;
 980   open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
 981   print $OUT "\n".$res;
 982   close $OUT;
 983   return $logging;
 984 }
 985
 986 =head1 psiseq2fasta($path, @PSISEQ)
 987
 988 Writes a FASTA file given an array of PSISEQ objects.
 989
 990 =cut
 991
 992 #####################################################################################################
 993 sub psiseq2fasta {
 994   my ( $fn, @seqs ) = @_;
 995
 996   #confess "JPRED: Not passed filename" if isa $fn, 'PSISEQ';
 997   confess "JPRED: not passed PSISEQs" unless @seqs;
 998
 999   #confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
1000
1001   # Changed from using FASTA::File object due to seg faulting on some large,
1002   # long alignments, presumably due to running out of memory.
1003   #   my $fasta = FASTA::File->new;
1004   #   $fasta->add_entries(
1005   #      map {
1006   #         my $f = FASTA->new(id => $_->id);
1007   #         $f->seq(@{$_->align});
1008   #         $f;
1009   #      } @seqs
1010   #   );
1011   #   $fasta->write_file($fn);
1012
1013   my $fh;
1014   $fn =~ /\.gz$/
1015     ? ( open $fh, "| gzip -9 > $fn" or die "JPRED: $!" )
1016     : ( open $fh, ">$fn" or die "JPRED: $!" );
1017
1018   for (@seqs) {
1019     print $fh ">", $_->id, "\n";
1020     my $seq = join( "", @{ $_->align } );
1021     $seq =~ s/(.{72})/$1\n/g;
1022     chomp $seq;
1023     print $fh $seq . "\n";
1024   }
1025   close $fh;
1026 }
1027
1028 =head1 @PSISEQ = fasta2psiseq($path)
1029
1030 Reads in a FASTA file and returns an array of PSISEQ objects.
1031
1032 =cut
1033
1034 #####################################################################################################
1035 sub fasta2psiseq {
1036   my ($fn) = @_;
1037
1038   my $fasta =
1039     $fn =~ /\.gz$/
1040     ? FASTA::File->new( read_gzip_file => $fn )
1041     : FASTA::File->new( read_file      => $fn );
1042   $fasta or die "JPRED: $!";
1043
1044   my @seqs;
1045   for ( $fasta->get_entries ) {
1046     my $seq = PSISEQ->new;
1047     $seq->id( $_->id );
1048     $seq->align( [ @{ $_->seq } ] );
1049     push @seqs, $seq;
1050   }
1051
1052   return @seqs;
1053 }
1054
1055 #####################################################################################################
1056 sub fasta2psiseq_lowmemory {
1057   my $filename = shift;
1058
1059   local $/ = "\n";
1060   my $fh;
1061   $filename =~ /\.gz$/
1062     ? ( open $fh, "gzip -dc $filename|" or die $! )
1063     : ( open $fh, $filename or die $! );
1064
1065   my @seqs;
1066   my ( $id, $seq );
1067   while (<$fh>) {
1068     chomp;
1069     if (s/^>//) {
1070       $seq =~ s/ //g if defined $seq;
1071
1072       if ( $id and $seq ) {
1073         my $psi = PSISEQ->new( id => $id );
1074         $psi->align( [ split //, $seq ] );
1075         push @seqs, $psi;
1076         undef $id;
1077         undef $seq;
1078       } else {
1079         $id = $_;
1080       }
1081     } else {
1082       $seq .= $_;
1083     }
1084   }
1085   if ( $id and $seq ) {
1086     my $psi = PSISEQ->new( id => $id );
1087     $psi->seq($seq);
1088     push @seqs, $psi;
1089   }
1090
1091   return @seqs;
1092 }
1093
1094 =head1 extend($FASTA::File, @PSISEQ)
1095
1096 Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues
1097 against the query, this function extends the alignment so that all of the query is
1098 present in the alignment. The other sequences are extended with gap characters.
1099
1100 =cut
1101
1102 #####################################################################################################
1103 sub extend {
1104   my ( $query, @seqs ) = @_;
1105
1106   #croak "JPRED: Not a Sequence::File" if grep { not isa $_, 'Sequence::File' } $query;
1107   #croak "JPRED: Not PSISEQ"           if grep { not isa $_, 'PSISEQ' } @seqs;
1108
1109   # Get the query sequence
1110   my $q_query = $query->get_entry(0);
1111
1112   # Get the query sequence from the PSIBLAST alignment
1113   my $a_query = shift @seqs;
1114
1115   # The gap size required to expand the alignment
1116   my ( $start_gap_length, $end_gap_length );
1117   {
1118
1119     # Make handling the sequence easier
1120     my $q_seq = join "", @{ [ $q_query->seq ] };
1121     my $a_seq = join "", @{ $a_query->align };
1122
1123     $q_seq =~ s/-//g;
1124     $a_seq =~ s/-//g;
1125
1126     ( $q_seq, $a_seq ) = map { uc $_ } $q_seq, $a_seq;
1127
1128     $start_gap_length = index( $q_seq, $a_seq );
1129     croak "JPRED: query sequence from alignment not found in original query sequence" if $start_gap_length == -1;
1130
1131     $end_gap_length = length($q_seq) - length($a_seq) - $start_gap_length;
1132   }
1133
1134   # Add the gaps to the end of the alignments
1135   for (@seqs) {
1136     $_->align( [ ($GAP) x $start_gap_length, @{ $_->align }, ($GAP) x $end_gap_length ] );
1137   }
1138
1139   # Put the query sequence back
1140   unshift @seqs,
1141     PSISEQ->new(
1142     id    => $a_query->id,
1143     align => [
1144       ( $start_gap_length ? @{ $q_query->seq }[ 0 .. ( $start_gap_length - 1 ) ] : () ),
1145       @{ $a_query->align },
1146       ( $end_gap_length ? @{ $q_query->seq }[ -$end_gap_length .. -1 ] : () ),
1147     ]
1148     );
1149
1150   return @seqs;
1151 }
1152
1153 =head1 $int = fasta_seq_length($path)
1154
1155 Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue.
1156
1157 =cut
1158
1159 #####################################################################################################
1160 sub fasta_seq_length {
1161   my ($fn) = @_;
1162   open my $fh, $fn or croak "Can't open file $fn: $!\n";
1163
1164   my $length = 0;
1165   local $_, $/ = "\n";
1166   while (<$fh>) {
1167     next if /^>/;
1168     chomp;
1169     $length += tr///c;
1170   }
1171
1172   return $length;
1173 }
1174
1175 =head1 log(@messages)
1176
1177 Report messages to STDERR
1178
1179 =cut