binaries/src/jpred/jpred.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 jpred - Secondary structure prediction program
   6
   7 =head1 SYNOPSIS
   8
   9 ./jpred.pl -in/-sequence <FILE1> [-outfile <FILE2>] [-logfile <FILE3>] [-output <FILEPREFIX>] [-dbname <DBNAME>] [-dbpath <PATH>] [-ncpu NNN] [-psi <psiblast output>] [-pred-nohits] [-no-final] [-jabaws] [-verbose] [-debug] [-help] [-man]
  10
  11 =head1 DESCRIPTION
  12
  13 This is a program which predicts the secondary structure of a protein sequence given a path to a
  14 FASTA sequence. It does all the PSI-BLAST searching and alignment 'meddling' as required by Jnet.
  15
  16 The program is primarily design for use by the Jpred server, but it can be used directly by any user.
  17 Some of the output may not be meaningful if used directly - it can be safely ignored.
  18
  19 =head1 OPTIONS
  20
  21 =over 5
  22
  23 =item -in/-sequence <FILE1>
  24
  25 The path to the sequence file (in FASTA format) you want to predict.
  26
  27 =item -output <FILEPREFIX>
  28
  29 A prefix to the filenames created by Jpred, defaults to the value set by -sequence/-in.
  30
  31 =item -outfile <FILE2>
  32
  33 An output file, by default it is undefined and the -output option is working
  34
  35 =item -logfile <FILE3>
  36
  37 Logging file, by default it is undefined and logging switched off
  38
  39 =item -dbpath PATH
  40
  41 Path to the uniref database used for PSI-BLAST querying. default value: .
  42
  43 =item -dbname database
  44
  45 Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code.
  46 Default value: uniref90
  47
  48 =item -psi path
  49
  50 Path to a PSIBLAST output file.
  51
  52 =item -ncpu NNN
  53
  54 Number of CPU used by jpred.pl. Maximum value is 8
  55 Default: NNN = 1
  56
  57 =item -pred-nohits
  58
  59 Toggle allowing Jpred to make predictions even when there are no PSI-BLAST hits.
  60
  61 =item -no-final
  62
  63 By default jpred generates a fasts file with sequences found with PSI-BLAST and predicted sequence features
  64 The option untoggles this step and stops jpred at the concise file generation
  65
  66 =item -jabaws
  67
  68 Redefined basic variable in order to use the script within JABAWS
  69
  70 =item -verbose
  71
  72 Verbose mode. Print more information while running jpred.
  73
  74 =item -debug
  75
  76 Debug mode. Print debugging information while running jpred.
  77
  78 =item -help
  79
  80 Gives help on the programs usage.
  81
  82 =item -man
  83
  84 Displays this man page.
  85
  86 =back
  87
  88 =head1 BUGS
  89
  90 Can't cope with gaps in the query sequence.
  91
  92 Doesn't accept alignments.
  93
  94 If you find any others please contact authors.
  95
  96 =head1 AUTHORS
  97
  98 Jonathan Barber <jon@compbio.dundee.ac.uk>
  99 Chris Cole <christian@cole.name>
 100 Alexander Sherstnev <a.sherstnev@dundee.ac.uk> (current maintainer)
 101
 102 =cut
 103
 104 ## TODO check for gaps in query sequence
 105 ##      check for disallowed chars in sequence
 106
 107 use strict;
 108 use warnings;
 109 use Getopt::Long;
 110 use Pod::Usage;
 111 use Carp;
 112 use Data::Dumper;
 113 use File::Temp;
 114
 115 #use UNIVERSAL;
 116
 117 # path to Jpred modules
 118 use FindBin qw($Bin);
 119 use lib "$Bin/lib";
 120
 121 # internal jpred modules
 122 use Jpred;
 123 use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin check_OS setup_env);
 124 use HMMER::Profile;
 125 use HMMER::Profile::Jnet;
 126 use PSIBLAST;
 127 use PSIBLAST::PSSM;
 128 use PSIBLAST::Run;
 129 use FASTA::File;
 130 use FASTA;
 131 use Index;
 132 use Pairwise;
 133 use OC;
 134 use Utils qw(profile);
 135 use Run qw(check);
 136
 137 our $VERSION = '3.0.2';
 138 my $MAX_ALIGN = 10000;    # maximum number of alignment sequences allowed
 139 my $NR_CUT    = 75;       # cut-off seqeunce ID value for defining non-redundancy
 140
 141 # Gap characters in sequences
 142 our $GAP = '-';
 143
 144 #####################################################################################################
 145 # Light object to hold sequence information, light to prevent memory overload
 146 # for big search results. Even so this is quite memory intensive, I can't see
 147 # how to reduce the size further.
 148 {
 149
 150   package PSISEQ;
 151
 152   sub new {
 153     my ( $class, %args ) = @_;
 154     my $self = [];
 155     bless $self, $class;
 156     for ( keys %args ) { $self->$_( $args{$_} ) }
 157     return $self;
 158   }
 159
 160   sub id {
 161     return defined $_[1]
 162       ? $_[0]->[0] = $_[1]
 163       : $_[0]->[0];
 164   }
 165
 166   sub start {
 167     return defined $_[1]
 168       ? $_[0]->[1] = $_[1]
 169       : $_[0]->[1];
 170   }
 171
 172   sub align {
 173     if ( defined $_[1] ) {
 174       ref $_[1] eq 'ARRAY' or die "Not passed ARRAY ref";
 175
 176       $_[0]->[2] = join( "", @{ $_[1] } );
 177     } else {
 178       return [ split( //, $_[0]->[2] ) ];
 179     }
 180   }
 181 }
 182
 183 #####################################################################################################
 184 my $infile;
 185 my $outfile;
 186 my $prefix;
 187 my $psiblast;
 188 my $jabaws;
 189 my $logfile;
 190 my $ncpu       = 1;           # number of CPUs for psiblast calculations
 191 my $predNoHits = 0;           # define whether to make predictions when no PSI-BLAST hits are found
 192 my $db_path    = "/homes/www-jpred/databases";
 193 my $db_entry   = "cluster";
 194 my $nofinal;
 195
 196 my ( $help, $man, $DEBUG, $VERBOSE );
 197
 198 GetOptions(
 199   "in|sequence=s" => \$infile,
 200   "output=s"      => \$prefix,
 201   "outfile=s"     => \$outfile,
 202   "logfile=s"     => \$logfile,
 203   "psi=s"         => \$psiblast,
 204   "dbname=s"      => \$db_entry,
 205   "dbpath=s"      => \$db_path,
 206   "ncpu=s"        => \$ncpu,
 207   "pred-nohits"   => \$predNoHits,
 208   "no-final"      => \$nofinal,
 209   "jabaws"        => \$jabaws,
 210   "help"          => \$help,
 211   "man"           => \$man,
 212   "debug"         => \$DEBUG,
 213   "verbose"       => \$VERBOSE
 214 ) or pod2usage(2);
 215 pod2usage(1) if $help;
 216 pod2usage( verbose => 2 ) if $man;
 217
 218 #####################################################################################################
 219 # Key to database information and information for accessing them
 220 if (defined $jabaws) {
 221   $db_path    = "/data/UNIREFdb";
 222   $db_entry   = "cluster";
 223 }
 224
 225 my $database = {
 226
 227   ## default database used by Jpred
 228   uniref90 => {
 229     database   => $db_path . "/uniref90.filt",
 230     unfiltered => $db_path . "/uniref90",
 231   },
 232
 233   ## database used during Jnet training
 234   training => {
 235     database   => $db_path . "/training/uniref90.filt",
 236     unfiltered => $db_path . "/training/uniref90",
 237   },
 238
 239   ## database used for ported Jpred
 240   ported_db => {
 241     database   => $db_path . "/uniref90.filt",
 242     unfiltered => $db_path . "/uniref90",
 243   },
 244
 245   ## cluster-specific path for Jpred
 246   cluster => {
 247     database   => $db_path . "/uniref90.filt",
 248     unfiltered => $db_path . "/uniref90",
 249   },
 250
 251   ## these other DBs are experimental ones used during development.
 252   ## check they exist before using them.
 253   # generic entry for use with validate_jnet.pl
 254   # real db location defined by validate_jnet.pl
 255   swall => {
 256     database   => $db_path . "/swall/swall.filt",
 257     unfiltered => $db_path . "/swall/swall",
 258   },
 259
 260   # Path to PSIBLAST db
 261   uniprot => {
 262     database   => $db_path . "/3/swall.filt",
 263     unfiltered => $db_path . "/3/swall",
 264   },
 265
 266   uniref50 => {
 267     database   => $db_path . "/6/swall.filt",
 268     unfiltered => $db_path . "/6/swall",
 269   },
 270 };
 271
 272 my $dp = $database->{$db_entry}{database};
 273 pod2usage("ERROR! Input file with a sequence should be provided with -sequence/-in")                              unless $infile;
 274 pod2usage("ERROR! Unknown database at $db_path. Use -dbpath and -dbname for configuring the database" )           unless exists $database->{$db_entry};
 275 pod2usage("ERROR! UNIREF database is not available at $dp. Use -dbpath and -dbname for configuring the database") unless -f $dp;
 276
 277 if ( defined $prefix ) {
 278   unless (defined $outfile) {
 279     $outfile = $prefix. ".res.fasta";
 280   }
 281 } else {
 282   if (defined $outfile) {
 283     print "WARNING! file prefix is not defined. Jpred will use $outfile.tmp as the prefix\n";
 284     $prefix = $outfile . ".tmp";
 285   } else {
 286     print "WARNING! file prefix is not defined. Jpred will use $infile.res as the prefix\n";
 287     $prefix = $infile . ".res";
 288     $outfile = $prefix. ".jnet";
 289   }
 290 }
 291
 292 if ( 1 > $ncpu or 8 < $ncpu ) {
 293   print "WARNING! the nuber of CPUs should be between 1 and 8. Jpred will use 1 CPU\n";
 294   $ncpu = 1;
 295 }
 296
 297 $VERBOSE = 1 if $DEBUG;
 298
 299 my $LOG;
 300 if (defined $logfile) {
 301   open( $LOG, ">", $logfile ) or die "ERROR! unable to open '$logfile': ${!}\nDied";
 302 }
 303
 304 for ( [ "input file", $infile ], [ "out file", $outfile ], [ "outprefix", $prefix ], [ "psi", $psiblast ], [ "db name", $db_entry ], [ "db path", $db_path ] ) {
 305   my ( $key, $value ) = @{$_};
 306   defined $value or next;
 307   my $par = join( ": ", $key, $value );
 308   print "$par\n" if $DEBUG;
 309   print $LOG "$par\n" if $LOG;
 310 }
 311
 312 #####################################################################################################
 313 if (defined $jabaws) {
 314   setup_jpred_env($Bin);
 315   setup_env($Bin);
 316 }
 317 my $platform = check_OS();
 318 print "JPRED: checking platiform... $platform\n" if $DEBUG;
 319 print $LOG "JPRED: checking platiform... $platform\n" if $LOG;
 320
 321 #####################################################################################################
 322 my $query = FASTA::File->new( read_file => $infile );
 323
 324 my @seqs;
 325 my $psi = PSIBLAST->new;
 326 unless ( defined $psiblast && -e $psiblast ) {
 327   ## potentially a better set of parameters (esp. -b & -v) which avoid PSI-BLAST dying with large number of hits
 328   # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"'  # 'soft' filtering of the query sequence
 329   # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3'        # stricter searching criteria
 330   # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
 331   my $psi_run = PSIBLAST::Run->new(
 332     debug    => $DEBUG,
 333     args     => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3",
 334     path     => $psiblastbin,
 335     input    => $infile,
 336     output   => "$prefix.blast",
 337     matrix   => "$prefix.profile",
 338     database => $database->{$db_entry}{database},
 339   );
 340
 341   # For reduced databases, get the size of the orginal and use this as
 342   # the equivilent DB size
 343   if ( $db_entry =~ /^sp_red_/ ) {
 344     ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/;
 345     $psi_run->args(
 346       $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} )    # CC sub is below
 347     );
 348   }
 349   print "BLAST matrix: $ENV{BLASTMAT}\n" if $DEBUG;
 350   print "BLASTDB path: $ENV{BLASTDB}\n"  if $DEBUG;
 351   print $LOG "BLAST matrix: $ENV{BLASTMAT}\n" if $LOG;
 352   print $LOG "BLASTDB path: $ENV{BLASTDB}\n"  if $LOG;
 353
 354   print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
 355   print $LOG "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $LOG;
 356   $psi_run->run or die;                                                              # CC sub is from PSIBLAST::Run
 357
 358   $psi->read_file("$prefix.blast");                                                  # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
 359   system("gzip -9f $prefix.blast");
 360 } else {
 361   if   ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) }                   # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
 362   else                         { $psi->read_file($psiblast) }                        # CC ditto above
 363 }
 364
 365 #####################################################################################################
 366 # Convert the last itteration into a collection of PSISEQ objects
 367 for ( $psi->get_all ) {                                                              # CC sub is from PSIBLAST.pm
 368   my ( $id, $start, $align ) = @{$_};
 369   push @seqs,
 370     PSISEQ->new(
 371     id    => $id,
 372     start => $start,
 373     align => [ split( //, $align ) ]
 374     );
 375 }
 376
 377 #####################################################################################################
 378 ## When there are no PSI-BLAST hits generate an HMM from the query
 379 ## and run Jnet against that only.
 380 ## Accuracy won't be as good, but at least it's a prediction
 381 if ( @seqs == 0 ) {
 382   if ( $predNoHits == 0 ) {
 383     warn "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n";
 384     print ">>100% complete\n";
 385     print $LOG "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n" if $LOG;
 386     print $LOG ">>100% complete\n" if $LOG;
 387     close ($LOG) if $LOG;
 388     exit;
 389   } else {
 390     print ">>50% complete\n";
 391     warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
 392     print "Running HMMer on query...\n" if $VERBOSE;
 393     print $LOG ">>50% complete\n" if $LOG;
 394     print $LOG "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n" if $LOG;
 395     print $LOG "Running HMMer on query...\n" if $LOG;
 396
 397     # copy input query to alignment
 398     #system("cp $path $prefix.align") == 0 or croak "Error: unable to copy '$path'\n";
 399     open( my $ifh, "<", $infile )            or die "JPRED: cannot open file $infile: $!";
 400     open( my $ofh, ">", $prefix . ".align" ) or die "JPRED: cannot open file $prefix\.align: $!";
 401     while (<$ifh>) { print $ofh, $_ }
 402     close $ifh;
 403     close $ofh;
 404
 405     # Temp files required for HMMer
 406     my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
 407
 408     system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $infile");
 409     system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
 410
 411     # Read in the HMMER file
 412     my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
 413
 414     # Convert from HMMER::Profile to HMMER::Profile::Jnet
 415     my $hmmer_jnet = HMMER::Profile::Jnet->new;
 416     $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
 417     $hmmer_jnet->write_file("$prefix.hmm");    # write_file is in the Read.pm called from the HMMER::Profile module
 418     print ">>70% complete\n";
 419     print $LOG ">>70% complete\n" if $LOG;
 420
 421     # Run Jnet for the prediction
 422     print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
 423     print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
 424     my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) );
 425     print ">>90% complete\n";
 426     print $LOG ">>90% complete\n" if $LOG;
 427     print $jnetlog if $VERBOSE;
 428     print $LOG $jnetlog if $LOG;
 429   }
 430 } else {
 431   psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG;
 432   print ">>40% complete\n";
 433   print $LOG ">>40% complete\n" if $LOG;
 434
 435   #####################################################################################################
 436   # Make PSIBLAST truncated alignments the right length
 437   print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
 438   print $LOG "Untruncating the PSIBLAST alignments...\n" if $LOG;
 439   @seqs = extend( $query, @seqs );
 440   psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG;
 441
 442   #####################################################################################################
 443   # Remove masking from PSIBLAST alignment
 444   print "Unmasking the alignments...\n" if $VERBOSE;
 445   print $LOG "Unmasking the alignments...\n" if $LOG;
 446   my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found.";
 447   remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ];
 448   psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG;
 449
 450   #####################################################################################################
 451   # Convert the sequences to upper case
 452   print "Converting sequences to the same case...\n" if $VERBOSE;
 453   print $LOG "Converting sequences to the same case...\n" if $LOG;
 454   toupper($_) for @seqs;    # CC sub is below
 455   psiseq2fasta( "${prefix}_backup.fasta.gz", @seqs );
 456
 457   #####################################################################################################
 458   # Remove excessive sequences
 459   print "Remove excessive sequences...\n" if $VERBOSE;
 460   print $LOG "Remove excessive sequences...\n" if $LOG;
 461   @seqs = reduce( $MAX_ALIGN, @seqs );
 462   psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG;
 463
 464   #####################################################################################################
 465   # Remove sequences that are too long or too short
 466   print "Remove sequences which too long or short...\n" if $VERBOSE;
 467   print $LOG "Remove sequences which too long or short...\n" if $LOG;
 468   @seqs = del_long_seqs( 50, @seqs );
 469   psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG;
 470
 471   #####################################################################################################
 472   # Remove redundant sequences based upon pairwise identity and OC clustering
 473   print "Remove redundant sequences...\n" if $VERBOSE;
 474   print $LOG "Remove redundant sequences...\n" if $LOG;
 475   @seqs = nonred( $NR_CUT, @seqs );
 476   psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG;
 477
 478   #####################################################################################################
 479   # Check that we haven't got rid of all of the sequences
 480   if ( @seqs < 2 ) {
 481     warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
 482     print $LOG "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n" if $LOG;
 483     @seqs = fasta2psiseq("${prefix}_backup.fasta.gz");
 484   }
 485   unlink("${prefix}_backup.fasta.gz") unless $DEBUG;
 486
 487   # Remove gaps in the query sequence and same positions in the alignment
 488   print "Removing gaps in the query sequence...\n" if $VERBOSE;
 489   print $LOG "Removing gaps in the query sequence...\n" if $LOG;
 490   degap(@seqs);
 491   psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG;
 492
 493   # Output the alignment for the prediction
 494   print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
 495   print $LOG "Outputting cleaned-up PSI_BLAST alignment...\n" if $LOG;
 496   psiseq2fasta( "$prefix.align", @seqs );
 497
 498   #####################################################################################################
 499   # Equivilent to getpssm script
 500   print "Output the PSSM matrix from the PSI-BLAST profile...\n";
 501   print $LOG "Output the PSSM matrix from the PSI-BLAST profile...\n" if $LOG;
 502   my $pssm = PSIBLAST::PSSM->new( read_file => "$prefix.profile" );
 503   $pssm->write_file("$prefix.pssm");    # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
 504   print ">>50% complete\n";
 505   print $LOG ">>50% complete\n" if $LOG;
 506
 507   #####################################################################################################
 508   # Run HMMER on the sequences
 509   print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
 510   print $LOG "Running HMMer on sequences found by PSI-BLAST...\n" if $LOG;
 511   my $hmmer = hmmer(@seqs);             # CC sub is below
 512   $hmmer->write_file("$prefix.hmm");    # write_file is in the Read.pm called from the HMMER::Profile module
 513   print ">>70% complete\n";
 514   print $LOG ">>70% complete\n" if $LOG;
 515
 516   #####################################################################################################
 517   # Run Jnet for the prediction
 518   print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
 519   print $LOG "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $LOG;
 520   my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet pssm) );    # CC sub is below
 521   print ">>90% complete\n";
 522   print $LOG ">>90% complete\n" if $LOG;
 523   print $jnetlog if $VERBOSE;
 524   print $LOG $jnetlog if $LOG;
 525 }
 526
 527 unless ( defined $nofinal ) {
 528   my $aligfile      = $prefix . ".align";
 529   my $jnetfile      = $prefix . ".jnet";
 530   my $jnetfastafile = $prefix . ".jnet.fasta";
 531   concise2fasta( $jnetfile, $jnetfastafile );
 532   open( my $IN1, "<", $aligfile )      or die "ERROR! unable to open '$aligfile': ${!}\nDied";
 533   open( my $IN2, "<", $jnetfastafile ) or die "ERROR! unable to open '$jnetfastafile': ${!}\nDied";
 534   open( my $OUT, ">", $outfile )       or die "ERROR! unable to open '$outfile': ${!}\nDied";
 535   while (<$IN2>) { print $OUT $_; }
 536   while (<$IN1>) { print $OUT $_; }
 537   close($IN1);
 538   close($IN2);
 539   close($OUT);
 540   unlink $jnetfastafile;
 541 } else {
 542   if (defined $outfile and $prefix.".jnet" ne $outfile) {
 543     rename $prefix.".jnet", $outfile;
 544   }
 545 }
 546
 547 print ">>100% complete\n";
 548 print "Jpred Finished\n";
 549 print $LOG ">>100% complete\n" if $LOG;
 550 print $LOG "Jpred Finished\n" if $LOG;
 551 close ($LOG) if $LOG;
 552 exit;
 553
 554 #####################################################################################################
 555 # Functions
 556 #####################################################################################################
 557
 558 #####################################################################################################
 559
 560 =begin :private
 561
 562 =head2 fasta2concise ($infile, $outfile)
 563
 564 Convert a file with PSI-BLAST sequences and prediction in the fasta format into concise file
 565
 566 =cut
 567
 568 sub fasta2concise {
 569   my $infile  = shift;
 570   my $outfile = shift;
 571
 572   open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
 573   my ( $seq, @seqs, @title );
 574   while (<$IN>) {
 575     if (s/^>//) {
 576       if ($seq) {
 577         $seq =~ s/\n|\s//g;
 578         $seq =~ s/(.)/$1,/g;
 579         push @seqs, $seq;
 580         $seq = "";
 581       }
 582       chomp;
 583       push @title, $_;
 584     } else {
 585       chomp;
 586       $seq .= $_;
 587     }
 588   }
 589   $seq =~ s/\n|\s//g;
 590   $seq =~ s/(.)/$1,/g;
 591   push @seqs, $seq;
 592   close($IN);
 593
 594   if ( @title != @seqs ) {
 595     die("non matching number of titles and sequences!\n");
 596   }
 597
 598   open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
 599   foreach ( 0 .. $#title ) {
 600     print $OUT "align" . ( $_ + 1 ) . ";$title[$_]:$seqs[$_]\n";
 601   }
 602   close($OUT);
 603 }
 604
 605 =begin :private
 606
 607 =head2 concise2fasta ($infile, $outfile)
 608
 609 Convert a file with PSI-BLAST sequences and prediction in the concise format into fasta file
 610
 611 =cut
 612
 613 #####################################################################################################
 614 sub concise2fasta {
 615   my $infile  = shift;
 616   my $outfile = shift;
 617
 618   my ( @seqs, %seq, @pred, %pred );
 619
 620   my @var = (
 621     "Lupas_21",  "Lupas_14", "Lupas_28", "MULTCOIL",  "MULTCOIL_TRIMER", "MULTCOIL_DIMER", "JNETPSSM", "JNETFREQ",
 622     "JNETALIGN", "JNETHMM",  "JNETSOL5", "JNETSOL25", "JNETSOL0",        "jnetpred",       "jpred"
 623   );
 624
 625   # parse input concise file
 626   open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
 627   while (<$IN>) {
 628     unless (/^\n/) {
 629       my ( $id, $seq ) = split( ":", $_ );
 630       if ( defined $id and defined $seq ) {    # Check we have proper values
 631         $seq =~ s/,//g;
 632         chomp $seq;
 633         if ( $id =~ /;/ ) {                    # Then it's an alignment
 634           @_ = split( ";", $id );
 635           push @seqs, $_[1];
 636           $seq{ $_[1] } = $seq;
 637         }
 638         foreach my $v (@var) {
 639           if ( $v eq $id ) {
 640             push @pred, $v;
 641             $pred{$v} = $seq;
 642           }
 643         }
 644       }
 645     }
 646   }
 647   close($IN);
 648
 649   open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
 650
 651   # print out sequences found with PSI-BLAST
 652   foreach (@seqs) {
 653     $seq{$_} =~ s/(.{72})/$1\n/g;
 654     print $OUT ">$_\n$seq{$_}\n";
 655   }
 656
 657   # print out predictions
 658   foreach my $p (@pred) {
 659     $pred{$p} =~ s/[TCYWXZSI\?_]/-/g;
 660     $pred{$p} =~ s/B/E/g;
 661     $pred{$p} =~ s/G/H/g;
 662
 663     if ( $p =~ /SOL/ ) {
 664       $pred{$p} =~ s/E/B/g;
 665     }
 666     $pred{$p} =~ s/(.{72})/$1\n/g;
 667     print $OUT ">$p\n$pred{$p}\n";
 668   }
 669   close($OUT);
 670 }
 671
 672 =begin :private
 673
 674 =head2 $PSISEQ = remove_seq_masks($PSISEQ)
 675
 676 Returns a PSISEQ object which has any masked residues replaced by their entry from the unfiltered database.
 677 Does it in place.
 678
 679 =cut
 680
 681 #####################################################################################################
 682 sub remove_seq_masks {
 683   my $idx = shift;
 684   my $seq = shift;
 685
 686   # Only bother if we have a sequence which has been masked
 687   return unless grep { uc $_ eq 'X' } @{ $seq->align };
 688
 689   my ( $id, $start, @align ) = ( $seq->id, $seq->start, @{ $seq->align } );
 690   my $searchdb = $database->{$db_entry}{unfiltered};
 691   $start--;    # We need this to be zero based
 692
 693   my $f    = $idx->get_sequence( $id, $searchdb ) or croak "JPRED: No such entry for $id in $searchdb\n" and return;
 694   my @seqs = $f->get_entries;
 695   my $db   = shift @seqs;
 696
 697   unless ($db) {
 698     confess "JPRED: remove_seq_masks: Accession number $id returned no sequences";
 699   } elsif (@seqs) {
 700     confess "JPRED: remove_seq_masks: Accession number $id returned more than one sequence";
 701   }
 702
 703   my @db_seq = @{ $db->seq };
 704
 705   # $i is a pointer into the original, ungapped, unmasked sequence. $_ is for the aligned sequence
 706   my $i = 0;
 707   for ( 0 .. $#align ) {
 708     next if $align[$_] =~ /$GAP/;
 709
 710     if ( $align[$_] eq 'X' ) {
 711       unless ( defined $db_seq[ $i + $start ] ) {
 712         croak "JPRED: remove_seq_masks: the database sequence is not as long as the query sequence!";
 713       }
 714       $align[$_] = $db_seq[ $i + $start ];
 715     }
 716     $i++;
 717   }
 718   $seq->align( \@align );
 719 }
 720
 721 =head2 @PSISEQS_OUT = reduce ($max, @PSISEQS_IN);
 722
 723 Reduces @PSISEQS_IN to at most $max sequences by taking every n sequences in @PSISEQ_IN.
 724
 725 Equivilent to reduce script.
 726
 727 =cut
 728
 729 #####################################################################################################
 730 sub reduce {
 731   my ( $max, @seqs ) = @_;
 732
 733   my $num = int( @seqs / $max );
 734   my @res = shift @seqs;
 735
 736   # Don't bother if we have less than the required sequences
 737   if ( @seqs < $max or 0 == $num ) {
 738     return @res, @seqs;
 739   }
 740
 741   for ( 0 .. $#seqs ) {
 742     push @res, $seqs[$_] if ( 0 == $_ % $num );
 743   }
 744
 745   return @res;
 746 }
 747
 748 =head2 nonred($cutoff, @PSISEQS);
 749
 750 Removes redundant sequences at $cutoff level of sequence identity.
 751
 752 Equivilent to nonred script.
 753
 754 =cut
 755
 756 #####################################################################################################
 757 sub nonred {
 758   my ( $cutoff, @seqs ) = @_;
 759
 760   # Run pairwise and then OC and remove similar sequences
 761   unless ( defined $cutoff ) { croak "JPRED: nonred() not passed a cutoff" }
 762   unless (@seqs)             { croak "JPRED: No sequences passed to nonred()" }
 763   if     ( @seqs == 1 ) {
 764     warn "JPRED: Only one sequence passed to nonred(). Not reducing sequence redundancy\n";
 765     return @seqs;
 766   }
 767
 768   # Convert the sequences into a FASTA object, which is what pairwise expects (simpler version).
 769   my $fasta = FASTA::File->new;
 770   foreach (@seqs) {
 771     my $f = FASTA->new( id => $_->id );
 772     $f->seq( @{ $_->align } );
 773     $fasta->add_entries($f);
 774   }
 775
 776   # Run pairwise
 777   my $pair = Pairwise->new( path => $pairwise );
 778   my $foo = join "\n", $pair->run($fasta) or die $!;
 779
 780   # Run OC
 781   my $ocres = OC->new( path => "$oc sim complete cut $cutoff" );
 782   $ocres->read_string( join "", $ocres->run($foo) );
 783
 784   # Now found those entries that are unrelated
 785   my @keep;
 786   for ( $ocres->get_groups ) {
 787     my ( $group, $score, $size, @labels ) = @{$_};
 788
 789     # Keep the QUERY from the cluster, otherwise just get onelt
 790     if ( grep { /QUERY/ } @labels ) {
 791
 792       # Make sure the query stays at the start
 793       unshift @keep, "QUERY";
 794       next;
 795     } else {
 796       push @keep, shift @labels;
 797     }
 798   }
 799   push @keep, $ocres->get_unclustered;
 800
 801   # Return the matching entries.
 802   # We use the temporay to mimic the selection of sequences in the nonred
 803   # script, which took the sequences in the order they occured in the file.
 804   my @filtered_res;
 805   for (@seqs) {
 806     my $label = $_->id;
 807     if ( grep { $_ =~ /^$label$/ } @keep ) {
 808       push @filtered_res, $_;
 809     }
 810   }
 811
 812   return @filtered_res;
 813 }
 814
 815 =head2 @seqs = del_long_seqs($percentage, @PSISEQS)
 816
 817 Removes those sequences that are more than $percentage longer or shorter than the first
 818 sequence in @seqs. @seqs is an array of PSISEQ objects.
 819
 820 Equivilent to trim_seqs script.
 821
 822 =cut
 823
 824 #####################################################################################################
 825 sub del_long_seqs {
 826   my ( $factor, @seqs ) = @_;
 827
 828   # Keep the query sequence (we assume query is the 1st FASTA record)
 829   my $query = shift @seqs;
 830   my @res   = $query;
 831
 832   # Find the length of the query sequence without any gaps
 833   my $length = ( () = ( join '', @{ $query->align } ) =~ /[^$GAP]/g );
 834
 835   # Calculate the upper and lower allowed bounds
 836   my ( $upper, $lower );
 837   {
 838     my $bounds = ( $length / 100 ) * $factor;
 839     ( $upper, $lower ) = ( $length + $bounds, $length - $bounds );
 840   }
 841
 842   # Now try each sequence and see how long they are
 843   for (@seqs) {
 844     my $l = ( () = ( join '', @{ $_->align } ) =~ /[^$GAP]/g );
 845     if ( $l >= $lower && $l <= $upper ) { push @res, $_ }
 846   }
 847
 848   return @res;
 849 }
 850
 851 =head2 toupper ($PSISEQ)
 852
 853 Converts sequence held in PSISEQ object to uppercase.
 854
 855 =cut
 856
 857 #####################################################################################################
 858 sub toupper {
 859   $_[0]->align( [ split //, uc join '', @{ $_[0]->align } ] );
 860 }
 861
 862 =head2 degap($PSISEQ_QUERY, @PSISEQS)
 863
 864 Removes gaps in the query sequence ($PSISEQ_QUERY) and corresponding positions in @PSISEQS. Change made in place.
 865
 866 Equivilent to fasta2jnet script.
 867
 868 =cut
 869
 870 #####################################################################################################
 871 sub degap {
 872   my (@seqs) = @_;
 873
 874   # Find where the gaps in the query sequence are
 875   my @gaps;
 876   my $i = 0;
 877   for ( @{ $seqs[0]->align } ) {
 878     push @gaps, $i if $_ !~ /$GAP/;
 879     $i++;
 880   }
 881
 882   return unless @gaps;
 883
 884   # Remove the gaps in all the sequences.
 885   # Derefences the array reference and takes a slice inside the method call
 886   # arguments, then coerce back to an array ref.
 887   for (@seqs) {
 888     $_->align( [ @{ $_->align }[@gaps] ] );
 889   }
 890 }
 891
 892 =head2 getfreq($filename, @PSISEQS)
 893
 894 Creates a PSIBLAST like profile and writes it to the $filename.
 895
 896 Equivilant to profile or getfreq (older) script. This doesn't create the same answer as
 897 the old jpred, as it got the numbers from PSIBLAST (v2.0.3), whereas this a *similar* output,
 898 but it's *not* the same.
 899
 900 =cut
 901
 902 #####################################################################################################
 903 sub getfreq {
 904   my ( $fn, @seqs ) = @_;
 905
 906   #croak "JPRED: Passed non PSISEQ object" if grep { !isa( $_, 'PSISEQ' ) } @seqs;
 907
 908   # Create a PSIBLAST like profile from the alignment
 909   # This doesn't greate the same result as old Jpred, I think this is because
 910   # differences in the input between old and new
 911
 912   # For testing
 913   #   my $f = FASTA::File->new(read_file => "/homes/jon/jnet/test/1bk4/1bk4.align.fa");
 914   #   my @profile = profile(
 915   #      map { join "", @{ $_->seq } } $f->get_entries
 916   #   );
 917
 918   my @profile = profile( map { join "", @{ $_->align } } @seqs );
 919
 920   open my $fh, ">$fn" or die "JPRED: $fn: $!";
 921   print $fh join( " ", @{$_} ), "\n" for @profile;
 922 }
 923
 924 =head2 HMMER::Profile::Jnet = hmmer(@PSISEQS)
 925
 926 Creates a HMMER profile from the aligned sequences in @PSISEQS. Returns a HMMER::Profile::Jnet object.
 927
 928 Equivilent to gethmm script.
 929
 930 =cut
 931
 932 #####################################################################################################
 933 sub hmmer {
 934   my (@seqs) = @_;
 935
 936   # Temp files required
 937   my ( $hmmbuild_out, $hmmconvert_out, $tmp_fasta ) = map { File::Temp->new->filename } 1 .. 3;
 938
 939   # Create the FASTA file
 940   psiseq2fasta( $tmp_fasta, @seqs );
 941
 942   # Run HMMer
 943   system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $tmp_fasta");
 944   system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
 945
 946   # Read in the HMMER file
 947   my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
 948
 949   # Convert from HMMER::Profile to HMMER::Profile::Jnet
 950   my $hmmer_jnet = HMMER::Profile::Jnet->new;
 951   $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
 952
 953   return $hmmer_jnet;
 954 }
 955
 956 =head2 jnet($hmmer, $out, $pssm )
 957
 958 Runs Jnet. Pass the paths of the filenames to do the prediction on
 959
 960 =cut
 961
 962 #####################################################################################################
 963 sub jnet {
 964   my ( $hmmer, $outfile, $pssm ) = @_;
 965   if ($pssm) {
 966
 967     # run Jnet prediction with all the profile data
 968     system("$jnet --concise --hmmer $hmmer --pssm $pssm > $outfile");
 969   } else {
 970
 971     # run Jnet prediction with only HMMer and alignment data
 972     system("$jnet --concise --hmmer $hmmer > $outfile");
 973   }
 974   check( "jnet", $? ) or exit 1;
 975   my $res = "";
 976   my $logging = "";
 977   open( my $IN, "<", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
 978   while (<$IN>) {
 979     if (/^jnetpred:|^JNET[A-Z0-9]+:/) {
 980       $res .= $_;
 981     } else {
 982       $logging .= $_;
 983     }
 984   }
 985   close $IN;
 986   open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
 987   print $OUT "\n".$res;
 988   close $OUT;
 989   return $logging;
 990 }
 991
 992 =head1 psiseq2fasta($path, @PSISEQ)
 993
 994 Writes a FASTA file given an array of PSISEQ objects.
 995
 996 =cut
 997
 998 #####################################################################################################
 999 sub psiseq2fasta {
1000   my ( $fn, @seqs ) = @_;
1001
1002   #confess "JPRED: Not passed filename" if isa $fn, 'PSISEQ';
1003   confess "JPRED: not passed PSISEQs" unless @seqs;
1004
1005   #confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
1006
1007   # Changed from using FASTA::File object due to seg faulting on some large,
1008   # long alignments, presumably due to running out of memory.
1009   #   my $fasta = FASTA::File->new;
1010   #   $fasta->add_entries(
1011   #      map {
1012   #         my $f = FASTA->new(id => $_->id);
1013   #         $f->seq(@{$_->align});
1014   #         $f;
1015   #      } @seqs
1016   #   );
1017   #   $fasta->write_file($fn);
1018
1019   my $fh;
1020   $fn =~ /\.gz$/
1021     ? ( open $fh, "| gzip -9 > $fn" or die "JPRED: $!" )
1022     : ( open $fh, ">$fn" or die "JPRED: $!" );
1023
1024   for (@seqs) {
1025     print $fh ">", $_->id, "\n";
1026     my $seq = join( "", @{ $_->align } );
1027     $seq =~ s/(.{72})/$1\n/g;
1028     chomp $seq;
1029     print $fh $seq . "\n";
1030   }
1031   close $fh;
1032 }
1033
1034 =head1 @PSISEQ = fasta2psiseq($path)
1035
1036 Reads in a FASTA file and returns an array of PSISEQ objects.
1037
1038 =cut
1039
1040 #####################################################################################################
1041 sub fasta2psiseq {
1042   my ($fn) = @_;
1043
1044   my $fasta =
1045     $fn =~ /\.gz$/
1046     ? FASTA::File->new( read_gzip_file => $fn )
1047     : FASTA::File->new( read_file      => $fn );
1048   $fasta or die "JPRED: $!";
1049
1050   my @seqs;
1051   for ( $fasta->get_entries ) {
1052     my $seq = PSISEQ->new;
1053     $seq->id( $_->id );
1054     $seq->align( [ @{ $_->seq } ] );
1055     push @seqs, $seq;
1056   }
1057
1058   return @seqs;
1059 }
1060
1061 #####################################################################################################
1062 sub fasta2psiseq_lowmemory {
1063   my $filename = shift;
1064
1065   local $/ = "\n";
1066   my $fh;
1067   $filename =~ /\.gz$/
1068     ? ( open $fh, "gzip -dc $filename|" or die $! )
1069     : ( open $fh, $filename or die $! );
1070
1071   my @seqs;
1072   my ( $id, $seq );
1073   while (<$fh>) {
1074     chomp;
1075     if (s/^>//) {
1076       $seq =~ s/ //g if defined $seq;
1077
1078       if ( $id and $seq ) {
1079         my $psi = PSISEQ->new( id => $id );
1080         $psi->align( [ split //, $seq ] );
1081         push @seqs, $psi;
1082         undef $id;
1083         undef $seq;
1084       } else {
1085         $id = $_;
1086       }
1087     } else {
1088       $seq .= $_;
1089     }
1090   }
1091   if ( $id and $seq ) {
1092     my $psi = PSISEQ->new( id => $id );
1093     $psi->seq($seq);
1094     push @seqs, $psi;
1095   }
1096
1097   return @seqs;
1098 }
1099
1100 =head1 extend($FASTA::File, @PSISEQ)
1101
1102 Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues
1103 against the query, this function extends the alignment so that all of the query is
1104 present in the alignment. The other sequences are extended with gap characters.
1105
1106 =cut
1107
1108 #####################################################################################################
1109 sub extend {
1110   my ( $query, @seqs ) = @_;
1111
1112   #croak "JPRED: Not a Sequence::File" if grep { not isa $_, 'Sequence::File' } $query;
1113   #croak "JPRED: Not PSISEQ"           if grep { not isa $_, 'PSISEQ' } @seqs;
1114
1115   # Get the query sequence
1116   my $q_query = $query->get_entry(0);
1117
1118   # Get the query sequence from the PSIBLAST alignment
1119   my $a_query = shift @seqs;
1120
1121   # The gap size required to expand the alignment
1122   my ( $start_gap_length, $end_gap_length );
1123   {
1124
1125     # Make handling the sequence easier
1126     my $q_seq = join "", @{ [ $q_query->seq ] };
1127     my $a_seq = join "", @{ $a_query->align };
1128
1129     $q_seq =~ s/-//g;
1130     $a_seq =~ s/-//g;
1131
1132     ( $q_seq, $a_seq ) = map { uc $_ } $q_seq, $a_seq;
1133
1134     $start_gap_length = index( $q_seq, $a_seq );
1135     croak "JPRED: query sequence from alignment not found in original query sequence" if $start_gap_length == -1;
1136
1137     $end_gap_length = length($q_seq) - length($a_seq) - $start_gap_length;
1138   }
1139
1140   # Add the gaps to the end of the alignments
1141   for (@seqs) {
1142     $_->align( [ ($GAP) x $start_gap_length, @{ $_->align }, ($GAP) x $end_gap_length ] );
1143   }
1144
1145   # Put the query sequence back
1146   unshift @seqs,
1147     PSISEQ->new(
1148     id    => $a_query->id,
1149     align => [
1150       ( $start_gap_length ? @{ $q_query->seq }[ 0 .. ( $start_gap_length - 1 ) ] : () ),
1151       @{ $a_query->align },
1152       ( $end_gap_length ? @{ $q_query->seq }[ -$end_gap_length .. -1 ] : () ),
1153     ]
1154     );
1155
1156   return @seqs;
1157 }
1158
1159 =head1 $int = fasta_seq_length($path)
1160
1161 Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue.
1162
1163 =cut
1164
1165 #####################################################################################################
1166 sub fasta_seq_length {
1167   my ($fn) = @_;
1168   open my $fh, $fn or croak "Can't open file $fn: $!\n";
1169
1170   my $length = 0;
1171   local $_, $/ = "\n";
1172   while (<$fh>) {
1173     next if /^>/;
1174     chomp;
1175     $length += tr///c;
1176   }
1177
1178   return $length;
1179 }
1180
1181 =head1 log(@messages)
1182
1183 Report messages to STDERR
1184
1185 =cut