From 3ce02f963043c66aedd16f12c789592e86149398 Mon Sep 17 00:00:00 2001 From: pvtroshin Date: Tue, 20 Jul 2010 12:48:05 +0000 Subject: [PATCH] Changes from JWS3 is merged From now on JWS3 project should not contain any new data. To be deleted later git-svn-id: link to svn.lifesci.dundee.ac.uk/svn/barton/ptroshin/JABA2@2611 e3abac25-378b-4346-85de-24260fe3988d --- conf/temp/blastdbcmd.txt | 98 ++++++++ conf/temp/jackhmmer.txt | 87 +++++++ conf/temp/psiblast.txt | 249 ++++++++++++++++++++ .../pipeline/_jpred/BlastBlastComparator.java | 65 +++++ .../pipeline/_jpred/BlastHmmerComparator.java | 60 +++++ runner/compbio/pipeline/_jpred/BlastParser.java | 148 ++++++++---- runner/compbio/pipeline/_jpred/Hit.java | 61 +++++ .../pipeline/_jpred/JackHmmerHitParser.java | 80 +++++++ runner/compbio/runner/disorder/RonnWrapper.java | 129 ++++++++++ 9 files changed, 933 insertions(+), 44 deletions(-) create mode 100644 conf/temp/blastdbcmd.txt create mode 100644 conf/temp/jackhmmer.txt create mode 100644 conf/temp/psiblast.txt create mode 100644 runner/compbio/pipeline/_jpred/BlastBlastComparator.java create mode 100644 runner/compbio/pipeline/_jpred/BlastHmmerComparator.java create mode 100644 runner/compbio/pipeline/_jpred/Hit.java create mode 100644 runner/compbio/pipeline/_jpred/JackHmmerHitParser.java create mode 100644 runner/compbio/runner/disorder/RonnWrapper.java diff --git a/conf/temp/blastdbcmd.txt b/conf/temp/blastdbcmd.txt new file mode 100644 index 0000000..368428d --- /dev/null +++ b/conf/temp/blastdbcmd.txt @@ -0,0 +1,98 @@ +Analog of fastacmd for blast+ + +-bash-3.2$ /local/opt/bin/blastdbcmd -help +USAGE + blastdbcmd [-h] [-help] [-db dbname] [-dbtype molecule_type] + [-entry sequence_identifier] [-entry_batch input_file] [-pig PIG] [-info] + [-range numbers] [-strand strand] [-mask_sequence_with numbers] + [-out output_file] [-outfmt format] [-target_only] [-get_dups] + [-line_length number] [-ctrl_a] [-version] + +DESCRIPTION + BLAST database client, version 
2.2.23+ + +OPTIONAL ARGUMENTS + -h + Print USAGE and DESCRIPTION; ignore other arguments + -help + Print USAGE, DESCRIPTION and ARGUMENTS description; ignore other arguments + -version + Print version number; ignore other arguments + + *** BLAST database options + -db + BLAST database name + Default = `nr' + -dbtype + Molecule type stored in BLAST database + Default = `guess' + + *** Retrieval options + -entry + Comma-delimited search string(s) of sequence identifiers: + e.g.: 555, AC147927, 'gnl|dbname|tag', or 'all' to select all + sequences in the database + * Incompatible with: entry_batch, pig, info + -entry_batch + Input file for batch processing (Format: one entry per line) + * Incompatible with: entry, pig, info + -pig =0> + PIG to retrieve + * Incompatible with: entry, entry_batch, target_only, info + -info + Print BLAST database information + * Incompatible with: entry, entry_batch, outfmt, strand, target_only, + ctrl_a, get_dups, pig, range + + *** Sequence retrieval configuration options + -range + Range of sequence to extract (Format: start-stop) + * Incompatible with: info + -strand + Strand of nucleotide sequence to extract + Default = `plus' + * Incompatible with: info + -mask_sequence_with + Produce lower-case masked FASTA using the algorithm IDs specified (Format: + N,M,...) + + *** Output configuration options + -out + Output file name + Default = `-' + -outfmt + Output format, where the available format specifiers are: + %f means sequence in FASTA format + %s means sequence data (without defline) + %a means accession + %g means gi + %o means ordinal id (OID) + %t means sequence title + %l means sequence length + %T means taxid + %L means common taxonomic name + %S means scientific name + %P means PIG + %mX means sequence masking data, where X is an optional comma- + separted list of integers to specify the algorithm ID(s) to + diaplay (or all masks if absent or invalid specification). 
+ Masking data will be displayed as a series of 'N-M' values + separated by ';' or the word 'none' if none are available. + For every format except '%f', each line of output will correspond to + a sequence. + Default = `%f' + * Incompatible with: info + -target_only + Definition line should contain target GI only + * Incompatible with: pig, info, get_dups + -get_dups + Retrieve duplicate accessions + * Incompatible with: info, target_only + + *** Output configuration options for FASTA format + -line_length =1> + Line length for output + Default = `80' + -ctrl_a + Use Ctrl-A as the non-redundant defline separator + * Incompatible with: info diff --git a/conf/temp/jackhmmer.txt b/conf/temp/jackhmmer.txt new file mode 100644 index 0000000..d98a115 --- /dev/null +++ b/conf/temp/jackhmmer.txt @@ -0,0 +1,87 @@ +-bash-3.2$ /sw/opt/hmmer3/bin/jackhmmer -h +# jackhmmer :: iteratively search a protein sequence against a protein database +# HMMER 3.0 (March 2010); http://hmmer.org/ +# Copyright (C) 2010 Howard Hughes Medical Institute. +# Freely distributed under the GNU General Public License (GPLv3). 
+# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Usage: jackhmmer [-options] + +where basic options are: + -h : show brief help on version and usage + -N : set maximum number of iterations to [5] (n>0) + +options directing output: + -o : direct output to file , not stdout + -A : save multiple alignment of hits to file + --tblout : save parseable table of per-sequence hits to file + --domtblout : save parseable table of per-domain hits to file + --chkhmm : save HMM checkpoints to files -.hmm + --chkali : save alignment checkpoints to files -.sto + --acc : prefer accessions over names in output + --noali : don't output alignments, so output is smaller + --notextw : unlimit ASCII text output line width + --textw : set max width of ASCII text output lines [120] (n>=120) + +options controlling scoring system in first iteration: + --popen : gap open probability [0.02] (0<=x<0.5) + --pextend : gap extend probability [0.4] (0<=x<1) + --mxfile : substitution score matrix [default: BLOSUM62] + +options controlling reporting thresholds: + -E : report sequences <= this E-value threshold in output [10.0] (x>0) + -T : report sequences >= this score threshold in output + --domE : report domains <= this E-value threshold in output [10.0] (x>0) + --domT : report domains >= this score cutoff in output + +options controlling significance thresholds for inclusion in next round: + --incE : consider sequences <= this E-value threshold as significant + --incT : consider sequences >= this score threshold as significant + --incdomE : consider domains <= this E-value threshold as significant + --incdomT : consider domains >= this score threshold as significant + +options controlling acceleration heuristics: + --max : Turn all heuristic filters off (less speed, more power) + --F1 : Stage 1 (MSV) threshold: promote hits w/ P <= F1 [0.02] + --F2 : Stage 2 (Vit) threshold: promote hits w/ P <= F2 [1e-3] + --F3 : Stage 3 (Fwd) threshold: promote hits w/ P <= F3 [1e-5] 
+ --nobias : turn off composition bias filter + +options controlling model construction after first iteration: + --fast : assign cols w/ >= symfrac residues as consensus + --hand : manual construction (requires reference annotation) + --symfrac : sets sym fraction controlling --fast construction + --fragthresh : if L < x, tag sequence as a fragment + +options controlling relative weights in models after first iteration: + --wpb : Henikoff position-based weights [default] + --wgsc : Gerstein/Sonnhammer/Chothia tree weights + --wblosum : Henikoff simple filter weights + --wnone : don't do any relative weighting; set all to 1 + --wid : for --wblosum: set identity cutoff [0.62] (0<=x<=1) + +options controlling effective seq number in models after first iteration: + --eent : adjust eff seq # to achieve relative entropy target [default] + --eclust : eff seq # is # of single linkage clusters + --enone : no effective seq # weighting: just use nseq + --eset : set eff seq # for all models to + --ere : for --eent: set minimum rel entropy/position to + --esigma : for --eent: set sigma param to [45.0] + --eid : for --eclust: set fractional identity cutoff to [0.62] + +Options controlling E value calibration: + --EmL : length of sequences for MSV Gumbel mu fit [200] (n>0) + --EmN : number of sequences for MSV Gumbel mu fit [200] (n>0) + --EvL : length of sequences for Viterbi Gumbel mu fit [200] (n>0) + --EvN : number of sequences for Viterbi Gumbel mu fit [200] (n>0) + --EfL : length of sequences for Forward exp tail tau fit [100] (n>0) + --EfN : number of sequences for Forward exp tail tau fit [200] (n>0) + --Eft : tail mass for Forward exponential tail tau fit [0.04] (0 : set # of comparisons done, for E-value calculation + --domZ : set # of significant seqs, for domain E-value calculation + --seed : set RNG seed to (if 0: one-time arbitrary seed) [42] + --qformat : assert query is in format : no autodetection + --tformat : assert target is in format >: no autodetection + 
--cpu : number of parallel CPU workers to use for multithreads diff --git a/conf/temp/psiblast.txt b/conf/temp/psiblast.txt new file mode 100644 index 0000000..7b62c3e --- /dev/null +++ b/conf/temp/psiblast.txt @@ -0,0 +1,249 @@ + /local/gjb_lab/blast+/bin/psiblast -help +USAGE + psiblast [-h] [-help] [-import_search_strategy filename] + [-export_search_strategy filename] [-db database_name] + [-dbsize num_letters] [-gilist filename] [-negative_gilist filename] + [-entrez_query entrez_query] [-subject subject_input_file] + [-subject_loc range] [-query input_file] [-out output_file] + [-evalue evalue] [-word_size int_value] [-gapopen open_penalty] + [-gapextend extend_penalty] [-xdrop_ungap float_value] + [-xdrop_gap float_value] [-xdrop_gap_final float_value] + [-searchsp int_value] [-seg SEG_options] [-soft_masking soft_masking] + [-matrix matrix_name] [-threshold float_value] [-culling_limit int_value] + [-best_hit_overhang float_value] [-best_hit_score_edge float_value] + [-window_size int_value] [-lcase_masking] [-query_loc range] + [-parse_deflines] [-outfmt format] [-show_gis] + [-num_descriptions int_value] [-num_alignments int_value] [-html] + [-max_target_seqs num_sequences] [-num_threads int_value] [-remote] + [-comp_based_stats compo] [-use_sw_tback] [-gap_trigger float_value] + [-num_iterations int_value] [-out_pssm checkpoint_file] + [-out_ascii_pssm ascii_mtx_file] [-in_msa align_restart] + [-in_pssm psi_chkpt_file] [-pseudocount pseudocount] + [-inclusion_ethresh ethresh] [-phi_pattern file] [-version] + +DESCRIPTION + Position-Specific Initiated BLAST 2.2.22+ + +OPTIONAL ARGUMENTS + -h + Print USAGE and DESCRIPTION; ignore other arguments + -help + Print USAGE, DESCRIPTION and ARGUMENTS description; ignore other arguments + -version + Print version number; ignore other arguments + + *** Input query options + -query + Input file name + Default = `-' + * Incompatible with: in_msa, in_pssm + -query_loc + Location on the query sequence (Format: 
start-stop) + + *** General search options + -db + BLAST database name + * Incompatible with: subject, subject_loc + -out + Output file name + Default = `-' + -evalue + Expectation value (E) threshold for saving hits + Default = `10' + -word_size =2> + Word size for wordfinder algorithm + -gapopen + Cost to open a gap + -gapextend + Cost to extend a gap + -matrix + Scoring matrix name + Default = `BLOSUM62' + -threshold =0> + Minimum word score such that the word is added to the BLAST lookup table + -comp_based_stats + Use composition-based statistics for blastp / tblastn: + D or d: default (equivalent to 2) + 0 or F or f: no composition-based statistics + 1: Composition-based statistics as in NAR 29:2994-3005, 2001 + 2 or T or t : Composition-based score adjustment as in Bioinformatics + 21:902-911, + 2005, conditioned on sequence properties + 3: Composition-based score adjustment as in Bioinformatics 21:902-911, + 2005, unconditionally + For programs other than tblastn, must either be absent or be D, F or 0 + Default = `2' + + *** BLAST-2-Sequences options + -subject + Subject sequence(s) to search + * Incompatible with: db, gilist, negative_gilist + -subject_loc + Location on the subject sequence (Format: start-stop) + * Incompatible with: db, gilist, negative_gilist, remote + + *** Formatting options + -outfmt + alignment view options: + 0 = pairwise, + 1 = query-anchored showing identities, + 2 = query-anchored no identities, + 3 = flat query-anchored, show identities, + 4 = flat query-anchored, no identities, + 5 = XML Blast output, + 6 = tabular, + 7 = tabular with comment lines, + 8 = Text ASN.1, + 9 = Binary ASN.1 + 10 = Comma-separated values + + Options 6, 7, and 10 can be additionally configured to produce + a custom format specified by space delimited format specifiers. 
+ The supported format specifiers are: + qseqid means Query Seq-id + qgi means Query GI + qacc means Query accesion + sseqid means Subject Seq-id + sallseqid means All subject Seq-id(s), separated by a ';' + sgi means Subject GI + sallgi means All subject GIs + sacc means Subject accession + sallacc means All subject accessions + qstart means Start of alignment in query + qend means End of alignment in query + sstart means Start of alignment in subject + send means End of alignment in subject + qseq means Aligned part of query sequence + sseq means Aligned part of subject sequence + evalue means Expect value + bitscore means Bit score + score means Raw score + length means Alignment length + pident means Percentage of identical matches + nident means Number of identical matches + mismatch means Number of mismatches + positive means Number of positive-scoring matches + gapopen means Number of gap openings + gaps means Total number of gaps + ppos means Percentage of positive-scoring matches + frames means Query and subject frames separated by a '/' + qframe means Query frame + sframe means Subject frame + When not provided, the default value is: + 'qseqid sseqid pident length mismatch gapopen qstart qend sstart send + evalue bitscore', which is equivalent to the keyword 'std' + Default = `0' + -show_gis + Show NCBI GIs in deflines? + -num_descriptions =0> + Number of database sequences to show one-line descriptions for + Default = `500' + -num_alignments =0> + Number of database sequences to show alignments for + Default = `250' + -html + Produce HTML output? + + *** Query filtering options + -seg + Filter query sequence with SEG (Format: 'yes', 'window locut hicut', or + 'no' to disable) + Default = `no' + -soft_masking + Apply filtering locations as soft masks + Default = `false' + -lcase_masking + Use lower case filtering in query and subject sequence(s)? 
+ + *** Restrict search or results + -gilist + Restrict search of database to list of GI's + * Incompatible with: negative_gilist, remote, subject, subject_loc + -negative_gilist + Restrict search of database to everything except the listed GIs + * Incompatible with: gilist, remote, subject, subject_loc + -entrez_query + Restrict search with the given Entrez query + * Requires: remote + -culling_limit =0> + If the query range of a hit is enveloped by that of at least this many + higher-scoring hits, delete the hit + * Incompatible with: best_hit_overhang, best_hit_score_edge + -best_hit_overhang =0 and =<0.5)> + Best Hit algorithm overhang value (recommended value: 0.1) + * Incompatible with: culling_limit + -best_hit_score_edge =0 and =<0.5)> + Best Hit algorithm score edge value (recommended value: 0.1) + * Incompatible with: culling_limit + -max_target_seqs =1> + Maximum number of aligned sequences to keep + + *** Statistical options + -dbsize + Effective length of the database + -searchsp =0> + Effective length of the search space + + *** Search strategy options + -import_search_strategy + Search strategy to use + * Incompatible with: export_search_strategy + -export_search_strategy + File name to record the search strategy used + * Incompatible with: import_search_strategy + + *** Extension options + -xdrop_ungap + X-dropoff value (in bits) for ungapped extensions + -xdrop_gap + X-dropoff value (in bits) for preliminary gapped extensions + -xdrop_gap_final + X-dropoff value (in bits) for final gapped alignment + -window_size =0> + Multiple hits window size, use 0 to specify 1-hit algorithm + -gap_trigger + Number of bits to trigger gapping + Default = `22' + + *** Miscellaneous options + -parse_deflines + Should the query and subject defline(s) be parsed? + -num_threads =1> + Number of threads to use in the BLAST search + Default = `1' + * Incompatible with: remote + -remote + Execute search remotely? 
+ * Incompatible with: gilist, negative_gilist, subject_loc, num_threads, + num_iterations + -use_sw_tback + Compute locally optimal Smith-Waterman alignments? + + *** PSI-BLAST options + -num_iterations =1> + Number of iterations to perform + Default = `1' + * Incompatible with: remote + -out_pssm + File name to store checkpoint file + -out_ascii_pssm + File name to store ASCII version of PSSM + -in_msa + File name of multiple sequence alignment to restart PSI-BLAST + * Incompatible with: in_pssm, query + -in_pssm + PSI-BLAST checkpoint file + * Incompatible with: in_msa, query, phi_pattern + + *** PSSM engine options + -pseudocount + Pseudo-count value used when constructing PSSM + Default = `0' + -inclusion_ethresh + E-value inclusion threshold for pairwise alignments + Default = `0.002' + + *** PHI-BLAST options + -phi_pattern + File name containing pattern to search + * Incompatible with: in_pssm diff --git a/runner/compbio/pipeline/_jpred/BlastBlastComparator.java b/runner/compbio/pipeline/_jpred/BlastBlastComparator.java new file mode 100644 index 0000000..a85c17b --- /dev/null +++ b/runner/compbio/pipeline/_jpred/BlastBlastComparator.java @@ -0,0 +1,65 @@ +/* Copyright (c) 2009 Peter Troshin + * + * Jalview Web Services @version: 2.0 + * + * This library is free software; you can redistribute it and/or modify it under the terms of the + * Apache License version 2 as published by the Apache Software Foundation + * + * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without + * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache + * License for more details. + * + * A copy of the license is in apache_license.txt. It is also available here: + * @see: http://www.apache.org/licenses/LICENSE-2.0.txt + * + * Any republication or derived work distributed in source code form + * must include this copyright and license notice. 
+ */ +package compbio.pipeline._jpred; + +import java.io.FileNotFoundException; +import java.util.HashSet; +import java.util.Set; + +import javax.xml.stream.XMLStreamException; + + +public class BlastBlastComparator { + + /** + * args[0] and args[1] are assumed to be the names of the two Blast XML output files to compare + * + * @throws XMLStreamException + * @throws FileNotFoundException + */ + public static void main(String[] args) throws FileNotFoundException, + XMLStreamException { + BlastParser res1 = new BlastParser(args[0]); + BlastParser res2 = new BlastParser(args[1]); + assert res1.iters.size() == res2.iters.size(); + + for (Integer iterNum : res1.iters.keySet()) { + Set list = res1.iters.get(iterNum); + Set otherList = res2.iters.get(iterNum); + System.out.print("Iter " + iterNum + " arg0: " + list.size()); + System.out.println(" arg1: " + otherList.size()); + BlastParser.printNames(getDiff(list, otherList)); + // System.out.println("Diffs: " + getDiff(list, otherList)); + // System.out.println("Diffs: " + getDiff(otherList, list)); + + } + + } + + static Set getDiff(Set list, Set otherList) { + Set diff = new HashSet(); + for (Hit pseq : list) { + if (otherList.contains(pseq)) { + continue; + } + diff.add(pseq); + } + return diff; + } + +} diff --git a/runner/compbio/pipeline/_jpred/BlastHmmerComparator.java b/runner/compbio/pipeline/_jpred/BlastHmmerComparator.java new file mode 100644 index 0000000..21dec65 --- /dev/null +++ b/runner/compbio/pipeline/_jpred/BlastHmmerComparator.java @@ -0,0 +1,60 @@ +/* Copyright (c) 2009 Peter Troshin + * + * Jalview Web Services @version: 2.0 + * + * This library is free software; you can redistribute it and/or modify it under the terms of the + * Apache License version 2 as published by the Apache Software Foundation + * + * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without + * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache + * License for more details. 
+ * + * A copy of the license is in apache_license.txt. It is also available here: + * @see: http://www.apache.org/licenses/LICENSE-2.0.txt + * + * Any republication or derived work distributed in source code form + * must include this copyright and license notice. + */ +package compbio.pipeline._jpred; + +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; + +import javax.xml.stream.XMLStreamException; + +public class BlastHmmerComparator { + + /** + * args[0] is assumed to be the name of a Blast XML output file and args[1] the name of a jackhmmer hit table file; the hits of Blast iteration 3 are compared with the jackhmmer hits + * + * @throws XMLStreamException + * @throws IOException + */ + public static void main(String[] args) throws XMLStreamException, + IOException { + BlastParser res1 = new BlastParser(args[0]); + JackHmmerHitParser res2 = new JackHmmerHitParser(args[1]); + Set list = res1.iters.get(3); + Set otherList = res2.hits; + System.out.print("Iter " + 3 + " arg0: " + list.size()); + System.out.println(" arg1: " + otherList.size()); + BlastParser.printNames(getDiff(list, otherList)); + BlastParser.printNames(getDiff(otherList, list)); + // System.out.println("Diffs: " + getDiff(list, otherList)); + // System.out.println("Diffs: " + getDiff(otherList, list)); + + } + + static Set getDiff(Set list, Set otherList) { + Set diff = new HashSet(); + for (Hit pseq : list) { + if (otherList.contains(pseq)) { + continue; + } + diff.add(pseq); + } + return diff; + } + +} diff --git a/runner/compbio/pipeline/_jpred/BlastParser.java b/runner/compbio/pipeline/_jpred/BlastParser.java index 73d9a99..7039588 100644 --- a/runner/compbio/pipeline/_jpred/BlastParser.java +++ b/runner/compbio/pipeline/_jpred/BlastParser.java @@ -16,59 +16,119 @@ * must include this copyright and license notice. 
*/ package compbio.pipeline._jpred; -import java.io.*; -import java.util.*; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; - public class BlastParser { - - static class Psiseq { - String id; - String seq; + + Map> iters; + + public BlastParser(String file) throws FileNotFoundException, + XMLStreamException { + XMLInputFactory f = XMLInputFactory.newInstance(); + XMLStreamReader r = f.createXMLStreamReader(new BufferedInputStream( + new FileInputStream(new File(file)))); + Set pl = new HashSet(); + Hit psi = null; + this.iters = new HashMap>(); + Integer iternum = null; + while (r.hasNext()) { + int idx = r.next(); + if (r.isStartElement()) { + String name = r.getLocalName(); + if (name.equals("Iteration_iter-num")) { + iternum = Integer.parseInt(r.getElementText().trim()); + System.out.println("Iter " + iternum); + } + if (name.equals("Hit")) { + psi = new Hit(); + } + if (name.equals("Hit_num")) { + psi.number = r.getElementText(); + } + if (name.equals("Hit_accession")) { + psi.accession = r.getElementText(); + // System.out.println(psi.id); + } + if (name.equals("Hit_def")) { + // System.out.println(r.getElementText()); + psi.name = r.getElementText().split("\\s+")[0].trim(); + // System.out.println(psi.id); + } + if (name.equals("Hsp_hseq")) { + psi.seq = r.getElementText(); + // System.out.println(psi.seq); + } + if (name.equals("Hsp_evalue")) { + psi.evalue = r.getElementText(); + // System.out.println(psi.seq); + } + + } + + if (r.isEndElement()) { + String name = r.getLocalName(); + if (name.equals("Hit")) { + boolean replaced = pl.add(psi); + assert replaced : "Expect unique elements only!"; + psi = null; + } + if 
(name.equals("Iteration")) { + iters.put(iternum, pl); + pl = new HashSet(); + } + } } - /** - * args[0] is assumed to be the name of a Blast output file - * @throws XMLStreamException - * @throws FileNotFoundException - */ - public static void main(String[] args) throws FileNotFoundException, XMLStreamException { - XMLInputFactory f = XMLInputFactory.newInstance(); - XMLStreamReader r = f.createXMLStreamReader( new BufferedInputStream(new FileInputStream(new File(args[0])))); - List pl = new ArrayList(); - Psiseq psi = null; - while(r.hasNext()) { - int idx = r.next(); - //System.out.println(idx); - if(r.isStartElement()) { - String name = r.getLocalName(); - if(name.equals("Hit") ) { - psi = new Psiseq(); - } - if(name.equals("Hit_id") ) { - //System.out.println(r.getElementText()); - psi.id = r.getElementText(); - System.out.println(psi.id); - } - if(name.equals("Hsp_hseq")) { - psi.seq = r.getElementText(); - System.out.println(psi.seq); - } - } - - if(r.isEndElement()) { - String name = r.getLocalName(); - if(name.equals("Hit") ) { - pl.add(psi); - psi = null; - } - } + } - } + /** + * args[0] is assumed to be the name of a Blast output file + * + * @throws XMLStreamException + * @throws FileNotFoundException + */ + public static void main(String[] args) throws FileNotFoundException, + XMLStreamException { + BlastParser parser = new BlastParser(args[0]); + printHits(parser.iters); + } + static final void printHits(Map> iterNumPsiSeqs) { + for (Integer iterNum : iterNumPsiSeqs.keySet()) { + System.out.println("Iteration " + iterNum); + printHits(iterNumPsiSeqs.get(iterNum)); } + } + + static final void printHits(Collection psiseqs) { + assert psiseqs != null; + System.out.println("Total hits: " + psiseqs.size()); + for (Hit pseq : psiseqs) { + System.out.println("Hit: " + pseq.number + " Accession: " + + pseq.accession + " name " + pseq.name); + } + } + + static final void printNames(Collection psiseqs) { + assert psiseqs != null; + System.out.println("Total 
hits: " + psiseqs.size()); + for (Hit pseq : psiseqs) { + System.out.print(pseq.number + " "); + System.out.println(pseq.name); + } + } + } diff --git a/runner/compbio/pipeline/_jpred/Hit.java b/runner/compbio/pipeline/_jpred/Hit.java new file mode 100644 index 0000000..c8351d9 --- /dev/null +++ b/runner/compbio/pipeline/_jpred/Hit.java @@ -0,0 +1,61 @@ +/** + * + */ +package compbio.pipeline._jpred; + +import java.util.Comparator; + +public class Hit { + String name; + String number; + String accession; + String seq; + String evalue; + + @Override + public String toString() { + return "accession=" + accession + ", name=" + name + ", num=" + number + + ", evalue=" + evalue + "\n"; // + ", seq=" + seq + + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((name == null) ? 0 : name.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + Hit other = (Hit) obj; + if (name == null) { + if (other.name != null) + return false; + } else if (!name.equals(other.name)) + return false; + return true; + } + + public static final class NumberComporator implements Comparator { + @Override + public int compare(Hit o1, Hit o2) { + return Integer.valueOf(o1.number).compareTo( + Integer.valueOf(o2.number)); + } + } + + public static final class EvalueComporator implements Comparator { + @Override + public int compare(Hit o1, Hit o2) { + return Double.valueOf(o1.evalue).compareTo( + Double.valueOf(o2.evalue)); + } + } +} \ No newline at end of file diff --git a/runner/compbio/pipeline/_jpred/JackHmmerHitParser.java b/runner/compbio/pipeline/_jpred/JackHmmerHitParser.java new file mode 100644 index 0000000..fb91920 --- /dev/null +++ b/runner/compbio/pipeline/_jpred/JackHmmerHitParser.java @@ -0,0 +1,80 @@ +package compbio.pipeline._jpred; + +import java.io.BufferedReader; 
+import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Scanner; +import java.util.Set; + +/** + * Parser for the following files: + * + * @author pvtroshin + * + */ +public class JackHmmerHitParser { + //# --- full sequence ---- --- best 1 domain ---- --- domain number estimation ---- + //# target name accession query name accession E-value score bias E-value score bias exp reg clu ov env dom rep inc description of target + //# ------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ ----- --- --- --- --- --- --- --- --- --------------------- + //tr|Q6TVU2|Q6TVU2_ORFV - gi_74230740_gb_ABA00545.1 - 4.5e-271 910.4 0.0 5.1e-271 910.2 0.0 1.0 1 0 0 1 1 1 1 Putative uncharacterized protein OS=Orf virus PE=4 SV=1 + + Set hits; + + public JackHmmerHitParser(String file) throws IOException { + + BufferedReader bfr = new BufferedReader(new InputStreamReader( + new FileInputStream(file), "ISO-8859-1"), 64000); + // throw away first three lines; + this.hits = new HashSet(); + String line = bfr.readLine(); + bfr.readLine(); + bfr.readLine(); + int hitc = 0; + while ((line = bfr.readLine()) != null) { + hitc++; + Scanner scan = new Scanner(line); + scan.useDelimiter("\\s+"); + extractData(scan, hitc); + } + List lhits = new ArrayList(hits); + Collections.sort(lhits, new Hit.NumberComporator()); + } + + void extractData(Scanner scan, int hitcounter) { + Hit pseq = new Hit(); + + String tname = scan.next(); + pseq.name = tname; + //System.out.println(tname); + + String tacc = scan.next(); + pseq.accession = tacc; + //System.out.println(tacc); + String qname = scan.next(); + //System.out.println(qname); + String qacc = scan.next(); + //System.out.println(qacc); + + Double evalue = scan.nextDouble(); + //System.out.println(evalue); + pseq.evalue = evalue.toString(); + 
+ Double score = scan.nextDouble(); + //System.out.println(score); + pseq.evalue = evalue.toString(); + pseq.number = new Integer(hitcounter).toString(); + boolean unique = hits.add(pseq); + assert unique : "Unique hits are expected!"; + } + + public static void main(String[] args) throws IOException { + assert args[0] != null; + JackHmmerHitParser parser = new JackHmmerHitParser(args[0]); + BlastParser.printHits(parser.hits); + } +} diff --git a/runner/compbio/runner/disorder/RonnWrapper.java b/runner/compbio/runner/disorder/RonnWrapper.java new file mode 100644 index 0000000..a45601a --- /dev/null +++ b/runner/compbio/runner/disorder/RonnWrapper.java @@ -0,0 +1,129 @@ +/* Copyright (c) 2009 Peter Troshin + * + * Jalview Web Services @version: 2.0 + * + * This library is free software; you can redistribute it and/or modify it under the terms of the + * Apache License version 2 as published by the Apache Software Foundation + * + * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without + * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache + * License for more details. + * + * A copy of the license is in apache_license.txt. It is also available here: + * @see: http://www.apache.org/licenses/LICENSE-2.0.txt + * + * Any republication or derived work distributed in source code form + * must include this copyright and license notice. 
+ */ + +package compbio.runner.disorder; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import org.apache.log4j.Logger; + +import compbio.data.sequence.Alignment; +import compbio.data.sequence.UnknownFileFormatException; +import compbio.engine.client.Executable; +import compbio.engine.client.PipedExecutable; +import compbio.engine.client.SkeletalExecutable; +import compbio.metadata.Limit; +import compbio.metadata.LimitsManager; +import compbio.metadata.ResultNotAvailableException; +import compbio.runner.Util; + +public class RonnWrapper extends SkeletalExecutable implements + PipedExecutable { + /* + * RONN does not accept stdin; the file name must be defined as a parameter. It + * can only analyse ONE sequence per run! (or maybe not, but then the results + * get overwritten!) FASTA format is accepted. + * + * To run it do the following: + * + * 1) copy ronn executables and task file to work directory + * + * 2) execute run processes one by one for each sequence + */ + + private static final String command = "/homes/pvtroshin/soft/RONNv3_fasta/Ronn_runner.sh"; + + private static Logger log = Logger.getLogger(RonnWrapper.class); + + // Cache for Limits information + private static LimitsManager limits; + + public static final String KEY_VALUE_SEPARATOR = Util.SPACE; + + @SuppressWarnings("unchecked") + @Override + public Alignment getResults(String workDirectory) + throws ResultNotAvailableException { + try { + return Util.readClustalFile(workDirectory, getOutput()); + } catch (FileNotFoundException e) { + log.error(e.getMessage(), e.getCause()); + throw new ResultNotAvailableException(e); + } catch (IOException e) { + log.error(e.getMessage(), e.getCause()); + throw new ResultNotAvailableException(e); + } catch (UnknownFileFormatException e) { + log.error(e.getMessage(), e.getCause()); + throw new ResultNotAvailableException(e); + } catch (NullPointerException e) { + log.error(e.getMessage(), 
e.getCause()); + throw new ResultNotAvailableException(e); + } + } + + @Override + public List getCreatedFiles() { + return Arrays.asList(getOutput(), getError()); + } + + @Override + public RonnWrapper setInput(String inFile) { + String input = getInput(); + super.setInput(inFile); + return this; + } + + @Override + public Limit getLimit(String presetName) { + if (limits == null) { + limits = getLimits(); + } + Limit limit = null; + if (limits != null) { + // this returns default limit if preset is undefined! + limit = limits.getLimitByName(presetName); + } + // If limit is not defined for a particular preset, then return default + // limit + if (limit == null) { + log.debug("Limit for the preset " + presetName + + " is not found. Using default"); + limit = limits.getDefaultLimit(); + } + return limit; + } + + @Override + public LimitsManager getLimits() { + // synchronise on static field + synchronized (log) { + if (limits == null) { + limits = Util.getLimits(this.getClass()); + } + } + return limits; + } + + @Override + public Class> getType() { + return this.getClass(); + } +} -- 1.7.10.2