--- /dev/null
+Analog of fastacmd for blast+\r
+\r
+-bash-3.2$ /local/opt/bin/blastdbcmd -help\r
+USAGE\r
+ blastdbcmd [-h] [-help] [-db dbname] [-dbtype molecule_type]\r
+ [-entry sequence_identifier] [-entry_batch input_file] [-pig PIG] [-info]\r
+ [-range numbers] [-strand strand] [-mask_sequence_with numbers]\r
+ [-out output_file] [-outfmt format] [-target_only] [-get_dups]\r
+ [-line_length number] [-ctrl_a] [-version]\r
+\r
+DESCRIPTION\r
+ BLAST database client, version 2.2.23+\r
+\r
+OPTIONAL ARGUMENTS\r
+ -h\r
+ Print USAGE and DESCRIPTION; ignore other arguments\r
+ -help\r
+ Print USAGE, DESCRIPTION and ARGUMENTS description; ignore other arguments\r
+ -version\r
+ Print version number; ignore other arguments\r
+\r
+ *** BLAST database options\r
+ -db <String>\r
+ BLAST database name\r
+ Default = `nr'\r
+ -dbtype <String, `guess', `nucl', `prot'>\r
+ Molecule type stored in BLAST database\r
+ Default = `guess'\r
+\r
+ *** Retrieval options\r
+ -entry <String>\r
+ Comma-delimited search string(s) of sequence identifiers:\r
+ e.g.: 555, AC147927, 'gnl|dbname|tag', or 'all' to select all\r
+ sequences in the database\r
+ * Incompatible with: entry_batch, pig, info\r
+ -entry_batch <File_In>\r
+ Input file for batch processing (Format: one entry per line)\r
+ * Incompatible with: entry, pig, info\r
+ -pig <Integer, >=0>\r
+ PIG to retrieve\r
+ * Incompatible with: entry, entry_batch, target_only, info\r
+ -info\r
+ Print BLAST database information\r
+ * Incompatible with: entry, entry_batch, outfmt, strand, target_only,\r
+ ctrl_a, get_dups, pig, range\r
+\r
+ *** Sequence retrieval configuration options\r
+ -range <String>\r
+ Range of sequence to extract (Format: start-stop)\r
+ * Incompatible with: info\r
+ -strand <String, `minus', `plus'>\r
+ Strand of nucleotide sequence to extract\r
+ Default = `plus'\r
+ * Incompatible with: info\r
+ -mask_sequence_with <String>\r
+ Produce lower-case masked FASTA using the algorithm IDs specified (Format:\r
+ N,M,...)\r
+\r
+ *** Output configuration options\r
+ -out <File_Out>\r
+ Output file name\r
+ Default = `-'\r
+ -outfmt <String>\r
+ Output format, where the available format specifiers are:\r
+ %f means sequence in FASTA format\r
+ %s means sequence data (without defline)\r
+ %a means accession\r
+ %g means gi\r
+ %o means ordinal id (OID)\r
+ %t means sequence title\r
+ %l means sequence length\r
+ %T means taxid\r
+ %L means common taxonomic name\r
+ %S means scientific name\r
+ %P means PIG\r
+ %mX means sequence masking data, where X is an optional comma-\r
+ separated list of integers to specify the algorithm ID(s) to\r
+ display (or all masks if absent or invalid specification).\r
+ Masking data will be displayed as a series of 'N-M' values\r
+ separated by ';' or the word 'none' if none are available.\r
+ For every format except '%f', each line of output will correspond to\r
+ a sequence.\r
+ Default = `%f'\r
+ * Incompatible with: info\r
+ -target_only\r
+ Definition line should contain target GI only\r
+ * Incompatible with: pig, info, get_dups\r
+ -get_dups\r
+ Retrieve duplicate accessions\r
+ * Incompatible with: info, target_only\r
+\r
+ *** Output configuration options for FASTA format\r
+ -line_length <Integer, >=1>\r
+ Line length for output\r
+ Default = `80'\r
+ -ctrl_a\r
+ Use Ctrl-A as the non-redundant defline separator\r
+ * Incompatible with: info\r
--- /dev/null
+-bash-3.2$ /sw/opt/hmmer3/bin/jackhmmer -h\r
+# jackhmmer :: iteratively search a protein sequence against a protein database\r
+# HMMER 3.0 (March 2010); http://hmmer.org/\r
+# Copyright (C) 2010 Howard Hughes Medical Institute.\r
+# Freely distributed under the GNU General Public License (GPLv3).\r
+# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\r
+Usage: jackhmmer [-options] <query seqfile> <target seqdb>\r
+\r
+where basic options are:\r
+ -h : show brief help on version and usage\r
+ -N <n> : set maximum number of iterations to <n> [5] (n>0)\r
+\r
+options directing output:\r
+ -o <f> : direct output to file <f>, not stdout\r
+ -A <f> : save multiple alignment of hits to file <f>\r
+ --tblout <f> : save parseable table of per-sequence hits to file <f>\r
+ --domtblout <f> : save parseable table of per-domain hits to file <f>\r
+ --chkhmm <f> : save HMM checkpoints to files <f>-<iteration>.hmm\r
+ --chkali <f> : save alignment checkpoints to files <f>-<iteration>.sto\r
+ --acc : prefer accessions over names in output\r
+ --noali : don't output alignments, so output is smaller\r
+ --notextw : unlimit ASCII text output line width\r
+ --textw <n> : set max width of ASCII text output lines [120] (n>=120)\r
+\r
+options controlling scoring system in first iteration:\r
+ --popen <x> : gap open probability [0.02] (0<=x<0.5)\r
+ --pextend <x> : gap extend probability [0.4] (0<=x<1)\r
+ --mxfile <f> : substitution score matrix [default: BLOSUM62]\r
+\r
+options controlling reporting thresholds:\r
+ -E <x> : report sequences <= this E-value threshold in output [10.0] (x>0)\r
+ -T <x> : report sequences >= this score threshold in output\r
+ --domE <x> : report domains <= this E-value threshold in output [10.0] (x>0)\r
+ --domT <x> : report domains >= this score cutoff in output\r
+\r
+options controlling significance thresholds for inclusion in next round:\r
+ --incE <x> : consider sequences <= this E-value threshold as significant\r
+ --incT <x> : consider sequences >= this score threshold as significant\r
+ --incdomE <x> : consider domains <= this E-value threshold as significant\r
+ --incdomT <x> : consider domains >= this score threshold as significant\r
+\r
+options controlling acceleration heuristics:\r
+ --max : Turn all heuristic filters off (less speed, more power)\r
+ --F1 <x> : Stage 1 (MSV) threshold: promote hits w/ P <= F1 [0.02]\r
+ --F2 <x> : Stage 2 (Vit) threshold: promote hits w/ P <= F2 [1e-3]\r
+ --F3 <x> : Stage 3 (Fwd) threshold: promote hits w/ P <= F3 [1e-5]\r
+ --nobias : turn off composition bias filter\r
+\r
+options controlling model construction after first iteration:\r
+ --fast : assign cols w/ >= symfrac residues as consensus\r
+ --hand : manual construction (requires reference annotation)\r
+ --symfrac <x> : sets sym fraction controlling --fast construction\r
+ --fragthresh <x> : if L < x<L>, tag sequence as a fragment\r
+\r
+options controlling relative weights in models after first iteration:\r
+ --wpb : Henikoff position-based weights [default]\r
+ --wgsc : Gerstein/Sonnhammer/Chothia tree weights\r
+ --wblosum : Henikoff simple filter weights\r
+ --wnone : don't do any relative weighting; set all to 1\r
+ --wid <x> : for --wblosum: set identity cutoff [0.62] (0<=x<=1)\r
+\r
+options controlling effective seq number in models after first iteration:\r
+ --eent : adjust eff seq # to achieve relative entropy target [default]\r
+ --eclust : eff seq # is # of single linkage clusters\r
+ --enone : no effective seq # weighting: just use nseq\r
+ --eset <x> : set eff seq # for all models to <x>\r
+ --ere <x> : for --eent: set minimum rel entropy/position to <x>\r
+ --esigma <x> : for --eent: set sigma param to <x> [45.0]\r
+ --eid <x> : for --eclust: set fractional identity cutoff to <x> [0.62]\r
+\r
+Options controlling E value calibration:\r
+ --EmL <n> : length of sequences for MSV Gumbel mu fit [200] (n>0)\r
+ --EmN <n> : number of sequences for MSV Gumbel mu fit [200] (n>0)\r
+ --EvL <n> : length of sequences for Viterbi Gumbel mu fit [200] (n>0)\r
+ --EvN <n> : number of sequences for Viterbi Gumbel mu fit [200] (n>0)\r
+ --EfL <n> : length of sequences for Forward exp tail tau fit [100] (n>0)\r
+ --EfN <n> : number of sequences for Forward exp tail tau fit [200] (n>0)\r
+ --Eft <x> : tail mass for Forward exponential tail tau fit [0.04] (0<x<1)\r
+\r
+Other expert options:\r
+ --nonull2 : turn off biased composition score corrections\r
+ -Z <x> : set # of comparisons done, for E-value calculation\r
+ --domZ <x> : set # of significant seqs, for domain E-value calculation\r
+ --seed <n> : set RNG seed to <n> (if 0: one-time arbitrary seed) [42]\r
+ --qformat <s> : assert query <seqfile> is in format <s>: no autodetection\r
+ --tformat <s> : assert target <seqdb> is in format <s>: no autodetection\r
+ --cpu <n> : number of parallel CPU workers to use for multithreads\r
--- /dev/null
+ /local/gjb_lab/blast+/bin/psiblast -help\r
+USAGE\r
+ psiblast [-h] [-help] [-import_search_strategy filename]\r
+ [-export_search_strategy filename] [-db database_name]\r
+ [-dbsize num_letters] [-gilist filename] [-negative_gilist filename]\r
+ [-entrez_query entrez_query] [-subject subject_input_file]\r
+ [-subject_loc range] [-query input_file] [-out output_file]\r
+ [-evalue evalue] [-word_size int_value] [-gapopen open_penalty]\r
+ [-gapextend extend_penalty] [-xdrop_ungap float_value]\r
+ [-xdrop_gap float_value] [-xdrop_gap_final float_value]\r
+ [-searchsp int_value] [-seg SEG_options] [-soft_masking soft_masking]\r
+ [-matrix matrix_name] [-threshold float_value] [-culling_limit int_value]\r
+ [-best_hit_overhang float_value] [-best_hit_score_edge float_value]\r
+ [-window_size int_value] [-lcase_masking] [-query_loc range]\r
+ [-parse_deflines] [-outfmt format] [-show_gis]\r
+ [-num_descriptions int_value] [-num_alignments int_value] [-html]\r
+ [-max_target_seqs num_sequences] [-num_threads int_value] [-remote]\r
+ [-comp_based_stats compo] [-use_sw_tback] [-gap_trigger float_value]\r
+ [-num_iterations int_value] [-out_pssm checkpoint_file]\r
+ [-out_ascii_pssm ascii_mtx_file] [-in_msa align_restart]\r
+ [-in_pssm psi_chkpt_file] [-pseudocount pseudocount]\r
+ [-inclusion_ethresh ethresh] [-phi_pattern file] [-version]\r
+\r
+DESCRIPTION\r
+ Position-Specific Initiated BLAST 2.2.22+\r
+\r
+OPTIONAL ARGUMENTS\r
+ -h\r
+ Print USAGE and DESCRIPTION; ignore other arguments\r
+ -help\r
+ Print USAGE, DESCRIPTION and ARGUMENTS description; ignore other arguments\r
+ -version\r
+ Print version number; ignore other arguments\r
+\r
+ *** Input query options\r
+ -query <File_In>\r
+ Input file name\r
+ Default = `-'\r
+ * Incompatible with: in_msa, in_pssm\r
+ -query_loc <String>\r
+ Location on the query sequence (Format: start-stop)\r
+\r
+ *** General search options\r
+ -db <String>\r
+ BLAST database name\r
+ * Incompatible with: subject, subject_loc\r
+ -out <File_Out>\r
+ Output file name\r
+ Default = `-'\r
+ -evalue <Real>\r
+ Expectation value (E) threshold for saving hits\r
+ Default = `10'\r
+ -word_size <Integer, >=2>\r
+ Word size for wordfinder algorithm\r
+ -gapopen <Integer>\r
+ Cost to open a gap\r
+ -gapextend <Integer>\r
+ Cost to extend a gap\r
+ -matrix <String>\r
+ Scoring matrix name\r
+ Default = `BLOSUM62'\r
+ -threshold <Real, >=0>\r
+ Minimum word score such that the word is added to the BLAST lookup table\r
+ -comp_based_stats <String>\r
+ Use composition-based statistics for blastp / tblastn:\r
+ D or d: default (equivalent to 2)\r
+ 0 or F or f: no composition-based statistics\r
+ 1: Composition-based statistics as in NAR 29:2994-3005, 2001\r
+ 2 or T or t : Composition-based score adjustment as in Bioinformatics\r
+ 21:902-911,\r
+ 2005, conditioned on sequence properties\r
+ 3: Composition-based score adjustment as in Bioinformatics 21:902-911,\r
+ 2005, unconditionally\r
+ For programs other than tblastn, must either be absent or be D, F or 0\r
+ Default = `2'\r
+\r
+ *** BLAST-2-Sequences options\r
+ -subject <File_In>\r
+ Subject sequence(s) to search\r
+ * Incompatible with: db, gilist, negative_gilist\r
+ -subject_loc <String>\r
+ Location on the subject sequence (Format: start-stop)\r
+ * Incompatible with: db, gilist, negative_gilist, remote\r
+\r
+ *** Formatting options\r
+ -outfmt <String>\r
+ alignment view options:\r
+ 0 = pairwise,\r
+ 1 = query-anchored showing identities,\r
+ 2 = query-anchored no identities,\r
+ 3 = flat query-anchored, show identities,\r
+ 4 = flat query-anchored, no identities,\r
+ 5 = XML Blast output,\r
+ 6 = tabular,\r
+ 7 = tabular with comment lines,\r
+ 8 = Text ASN.1,\r
+ 9 = Binary ASN.1\r
+ 10 = Comma-separated values\r
+\r
+ Options 6, 7, and 10 can be additionally configured to produce\r
+ a custom format specified by space delimited format specifiers.\r
+ The supported format specifiers are:\r
+ qseqid means Query Seq-id\r
+ qgi means Query GI\r
+ qacc means Query accession\r
+ sseqid means Subject Seq-id\r
+ sallseqid means All subject Seq-id(s), separated by a ';'\r
+ sgi means Subject GI\r
+ sallgi means All subject GIs\r
+ sacc means Subject accession\r
+ sallacc means All subject accessions\r
+ qstart means Start of alignment in query\r
+ qend means End of alignment in query\r
+ sstart means Start of alignment in subject\r
+ send means End of alignment in subject\r
+ qseq means Aligned part of query sequence\r
+ sseq means Aligned part of subject sequence\r
+ evalue means Expect value\r
+ bitscore means Bit score\r
+ score means Raw score\r
+ length means Alignment length\r
+ pident means Percentage of identical matches\r
+ nident means Number of identical matches\r
+ mismatch means Number of mismatches\r
+ positive means Number of positive-scoring matches\r
+ gapopen means Number of gap openings\r
+ gaps means Total number of gaps\r
+ ppos means Percentage of positive-scoring matches\r
+ frames means Query and subject frames separated by a '/'\r
+ qframe means Query frame\r
+ sframe means Subject frame\r
+ When not provided, the default value is:\r
+ 'qseqid sseqid pident length mismatch gapopen qstart qend sstart send\r
+ evalue bitscore', which is equivalent to the keyword 'std'\r
+ Default = `0'\r
+ -show_gis\r
+ Show NCBI GIs in deflines?\r
+ -num_descriptions <Integer, >=0>\r
+ Number of database sequences to show one-line descriptions for\r
+ Default = `500'\r
+ -num_alignments <Integer, >=0>\r
+ Number of database sequences to show alignments for\r
+ Default = `250'\r
+ -html\r
+ Produce HTML output?\r
+\r
+ *** Query filtering options\r
+ -seg <String>\r
+ Filter query sequence with SEG (Format: 'yes', 'window locut hicut', or\r
+ 'no' to disable)\r
+ Default = `no'\r
+ -soft_masking <Boolean>\r
+ Apply filtering locations as soft masks\r
+ Default = `false'\r
+ -lcase_masking\r
+ Use lower case filtering in query and subject sequence(s)?\r
+\r
+ *** Restrict search or results\r
+ -gilist <String>\r
+ Restrict search of database to list of GI's\r
+ * Incompatible with: negative_gilist, remote, subject, subject_loc\r
+ -negative_gilist <String>\r
+ Restrict search of database to everything except the listed GIs\r
+ * Incompatible with: gilist, remote, subject, subject_loc\r
+ -entrez_query <String>\r
+ Restrict search with the given Entrez query\r
+ * Requires: remote\r
+ -culling_limit <Integer, >=0>\r
+ If the query range of a hit is enveloped by that of at least this many\r
+ higher-scoring hits, delete the hit\r
+ * Incompatible with: best_hit_overhang, best_hit_score_edge\r
+ -best_hit_overhang <Real, (>=0 and =<0.5)>\r
+ Best Hit algorithm overhang value (recommended value: 0.1)\r
+ * Incompatible with: culling_limit\r
+ -best_hit_score_edge <Real, (>=0 and =<0.5)>\r
+ Best Hit algorithm score edge value (recommended value: 0.1)\r
+ * Incompatible with: culling_limit\r
+ -max_target_seqs <Integer, >=1>\r
+ Maximum number of aligned sequences to keep\r
+\r
+ *** Statistical options\r
+ -dbsize <Int8>\r
+ Effective length of the database\r
+ -searchsp <Int8, >=0>\r
+ Effective length of the search space\r
+\r
+ *** Search strategy options\r
+ -import_search_strategy <File_In>\r
+ Search strategy to use\r
+ * Incompatible with: export_search_strategy\r
+ -export_search_strategy <File_Out>\r
+ File name to record the search strategy used\r
+ * Incompatible with: import_search_strategy\r
+\r
+ *** Extension options\r
+ -xdrop_ungap <Real>\r
+ X-dropoff value (in bits) for ungapped extensions\r
+ -xdrop_gap <Real>\r
+ X-dropoff value (in bits) for preliminary gapped extensions\r
+ -xdrop_gap_final <Real>\r
+ X-dropoff value (in bits) for final gapped alignment\r
+ -window_size <Integer, >=0>\r
+ Multiple hits window size, use 0 to specify 1-hit algorithm\r
+ -gap_trigger <Real>\r
+ Number of bits to trigger gapping\r
+ Default = `22'\r
+\r
+ *** Miscellaneous options\r
+ -parse_deflines\r
+ Should the query and subject defline(s) be parsed?\r
+ -num_threads <Integer, >=1>\r
+ Number of threads to use in the BLAST search\r
+ Default = `1'\r
+ * Incompatible with: remote\r
+ -remote\r
+ Execute search remotely?\r
+ * Incompatible with: gilist, negative_gilist, subject_loc, num_threads,\r
+ num_iterations\r
+ -use_sw_tback\r
+ Compute locally optimal Smith-Waterman alignments?\r
+\r
+ *** PSI-BLAST options\r
+ -num_iterations <Integer, >=1>\r
+ Number of iterations to perform\r
+ Default = `1'\r
+ * Incompatible with: remote\r
+ -out_pssm <File_Out>\r
+ File name to store checkpoint file\r
+ -out_ascii_pssm <File_Out>\r
+ File name to store ASCII version of PSSM\r
+ -in_msa <File_In>\r
+ File name of multiple sequence alignment to restart PSI-BLAST\r
+ * Incompatible with: in_pssm, query\r
+ -in_pssm <File_In>\r
+ PSI-BLAST checkpoint file\r
+ * Incompatible with: in_msa, query, phi_pattern\r
+\r
+ *** PSSM engine options\r
+ -pseudocount <Integer>\r
+ Pseudo-count value used when constructing PSSM\r
+ Default = `0'\r
+ -inclusion_ethresh <Real>\r
+ E-value inclusion threshold for pairwise alignments\r
+ Default = `0.002'\r
+\r
+ *** PHI-BLAST options\r
+ -phi_pattern <File_In>\r
+ File name containing pattern to search\r
+ * Incompatible with: in_pssm\r
--- /dev/null
+/* Copyright (c) 2009 Peter Troshin\r
+ * \r
+ * Jalview Web Services @version: 2.0 \r
+ * \r
+ * This library is free software; you can redistribute it and/or modify it under the terms of the\r
+ * Apache License version 2 as published by the Apache Software Foundation\r
+ * \r
+ * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without\r
+ * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache \r
+ * License for more details.\r
+ * \r
+ * A copy of the license is in apache_license.txt. It is also available here:\r
+ * @see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
+ * \r
+ * Any republication or derived work distributed in source code form\r
+ * must include this copyright and license notice.\r
+ */\r
+package compbio.pipeline._jpred;\r
+\r
+import java.io.FileNotFoundException;\r
+import java.util.HashSet;\r
+import java.util.Set;\r
+\r
+import javax.xml.stream.XMLStreamException;\r
+\r
+\r
+public class BlastBlastComparator {\r
+\r
+    /**\r
+     * Compares two BLAST XML reports iteration by iteration. For every\r
+     * iteration of the first report, prints the number of hits found in both\r
+     * reports and then the hits of the first report that are absent from the\r
+     * second one.\r
+     * \r
+     * @param args\r
+     *            args[0] and args[1] are the names of the two BLAST XML output\r
+     *            files to compare\r
+     * @throws XMLStreamException\r
+     *             if a report cannot be parsed as XML\r
+     * @throws FileNotFoundException\r
+     *             if a report file cannot be opened\r
+     */\r
+    public static void main(String[] args) throws FileNotFoundException,\r
+            XMLStreamException {\r
+        if (args == null || args.length < 2) {\r
+            System.err.println("Usage: BlastBlastComparator <blast_xml_1> <blast_xml_2>");\r
+            return;\r
+        }\r
+        BlastParser res1 = new BlastParser(args[0]);\r
+        BlastParser res2 = new BlastParser(args[1]);\r
+        assert res1.iters.size() == res2.iters.size();\r
+\r
+        for (Integer iterNum : res1.iters.keySet()) {\r
+            Set<Hit> list = res1.iters.get(iterNum);\r
+            Set<Hit> otherList = res2.iters.get(iterNum);\r
+            if (otherList == null) {\r
+                // The second report has no such iteration (assertions may be\r
+                // disabled): treat it as empty instead of failing with an NPE.\r
+                otherList = new HashSet<Hit>();\r
+            }\r
+            System.out.print("Iter " + iterNum + " arg0: " + list.size());\r
+            System.out.println(" arg1: " + otherList.size());\r
+            BlastParser.printNames(getDiff(list, otherList));\r
+        }\r
+    }\r
+\r
+    /**\r
+     * Returns the hits that are in <code>list</code> but not in\r
+     * <code>otherList</code>. Neither argument is modified.\r
+     * \r
+     * @param list\r
+     *            the hits to filter\r
+     * @param otherList\r
+     *            the hits to exclude\r
+     * @return a new set holding the difference list \ otherList\r
+     */\r
+    static Set<Hit> getDiff(Set<Hit> list, Set<Hit> otherList) {\r
+        Set<Hit> diff = new HashSet<Hit>(list);\r
+        diff.removeAll(otherList);\r
+        return diff;\r
+    }\r
+\r
+}\r
--- /dev/null
+/* Copyright (c) 2009 Peter Troshin\r
+ * \r
+ * Jalview Web Services @version: 2.0 \r
+ * \r
+ * This library is free software; you can redistribute it and/or modify it under the terms of the\r
+ * Apache License version 2 as published by the Apache Software Foundation\r
+ * \r
+ * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without\r
+ * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache \r
+ * License for more details.\r
+ * \r
+ * A copy of the license is in apache_license.txt. It is also available here:\r
+ * @see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
+ * \r
+ * Any republication or derived work distributed in source code form\r
+ * must include this copyright and license notice.\r
+ */\r
+package compbio.pipeline._jpred;\r
+\r
+import java.io.IOException;\r
+import java.util.HashSet;\r
+import java.util.Set;\r
+\r
+import javax.xml.stream.XMLStreamException;\r
+\r
+public class BlastHmmerComparator {\r
+\r
+ /**\r
+ * args[0] is assumed to be the name of a Blast output file\r
+ * \r
+ * @throws XMLStreamException\r
+ * @throws IOException\r
+ */\r
+ public static void main(String[] args) throws XMLStreamException,\r
+ IOException {\r
+ BlastParser res1 = new BlastParser(args[0]);\r
+ JackHmmerHitParser res2 = new JackHmmerHitParser(args[1]);\r
+ Set<Hit> list = res1.iters.get(3);\r
+ Set<Hit> otherList = res2.hits;\r
+ System.out.print("Iter " + 3 + " arg0: " + list.size());\r
+ System.out.println(" arg1: " + otherList.size());\r
+ BlastParser.printNames(getDiff(list, otherList));\r
+ BlastParser.printNames(getDiff(otherList, list));\r
+ // System.out.println("Diffs: " + getDiff(list, otherList));\r
+ // System.out.println("Diffs: " + getDiff(otherList, list));\r
+\r
+ }\r
+\r
+ static Set<Hit> getDiff(Set<Hit> list, Set<Hit> otherList) {\r
+ Set diff = new HashSet<Hit>();\r
+ for (Hit pseq : list) {\r
+ if (otherList.contains(pseq)) {\r
+ continue;\r
+ }\r
+ diff.add(pseq);\r
+ }\r
+ return diff;\r
+ }\r
+\r
+}\r
* must include this copyright and license notice.\r
*/\r
package compbio.pipeline._jpred;\r
-import java.io.*;\r
-import java.util.*;\r
+\r
+import java.io.BufferedInputStream;\r
+import java.io.File;\r
+import java.io.FileInputStream;\r
+import java.io.FileNotFoundException;\r
+import java.util.Collection;\r
+import java.util.HashMap;\r
+import java.util.HashSet;\r
+import java.util.Map;\r
+import java.util.Set;\r
\r
import javax.xml.stream.XMLInputFactory;\r
import javax.xml.stream.XMLStreamException;\r
import javax.xml.stream.XMLStreamReader;\r
\r
-\r
public class BlastParser {\r
- \r
- static class Psiseq {\r
- String id;\r
- String seq;\r
+\r
+    /** Hits of every iteration of the report, keyed by iteration number. */\r
+    Map<Integer, Set<Hit>> iters;\r
+\r
+    /**\r
+     * Parses a BLAST XML report and stores the hits of every iteration in\r
+     * {@link #iters}. The iteration number is printed to standard output as\r
+     * soon as it is read. The input stream and the XML reader are always\r
+     * closed, whether parsing succeeds or fails.\r
+     * \r
+     * @param file\r
+     *            name of the BLAST XML output file\r
+     * @throws FileNotFoundException\r
+     *             if the file cannot be opened\r
+     * @throws XMLStreamException\r
+     *             if the file is not well-formed XML\r
+     */\r
+    public BlastParser(String file) throws FileNotFoundException,\r
+            XMLStreamException {\r
+        this.iters = new HashMap<Integer, Set<Hit>>();\r
+        BufferedInputStream in = new BufferedInputStream(new FileInputStream(\r
+                new File(file)));\r
+        XMLStreamReader r = null;\r
+        try {\r
+            r = XMLInputFactory.newInstance().createXMLStreamReader(in);\r
+            Set<Hit> pl = new HashSet<Hit>();\r
+            Hit psi = null;\r
+            Integer iternum = null;\r
+            while (r.hasNext()) {\r
+                r.next();\r
+                if (r.isStartElement()) {\r
+                    String name = r.getLocalName();\r
+                    if (name.equals("Iteration_iter-num")) {\r
+                        iternum = Integer.parseInt(r.getElementText().trim());\r
+                        System.out.println("Iter " + iternum);\r
+                    } else if (name.equals("Hit")) {\r
+                        psi = new Hit();\r
+                    } else if (name.equals("Hit_num")) {\r
+                        psi.number = r.getElementText();\r
+                    } else if (name.equals("Hit_accession")) {\r
+                        psi.accession = r.getElementText();\r
+                    } else if (name.equals("Hit_def")) {\r
+                        // the hit name is the first whitespace-delimited\r
+                        // token of the definition line\r
+                        psi.name = r.getElementText().split("\\s+")[0].trim();\r
+                    } else if (name.equals("Hsp_hseq")) {\r
+                        psi.seq = r.getElementText();\r
+                    } else if (name.equals("Hsp_evalue")) {\r
+                        psi.evalue = r.getElementText();\r
+                    }\r
+                }\r
+\r
+                // getElementText() leaves the reader on the end tag of the\r
+                // element it has just read, so this test runs after it too\r
+                if (r.isEndElement()) {\r
+                    String name = r.getLocalName();\r
+                    if (name.equals("Hit")) {\r
+                        boolean added = pl.add(psi);\r
+                        assert added : "Expect unique elements only!";\r
+                        psi = null;\r
+                    } else if (name.equals("Iteration")) {\r
+                        iters.put(iternum, pl);\r
+                        pl = new HashSet<Hit>();\r
+                    }\r
+                }\r
}\r
+        } finally {\r
+            // XMLStreamReader.close() does not close the underlying stream,\r
+            // so both are released here; failures on close are ignored so\r
+            // that they never mask an exception raised while parsing.\r
+            if (r != null) {\r
+                try {\r
+                    r.close();\r
+                } catch (XMLStreamException ignored) {\r
+                    // nothing more can be done\r
+                }\r
+            }\r
+            try {\r
+                in.close();\r
+            } catch (java.io.IOException ignored) {\r
+                // nothing more can be done\r
+            }\r
+        }\r
- /**\r
- * args[0] is assumed to be the name of a Blast output file\r
- * @throws XMLStreamException \r
- * @throws FileNotFoundException \r
- */\r
- public static void main(String[] args) throws FileNotFoundException, XMLStreamException {\r
- XMLInputFactory f = XMLInputFactory.newInstance();\r
- XMLStreamReader r = f.createXMLStreamReader( new BufferedInputStream(new FileInputStream(new File(args[0]))));\r
- List<Psiseq> pl = new ArrayList<Psiseq>();\r
- Psiseq psi = null;\r
- while(r.hasNext()) {\r
- int idx = r.next(); \r
- //System.out.println(idx);\r
\r
- if(r.isStartElement()) {\r
- String name = r.getLocalName();\r
- if(name.equals("Hit") ) {\r
- psi = new Psiseq();\r
- }\r
- if(name.equals("Hit_id") ) {\r
- //System.out.println(r.getElementText());\r
- psi.id = r.getElementText();\r
- System.out.println(psi.id);\r
- } \r
- if(name.equals("Hsp_hseq")) {\r
- psi.seq = r.getElementText();\r
- System.out.println(psi.seq);\r
- } \r
- }\r
- \r
- if(r.isEndElement()) {\r
- String name = r.getLocalName();\r
- if(name.equals("Hit") ) {\r
- pl.add(psi);\r
- psi = null;\r
- }\r
- }\r
+ }\r
\r
- } \r
+    /**\r
+     * Prints the hits of every iteration of a BLAST XML report.\r
+     * \r
+     * @param args\r
+     *            args[0] is the name of a BLAST XML output file\r
+     * @throws XMLStreamException\r
+     *             if the report is not well-formed XML\r
+     * @throws FileNotFoundException\r
+     *             if the report file cannot be opened\r
+     */\r
+    public static void main(String[] args) throws FileNotFoundException,\r
+            XMLStreamException {\r
+        if (args == null || args.length < 1) {\r
+            System.err.println("Usage: BlastParser <blast_xml>");\r
+            return;\r
+        }\r
+        printHits(new BlastParser(args[0]).iters);\r
+    }\r
\r
+    /**\r
+     * Prints each iteration number followed by the hits of that iteration.\r
+     * \r
+     * @param iterNumPsiSeqs\r
+     *            hits keyed by iteration number\r
+     */\r
+    static final void printHits(Map<Integer, Set<Hit>> iterNumPsiSeqs) {\r
+        for (Map.Entry<Integer, Set<Hit>> iteration : iterNumPsiSeqs\r
+                .entrySet()) {\r
+            System.out.println("Iteration " + iteration.getKey());\r
+            printHits(iteration.getValue());\r
}\r
+    }\r
+\r
+    /**\r
+     * Prints the number of hits, then number, accession and name of each hit.\r
+     * \r
+     * @param psiseqs\r
+     *            the hits to print, must not be null\r
+     */\r
+    static final void printHits(Collection<Hit> psiseqs) {\r
+        assert psiseqs != null;\r
+        System.out.println("Total hits: " + psiseqs.size());\r
+        for (Hit hit : psiseqs) {\r
+            System.out.println("Hit: " + hit.number + " Accession: "\r
+                    + hit.accession + " name " + hit.name);\r
+        }\r
+    }\r
+\r
+    /**\r
+     * Prints the number of hits, then one "number name" line per hit.\r
+     * \r
+     * @param psiseqs\r
+     *            the hits to print, must not be null\r
+     */\r
+    static final void printNames(Collection<Hit> psiseqs) {\r
+        assert psiseqs != null;\r
+        System.out.println("Total hits: " + psiseqs.size());\r
+        for (Hit hit : psiseqs) {\r
+            System.out.println(hit.number + " " + hit.name);\r
+        }\r
+    }\r
+\r
}\r
--- /dev/null
+/**\r
+ * \r
+ */\r
+package compbio.pipeline._jpred;\r
+\r
+import java.util.Comparator;\r
+\r
+public class Hit {\r
+    // All values are kept as the text read from the report.\r
+    String name;\r
+    String number;\r
+    String accession;\r
+    String seq;\r
+    String evalue;\r
+\r
+    /**\r
+     * One-line summary (terminated by a newline) of the accession, name,\r
+     * number and e-value; the sequence is left out.\r
+     */\r
+    @Override\r
+    public String toString() {\r
+        StringBuilder text = new StringBuilder();\r
+        text.append("accession=").append(accession);\r
+        text.append(", name=").append(name);\r
+        text.append(", num=").append(number);\r
+        text.append(", evalue=").append(evalue);\r
+        text.append("\n");\r
+        return text.toString();\r
+    }\r
+\r
+    /** Hash code derived from the name only, consistent with equals. */\r
+    @Override\r
+    public int hashCode() {\r
+        int nameHash = 0;\r
+        if (name != null) {\r
+            nameHash = name.hashCode();\r
+        }\r
+        return 31 + nameHash;\r
+    }\r
+\r
+    /** Two hits are equal when they are of the same class and name. */\r
+    @Override\r
+    public boolean equals(Object obj) {\r
+        if (this == obj) {\r
+            return true;\r
+        }\r
+        if (obj == null || getClass() != obj.getClass()) {\r
+            return false;\r
+        }\r
+        String otherName = ((Hit) obj).name;\r
+        return name == null ? otherName == null : name.equals(otherName);\r
+    }\r
+\r
+    /** Orders hits by their number, parsed as an integer. */\r
+    public static final class NumberComporator implements Comparator<Hit> {\r
+        @Override\r
+        public int compare(Hit o1, Hit o2) {\r
+            int first = Integer.parseInt(o1.number);\r
+            int second = Integer.parseInt(o2.number);\r
+            if (first == second) {\r
+                return 0;\r
+            }\r
+            return first < second ? -1 : 1;\r
+        }\r
+    }\r
+\r
+    /** Orders hits by their e-value, parsed as a double. */\r
+    public static final class EvalueComporator implements Comparator<Hit> {\r
+        @Override\r
+        public int compare(Hit o1, Hit o2) {\r
+            double first = Double.parseDouble(o1.evalue);\r
+            double second = Double.parseDouble(o2.evalue);\r
+            return Double.compare(first, second);\r
+        }\r
+    }\r
+}
\ No newline at end of file
--- /dev/null
+package compbio.pipeline._jpred;\r
+\r
+import java.io.BufferedReader;\r
+import java.io.FileInputStream;\r
+import java.io.IOException;\r
+import java.io.InputStreamReader;\r
+import java.util.ArrayList;\r
+import java.util.Collections;\r
+import java.util.HashSet;\r
+import java.util.List;\r
+import java.util.Scanner;\r
+import java.util.Set;\r
+\r
+/**\r
+ * Parser for jackhmmer tabular per-sequence hit files (--tblout output).\r
+ * \r
+ * @author pvtroshin\r
+ * \r
+ */\r
+public class JackHmmerHitParser {\r
+ //# --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----\r
+ //# target name accession query name accession E-value score bias E-value score bias exp reg clu ov env dom rep inc description of target\r
+ //# ------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ ----- --- --- --- --- --- --- --- --- ---------------------\r
+ //tr|Q6TVU2|Q6TVU2_ORFV - gi_74230740_gb_ABA00545.1 - 4.5e-271 910.4 0.0 5.1e-271 910.2 0.0 1.0 1 0 0 1 1 1 1 Putative uncharacterized protein OS=Orf virus PE=4 SV=1\r
+\r
+ Set<Hit> hits;\r
+\r
+ public JackHmmerHitParser(String file) throws IOException {\r
+\r
+ BufferedReader bfr = new BufferedReader(new InputStreamReader(\r
+ new FileInputStream(file), "ISO-8859-1"), 64000);\r
+ // throw away first three lines; \r
+ this.hits = new HashSet<Hit>();\r
+ String line = bfr.readLine();\r
+ bfr.readLine();\r
+ bfr.readLine();\r
+ int hitc = 0;\r
+ while ((line = bfr.readLine()) != null) {\r
+ hitc++;\r
+ Scanner scan = new Scanner(line);\r
+ scan.useDelimiter("\\s+");\r
+ extractData(scan, hitc);\r
+ }\r
+ List<Hit> lhits = new ArrayList<Hit>(hits);\r
+ Collections.sort(lhits, new Hit.NumberComporator());\r
+ }\r
+\r
+ void extractData(Scanner scan, int hitcounter) {\r
+ Hit pseq = new Hit();\r
+\r
+ String tname = scan.next();\r
+ pseq.name = tname;\r
+ //System.out.println(tname);\r
+\r
+ String tacc = scan.next();\r
+ pseq.accession = tacc;\r
+ //System.out.println(tacc);\r
+ String qname = scan.next();\r
+ //System.out.println(qname);\r
+ String qacc = scan.next();\r
+ //System.out.println(qacc);\r
+\r
+ Double evalue = scan.nextDouble();\r
+ //System.out.println(evalue);\r
+ pseq.evalue = evalue.toString();\r
+\r
+ Double score = scan.nextDouble();\r
+ //System.out.println(score);\r
+ pseq.evalue = evalue.toString();\r
+ pseq.number = new Integer(hitcounter).toString();\r
+ boolean unique = hits.add(pseq);\r
+ assert unique : "Unique hits are expected!";\r
+ }\r
+\r
+ public static void main(String[] args) throws IOException {\r
+ assert args[0] != null;\r
+ JackHmmerHitParser parser = new JackHmmerHitParser(args[0]);\r
+ BlastParser.printHits(parser.hits);\r
+ }\r
+}\r
--- /dev/null
+/* Copyright (c) 2009 Peter Troshin\r
+ * \r
+ * Jalview Web Services @version: 2.0 \r
+ * \r
+ * This library is free software; you can redistribute it and/or modify it under the terms of the\r
+ * Apache License version 2 as published by the Apache Software Foundation\r
+ * \r
+ * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without\r
+ * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache \r
+ * License for more details.\r
+ * \r
+ * A copy of the license is in apache_license.txt. It is also available here:\r
+ * @see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
+ * \r
+ * Any republication or derived work distributed in source code form\r
+ * must include this copyright and license notice.\r
+ */\r
+\r
+package compbio.runner.disorder;\r
+\r
+import java.io.FileNotFoundException;\r
+import java.io.IOException;\r
+import java.util.Arrays;\r
+import java.util.List;\r
+\r
+import org.apache.log4j.Logger;\r
+\r
+import compbio.data.sequence.Alignment;\r
+import compbio.data.sequence.UnknownFileFormatException;\r
+import compbio.engine.client.Executable;\r
+import compbio.engine.client.PipedExecutable;\r
+import compbio.engine.client.SkeletalExecutable;\r
+import compbio.metadata.Limit;\r
+import compbio.metadata.LimitsManager;\r
+import compbio.metadata.ResultNotAvailableException;\r
+import compbio.runner.Util;\r
+\r
+public class RonnWrapper extends SkeletalExecutable<RonnWrapper> implements\r
+ PipedExecutable<RonnWrapper> {\r
+ /*\r
+ * RONN does not accept stdin the file name must be defined as parameter It\r
+ * can only analyse ONE sequence per run! (or may be not, but the results\r
+ * gets overriden!) FASTA format is accepted.\r
+ * \r
+ * To run it do the following:\r
+ * \r
+ * 1) copy ronn executables and task file to work directory\r
+ * \r
+ * 2) execute run processes one by one for each sequence\r
+ */\r
+\r
+ private static final String command = "/homes/pvtroshin/soft/RONNv3_fasta/Ronn_runner.sh";\r
+\r
+ private static Logger log = Logger.getLogger(RonnWrapper.class);\r
+\r
+ // Cache for Limits information\r
+ private static LimitsManager<RonnWrapper> limits;\r
+\r
+ public static final String KEY_VALUE_SEPARATOR = Util.SPACE;\r
+\r
+ @SuppressWarnings("unchecked")\r
+ @Override\r
+ public Alignment getResults(String workDirectory)\r
+ throws ResultNotAvailableException {\r
+ try {\r
+ return Util.readClustalFile(workDirectory, getOutput());\r
+ } catch (FileNotFoundException e) {\r
+ log.error(e.getMessage(), e.getCause());\r
+ throw new ResultNotAvailableException(e);\r
+ } catch (IOException e) {\r
+ log.error(e.getMessage(), e.getCause());\r
+ throw new ResultNotAvailableException(e);\r
+ } catch (UnknownFileFormatException e) {\r
+ log.error(e.getMessage(), e.getCause());\r
+ throw new ResultNotAvailableException(e);\r
+ } catch (NullPointerException e) {\r
+ log.error(e.getMessage(), e.getCause());\r
+ throw new ResultNotAvailableException(e);\r
+ }\r
+ }\r
+\r
+ @Override\r
+ public List<String> getCreatedFiles() {\r
+ return Arrays.asList(getOutput(), getError());\r
+ }\r
+\r
+ @Override\r
+ public RonnWrapper setInput(String inFile) {\r
+ String input = getInput();\r
+ super.setInput(inFile);\r
+ return this;\r
+ }\r
+\r
+ @Override\r
+ public Limit<RonnWrapper> getLimit(String presetName) {\r
+ if (limits == null) {\r
+ limits = getLimits();\r
+ }\r
+ Limit<RonnWrapper> limit = null;\r
+ if (limits != null) {\r
+ // this returns default limit if preset is undefined!\r
+ limit = limits.getLimitByName(presetName);\r
+ }\r
+ // If limit is not defined for a particular preset, then return default\r
+ // limit\r
+ if (limit == null) {\r
+ log.debug("Limit for the preset " + presetName\r
+ + " is not found. Using default");\r
+ limit = limits.getDefaultLimit();\r
+ }\r
+ return limit;\r
+ }\r
+\r
+ @Override\r
+ public LimitsManager<RonnWrapper> getLimits() {\r
+ // synchronise on static field\r
+ synchronized (log) {\r
+ if (limits == null) {\r
+ limits = Util.getLimits(this.getClass());\r
+ }\r
+ }\r
+ return limits;\r
+ }\r
+\r
+ @Override\r
+ public Class<? extends Executable<?>> getType() {\r
+ return this.getClass();\r
+ }\r
+}\r