1 /*****************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 *****************************************************************/
11 #ifndef SQUID_MSA_INCLUDED
12 #define SQUID_MSA_INCLUDED
15 * SRE, Mon May 17 10:24:30 1999
17 * Header file for SQUID's multiple sequence alignment
20 * RCS $Id: msa.h,v 1.1.1.1 2005/03/22 08:34:19 cmzmasek Exp $
23 #include <stdio.h> /* FILE support */
24 #include "gki.h" /* hash table support */
25 #include "ssi.h" /* sequence file index support */
26 #include "squid.h" /* need SQINFO */
28 /****************************************************
29 * Obsolete alignment information, AINFO
30 * Superseded by MSA structure further below; but we
31 * need AINFO for the near future for backwards
33 ****************************************************/
34 /* Structure: aliinfo_s
36 * Purpose: Optional information returned from an alignment file.
38 * flags: always used. Flags for which info is valid/alloced.
40 * alen: mandatory. Alignments are always flushed right
41 * with gaps so that all aseqs are the same length, alen.
42 * Available for all alignment formats.
44 * nseq: mandatory. Aligned seqs are indexed 0..nseq-1.
46 * wgt: 0..nseq-1 vector of sequence weights. Mandatory.
47 * If not explicitly set, weights are initialized to 1.0.
49 * cs: 0..alen-1, just like the alignment. Contains single-letter
50 * secondary structure codes for consensus structure; "<>^+"
51 * for RNA, "EHL." for protein. May be NULL if unavailable
52 * from seqfile. Only available for SELEX format files.
54 * rf: 0..alen-1, just like the alignment. rf is an arbitrary string
55 * of characters, used for annotating columns. Blanks are
56 * interpreted as non-canonical columns and anything else is
57 * considered canonical. Only available from SELEX files.
59 * sqinfo: mandatory. Array of 0..nseq-1
60 * per-sequence information structures, carrying
61 * name, id, accession, coords.
65 int flags; /* flags for what info is valid */
66 int alen; /* length of alignment (columns) */
67 int nseq; /* number of seqs in alignment */
68 float *wgt; /* sequence weights [0..nseq-1]; 1.0 each unless explicitly set */
69 char *cs; /* consensus secondary structure string [0..alen-1], or NULL if unavailable */
70 char *rf; /* reference coordinate system [0..alen-1]; blank = non-canonical column */
71 struct seqinfo_s *sqinfo; /* name, id, coord info for each sequence [0..nseq-1] */
73 /* Pfam/HMMER pick-ups */
74 char *name; /* name of alignment */
75 char *desc; /* description of alignment */
76 char *acc; /* accession of alignment */
77 char *au; /* "author" information */
78 float tc1, tc2; /* trusted score cutoffs (per-seq, per-domain) */
79 float nc1, nc2; /* noise score cutoffs (per-seq, per-domain) */
80 float ga1, ga2; /* gathering cutoffs (presumably per-seq, per-domain like tc/nc -- TODO confirm) */
82 typedef struct aliinfo_s AINFO; /* shorthand for the obsolete alignment info struct */
83 #define AINFO_TC (1 << 0) /* flags bit; presumably tc1,tc2 are valid -- TODO confirm */
84 #define AINFO_NC (1 << 1) /* flags bit; presumably nc1,nc2 are valid -- TODO confirm */
85 #define AINFO_GA (1 << 2) /* flags bit; presumably ga1,ga2 are valid -- TODO confirm */
87 /*****************************************************************
89 * SRE, Sun Jun 27 15:03:35 1999 [TW 723 over Greenland]
91 * Defines the new data structure and API for multiple
92 * sequence alignment i/o.
93 *****************************************************************/
96 * SRE, Tue May 18 11:33:08 1999
98 * Our object for a multiple sequence alignment.
100 typedef struct msa_struct {
101 /* Mandatory information associated with the alignment.
103 char **aseq; /* the alignment itself, [0..nseq-1][0..alen-1] */
104 char **sqname; /* names of sequences, [0..nseq-1][] (one string per sequence; length is not alen) */
105 float *wgt; /* sequence weights [0..nseq-1] */
106 int alen; /* length of alignment (columns) */
107 int nseq; /* number of seqs in alignment */
109 /* Optional information that we understand, and might have.
111 int flags; /* flags for what optional info is valid */
112 int type; /* kOtherSeq, kRNA/hmmNUCLEIC, or kAmino/hmmAMINO */
113 char *name; /* name of alignment, or NULL */
114 char *desc; /* description of alignment, or NULL */
115 char *acc; /* accession of alignment, or NULL */
116 char *au; /* "author" information, or NULL */
117 char *ss_cons; /* consensus secondary structure string, or NULL */
118 char *sa_cons; /* consensus surface accessibility string, or NULL */
119 char *rf; /* reference coordinate system, or NULL */
120 char **sqacc; /* accession numbers for individual sequences */
121 char **sqdesc; /* description lines for individual sequences */
122 char **ss; /* per-seq secondary structure annotation, or NULL */
123 char **sa; /* per-seq surface accessibility annotation, or NULL */
124 float tc1, tc2; /* trusted score cutoffs (per-seq, per-domain) */
125 float nc1, nc2; /* noise score cutoffs (per-seq, per-domain) */
126 float ga1, ga2; /* gathering cutoffs (per-seq, per-domain) */
128 /* Optional information that we don't understand.
129 * That is, we know what type of information it is, but it's
130 * either (interpreted as) free-text comment, or it's Stockholm
131 * markup with unfamiliar tags.
133 char **comment; /* free text comments, or NULL */
134 int ncomment; /* number of comment lines */
135 int alloc_ncomment; /* number of comment lines alloc'ed */
137 char **gf_tag; /* markup tags for unparsed #=GF lines */
138 char **gf; /* annotations for unparsed #=GF lines */
139 int ngf; /* number of unparsed #=GF lines */
140 int alloc_ngf; /* number of gf lines alloc'ed */
142 char **gs_tag; /* markup tags for unparsed #=GS lines */
143 char ***gs; /* [0..ngs-1][0..nseq-1][free text] markup */
144 GKI *gs_idx; /* hash of #=GS tag types */
145 int ngs; /* number of #=GS tag types */
147 char **gc_tag; /* markup tags for unparsed #=GC lines */
148 char **gc; /* [0..ngc-1][0..alen-1] markup */
149 GKI *gc_idx; /* hash of #=GC tag types */
150 int ngc; /* number of #=GC tag types */
152 char **gr_tag; /* markup tags for unparsed #=GR lines */
153 char ***gr; /* [0..ngr-1][0..nseq-1][0..alen-1] markup */
154 GKI *gr_idx; /* hash of #=GR tag types */
155 int ngr; /* number of #=GR tag types */
157 /* Stuff we need for our own maintenance of the data structure
159 GKI *index; /* name ->seqidx hash table */
160 int nseqalloc; /* number of seqs currently allocated for */
161 int nseqlump; /* lump size for dynamic expansions of nseq */
162 int *sqlen; /* individual sequence lengths during parsing */
163 int *sslen; /* individual ss lengths during parsing */
164 int *salen; /* individual sa lengths during parsing */
165 int lastidx; /* last index we saw; use for guessing next */
167 #define MSA_SET_TC (1 << 0) /* flags bit; presumably tc1,tc2 were set -- TODO confirm */
168 #define MSA_SET_NC (1 << 1) /* flags bit; presumably nc1,nc2 were set -- TODO confirm */
169 #define MSA_SET_GA (1 << 2) /* flags bit; presumably ga1,ga2 were set -- TODO confirm */
170 #define MSA_SET_WGT (1 << 3) /* flags bit; presumably wgt was explicitly set -- TODO confirm */
172 /* Structure: MSAFILE
173 * SRE, Tue May 18 11:36:54 1999
175 * Defines an alignment file that's open for reading.
177 typedef struct msafile_struct {
178 FILE *f; /* open file pointer */
179 char *fname; /* name of file; used for diagnostic output */
180 int linenumber; /* current line number in the file */
182 char *buf; /* buffer for line input w/ sre_fgets() */
183 int buflen; /* current allocated length for buf */
185 SSIFILE *ssi; /* open SSI index file; or NULL, if none. */
187 int do_gzip; /* TRUE if f is a pipe from gzip -dc (need pclose(f)) */
188 int do_stdin; /* TRUE if f is stdin (don't close f, not our problem) */
189 int format; /* format of alignment file we're reading; presumably an MSAFILE_* code -- TODO confirm */
193 /* Alignment file formats.
194 * Must coexist with sqio.c/squid.h unaligned file format codes.
196 * - 0 is an unknown/unassigned format
197 * - <100 reserved for unaligned formats
198 * - >100 reserved for aligned formats
200 #define MSAFILE_UNKNOWN 0 /* unknown format */
201 #define MSAFILE_STOCKHOLM 101 /* Pfam/HMMER's Stockholm format */
202 #define MSAFILE_SELEX 102 /* Obsolete(!): old HMMER/SELEX format */
203 #define MSAFILE_MSF 103 /* GCG MSF format */
204 #define MSAFILE_CLUSTAL 104 /* Clustal V/W format */
205 #define MSAFILE_A2M 105 /* aligned FASTA (A2M is UCSC terminology) */
206 #define MSAFILE_PHYLIP 106 /* Felsenstein's PHYLIP format */
207 #define MSAFILE_EPS 107 /* Encapsulated PostScript (output only) */
209 #define IsAlignmentFormat(fmt) ((fmt) > 100) /* nonzero for codes above 100, the range reserved for aligned formats */
214 extern MSAFILE *MSAFileOpen(char *filename, int format, char *env);
215 extern MSA *MSAFileRead(MSAFILE *afp);
216 extern void MSAFileClose(MSAFILE *afp);
217 extern void MSAFree(MSA *msa);
218 extern void MSAFileWrite(FILE *fp, MSA *msa, int outfmt, int do_oneline);
220 extern int MSAFileRewind(MSAFILE *afp);
221 extern int MSAFilePositionByKey(MSAFILE *afp, char *key);
222 extern int MSAFilePositionByIndex(MSAFILE *afp, int idx);
224 extern int MSAFileFormat(MSAFILE *afp);
225 extern MSA *MSAAlloc(int nseq, int alen);
226 extern void MSAExpand(MSA *msa);
227 extern char *MSAFileGetLine(MSAFILE *afp);
228 extern void MSASetSeqAccession(MSA *msa, int seqidx, char *acc);
229 extern void MSASetSeqDescription(MSA *msa, int seqidx, char *desc);
230 extern void MSAAddComment(MSA *msa, char *s);
231 extern void MSAAddGF(MSA *msa, char *tag, char *value);
232 extern void MSAAddGS(MSA *msa, char *tag, int seqidx, char *value);
233 extern void MSAAppendGC(MSA *msa, char *tag, char *value);
234 extern char *MSAGetGC(MSA *msa, char *tag);
235 extern void MSAAppendGR(MSA *msa, char *tag, int seqidx, char *value);
236 extern void MSAVerifyParse(MSA *msa);
237 extern int MSAGetSeqidx(MSA *msa, char *name, int guess);
239 extern MSA *MSAFromAINFO(char **aseq, AINFO *ainfo);
241 extern void MSAMingap(MSA *msa);
242 extern void MSANogap(MSA *msa);
243 extern void MSAShorterAlignment(MSA *msa, int *useme);
244 extern void MSASmallerAlignment(MSA *msa, int *useme, MSA **ret_new);
246 extern char *MSAGetSeqAccession(MSA *msa, int idx);
247 extern char *MSAGetSeqDescription(MSA *msa, int idx);
248 extern char *MSAGetSeqSS(MSA *msa, int idx);
249 extern char *MSAGetSeqSA(MSA *msa, int idx);
253 extern MSA *ReadA2M(MSAFILE *afp);
254 extern void WriteA2M(FILE *fp, MSA *msa);
258 extern MSA *ReadClustal(MSAFILE *afp);
259 extern void WriteClustal(FILE *fp, MSA *msa);
263 extern void EPSWriteSmallMSA(FILE *fp, MSA *msa);
267 extern MSA *ReadMSF(MSAFILE *afp);
268 extern void WriteMSF(FILE *fp, MSA *msa);
272 extern MSA *ReadPhylip(MSAFILE *afp);
273 extern void WritePhylip(FILE *fp, MSA *msa);
277 extern MSA *ReadSELEX(MSAFILE *afp);
278 extern void WriteSELEX(FILE *fp, MSA *msa);
282 extern MSA *ReadStockholm(MSAFILE *afp);
283 extern void WriteStockholm(FILE *fp, MSA *msa);
284 extern void WriteStockholmOneBlock(FILE *fp, MSA *msa);
286 #endif /*SQUID_MSA_INCLUDED*/