1 /*****************************************************************
2 * SQUID - a library of functions for biological sequence analysis
3 * Copyright (C) 1992-2002 Washington University School of Medicine
5 * This source code is freely distributed under the terms of the
6 * GNU General Public License. See the files COPYRIGHT and LICENSE
8 *****************************************************************/
10 #ifndef SQUID_MSA_INCLUDED
11 #define SQUID_MSA_INCLUDED
14 * SRE, Mon May 17 10:24:30 1999
16 * Header file for SQUID's multiple sequence alignment
19 * RCS $Id: msa.h 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: msa.h,v 1.12 2002/10/12 04:40:35 eddy Exp)
22 #include <stdio.h> /* FILE support */
23 #include "gki.h" /* hash table support */
24 #include "ssi.h" /* sequence file index support */
25 #include "squid.h" /* need SQINFO */
27 /****************************************************
28 * Obsolete alignment information, AINFO
29 * Superceded by MSA structure further below; but we
30 * need AINFO for the near future for backwards
32 ****************************************************/
33 /* Structure: aliinfo_s
35 * Purpose: Optional information returned from an alignment file.
37 * flags: always used. Flags for which info is valid/alloced.
39 * alen: mandatory. Alignments are always flushed right
40 * with gaps so that all aseqs are the same length, alen.
41 * Available for all alignment formats.
43 * nseq: mandatory. Aligned seqs are indexed 0..nseq-1.
45 * wgt: 0..nseq-1 vector of sequence weights. Mandatory.
46 * If not explicitly set, weights are initialized to 1.0.
48 * cs: 0..alen-1, just like the alignment. Contains single-letter
49 * secondary structure codes for consensus structure; "<>^+"
50 * for RNA, "EHL." for protein. May be NULL if unavailable
51 * from seqfile. Only available for SELEX format files.
53 * rf: 0..alen-1, just like the alignment. rf is an arbitrary string
54 * of characters, used for annotating columns. Blanks are
55 * interpreted as non-canonical columns and anything else is
56 * considered canonical. Only available from SELEX files.
58 * sqinfo: mandatory. Array of 0..nseq-1
59 * per-sequence information structures, carrying
60 * name, id, accession, coords.
64 int flags; /* flags for what info is valid */
65 int alen; /* length of alignment (columns) */
66 int nseq; /* number of seqs in alignment */
67 float *wgt; /* sequence weights [0..nseq-1] */
68 char *cs; /* consensus secondary structure string */
69 char *rf; /* reference coordinate system */
70 struct seqinfo_s *sqinfo; /* name, id, coord info for each sequence */
72 /* Pfam/HMMER pick-ups */
73 char *name; /* name of alignment */
74 char *desc; /* description of alignment */
75 char *acc; /* accession of alignment */
76 char *au; /* "author" information */
77 float tc1, tc2; /* trusted score cutoffs (per-seq, per-domain) */
78 float nc1, nc2; /* noise score cutoffs (per-seq, per-domain) */
79 float ga1, ga2; /* gathering cutoffs */
81 typedef struct aliinfo_s AINFO;
82 #define AINFO_TC (1 << 0)
83 #define AINFO_NC (1 << 1)
84 #define AINFO_GA (1 << 2)
86 /*****************************************************************
88 * SRE, Sun Jun 27 15:03:35 1999 [TW 723 over Greenland]
90 * Defines the new data structure and API for multiple
91 * sequence alignment i/o.
92 *****************************************************************/
94 /* The following constants define the Pfam/Rfam cutoff set we'll propagate
95 * from msa's into HMMER and Infernal models.
97 #define MSA_CUTOFF_TC1 0
98 #define MSA_CUTOFF_TC2 1
99 #define MSA_CUTOFF_GA1 2
100 #define MSA_CUTOFF_GA2 3
101 #define MSA_CUTOFF_NC1 4
102 #define MSA_CUTOFF_NC2 5
103 #define MSA_MAXCUTOFFS 6
106 * SRE, Tue May 18 11:33:08 1999
108 * Our object for a multiple sequence alignment.
110 typedef struct msa_struct {
111 /* Mandatory information associated with the alignment.
113 char **aseq; /* the alignment itself, [0..nseq-1][0..alen-1] */
114 char **sqname; /* names of sequences, [0..nseq-1][0..alen-1] */
115 float *wgt; /* sequence weights [0..nseq-1] */
116 int alen; /* length of alignment (columns) */
117 int nseq; /* number of seqs in alignment */
119 /* Optional information that we understand, and might have.
121 int flags; /* flags for what optional info is valid */
122 int type; /* kOtherSeq, kRNA/hmmNUCLEIC, or kAmino/hmmAMINO */
123 char *name; /* name of alignment, or NULL */
124 char *desc; /* description of alignment, or NULL */
125 char *acc; /* accession of alignment, or NULL */
126 char *au; /* "author" information, or NULL */
127 char *ss_cons; /* consensus secondary structure string, or NULL */
128 char *sa_cons; /* consensus surface accessibility string, or NULL */
129 char *rf; /* reference coordinate system, or NULL */
130 char **sqacc; /* accession numbers for individual sequences */
131 char **sqdesc; /* description lines for individual sequences */
132 char **ss; /* per-seq secondary structure annotation, or NULL */
133 char **sa; /* per-seq surface accessibility annotation, or NULL */
134 float cutoff[MSA_MAXCUTOFFS]; /* NC, TC, GA cutoffs propagated to Pfam/Rfam */
135 int cutoff_is_set[MSA_MAXCUTOFFS];/* TRUE if a cutoff is set; else FALSE */
137 /* Optional information that we don't understand.
138 * That is, we know what type of information it is, but it's
139 * either (interpreted as) free-text comment, or it's Stockholm
140 * markup with unfamiliar tags.
142 char **comment; /* free text comments, or NULL */
143 int ncomment; /* number of comment lines */
144 int alloc_ncomment; /* number of comment lines alloc'ed */
146 char **gf_tag; /* markup tags for unparsed #=GF lines */
147 char **gf; /* annotations for unparsed #=GF lines */
148 int ngf; /* number of unparsed #=GF lines */
149 int alloc_ngf; /* number of gf lines alloc'ed */
151 char **gs_tag; /* markup tags for unparsed #=GS lines */
152 char ***gs; /* [0..ngs-1][0..nseq-1][free text] markup */
153 GKI *gs_idx; /* hash of #=GS tag types */
154 int ngs; /* number of #=GS tag types */
156 char **gc_tag; /* markup tags for unparsed #=GC lines */
157 char **gc; /* [0..ngc-1][0..alen-1] markup */
158 GKI *gc_idx; /* hash of #=GC tag types */
159 int ngc; /* number of #=GC tag types */
161 char **gr_tag; /* markup tags for unparsed #=GR lines */
162 char ***gr; /* [0..ngr][0..nseq-1][0..alen-1] markup */
163 GKI *gr_idx; /* hash of #=GR tag types */
164 int ngr; /* number of #=GR tag types */
166 /* Stuff we need for our own maintenance of the data structure
168 GKI *index; /* name ->seqidx hash table */
169 int nseqalloc; /* number of seqs currently allocated for */
170 int nseqlump; /* lump size for dynamic expansions of nseq */
171 int *sqlen; /* individual sequence lengths during parsing */
172 int *sslen; /* individual ss lengths during parsing */
173 int *salen; /* individual sa lengths during parsing */
174 int lastidx; /* last index we saw; use for guessing next */
176 #define MSA_SET_WGT (1 << 0) /* track whether wgts were set, or left at default 1.0 */
179 /* Structure: MSAFILE
180 * SRE, Tue May 18 11:36:54 1999
182 * Defines an alignment file that's open for reading.
184 typedef struct msafile_struct {
185 FILE *f; /* open file pointer */
186 char *fname; /* name of file. used for diagnostic output */
187 int linenumber; /* what line are we on in the file */
189 char *buf; /* buffer for line input w/ sre_fgets() */
190 int buflen; /* current allocated length for buf */
192 SSIFILE *ssi; /* open SSI index file; or NULL, if none. */
194 int do_gzip; /* TRUE if f is a pipe from gzip -dc (need pclose(f)) */
195 int do_stdin; /* TRUE if f is stdin (don't close f, not our problem) */
196 int format; /* format of alignment file we're reading */
200 /* Alignment file formats.
201 * Must coexist with sqio.c/squid.h unaligned file format codes.
203 * - 0 is an unknown/unassigned format
204 * - <100 reserved for unaligned formats
205 * - >100 reserved for aligned formats
207 #define MSAFILE_UNKNOWN 0 /* unknown format */
208 #define MSAFILE_STOCKHOLM 101 /* Pfam/HMMER's Stockholm format */
209 #define MSAFILE_SELEX 102 /* Obsolete(!): old HMMER/SELEX format */
210 #define MSAFILE_MSF 103 /* GCG MSF format */
211 #define MSAFILE_CLUSTAL 104 /* Clustal V/W format */
212 #define MSAFILE_A2M 105 /* aligned FASTA (A2M is UCSC terminology) */
213 #define MSAFILE_PHYLIP 106 /* Felsenstein's PHYLIP format */
214 #define MSAFILE_EPS 107 /* Encapsulated PostScript (output only) */
216 #define MSAFILE_VIENNA 108 /* Vienna: concatenated fasta */
219 #define IsAlignmentFormat(fmt) ((fmt) > 100)
224 extern MSAFILE *MSAFileOpen(char *filename, int format, char *env);
225 extern MSA *MSAFileRead(MSAFILE *afp);
226 extern void MSAFileClose(MSAFILE *afp);
227 extern void MSAFree(MSA *msa);
228 extern void MSAFileWrite(FILE *fp, MSA *msa, int outfmt, int do_oneline);
230 extern int MSAFileRewind(MSAFILE *afp);
231 extern int MSAFilePositionByKey(MSAFILE *afp, char *key);
232 extern int MSAFilePositionByIndex(MSAFILE *afp, int idx);
234 extern int MSAFileFormat(MSAFILE *afp);
235 extern MSA *MSAAlloc(int nseq, int alen);
236 extern void MSAExpand(MSA *msa);
237 extern char *MSAFileGetLine(MSAFILE *afp);
238 extern void MSASetSeqAccession(MSA *msa, int seqidx, char *acc);
239 extern void MSASetSeqDescription(MSA *msa, int seqidx, char *desc);
240 extern void MSAAddComment(MSA *msa, char *s);
241 extern void MSAAddGF(MSA *msa, char *tag, char *value);
242 extern void MSAAddGS(MSA *msa, char *tag, int seqidx, char *value);
243 extern void MSAAppendGC(MSA *msa, char *tag, char *value);
244 extern char *MSAGetGC(MSA *msa, char *tag);
245 extern void MSAAppendGR(MSA *msa, char *tag, int seqidx, char *value);
246 extern void MSAVerifyParse(MSA *msa);
247 extern int MSAGetSeqidx(MSA *msa, char *name, int guess);
249 extern MSA *MSAFromAINFO(char **aseq, AINFO *ainfo);
251 extern void MSAMingap(MSA *msa);
252 extern void MSANogap(MSA *msa);
253 extern void MSAShorterAlignment(MSA *msa, int *useme);
254 extern void MSASmallerAlignment(MSA *msa, int *useme, MSA **ret_new);
256 extern char *MSAGetSeqAccession(MSA *msa, int idx);
257 extern char *MSAGetSeqDescription(MSA *msa, int idx);
258 extern char *MSAGetSeqSS(MSA *msa, int idx);
259 extern char *MSAGetSeqSA(MSA *msa, int idx);
261 extern float MSAAverageSequenceLength(MSA *msa);
265 extern MSA *ReadA2M(MSAFILE *afp);
267 extern void WriteA2M(FILE *fp, MSA *msa, int vienna);
269 extern void WriteA2M(FILE *fp, MSA *msa);
273 extern MSA *ReadClustal(MSAFILE *afp);
274 extern void WriteClustal(FILE *fp, MSA *msa);
278 extern void EPSWriteSmallMSA(FILE *fp, MSA *msa);
282 extern MSA *ReadMSF(MSAFILE *afp);
283 extern void WriteMSF(FILE *fp, MSA *msa);
287 extern MSA *ReadPhylip(MSAFILE *afp);
288 extern void WritePhylip(FILE *fp, MSA *msa);
292 extern MSA *ReadSELEX(MSAFILE *afp);
293 extern void WriteSELEX(FILE *fp, MSA *msa);
294 extern void WriteSELEXOneBlock(FILE *fp, MSA *msa);
298 extern MSA *ReadStockholm(MSAFILE *afp);
299 extern void WriteStockholm(FILE *fp, MSA *msa);
300 extern void WriteStockholmOneBlock(FILE *fp, MSA *msa);
302 #endif /*SQUID_MSA_INCLUDED*/