4 * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson.
9 * 10-02-07,Nigel Brown(EMBL): changed ifstream to InFileStream to handle
10 * cross-platform end-of-lines.
12 * 27-4-2007, Mark Larkin (UCD): Made 2 small changes to getSecStructure function. There
13 * was a problem with the secondary structure info in windows.
19 #include "ClustalFileParser.h"
// Constructor: takes the path of the Clustal-format file this parser will read.
// NOTE(review): the constructor body is not visible in this excerpt; presumably
// it stores filePath in fileName, which the read methods later open -- confirm.
24 ClustalFileParser::ClustalFileParser(string filePath)
// Destructor.
// NOTE(review): the body is not visible in this excerpt. The read methods call
// freeFileResources(_fileIn) on their own exit paths -- confirm nothing is left
// for the destructor to release.
31 ClustalFileParser::~ClustalFileParser()
// Reads a run of sequences by calling getSeq() repeatedly and returns them in
// a vector, in the order they were read.
//   firstSeq     - number of the first sequence; each call asks getSeq() for
//                  sequence firstSeq + i.
//   no           - NOTE(review): presumably how many sequences to read; the loop
//                  header that would use it is not visible in this excerpt.
//   offendingSeq - forwarded unchanged to getSeq().
// Returns an empty vector as soon as a getSeq() call leaves parseExitCode != OK.
37 vector<Sequence> ClustalFileParser::getSeqRange(int firstSeq, int no, string *offendingSeq)
39 vector<Sequence> seqRangeVector;
// Fetch the next sequence of the requested range.
44 Sequence tempSeq = getSeq(firstSeq + i, offendingSeq);
// A failed parse discards everything collected so far; the failure reason
// stays in parseExitCode.
45 if (parseExitCode!=OK) {
46 seqRangeVector.clear();
47 return seqRangeVector;
49 seqRangeVector.push_back(tempSeq);
51 return seqRangeVector;
// Reads one sequence (name plus residue characters) from the Clustal-format file.
// The file is opened afresh on every call and released with freeFileResources()
// on every visible exit path.
//   seqNum       - which sequence to read; seqNum - 1 lines are skipped after the
//                  first non-blank alignment line, so seqNum == 1 selects that line.
//   offendingSeq - if not NULL, receives the sequence name when the sequence is
//                  longer than userParameters->getMaxAllowedSeqLength().
// Returns Sequence(characterSeq, name, title) on success. When the sequence is
// too long, sets parseExitCode to SEQUENCETOOBIG and returns
// Sequence(blank, blank, blank).
// NOTE(review): several lines of this function (declarations of i, j, c, name and
// blank, loop bodies, the try/catch framing) are not visible in this excerpt.
55 Sequence ClustalFileParser::getSeq(int seqNum, string *offendingSeq)
57 char line[MAXLINE + 1];
59 char tseq[MAXLINE + 1];
61 char sname[MAXNAMES + 150];
// Pre-fills indices 1..MAXNAMES of each buffer with the digit character '0';
// index 0 is left untouched.
// NOTE(review): '0' is not the NUL terminator -- '\0' starting at index 0 was
// probably intended. Also assumes MAXNAMES <= MAXLINE for line/tseq -- confirm.
63 for(int i = 1; i < MAXNAMES + 1; i++)
65 line[i] = tseq[i] = sname[i] = '0';
67 string characterSeq = "";
// title starts empty; no visible line assigns to it afterwards.
69 string title = ""; // Nothing happens it here!!!
// NOTE(review): no is_open() check is visible after open() here, unlike in
// countSeqs() -- confirm behaviour for a missing file.
77 _fileIn = new InFileStream; //nige
78 _fileIn->open(fileName.c_str()); //nige
79 _fileIn->seekg(0, std::ios::beg); // start at the beginning
81 _fileIn->getline(line, MAXLINE + 1); // read the title line...ignore it
// Scan forward; work starts at the first line that is not a Clustal blank line.
83 while (_fileIn->getline(line, MAXLINE + 1)) //nige
85 if (!clustalBlankline(line))
// Skip seqNum - 1 lines so that `line` holds the requested sequence's row.
// The getline result is not checked here.
88 for (i = 1; i < seqNum; i++)
90 _fileIn->getline(line, MAXLINE + 1); //nige
// Per-character pass over `line`; its body is not visible in this excerpt.
92 for (j = 0; j <= (int)strlen(line); j++)
97 string _tempStr = string(line);
// Split the row into name and residue tokens.
// NOTE(review): "%s" has no field width, so a token longer than sname/tseq
// would overflow the buffer -- consider width-limited conversions.
99 sscanf(_tempStr.c_str(), "%s%s", sname, tseq);
// Walks the first MAXNAMES characters of sname; body not visible here.
100 for (j = 0; j < MAXNAMES; j++)
108 utilityObject->rTrim(sname);
109 utilityObject->blankToUnderscore(sname); // replace blanks with '_'
110 name = string(sname);
// Walk the residue token; c is presumably tseq[i] (assignment not visible).
112 for (i = 0; i <= MAXLINE; i++)
// Whitespace or end-of-string marks the end of the token (the action taken is
// not visible; presumably the loop is left).
115 if (isspace(c) || c == EOS)
123 characterSeq += c; // Add the character to the sequence
// Exit path 1: end of file reached -- validate the length and return.
129 if (!_fileIn->getline(line, MAXLINE + 1)) // If we cant get another line!
131 freeFileResources(_fileIn);
132 if ((int)characterSeq.length() > userParameters->getMaxAllowedSeqLength())
134 parseExitCode=SEQUENCETOOBIG;
// Report which sequence was too long, if the caller asked for it.
135 if (offendingSeq!=NULL)
136 offendingSeq->assign(name);
138 return Sequence(blank, blank, blank);
140 return Sequence(characterSeq, name, title);
// Exit path 2: the next line read is a Clustal blank line. The lines between
// this test and the cleanup below are not visible; the visible tail repeats the
// same length check and return as exit path 1.
142 if (clustalBlankline(line))
149 freeFileResources(_fileIn);
151 if ((int)characterSeq.length() > userParameters->getMaxAllowedSeqLength())
153 parseExitCode=SEQUENCETOOBIG;
154 if (offendingSeq!=NULL)
155 offendingSeq->assign(name);
158 return Sequence(blank, blank, blank);
160 return Sequence(characterSeq, name, title);
// Error path: release the stream and report on cerr.
// NOTE(review): presumably the body of a catch handler followed by program
// termination; neither is visible in this excerpt.
164 freeFileResources(_fileIn);
165 cerr << "An exception has occured in the function ClustalFileParser::getSeq()\n"
166 << "Program needs to terminate.\nPlease contact the Clustal developers\n";
172 * The function countSeqs tells us how many sequences are in a Clustal format file.
173 * It returns 0 if the file cannot be opened or if no sequence lines are found.
// Counts the sequences in the Clustal-format file named by fileName.
// Returns 0 when the file cannot be opened, and 0 when the end of the file is
// reached without finding the end of a block of sequence lines.
// NOTE(review): the counter variable, the loop exits and the successful return
// statement are not visible in this excerpt.
175 int ClustalFileParser::countSeqs()
177 char line[MAXLINE + 1];
182 _fileIn = new InFileStream; //nige
183 _fileIn->open(fileName.c_str()); //nige
// Bail out early if the file could not be opened.
185 if(!_fileIn->is_open())
187 freeFileResources(_fileIn);
188 return 0; // No sequences found!
// Phase 1: read until the first line that utilityObject->blankLine() rejects
// (presumably the title line; the action taken is not visible).
192 while (_fileIn->getline(line, MAXLINE + 1))
194 if (!utilityObject->blankLine(line))
200 // This gets us to the begining of the sequence lines!
// Phase 2: read until the first line that is not a Clustal blank line.
201 while (_fileIn->getline(line, MAXLINE + 1))
203 if (!clustalBlankline(line))
// Phase 3: read the sequence lines of the block; a Clustal blank line ends it
// and the stream is released (presumably the count is returned there).
210 while (_fileIn->getline(line, MAXLINE + 1))
212 if (clustalBlankline(line))
214 freeFileResources(_fileIn);
// Fell off the end of the file without finding the end of a block.
219 freeFileResources(_fileIn);
220 return (int)0; // if you got to here-funny format/no seqs.
// Error path: release the stream and report on cerr.
// NOTE(review): presumably inside a catch handler that then terminates the
// program; not visible in this excerpt.
224 freeFileResources(_fileIn);
225 cerr << "An exception has occured in the function ClustalFileParser::countSeqs()\n"
226 << "Program needs to terminate.\nPlease contact the Clustal developers\n";
232 * This function is to get the secondary structure for a Clustal format file.
233 * I am aware that I am using some C and some C++ here, and this may seem like
234 * bad style, but I think it is better to use the C functions for processing the
235 * strings as they are already working.
// Reads a secondary-structure ("!SS") or gap-penalty-mask ("!GM") table from the
// Clustal-format file and stores it in the caller's vectors.
//   gapPenaltyMask  - cleared on entry; filled from the "!GM" line, or sized to
//                     `length` with '.' when an "!SS" line is used.
//   secStructMask   - cleared on entry; filled from the "!SS" line.
//   secStructName   - receives the name token read from the table line.
//   structPenalties - set to SECST or GMASK when a table is accepted, and reset
//                     to NONE when a later table line is malformed.
//   length          - number of mask positions to allocate and fill.
// The file is opened afresh here and released with freeFileResources() on every
// visible exit path.
// NOTE(review): many lines (braces, returns, breaks, the assignments to c, the
// try/catch framing) are not visible in this excerpt; comments below describe
// only what the visible statements show.
237 void ClustalFileParser::getSecStructure(vector<char>& gapPenaltyMask,
238 vector<char>& secStructMask, string& secStructName, int &structPenalties, int length)
240 char title[MAXLINE + 1];
242 char line[MAXLINE + 1];
244 char lin2[MAXLINE + 1];
246 char tseq[MAXLINE + 1];
248 char sname[MAXNAMES + 1];
// Pre-fills indices 1..MAXNAMES of each buffer with the digit character '0';
// index 0 is left untouched.
// NOTE(review): '0' is not the NUL terminator -- '\0' from index 0 was probably
// intended. Assumes MAXNAMES <= MAXLINE for the MAXLINE-sized buffers -- confirm.
251 for(int i = 1; i < MAXNAMES + 1; i++)
253 title[i] = line[i] = lin2[i] = tseq[i] = sname[i] ='0';
// Only struct_index is initialised by this declaration.
256 int i, j, len, ix, struct_index = 0;
261 _fileIn = new InFileStream; //nige
262 _fileIn->open(fileName.c_str()); //nige
263 _fileIn->seekg(0, std::ios::beg);
265 // NOTE clear out the masks
266 gapPenaltyMask.clear();
267 secStructMask.clear();
269 len = 0; // initialise length to zero
// Each failed read below releases the stream (presumably followed by a return).
271 if (!_fileIn->getline(line, MAXLINE + 1))
273 freeFileResources(_fileIn);
276 // read the title line...ignore it
278 if (!_fileIn->getline(line, MAXLINE + 1))
280 freeFileResources(_fileIn);
283 // read the next line...
284 // skip any blank lines
287 if (!_fileIn->getline(line, MAXLINE + 1))
289 freeFileResources(_fileIn);
292 if (!utilityObject->blankLine(line))
298 // look for structure table lines
// ---- First block: secondary-structure line ("!SS") ----
306 if (strncmp(line, "!SS", 3) == 0)
// Parse name and mask tokens from the text after the 4-character tag prefix.
// NOTE(review): "%s" has no field width and sname holds only MAXNAMES + 1
// chars, so a long name token would overflow -- consider a width limit.
309 sscanf(line + 4, "%s%s", sname, tseq);
310 for (j = 0; j < MAXNAMES; j++)
318 utilityObject->rTrim(sname);
319 utilityObject->blankToUnderscore(sname);
// In interactive mode ask the user; the answer character is stored in lin2[0].
// NOTE(review): what lin2[0] holds in non-interactive mode is not visible here.
321 if (userParameters->getInteractive())
323 strcpy(title, "Found secondary structure in alignment file: ");
324 strcat(title, sname);
325 (*lin2) = utilityObject->promptForYesNo(title,
326 "Use it to set local gap penalties ");
// Anything other than 'n'/'N' counts as acceptance.
332 if ((*lin2 != 'n') && (*lin2 != 'N'))
334 structPenalties = SECST;
// Size both masks to `length`, defaulting every position to '.'.
336 for (i = 0; i < length; i++)
338 secStructMask.push_back('.');
339 gapPenaltyMask.push_back('.');
342 secStructName = string(sname);
// Copy mask characters until newline/end-of-string or until `length` positions
// are filled; c is presumably tseq[i] (assignment not visible).
344 for (i = 0; len < length; i++)
347 if (c == '\n' || c == EOS)
354 if(len < (int)secStructMask.size())
356 secStructMask[len++] = c; // NOTE array notation = BAD
// ---- First block: gap-penalty-mask line ("!GM") ----
362 else if (strncmp(line, "!GM", 3) == 0)
// Same parsing as the "!SS" branch; the same unbounded-"%s" caveat applies.
365 sscanf(line + 4, "%s%s", sname, tseq);
366 for (j = 0; j < MAXNAMES; j++)
374 utilityObject->rTrim(sname);
375 utilityObject->blankToUnderscore(sname);
377 if (userParameters->getInteractive())
379 strcpy(title, "Found gap penalty mask in alignment file: ");
380 strcat(title, sname);
381 (*lin2) = utilityObject->promptForYesNo(title,
382 "Use it to set local gap penalties ");
388 if ((*lin2 != 'n') && (*lin2 != 'N'))
390 structPenalties = GMASK;
// Size the gap mask to `length`, defaulting every position to '1'.
392 for (i = 0; i < length; i++)
394 gapPenaltyMask.push_back('1');
397 secStructName = string(sname);
399 for (i = 0; len < length; i++)
402 if (c == '\n' || c == EOS)
409 if(len < (int)gapPenaltyMask.size())
411 gapPenaltyMask[len++] = c;
// Once a table has been accepted the search for table lines ends (the action
// taken is not visible; presumably a break).
417 if (structPenalties != NONE)
421 if (!_fileIn->getline(line, MAXLINE + 1))
423 freeFileResources(_fileIn);
// No usable table in the first block: release the stream (presumably return).
427 if (structPenalties == NONE)
429 freeFileResources(_fileIn);
433 // skip any more comment lines
434 while (line[0] == '!')
436 if (!_fileIn->getline(line, MAXLINE + 1))
438 freeFileResources(_fileIn);
443 // skip the sequence lines and any comments after the alignment
446 if (isspace(line[0]) || line[0] == '\0') // Mark change 27-4-2007
450 if (!_fileIn->getline(line, MAXLINE + 1))
452 freeFileResources(_fileIn);
458 // read the rest of the alignment
462 // skip any blank lines
465 if (!utilityObject->blankLine(line))
469 if (!_fileIn->getline(line, MAXLINE + 1))
471 freeFileResources(_fileIn);
475 // get structure table line
// Advance struct_index lines to reach the table line of this block; the
// visible error handling reports a format error, resets structPenalties to
// NONE and releases the stream.
// NOTE(review): where struct_index is updated is not visible in this excerpt.
476 for (ix = 0; ix < struct_index; ix++)
480 if (structPenalties == SECST)
482 utilityObject->error("bad secondary structure format\n");
486 utilityObject->error("bad gap penalty mask format\n");
488 structPenalties = NONE;
489 freeFileResources(_fileIn);
492 if (!_fileIn->getline(line, MAXLINE + 1))
494 freeFileResources(_fileIn);
// ---- Later blocks: the line must carry the same tag as the first block ----
498 if (structPenalties == SECST)
500 if (strncmp(line, "!SS", 3) != 0)
502 utilityObject->error("bad secondary structure format\n");
503 structPenalties = NONE;
504 freeFileResources(_fileIn);
507 sscanf(line + 4, "%s%s", sname, tseq);
// Continue filling from the current len. Unlike the first block there is no
// size guard here; the write relies on the mask having been sized to `length`.
508 for (i = 0; len < length; i++)
511 if (c == '\n' || c == EOS)
518 secStructMask[len++] = c;
522 else if (structPenalties == GMASK)
524 if (strncmp(line, "!GM", 3) != 0)
526 utilityObject->error("bad gap penalty mask format\n");
527 structPenalties = NONE;
528 freeFileResources(_fileIn);
531 sscanf(line + 4, "%s%s", sname, tseq);
// Same unguarded continuation as the "!SS" case above.
532 for (i = 0; len < length; i++)
535 if (c == '\n' || c == EOS)
542 gapPenaltyMask[len++] = c;
547 // skip any more comment lines
548 while (line[0] == '!')
550 if (!_fileIn->getline(line, MAXLINE + 1))
552 freeFileResources(_fileIn);
557 // skip the sequence lines
560 if (isspace(line[0]) || line[0] == '\0') // Mark change 27-4-2007
564 if (!_fileIn->getline(line, MAXLINE + 1))
566 freeFileResources(_fileIn);
// Normal completion: release the stream.
571 freeFileResources(_fileIn);
// Error path: release the stream and report on cerr.
// NOTE(review): presumably inside a catch handler that then terminates the
// program; not visible in this excerpt.
575 freeFileResources(_fileIn);
576 cerr << "An exception has occured in the function ClustalFileParser::getSecStructure()\n"
577 << "Program needs to terminate.\nPlease contact the Clustal developers\n";
582 bool ClustalFileParser::clustalBlankline(char* line)
591 for (i = 0; line[i] != '\n' && line[i] != EOS; i++)
593 if (isdigit(line[i]) || isspace(line[i]) || (line[i] == '*') ||
594 (line[i] == ':') || (line[i] == '.'))