4 * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson.
9 * 10-02-07,Nigel Brown(EMBL): changed ifstream to InFileStream to handle
10 * cross-platform end-of-lines.
16 #include "EMBLFileParser.h"
22 * constructor sets up chartab array.
// Constructor: receives the path of the file to parse as a string, by value.
// NOTE(review): the constructor body is not visible in this extract; the
// comment preceding this definition states that it sets up the chartab array.
24 EMBLFileParser::EMBLFileParser(string filePath)
31 * don't need to destruct anything.
// Destructor.
// NOTE(review): the body is not visible in this extract. getSeq(), countSeqs()
// and getSecStructure() each assign `_fileIn = new InFileStream`; confirm that
// the stream object is deleted somewhere, otherwise every call leaks one.
33 EMBLFileParser::~EMBLFileParser()
40 * get range of sequences
// Collects sequences from the file by calling getSeq(firstSeq + i, offendingSeq)
// and appending each result to a vector that is returned by value.
//   firstSeq     - number of the first sequence to fetch.
//   no           - presumably the number of sequences to fetch; the loop header
//                  is not visible in this extract - TODO confirm.
//   offendingSeq - forwarded unchanged to getSeq().
// Returns the collected sequences, or an empty vector if any getSeq() call left
// parseExitCode different from OK.
42 vector<Sequence> EMBLFileParser::getSeqRange(int firstSeq, int no, string *offendingSeq)
44 vector<Sequence> seqRangeVector;
49 Sequence tempSeq = getSeq(firstSeq + i, offendingSeq);
// A failed parse discards everything gathered so far, so the caller never
// receives a partial range.
50 if (parseExitCode!=OK) {
51 seqRangeVector.clear();
52 return seqRangeVector;
54 seqRangeVector.push_back(tempSeq);
56 return seqRangeVector;
61 * get the sequence seqNum in the file.
// Reads sequence number seqNum from the file named by fileName.
//   seqNum       - number of the sequence to read; ID lines are counted until
//                  the running counter equals this value.
//   offendingSeq - optional out-parameter; when non-NULL it receives the name
//                  of a sequence that exceeds the maximum allowed length.
// Returns:
//   Sequence(characterSeq, name, title) on success;
//   Sequence(blank, blank, blank) if the file ends before the requested ID line
//     or if the sequence is longer than getMaxAllowedSeqLength() (in that case
//     parseExitCode is set to SEQUENCETOOBIG);
//   Sequence(blank, name, blank) if the file ends before an SQ line is found.
// NOTE(review): several lines of this function (braces, short statements, the
// presumed try/catch) are not visible in this extract.
63 Sequence EMBLFileParser::getSeq(int seqNum, string *offendingSeq)
// Fixed-size line buffer and entry-name buffer, each with room for a terminator.
65 char _line[MAXLINE + 1];
66 //char _tseq[MAXLINE + 1];
67 char _sname[MAXNAMES + 1];
68 //char _title[MAXTITLES + 1];
69 string characterSeq = "";
78 int _currentSeqNum = 0;
// A fresh stream object is allocated on every call and positioned at the start
// of the file. NOTE(review): no matching delete is visible here - confirm.
83 _fileIn = new InFileStream; //nige
84 _fileIn->open(fileName.c_str()); //nige
85 _fileIn->seekg(0, std::ios::beg); // start at the beginning
87 // Read in lines until we get to the beginning of sequence seqNum.
88 while (_currentSeqNum != seqNum)
// Skip forward to the next line that lineType() recognises as an ID line.
90 while(!utilityObject->lineType(_line, "ID"))
92 if(!_fileIn->getline(_line, MAXLINE + 1)) // If we cannot get anymore!
95 return Sequence(blank, blank, blank);
99 if(_currentSeqNum == seqNum) // Found the sequence
103 // Get next line so that we are past the current ID line (the comment used to say '>', presumably copied from another parser)
104 _fileIn->getline(_line, MAXLINE + 1);
// Scan the ID line starting at index 5; the loop body is not visible here,
// presumably it locates the start of the entry name.
// NOTE(review): if the ID line is shorter than 5 characters this loop does not
// run and `_line + i` below points past the string terminator - confirm that
// lineType() rules such lines out.
107 for (i = 5; i <= (int)strlen(_line); i++)
114 strncpy(_sname, _line + i, MAXNAMES); // remember entryname
// NOTE(review): strlen(_sname) is evaluated before the terminator is forced a
// few lines below; strncpy does not terminate when the source has MAXNAMES or
// more characters, so this strlen can read past the copied data.
115 for (i = 0; i <= (int)strlen(_sname); i++)
117 if (_sname[i] == ' ')
// Guarantee termination, then normalise the name: trim the right-hand side and
// replace blanks with underscores.
124 _sname[MAXNAMES] = EOS;
125 utilityObject->rTrim(_sname);
126 utilityObject->blankToUnderscore(_sname);
127 name = string(_sname);
128 // Andreas Wilm (UCD): why cout here? cout << name << "\n";
// Advance to the SQ line; the lines that follow it are read as sequence data.
130 while (!utilityObject->lineType(_line, "SQ"))
132 if(!_fileIn->getline(_line, MAXLINE + 1)) // If we cannot get anymore!
135 // FIXME AW: why return with a name but otherwise empty seq?
136 return Sequence(blank, name, blank);
// Read the lines after SQ one at a time.
140 while (_fileIn->getline(_line, MAXLINE + 1))
// A blank line after sequence data has been seen; the action taken is not
// visible in this extract (presumably the loop ends).
142 if (gotSeq && utilityObject->blankLine(_line))
147 // NOTE I changed this to -1 and -2 because the getline does not return the \n
// Detect a line whose last two characters are both '.'.
148 if (strlen(_line) > 2 && _line[strlen(_line) - 1] == '.' &&
149 _line[strlen(_line) - 2] == '.')
// Walk the characters of the current line.
154 for (i = 0; i <= MAXLINE; i++)
// Newline, terminator or '/' are tested together here; the branch body is not
// visible (presumably it stops processing this line).
157 if (c == '\n' || c == EOS || c == '/')
// Reject over-long sequences: record the error code, report the offending name
// through the optional out-parameter and return an empty Sequence.
176 if ((int)characterSeq.length() > userParameters->getMaxAllowedSeqLength())
178 parseExitCode=SEQUENCETOOBIG;
179 if (offendingSeq!=NULL)
180 offendingSeq->assign(name);
182 return Sequence(blank, blank, blank);
184 return Sequence(characterSeq, name, title);
// Error report written to cerr; presumably inside a catch handler that is not
// visible in this extract.
189 cerr << "There was an exception in the EMBLFileParser::getSeq function.\n"
190 << "Need to end program\n";
196 * count the number of sequences in the file and return the number
// Counts the sequences in the file named by fileName by scanning it line by
// line and testing each line with lineType(line, "ID").
// Returns 0 if the file cannot be opened. The counter variable, its increment
// and the final return are not visible in this extract.
198 int EMBLFileParser::countSeqs()
200 char line[MAXLINE + 1];
// A fresh stream object is allocated on every call.
// NOTE(review): no matching delete is visible here - confirm.
210 _fileIn = new InFileStream; //nige
211 _fileIn->open(fileName.c_str()); //nige
213 if(!_fileIn->is_open())
215 return 0; // No sequences found!
// Each ID line marks one entry; the action in the branch is not visible
// (presumably it increments the count).
218 while (_fileIn->getline(line, MAXLINE + 1))
220 if (utilityObject->lineType(line, "ID"))
// Error report written to cerr; presumably inside a catch handler that is not
// visible in this extract.
232 cerr << "An exception has occured in the function EMBLFileParser::countSeqs()\n"
233 << "Program needs to terminate.\nPlease contact the Clustal developers\n";
239 * get secondary structure information from the file.
// Scans the file for secondary-structure features (FT lines naming HELIX or
// STRAND) or a gap penalty mask (GM lines) and fills the output masks.
//   gapPenaltyMask  - cleared on entry; filled with '1' for each of `length`
//                     positions and then updated by getSwissMask() when a GM
//                     block is accepted.
//   secStructMask   - cleared on entry; filled with '.' for each of `length`
//                     positions and then updated by getSwissFeature() when an
//                     FT block is accepted.
//   secStructName   - set to the entry name once an FT or GM block was handled.
//   structPenalties - set to SECST or GMASK depending on which block is used.
//   length          - number of positions the masks are initialised to.
// In interactive mode the user is asked whether the information found should
// be used. NOTE(review): the non-interactive assignment of *_lin2 and several
// braces/short statements are not visible in this extract.
241 void EMBLFileParser::getSecStructure(vector<char>& gapPenaltyMask, vector<char>& secStructMask, string& secStructName, int &structPenalties, int length)
243 char _title[MAXLINE + 1];
244 char _line[MAXLINE + 1];
245 char _lin2[MAXLINE + 1];
246 char _sname[MAXNAMES + 1];
247 char _feature[MAXLINE + 1];
// A fresh stream object is allocated on every call and positioned at the start
// of the file. NOTE(review): no matching delete is visible here - confirm.
253 _fileIn = new InFileStream; //nige
254 _fileIn->open(fileName.c_str()); //nige
255 _fileIn->seekg(0, std::ios::beg);
257 // clear out the masks
258 gapPenaltyMask.clear();
259 secStructMask.clear();
261 // find the start of the sequence entry
264 while (!utilityObject->lineType(_line, "ID"))
266 if (!_fileIn->getline(_line, MAXLINE + 1))
// Scan the ID line starting at index 5; the loop body is not visible here,
// presumably it locates the start of the entry name.
273 for (i = 5; i <= (int)strlen(_line); i++)
280 strncpy(_sname, _line + i, MAXNAMES); // remember entryname
// NOTE(review): strlen(_sname) is evaluated before the terminator is forced a
// few lines below; strncpy does not terminate when the source has MAXNAMES or
// more characters, so this strlen can read past the copied data.
281 for (i = 0; i <= (int)strlen(_sname); i++)
283 if (_sname[i] == ' ')
// Guarantee termination, then trim and replace blanks with underscores.
289 _sname[MAXNAMES] = EOS;
290 utilityObject->rTrim(_sname);
291 utilityObject->blankToUnderscore(_sname);
293 // look for secondary structure feature table / gap penalty mask
294 while (_fileIn->getline(_line, MAXLINE + 1))
296 if (utilityObject->lineType(_line, "FT"))
// The feature keyword is the first whitespace-delimited token after the
// two-character line code.
298 sscanf(_line + 2, "%s", _feature);
299 if (strcmp(_feature, "HELIX") == 0 || strcmp(_feature, "STRAND") == 0)
// Interactive mode: build a prompt from a fixed message plus the entry name
// and store the answer character in the first element of _lin2.
301 if (userParameters->getInteractive())
304 "Found secondary structure in alignment file: ");
305 strcat(_title, _sname);
306 (*_lin2) = utilityObject->promptForYesNo(_title,
307 "Use it to set local gap penalties ");
// Anything other than 'n' / 'N' counts as acceptance.
313 if ((*_lin2 != 'n') && (*_lin2 != 'N'))
315 structPenalties = SECST;
// Start from a mask of `length` dots, then let getSwissFeature() overwrite the
// positions described by each consecutive FT line.
316 for (i = 0; i < length; i++)
318 secStructMask.push_back('.');
322 getSwissFeature(&_line[2], secStructMask, length);
323 _fileIn->getline(_line, MAXLINE + 1);
325 while (utilityObject->lineType(_line, "FT"));
// Second loop over consecutive FT lines that only reads them; presumably the
// declined path that skips the feature table - TODO confirm.
331 _fileIn->getline(_line, MAXLINE + 1);
333 while (utilityObject->lineType(_line, "FT"));
335 secStructName = string(_sname);
338 else if (utilityObject->lineType(_line, "GM"))
// Same prompt logic as for the FT block, with a gap-penalty-mask message.
340 if (userParameters->getInteractive())
342 strcpy(_title, "Found gap penalty mask in alignment file: ");
343 strcat(_title, _sname);
344 (*_lin2) = utilityObject->promptForYesNo(_title,
345 "Use it to set local gap penalties ");
351 if ((*_lin2 != 'n') && (*_lin2 != 'N'))
353 structPenalties = GMASK;
// Start from a mask of `length` '1' characters, then let getSwissMask()
// overwrite the positions described by each consecutive GM line.
354 for (i = 0; i < length; i++)
356 gapPenaltyMask.push_back('1');
360 getSwissMask(&_line[2], gapPenaltyMask, length);
361 _fileIn->getline(_line, MAXLINE + 1);
363 while (utilityObject->lineType(_line, "GM"));
// Second loop over consecutive GM lines that only reads them; presumably the
// declined path that skips the mask - TODO confirm.
369 _fileIn->getline(_line, MAXLINE + 1);
371 while (utilityObject->lineType(_line, "GM"));
373 secStructName = string(_sname);
// Reaching the SQ line; the action is not visible (presumably scanning stops).
375 if (utilityObject->lineType(_line, "SQ"))
// Test whether some structure information was accepted; the action is not
// visible in this extract.
380 if (structPenalties != NONE)
// Error report written to cerr; presumably inside a catch handler that is not
// visible in this extract.
391 cerr << "An exception has occured in the function EMBLFileParser::getSecStructure()\n"
392 << "Program needs to terminate.\nPlease contact the Clustal developers\n";
399 * get the sec structure mask
// Applies one feature line to the secondary-structure mask.
//   line          - text of the feature line; getSecStructure() passes the line
//                   starting after its two-character line code.
//   secStructMask - mask to update in place; must already hold the positions
//                   that are written (getSecStructure() fills it with `length`
//                   elements first).
//   length        - bound used to validate the parsed positions.
// The line is parsed as a feature name followed by two integers (start and end
// position). Only HELIX and STRAND are compared against; the characters chosen
// for `c` and `s` are not visible in this extract.
// NOTE(review): the range check only rejects positions >= length. A start
// position of 0 or less would make `startPos - 1` a negative index - confirm
// that a lower-bound check is not needed.
401 void EMBLFileParser::getSwissFeature(char* line, vector<char>& secStructMask, int length)
403 char c, s, feature[MAXLINE + 1];
404 int i, startPos, endPos;
// All three fields must parse; the action on failure is not visible
// (presumably the function returns).
408 if (sscanf(line, "%s%d%d", feature, &startPos, &endPos) != 3)
413 if (strcmp(feature, "HELIX") == 0)
418 else if (strcmp(feature, "STRAND") == 0)
428 if (startPos >= length || endPos >= length)
// Positions are converted to indices with `- 1`: both end indices receive `s`,
// the indices strictly between them receive `c`.
433 secStructMask[startPos - 1] = s;
434 for (i = startPos; i < endPos - 1; i++)
436 secStructMask[i] = c;
438 secStructMask[endPos - 1] = s;
// Error report written to cerr; presumably inside a catch handler that is not
// visible in this extract.
442 cerr << "An exception has occured in the function EMBLFileParser::getSwissFeature()\n"
443 << "Program needs to terminate.\nPlease contact the Clustal developers\n";
449 * get the gap penalty mask.
// Applies one gap-penalty-mask line to the gap penalty mask.
//   line           - text of the mask line; getSecStructure() passes the line
//                    starting after its two-character line code.
//   gapPenaltyMask - mask to update in place; must already hold the positions
//                    that are written (getSecStructure() fills it with `length`
//                    elements first).
//   length         - bound used to validate the parsed positions.
// The line is parsed as three integers: a value, a start position and an end
// position. The value is tested against the range 1..9 and the positions
// against `length`; the actions taken when a test fails are not visible in this
// extract (presumably the function returns without changing the mask).
// NOTE(review): the range check only rejects positions >= length. A start
// position of 0 or less would make `startPos - 1` a negative index - confirm
// that a lower-bound check is not needed.
451 void EMBLFileParser::getSwissMask(char* line, vector<char>& gapPenaltyMask, int length)
453 int i, value, startPos, endPos;
457 if (sscanf(line, "%d%d%d", &value, &startPos, &endPos) != 3)
462 if (value < 1 || value > 9)
467 if (startPos >= length || endPos >= length)
// Write the value as its digit character into indices startPos-1 .. endPos-1.
471 for (i = startPos - 1; i < endPos; i++)
473 gapPenaltyMask[i] = value + '0';
// Error report written to cerr; presumably inside a catch handler that is not
// visible in this extract.
478 cerr << "An exception has occured in the function EMBLFileParser::getSwissMask()\n"
479 << "Program needs to terminate.\nPlease contact the Clustal developers\n";