/*
 * NOTE(review): the two physical lines that follow are reproduced unchanged.
 * This copy of the file has lost its line breaks, and some text has been
 * dropped as well, so the code is documented here rather than edited.
 *
 * Contents, in order:
 *   - file header comments, includes, opening of namespace clustalw
 *   - EMBLFileParser(string filePath): stores filePath in fileName and calls
 *     fillCharTab().
 *   - ~EMBLFileParser(): empty.
 *   - getSeqRange(int firstSeq, int no, string *offendingSeq): only the
 *     declarations of seqRangeVector and i and the start of its loop survive.
 *   - the routine that reads one sequence (its own error message names it
 *     EMBLFileParser::getSeq): its signature and local declarations are
 *     missing.
 *   - countSeqs().
 *
 * Missing text: everything between "for (i=0; i" and "open(fileName.c_str());"
 * is absent, and the template arguments after "vector" are gone too
 * (presumably removed by markup stripping -- TODO restore from a clean copy).
 * Because line structure is lost, it cannot be told from here where each
 * "//" comment ends; in particular the "cout << name" text after the
 * "why cout here?" remark may be live code or part of that comment.
 *
 * Visible part of getSeq:
 *   - seeks to the start of the stream, then reads lines, incrementing
 *     _currentSeqNum at each line for which lineType(_line, "ID") is true,
 *     until it equals seqNum; returns Sequence(blank, blank, blank) if
 *     getline fails first.
 *   - takes the entry name from the ID line: starts at offset 5, skips
 *     spaces, copies at most MAXNAMES characters, cuts at the first space,
 *     then applies rTrim() and blankToUnderscore().
 *   - reads on to the "SQ" line; returns Sequence(blank, name, blank) if
 *     getline fails first.
 *   - reads sequence lines: stops at a blank line once residues have been
 *     seen, skips lines longer than two characters that end in "..", maps
 *     each character through chartab and appends non-zero results to
 *     characterSeq, and stops at the first '/'.
 *   - closes the stream.  If characterSeq is longer than
 *     userParameters->getMaxAllowedSeqLength() it sets parseExitCode to
 *     SEQUENCETOOBIG, assigns name to *offendingSeq when that pointer is not
 *     NULL, and returns Sequence(blank, blank, blank); otherwise it returns
 *     Sequence(characterSeq, name, title).
 *   - catch(...): closes the stream, writes a message to cerr, exit(1).
 *
 * countSeqs(): allocates a new InFileStream, opens fileName, returns 0 if the
 * stream is not open, otherwise returns the number of lines for which
 * lineType(line, "ID") is true.  catch(...) closes the stream, writes a
 * message to cerr and calls exit(1).
 *
 * NOTE(review): both routines assign "new InFileStream" to _fileIn and no
 * matching delete is visible in this file -- confirm who owns the stream.
 * NOTE(review): countSeqs() returns 0 on a failed open without calling
 * close(), unlike its other exit paths.
 */
/** * Author: Mark Larkin * * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson. */ /** * Changes: * * 10-02-07,Nigel Brown(EMBL): changed ifstream to InFileStream to handle * cross-platform end-of-lines. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include "EMBLFileParser.h" namespace clustalw { /* * constructor sets up chartab array. */ EMBLFileParser::EMBLFileParser(string filePath) { fileName = filePath; fillCharTab(); } /* * dont need to destruct anything. */ EMBLFileParser::~EMBLFileParser() { } /* * get range of sequences */ vector EMBLFileParser::getSeqRange(int firstSeq, int no, string *offendingSeq) { vector seqRangeVector; int i; for (i=0; iopen(fileName.c_str()); //nige _fileIn->seekg(0, std::ios::beg); // start at the beginning // Read in lines until we get to the begining of sequence seqNum. while (_currentSeqNum != seqNum) { while(!utilityObject->lineType(_line, "ID")) { if(!_fileIn->getline(_line, MAXLINE + 1)) // If we cannot get anymore! { _fileIn->close(); return Sequence(blank, blank, blank); } } ++_currentSeqNum; if(_currentSeqNum == seqNum) // Found the sequence { break; } // Get next line so that we are past the '>' line _fileIn->getline(_line, MAXLINE + 1); } for (i = 5; i <= (int)strlen(_line); i++) { if (_line[i] != ' ') { break; } } strncpy(_sname, _line + i, MAXNAMES); // remember entryname for (i = 0; i <= (int)strlen(_sname); i++) { if (_sname[i] == ' ') { _sname[i] = EOS; break; } } _sname[MAXNAMES] = EOS; utilityObject->rTrim(_sname); utilityObject->blankToUnderscore(_sname); name = string(_sname); // Andreas Wilm (UCD): why cout here? cout << name << "\n"; while (!utilityObject->lineType(_line, "SQ")) { if(!_fileIn->getline(_line, MAXLINE + 1)) // If we cannot get anymore! { _fileIn->close(); // FIXME AW: why return with a name but otherwise empty seq? 
return Sequence(blank, name, blank); } } while (_fileIn->getline(_line, MAXLINE + 1)) { if (gotSeq && utilityObject->blankLine(_line)) { break; } // NOTE I changed this to -1 and -2 because the getline doesnt return the \n if (strlen(_line) > 2 && _line[strlen(_line) - 1] == '.' && _line[strlen(_line) - 2] == '.') { continue; } for (i = 0; i <= MAXLINE; i++) { c = _line[i]; if (c == '\n' || c == EOS || c == '/') { break; } // EOL c = chartab[c]; if (c) { gotSeq = true; characterSeq += c; } } if (c == '/') { break; } } _fileIn->close(); if ((int)characterSeq.length() > userParameters->getMaxAllowedSeqLength()) { parseExitCode=SEQUENCETOOBIG; if (offendingSeq!=NULL) offendingSeq->assign(name); // return empty seq return Sequence(blank, blank, blank); } return Sequence(characterSeq, name, title); } catch(...) { _fileIn->close(); cerr << "There was an exception in the EMBLFileParser::getSeq function.\n" << "Need to end program\n"; exit(1); } } /* * count the number of sequences in the file and return the number */ int EMBLFileParser::countSeqs() { char line[MAXLINE + 1]; // char c; line[0] = EOS; int numSeqs; // int i; //bool seqOk; numSeqs = 0; try { _fileIn = new InFileStream; //nige _fileIn->open(fileName.c_str()); //nige if(!_fileIn->is_open()) { return 0; // No sequences found! } while (_fileIn->getline(line, MAXLINE + 1)) { if (utilityObject->lineType(line, "ID")) { numSeqs++; } } _fileIn->close(); return numSeqs; } catch(...) { _fileIn->close(); cerr << "An exception has occured in the function EMBLFileParser::countSeqs()\n" << "Program needs to terminate.\nPlease contact the Clustal developers\n"; exit(1); } } /* * get secondary structure information from the file. 
*/ void EMBLFileParser::getSecStructure(vector& gapPenaltyMask, vector& secStructMask, string& secStructName, int &structPenalties, int length) { char _title[MAXLINE + 1]; char _line[MAXLINE + 1]; char _lin2[MAXLINE + 1]; char _sname[MAXNAMES + 1]; char _feature[MAXLINE + 1]; int i; _line[0] = '\0'; try { _fileIn = new InFileStream; //nige _fileIn->open(fileName.c_str()); //nige _fileIn->seekg(0, std::ios::beg); // clear out the masks gapPenaltyMask.clear(); secStructMask.clear(); // find the start of the sequence entry for (;;) { while (!utilityObject->lineType(_line, "ID")) { if (!_fileIn->getline(_line, MAXLINE + 1)) { _fileIn->close(); return; } } for (i = 5; i <= (int)strlen(_line); i++) { if (_line[i] != ' ') { break; } } strncpy(_sname, _line + i, MAXNAMES); // remember entryname for (i = 0; i <= (int)strlen(_sname); i++) { if (_sname[i] == ' ') { _sname[i] = EOS; break; } } _sname[MAXNAMES] = EOS; utilityObject->rTrim(_sname); utilityObject->blankToUnderscore(_sname); // look for secondary structure feature table / gap penalty mask while (_fileIn->getline(_line, MAXLINE + 1)) { if (utilityObject->lineType(_line, "FT")) { sscanf(_line + 2, "%s", _feature); if (strcmp(_feature, "HELIX") == 0 || strcmp(_feature, "STRAND") == 0) { if (userParameters->getInteractive()) { strcpy(_title, "Found secondary structure in alignment file: "); strcat(_title, _sname); (*_lin2) = utilityObject->promptForYesNo(_title, "Use it to set local gap penalties "); } else { (*_lin2) = 'y'; } if ((*_lin2 != 'n') && (*_lin2 != 'N')) { structPenalties = SECST; for (i = 0; i < length; i++) { secStructMask.push_back('.'); } do { getSwissFeature(&_line[2], secStructMask, length); _fileIn->getline(_line, MAXLINE + 1); } while (utilityObject->lineType(_line, "FT")); } else { do { _fileIn->getline(_line, MAXLINE + 1); } while (utilityObject->lineType(_line, "FT")); } secStructName = string(_sname); } } else if (utilityObject->lineType(_line, "GM")) { if (userParameters->getInteractive()) { 
strcpy(_title, "Found gap penalty mask in alignment file: "); strcat(_title, _sname); (*_lin2) = utilityObject->promptForYesNo(_title, "Use it to set local gap penalties "); } else { (*_lin2) = 'y'; } if ((*_lin2 != 'n') && (*_lin2 != 'N')) { structPenalties = GMASK; for (i = 0; i < length; i++) { gapPenaltyMask.push_back('1'); } do { getSwissMask(&_line[2], gapPenaltyMask, length); _fileIn->getline(_line, MAXLINE + 1); } while (utilityObject->lineType(_line, "GM")); } else { do { _fileIn->getline(_line, MAXLINE + 1); } while (utilityObject->lineType(_line, "GM")); } secStructName = string(_sname); } if (utilityObject->lineType(_line, "SQ")) { break; } if (structPenalties != NONE) { break; } } } _fileIn->close(); } catch(...) { _fileIn->close(); cerr << "An exception has occured in the function EMBLFileParser::getSecStructure()\n" << "Program needs to terminate.\nPlease contact the Clustal developers\n"; exit(1); } } /* * get the sec structure mask */ void EMBLFileParser::getSwissFeature(char* line, vector& secStructMask, int length) { char c, s, feature[MAXLINE + 1]; int i, startPos, endPos; try { if (sscanf(line, "%s%d%d", feature, &startPos, &endPos) != 3) { return ; } if (strcmp(feature, "HELIX") == 0) { c = 'A'; s = '$'; } else if (strcmp(feature, "STRAND") == 0) { c = 'B'; s = '%'; } else { return ; } if (startPos >= length || endPos >= length) { return ; } secStructMask[startPos - 1] = s; for (i = startPos; i < endPos - 1; i++) { secStructMask[i] = c; } secStructMask[endPos - 1] = s; } catch(...) { cerr << "An exception has occured in the function EMBLFileParser::getSwissFeature()\n" << "Program needs to terminate.\nPlease contact the Clustal developers\n"; exit(1); } } /* * get the gap penalty mask. 
*/
/*
 * Applies one gap-penalty-mask line to gapPenaltyMask.
 *
 *   line           - text of the form "<value> <start> <end>";
 *                    getSecStructure() passes the GM line with its
 *                    two-character tag skipped.
 *   gapPenaltyMask - mask to update.
 *   length         - number of positions in the mask.
 *
 * Positions are 1-based and inclusive: every position from start to end is
 * set to the digit character for `value`.  The line is ignored when it does
 * not parse, when value is outside 1..9, or when a position is below 1, at
 * or beyond `length`, or outside the mask actually supplied.
 */
void EMBLFileParser::getSwissMask(char* line, vector<char>& gapPenaltyMask, int length)
{
    int i, value, startPos, endPos;

    try
    {
        if (sscanf(line, "%d%d%d", &value, &startPos, &endPos) != 3)
        {
            return;
        }

        if (value < 1 || value > 9)
        {
            return ;
        }

        // A start of zero or below would index in front of the mask.
        if (startPos < 1)
        {
            return ;
        }

        if (startPos >= length || endPos >= length)
        {
            return ;
        }

        // Never write past the vector the caller really handed over.
        if (endPos > static_cast<int>(gapPenaltyMask.size()))
        {
            return ;
        }

        const char digit = static_cast<char>(value + '0');
        for (i = startPos - 1; i < endPos; i++)
        {
            gapPenaltyMask[i] = digit;
        }
    }
    catch(...)
    {
        cerr << "An exception has occured in the function EMBLFileParser::getSwissMask()\n"
             << "Program needs to terminate.\nPlease contact the Clustal developers\n";
        exit(1);
    }
}

}