/** * Author: Mark Larkin * * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson. */ /** * Changes: * * 10-02-07,Nigel Brown(EMBL): changed ifstream to InFileStream to handle * cross-platform end-of-lines. * * 27-4-2007, Mark Larkin (UCD): Made 2 small changes to getSecStructure function. There * was a problem with the secondary structure info in windows. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include "ClustalFileParser.h" namespace clustalw { ClustalFileParser::ClustalFileParser(string filePath) { fileName = filePath; fillCharTab(); _fileIn = 0; } ClustalFileParser::~ClustalFileParser() { } vector ClustalFileParser::getSeqRange(int firstSeq, int no, string *offendingSeq) { vector seqRangeVector; int i; for (i=0; iopen(fileName.c_str()); //nige _fileIn->seekg(0, std::ios::beg); // start at the beginning _fileIn->getline(line, MAXLINE + 1); // read the title line...ignore it while (_fileIn->getline(line, MAXLINE + 1)) //nige { if (!clustalBlankline(line)) { for (i = 1; i < seqNum; i++) { _fileIn->getline(line, MAXLINE + 1); //nige } for (j = 0; j <= (int)strlen(line); j++) if (line[j] != ' ') { break; } string _tempStr = string(line); sscanf(_tempStr.c_str(), "%s%s", sname, tseq); for (j = 0; j < MAXNAMES; j++) { if (sname[j] == ' ') { break; } } sname[j] = EOS; utilityObject->rTrim(sname); utilityObject->blankToUnderscore(sname); // replace blanks with '_' name = string(sname); for (i = 0; i <= MAXLINE; i++) { c = tseq[i]; if (isspace(c) || c == EOS) { break; } // EOL c = chartab[c]; if (c) { characterSeq += c; // Add the character to the sequence } } for (i = 0;; i++) { if (!_fileIn->getline(line, MAXLINE + 1)) // If we cant get another line! { freeFileResources(_fileIn); if ((int)characterSeq.length() > userParameters->getMaxAllowedSeqLength()) { parseExitCode=SEQUENCETOOBIG; if (offendingSeq!=NULL) offendingSeq->assign(name); // return empty seq return Sequence(blank, blank, blank); } return Sequence(characterSeq, name, title); } if (clustalBlankline(line)) { break; } } } } freeFileResources(_fileIn); if ((int)characterSeq.length() > userParameters->getMaxAllowedSeqLength()) { parseExitCode=SEQUENCETOOBIG; if (offendingSeq!=NULL) offendingSeq->assign(name); // return empty seq return Sequence(blank, blank, blank); } return Sequence(characterSeq, name, title); } catch(...) { freeFileResources(_fileIn); cerr << "An exception has occured in the function ClustalFileParser::getSeq()\n" << "Program needs to terminate.\nPlease contact the Clustal developers\n"; exit(1); } } /* * The function countSeqs tells us how many sequences are in a clustal format file. * Need to check if the file is open! */ int ClustalFileParser::countSeqs() { char line[MAXLINE + 1]; int _nseqs; try { _fileIn = new InFileStream; //nige _fileIn->open(fileName.c_str()); //nige if(!_fileIn->is_open()) { freeFileResources(_fileIn); return 0; // No sequences found! } while (_fileIn->getline(line, MAXLINE + 1)) { if (!utilityObject->blankLine(line)) { break; } } // This gets us to the begining of the sequence lines! while (_fileIn->getline(line, MAXLINE + 1)) { if (!clustalBlankline(line)) { break; } } _nseqs = 1; while (_fileIn->getline(line, MAXLINE + 1)) { if (clustalBlankline(line)) { freeFileResources(_fileIn); return _nseqs; } _nseqs++; } freeFileResources(_fileIn); return (int)0; // if you got to here-funny format/no seqs. } catch(...) { freeFileResources(_fileIn); cerr << "An exception has occured in the function ClustalFileParser::countSeqs()\n" << "Program needs to terminate.\nPlease contact the Clustal developers\n"; exit(1); } } /* * This function is to get the secondary structure for a Clustal format file. * I am aware that I am using some C and some C++ here, and this may seem like * bad style, but I think it is better to use the C functions for processing the * strings as they are already working. */ void ClustalFileParser::getSecStructure(vector& gapPenaltyMask, vector& secStructMask, string& secStructName, int &structPenalties, int length) { char title[MAXLINE + 1]; title[0] = '\0'; char line[MAXLINE + 1]; line[0] = '\0'; char lin2[MAXLINE + 1]; lin2[0] = '\0'; char tseq[MAXLINE + 1]; tseq[0] = '\0'; char sname[MAXNAMES + 1]; sname[0] = '\0'; for(int i = 1; i < MAXNAMES + 1; i++) { title[i] = line[i] = lin2[i] = tseq[i] = sname[i] ='0'; } int i, j, len, ix, struct_index = 0; char c; try { _fileIn = new InFileStream; //nige _fileIn->open(fileName.c_str()); //nige _fileIn->seekg(0, std::ios::beg); // NOTE clear out the masks gapPenaltyMask.clear(); secStructMask.clear(); len = 0; // initialise length to zero if (!_fileIn->getline(line, MAXLINE + 1)) { freeFileResources(_fileIn); return ; } // read the title line...ignore it if (!_fileIn->getline(line, MAXLINE + 1)) { freeFileResources(_fileIn); return ; } // read the next line... // skip any blank lines for (;;) { if (!_fileIn->getline(line, MAXLINE + 1)) { freeFileResources(_fileIn); return ; } if (!utilityObject->blankLine(line)) { break; } } // look for structure table lines ix = -1; for (;;) { if (line[0] != '!') { break; } if (strncmp(line, "!SS", 3) == 0) { ix++; sscanf(line + 4, "%s%s", sname, tseq); for (j = 0; j < MAXNAMES; j++) { if (sname[j] == ' ') { break; } } sname[j] = EOS; utilityObject->rTrim(sname); utilityObject->blankToUnderscore(sname); if (userParameters->getInteractive()) { strcpy(title, "Found secondary structure in alignment file: "); strcat(title, sname); (*lin2) = utilityObject->promptForYesNo(title, "Use it to set local gap penalties "); } else { (*lin2) = 'y'; } if ((*lin2 != 'n') && (*lin2 != 'N')) { structPenalties = SECST; struct_index = ix; for (i = 0; i < length; i++) { secStructMask.push_back('.'); gapPenaltyMask.push_back('.'); } secStructName = string(sname); for (i = 0; len < length; i++) { c = tseq[i]; if (c == '\n' || c == EOS) { break; } // EOL if (!isspace(c)) { if(len < (int)secStructMask.size()) { secStructMask[len++] = c; // NOTE array notation = BAD } } } } } else if (strncmp(line, "!GM", 3) == 0) { ix++; sscanf(line + 4, "%s%s", sname, tseq); for (j = 0; j < MAXNAMES; j++) { if (sname[j] == ' ') { break; } } sname[j] = EOS; utilityObject->rTrim(sname); utilityObject->blankToUnderscore(sname); if (userParameters->getInteractive()) { strcpy(title, "Found gap penalty mask in alignment file: "); strcat(title, sname); (*lin2) = utilityObject->promptForYesNo(title, "Use it to set local gap penalties "); } else { (*lin2) = 'y'; } if ((*lin2 != 'n') && (*lin2 != 'N')) { structPenalties = GMASK; struct_index = ix; for (i = 0; i < length; i++) { gapPenaltyMask.push_back('1'); } secStructName = string(sname); for (i = 0; len < length; i++) { c = tseq[i]; if (c == '\n' || c == EOS) { break; } // EOL if (!isspace(c)) { if(len < (int)gapPenaltyMask.size()) { gapPenaltyMask[len++] = c; } } } } } if (structPenalties != NONE) { break; } if (!_fileIn->getline(line, MAXLINE + 1)) { freeFileResources(_fileIn); return ; } } if (structPenalties == NONE) { freeFileResources(_fileIn); return ; } // skip any more comment lines while (line[0] == '!') { if (!_fileIn->getline(line, MAXLINE + 1)) { freeFileResources(_fileIn); return ; } } // skip the sequence lines and any comments after the alignment for (;;) { if (isspace(line[0]) || line[0] == '\0') // Mark change 27-4-2007 { break; } if (!_fileIn->getline(line, MAXLINE + 1)) { freeFileResources(_fileIn); return ; } } // read the rest of the alignment for (;;) { // skip any blank lines for (;;) { if (!utilityObject->blankLine(line)) { break; } if (!_fileIn->getline(line, MAXLINE + 1)) { freeFileResources(_fileIn); return ; } } // get structure table line for (ix = 0; ix < struct_index; ix++) { if (line[0] != '!') { if (structPenalties == SECST) { utilityObject->error("bad secondary structure format\n"); } else { utilityObject->error("bad gap penalty mask format\n"); } structPenalties = NONE; freeFileResources(_fileIn); return ; } if (!_fileIn->getline(line, MAXLINE + 1)) { freeFileResources(_fileIn); return ; } } if (structPenalties == SECST) { if (strncmp(line, "!SS", 3) != 0) { utilityObject->error("bad secondary structure format\n"); structPenalties = NONE; freeFileResources(_fileIn); return ; } sscanf(line + 4, "%s%s", sname, tseq); for (i = 0; len < length; i++) { c = tseq[i]; if (c == '\n' || c == EOS) { break; } // EOL if (!isspace(c)) { secStructMask[len++] = c; } } } else if (structPenalties == GMASK) { if (strncmp(line, "!GM", 3) != 0) { utilityObject->error("bad gap penalty mask format\n"); structPenalties = NONE; freeFileResources(_fileIn); return ; } sscanf(line + 4, "%s%s", sname, tseq); for (i = 0; len < length; i++) { c = tseq[i]; if (c == '\n' || c == EOS) { break; } // EOL if (!isspace(c)) { gapPenaltyMask[len++] = c; } } } // skip any more comment lines while (line[0] == '!') { if (!_fileIn->getline(line, MAXLINE + 1)) { freeFileResources(_fileIn); return ; } } // skip the sequence lines for (;;) { if (isspace(line[0]) || line[0] == '\0') // Mark change 27-4-2007 { break; } if (!_fileIn->getline(line, MAXLINE + 1)) { freeFileResources(_fileIn); return ; } } } freeFileResources(_fileIn); } catch(...) { freeFileResources(_fileIn); cerr << "An exception has occured in the function ClustalFileParser::getSecStructure()\n" << "Program needs to terminate.\nPlease contact the Clustal developers\n"; exit(1); } } bool ClustalFileParser::clustalBlankline(char* line) { int i; if (line[0] == '!') { return true; } for (i = 0; line[i] != '\n' && line[i] != EOS; i++) { if (isdigit(line[i]) || isspace(line[i]) || (line[i] == '*') || (line[i] == ':') || (line[i] == '.')) ; else { return false; } } return true; } }