/** * Author: Mark Larkin * * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson. */ /** * Changes: * * Mark 24-1-2007. I am now using the char "delimiter" for the delimiter when using * getline. This is to get around the problem of some files having '\r'. * * 10-02-07,Nigel Brown(EMBL): Changed ifstream to InFileStream to handle * cross-platform end-of-lines and removed delimiter member. * * 28-12-07,Paul McGettigan : replaced array processing with string processing this fixes bug #72 * * 9-2-2008, Paul McGettigan : fixed problem where space after '>' but before sequence name was causing * alignment to fail due to no sequence name being read in * 15-2-2008, Paul McGettigan : fixed bug 91 where Pseudo -FASTA format files were not being processed as * previously in v1.83 */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include "PearsonFileParser.h" namespace clustalw { /** * Constructor for the Pearson file parser. * @param filePath * @return */ PearsonFileParser::PearsonFileParser(string filePath) { fileName = filePath; fillCharTab(); } /** * reads fasta/pearson file in one go instead of calling getSeq for * each single sequence. * * FIXME AW: only PearsonFileParser::getSeqRange is special, rest is the * same. should be defined in FileParser and then overloaded in special * cases like here */ vector PearsonFileParser::getSeqRange(int firstSeq, int nSeqsToRead, string *offendingSeq) { string characterSeq = ""; string name = ""; string title = ""; string blank = ""; string greater = ">"; //_line[0] = EOS; vector seqRangeVector; string line; //int i, j; int nSeqsRead = 0; unsigned char c; char delim; int _currentSeqNum = 0; // Not at any sequence yet! try { delim=FileParser::getDelimiter(fileName); //cout << "delim = " << delim << endl; ifstream _fileIn; _fileIn.open(fileName.c_str(),ios::in); // Read in lines until we get to the begining of sequence firstSeq. string line=""; do { std::getline(_fileIn,line,delim); if(line.substr(0,1) == greater){ _currentSeqNum++; } } while(_currentSeqNum ' and read up to first ' ' or MAXNAMES // remove the first char i.e. '>' name=line.substr(1,MAXNAMES); //if(name.find(">") != string::npos){ // andreas wilm: exit if angle bracket within header? //} while(name.substr(0,1)==" "){ name=name.substr(1,MAXNAMES); } //int i; //i = name.find(" "); if(name.find(" ") != string::npos){ name=name.substr(0,name.find(" ")); } utilityObject->rTrim(&name); // also replaces linef name=utilityObject->blankToUnderscore(name); // replace blanks with '_' // Read in lines until we get to the begining of next sequence. title = ""; // No title information while(std::getline(_fileIn,line,delim) ){ string::const_iterator iterator1 = line.begin(); while(iterator1 != line.end()){ // Andreas Wilm (UCD): exit if angle brackets within sequence if(*iterator1=='>' && iterator1!=line.begin()) { /* error output handled in Clustal.cpp cerr << "\nMultiple angle brackets inside sequence found:" << " invalid format.\n" << "Maybe you forgot a linebreak between sequences?\n"; */ parseExitCode=BADFORMAT; _fileIn.close(); seqRangeVector.clear(); return seqRangeVector; } if(*iterator1 =='\n' || *iterator1 =='\r' || *iterator1 == EOS || *iterator1 =='>'){ break; } c = *iterator1; c = chartab[c]; if(c){ characterSeq.append(1,c); } iterator1++; } if(*iterator1 == '>'){ break; } } // check sequence if ((int)characterSeq.length() > userParameters->getMaxAllowedSeqLength()) { /* error output handled in Clustal.cpp */ parseExitCode=SEQUENCETOOBIG; if (offendingSeq!=NULL) offendingSeq->assign(name); _fileIn.close(); seqRangeVector.clear(); return seqRangeVector; } else if (characterSeq.length() == 0) { parseExitCode=EMPTYSEQUENCE; if (offendingSeq!=NULL) offendingSeq->assign(name); _fileIn.close(); seqRangeVector.clear(); return seqRangeVector; } seqRangeVector.push_back(Sequence(characterSeq, name, title)); characterSeq = ""; nSeqsRead++; } // while (nSeqsRead < nSeqsToRead) _fileIn.close(); return seqRangeVector; } catch(...) { cerr << "There was an exception in the PearsonFileParser::getSeqRange function.\n" << "Need to end program\n"; exit(1); } } /** * The function getSeq is used to get the sequence 'seqNum' in the file. It returns a * sequence object containing the sequence. * Deprecated: where possible use faster getSeqRange which reads * sequences in one go * @param seqNum The number of the sequence to get. * @return */ Sequence PearsonFileParser::getSeq(int seqNum, string *offendingSeq) { //char _line[MAXLINE + 1]; //char tseq[MAXLINE + 1]; //char sname[MAXNAMES + 1]; //sname [MAXNAMES] = '\0'; string characterSeq = ""; string name = ""; string title = ""; string blank = ""; string greater = ">"; //_line[0] = EOS; string line; cerr << "Use of PearsonFileParser::getSeq is deprecated!\n"; //int i, j; unsigned char c; char delim; int _currentSeqNum = 0; // Not at any sequence yet! try { /* _fileIn = new InFileStream; //nige _fileIn->open(fileName.c_str()); //nige _fileIn->seekg(0, std::ios::beg); // start at the beginning */ delim=FileParser::getDelimiter(fileName); //cout << "delim = " << delim << endl; ifstream _fileIn; _fileIn.open(fileName.c_str(),ios::in); ////////////////////////////////////////////////// //PMcG replace char array with string processing ////////////////////////////////////////////////// // Read in lines until we get to the begining of sequence seqNum. string line=""; do { std::getline(_fileIn,line,delim); if(line.substr(0,1) == greater){ _currentSeqNum++; } } while(_currentSeqNum ' and read up to first ' ' or MAXNAMES // remove the first char i.e. '>' name=line.substr(1,MAXNAMES); ////////////////////////////////////// // PMcG 9-2-2008 need to handle spaces at start of sequence name to conform to 1.83 handling ////////////////////////////////////// while(name.substr(0,1)==" "){ name=name.substr(1,MAXNAMES); } //int i; //i = name.find(" "); if(name.find(" ") != string::npos){ name=name.substr(0,name.find(" ")); } name=utilityObject->blankToUnderscore(name); // replace blanks with '_' // Read in lines until we get to the begining of sequence seqNum. /* PMcG replace char array with string processing while (_currentSeqNum != seqNum) { while(*_line != '>') { if(!_fileIn->getline(_line, MAXLINE + 1)) { freeFileResources(_fileIn); return Sequence(blank, blank, blank); } } ++_currentSeqNum; if(_currentSeqNum == seqNum) // Found the sequence { break; } // Get next line so that we are past the '>' line _fileIn->getline(_line, MAXLINE + 1); } // line contains the name of the sequence for (i = 1; i <= strlen(_line); i++) { if (_line[i] != ' ') { break; } } strncpy(sname, _line + i, MAXNAMES); // remember entryname for (i = 1; i <= strlen(sname); i++) { if (sname[i] == ' ') { break; } } sname[i] = EOS; utilityObject->rTrim(sname); utilityObject->blankToUnderscore(sname); // replace blanks with '_' name = string(sname); */ title = ""; // No title information string seqLine = ""; while(std::getline(_fileIn,seqLine,delim) ){ string::const_iterator iterator1 = seqLine.begin(); while(iterator1 != seqLine.end()){ if(*iterator1 =='\n' || *iterator1 =='\r' || *iterator1 == EOS || *iterator1 =='>'){ break; } c = *iterator1; c = chartab[c]; // PMcG 15-02-2008 bug 91 // strip out spaces and numbers from pseudo_fasta files // but need to maintain gaps if present in sequence input // to replicate behaviour of v1.83 //if(*iterator1 != ' ' && !isdigit(*iterator1)){ if(c){ characterSeq.append(1,c); } iterator1++; } if(*iterator1 == '>'){ break; } } /* while (_fileIn->getline(_line, MAXLINE + 1)) { for (i = 0; i <= MAXLINE; i++) { c = _line[i]; if (c == '\n' || c == EOS || c == '>') { break; } c = chartab[c]; if (c) { characterSeq += c; } } if (c == '>') { break; } } */ _fileIn.close(); if ((int)characterSeq.length() > userParameters->getMaxAllowedSeqLength()) { parseExitCode=SEQUENCETOOBIG; // return empty seq return Sequence(blank, blank, blank); } else if (characterSeq.length() == 0) { parseExitCode=EMPTYSEQUENCE; // return empty seq return Sequence(blank, blank, blank); } return Sequence(characterSeq, name, title); } catch(...) { cerr << "There was an exception in the PearsonFileParser::getSeq function.\n" << "Need to end program\n"; exit(1); } } /** * The function countSeqs, counts the number of sequences in a file. * @return The number of sequences in the file. */ int PearsonFileParser::countSeqs() { //char line[1000 + 1]; int _nseqs = 0; string line2; char delim; try { //_fileIn = new InFileStream; //nige //_fileIn->open(fileName.c_str()); //nige delim=FileParser::getDelimiter(fileName); ifstream _fileIn; _fileIn.open(fileName.c_str(),ios::in); if(!_fileIn.is_open()) { return 0; // No sequences found! } /* while ((*_fileIn) >> line2/@_fileIn->getline(line, 1000 + 1)@/) { /@if(_nseqs == 50) { cout << "\n\n" << line << "\n\n"; exit(1); }@/ */ while (std::getline(_fileIn,line2,delim)) { if (line2[0] == '>') { _nseqs++; } } _fileIn.close(); return _nseqs; } catch(...) { freeFileResources(_fileIn); cerr << "An exception has occured in the function PearsonFileParser::countSeqs()\n" << "Program needs to terminate.\nPlease contact the Clustal developers\n"; exit(1); } } /** * There is no secondary structure information in the Pearson file. This is here to * set the structPenalties to NONE. * @param gapPenaltyMask * @param secStructMask * @param secStructName * @param structPenalties * @param length */ void PearsonFileParser::getSecStructure(vector& gapPenaltyMask, vector& secStructMask, string& secStructName, int &structPenalties, int length) { structPenalties = NONE; } }