4 * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson.
9 * Mark 24-1-2007. I am now using the char "delimiter" for the delimiter when using
10 * getline. This is to get around the problem of some files having '\r'.
12 * 10-02-07,Nigel Brown(EMBL): Changed ifstream to InFileStream to handle
13 * cross-platform end-of-lines and removed delimiter member.
15 * 28-12-07,Paul McGettigan : replaced array processing with string processing this fixes bug #72
17 * 9-2-2008, Paul McGettigan : fixed problem where space after '>' but before sequence name was causing
18 * alignment to fail due to no sequence name being read in
19 * 15-2-2008, Paul McGettigan : fixed bug 91 where pseudo-FASTA format files were not being processed as
26 #include "PearsonFileParser.h"
32 * Constructor for the Pearson file parser.
// Takes the path of the input file by value. The constructor body is not
// visible here; presumably it stores filePath in the fileName member that the
// reader methods later pass to open - TODO confirm.
36 PearsonFileParser::PearsonFileParser(string filePath)
44 * reads fasta/pearson file in one go instead of calling getSeq for
45 * each single sequence.
47 * FIXME AW: only PearsonFileParser::getSeqRange is special, rest is the
48 * same. should be defined in FileParser and then overloaded in special
// Reads nSeqsToRead records from the file named by fileName, beginning with
// record number firstSeq, and returns them as Sequence objects in a vector.
// Every failure path visible below clears the vector and returns it empty
// after setting parseExitCode:
//   BADFORMAT      - a greater-than sign occurs after the first column of a
//                    sequence line
//   SEQUENCETOOBIG - the residue string is longer than
//                    userParameters->getMaxAllowedSeqLength()
//   EMPTYSEQUENCE  - no residues were collected for a record
// For the last two, offendingSeq (when not NULL) is set to the record name.
// The counter starts at 0 and the skip loop runs until it reaches firstSeq,
// so firstSeq looks 1-based - TODO confirm.
52 PearsonFileParser::getSeqRange(int firstSeq, int nSeqsToRead, string *offendingSeq)
54 string characterSeq = "";
60 vector<Sequence> seqRangeVector;
69 int _currentSeqNum = 0; // Not at any sequence yet!
// Determine the delimiter handed to getline for this file, then open the file
// for reading.
73 delim=FileParser::getDelimiter(fileName);
74 //cout << "delim = " << delim << endl;
76 _fileIn.open(fileName.c_str(),ios::in);
78 // Read in lines until we get to the beginning of sequence firstSeq.
// A line whose first character equals the greater marker is a header line.
// NOTE(review): the visible loop never checks the getline result, so it is
// unclear how a file holding fewer than firstSeq headers leaves this loop -
// confirm against the full source.
82 std::getline(_fileIn,line,delim);
83 if(line.substr(0,1) == greater){
86 } while(_currentSeqNum <firstSeq);
// Main loop: one pass per requested record; line holds the header on entry.
89 while (nSeqsRead < nSeqsToRead)
91 // get sequence name from current line (excluded '>' and read up to first ' ' or MAXNAMES
92 // remove the first char i.e. '>'
// Keep at most MAXNAMES characters that follow the leading marker.
93 name=line.substr(1,MAXNAMES);
94 //if(name.find(">") != string::npos){
95 // andreas wilm: exit if angle bracket within header?
// Drop any spaces that sit between the marker and the name itself.
98 while(name.substr(0,1)==" "){
99 name=name.substr(1,MAXNAMES);
102 //i = name.find(" ");
// Cut the name at the first remaining space, discarding what follows it.
103 if(name.find(" ") != string::npos){
104 name=name.substr(0,name.find(" "));
106 utilityObject->rTrim(&name); // also replaces linef
108 name=utilityObject->blankToUnderscore(name); // replace blanks with '_'
111 // Read in lines until we get to the beginning of next sequence.
113 title = ""; // No title information
// Consume the sequence lines of this record, one delimiter-terminated line
// per getline call, scanning each line character by character.
115 while(std::getline(_fileIn,line,delim) ){
117 string::const_iterator iterator1 = line.begin();
118 while(iterator1 != line.end()){
120 // Andreas Wilm (UCD): exit if angle brackets within sequence
121 if(*iterator1=='>' && iterator1!=line.begin()) {
122 /* error output handled in Clustal.cpp
123 cerr << "\nMultiple angle brackets inside sequence found:"
124 << " invalid format.\n"
125 << "Maybe you forgot a linebreak between sequences?\n";
128 parseExitCode=BADFORMAT;
130 seqRangeVector.clear();
131 return seqRangeVector;
// Line-feed, carriage-return, EOS or a greater-than sign ends the scan of the
// current line (branch body not visible here - presumably a break).
134 if(*iterator1 =='\n' || *iterator1 =='\r' || *iterator1 == EOS || *iterator1 =='>'){
// c is assigned on lines that are not visible here; it is appended to the
// residue string collected for this record.
141 characterSeq.append(1,c);
// NOTE(review): if this test runs after the inner loop has reached
// line.end(), it reads through the end iterator - confirm against the full
// source.
145 if(*iterator1 == '>'){
// Validate the collected residues before the record is stored.
151 if ((int)characterSeq.length() > userParameters->getMaxAllowedSeqLength())
153 /* error output handled in Clustal.cpp */
154 parseExitCode=SEQUENCETOOBIG;
155 if (offendingSeq!=NULL)
156 offendingSeq->assign(name);
158 seqRangeVector.clear();
159 return seqRangeVector;
161 else if (characterSeq.length() == 0)
163 parseExitCode=EMPTYSEQUENCE;
164 if (offendingSeq!=NULL)
165 offendingSeq->assign(name);
167 seqRangeVector.clear();
168 return seqRangeVector;
// Record accepted: store residues, name and (empty) title.
171 seqRangeVector.push_back(Sequence(characterSeq, name, title));
174 } // while (nSeqsRead < nSeqsToRead)
178 return seqRangeVector;
// Message for the exception path (handler header not visible here -
// presumably a catch block).
183 cerr << "There was an exception in the PearsonFileParser::getSeqRange function.\n"
184 << "Need to end program\n";
192 * The function getSeq is used to get the sequence 'seqNum' in the file. It returns a
193 * sequence object containing the sequence.
194 * Deprecated: where possible use faster getSeqRange which reads
195 * sequences in one go
196 * @param seqNum The number of the sequence to get.
// Deprecated single-record reader. Prints a deprecation notice to cerr, scans
// the file named by fileName for record number seqNum and returns it as a
// Sequence built from the residue string, the name and an empty title.
// In the visible failure paths parseExitCode is set to SEQUENCETOOBIG or
// EMPTYSEQUENCE and a Sequence built from three copies of blank is returned.
// The counter starts at 0 and the skip loop runs until it reaches seqNum, so
// seqNum looks 1-based - TODO confirm.
199 Sequence PearsonFileParser::getSeq(int seqNum, string *offendingSeq)
201 //char _line[MAXLINE + 1];
202 //char tseq[MAXLINE + 1];
203 //char sname[MAXNAMES + 1];
204 //sname [MAXNAMES] = '\0';
205 string characterSeq = "";
209 string greater = ">";
214 cerr << "Use of PearsonFileParser::getSeq is deprecated!\n";
218 int _currentSeqNum = 0; // Not at any sequence yet!
// NOTE(review): the next three lines treat _fileIn as a pointer, while the
// open call further down uses member access on an object; presumably these
// three lines sit inside a commented-out region whose delimiters are not
// visible here - confirm.
223 _fileIn = new InFileStream; //nige
224 _fileIn->open(fileName.c_str()); //nige
225 _fileIn->seekg(0, std::ios::beg); // start at the beginning
// Determine the delimiter handed to getline for this file, then open the file
// for reading.
227 delim=FileParser::getDelimiter(fileName);
228 //cout << "delim = " << delim << endl;
230 _fileIn.open(fileName.c_str(),ios::in);
232 //////////////////////////////////////////////////
233 //PMcG replace char array with string processing
234 //////////////////////////////////////////////////
236 // Read in lines until we get to the beginning of sequence seqNum.
// A line whose first character equals the greater marker is a header line.
// NOTE(review): the visible loop never checks the getline result, so it is
// unclear how a file holding fewer than seqNum headers leaves this loop -
// confirm against the full source.
240 std::getline(_fileIn,line,delim);
241 if(line.substr(0,1) == greater){
244 } while(_currentSeqNum <seqNum);
247 // get sequence name from current line (excluded '>' and read up to first ' ' or MAXNAMES
248 // remove the first char i.e. '>'
// Keep at most MAXNAMES characters that follow the leading marker.
249 name=line.substr(1,MAXNAMES);
251 //////////////////////////////////////
252 // PMcG 9-2-2008 need to handle spaces at start of sequence name to conform to 1.83 handling
253 //////////////////////////////////////
254 while(name.substr(0,1)==" "){
255 name=name.substr(1,MAXNAMES);
258 //i = name.find(" ");
// Cut the name at the first remaining space, discarding what follows it.
// In the visible lines no rTrim call precedes blankToUnderscore here, unlike
// in getSeqRange.
259 if(name.find(" ") != string::npos){
260 name=name.substr(0,name.find(" "));
262 name=utilityObject->blankToUnderscore(name); // replace blanks with '_'
265 // Read in lines until we get to the beginning of sequence seqNum.
// Legacy char-array implementation; the block comment that holds it opens on
// the next line.
267 /* PMcG replace char array with string processing
268 while (_currentSeqNum != seqNum)
272 if(!_fileIn->getline(_line, MAXLINE + 1))
274 freeFileResources(_fileIn);
275 return Sequence(blank, blank, blank);
279 if(_currentSeqNum == seqNum) // Found the sequence
283 // Get next line so that we are past the '>' line
284 _fileIn->getline(_line, MAXLINE + 1);
287 // line contains the name of the sequence
288 for (i = 1; i <= strlen(_line); i++)
295 strncpy(sname, _line + i, MAXNAMES); // remember entryname
296 for (i = 1; i <= strlen(sname); i++)
304 utilityObject->rTrim(sname);
305 utilityObject->blankToUnderscore(sname); // replace blanks with '_'
306 name = string(sname);
// String-based residue reader (presumably the legacy block has ended before
// this point - its closing delimiter is not visible here).
309 title = ""; // No title information
312 while(std::getline(_fileIn,seqLine,delim) ){
313 string::const_iterator iterator1 = seqLine.begin();
314 while(iterator1 != seqLine.end()){
// Line-feed, carriage-return, EOS or a greater-than sign ends the scan of the
// current line (branch body not visible here - presumably a break).
315 if(*iterator1 =='\n' || *iterator1 =='\r' || *iterator1 == EOS || *iterator1 =='>'){
320 // PMcG 15-02-2008 bug 91
321 // strip out spaces and numbers from pseudo_fasta files
322 // but need to maintain gaps if present in sequence input
323 // to replicate behaviour of v1.83
324 //if(*iterator1 != ' ' && !isdigit(*iterator1)){
// c is assigned on lines that are not visible here; it is appended to the
// residue string collected for this record.
326 characterSeq.append(1,c);
// NOTE(review): if this test runs after the inner loop has reached
// seqLine.end(), it reads through the end iterator - confirm against the
// full source.
330 if(*iterator1 == '>'){
// The next three lines use the pointer-style _fileIn and the _line buffer of
// the legacy implementation; presumably they are commented out - confirm.
335 while (_fileIn->getline(_line, MAXLINE + 1))
337 for (i = 0; i <= MAXLINE; i++)
340 if (c == '\n' || c == EOS || c == '>')
// Validate the collected residues before building the result.
360 if ((int)characterSeq.length() > userParameters->getMaxAllowedSeqLength())
362 parseExitCode=SEQUENCETOOBIG;
364 return Sequence(blank, blank, blank);
366 else if (characterSeq.length() == 0)
368 parseExitCode=EMPTYSEQUENCE;
370 return Sequence(blank, blank, blank);
373 return Sequence(characterSeq, name, title);
// Message for the exception path (handler header not visible here -
// presumably a catch block).
378 cerr << "There was an exception in the PearsonFileParser::getSeq function.\n"
379 << "Need to end program\n";
385 * The function countSeqs counts the number of sequences in a file.
386 * @return The number of sequences in the file.
// Counts the sequence records in the file named by fileName.
// Returns 0 when the file cannot be opened. The statements inside the getline
// loop are not visible here; presumably they count header lines - TODO
// confirm.
388 int PearsonFileParser::countSeqs()
390 //char line[1000 + 1];
397 //_fileIn = new InFileStream; //nige
398 //_fileIn->open(fileName.c_str()); //nige
// Determine the delimiter handed to getline for this file, then open the file
// for reading.
399 delim=FileParser::getDelimiter(fileName);
401 _fileIn.open(fileName.c_str(),ios::in);
404 if(!_fileIn.is_open())
406 return 0; // No sequences found!
// Older reading loop, left behind inside a block comment that opens on the
// next line.
409 /* while ((*_fileIn) >> line2/@_fileIn->getline(line, 1000 + 1)@/)
413 cout << "\n\n" << line << "\n\n";
// Walk the file one delimiter-terminated line at a time.
417 while (std::getline(_fileIn,line2,delim)) {
// Exception path (handler header not visible here - presumably a catch
// block): release the stream, then report on cerr.
// NOTE(review): freeFileResources receives _fileIn, which the open call above
// uses as an object rather than a pointer - confirm the expected argument
// type.
428 freeFileResources(_fileIn);
429 cerr << "An exception has occured in the function PearsonFileParser::countSeqs()\n"
430 << "Program needs to terminate.\nPlease contact the Clustal developers\n";
436 * There is no secondary structure information in the Pearson file. This is here to
437 * set the structPenalties to NONE.
438 * @param gapPenaltyMask
439 * @param secStructMask
440 * @param secStructName
441 * @param structPenalties
// Pearson input carries no secondary-structure information, so the only
// visible effect of this function is to report NONE through structPenalties.
// gapPenaltyMask, secStructMask, secStructName and length are not used in the
// visible body.
444 void PearsonFileParser::getSecStructure(vector<char>& gapPenaltyMask,
445 vector<char>& secStructMask, string& secStructName,
446 int &structPenalties, int length)
448 structPenalties = NONE;