4 * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson.
9 * 10-02-07,Nigel Brown(EMBL): changed ifstream to InFileStream to handle
10 * cross-platform end-of-lines.
12 * 23-03-07,Nigel Brown(EMBL): added call to Alignment::testUniqueNames() to
13 * test sequence names prior to appending; moved
14 * Alignment::checkAllNamesDifferent() test into block handling loading of new
17 * 02-04-07,Nigel Brown(EMBL): commented out the "No sequences in file..."
18 * warnings in seqInput() and profileInput()and enabled the
19 * higher-up-the-stack counterpart in MainWindow::errorHandler(), since the
20 * former was causing a crash during the repaint after the user accepts the
21 * message, because some state (what exactly?) is unstable at this depth after
22 * a sequence load error, specifically when loading an empty file after
23 * already loading some sequences.
29 #include "FileReader.h"
35 FileReader::FileReader()
37 fileIn = new InFileStream;
38 structPenalties = NONE; // I think this should be ok.
41 FileReader::~FileReader()
47 /* check if we've read sequences without any information, i.e. header
48 * but no sequence at all
50 bool FileReader::noEmptySequence(vector<Sequence> seqVector, string *offendingSeq)
52 vector<Sequence>::iterator si;
53 for (si = seqVector.begin(); si != seqVector.end(); si++) {
55 offendingSeq->assign(si->getName());
65 * The function seqInput is called from the interactive menu only. This is because it
66 * allows us to append seqs. But this would not happen on command line.
67 * It is called twice in the interactive menu, both times with append as false. It calls
68 * the readseqs function to do the work.
70 int FileReader::seqInput(Alignment *alignPtr, bool append, string *offendingSeq)
74 if(userParameters->getMenuFlag())
76 cout << "\n\nSequences should all be in 1 file.\n";
77 cout << "\n7 formats accepted: \n";
78 cout << "NBRF/PIR, EMBL/SwissProt, Pearson (Fasta), GDE, Clustal, GCG/MSF, \
84 int numSeqsAlready = alignPtr->getNumSeqs();
85 code = readSeqs(alignPtr, numSeqsAlready + 1, offendingSeq);
89 code = readSeqs(alignPtr, 1, offendingSeq); // 1 is the first seq to be read
94 userParameters->setStructPenalties1(false);
95 userParameters->setStructPenalties2(false);
97 alignPtr->clearSecStruct1();
98 alignPtr->clearSecStruct2();
100 string typeOfAlign = userParameters->getDNAFlag() ? "DNA" : "PROTEIN";
101 cout << "Sequences assumed to be "
102 << typeOfAlign << endl;
104 if (userParameters->getMenuFlag())
107 alignPtr->printSequencesAddedInfo();
109 if(userParameters->getDNAFlag())
111 userParameters->setDNAMultiGap();
115 userParameters->setProtMultiGap();
117 userParameters->setEmpty(false);
119 else if(code == NOSEQUENCESINFILE) // no sequences
121 /* 02-04-07,nige: this causes a fatal repaint: let
122 * MainWindow::errorHandler deal with it.
124 //utilityObject->error("No sequences in file! Bad format?\n");
135 * The function readSeqs will create the FileParser that is needed and
136 * it will read in all of the sequences. Depending on the filetype, it
137 * will also check for secondary structure information.
139 * If there is a problem with one of the sequences its name will be
140 * assigned to offendingSeq
142 int FileReader::readSeqs(Alignment *alignPtr, int firstSeq, string *offendingSeq)
146 string linuxFilePath = "file://";
147 // static char *seq1, sname1[MAXNAMES + 1], title[MAXTITLES + 1];
151 static bool DNAFlag1;
152 vector<Sequence> seqVector;
153 vector<Sequence> seqRangeVector;
155 auto_ptr<FileParser> fileParser; // Means we dont need to delete it!
157 vector<int> _outputIndex; // Local version of outputIndex.
160 if (userParameters->getMenuFlag())
162 utilityObject->getStr(string("Enter the name of the sequence file "), line);
166 line = userParameters->getSeqName();
168 if (line.length() == 0)
170 // We have no fileName, so return -1
174 // If we have file:// in the string, remove it!!
175 string::size_type loc = line.find(linuxFilePath, 0);
176 if(loc != string::npos)
178 fileName = line.substr(loc + linuxFilePath.length());
181 // Now we have the file name, we must open the file.
182 fileIn->open(line.c_str()); //nige
184 return CANNOTOPENFILE;
186 sequenceFileName = line;
188 // Check if the file exists!
189 if (!fileIn->is_open())
191 utilityObject->error("Cannot open input file (%s)\n", line.c_str());
192 return CANNOTOPENFILE;
195 userParameters->setSeqName(line);
198 // NOTE I made a change here because the profile2Name was not stored!
199 if(userParameters->getProfileNum() == 2 && userParameters->getProfile2Name().empty())
201 userParameters->setProfile2Name(line);
203 else if(userParameters->getProfileNum() == 1 && userParameters->getProfile1Name().empty())
205 userParameters->setProfile1Name(line);
210 checkInfile(&noSeqs, fileParser);
214 return NOSEQUENCESINFILE;
218 seqRangeVector = fileParser->getSeqRange(1, noSeqs, offendingSeq);
219 if (seqRangeVector.size()==0)
220 return fileParser->getParseExitCode();
221 // FIXME Andreas Wilm (UCD): noEmptySequence check should be done
222 // internally by FileParser instances
223 if (noEmptySequence(seqRangeVector, offendingSeq) == false) {
224 return EMPTYSEQUENCE; // Error there are same names.
227 for (int i=0; i < (int)seqRangeVector.size(); i++) {
228 // Andreas Wilm (UCD): fixed wrong default output order
229 // which is important when no alignment (convert!) is done
230 // _outputIndex.push_back(i); // default output order
231 _outputIndex.push_back(i+1); // default output order
233 Sequence tempSeq = seqRangeVector[i];
235 if (!userParameters->getExplicitDNAFlag())
237 DNAFlag1 = tempSeq.checkDNAFlag(); // check DNA/Prot
240 userParameters->setDNAFlag(DNAFlag1);
242 } // type decided by first seq
245 DNAFlag1 = userParameters->getDNAFlag();
248 seqVector.push_back(tempSeq);
252 if(firstSeq == 1) // New mulitple alignment, or else profile1
254 int prfNum = userParameters->getProfileNum();
255 alignPtr->addSequences(&seqVector);
256 //test names after saving
257 if(alignPtr->checkAllNamesDifferent(offendingSeq) == false) { //nige moved
258 return ALLNAMESNOTDIFFERENT; // Error there are same names.
261 userParameters->setProfileNum(prfNum);
262 if(!alignPtr->addOutputIndex(&_outputIndex))
269 //test names before appending
270 if (! alignPtr->testUniqueNames(&seqVector, offendingSeq)) { //nige added
271 return ALLNAMESNOTDIFFERENT;
273 alignPtr->appendSequences(&seqVector);
274 if(!alignPtr->appendOutputIndex(&_outputIndex))
280 // Then need to pass the length to the parser to get the secondary structure info.
282 int maxAlnLength; // Get values from the Alignment object.
283 maxAlnLength = alignPtr->getMaxAlnLength();
285 // look for a feature table / gap penalty mask (only if this is a profile)
286 if (userParameters->getProfileNum() > 0)
288 structPenalties = NONE;
289 secStructMask.clear();
290 gapPenaltyMask.clear();
293 fileParser->getSecStructure(gapPenaltyMask, secStructMask, secStructName,
294 structPenalties, maxAlnLength);
296 // Andreas Wilm (UCD): bug 114
297 // after calling getSecStructure gapPenaltyMask needs to be
298 // allocated (it's copied/linked to later)
299 if (gapPenaltyMask.empty())
301 gapPenaltyMask.resize(secStructMask.size());
305 if (fileIn->is_open())
309 return OK; // return the number of seqs. read in this call
313 * The function profileInput is used to read profiles into the Alignment. If the first
314 * profile is already there it will read in the second profile. It returns the number
315 * of seqs. If it returns -1, couldnt open file.
317 int FileReader::profileInput(Alignment *alignPtr)
320 //int i, totalNumOfSeqs = 0;
323 if(userParameters->getProfileNum() == 2 && userParameters->getProfile1Empty())
325 utilityObject->error("You must read in profile number 1 first\n");
326 return MUSTREADINPROFILE1FIRST;
329 if(userParameters->getProfileNum() == 1) // for the 1st profile
331 code = readSeqs(alignPtr, 1, &offendingSeq);
335 // success; found some seqs.
336 userParameters->setStructPenalties1(NONE);
337 alignPtr->clearSecStruct1(); // Clear old info
339 if (structPenalties != NONE) // feature table / mask in alignment
341 userParameters->setStructPenalties1(structPenalties);
342 if (structPenalties == SECST)
345 alignPtr->addSecStructMask1(&secStructMask);
347 alignPtr->addGapPenaltyMask1(&gapPenaltyMask);
348 alignPtr->addSecStructName1(secStructName);
352 alignPtr->setProfile1NumSeqs(alignPtr->getNumSeqs());
354 userParameters->setProfile1Empty(false);
355 userParameters->setProfile2Empty(true);
356 cout << "No. of seqs = " << alignPtr->getNumSeqs() << endl;
360 return code; // FIXME and offendingSeq?
365 // first seq to be read = profile1_nseqs + 1
366 int profile1NumOfSeqs = alignPtr->getNumSeqs();
367 code = readSeqs(alignPtr, profile1NumOfSeqs + 1, &offendingSeq);
370 userParameters->setStructPenalties2(NONE);
371 alignPtr->clearSecStruct2(); // Clear old info
373 if (structPenalties != NONE) // feature table / mask in alignment
375 userParameters->setStructPenalties2(structPenalties);
376 if (structPenalties == SECST)
378 alignPtr->addSecStructMask2(&secStructMask);
380 alignPtr->addGapPenaltyMask2(&gapPenaltyMask);
381 alignPtr->addSecStructName2(secStructName);
383 cout << "No. of seqs in profile=" << alignPtr->getNumSeqs() - profile1NumOfSeqs
385 cout << "Total no. of seqs =" << alignPtr->getNumSeqs() << endl;
386 userParameters->setProfile2Empty(false);
387 userParameters->setEmpty(false);
391 return code; // FIXME and offendingSeq?
394 // Clear out the masks used in this class. This is important!!!!
395 secStructMask.clear();
396 gapPenaltyMask.clear();
399 string typeOfAlign = userParameters->getDNAFlag() ? "DNA" : "PROTEIN";
400 cout << "Sequences assumed to be "
401 << typeOfAlign << endl;
403 if (userParameters->getMenuFlag())
408 alignPtr->printSequencesAddedInfo();
410 if(userParameters->getDNAFlag())
412 userParameters->setDNAMultiGap();
416 userParameters->setProtMultiGap();
423 * The function checkInfile is used to find out which format the file is in, and it
424 * also calls the appropriate function to count the seqences.
426 void FileReader::checkInfile(int *nseqs, auto_ptr<FileParser>& fileParser)
428 char _lineIn[MAXLINE + 1];
433 while (fileIn->getline(_lineIn, MAXLINE + 1))
435 if (!utilityObject->blankLine(_lineIn))
440 lengthLine = strlen(_lineIn) - 1; // Mark Change 20-6-07
442 for (i = lengthLine; i >= 0; i--)
444 if (isgraph(_lineIn[i]))
449 _lineIn[i + 1] = EOS;
451 // Put first 7 characters into upper case!
452 for (i = 0; i <= 6 && i <= lengthLine; i++)
454 _lineIn[i] = toupper(_lineIn[i]);
457 // Create the parser to read the file.
458 if (utilityObject->lineType(_lineIn, "ID"))
460 // EMBL/Swiss-Prot format ?
461 fileParser.reset(new EMBLFileParser(sequenceFileName));
462 if(userParameters->getDisplayInfo())
463 cout << "Sequence format is EMBL\n";
465 else if (utilityObject->lineType(_lineIn, "CLUSTAL"))
467 fileParser.reset(new ClustalFileParser(sequenceFileName));
468 if(userParameters->getDisplayInfo())
469 cout << "Sequence format is CLUSTAL\n";
471 else if (utilityObject->lineType(_lineIn, "PILEUP")) // MSF
473 fileParser.reset(new MSFFileParser(sequenceFileName));
474 if(userParameters->getDisplayInfo())
475 cout << "Sequence format is MSF\n";
477 else if (utilityObject->lineType(_lineIn, "!!AA_MULTIPLE_ALIGNMENT")) // MSF
479 fileParser.reset(new MSFFileParser(sequenceFileName));
480 userParameters->setDNAFlag(false);
481 if(userParameters->getDisplayInfo())
482 cout << "Sequence format is MSF\n";
484 else if (utilityObject->lineType(_lineIn, "!!NA_MULTIPLE_ALIGNMENT")) // MSF
486 fileParser.reset(new MSFFileParser(sequenceFileName));
487 userParameters->setDNAFlag(true);
488 if(userParameters->getDisplayInfo())
489 cout << "Sequence format is MSF\n";
491 else if (strstr(_lineIn, "MSF") && _lineIn[strlen(_lineIn) - 1] == '.' &&
492 _lineIn[strlen(_lineIn) - 2] == '.') // MSF
494 fileParser.reset(new MSFFileParser(sequenceFileName));
495 if(userParameters->getDisplayInfo())
496 cout << "Sequence format is MSF\n";
498 else if (utilityObject->lineType(_lineIn, "!!RICH_SEQUENCE")) // RSF
500 fileParser.reset(new RSFFileParser(sequenceFileName));
501 if(userParameters->getDisplayInfo())
502 cout << "Sequence format is RSF\n";
504 else if (utilityObject->lineType(_lineIn, "#NEXUS"))
506 //utilityObject->error("Cannot read NEXUS format\n");
509 else if (*_lineIn == '>')
511 if ((lengthLine>=3) && _lineIn[3] == ';') {
512 //if(_lineIn[3] == ';') // distinguish PIR and Pearson
515 fileParser.reset(new PIRFileParser(sequenceFileName));
516 if(userParameters->getDisplayInfo())
517 cout << "Sequence format is PIR\n";
523 fileParser.reset(new PearsonFileParser(sequenceFileName));
524 if(userParameters->getDisplayInfo())
525 cout << "Sequence format is Pearson\n";
528 else if ((*_lineIn == '"') || (*_lineIn == '%') || (*_lineIn == '#'))
531 fileParser.reset(new GDEFileParser(sequenceFileName));
532 if(userParameters->getDisplayInfo())
533 cout << "Sequence format is GDE\n";
537 userParameters->setDNAFlag(false);
539 else if (*_lineIn == '#')
541 userParameters->setDNAFlag(true);
551 // Get the number of sequences. This is passed back as a pointer!
552 *nseqs = fileParser->countSeqs();
553 // no output in 1.83: cout << "number of seqs is: " << *nseqs << "\n";