+++ /dev/null
-/**
- * Author: Mark Larkin
- *
- * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson.
- */
-/**
- * Changes:
- *
- * Mark 24-1-2007. I am now using the char "delimiter" for the delimiter when using
- * getline. This is to get around the problem of some files having '\r'.
- *
- * 10-02-07,Nigel Brown(EMBL): Changed ifstream to InFileStream to handle
- * cross-platform end-of-lines and removed delimiter member.
- *
- * 28-12-07,Paul McGettigan : replaced array processing with string processing this fixes bug #72
- *
- * 9-2-2008, Paul McGettigan : fixed problem where a space after '>' but before the sequence name was
- * causing the alignment to fail because no sequence name was read in
- *
- * 15-2-2008, Paul McGettigan : fixed bug 91 where pseudo-FASTA format files were not being processed as
- * they previously were in v1.83
- */
-
-#ifdef HAVE_CONFIG_H
- #include "config.h"
-#endif
-#include "PearsonFileParser.h"
-
-namespace clustalw
-{
-
-/**
- * Builds a parser for one Pearson/FASTA file.
- *
- * Only records the path and calls fillCharTab(); the file itself is not
- * opened here. NOTE(review): fillCharTab() presumably populates the chartab
- * lookup used by the readers below -- confirm in FileParser.
- *
- * @param filePath path of the file that the reader methods will open.
- */
-PearsonFileParser::PearsonFileParser(string filePath)
-{
-    // Remember the path first, then prepare the character table.
-    this->fileName = filePath;
-    this->fillCharTab();
-}
-
-
-/**
- * Reads a run of consecutive sequences from the fasta/pearson file in one
- * pass instead of calling getSeq for each single sequence.
- *
- * FIXME AW: only PearsonFileParser::getSeqRange is special, rest is the
- * same. should be defined in FileParser and then overloaded in special
- * cases like here
- *
- * @param firstSeq     1-based number of the first sequence to read (the
- *                     n-th line starting with '>' is sequence n).
- * @param nSeqsToRead  number of consecutive sequences to read.
- * @param offendingSeq if not NULL, receives the name of the sequence that
- *                     was too long or empty. It is left untouched when the
- *                     file simply holds fewer sequences than requested.
- * @return the sequences read. On any error the vector is empty and
- *         parseExitCode is set to BADFORMAT ('>' inside a sequence line),
- *         SEQUENCETOOBIG or EMPTYSEQUENCE (empty sequence, or the file
- *         cannot be read / ends before the requested sequence).
- */
-vector<Sequence>
-PearsonFileParser::getSeqRange(int firstSeq, int nSeqsToRead, string *offendingSeq)
-{
-    string characterSeq = "";
-    string name = "";
-    string title = "";
-    string greater = ">";
-    vector<Sequence> seqRangeVector;
-
-    int nSeqsRead = 0;
-    int headersSeen = 0; // Not at any sequence yet!
-
-    try
-    {
-        const char delim = FileParser::getDelimiter(fileName);
-
-        // The stream is closed by its destructor on every return path.
-        ifstream seqFile;
-        seqFile.open(fileName.c_str(), ios::in);
-
-        // Read in lines until we get to the beginning of sequence firstSeq.
-        // The stream state must be checked here: without it an unreadable
-        // file, or a firstSeq beyond the last record, would loop forever.
-        string line = "";
-        do {
-            if (!std::getline(seqFile, line, delim)) {
-                parseExitCode = EMPTYSEQUENCE;
-                return seqRangeVector;
-            }
-            if (line.substr(0, 1) == greater) {
-                headersSeen++;
-            }
-        } while (headersSeen < firstSeq);
-
-        // True while 'line' holds a header that has not been consumed yet.
-        bool headerPending = (line.substr(0, 1) == greater);
-
-        while (nSeqsRead < nSeqsToRead)
-        {
-            if (!headerPending) {
-                // The file ended before all requested sequences were found.
-                // Without this guard the name would be built from a stale
-                // line, and substr(1, ...) throws on an empty one.
-                parseExitCode = EMPTYSEQUENCE;
-                seqRangeVector.clear();
-                return seqRangeVector;
-            }
-            headerPending = false;
-
-            // Name: drop the '>', keep at most MAXNAMES characters, skip
-            // leading blanks and cut at the first blank after the name.
-            name = line.substr(1, MAXNAMES);
-            name.erase(0, name.find_first_not_of(' '));
-            const string::size_type firstBlank = name.find(' ');
-            if (firstBlank != string::npos) {
-                name.erase(firstBlank);
-            }
-            utilityObject->rTrim(&name); // also replaces linef
-            name = utilityObject->blankToUnderscore(name); // replace blanks with '_'
-
-            title = ""; // No title information
-
-            // Collect residues until the next header line or end of file.
-            while (std::getline(seqFile, line, delim)) {
-                if (!line.empty() && line[0] == '>') {
-                    headerPending = true; // start of the next record
-                    break;
-                }
-
-                for (string::const_iterator pos = line.begin(); pos != line.end(); ++pos) {
-                    // Andreas Wilm (UCD): exit if angle brackets within sequence.
-                    // A '>' seen here is never the first character of the line.
-                    if (*pos == '>') {
-                        /* error output handled in Clustal.cpp */
-                        parseExitCode = BADFORMAT;
-                        seqRangeVector.clear();
-                        return seqRangeVector;
-                    }
-                    if (*pos == '\n' || *pos == '\r' || *pos == EOS) {
-                        break; // rest of this line is ignored
-                    }
-                    // chartab yields 0 for characters that are not kept.
-                    unsigned char c = *pos;
-                    c = chartab[c];
-                    if (c) {
-                        characterSeq.append(1, c);
-                    }
-                }
-            }
-
-            // check sequence
-            if ((int)characterSeq.length() > userParameters->getMaxAllowedSeqLength())
-            {
-                /* error output handled in Clustal.cpp */
-                parseExitCode = SEQUENCETOOBIG;
-                if (offendingSeq != NULL) {
-                    offendingSeq->assign(name);
-                }
-                seqRangeVector.clear();
-                return seqRangeVector;
-            }
-            if (characterSeq.length() == 0)
-            {
-                parseExitCode = EMPTYSEQUENCE;
-                if (offendingSeq != NULL) {
-                    offendingSeq->assign(name);
-                }
-                seqRangeVector.clear();
-                return seqRangeVector;
-            }
-
-            seqRangeVector.push_back(Sequence(characterSeq, name, title));
-            characterSeq = "";
-            nSeqsRead++;
-        } // while (nSeqsRead < nSeqsToRead)
-
-        return seqRangeVector;
-    }
-
-    catch(...)
-    {
-        cerr << "There was an exception in the PearsonFileParser::getSeqRange function.\n"
-             << "Need to end program\n";
-        exit(1);
-    }
-}
-
-
-
-/**
- * The function getSeq is used to get the sequence 'seqNum' in the file. It returns a
- * sequence object containing the sequence.
- * Deprecated: where possible use faster getSeqRange which reads
- * sequences in one go
- *
- * A '>' anywhere in a sequence line ends the sequence (unlike getSeqRange,
- * which reports BADFORMAT for a '>' that is not at the start of a line).
- *
- * @param seqNum       The 1-based number of the sequence to get (the n-th
- *                     line starting with '>' is sequence n).
- * @param offendingSeq if not NULL, receives the name of the sequence that
- *                     was too long or empty.
- * @return the sequence. On error a Sequence built from three empty strings
- *         is returned and parseExitCode is set to SEQUENCETOOBIG or
- *         EMPTYSEQUENCE (empty sequence, or the file cannot be read / ends
- *         before sequence seqNum).
- */
-Sequence PearsonFileParser::getSeq(int seqNum, string *offendingSeq)
-{
-    string characterSeq = "";
-    string name = "";
-    string title = "";
-    string blank = "";
-    string greater = ">";
-
-    cerr << "Use of PearsonFileParser::getSeq is deprecated!\n";
-    int headersSeen = 0; // Not at any sequence yet!
-
-    try
-    {
-        const char delim = FileParser::getDelimiter(fileName);
-
-        // The stream is closed by its destructor on every return path.
-        ifstream seqFile;
-        seqFile.open(fileName.c_str(), ios::in);
-
-        // Read in lines until we get to the beginning of sequence seqNum.
-        // The stream state must be checked here: without it an unreadable
-        // file, or a seqNum beyond the last record, would loop forever.
-        string line = "";
-        do {
-            if (!std::getline(seqFile, line, delim)) {
-                parseExitCode = EMPTYSEQUENCE;
-                return Sequence(blank, blank, blank);
-            }
-            if (line.substr(0, 1) == greater) {
-                headersSeen++;
-            }
-        } while (headersSeen < seqNum);
-
-        // Only reachable with seqNum < 1: the first line is not a header,
-        // and substr(1, ...) below would throw if it were empty.
-        if (line.substr(0, 1) != greater) {
-            parseExitCode = EMPTYSEQUENCE;
-            return Sequence(blank, blank, blank);
-        }
-
-        // Name: drop the '>', keep at most MAXNAMES characters.
-        name = line.substr(1, MAXNAMES);
-
-        // PMcG 9-2-2008 need to handle spaces at start of sequence name to
-        // conform to 1.83 handling
-        name.erase(0, name.find_first_not_of(' '));
-        const string::size_type firstBlank = name.find(' ');
-        if (firstBlank != string::npos) {
-            name.erase(firstBlank);
-        }
-        // Same trimming as getSeqRange, so both readers give the same name.
-        utilityObject->rTrim(&name);
-        name = utilityObject->blankToUnderscore(name); // replace blanks with '_'
-
-        title = ""; // No title information
-
-        // Collect residues until a '>' is met or the file ends.
-        string seqLine = "";
-        bool reachedNextHeader = false;
-        while (!reachedNextHeader && std::getline(seqFile, seqLine, delim)) {
-            for (string::const_iterator pos = seqLine.begin(); pos != seqLine.end(); ++pos) {
-                if (*pos == '>') {
-                    reachedNextHeader = true;
-                    break;
-                }
-                if (*pos == '\n' || *pos == '\r' || *pos == EOS) {
-                    break; // rest of this line is ignored
-                }
-                // PMcG 15-02-2008 bug 91
-                // strip out spaces and numbers from pseudo_fasta files
-                // but need to maintain gaps if present in sequence input
-                // to replicate behaviour of v1.83
-                // (chartab yields 0 for characters that are not kept)
-                unsigned char c = *pos;
-                c = chartab[c];
-                if (c) {
-                    characterSeq.append(1, c);
-                }
-            }
-        }
-
-        if ((int)characterSeq.length() > userParameters->getMaxAllowedSeqLength())
-        {
-            parseExitCode = SEQUENCETOOBIG;
-            if (offendingSeq != NULL) {
-                offendingSeq->assign(name);
-            }
-            // return empty seq
-            return Sequence(blank, blank, blank);
-        }
-        if (characterSeq.length() == 0)
-        {
-            parseExitCode = EMPTYSEQUENCE;
-            if (offendingSeq != NULL) {
-                offendingSeq->assign(name);
-            }
-            // return empty seq
-            return Sequence(blank, blank, blank);
-        }
-
-        return Sequence(characterSeq, name, title);
-    }
-
-    catch(...)
-    {
-        cerr << "There was an exception in the PearsonFileParser::getSeq function.\n"
-             << "Need to end program\n";
-        exit(1);
-    }
-}
-
-/**
- * Counts the sequences in the file by counting the lines whose first
- * character is '>'. Lines are split on the delimiter reported by
- * FileParser::getDelimiter for this file.
- *
- * @return The number of sequences in the file, or 0 if the file could not
- *         be opened.
- */
-int PearsonFileParser::countSeqs()
-{
-    try
-    {
-        const char lineEnd = FileParser::getDelimiter(fileName);
-
-        ifstream fastaIn;
-        fastaIn.open(fileName.c_str(), ios::in);
-        if (!fastaIn.is_open())
-        {
-            return 0; // No sequences found!
-        }
-
-        int headerCount = 0;
-        string record;
-        while (std::getline(fastaIn, record, lineEnd))
-        {
-            // Every header line marks the start of one sequence.
-            if (record[0] == '>')
-            {
-                ++headerCount;
-            }
-        }
-
-        fastaIn.close();
-        return headerCount;
-    }
-    catch(...)
-    {
-        // _fileIn here is declared outside this function (not the local stream above).
-        freeFileResources(_fileIn);
-        cerr << "An exception has occured in the function PearsonFileParser::countSeqs()\n"
-             << "Program needs to terminate.\nPlease contact the Clustal developers\n";
-        exit(1);
-    }
-}
-
-/**
- * A Pearson file carries no secondary structure information, so the only
- * effect of this function is to set structPenalties to NONE. All other
- * arguments are left untouched.
- *
- * @param gapPenaltyMask  not used.
- * @param secStructMask   not used.
- * @param secStructName   not used.
- * @param structPenalties set to NONE.
- * @param length          not used.
- */
-void PearsonFileParser::getSecStructure(vector<char>& gapPenaltyMask,
-        vector<char>& secStructMask, string& secStructName,
-        int &structPenalties, int length)
-{
-    // Mark the unused arguments explicitly; nothing is read from the file.
-    (void)gapPenaltyMask;
-    (void)secStructMask;
-    (void)secStructName;
-    (void)length;
-
-    structPenalties = NONE;
-}
-
-}
-
-