Mac binaries
[jabaws.git] / website / archive / binaries / mac / src / clustalw / src / fileInput / PearsonFileParser.cpp
diff --git a/website/archive/binaries/mac/src/clustalw/src/fileInput/PearsonFileParser.cpp b/website/archive/binaries/mac/src/clustalw/src/fileInput/PearsonFileParser.cpp
new file mode 100644 (file)
index 0000000..61ef200
--- /dev/null
@@ -0,0 +1,453 @@
+/**
+ * Author: Mark Larkin
+ * 
+ * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson.  
+ */
+/** 
+ * Changes: 
+ *
+ * Mark 24-1-2007. I am now using the char "delimiter" for the delimiter when using
+ * getline. This is to get around the problem of some files having '\r'.
+ *
+ * 10-02-07,Nigel Brown(EMBL): Changed ifstream to InFileStream to handle
+ * cross-platform end-of-lines and removed delimiter member.
+ *
+ * 28-12-07,Paul McGettigan : replaced array processing with string processing this fixes bug #72
+ *
+ * 9-2-2008, Paul McGettigan : fixed problem where space after '>' but before sequence name was causing
+ *                             alignment to fail due to no sequence name being read in
+ * 15-2-2008, Paul McGettigan : fixed bug 91 where Pseudo -FASTA format files were not being processed as 
+ *                              previously in v1.83
+ */
+
+#ifdef HAVE_CONFIG_H
+    #include "config.h"
+#endif
+#include "PearsonFileParser.h"
+
+namespace clustalw
+{
+
+/**
+ * Constructor for the Pearson file parser.
+ * @param filePath 
+ * @return 
+ */
+PearsonFileParser::PearsonFileParser(string filePath)
+{
+    fileName = filePath;
+    fillCharTab();
+}
+
+
+/**
+ * reads fasta/pearson file in one go instead of calling getSeq for
+ * each single sequence.
+ *
+ * FIXME AW: only PearsonFileParser::getSeqRange is special, rest is the
+ * same. should be defined in FileParser and then overloaded in special
+ * cases like here
+ */
+vector<Sequence>
+PearsonFileParser::getSeqRange(int firstSeq, int nSeqsToRead, string *offendingSeq)
+{
+    string characterSeq = "";
+    string name = "";
+    string title = "";
+    string blank = "";
+    string greater = ">";
+    //_line[0] = EOS;
+    vector<Sequence> seqRangeVector;
+    
+    string line;
+    
+    
+    //int i, j;
+    int nSeqsRead = 0;
+    unsigned char c;
+    char delim;
+    int _currentSeqNum = 0; // Not at any sequence yet!
+    
+    try
+    {
+       delim=FileParser::getDelimiter(fileName);
+       //cout << "delim = " << delim << endl;
+       ifstream _fileIn;
+       _fileIn.open(fileName.c_str(),ios::in);
+
+        // Read in lines until we get to the begining of sequence firstSeq.
+        string line="";
+
+        do {
+          std::getline(_fileIn,line,delim);
+          if(line.substr(0,1) == greater){
+              _currentSeqNum++;
+          }
+        } while(_currentSeqNum <firstSeq);
+        
+        
+        while (nSeqsRead < nSeqsToRead)
+        {
+            // get sequence name from current line (excluded '>' and read up to first ' ' or MAXNAMES
+            // remove the first char i.e. '>'
+            name=line.substr(1,MAXNAMES);
+            //if(name.find(">") != string::npos){
+            //  andreas wilm: exit if angle bracket within header?
+            //}
+            
+            while(name.substr(0,1)==" "){
+                name=name.substr(1,MAXNAMES);
+            }
+            //int i;
+            //i = name.find(" ");
+            if(name.find(" ") != string::npos){
+                name=name.substr(0,name.find(" "));
+            }
+            utilityObject->rTrim(&name); // also replaces linef
+
+            name=utilityObject->blankToUnderscore(name); // replace blanks with '_'
+            
+            
+            // Read in lines until we get to the begining of next sequence.
+            
+            title = ""; // No title information
+            
+            while(std::getline(_fileIn,line,delim) ){
+                 
+               string::const_iterator iterator1 = line.begin();
+                while(iterator1 != line.end()){
+
+                    // Andreas Wilm (UCD): exit if angle brackets within sequence
+                    if(*iterator1=='>' && iterator1!=line.begin()) {
+                        /* error output handled in Clustal.cpp
+                        cerr << "\nMultiple angle brackets inside sequence found:"
+                             << " invalid format.\n"
+                             << "Maybe you forgot a linebreak between sequences?\n";
+                        */
+                        
+                        parseExitCode=BADFORMAT;
+                        _fileIn.close();
+                        seqRangeVector.clear();
+                        return seqRangeVector;
+                    }
+                       
+                    if(*iterator1 =='\n' || *iterator1 =='\r' || *iterator1 == EOS || *iterator1 =='>'){
+                        break;
+                    }
+                    c = *iterator1;
+
+                    c = chartab[c];
+                    if(c){
+                        characterSeq.append(1,c);
+                    }
+                    iterator1++;
+                }
+                if(*iterator1 == '>'){
+                    break;
+               } 
+            }
+            
+            // check sequence
+            if ((int)characterSeq.length() > userParameters->getMaxAllowedSeqLength())
+            {
+                /* error output handled in Clustal.cpp */
+                parseExitCode=SEQUENCETOOBIG;
+                if (offendingSeq!=NULL)
+                    offendingSeq->assign(name);
+                _fileIn.close();
+                seqRangeVector.clear();
+                return seqRangeVector;
+            }
+            else if (characterSeq.length() == 0)
+            {
+                parseExitCode=EMPTYSEQUENCE;
+                if (offendingSeq!=NULL)
+                    offendingSeq->assign(name);
+                _fileIn.close();
+                seqRangeVector.clear();
+                return seqRangeVector;
+            }
+
+            seqRangeVector.push_back(Sequence(characterSeq, name, title));
+            characterSeq = "";
+            nSeqsRead++;
+        } // while (nSeqsRead < nSeqsToRead)
+
+        _fileIn.close();
+
+        return seqRangeVector;
+    }
+
+    catch(...)
+    {
+        cerr << "There was an exception in the PearsonFileParser::getSeqRange function.\n"
+             << "Need to end program\n";
+        exit(1);
+    }
+}
+
+
+
+/**
+ * The function getSeq is used to get the sequence 'seqNum' in the file. It returns a
+ * sequence object containing the sequence.
+ * Deprecated: where possible use faster getSeqRange which reads
+ * sequences in one go
+ * @param seqNum The number of the sequence to get.
+ * @return 
+ */
+    Sequence PearsonFileParser::getSeq(int seqNum, string *offendingSeq)
+{
+    //char _line[MAXLINE + 1];
+    //char tseq[MAXLINE + 1];
+    //char sname[MAXNAMES + 1];
+    //sname [MAXNAMES] = '\0';
+    string characterSeq = "";
+    string name = "";
+    string title = "";
+    string blank = "";
+    string greater = ">";
+    //_line[0] = EOS;
+    
+    string line;
+    
+    cerr << "Use of PearsonFileParser::getSeq is deprecated!\n";
+    //int i, j;
+    unsigned char c;
+    char delim;
+    int _currentSeqNum = 0; // Not at any sequence yet!
+    
+    try
+    {
+        /*
+        _fileIn = new InFileStream; //nige
+        _fileIn->open(fileName.c_str());  //nige
+        _fileIn->seekg(0, std::ios::beg); // start at the beginning
+       */
+       delim=FileParser::getDelimiter(fileName);
+       //cout << "delim = " << delim << endl;
+       ifstream _fileIn;
+       _fileIn.open(fileName.c_str(),ios::in);
+
+        //////////////////////////////////////////////////
+        //PMcG replace char array with string processing
+        //////////////////////////////////////////////////
+
+        // Read in lines until we get to the begining of sequence seqNum.
+        string line="";
+
+        do {
+          std::getline(_fileIn,line,delim);
+          if(line.substr(0,1) == greater){
+            _currentSeqNum++;
+          }
+        } while(_currentSeqNum <seqNum);
+        
+        
+        // get sequence name from current line (excluded '>' and read up to first ' ' or MAXNAMES
+        // remove the first char i.e. '>'
+        name=line.substr(1,MAXNAMES);
+           
+        //////////////////////////////////////
+        // PMcG 9-2-2008 need to handle spaces at start of sequence name to conform to 1.83 handling
+        //////////////////////////////////////
+        while(name.substr(0,1)==" "){
+          name=name.substr(1,MAXNAMES);
+        }
+        //int i;
+        //i = name.find(" ");
+        if(name.find(" ") != string::npos){
+          name=name.substr(0,name.find(" "));
+        } 
+        name=utilityObject->blankToUnderscore(name); // replace blanks with '_'
+
+
+        // Read in lines until we get to the begining of sequence seqNum.
+          
+        /* PMcG replace char array with string processing
+        while (_currentSeqNum != seqNum)
+        {
+            while(*_line != '>')
+            {
+                if(!_fileIn->getline(_line, MAXLINE + 1))
+                {
+                    freeFileResources(_fileIn);
+                    return Sequence(blank, blank, blank);
+                }
+            }
+            ++_currentSeqNum;
+            if(_currentSeqNum == seqNum) // Found the sequence
+            {
+                break;
+            }
+            // Get next line so that we are past the '>' line
+            _fileIn->getline(_line, MAXLINE + 1);
+        }
+
+        // line contains the name of the sequence
+        for (i = 1; i <= strlen(_line); i++)
+        {
+            if (_line[i] != ' ')
+            {
+                break;
+            }
+        }
+        strncpy(sname, _line + i, MAXNAMES); // remember entryname 
+        for (i = 1; i <= strlen(sname); i++)
+        {
+            if (sname[i] == ' ')
+            {
+                break;
+            }
+        }
+        sname[i] = EOS;
+        utilityObject->rTrim(sname);
+        utilityObject->blankToUnderscore(sname); // replace blanks with '_'
+        name = string(sname);        
+        */
+        
+        title = ""; // No title information
+
+        string seqLine = "";
+        while(std::getline(_fileIn,seqLine,delim) ){
+          string::const_iterator iterator1 = seqLine.begin();
+          while(iterator1 != seqLine.end()){
+            if(*iterator1 =='\n' || *iterator1 =='\r' || *iterator1 == EOS || *iterator1 =='>'){
+              break;
+            }
+            c = *iterator1;
+            c = chartab[c];
+            // PMcG 15-02-2008 bug 91
+            // strip out spaces and numbers from pseudo_fasta files
+            // but need to maintain gaps if present in sequence input
+            // to replicate behaviour of v1.83
+            //if(*iterator1 != ' ' && !isdigit(*iterator1)){
+            if(c){
+              characterSeq.append(1,c);
+            }
+            iterator1++;
+          }
+          if(*iterator1 == '>'){
+            break;
+          }
+        }
+ /*
+        while (_fileIn->getline(_line, MAXLINE + 1))
+        {
+            for (i = 0; i <= MAXLINE; i++)
+            {
+                c = _line[i];
+                if (c == '\n' || c == EOS || c == '>')
+                {
+                    break;
+                }
+
+                c = chartab[c];
+                if (c)
+                {
+                    characterSeq += c;
+                }
+            }
+            if (c == '>')
+            {
+                break;
+            }
+        }
+*/
+
+        _fileIn.close();
+
+        if ((int)characterSeq.length() > userParameters->getMaxAllowedSeqLength())
+        {
+            parseExitCode=SEQUENCETOOBIG;
+            // return empty seq
+            return Sequence(blank, blank, blank);
+        }
+        else if (characterSeq.length() == 0)
+        {
+            parseExitCode=EMPTYSEQUENCE;
+            // return empty seq
+            return Sequence(blank, blank, blank);
+        }
+        
+        return Sequence(characterSeq, name, title);
+    }
+
+    catch(...)
+    {
+        cerr << "There was an exception in the PearsonFileParser::getSeq function.\n"
+             << "Need to end program\n";
+        exit(1);
+    }
+}
+
+/**
+ * The function countSeqs, counts the number of sequences in a file.
+ * @return The number of sequences in the file.
+ */
+int PearsonFileParser::countSeqs()
+{
+    //char line[1000 + 1];
+    int _nseqs = 0;
+    string line2;
+    char delim;
+
+    try
+    {
+        //_fileIn = new InFileStream;  //nige
+        //_fileIn->open(fileName.c_str());  //nige
+        delim=FileParser::getDelimiter(fileName);
+        ifstream _fileIn;
+        _fileIn.open(fileName.c_str(),ios::in);
+
+    
+        if(!_fileIn.is_open())
+        {
+            return 0; // No sequences found!
+        }
+    
+        /* while ((*_fileIn) >> line2/@_fileIn->getline(line, 1000 + 1)@/)
+           {
+           /@if(_nseqs == 50)
+           {
+           cout << "\n\n" << line << "\n\n";
+           exit(1);
+           }@/
+            */
+        while (std::getline(_fileIn,line2,delim)) {                 
+            if (line2[0] == '>')
+                {
+                    _nseqs++;
+                }
+        }
+        _fileIn.close();
+        return _nseqs;
+    }
+    catch(...)
+    {
+        freeFileResources(_fileIn);
+        cerr << "An exception has occured in the function PearsonFileParser::countSeqs()\n"
+             << "Program needs to terminate.\nPlease contact the Clustal developers\n";
+        exit(1);    
+    }
+}
+
+/**
+ * There is no secondary structure information in the Pearson file. This is here to 
+ * set the structPenalties to NONE.
+ * @param gapPenaltyMask 
+ * @param secStructMask 
+ * @param secStructName 
+ * @param structPenalties 
+ * @param length 
+ */
+void PearsonFileParser::getSecStructure(vector<char>& gapPenaltyMask, 
+                         vector<char>& secStructMask, string& secStructName, 
+                          int &structPenalties, int length)
+{
+    structPenalties = NONE;
+}
+
+}
+
+