Mac binaries
[jabaws.git] / website / archive / binaries / mac / src / clustalw / src / fileInput / EMBLFileParser.cpp
1 /**
2  * Author: Mark Larkin
3  * 
4  * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson.  
5  */
6 /**
7  * Changes:
8  *
9  * 10-02-07,Nigel Brown(EMBL): changed ifstream to InFileStream to handle
10  * cross-platform end-of-lines.
11  */
12
13 #ifdef HAVE_CONFIG_H
14     #include "config.h"
15 #endif
16 #include "EMBLFileParser.h"
17
18 namespace clustalw
19 {
20
21 /*
22  * constructor sets up chartab array.
23  */
24 EMBLFileParser::EMBLFileParser(string filePath)
25 {
26     fileName = filePath; 
27     fillCharTab();
28 }
29
30 /*
31  * dont need to destruct anything.
32  */
33 EMBLFileParser::~EMBLFileParser()
34 {
35
36 }
37
38
39 /*
40  * get range of sequences
41  */
42     vector<Sequence> EMBLFileParser::getSeqRange(int firstSeq, int no, string *offendingSeq)
43 {
44     vector<Sequence> seqRangeVector;
45     int i;
46
47     for (i=0; i<no; i++)
48     { 
49         Sequence tempSeq = getSeq(firstSeq + i, offendingSeq);
50         if (parseExitCode!=OK) {
51             seqRangeVector.clear();
52             return seqRangeVector;
53         }
54         seqRangeVector.push_back(tempSeq);
55     }
56     return seqRangeVector;
57 }
58
59
60 /*
61  * get the sequence seqNum in the file.
62  */
63     Sequence EMBLFileParser::getSeq(int seqNum, string *offendingSeq)
64 {
65     char _line[MAXLINE + 1];
66     //char _tseq[MAXLINE + 1];
67     char _sname[MAXNAMES + 1];
68     //char _title[MAXTITLES + 1];
69     string characterSeq = "";
70     string name = "";
71     string title = "";
72     
73     _line[0] = EOS;
74     int i;
75     //int j;
76     bool gotSeq = false;
77     unsigned char c;
78     int _currentSeqNum = 0;
79     string blank = "";
80
81     try
82     {
83         _fileIn = new InFileStream;  //nige
84         _fileIn->open(fileName.c_str());  //nige
85         _fileIn->seekg(0, std::ios::beg); // start at the beginning
86
87         // Read in lines until we get to the begining of sequence seqNum.
88         while (_currentSeqNum != seqNum)
89         {
90             while(!utilityObject->lineType(_line, "ID"))
91             {
92                 if(!_fileIn->getline(_line, MAXLINE + 1)) // If we cannot get anymore!
93                 {
94                     _fileIn->close();
95                     return Sequence(blank, blank, blank);
96                 }
97             }
98             ++_currentSeqNum;
99             if(_currentSeqNum == seqNum) // Found the sequence
100             {
101                 break;
102             }
103             // Get next line so that we are past the '>' line
104             _fileIn->getline(_line, MAXLINE + 1);
105         }
106
107         for (i = 5; i <= (int)strlen(_line); i++)
108         {
109             if (_line[i] != ' ')
110             {
111                 break;
112             }
113         }
114         strncpy(_sname, _line + i, MAXNAMES); // remember entryname 
115         for (i = 0; i <= (int)strlen(_sname); i++)
116         {
117             if (_sname[i] == ' ')
118             {
119                 _sname[i] = EOS;
120                 break;
121             }
122         }
123
124         _sname[MAXNAMES] = EOS;
125         utilityObject->rTrim(_sname);
126         utilityObject->blankToUnderscore(_sname);
127         name = string(_sname);
128         // Andreas Wilm (UCD): why cout here? cout << name << "\n"; 
129         
130         while (!utilityObject->lineType(_line, "SQ"))
131         {
132             if(!_fileIn->getline(_line, MAXLINE + 1)) // If we cannot get anymore!
133             {
134                 _fileIn->close();
135                 // FIXME AW: why return with a name but otherwise empty seq?
136                 return Sequence(blank, name, blank);
137             }
138         }
139
140         while (_fileIn->getline(_line, MAXLINE + 1))
141         {
142             if (gotSeq && utilityObject->blankLine(_line))
143             {
144                 break;
145             }
146             
147             // NOTE I changed this to -1 and -2 because the getline doesnt return the \n
148             if (strlen(_line) > 2 && _line[strlen(_line) - 1] == '.' &&
149                     _line[strlen(_line) - 2] == '.')
150             {
151                 continue;
152             }
153
154             for (i = 0; i <= MAXLINE; i++)
155             {
156                 c = _line[i];
157                 if (c == '\n' || c == EOS || c == '/')
158                 {
159                     break;
160                 }
161                 // EOL     
162                 c = chartab[c];
163                 if (c)
164                 {
165                     gotSeq = true;
166                     characterSeq += c;
167                 }
168             }
169             if (c == '/')
170             {
171                 break;
172             }
173         }
174         _fileIn->close();
175         
176         if ((int)characterSeq.length() > userParameters->getMaxAllowedSeqLength())
177         {
178             parseExitCode=SEQUENCETOOBIG;
179             if (offendingSeq!=NULL)
180                 offendingSeq->assign(name);
181             // return empty seq
182             return Sequence(blank, blank, blank);
183         }
184         return Sequence(characterSeq, name, title);
185     }
186     catch(...)
187     {
188         _fileIn->close();
189         cerr << "There was an exception in the EMBLFileParser::getSeq function.\n"
190              << "Need to end program\n";
191         exit(1);    
192     }            
193 }
194
195 /*
196  * count the number of sequences in the file and return the number
197  */
198 int EMBLFileParser::countSeqs()
199 {
200     char line[MAXLINE + 1];
201     // char c;
202     line[0] = EOS;
203     int numSeqs;
204     // int i;
205     //bool seqOk;
206     numSeqs = 0;
207     
208     try
209     {
210         _fileIn = new InFileStream;  //nige
211         _fileIn->open(fileName.c_str());  //nige
212     
213         if(!_fileIn->is_open())
214         {
215             return 0; // No sequences found!
216         }       
217                  
218         while (_fileIn->getline(line, MAXLINE + 1))
219         {
220             if (utilityObject->lineType(line, "ID"))
221             {
222                 numSeqs++;
223             }
224         }
225         
226         _fileIn->close();
227         return numSeqs;
228     }
229     catch(...)
230     {
231         _fileIn->close();
232         cerr << "An exception has occured in the function EMBLFileParser::countSeqs()\n"
233              << "Program needs to terminate.\nPlease contact the Clustal developers\n";
234         exit(1);     
235     }
236 }
237
238 /*
239  * get secondary structure information from the file.
240  */
241 void EMBLFileParser::getSecStructure(vector<char>& gapPenaltyMask, vector<char>& secStructMask, string& secStructName, int &structPenalties, int length)
242 {
243     char _title[MAXLINE + 1];
244     char _line[MAXLINE + 1];
245     char _lin2[MAXLINE + 1];
246     char _sname[MAXNAMES + 1];
247     char _feature[MAXLINE + 1];
248     int i;
249     _line[0] = '\0';
250     
251     try
252     {
253         _fileIn = new InFileStream;  //nige
254         _fileIn->open(fileName.c_str());  //nige
255         _fileIn->seekg(0, std::ios::beg);
256
257         // clear out the masks
258         gapPenaltyMask.clear();
259         secStructMask.clear();
260             
261         // find the start of the sequence entry 
262         for (;;)
263         {
264             while (!utilityObject->lineType(_line, "ID"))
265             {
266                 if (!_fileIn->getline(_line, MAXLINE + 1))
267                 {
268                     _fileIn->close();
269                     return;
270                 }
271             }
272             
273             for (i = 5; i <= (int)strlen(_line); i++)
274             {
275                 if (_line[i] != ' ')
276                 {
277                     break;
278                 }
279             }
280             strncpy(_sname, _line + i, MAXNAMES); // remember entryname
281             for (i = 0; i <= (int)strlen(_sname); i++)
282             {
283                 if (_sname[i] == ' ')
284                 {
285                     _sname[i] = EOS;
286                     break;
287                 }
288             }
289             _sname[MAXNAMES] = EOS;
290             utilityObject->rTrim(_sname);
291             utilityObject->blankToUnderscore(_sname);
292             
293             // look for secondary structure feature table / gap penalty mask 
294             while (_fileIn->getline(_line, MAXLINE + 1))
295             {
296                 if (utilityObject->lineType(_line, "FT"))
297                 {
298                     sscanf(_line + 2, "%s", _feature);
299                     if (strcmp(_feature, "HELIX") == 0 || strcmp(_feature, "STRAND") == 0)
300                     {
301                         if (userParameters->getInteractive())
302                         {
303                             strcpy(_title,
304                                 "Found secondary structure in alignment file: ");
305                             strcat(_title, _sname);
306                             (*_lin2) = utilityObject->promptForYesNo(_title,
307                                 "Use it to set local gap penalties ");
308                         }
309                         else
310                         {
311                             (*_lin2) = 'y';
312                         }
313                         if ((*_lin2 != 'n') && (*_lin2 != 'N'))
314                         {
315                             structPenalties = SECST;
316                             for (i = 0; i < length; i++)
317                             {
318                                 secStructMask.push_back('.');
319                             }
320                             do
321                             {
322                                 getSwissFeature(&_line[2], secStructMask, length);
323                                 _fileIn->getline(_line, MAXLINE + 1);
324                             }
325                             while (utilityObject->lineType(_line, "FT"));
326                         }
327                         else
328                         {
329                             do
330                             {
331                                 _fileIn->getline(_line, MAXLINE + 1);
332                             }
333                             while (utilityObject->lineType(_line, "FT"));
334                         }
335                         secStructName = string(_sname);
336                     }
337                 }
338                 else if (utilityObject->lineType(_line, "GM"))
339                 {
340                     if (userParameters->getInteractive())
341                     {
342                         strcpy(_title, "Found gap penalty mask in alignment file: ");
343                         strcat(_title, _sname);
344                         (*_lin2) = utilityObject->promptForYesNo(_title,
345                             "Use it to set local gap penalties ");
346                     }
347                     else
348                     {
349                         (*_lin2) = 'y';
350                     }
351                     if ((*_lin2 != 'n') && (*_lin2 != 'N'))
352                     {
353                         structPenalties = GMASK;
354                         for (i = 0; i < length; i++)
355                         {
356                             gapPenaltyMask.push_back('1');
357                         }
358                         do
359                         {
360                             getSwissMask(&_line[2], gapPenaltyMask, length);
361                             _fileIn->getline(_line, MAXLINE + 1);
362                         }
363                         while (utilityObject->lineType(_line, "GM"));
364                     }
365                     else
366                     {
367                         do
368                         {
369                             _fileIn->getline(_line, MAXLINE + 1);
370                         }
371                         while (utilityObject->lineType(_line, "GM"));
372                     }
373                     secStructName = string(_sname);
374                 }
375                 if (utilityObject->lineType(_line, "SQ"))
376                 {
377                     break;
378                 }
379
380                 if (structPenalties != NONE)
381                 {
382                     break;
383                 }
384             }
385         }
386         _fileIn->close();
387     }
388     catch(...)
389     {
390         _fileIn->close();
391         cerr << "An exception has occured in the function EMBLFileParser::getSecStructure()\n"
392              << "Program needs to terminate.\nPlease contact the Clustal developers\n";
393         exit(1);    
394     }
395     
396 }
397
398 /*
399  * get the sec structure mask
400  */
401 void EMBLFileParser::getSwissFeature(char* line, vector<char>& secStructMask, int length)
402 {
403     char c, s, feature[MAXLINE + 1];
404     int i, startPos, endPos;
405
406     try
407     {
408         if (sscanf(line, "%s%d%d", feature, &startPos, &endPos) != 3)
409         {
410             return ;
411         }
412
413         if (strcmp(feature, "HELIX") == 0)
414         {
415             c = 'A';
416             s = '$';
417         }
418         else if (strcmp(feature, "STRAND") == 0)
419         {
420             c = 'B';
421             s = '%';
422         }
423         else
424         {
425             return ;
426         }
427
428         if (startPos >= length || endPos >= length)
429         {
430             return ;
431         }
432
433         secStructMask[startPos - 1] = s;
434         for (i = startPos; i < endPos - 1; i++)
435         {
436             secStructMask[i] = c;
437         }
438         secStructMask[endPos - 1] = s;
439     }
440     catch(...)
441     {
442         cerr << "An exception has occured in the function EMBLFileParser::getSwissFeature()\n"
443              << "Program needs to terminate.\nPlease contact the Clustal developers\n";
444         exit(1);    
445     }
446 }
447
448 /*
449  * get the gap penalty mask.
450  */
451 void EMBLFileParser::getSwissMask(char* line, vector<char>& gapPenaltyMask, int length)
452 {
453     int i, value, startPos, endPos;
454
455     try
456     {
457         if (sscanf(line, "%d%d%d", &value, &startPos, &endPos) != 3)
458         {
459             return;
460         }
461
462         if (value < 1 || value > 9)
463         {
464             return ;
465         }
466
467         if (startPos >= length || endPos >= length)
468         {
469             return ;
470         }
471         for (i = startPos - 1; i < endPos; i++)
472         {
473             gapPenaltyMask[i] = value + '0';
474         }
475     }
476     catch(...)
477     {
478         cerr << "An exception has occured in the function EMBLFileParser::getSwissMask()\n"
479              << "Program needs to terminate.\nPlease contact the Clustal developers\n";
480         exit(1);    
481     }    
482 }
483
484
485 }