9df9a4aac9bcbe7878742e8ffd56f59ecef40b53
[jabaws.git] / binaries / src / clustalw / src / fileInput / GDEFileParser.cpp
1 /**
2  * Author: Mark Larkin
3  * 
4  * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson.  
5  */
6 /**
7  * Changes:
8  *
9  * 10-02-07,Nigel Brown(EMBL): changed ifstream to InFileStream to handle
10  * cross-platform end-of-lines.
11  */
12
13 #ifdef HAVE_CONFIG_H
14     #include "config.h"
15 #endif
16 #include "GDEFileParser.h"
17
18 namespace clustalw
19 {
20
21 /*
22  * Constructor sets up the chartab array.
23  *
24  */
25 GDEFileParser::GDEFileParser(string filePath)
26 {
27     fileName = filePath; 
28     fillCharTab();
29 }
30
31 /*
32  * Nothing to do in destruction of object.
33  */
34 GDEFileParser::~GDEFileParser()
35 {
36
37 }
38
39
40     vector<Sequence> GDEFileParser::getSeqRange(int firstSeq, int no, string *offendingSeq)
41 {
42     vector<Sequence> seqRangeVector;
43     int i;
44
45     for (i=0; i<no; i++)
46     { 
47         Sequence tempSeq = getSeq(firstSeq + i, offendingSeq);
48         if (parseExitCode!=OK) {
49             seqRangeVector.clear();
50             return seqRangeVector;
51         }
52         seqRangeVector.push_back(tempSeq);
53     }
54     return seqRangeVector;
55 }
56
57
58 /*
59  * The getSeq function is used to get sequence number seqNum from the file.
60  */
61     Sequence GDEFileParser::getSeq(int seqNum, string *offendingSeq)
62 {
63     char _line[MAXLINE + 1];
64     char _sname[MAXNAMES + 1];
65     string characterSeq = "";
66     string name = "";
67     string title = "";
68     string blank = "";
69     
70     _line[0] = EOS; 
71     
72     int i;
73     unsigned char c;
74     int _currentSeqNum = 0; // Not at any sequence yet!
75     
76     try
77     {
78         _fileIn = new InFileStream;  //nige
79         _fileIn->open(fileName.c_str());  //nige
80         _fileIn->seekg(0, std::ios::beg);
81                 
82         bool dnaFlagSet = userParameters->getDNAFlag();
83         while (_currentSeqNum != seqNum)
84         {
85             while((*_line != '#' && dnaFlagSet) ||
86                   (*_line != '%' && !dnaFlagSet))
87             {
88                 if(!_fileIn->getline(_line, MAXLINE + 1)) 
89                 {
90                     _fileIn->close();
91                     return Sequence(blank, blank, blank);
92                 }
93             }
94             ++_currentSeqNum;
95             if(_currentSeqNum == seqNum) // Found the sequence
96             {
97                 break;
98             }
99             // Get next line so that we are past the '#' or '%' line
100             _fileIn->getline(_line, MAXLINE + 1);  //nige
101         }
102         
103         for (i = 1; i <= MAXNAMES; i++)
104         {
105             if (_line[i] == '(' || _line[i] == '\n' || _line[i] == '\r')
106             {
107                 break;
108             }
109             _sname[i - 1] = _line[i];
110         }
111         i--;
112         _sname[i] = EOS;
113
114         for (i--; i > 0; i--)
115         {
116             if (isspace(_sname[i]))
117             {
118                 _sname[i] = EOS;
119             }
120             else
121             {
122                 break;
123             }
124         }
125         utilityObject->blankToUnderscore(_sname);
126         name = string(_sname);
127         title = "";
128         
129         while (_fileIn->getline(_line, MAXLINE + 1))
130         {
131             if (*_line == '%' ||  *_line == '#' ||  *_line == '"')
132             {
133                break;
134             }
135             for (i = 0; i <= MAXLINE; i++)
136             {
137                 c = _line[i];
138                 if (c == '\n' || c == EOS)
139                 {
140                     break;
141                 }
142
143                 c = chartab[c];
144                 if (c)
145                 {
146                     characterSeq += c;
147                 }
148             }
149         }
150         
151         _fileIn->close();
152         
153         if ((int)characterSeq.length() > userParameters->getMaxAllowedSeqLength())
154         {
155             parseExitCode=SEQUENCETOOBIG;
156             if (offendingSeq!=NULL)
157                 offendingSeq->assign(name);
158             // return empty seq
159             return Sequence(blank, blank, blank);
160         }
161         return Sequence(characterSeq, name, title);
162     }
163     catch(...)
164     {
165         _fileIn->close();
166         cerr << "There was an exception in the GDEnFileParser::getSeq function.\n"
167              << "Need to end program\n";
168         exit(1);    
169     }
170
171 }
172
173 /*
174  * The countSeqs function returns the number of sequences in the file. 
175  */
176 int GDEFileParser::countSeqs()
177 {
178     char line[MAXLINE + 1];
179     int _nseqs = 0;
180     
181     try
182     {
183         _fileIn = new InFileStream;  //nige
184         _fileIn->open(fileName.c_str());  //nige
185     
186         if(!_fileIn->is_open())
187         {
188             return 0; // No sequences found!
189         }
190     
191         while (_fileIn->getline(line, MAXLINE + 1))
192         {
193             if ((*line == '%') && (userParameters->getDNAFlag() == false))
194             {
195                 _nseqs++;
196             }
197             else if ((*line == '#') && (userParameters->getDNAFlag() == true))
198             {
199                 _nseqs++;
200             }
201         }
202         _fileIn->close();
203
204         return _nseqs;
205     }
206     catch(...)
207     {
208         _fileIn->close();
209         cerr << "An exception has occured in the function GDEFileParser::countSeqs()\n"
210              << "Program needs to terminate.\nPlease contact the Clustal developers\n";
211         exit(1);    
212     }    
213 }
214
215 /*
216  * getSecStructure gets the secondary structure from the file.
217  */
218 void GDEFileParser::getSecStructure(vector<char>& gapPenaltyMask, vector<char>& secStructMask,
219                                     string& secStructName, int &structPenalties, int length)
220 {
221     char _title[MAXLINE + 1];
222     char _line[MAXLINE + 1];
223     char _lin2[MAXLINE + 1];
224     char _sname[MAXNAMES + 1];
225     int i, len, offset = 0;
226     unsigned char c;
227     
228     try
229     {
230         _fileIn = new InFileStream;  //nige
231         _fileIn->open(fileName.c_str());  //nige
232         _fileIn->seekg(0, std::ios::beg);
233     
234         // NOTE I think I should empty the masks before pushing onto them!
235         gapPenaltyMask.clear();
236         secStructMask.clear();
237     
238         for (;;)
239         {
240             _line[0] = '\0';
241             // search for the next comment line
242             while (*_line != '"')
243             {
244                 if (!_fileIn->getline(_line, MAXLINE + 1))
245                 {
246                     _fileIn->close();
247                     return;
248                 }
249             }
250
251             // is it a secondary structure entry? 
252             if (strncmp(&_line[1], "SS_", 3) == 0)
253             {
254                 for (i = 1; i <= MAXNAMES - 3; i++)
255                 {
256                     if (_line[i + 3] == '(' || _line[i + 3] == '\n' || _line[i + 3] == '\r')
257                     {
258                         break;
259                     }
260                     _sname[i - 1] = _line[i + 3];
261                 }
262                 i--;
263                 _sname[i] = EOS;
264             
265                 // NOTE NOTE NOTE
266                 // Is it possible for this to be executed????????????????
267                 // if _line contains ( then we break and dont put it into _sname
268                 // So how can sname have it???????
269                 if (_sname[i - 1] == '(')
270                 {
271                     sscanf(&_line[i + 3], "%d", &offset);
272                 }
273                 else
274                 {
275                     offset = 0;
276                 }
277                 for (i--; i > 0; i--)
278                 {
279                     if (isspace(_sname[i]))
280                     {
281                         _sname[i] = EOS;
282                     }
283                     else
284                     {
285                         break;
286                     }
287                 }
288
289                 utilityObject->blankToUnderscore(_sname);
290                 secStructName = string(_sname);
291             
292                 if (userParameters->getInteractive())
293                 {
294                     strcpy(_title, "Found secondary structure in alignment file: ");
295                     strcat(_title, _sname);
296                     (*_lin2) = utilityObject->promptForYesNo(_title,
297                         "Use it to set local gap penalties ");
298                 }
299                 else
300                 {
301                     (*_lin2) = 'y';
302                 }
303                 if ((*_lin2 != 'n') && (*_lin2 != 'N'))
304                 {
305                     structPenalties = SECST;
306                     for (i = 0; i < length; i++)
307                     {
308                         secStructMask.push_back('.');
309                     }
310                     len = 0;
311                     while (_fileIn->getline(_line, MAXLINE + 1))
312                     {
313                         if (*_line == '%' ||  *_line == '#' ||  *_line == '"')
314                         {
315                             break;
316                         }
317                         for (i = offset; i < length; i++)
318                         {
319                             c = _line[i];
320                             if (c == '\n' || c == EOS)
321                             {
322                                 break;
323                             }
324                             // EOL
325                             secStructMask[len++] = c;
326                         }
327                         if (len >= length) // NOTE i put in >=
328                         {
329                             break;
330                         }
331                     }
332                 }
333             }
334         
335             // or is it a gap penalty mask entry?
336             else if (strncmp(&_line[1], "GM_", 3) == 0)
337             {
338                 for (i = 1; i <= MAXNAMES - 3; i++)
339                 {
340                     if (_line[i + 3] == '(' || _line[i + 3] == '\n')
341                     {
342                         break;
343                     }
344                     _sname[i - 1] = _line[i + 3];
345                 }
346                 i--;
347                 _sname[i] = EOS;
348             
349                 // NOTE NOTE
350                 // Again I dont think it is possible for _sname to have ( !!!!
351                 if (_sname[i - 1] == '(')
352                 {
353                     sscanf(&_line[i + 3], "%d", &offset);
354                 }
355                 else
356                 {
357                     offset = 0;
358                 }
359                 for (i--; i > 0; i--)
360                 {
361                     if (isspace(_sname[i]))
362                     {
363                         _sname[i] = EOS;
364                     }
365                     else
366                     {
367                         break;
368                     }
369                 }
370             
371                 utilityObject->blankToUnderscore(_sname);
372                 secStructName = string(_sname);
373
374                 if (userParameters->getInteractive())
375                 {
376                     strcpy(_title, "Found gap penalty mask in alignment file: ");
377                     strcat(_title, _sname);
378                     (*_lin2) = utilityObject->promptForYesNo(_title,
379                         "Use it to set local gap penalties ");
380                 }
381                 else
382                 {
383                     (*_lin2) = 'y';
384                 }            
385                 if ((*_lin2 != 'n') && (*_lin2 != 'N'))
386                 {
387                     structPenalties = GMASK;
388                     for (i = 0; i < length; i++)
389                     {
390                         gapPenaltyMask.push_back('1');
391                     }
392                     len = 0;
393                     while (_fileIn->getline(_line, MAXLINE + 1))
394                     {
395                         if (*_line == '%' ||  *_line == '#' ||  *_line == '"')
396                         {
397                             break;
398                         }
399                         for (i = offset; i < length; i++)
400                         {
401                             c = _line[i];
402                             if (c == '\n' || c == EOS)
403                             {
404                                 break;
405                             }
406                             // EOL
407                             gapPenaltyMask[len++] = c;
408                         }
409                         if (len >= length) // NOTE I put in >=
410                         {
411                             break;
412                         }
413                     }
414                 }
415             }
416             if (structPenalties != NONE)
417             {
418                 break;
419             }
420         }
421         _fileIn->close();
422     }
423     catch(...)
424     {
425         _fileIn->close();
426         cerr << "An exception has occured in the function GDEFileParser::getSecStructure()\n"
427              << "Program needs to terminate.\nPlease contact the Clustal developers\n";
428         exit(1);    
429     }
430 }
431
432 }
433