3 #include "textfile.h"
\r
6 const size_t MAX_FASTA_LINE = 16000;
\r
// SeqVect::Clear: iterates over every element index of this container.
// NOTE(review): only the signature and the loop header are visible in this
// extract; the per-element statement and the braces are missing, so what is
// done to each element cannot be confirmed from here.
13 void SeqVect::Clear()
\r
// n runs from 0 while n < size(); size() is re-evaluated on each iteration.
15 for (size_t n = 0; n < size(); ++n)
\r
19 void SeqVect::ToFASTAFile(TextFile &File) const
\r
21 unsigned uSeqCount = Length();
\r
22 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
24 Seq *ptrSeq = at(uSeqIndex);
\r
25 ptrSeq->ToFASTAFile(File);
\r
// SeqVect::FromFASTAFile: builds Seq objects from data obtained through
// GetFastaSeq on File's stdio stream.
// NOTE(review): this extract omits several lines (the enclosing loop, the
// declarations of uLength and Label, any end-of-input check, and whatever is
// done with ptrSeq, SeqData and Label afterwards). Only the visible steps are
// described below.
29 void SeqVect::FromFASTAFile(TextFile &File)
\r
// The reader operates on the FILE* exposed by the TextFile.
33 FILE *f = File.GetStdioFile();
\r
// uLength and Label are passed by address and used below as the character
// count and the sequence name; presumably GetFastaSeq fills them in for the
// record it returns -- confirm against its declaration.
38 char *SeqData = GetFastaSeq(f, &uLength, &Label);
\r
// A new heap-allocated Seq receives the characters.
41 Seq *ptrSeq = new Seq;
\r
// Append the first uLength characters of SeqData to the Seq one at a time.
43 for (unsigned i = 0; i < uLength; ++i)
\r
45 char c = SeqData[i];
\r
46 ptrSeq->push_back(c);
\r
// Label becomes the name of the new sequence.
49 ptrSeq->SetName(Label);
\r
// SeqVect::PadToMSA: copies every sequence into msa as one row, filling the
// tail of shorter rows with '.' so that all rows have the length of the
// longest sequence.
// NOTE(review): the extract skips lines between the uSeqCount declaration and
// the first loop, and the declaration of uColIndex is not visible; those
// parts are not described here.
57 void SeqVect::PadToMSA(MSA &msa)
\r
59 unsigned uSeqCount = Length();
\r
// Pass 1: find the maximum sequence length.
66 unsigned uLongestSeqLength = 0;
\r
67 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
69 Seq *ptrSeq = at(uSeqIndex);
\r
70 unsigned uColCount = ptrSeq->Length();
\r
71 if (uColCount > uLongestSeqLength)
\r
72 uLongestSeqLength = uColCount;
\r
// Size the alignment: one row per sequence, one column per position of the
// longest sequence.
74 msa.SetSize(uSeqCount, uLongestSeqLength);
\r
// Pass 2: fill each row.
75 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
77 Seq *ptrSeq = at(uSeqIndex);
\r
78 msa.SetSeqName(uSeqIndex, ptrSeq->GetName());
\r
79 unsigned uColCount = ptrSeq->Length();
\r
// Copy the sequence's own characters. uColIndex is not declared in the loop
// header, and the loop below continues from the value it holds afterwards.
81 for (uColIndex = 0; uColIndex < uColCount; ++uColIndex)
\r
83 char c = ptrSeq->at(uColIndex);
\r
84 msa.SetChar(uSeqIndex, uColIndex, c);
\r
// Pad the remaining columns of this row with '.'.
86 while (uColIndex < uLongestSeqLength)
\r
87 msa.SetChar(uSeqIndex, uColIndex++, '.');
\r
// SeqVect::Copy: for each sequence in rhs, allocates a new Seq, fills it via
// Seq::Copy and appends the pointer to this vector.
// NOTE(review): a line between the signature and the uSeqCount declaration is
// missing from this extract, so whether existing contents are removed first
// cannot be confirmed from here.
91 void SeqVect::Copy(const SeqVect &rhs)
\r
94 unsigned uSeqCount = rhs.Length();
\r
95 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
// Source element (not modified by the visible code).
97 Seq *ptrSeq = rhs.at(uSeqIndex);
\r
// The copy is a separate heap object, not an alias of the rhs element.
98 Seq *ptrSeqCopy = new Seq;
\r
99 ptrSeqCopy->Copy(*ptrSeq);
\r
100 push_back(ptrSeqCopy);
\r
104 void SeqVect::StripGaps()
\r
106 unsigned uSeqCount = Length();
\r
107 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
109 Seq *ptrSeq = at(uSeqIndex);
\r
110 ptrSeq->StripGaps();
\r
114 void SeqVect::StripGapsAndWhitespace()
\r
116 unsigned uSeqCount = Length();
\r
117 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
119 Seq *ptrSeq = at(uSeqIndex);
\r
120 ptrSeq->StripGapsAndWhitespace();
\r
// SeqVect::ToUpper: visits every sequence in this vector in index order.
// NOTE(review): the statement applied to ptrSeq is missing from this extract;
// presumably it upper-cases the sequence -- confirm against the full file.
124 void SeqVect::ToUpper()
\r
126 unsigned uSeqCount = Length();
\r
127 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
129 Seq *ptrSeq = at(uSeqIndex);
\r
// SeqVect::FindName: scans the sequences in index order for one whose name
// matches ptrName and, on a match, stores that index in *ptruIndex.
// NOTE(review): the return statements are missing from this extract, so the
// returned values for the found / not-found cases are not described here.
134 bool SeqVect::FindName(const char *ptrName, unsigned *ptruIndex) const
\r
136 unsigned uSeqCount = Length();
\r
137 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
139 const Seq *ptrSeq = at(uSeqIndex);
\r
// stricmp is a non-standard, case-insensitive string compare; a result of 0
// means the names match (GetSeqIdFromName in this file uses strcmp, which is
// case-sensitive).
140 if (0 == stricmp(ptrSeq->GetName(), ptrName))
\r
// ptruIndex is dereferenced without a null check.
142 *ptruIndex = uSeqIndex;
\r
149 void SeqVect::AppendSeq(const Seq &s)
\r
151 Seq *ptrSeqCopy = new Seq;
\r
152 ptrSeqCopy->Copy(s);
\r
153 push_back(ptrSeqCopy);
\r
// SeqVect::LogMe: visits every sequence in this vector in index order.
// NOTE(review): the statement applied to ptrSeq is missing from this extract;
// presumably each sequence is written to the log -- confirm against the full
// file.
156 void SeqVect::LogMe() const
\r
158 unsigned uSeqCount = Length();
\r
159 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
161 const Seq *ptrSeq = at(uSeqIndex);
\r
166 const char *SeqVect::GetSeqName(unsigned uSeqIndex) const
\r
168 assert(uSeqIndex < size());
\r
169 const Seq *ptrSeq = at(uSeqIndex);
\r
170 return ptrSeq->GetName();
\r
173 unsigned SeqVect::GetSeqId(unsigned uSeqIndex) const
\r
175 assert(uSeqIndex < size());
\r
176 const Seq *ptrSeq = at(uSeqIndex);
\r
177 return ptrSeq->GetId();
\r
// SeqVect::GetSeqIdFromName: returns the id of the first sequence (in index
// order) whose name equals Name; if none matches, Quit is called with a
// "not found" message.
// NOTE(review): the extract appears to drop a line after the Quit call, so
// what follows it (if anything) is not described here.
180 unsigned SeqVect::GetSeqIdFromName(const char *Name) const
\r
182 const unsigned uSeqCount = GetSeqCount();
\r
183 for (unsigned i = 0; i < uSeqCount; ++i)
\r
// strcmp returns 0 on an exact, case-sensitive match.
185 if (!strcmp(Name, GetSeqName(i)))
\r
186 return GetSeqId(i);
\r
// Reached only when no sequence name matched.
188 Quit("SeqVect::GetSeqIdFromName(%s): not found", Name);
\r
// SeqVect::GetSeqById: searches the sequences in index order for one whose id
// equals uId; if the loop finishes without returning, Quit is called with a
// "not found" message.
// NOTE(review): the statement executed when the ids match is missing from
// this extract; presumably it returns the matching sequence -- confirm
// against the full file.
192 Seq &SeqVect::GetSeqById(unsigned uId)
\r
194 const unsigned uSeqCount = GetSeqCount();
\r
195 for (unsigned i = 0; i < uSeqCount; ++i)
\r
197 if (GetSeqId(i) == uId)
\r
// NOTE(review): the message names "GetSeqIdByUd" although the function is
// GetSeqById, and uId is unsigned but is formatted with %d.
200 Quit("SeqVect::GetSeqIdByUd(%d): not found", uId);
\r
// NOTE(review): this forms a reference by dereferencing a null pointer, which
// is undefined behaviour if execution ever reaches it; it looks like a
// placeholder to satisfy the return type on the assumption that Quit does not
// return -- confirm.
201 return (Seq &) *((Seq *) 0);
\r
204 unsigned SeqVect::GetSeqLength(unsigned uSeqIndex) const
\r
206 assert(uSeqIndex < size());
\r
207 const Seq *ptrSeq = at(uSeqIndex);
\r
208 return ptrSeq->Length();
\r
211 Seq &SeqVect::GetSeq(unsigned uSeqIndex)
\r
213 assert(uSeqIndex < size());
\r
214 return *at(uSeqIndex);
\r
217 const Seq &SeqVect::GetSeq(unsigned uSeqIndex) const
\r
219 assert(uSeqIndex < size());
\r
220 return *at(uSeqIndex);
\r
223 void SeqVect::SetSeqId(unsigned uSeqIndex, unsigned uId)
\r
225 assert(uSeqIndex < size());
\r
226 Seq *ptrSeq = at(uSeqIndex);
\r
227 return ptrSeq->SetId(uId);
\r
// SeqVect::GuessAlpha: heuristic guess of the alphabet of the stored
// sequences; returns ALPHA_Amino when there are no sequences and as the final
// fallback.
// NOTE(review): large parts of the scanning loop are missing from this
// extract (the declaration of uPos, the loop construct, how uSeqIndex is
// advanced, how letters are classified and counted, and the values returned
// by the two percentage checks). Only the visible steps are described below.
230 ALPHA SeqVect::GuessAlpha() const
\r
232 // If at least MIN_NUCLEO_PCT of the first CHAR_COUNT non-gap
\r
233 // letters belong to the nucleotide alphabet, guess nucleo.
\r
234 // Otherwise amino.
\r
// Sample size and threshold (in percent) used by the heuristic.
235 const unsigned CHAR_COUNT = 100;
\r
236 const unsigned MIN_NUCLEO_PCT = 95;
\r
238 const unsigned uSeqCount = GetSeqCount();
\r
// With no sequences there is nothing to inspect.
239 if (0 == uSeqCount)
\r
240 return ALPHA_Amino;
\r
// Scanning state: current sequence index, its length, and the counters.
242 unsigned uSeqIndex = 0;
\r
244 unsigned uSeqLength = GetSeqLength(0);
\r
245 unsigned uDNACount = 0;
\r
246 unsigned uRNACount = 0;
\r
247 unsigned uTotal = 0;
\r
248 const Seq *ptrSeq = &GetSeq(0);
\r
// While the position is at or past the end of the current sequence, another
// sequence is loaded; the loop also handles sequences of length zero.
251 while (uPos >= uSeqLength)
\r
// Guard against running past the last sequence.
254 if (uSeqIndex >= uSeqCount)
\r
256 ptrSeq = &GetSeq(uSeqIndex);
\r
257 uSeqLength = ptrSeq->Length();
\r
260 if (uSeqIndex >= uSeqCount)
\r
// Fetch the next character and advance the position.
262 char c = ptrSeq->at(uPos++);
\r
// Limit check against the sample size.
270 if (uTotal >= CHAR_COUNT)
\r
// Percentages use integer division (truncating); the uTotal != 0 test
// prevents division by zero.
273 if (uTotal != 0 && ((uDNACount*100)/uTotal) >= MIN_NUCLEO_PCT)
\r
275 if (uTotal != 0 && ((uRNACount*100)/uTotal) >= MIN_NUCLEO_PCT)
\r
// Fallback.
277 return ALPHA_Amino;
\r
280 void SeqVect::FixAlpha()
\r
282 ClearInvalidLetterWarning();
\r
283 unsigned uSeqCount = Length();
\r
284 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
286 Seq *ptrSeq = at(uSeqIndex);
\r
287 ptrSeq->FixAlpha();
\r
289 ReportInvalidLetters();
\r