1 /////////////////////////////////////////////////////////////////
4 // Program for scoring alignments according to the SUM-OF-PAIRS
6 /////////////////////////////////////////////////////////////////
8 #include "SafeVector.h"
9 #include "MultiSequence.h"
23 const char CORE_BLOCK = 'h';
24 typedef pair<int,int> PII;
25 bool useCoreBlocks = false;
26 bool useColScore = false;
28 bool useBaliAnnot = false;
29 bool makeAnnot = false;
31 /////////////////////////////////////////////////////////////////
32 // Function prototypes
33 /////////////////////////////////////////////////////////////////
35 set<PII> ComputePairs (MultiSequence *align, bool isRef);
36 set<VI> ComputeColumns (MultiSequence *align, bool isRef);
37 string GetName (string s);
40 set<VI> refCols, testCols;
41 set<PII> refPairs, testPairs;
44 /////////////////////////////////////////////////////////////////
48 /////////////////////////////////////////////////////////////////
50 int main (int argc, char **argv){
54 cerr << "Usage: score TEST_ALIGNMENT REFERENCE_ALIGNMENT [BALIBASE_ANNOT_FILE] [-col] [-core] [-caps] [-annot FILENAME]" << endl;
59 FileBuffer infile (argv[1]);
61 MultiSequence *testAlign;
63 cerr << "ERROR: Could not open file '" << argv[1] << "' for reading." << endl;
67 testAlign = new MultiSequence(); assert (testAlign);
68 testAlign->LoadMFA (infile);
72 MultiSequence *refAlign = new MultiSequence (string (argv[2])); assert (refAlign);
74 string outFilename = "";
76 for (int i = 3; i < argc; i++){
77 if (strcmp (argv[i], "-core") == 0)
79 else if (strcmp (argv[i], "-col") == 0)
81 else if (strcmp (argv[i], "-caps") == 0)
83 else if (strcmp (argv[i], "-annot") == 0){
85 outFilename = string (argv[++i]);
87 else { // annotation file
90 ifstream annotFile (argv[i]);
91 if (annotFile.fail()){
92 cerr << "ERROR: Could not read BAliBASE annotation file." << endl;
96 SafeVector<int> *indices = refAlign->GetSequence(0)->GetMapping();
99 while (annotFile.getline (buffer, 10000)){
101 ss.str (string (buffer));
105 if ((ss >> s) && s == string ("BPOS")){
107 int begin=-1, end=-1;
108 if (sscanf (s.c_str(), "%d=%d", &begin, &end) == 2){
109 for (int i = (*indices)[begin]; i <= (*indices)[end]; i++)
122 if (useColScore) makeAnnot = false;
125 for (int i = 0; i < testAlign->GetNumSequences(); i++){
128 for (int j = 0; !found && j < refAlign->GetNumSequences(); j++){
129 if (testAlign->GetSequence(i)->GetHeader() == refAlign->GetSequence(j)->GetHeader())
134 testAlign->RemoveSequence (i);
139 for (int i = 0; i < refAlign->GetNumSequences(); i++){
142 for (int j = 0; !found && j < testAlign->GetNumSequences(); j++){
143 if (refAlign->GetSequence(i)->GetHeader() == testAlign->GetSequence(j)->GetHeader())
148 refAlign->RemoveSequence (i);
153 testAlign->SortByHeader();
154 refAlign->SortByHeader();
162 refCols = ComputeColumns (refAlign, true);
163 if (testAlign) testCols = ComputeColumns (testAlign, false);
164 set<VI> colIntersect;
165 insert_iterator<set<VI> > colIntersectIter (colIntersect, colIntersect.begin());
166 set_intersection (testCols.begin(), testCols.end(), refCols.begin(), refCols.end(), colIntersectIter);
167 TP = (int) colIntersect.size();
168 TPFN = (int) refCols.size();
169 if (testAlign) TPFP = (int) testCols.size();
172 refPairs = ComputePairs (refAlign, true);
173 if (testAlign) testPairs = ComputePairs (testAlign, false);
174 set<PII> pairIntersect;
176 insert_iterator<set<PII> > pairIntersectIter (pairIntersect, pairIntersect.begin());
177 set_intersection (testPairs.begin(), testPairs.end(), refPairs.begin(), refPairs.end(), pairIntersectIter);
178 TP = (int) pairIntersect.size();
179 TPFN = (int) refPairs.size();
180 if (testAlign) TPFP = (int) testPairs.size();
183 FD = (double) TP / TPFN;
184 FM = (double) TP / TPFP;
186 cout << GetName(string (argv[2])) << " " << TP << " " << TPFN << " " << TPFP << " " << FD << " " << FM << endl;
189 ofstream outfile (outFilename.c_str());
190 for (int i = 0; i < (int) annotation.size(); i++){
191 outfile << annotation[i] << endl;
196 if (testAlign) delete testAlign;
200 int GetOffset (Sequence *testSeq, Sequence *refSeq){
201 string test = testSeq->GetString();
202 string ref = refSeq->GetString();
204 for (int i = 0; i < (int) test.length(); i++) test[i] = toupper(test[i]);
205 for (int i = 0; i < (int) ref.length(); i++) ref[i] = toupper(ref[i]);
207 size_t offset = test.find (ref, 0);
208 if (offset == string::npos){
209 cerr << "ERROR: Reference string not found in original sequence!" << endl;
210 cerr << " test = " << test << endl;
211 cerr << " ref = " << ref << endl;
215 cerr << "Offset found: " << offset << endl;
220 string GetName (string s){
222 size_t index1 = s.rfind ('/');
223 size_t index2 = s.rfind ('.');
225 if (index1 == string::npos) index1 = 0; else index1++;
226 if (index2 == string::npos) index2 = s.length();
228 if (index2 < index1) index2 = s.length();
230 return s.substr (index1, index2 - index1);
233 bool isCore (char ch, int col){
234 if (ch == '-') return false;
236 return coreCols.find (col) != coreCols.end();
239 return ch >= 'A' && ch <= 'Z';
241 return ch == CORE_BLOCK;
244 /////////////////////////////////////////////////////////////////
247 // Returns the set of all matching pairs.
248 /////////////////////////////////////////////////////////////////
250 set<PII> ComputePairs (MultiSequence *align, bool isRef){
251 int N = align->GetNumSequences();
252 int L = align->GetSequence(0)->GetLength();
254 // retrieve all sequence data pointers
255 SafeVector<SafeVector<char>::iterator> seqs (N);
256 for (int i = 0; i < N; i++){
257 seqs[i] = align->GetSequence(i)->GetDataPtr();
258 assert (align->GetSequence(i)->GetLength() == L);
265 for (int i = 1; i <= L; i++){
267 // ctr keeps track of the current position in each sequence
268 for (int j = 0; j < N; j++){
269 ctr[j] += (seqs[j][i] != '-');
275 // check for all matching pairs
276 for (int j = 0; j < N - 1; j++){
277 for (int k = j + 1; k < N; k++){
279 // skip if one of the sequences is gapped
280 if (seqs[j][i] == '-' || seqs[k][i] == '-') continue;
282 // check for core blocks in the reference sequence
283 if (isRef && useCoreBlocks)
284 if (!isCore (seqs[j][i], i) || !isCore (seqs[k][i], i)) continue;
286 // if all ok, then add pair to list of pairs
287 pair<int,int> p (10000 * j + ctr[j], 10000 * k + ctr[k]);
289 // if we're making an annotation, compute annotation statistics
290 if (makeAnnot && !isRef){
292 if (refPairs.find (p) != refPairs.end()) good++;
299 if (makeAnnot && !isRef){
300 annotation.push_back ((ct == 0) ? 0 : 100 * good / ct);
308 /////////////////////////////////////////////////////////////////
311 // Returns the set of all columns.
312 /////////////////////////////////////////////////////////////////
314 set<VI> ComputeColumns (MultiSequence *align, bool isRef){
315 int N = align->GetNumSequences();
316 int L = align->GetSequence(0)->GetLength();
318 // retrieve all sequence data pointers
319 SafeVector<SafeVector<char>::iterator> seqs (N);
320 for (int i = 0; i < N; i++){
321 seqs[i] = align->GetSequence(i)->GetDataPtr();
328 for (int i = 1; i <= L; i++){
330 // ctr keeps track of the current position in each sequence
331 for (int j = 0; j < N; j++){
332 ctr[j] += (seqs[j][i] != '-');
335 // add column, pick only positions that are matched
336 SafeVector<int> column (N);
337 bool useThisColumn = !useCoreBlocks;
339 for (int j = 0; j < N; j++){
340 if (isCore (seqs[j][i], i)) useThisColumn = true;
341 column[j] = (seqs[j][i] == '-') ? -1 : ctr[j];
344 if (useThisColumn || !isRef)