1 /////////////////////////////////////////////////////////////////
4 // Program for scoring alignments according to the SUM-OF-PAIRS
6 /////////////////////////////////////////////////////////////////
8 #include "SafeVector.h"
9 #include "MultiSequence.h"
22 const char CORE_BLOCK = 'h';
23 typedef pair<int,int> PII;
24 bool useCoreBlocks = false;
25 bool useColScore = false;
27 bool useBaliAnnot = false;
28 bool makeAnnot = false;
30 /////////////////////////////////////////////////////////////////
31 // Function prototypes
32 /////////////////////////////////////////////////////////////////
34 set<PII> ComputePairs (MultiSequence *align, bool isRef);
35 set<VI> ComputeColumns (MultiSequence *align, bool isRef);
36 string GetName (string s);
39 set<VI> refCols, testCols;
40 set<PII> refPairs, testPairs;
43 /////////////////////////////////////////////////////////////////
47 /////////////////////////////////////////////////////////////////
49 int main (int argc, char **argv){
53 cerr << "Usage: score TEST_ALIGNMENT REFERENCE_ALIGNMENT [BALIBASE_ANNOT_FILE] [-col] [-core] [-caps] [-annot FILENAME]" << endl;
58 FileBuffer infile (argv[1]);
60 MultiSequence *testAlign;
62 cerr << "ERROR: Could not open file '" << argv[1] << "' for reading." << endl;
66 testAlign = new MultiSequence(); assert (testAlign);
67 testAlign->LoadMFA (infile);
71 MultiSequence *refAlign = new MultiSequence (string (argv[2])); assert (refAlign);
73 string outFilename = "";
75 for (int i = 3; i < argc; i++){
76 if (strcmp (argv[i], "-core") == 0)
78 else if (strcmp (argv[i], "-col") == 0)
80 else if (strcmp (argv[i], "-caps") == 0)
82 else if (strcmp (argv[i], "-annot") == 0){
84 outFilename = string (argv[++i]);
86 else { // annotation file
89 ifstream annotFile (argv[i]);
90 if (annotFile.fail()){
91 cerr << "ERROR: Could not read BAliBASE annotation file." << endl;
95 SafeVector<int> *indices = refAlign->GetSequence(0)->GetMapping();
98 while (annotFile.getline (buffer, 10000)){
100 ss.str (string (buffer));
104 if ((ss >> s) && s == string ("BPOS")){
106 int begin=-1, end=-1;
107 if (sscanf (s.c_str(), "%d=%d", &begin, &end) == 2){
108 for (int i = (*indices)[begin]; i <= (*indices)[end]; i++)
121 if (useColScore) makeAnnot = false;
124 for (int i = 0; i < testAlign->GetNumSequences(); i++){
127 for (int j = 0; !found && j < refAlign->GetNumSequences(); j++){
128 if (testAlign->GetSequence(i)->GetHeader() == refAlign->GetSequence(j)->GetHeader())
133 testAlign->RemoveSequence (i);
138 for (int i = 0; i < refAlign->GetNumSequences(); i++){
141 for (int j = 0; !found && j < testAlign->GetNumSequences(); j++){
142 if (refAlign->GetSequence(i)->GetHeader() == testAlign->GetSequence(j)->GetHeader())
147 refAlign->RemoveSequence (i);
152 testAlign->SortByHeader();
153 refAlign->SortByHeader();
161 refCols = ComputeColumns (refAlign, true);
162 if (testAlign) testCols = ComputeColumns (testAlign, false);
163 set<VI> colIntersect;
164 insert_iterator<set<VI> > colIntersectIter (colIntersect, colIntersect.begin());
165 set_intersection (testCols.begin(), testCols.end(), refCols.begin(), refCols.end(), colIntersectIter);
166 TP = (int) colIntersect.size();
167 TPFN = (int) refCols.size();
168 if (testAlign) TPFP = (int) testCols.size();
171 refPairs = ComputePairs (refAlign, true);
172 if (testAlign) testPairs = ComputePairs (testAlign, false);
173 set<PII> pairIntersect;
175 insert_iterator<set<PII> > pairIntersectIter (pairIntersect, pairIntersect.begin());
176 set_intersection (testPairs.begin(), testPairs.end(), refPairs.begin(), refPairs.end(), pairIntersectIter);
177 TP = (int) pairIntersect.size();
178 TPFN = (int) refPairs.size();
179 if (testAlign) TPFP = (int) testPairs.size();
182 FD = (double) TP / TPFN;
183 FM = (double) TP / TPFP;
185 cout << GetName(string (argv[2])) << " " << TP << " " << TPFN << " " << TPFP << " " << FD << " " << FM << endl;
188 ofstream outfile (outFilename.c_str());
189 for (int i = 0; i < (int) annotation.size(); i++){
190 outfile << annotation[i] << endl;
195 if (testAlign) delete testAlign;
199 int GetOffset (Sequence *testSeq, Sequence *refSeq){
200 string test = testSeq->GetString();
201 string ref = refSeq->GetString();
203 for (int i = 0; i < (int) test.length(); i++) test[i] = toupper(test[i]);
204 for (int i = 0; i < (int) ref.length(); i++) ref[i] = toupper(ref[i]);
206 size_t offset = test.find (ref, 0);
207 if (offset == string::npos){
208 cerr << "ERROR: Reference string not found in original sequence!" << endl;
209 cerr << " test = " << test << endl;
210 cerr << " ref = " << ref << endl;
214 cerr << "Offset found: " << offset << endl;
219 string GetName (string s){
221 size_t index1 = s.rfind ('/');
222 size_t index2 = s.rfind ('.');
224 if (index1 == string::npos) index1 = 0; else index1++;
225 if (index2 == string::npos) index2 = s.length();
227 if (index2 < index1) index2 = s.length();
229 return s.substr (index1, index2 - index1);
232 bool isCore (char ch, int col){
233 if (ch == '-') return false;
235 return coreCols.find (col) != coreCols.end();
238 return ch >= 'A' && ch <= 'Z';
240 return ch == CORE_BLOCK;
243 /////////////////////////////////////////////////////////////////
246 // Returns the set of all matching pairs.
247 /////////////////////////////////////////////////////////////////
249 set<PII> ComputePairs (MultiSequence *align, bool isRef){
250 int N = align->GetNumSequences();
251 int L = align->GetSequence(0)->GetLength();
253 // retrieve all sequence data pointers
254 SafeVector<SafeVector<char>::iterator> seqs (N);
255 for (int i = 0; i < N; i++){
256 seqs[i] = align->GetSequence(i)->GetDataPtr();
257 assert (align->GetSequence(i)->GetLength() == L);
264 for (int i = 1; i <= L; i++){
266 // ctr keeps track of the current position in each sequence
267 for (int j = 0; j < N; j++){
268 ctr[j] += (seqs[j][i] != '-');
274 // check for all matching pairs
275 for (int j = 0; j < N - 1; j++){
276 for (int k = j + 1; k < N; k++){
278 // skip if one of the sequences is gapped
279 if (seqs[j][i] == '-' || seqs[k][i] == '-') continue;
281 // check for core blocks in the reference sequence
282 if (isRef && useCoreBlocks)
283 if (!isCore (seqs[j][i], i) || !isCore (seqs[k][i], i)) continue;
285 // if all ok, then add pair to list of pairs
286 pair<int,int> p (10000 * j + ctr[j], 10000 * k + ctr[k]);
288 // if we're making an annotation, compute annotation statistics
289 if (makeAnnot && !isRef){
291 if (refPairs.find (p) != refPairs.end()) good++;
298 if (makeAnnot && !isRef){
299 annotation.push_back ((ct == 0) ? 0 : 100 * good / ct);
307 /////////////////////////////////////////////////////////////////
310 // Returns the set of all columns.
311 /////////////////////////////////////////////////////////////////
313 set<VI> ComputeColumns (MultiSequence *align, bool isRef){
314 int N = align->GetNumSequences();
315 int L = align->GetSequence(0)->GetLength();
317 // retrieve all sequence data pointers
318 SafeVector<SafeVector<char>::iterator> seqs (N);
319 for (int i = 0; i < N; i++){
320 seqs[i] = align->GetSequence(i)->GetDataPtr();
327 for (int i = 1; i <= L; i++){
329 // ctr keeps track of the current position in each sequence
330 for (int j = 0; j < N; j++){
331 ctr[j] += (seqs[j][i] != '-');
334 // add column, pick only positions that are matched
335 SafeVector<int> column (N);
336 bool useThisColumn = !useCoreBlocks;
338 for (int j = 0; j < N; j++){
339 if (isCore (seqs[j][i], i)) useThisColumn = true;
340 column[j] = (seqs[j][i] == '-') ? -1 : ctr[j];
343 if (useThisColumn || !isRef)