1 package jalview.datamodel;
\r
3 import java.util.Vector;
\r
5 public abstract class CigarBase
\r
8 * Base class for compact idiosyncratic representation of gaps and aligned residues
\r
9 * Regards to Tom Oldfield for his DynamicArray class.
\r
15 // nothing to be done (probably)
\r
18 protected int length = 0;
\r
19 protected int _inc_length = 10; // extension range for addition of new operations
\r
20 protected char[] operation = null;
\r
21 protected int[] range = null;
\r
23 * Range of Hidden residues in seq (translated as deleted in seq)
\r
25 public static final char D = 'D'; /**
\r
26 * Range of insertions to seq
\r
28 public static final char I = 'I'; /**
\r
29 * Range of aligned residues
\r
31 public static final char M = 'M';
\r
32 static protected final char _case_shift = 'a' - 'A';
\r
34 * Ugly function to get edited sequence string, start and end symbol positions and the deletion regions as an array of int pairs
\r
35 * May return null for an empty cigar string.
\r
36 * May return null for deletion ranges if there are none.
\r
37 * @param reference - the symbol sequence to apply the cigar operations to (or null if no sequence)
\r
38 * @param GapChar - the symbol to use for Insert operations
\r
39 * @return Object[] { String, int[] {start, startcol, end, endcol}, int[][3] {start, end, col} or null} the gapped sequence, first and last residue index, and the deletion ranges on the reference sequence
\r
41 public Object[] getSequenceAndDeletions(String reference, char GapChar)
\r
44 int[][] deletions = new int[length][];
\r
45 int[][] trunc_deletions = null;
\r
46 StringBuffer sq = new StringBuffer();
\r
47 int cursor = 0, alcursor=0,start = 0, startpos=0, end = 0, endpos=0, delcount = -1;
\r
48 boolean consecutive_del = false;
\r
53 if (reference!=null)
\r
54 rlength=reference.length();
\r
55 boolean modstart = true;
\r
56 for (int i = 0; i < length; i++)
\r
58 switch (operation[i])
\r
61 if (!consecutive_del)
\r
63 deletions[++delcount] = new int[]
\r
65 cursor, 0, alcursor};
\r
68 deletions[delcount][1] = cursor - 1;
\r
69 consecutive_del = true;
\r
72 consecutive_del = false;
\r
73 for (int r = 0; r < range[i]; r++)
\r
80 consecutive_del = false;
\r
87 if (reference!=null) {
\r
88 int sbend = cursor+range[i];
\r
89 if (sbend>rlength) {
\r
90 sq.append(reference.substring(cursor, rlength));
\r
91 while (sbend-- >= rlength)
\r
96 sq.append(reference.substring(cursor, sbend));
\r
100 cursor += range[i];
\r
105 throw new Error("Unknown SeqCigar operation '" + operation[i] + "'");
\r
108 if (++delcount > 0)
\r
110 trunc_deletions = new int[delcount][];
\r
111 System.arraycopy(deletions, 0, trunc_deletions, 0, delcount);
\r
114 return new Object[]
\r
116 ((reference!=null) ? sq.toString() : null),
\r
118 start, startpos, end, endpos}, trunc_deletions};
\r
120 protected void compact_operations() {
\r
122 if (operation==null)
\r
124 char last = operation[0];
\r
126 if (last==operation[i]) {
\r
127 range[i-1]+=range[i];
\r
130 System.arraycopy(range, i + 1, range, i, r);
\r
131 System.arraycopy(operation, i + 1, operation, i, r);
\r
135 last = operation[i++];
\r
140 * turn a cigar string into a series of operation range pairs
\r
141 * @param cigarString String
\r
142 * @return object[] {char[] operation, int[] range}
\r
143 * @throws java.lang.Exception for improperly formated cigar strings or ones with unknown operations
\r
145 public static Object[] parseCigarString(String cigarString)
\r
149 for (int i = 0, l = cigarString.length(); i < l; i++)
\r
151 char c = cigarString.charAt(i);
\r
152 if (c == M || c == (M - _case_shift) || c == I || c == (I - _case_shift) ||
\r
153 c == D || c == (D - _case_shift))
\r
158 char[] operation = new char[ops];
\r
159 int[] range = new int[ops];
\r
161 int i = 0, l = cigarString.length();
\r
168 c = cigarString.charAt(j++);
\r
170 while (c >= '0' && c <= '9' && j < l);
\r
171 if (j >= l && c >= '0' && c <= '9')
\r
173 throw new Exception("Unterminated cigar string.");
\r
177 String rangeint = cigarString.substring(i, j - 1);
\r
178 range[op] = Integer.parseInt(rangeint);
\r
181 catch (Exception e)
\r
183 throw new Error("Implementation bug in parseCigarString");
\r
185 if (c >= 'a' && c <= 'z')
\r
189 if ( (c == M || c == I || c == D))
\r
191 operation[op++] = c;
\r
195 throw new Exception("Unexpected operation '" + c +
\r
196 "' in cigar string (position " + i + " in '" +
\r
197 cigarString + "'");
\r
200 return new Object[]
\r
206 * add an operation to cigar string
\r
210 public void addOperation(char op, int range)
\r
212 if (op >= 'a' && op <= 'z')
\r
216 if (op != M && op != D && op != I)
\r
218 throw new Error("Implementation error. Invalid operation string.");
\r
221 throw new Error("Invalid range string (must be non-zero positive number)");
\r
223 if (operation == null)
\r
225 this.operation = new char[_inc_length];
\r
226 this.range = new int[_inc_length];
\r
228 if (length + 1 == operation.length)
\r
230 char[] ops = this.operation;
\r
231 this.operation = new char[length + 1 + _inc_length];
\r
232 System.arraycopy(ops, 0, this.operation, 0, length);
\r
234 int[] rng = this.range;
\r
235 this.range = new int[length + 1 + _inc_length];
\r
236 System.arraycopy(rng, 0, this.range, 0, length);
\r
239 if ((length>0) && (operation[length-1]==op))
\r
240 length--; // modify existing operation.
\r
242 this.range[length]=0; // reset range
\r
243 this.operation[length] = op;
\r
244 this.range[length++] += range;
\r
248 * semi-efficient insert an operation on the current cigar string set at column pos (from 1)
\r
249 * NOTE: Insertion operations simply extend width of cigar result - affecting registration of alignment
\r
250 * Deletion ops will shorten length of result - and affect registration of alignment
\r
251 * Match ops will also affect length of result - affecting registration of alignment
\r
252 * (ie "10M".insert(4,I,3)->"4M3I3M") - (replace?)
\r
253 * (ie "10M".insert(4,D,3)->"4M3D3M") - (shortens alignment)
\r
254 * (ie "5I5M".insert(4,I,3)->"8I5M") - real insertion
\r
255 * (ie "5I5M".insert(4,D,3)->"4I2D3M") - shortens aligment - I's are removed, Ms changed to Ds
\r
256 * (ie "10M".insert(4,M,3)->"13M") - lengthens - Is changed to M, Ds changed to M.
\r
257 * (ie "5I5M".insert(4,M,3)->"4I8M") - effectively shifts sequence left by 1 residue and extends it by 3
\r
258 * ( "10D5M".insert(-1,M,3)->"3M7D5M")
\r
259 * ( "10D5M".insert(0,M,3)->"7D8M")
\r
260 * ( "10D5M".insert(1,M,3)->"10D8M")
\r
262 * ( "1M10D5M".insert(0,M,3)->"1M10D8M")
\r
263 * ( "1M10D5M".insert(1,M,3)->"
\r
265 * if pos is beyond width - I operations are added before the operation
\r
266 * @param pos int -1, 0-length of visible region, or greater to append new ops (with insertions in between)
\r
269 public void addOperationAt(int pos, char op, int range)
\r
271 int cursor = -1; // mark the position for the current operation being edited.
\r
273 boolean last_d = false; // previous op was a deletion.
\r
275 throw new Error("pos<-1 is not supported.");
\r
277 if (operation[o] != D)
\r
279 if ( (cursor + this.range[o]) < pos)
\r
281 cursor += this.range[o];
\r
296 // must insert more operations before pos
\r
298 addInsertion(pos-cursor);
\r
299 // then just add the new operation. Regardless of what it is.
\r
300 addOperation(op, range);
\r
302 int diff = pos - cursor;
\r
304 int e_length = length-o; // new edit operation array length.
\r
305 // diff<0 - can only happen before first insertion or match. - affects op and all following
\r
306 // dif==0 - only when at first position of existing op -
\r
307 // diff>0 - must preserve some existing operations
\r
308 int[] e_range = new int[e_length];
\r
309 System.arraycopy(this.range, o, e_range, 0, e_length);
\r
310 char[] e_op = new char[e_length];
\r
311 System.arraycopy(this.operation, o, e_op, 0, e_length);
\r
312 length = o; // can now use add_operation to extend list.
\r
313 int e_o=0; // current operation being edited.
\r
319 if (last_d && diff <= 0)
\r
321 // reduce D's, if possible
\r
322 if (range<=this.range[o-1]) {
\r
323 this.range[o - 1] -= range;
\r
327 if (this.range[o-1]==0)
\r
328 o--; // lose this op.
\r
330 e_range[e_o] += range; // just add more matched residues
\r
333 // change from insertion to match
\r
334 if (last_d && diff<=0)
\r
336 // reduce D's, if possible
\r
337 if (range<=this.range[o-1]) {
\r
338 this.range[o - 1] -= range;
\r
342 if (this.range[o-1]==0)
\r
343 o--; // lose this op.
\r
358 throw new Error("Implementation Error: Unknown operation in addOperation!");
\r
360 // finally, add remaining ops.
\r
361 while (e_o<e_length) {
\r
362 addOperation(e_op[e_o], e_range[e_o]);
\r
369 * Mark residues from start to end (inclusive) as deleted from the alignment, and removes any insertions.
\r
372 * @return deleted int - number of symbols marked as deleted
\r
374 public int deleteRange(int start, int end) {
\r
377 // nothing to do here
\r
380 if (start<0 || start>end)
\r
381 throw new Error("Implementation Error: deleteRange out of bounds: start must be non-negative and less than end.");
\r
383 int cursor = 0; // mark the position for the current operation being edited.
\r
384 int rlength=1+end-start; // number of positions to delete
\r
387 boolean editing=false;
\r
388 char[] oldops = operation;
\r
389 int[] oldrange = range;
\r
393 compact_operations();
\r
394 while (o<oldlen && cursor<=end && rlength>0) {
\r
395 if (oldops[o] == D) {
\r
396 // absorbed into new deleted region.
\r
397 addDeleted(oldrange[o++]);
\r
401 int remain = oldrange[o]; // number of op characters left to edit
\r
403 if ( (cursor + remain) <= start)
\r
405 addOperation(oldops[o],oldrange[o]);
\r
406 cursor+=oldrange[o++];
\r
407 continue; // next operation
\r
410 // add operations before hidden region
\r
411 if (start-cursor>0) {
\r
412 addOperation(oldops[o], start- cursor);
\r
413 remain -= start - cursor;
\r
416 // start inserting new ops
\r
417 if (o<oldlen && editing && rlength>0 && remain>0) {
\r
418 switch (oldops[o]) {
\r
420 if (rlength>remain) {
\r
421 addDeleted(remain);
\r
425 addDeleted(rlength);
\r
426 if (remain-rlength>0)
\r
427 this.addOperation(M,remain-rlength); // add remaining back.
\r
433 if (remain-rlength>0) {
\r
434 // only remove some gaps
\r
435 addInsertion(remain-rlength);
\r
440 throw new Error("Implementation error."); // do nothing;
\r
442 throw new Error("Implementation Error! Unknown operation '"+oldops[o]+"'");
\r
445 remain = oldrange[++o]; // number of op characters left to edit
\r
450 addOperation(oldops[o],oldrange[o++]);
\r
452 //if (cursor<(start+1)) {
\r
453 // ran out of ops - nothing to do here ?
\r
454 // addInsertion(start-cursor);
\r
460 * Deleted regions mean that there will be discontinuous sequence numbering in the
\r
461 * sequence returned by getSeq(char).
\r
462 * @return true if there deletions
\r
464 public boolean hasDeletedRegions()
\r
466 for (int i = 0; i<length ; i++)
\r
468 if (operation[i] == D)
\r
476 * enumerate the ranges on seq that are marked as deleted in this cigar
\r
477 * @return int[] { vis_start, sym_start, length }
\r
479 public int[] getDeletedRegions() {
\r
482 Vector dr = new Vector();
\r
483 int cursor=0, vcursor=0;
\r
484 for (int i=0;i<length;i++) {
\r
485 switch (operation[i]) {
\r
492 dr.addElement(new int[] { vcursor, cursor, range[i]});
\r
498 int[] delregions = new int[dr.size()*3];
\r
499 for (int i=0,l=dr.size(); i<l; i++) {
\r
500 int[] reg = (int[]) dr.elementAt(i);
\r
501 delregions[i*3] = reg[0];
\r
502 delregions[i*3+1] = reg[1];
\r
503 delregions[i*3+2] = reg[2];
\r
508 * sum of ranges in cigar string
\r
509 * @return int number of residues hidden, matched, or gaps inserted into sequence
\r
511 public int getFullWidth()
\r
516 for (int i = 0; i < length; i++)
\r
525 * Visible length of aligned sequence
\r
526 * @return int length of including gaps and less hidden regions
\r
528 public int getWidth()
\r
533 for (int i = 0; i < length; i++)
\r
535 if (operation[i] == M || operation[i] == I)
\r
544 * mark a range of inserted residues
\r
547 public void addInsertion(int range)
\r
549 this.addOperation(I, range);
\r
553 * mark the next range residues as hidden (not aligned) or deleted
\r
556 public void addDeleted(int range)
\r
558 this.addOperation(D, range);
\r
562 * Modifies operation list to delete columns from start to end (inclusive)
\r
563 * editing will remove insertion operations, and convert matches to deletions
\r
564 * @param start alignment column
\r
565 * @param end alignment column
\r
566 * @return boolean true if residues were marked as deleted.
\r
567 public boolean deleteRange(int start, int end)
\r
569 boolean deleted = false;
\r
570 int op = 0, prevop = -1, firstm = -1,
\r
571 lastm = -1, postop = -1;
\r
572 int width = 0; // zero'th column
\r
575 // find operation bracketing start of the range
\r
578 if (operation[op] != D)
\r
580 width += range[prevop = op];
\r
584 while (op < length && width < start);
\r
588 // run off end - add more operations up to deletion.
\r
589 addInsertion(start - width);
\r
593 // edit existing operations.
\r
595 width -= range[prevop];
\r
596 int[] oldrange = range;
\r
597 char[] oldops = operation;
\r
598 range = new int[oldrange.length];
\r
599 operation = new char[oldops.length];
\r
604 if (operation[op] != D)
\r
606 width += range[postop = op];
\r
610 while (op < length && width <= end);
\r
613 if (deleted == true)
\r
615 addDeleted(end - start + 1);
\r
621 * Return an ENSEMBL style cigar string where D may indicates excluded parts of seq
\r
622 * @return String of form ([0-9]+[IMD])+
\r
624 public String getCigarstring()
\r
626 StringBuffer cigarString = new StringBuffer();
\r
627 for (int i = 0; i < length; i++)
\r
629 cigarString.append("" + range[i]);
\r
630 cigarString.append(operation[i]);
\r
632 return cigarString.toString();
\r