2 * Jalview - A Sequence Alignment Editor and Viewer (Version 2.4)
\r
3 * Copyright (C) 2008 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle
\r
5 * This program is free software; you can redistribute it and/or
\r
6 * modify it under the terms of the GNU General Public License
\r
7 * as published by the Free Software Foundation; either version 2
\r
8 * of the License, or (at your option) any later version.
\r
10 * This program is distributed in the hope that it will be useful,
\r
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
\r
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
\r
13 * GNU General Public License for more details.
\r
15 * You should have received a copy of the GNU General Public License
\r
16 * along with this program; if not, write to the Free Software
\r
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
\r
19 package jalview.datamodel;
\r
23 public abstract class CigarBase
\r
26 * Base class for compact idiosyncratic representation of gaps and aligned
\r
27 * residues Regards to Tom Oldfield for his DynamicArray class. 17th July 2006
\r
32 // nothing to be done (probably)
\r
35 protected int length = 0;
\r
37 protected int _inc_length = 10; // extension range for addition of new
\r
40 protected char[] operation = null;
\r
42 protected int[] range = null;
\r
45 * Range of Hidden residues in seq (translated as deleted in seq)
\r
47 public static final char D = 'D';
\r
50 * Range of insertions to seq
\r
52 public static final char I = 'I';
\r
55 * Range of aligned residues
\r
57 public static final char M = 'M';
\r
59 static protected final char _case_shift = 'a' - 'A';
\r
62 * Ugly function to get edited sequence string, start and end symbol positions
\r
63 * and the deletion regions as an array of int pairs May return null for an
\r
64 * empty cigar string. May return null for deletion ranges if there are none.
\r
66 * @param reference -
\r
67 * the symbol sequence to apply the cigar operations to (or
\r
68 * null if no sequence)
\r
70 * the symbol to use for Insert operations
\r
71 * @return Object[] { String, int[] {start, startcol, end, endcol}, int[][3]
\r
72 * {start, end, col} or null} the gapped sequence, first and last
\r
73 * residue index, and the deletion ranges on the reference sequence
\r
75 public Object[] getSequenceAndDeletions(String reference, char GapChar)
\r
78 int[][] deletions = new int[length][];
\r
79 int[][] trunc_deletions = null;
\r
80 StringBuffer sq = new StringBuffer();
\r
81 int cursor = 0, alcursor = 0, start = 0, startpos = 0, end = 0, endpos = 0, delcount = -1;
\r
82 boolean consecutive_del = false;
\r
87 if (reference != null)
\r
89 rlength = reference.length();
\r
91 boolean modstart = true;
\r
92 for (int i = 0; i < length; i++)
\r
94 switch (operation[i])
\r
97 if (!consecutive_del)
\r
99 deletions[++delcount] = new int[]
\r
100 { cursor, 0, alcursor };
\r
102 cursor += range[i];
\r
103 deletions[delcount][1] = cursor - 1;
\r
104 consecutive_del = true;
\r
107 consecutive_del = false;
\r
108 for (int r = 0; r < range[i]; r++)
\r
110 sq.append(GapChar);
\r
115 consecutive_del = false;
\r
119 startpos = alcursor;
\r
122 if (reference != null)
\r
124 int sbend = cursor + range[i];
\r
125 if (sbend > rlength)
\r
127 sq.append(reference.substring(cursor, rlength));
\r
128 while (sbend-- >= rlength)
\r
130 sq.append(GapChar);
\r
135 sq.append(reference.substring(cursor, sbend));
\r
138 alcursor += range[i];
\r
139 cursor += range[i];
\r
144 throw new Error("Unknown SeqCigar operation '" + operation[i] + "'");
\r
147 if (++delcount > 0)
\r
149 trunc_deletions = new int[delcount][];
\r
150 System.arraycopy(deletions, 0, trunc_deletions, 0, delcount);
\r
153 return new Object[]
\r
154 { ((reference != null) ? sq.toString() : null), new int[]
\r
155 { start, startpos, end, endpos }, trunc_deletions };
\r
158 protected void compact_operations()
\r
161 if (operation == null)
\r
165 char last = operation[0];
\r
168 if (last == operation[i])
\r
170 range[i - 1] += range[i];
\r
171 int r = length - i;
\r
174 System.arraycopy(range, i + 1, range, i, r);
\r
175 System.arraycopy(operation, i + 1, operation, i, r);
\r
181 last = operation[i++];
\r
187 * turn a cigar string into a series of operation range pairs
\r
189 * @param cigarString
\r
191 * @return object[] {char[] operation, int[] range}
\r
192 * @throws java.lang.Exception
\r
193 * for improperly formated cigar strings or ones with unknown
\r
196 public static Object[] parseCigarString(String cigarString)
\r
200 for (int i = 0, l = cigarString.length(); i < l; i++)
\r
202 char c = cigarString.charAt(i);
\r
203 if (c == M || c == (M - _case_shift) || c == I
\r
204 || c == (I - _case_shift) || c == D || c == (D - _case_shift))
\r
209 char[] operation = new char[ops];
\r
210 int[] range = new int[ops];
\r
212 int i = 0, l = cigarString.length();
\r
219 c = cigarString.charAt(j++);
\r
220 } while (c >= '0' && c <= '9' && j < l);
\r
221 if (j >= l && c >= '0' && c <= '9')
\r
223 throw new Exception("Unterminated cigar string.");
\r
227 String rangeint = cigarString.substring(i, j - 1);
\r
228 range[op] = Integer.parseInt(rangeint);
\r
230 } catch (Exception e)
\r
232 throw new Error("Implementation bug in parseCigarString");
\r
234 if (c >= 'a' && c <= 'z')
\r
238 if ((c == M || c == I || c == D))
\r
240 operation[op++] = c;
\r
244 throw new Exception("Unexpected operation '" + c
\r
245 + "' in cigar string (position " + i + " in '"
\r
246 + cigarString + "'");
\r
249 return new Object[]
\r
250 { operation, range };
\r
254 * add an operation to cigar string
\r
261 public void addOperation(char op, int range)
\r
263 if (op >= 'a' && op <= 'z')
\r
267 if (op != M && op != D && op != I)
\r
269 throw new Error("Implementation error. Invalid operation string.");
\r
274 "Invalid range string (must be non-zero positive number)");
\r
277 if (operation == null)
\r
279 this.operation = new char[_inc_length];
\r
280 this.range = new int[_inc_length];
\r
282 if (length + 1 == operation.length)
\r
284 char[] ops = this.operation;
\r
285 this.operation = new char[length + 1 + _inc_length];
\r
286 System.arraycopy(ops, 0, this.operation, 0, length);
\r
288 int[] rng = this.range;
\r
289 this.range = new int[length + 1 + _inc_length];
\r
290 System.arraycopy(rng, 0, this.range, 0, length);
\r
293 if ((length > 0) && (operation[length - 1] == op))
\r
295 length--; // modify existing operation.
\r
299 this.range[length] = 0; // reset range
\r
301 this.operation[length] = op;
\r
302 this.range[length++] += range;
\r
306 * semi-efficient insert an operation on the current cigar string set at
\r
307 * column pos (from 1) NOTE: Insertion operations simply extend width of cigar
\r
308 * result - affecting registration of alignment Deletion ops will shorten
\r
309 * length of result - and affect registration of alignment Match ops will also
\r
310 * affect length of result - affecting registration of alignment (ie
\r
311 * "10M".insert(4,I,3)->"4M3I3M") - (replace?) (ie
\r
312 * "10M".insert(4,D,3)->"4M3D3M") - (shortens alignment) (ie
\r
313 * "5I5M".insert(4,I,3)->"8I5M") - real insertion (ie
\r
314 * "5I5M".insert(4,D,3)->"4I2D3M") - shortens aligment - I's are removed, Ms
\r
315 * changed to Ds (ie "10M".insert(4,M,3)->"13M") - lengthens - Is changed to
\r
316 * M, Ds changed to M. (ie "5I5M".insert(4,M,3)->"4I8M") - effectively shifts
\r
317 * sequence left by 1 residue and extends it by 3 (
\r
318 * "10D5M".insert(-1,M,3)->"3M7D5M") ( "10D5M".insert(0,M,3)->"7D8M") (
\r
319 * "10D5M".insert(1,M,3)->"10D8M")
\r
320 * ( "1M10D5M".insert(0,M,3)->"1M10D8M") ( "1M10D5M".insert(1,M,3)->"
\r
322 * if pos is beyond width - I operations are added before the operation
\r
325 * int -1, 0-length of visible region, or greater to append new
\r
326 * ops (with insertions in between)
\r
330 * int public void addOperationAt(int pos, char op, int range) {
\r
331 * int cursor = -1; // mark the position for the current
\r
332 * operation being edited. int o = 0; boolean last_d = false; //
\r
333 * previous op was a deletion. if (pos < -1) throw new
\r
334 * Error("pos<-1 is not supported."); while (o<length) { if
\r
335 * (operation[o] != D) { if ( (cursor + this.range[o]) < pos) {
\r
336 * cursor += this.range[o]; o++; last_d=false; } else { break; } }
\r
337 * else { last_d=true; o++; } } if (o==length) { // must insert
\r
338 * more operations before pos if (pos-cursor>0)
\r
339 * addInsertion(pos-cursor); // then just add the new
\r
340 * operation. Regardless of what it is. addOperation(op,
\r
341 * range); } else { int diff = pos - cursor;
\r
343 * int e_length = length-o; // new edit operation array length. // diff<0 -
\r
344 * can only happen before first insertion or match. - affects op and all
\r
345 * following // dif==0 - only when at first position of existing op - //
\r
346 * diff>0 - must preserve some existing operations int[] e_range = new
\r
347 * int[e_length]; System.arraycopy(this.range, o, e_range, 0, e_length);
\r
348 * char[] e_op = new char[e_length]; System.arraycopy(this.operation, o, e_op,
\r
349 * 0, e_length); length = o; // can now use add_operation to extend list. int
\r
350 * e_o=0; // current operation being edited. switch (op) { case M: switch
\r
351 * (e_op[e_o]) { case M: if (last_d && diff <= 0) { // reduce D's, if possible
\r
352 * if (range<=this.range[o-1]) { this.range[o - 1] -= range; } else {
\r
353 * this.range[o-1]=0; } if (this.range[o-1]==0) o--; // lose this op. }
\r
354 * e_range[e_o] += range; // just add more matched residues break; case I: //
\r
355 * change from insertion to match if (last_d && diff<=0) { // reduce D's, if
\r
356 * possible if (range<=this.range[o-1]) { this.range[o - 1] -= range; } else {
\r
357 * this.range[o-1]=0; } if (this.range[o-1]==0) o--; // lose this op. }
\r
358 * e_range[e_o] break; default: throw new Inp }
\r
360 * break; case I: break; case D: } break; default: throw new
\r
361 * Error("Implementation Error: Unknown operation in addOperation!"); } //
\r
362 * finally, add remaining ops. while (e_o<e_length) { addOperation(e_op[e_o],
\r
363 * e_range[e_o]); e_o++; } } }
\r
366 * Mark residues from start to end (inclusive) as deleted from the alignment,
\r
367 * and removes any insertions.
\r
373 * @return deleted int - number of symbols marked as deleted
\r
375 public int deleteRange(int start, int end)
\r
380 // nothing to do here
\r
383 if (start < 0 || start > end)
\r
386 "Implementation Error: deleteRange out of bounds: start must be non-negative and less than end.");
\r
389 int cursor = 0; // mark the position for the current operation being edited.
\r
390 int rlength = 1 + end - start; // number of positions to delete
\r
391 int oldlen = length;
\r
393 boolean editing = false;
\r
394 char[] oldops = operation;
\r
395 int[] oldrange = range;
\r
399 compact_operations();
\r
400 while (o < oldlen && cursor <= end && rlength > 0)
\r
402 if (oldops[o] == D)
\r
404 // absorbed into new deleted region.
\r
405 addDeleted(oldrange[o++]);
\r
409 int remain = oldrange[o]; // number of op characters left to edit
\r
412 if ((cursor + remain) <= start)
\r
414 addOperation(oldops[o], oldrange[o]);
\r
415 cursor += oldrange[o++];
\r
416 continue; // next operation
\r
419 // add operations before hidden region
\r
420 if (start - cursor > 0)
\r
422 addOperation(oldops[o], start - cursor);
\r
423 remain -= start - cursor;
\r
426 // start inserting new ops
\r
427 if (o < oldlen && editing && rlength > 0 && remain > 0)
\r
432 if (rlength > remain)
\r
434 addDeleted(remain);
\r
439 deleted += rlength;
\r
440 addDeleted(rlength);
\r
441 if (remain - rlength > 0)
\r
443 this.addOperation(M, remain - rlength); // add remaining back.
\r
450 if (remain - rlength > 0)
\r
452 // only remove some gaps
\r
453 addInsertion(remain - rlength);
\r
458 throw new Error("Implementation error."); // do nothing;
\r
460 throw new Error("Implementation Error! Unknown operation '"
\r
461 + oldops[o] + "'");
\r
464 remain = oldrange[++o]; // number of op characters left to edit
\r
470 addOperation(oldops[o], oldrange[o++]);
\r
472 // if (cursor<(start+1)) {
\r
473 // ran out of ops - nothing to do here ?
\r
474 // addInsertion(start-cursor);
\r
480 * Deleted regions mean that there will be discontinuous sequence numbering in
\r
481 * the sequence returned by getSeq(char).
\r
483 * @return true if there deletions
\r
485 public boolean hasDeletedRegions()
\r
487 for (int i = 0; i < length; i++)
\r
489 if (operation[i] == D)
\r
498 * enumerate the ranges on seq that are marked as deleted in this cigar
\r
500 * @return int[] { vis_start, sym_start, length }
\r
502 public int[] getDeletedRegions()
\r
508 Vector dr = new Vector();
\r
509 int cursor = 0, vcursor = 0;
\r
510 for (int i = 0; i < length; i++)
\r
512 switch (operation[i])
\r
515 cursor += range[i];
\r
517 vcursor += range[i];
\r
520 dr.addElement(new int[]
\r
521 { vcursor, cursor, range[i] });
\r
522 cursor += range[i];
\r
525 if (dr.size() == 0)
\r
529 int[] delregions = new int[dr.size() * 3];
\r
530 for (int i = 0, l = dr.size(); i < l; i++)
\r
532 int[] reg = (int[]) dr.elementAt(i);
\r
533 delregions[i * 3] = reg[0];
\r
534 delregions[i * 3 + 1] = reg[1];
\r
535 delregions[i * 3 + 2] = reg[2];
\r
541 * sum of ranges in cigar string
\r
543 * @return int number of residues hidden, matched, or gaps inserted into
\r
546 public int getFullWidth()
\r
551 for (int i = 0; i < length; i++)
\r
560 * Visible length of aligned sequence
\r
562 * @return int length of including gaps and less hidden regions
\r
564 public int getWidth()
\r
569 for (int i = 0; i < length; i++)
\r
571 if (operation[i] == M || operation[i] == I)
\r
581 * mark a range of inserted residues
\r
586 public void addInsertion(int range)
\r
588 this.addOperation(I, range);
\r
592 * mark the next range residues as hidden (not aligned) or deleted
\r
597 public void addDeleted(int range)
\r
599 this.addOperation(D, range);
\r
603 * Modifies operation list to delete columns from start to end (inclusive)
\r
604 * editing will remove insertion operations, and convert matches to deletions
\r
610 * @return boolean true if residues were marked as deleted. public boolean
\r
611 * deleteRange(int start, int end) { boolean deleted = false; int op =
\r
612 * 0, prevop = -1, firstm = -1, lastm = -1, postop = -1; int width =
\r
613 * 0; // zero'th column if (length > 0) { // find operation bracketing
\r
614 * start of the range do { if (operation[op] != D) { width +=
\r
615 * range[prevop = op]; } op++; } while (op < length && width < start); }
\r
616 * if (width < start) { // run off end - add more operations up to
\r
617 * deletion. addInsertion(start - width); } else { // edit existing
\r
618 * operations. op = prevop; width -= range[prevop]; int[] oldrange =
\r
619 * range; char[] oldops = operation; range = new int[oldrange.length];
\r
620 * operation = new char[oldops.length]; if (op < length) { do { if
\r
621 * (operation[op] != D) { width += range[postop = op]; } op++; } while
\r
622 * (op < length && width <= end); } } if (deleted == true) {
\r
623 * addDeleted(end - start + 1); } return deleted; }
\r
626 * Return an ENSEMBL style cigar string where D may indicates excluded parts
\r
629 * @return String of form ([0-9]+[IMD])+
\r
631 public String getCigarstring()
\r
633 StringBuffer cigarString = new StringBuffer();
\r
634 for (int i = 0; i < length; i++)
\r
636 cigarString.append("" + range[i]);
\r
637 cigarString.append(operation[i]);
\r
639 return cigarString.toString();
\r