2 * Jalview - A Sequence Alignment Editor and Viewer
\r
3 * Copyright (C) 2006 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle
\r
5 * This program is free software; you can redistribute it and/or
\r
6 * modify it under the terms of the GNU General Public License
\r
7 * as published by the Free Software Foundation; either version 2
\r
8 * of the License, or (at your option) any later version.
\r
10 * This program is distributed in the hope that it will be useful,
\r
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
\r
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
\r
13 * GNU General Public License for more details.
\r
15 * You should have received a copy of the GNU General Public License
\r
16 * along with this program; if not, write to the Free Software
\r
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
\r
19 package jalview.datamodel;
\r
21 import java.util.Vector;
\r
23 public abstract class CigarBase
\r
26 * Base class for compact idiosyncratic representation of gaps and aligned residues
\r
27 * Regards to Tom Oldfield for his DynamicArray class.
\r
33 // nothing to be done (probably)
\r
36 protected int length = 0;
\r
37 protected int _inc_length = 10; // extension range for addition of new operations
\r
38 protected char[] operation = null;
\r
39 protected int[] range = null;
\r
41 * Range of Hidden residues in seq (translated as deleted in seq)
\r
43 public static final char D = 'D'; /**
\r
44 * Range of insertions to seq
\r
46 public static final char I = 'I'; /**
\r
47 * Range of aligned residues
\r
49 public static final char M = 'M';
\r
50 static protected final char _case_shift = 'a' - 'A';
\r
52 * Ugly function to get edited sequence string, start and end symbol positions and the deletion regions as an array of int pairs
\r
53 * May return null for an empty cigar string.
\r
54 * May return null for deletion ranges if there are none.
\r
55 * @param reference - the symbol sequence to apply the cigar operations to (or null if no sequence)
\r
56 * @param GapChar - the symbol to use for Insert operations
\r
57 * @return Object[] { String, int[] {start, startcol, end, endcol}, int[][3] {start, end, col} or null} the gapped sequence, first and last residue index, and the deletion ranges on the reference sequence
\r
59 public Object[] getSequenceAndDeletions(String reference, char GapChar)
\r
62 int[][] deletions = new int[length][];
\r
63 int[][] trunc_deletions = null;
\r
64 StringBuffer sq = new StringBuffer();
\r
65 int cursor = 0, alcursor=0,start = 0, startpos=0, end = 0, endpos=0, delcount = -1;
\r
66 boolean consecutive_del = false;
\r
71 if (reference!=null)
\r
72 rlength=reference.length();
\r
73 boolean modstart = true;
\r
74 for (int i = 0; i < length; i++)
\r
76 switch (operation[i])
\r
79 if (!consecutive_del)
\r
81 deletions[++delcount] = new int[]
\r
83 cursor, 0, alcursor};
\r
86 deletions[delcount][1] = cursor - 1;
\r
87 consecutive_del = true;
\r
90 consecutive_del = false;
\r
91 for (int r = 0; r < range[i]; r++)
\r
98 consecutive_del = false;
\r
105 if (reference!=null) {
\r
106 int sbend = cursor+range[i];
\r
107 if (sbend>rlength) {
\r
108 sq.append(reference.substring(cursor, rlength));
\r
109 while (sbend-- >= rlength)
\r
111 sq.append(GapChar);
\r
114 sq.append(reference.substring(cursor, sbend));
\r
117 alcursor+=range[i];
\r
118 cursor += range[i];
\r
123 throw new Error("Unknown SeqCigar operation '" + operation[i] + "'");
\r
126 if (++delcount > 0)
\r
128 trunc_deletions = new int[delcount][];
\r
129 System.arraycopy(deletions, 0, trunc_deletions, 0, delcount);
\r
132 return new Object[]
\r
134 ((reference!=null) ? sq.toString() : null),
\r
136 start, startpos, end, endpos}, trunc_deletions};
\r
138 protected void compact_operations() {
\r
140 if (operation==null)
\r
142 char last = operation[0];
\r
144 if (last==operation[i]) {
\r
145 range[i-1]+=range[i];
\r
148 System.arraycopy(range, i + 1, range, i, r);
\r
149 System.arraycopy(operation, i + 1, operation, i, r);
\r
153 last = operation[i++];
\r
158 * turn a cigar string into a series of operation range pairs
\r
159 * @param cigarString String
\r
160 * @return object[] {char[] operation, int[] range}
\r
161 * @throws java.lang.Exception for improperly formated cigar strings or ones with unknown operations
\r
163 public static Object[] parseCigarString(String cigarString)
\r
167 for (int i = 0, l = cigarString.length(); i < l; i++)
\r
169 char c = cigarString.charAt(i);
\r
170 if (c == M || c == (M - _case_shift) || c == I || c == (I - _case_shift) ||
\r
171 c == D || c == (D - _case_shift))
\r
176 char[] operation = new char[ops];
\r
177 int[] range = new int[ops];
\r
179 int i = 0, l = cigarString.length();
\r
186 c = cigarString.charAt(j++);
\r
188 while (c >= '0' && c <= '9' && j < l);
\r
189 if (j >= l && c >= '0' && c <= '9')
\r
191 throw new Exception("Unterminated cigar string.");
\r
195 String rangeint = cigarString.substring(i, j - 1);
\r
196 range[op] = Integer.parseInt(rangeint);
\r
199 catch (Exception e)
\r
201 throw new Error("Implementation bug in parseCigarString");
\r
203 if (c >= 'a' && c <= 'z')
\r
207 if ( (c == M || c == I || c == D))
\r
209 operation[op++] = c;
\r
213 throw new Exception("Unexpected operation '" + c +
\r
214 "' in cigar string (position " + i + " in '" +
\r
215 cigarString + "'");
\r
218 return new Object[]
\r
224 * add an operation to cigar string
\r
228 public void addOperation(char op, int range)
\r
230 if (op >= 'a' && op <= 'z')
\r
234 if (op != M && op != D && op != I)
\r
236 throw new Error("Implementation error. Invalid operation string.");
\r
239 throw new Error("Invalid range string (must be non-zero positive number)");
\r
241 if (operation == null)
\r
243 this.operation = new char[_inc_length];
\r
244 this.range = new int[_inc_length];
\r
246 if (length + 1 == operation.length)
\r
248 char[] ops = this.operation;
\r
249 this.operation = new char[length + 1 + _inc_length];
\r
250 System.arraycopy(ops, 0, this.operation, 0, length);
\r
252 int[] rng = this.range;
\r
253 this.range = new int[length + 1 + _inc_length];
\r
254 System.arraycopy(rng, 0, this.range, 0, length);
\r
257 if ((length>0) && (operation[length-1]==op))
\r
258 length--; // modify existing operation.
\r
260 this.range[length]=0; // reset range
\r
261 this.operation[length] = op;
\r
262 this.range[length++] += range;
\r
266 * semi-efficient insert an operation on the current cigar string set at column pos (from 1)
\r
267 * NOTE: Insertion operations simply extend width of cigar result - affecting registration of alignment
\r
268 * Deletion ops will shorten length of result - and affect registration of alignment
\r
269 * Match ops will also affect length of result - affecting registration of alignment
\r
270 * (ie "10M".insert(4,I,3)->"4M3I3M") - (replace?)
\r
271 * (ie "10M".insert(4,D,3)->"4M3D3M") - (shortens alignment)
\r
272 * (ie "5I5M".insert(4,I,3)->"8I5M") - real insertion
\r
273 * (ie "5I5M".insert(4,D,3)->"4I2D3M") - shortens aligment - I's are removed, Ms changed to Ds
\r
274 * (ie "10M".insert(4,M,3)->"13M") - lengthens - Is changed to M, Ds changed to M.
\r
275 * (ie "5I5M".insert(4,M,3)->"4I8M") - effectively shifts sequence left by 1 residue and extends it by 3
\r
276 * ( "10D5M".insert(-1,M,3)->"3M7D5M")
\r
277 * ( "10D5M".insert(0,M,3)->"7D8M")
\r
278 * ( "10D5M".insert(1,M,3)->"10D8M")
\r
280 * ( "1M10D5M".insert(0,M,3)->"1M10D8M")
\r
281 * ( "1M10D5M".insert(1,M,3)->"
\r
283 * if pos is beyond width - I operations are added before the operation
\r
284 * @param pos int -1, 0-length of visible region, or greater to append new ops (with insertions in between)
\r
287 public void addOperationAt(int pos, char op, int range)
\r
289 int cursor = -1; // mark the position for the current operation being edited.
\r
291 boolean last_d = false; // previous op was a deletion.
\r
293 throw new Error("pos<-1 is not supported.");
\r
295 if (operation[o] != D)
\r
297 if ( (cursor + this.range[o]) < pos)
\r
299 cursor += this.range[o];
\r
314 // must insert more operations before pos
\r
316 addInsertion(pos-cursor);
\r
317 // then just add the new operation. Regardless of what it is.
\r
318 addOperation(op, range);
\r
320 int diff = pos - cursor;
\r
322 int e_length = length-o; // new edit operation array length.
\r
323 // diff<0 - can only happen before first insertion or match. - affects op and all following
\r
324 // dif==0 - only when at first position of existing op -
\r
325 // diff>0 - must preserve some existing operations
\r
326 int[] e_range = new int[e_length];
\r
327 System.arraycopy(this.range, o, e_range, 0, e_length);
\r
328 char[] e_op = new char[e_length];
\r
329 System.arraycopy(this.operation, o, e_op, 0, e_length);
\r
330 length = o; // can now use add_operation to extend list.
\r
331 int e_o=0; // current operation being edited.
\r
337 if (last_d && diff <= 0)
\r
339 // reduce D's, if possible
\r
340 if (range<=this.range[o-1]) {
\r
341 this.range[o - 1] -= range;
\r
345 if (this.range[o-1]==0)
\r
346 o--; // lose this op.
\r
348 e_range[e_o] += range; // just add more matched residues
\r
351 // change from insertion to match
\r
352 if (last_d && diff<=0)
\r
354 // reduce D's, if possible
\r
355 if (range<=this.range[o-1]) {
\r
356 this.range[o - 1] -= range;
\r
360 if (this.range[o-1]==0)
\r
361 o--; // lose this op.
\r
376 throw new Error("Implementation Error: Unknown operation in addOperation!");
\r
378 // finally, add remaining ops.
\r
379 while (e_o<e_length) {
\r
380 addOperation(e_op[e_o], e_range[e_o]);
\r
387 * Mark residues from start to end (inclusive) as deleted from the alignment, and removes any insertions.
\r
390 * @return deleted int - number of symbols marked as deleted
\r
392 public int deleteRange(int start, int end) {
\r
395 // nothing to do here
\r
398 if (start<0 || start>end)
\r
399 throw new Error("Implementation Error: deleteRange out of bounds: start must be non-negative and less than end.");
\r
401 int cursor = 0; // mark the position for the current operation being edited.
\r
402 int rlength=1+end-start; // number of positions to delete
\r
405 boolean editing=false;
\r
406 char[] oldops = operation;
\r
407 int[] oldrange = range;
\r
411 compact_operations();
\r
412 while (o<oldlen && cursor<=end && rlength>0) {
\r
413 if (oldops[o] == D) {
\r
414 // absorbed into new deleted region.
\r
415 addDeleted(oldrange[o++]);
\r
419 int remain = oldrange[o]; // number of op characters left to edit
\r
421 if ( (cursor + remain) <= start)
\r
423 addOperation(oldops[o],oldrange[o]);
\r
424 cursor+=oldrange[o++];
\r
425 continue; // next operation
\r
428 // add operations before hidden region
\r
429 if (start-cursor>0) {
\r
430 addOperation(oldops[o], start- cursor);
\r
431 remain -= start - cursor;
\r
434 // start inserting new ops
\r
435 if (o<oldlen && editing && rlength>0 && remain>0) {
\r
436 switch (oldops[o]) {
\r
438 if (rlength>remain) {
\r
439 addDeleted(remain);
\r
443 addDeleted(rlength);
\r
444 if (remain-rlength>0)
\r
445 this.addOperation(M,remain-rlength); // add remaining back.
\r
451 if (remain-rlength>0) {
\r
452 // only remove some gaps
\r
453 addInsertion(remain-rlength);
\r
458 throw new Error("Implementation error."); // do nothing;
\r
460 throw new Error("Implementation Error! Unknown operation '"+oldops[o]+"'");
\r
463 remain = oldrange[++o]; // number of op characters left to edit
\r
468 addOperation(oldops[o],oldrange[o++]);
\r
470 //if (cursor<(start+1)) {
\r
471 // ran out of ops - nothing to do here ?
\r
472 // addInsertion(start-cursor);
\r
478 * Deleted regions mean that there will be discontinuous sequence numbering in the
\r
479 * sequence returned by getSeq(char).
\r
480 * @return true if there deletions
\r
482 public boolean hasDeletedRegions()
\r
484 for (int i = 0; i<length ; i++)
\r
486 if (operation[i] == D)
\r
494 * enumerate the ranges on seq that are marked as deleted in this cigar
\r
495 * @return int[] { vis_start, sym_start, length }
\r
497 public int[] getDeletedRegions() {
\r
500 Vector dr = new Vector();
\r
501 int cursor=0, vcursor=0;
\r
502 for (int i=0;i<length;i++) {
\r
503 switch (operation[i]) {
\r
510 dr.addElement(new int[] { vcursor, cursor, range[i]});
\r
516 int[] delregions = new int[dr.size()*3];
\r
517 for (int i=0,l=dr.size(); i<l; i++) {
\r
518 int[] reg = (int[]) dr.elementAt(i);
\r
519 delregions[i*3] = reg[0];
\r
520 delregions[i*3+1] = reg[1];
\r
521 delregions[i*3+2] = reg[2];
\r
526 * sum of ranges in cigar string
\r
527 * @return int number of residues hidden, matched, or gaps inserted into sequence
\r
529 public int getFullWidth()
\r
534 for (int i = 0; i < length; i++)
\r
543 * Visible length of aligned sequence
\r
544 * @return int length of including gaps and less hidden regions
\r
546 public int getWidth()
\r
551 for (int i = 0; i < length; i++)
\r
553 if (operation[i] == M || operation[i] == I)
\r
562 * mark a range of inserted residues
\r
565 public void addInsertion(int range)
\r
567 this.addOperation(I, range);
\r
571 * mark the next range residues as hidden (not aligned) or deleted
\r
574 public void addDeleted(int range)
\r
576 this.addOperation(D, range);
\r
580 * Modifies operation list to delete columns from start to end (inclusive)
\r
581 * editing will remove insertion operations, and convert matches to deletions
\r
582 * @param start alignment column
\r
583 * @param end alignment column
\r
584 * @return boolean true if residues were marked as deleted.
\r
585 public boolean deleteRange(int start, int end)
\r
587 boolean deleted = false;
\r
588 int op = 0, prevop = -1, firstm = -1,
\r
589 lastm = -1, postop = -1;
\r
590 int width = 0; // zero'th column
\r
593 // find operation bracketing start of the range
\r
596 if (operation[op] != D)
\r
598 width += range[prevop = op];
\r
602 while (op < length && width < start);
\r
606 // run off end - add more operations up to deletion.
\r
607 addInsertion(start - width);
\r
611 // edit existing operations.
\r
613 width -= range[prevop];
\r
614 int[] oldrange = range;
\r
615 char[] oldops = operation;
\r
616 range = new int[oldrange.length];
\r
617 operation = new char[oldops.length];
\r
622 if (operation[op] != D)
\r
624 width += range[postop = op];
\r
628 while (op < length && width <= end);
\r
631 if (deleted == true)
\r
633 addDeleted(end - start + 1);
\r
639 * Return an ENSEMBL style cigar string where D may indicates excluded parts of seq
\r
640 * @return String of form ([0-9]+[IMD])+
\r
642 public String getCigarstring()
\r
644 StringBuffer cigarString = new StringBuffer();
\r
645 for (int i = 0; i < length; i++)
\r
647 cigarString.append("" + range[i]);
\r
648 cigarString.append(operation[i]);
\r
650 return cigarString.toString();
\r