--- /dev/null
+package jalview.datamodel;
+
+public abstract class CigarBase
+{
+ /**
+ * Base class for compact idiosyncratic representation of gaps and aligned residues
+ * Regards to Tom Oldfield for his DynamicArray class.
+ * 17th July 2006
+ * Not thread safe.
+ */
+ public CigarBase()
+ {
+ // nothing to be done (probably)
+ }
+
+ protected int length = 0;
+ protected int _inc_length = 10; // extension range for addition of new operations
+ protected char[] operation = null;
+ protected int[] range = null;
+ /**
+ * Range of Hidden residues in seq (translated as deleted in seq)
+ */
+ public static final char D = 'D'; /**
+ * Range of insertions to seq
+ */
+ public static final char I = 'I'; /**
+ * Range of aligned residues
+ */
+ public static final char M = 'M';
+ static protected final char _case_shift = 'a' - 'A';
+ /**
+ * Ugly function to get edited sequence string, start and end symbol positions and the deletion regions as an array of int pairs
+ * May return null for an empty cigar string.
+ * May return null for deletion ranges if there are none.
+ * @param reference - the symbol sequence to apply the cigar operations to (or null if no sequence)
+ * @param GapChar - the symbol to use for Insert operations
+ * @return Object[] { String, int[] {start, startcol, end, endcol}, int[][3] {start, end, col} or null} the gapped sequence, first and last residue index, and the deletion ranges on the reference sequence
+ */
+ public Object[] getSequenceAndDeletions(String reference, char GapChar)
+ {
+ int rlength=0;
+ int[][] deletions = new int[length][];
+ int[][] trunc_deletions = null;
+ StringBuffer sq = new StringBuffer();
+ int cursor = 0, alcursor=0,start = 0, startpos=0, end = 0, endpos=0, delcount = -1;
+ boolean consecutive_del = false;
+ if (length == 0)
+ {
+ return null;
+ }
+ if (reference!=null)
+ rlength=reference.length();
+ boolean modstart = true;
+ for (int i = 0; i < length; i++)
+ {
+ switch (operation[i])
+ {
+ case D:
+ if (!consecutive_del)
+ {
+ deletions[++delcount] = new int[]
+ {
+ cursor, 0, alcursor};
+ }
+ cursor += range[i];
+ deletions[delcount][1] = cursor - 1;
+ consecutive_del = true;
+ break;
+ case I:
+ consecutive_del = false;
+ for (int r = 0; r < range[i]; r++)
+ {
+ sq.append(GapChar);
+ alcursor++;
+ }
+ break;
+ case M:
+ consecutive_del = false;
+ if (modstart)
+ {
+ start = cursor;
+ startpos=alcursor;
+ modstart = false;
+ }
+ if (reference!=null) {
+ int sbend = cursor+range[i];
+ if (sbend>=rlength) {
+ sq.append(reference.substring(cursor, rlength));
+ while (sbend-- >= rlength)
+ {
+ sq.append(GapChar);
+ }
+ } else {
+ sq.append(reference.substring(cursor, sbend));
+ }
+ }
+ alcursor+=range[i];
+ cursor += range[i];
+ end = cursor - 1;
+ endpos = alcursor;
+ break;
+ default:
+ throw new Error("Unknown SeqCigar operation '" + operation[i] + "'");
+ }
+ }
+ if (++delcount > 0)
+ {
+ trunc_deletions = new int[delcount][];
+ System.arraycopy(deletions, 0, trunc_deletions, 0, delcount);
+ }
+ deletions = null;
+ return new Object[]
+ {
+ ((reference!=null) ? sq.toString() : null),
+ new int[] {
+ start, startpos, end, endpos}, trunc_deletions};
+ }
+
+ /**
+ * turn a cigar string into a series of operation range pairs
+ * @param cigarString String
+ * @return object[] {char[] operation, int[] range}
+ * @throws java.lang.Exception for improperly formated cigar strings or ones with unknown operations
+ */
+ public static Object[] parseCigarString(String cigarString)
+ throws Exception
+ {
+ int ops = 0;
+ for (int i = 0, l = cigarString.length(); i < l; i++)
+ {
+ char c = cigarString.charAt(i);
+ if (c == M || c == (M - _case_shift) || c == I || c == (I - _case_shift) ||
+ c == D || c == (D - _case_shift))
+ {
+ ops++;
+ }
+ }
+ char[] operation = new char[ops];
+ int[] range = new int[ops];
+ int op = 0;
+ int i = 0, l = cigarString.length();
+ while (i < l)
+ {
+ char c;
+ int j = i;
+ do
+ {
+ c = cigarString.charAt(j++);
+ }
+ while (c >= '0' && c <= '9' && j < l);
+ if (j >= l && c >= '0' && c <= '9')
+ {
+ throw new Exception("Unterminated cigar string.");
+ }
+ try
+ {
+ String rangeint = cigarString.substring(i, j - 1);
+ range[op] = Integer.parseInt(rangeint);
+ i = j;
+ }
+ catch (Exception e)
+ {
+ throw new Error("Implementation bug in parseCigarString");
+ }
+ if (c >= 'a' && c <= 'z')
+ {
+ c -= _case_shift;
+ }
+ if ( (c == M || c == I || c == D))
+ {
+ operation[op++] = c;
+ }
+ else
+ {
+ throw new Exception("Unexpected operation '" + c +
+ "' in cigar string (position " + i + " in '" +
+ cigarString + "'");
+ }
+ }
+ return new Object[]
+ {
+ operation, range};
+ }
+
+ /**
+ * inefficient add one operation to cigar string
+ * @param op char
+ * @param range int
+ */
+ public void addOperation(char op, int range)
+ {
+ if (op >= 'a' && op <= 'z')
+ {
+ op -= _case_shift;
+ }
+ if (op != M && op != D && op != I)
+ {
+ throw new Error("Implementation error. Invalid operation string.");
+ }
+ if (range<=0)
+ throw new Error("Invalid range string (must be non-zero positive number)");
+ int lngth = 0;
+ if (operation == null)
+ {
+ this.operation = new char[_inc_length];
+ this.range = new int[_inc_length];
+ }
+ if (length + 1 == operation.length)
+ {
+ char[] ops = this.operation;
+ this.operation = new char[length + 1 + _inc_length];
+ System.arraycopy(ops, 0, this.operation, 0, length);
+ ops = null;
+ int[] rng = this.range;
+ this.range = new int[length + 1 + _inc_length];
+ System.arraycopy(rng, 0, this.range, 0, length);
+ rng = null;
+ }
+ if ((length>0) && (operation[length-1]==op))
+ length--; // modify existing operation.
+ else
+ this.range[length]=0; // reset range
+ this.operation[length] = op;
+ this.range[length++] += range;
+ }
+
+ /**
+ * semi-efficient insert an operation on the current cigar string set at column pos (from 1)
+ * NOTE: Insertion operations simply extend width of cigar result - affecting registration of alignment
+ * Deletion ops will shorten length of result - and affect registration of alignment
+ * Match ops will also affect length of result - affecting registration of alignment
+ * (ie "10M".insert(4,I,3)->"4M3I3M")
+ * (ie "10M".insert(4,D,3)->"4M3D3M")
+ * (ie "5I5M".insert(4,I,3)->"8I5M")
+ * (ie "5I5M".insert(4,D,3)->"4I3M")
+ * if pos is beyond width - I operations are added before the operation
+ * (ie "10M".insert(4,M,3)->"13M")
+ * (ie "5I5M".insert(4,M,3)->"4I8M") - effectively shifts sequence left by 1 residue and extends it by 3
+ * @param pos int
+ * @param op char
+ * @param range int
+ */
+ public void addOperationAt(int pos, char op, int range)
+ {
+
+ }
+
+ /**
+ * sum of ranges in cigar string
+ * @return int number of residues hidden, matched, or gaps inserted into sequence
+ */
+ public int getFullWidth()
+ {
+ int w = 0;
+ if (range != null)
+ {
+ for (int i = 0; i < length; i++)
+ {
+ w += range[i];
+ }
+ }
+ return w;
+ }
+
+ /**
+ * Visible length of aligned sequence
+ * @return int length of including gaps and less hidden regions
+ */
+ public int getWidth()
+ {
+ int w = 0;
+ if (range != null)
+ {
+ for (int i = 0; i < length; i++)
+ {
+ if (operation[i] == M || operation[i] == I)
+ {
+ w += range[i];
+ }
+ }
+ }
+ return w;
+ }
+ /**
+ * mark a range of inserted residues
+ * @param range int
+ */
+ public void addInsertion(int range)
+ {
+ this.addOperation(I, range);
+ }
+
+ /**
+ * mark the next range residues as hidden (not aligned) or deleted
+ * @param range int
+ */
+ public void addDeleted(int range)
+ {
+ this.addOperation(D, range);
+ }
+
+ /**
+ * Modifies operation list to delete columns from start to end (inclusive)
+ * editing will remove insertion operations, and convert matches to deletions
+ * @param start alignment column
+ * @param end alignment column
+ * @return boolean true if residues were marked as deleted.
+ */
+ public boolean deleteRange(int start, int end)
+ {
+ boolean deleted = false;
+ int op = 0, prevop = -1, firstm = -1,
+ lastm = -1, postop = -1;
+ int width = 0; // zero'th column
+ if (length > 0)
+ {
+ // find operation bracketing start of the range
+ do
+ {
+ if (operation[op] != D)
+ {
+ width += range[prevop = op];
+ }
+ op++;
+ }
+ while (op < length && width < start);
+ }
+ if (width < start)
+ {
+ // run off end - add more operations up to deletion.
+ addInsertion(start - width);
+ }
+ else
+ {
+ // edit existing operations.
+ op = prevop;
+ width -= range[prevop];
+ int[] oldrange = range;
+ char[] oldops = operation;
+ range = new int[oldrange.length];
+ operation = new char[oldops.length];
+ if (op < length)
+ {
+ do
+ {
+ if (operation[op] != D)
+ {
+ width += range[postop = op];
+ }
+ op++;
+ }
+ while (op < length && width <= end);
+ }
+ }
+ if (deleted == true)
+ {
+ addDeleted(end - start + 1);
+ }
+ return deleted;
+ }
+
+ /**
+ * Return an ENSEMBL style cigar string where D may indicates excluded parts of seq
+ * @return String of form ([0-9]+[IMD])+
+ */
+ public String getCigarstring()
+ {
+ StringBuffer cigarString = new StringBuffer();
+ for (int i = 0; i < length; i++)
+ {
+ cigarString.append("" + range[i]);
+ cigarString.append(operation[i]);
+ }
+ return cigarString.toString();
+ }
+}
--- /dev/null
+package jalview.datamodel;
+
+import jalview.analysis.AlignSeq;
+
+public class SeqCigar
+ extends CigarSimple
+{
+
+ private SequenceI refseq=null;
+ /**
+ * Reference dataset sequence for the cigar string
+ * @return SequenceI
+ */
+ public SequenceI getRefSeq() {
+ return refseq;
+ }
+ /**
+ * Returns sequence as a string with cigar operations applied to it
+ * @return String
+ */
+ public String getSequenceString(char GapChar)
+ {
+ return (length==0) ? "" : (String) getSequenceAndDeletions(refseq.getSequence(), GapChar)[0];
+ }
+
+ /**
+ * recreates a gapped and edited version of RefSeq or null for an empty cigar string
+ * @return SequenceI
+ */
+ public SequenceI getSeq(char GapChar) {
+ Sequence seq;
+ if (refseq==null || length==0)
+ return null;
+ Object[] edit_result=getSequenceAndDeletions(refseq.getSequence(), GapChar);
+ if (edit_result==null)
+ throw new Error("Implementation Error - unexpected null from getSequenceAndDeletions");
+
+ seq = new Sequence(refseq.getName(), (String) edit_result[0], refseq.getStart()+((int[]) edit_result[1])[0], refseq.getStart()+((int[]) edit_result[1])[2]);
+ seq.setDatasetSequence(refseq);
+ return seq;
+ }
+ /*
+ We don't allow this - refseq is given at construction time only
+ public void setSeq(SequenceI seq) {
+ this.seq = seq;
+ }
+ */
+ /**
+ * internal constructor - sets seq to a gapless sequence derived from seq
+ * and prepends any 'D' operations needed to get to the first residue of seq.
+ * @param seq SequenceI
+ * @return true if gaps are present in seq
+ */
+ private boolean _setSeq(SequenceI seq) {
+ boolean hasgaps=false;
+
+ if (seq==null)
+ throw new Error("Implementation Error - _setSeq(null)");
+
+ // Find correct sequence to reference and add initial hidden offset
+ SequenceI ds = seq.getDatasetSequence();
+ if (ds==null) {
+ ds = new Sequence(seq.getName(),
+ AlignSeq.extractGaps(jalview.util.Comparison.GapChars, new String(seq.getSequence())),
+ seq.getStart(),
+ seq.getEnd());
+ }
+ // check that we haven't just duplicated an ungapped sequence.
+ if (ds.getLength()==seq.getLength()) {
+ ds = seq;
+ } else {
+ hasgaps = true;
+ }
+ this.refseq = ds;
+ // Adjust offset
+ if (ds.getStart()<seq.getStart()) {
+ addDeleted(seq.getStart()-ds.getStart());
+ }
+ return hasgaps;
+ }
+ /**
+ * directly initialise a cigar object with a sequence of range, operation pairs and a sequence to apply it to.
+ * operation and range should be relative to the seq.getStart()'th residue of the dataset seq resolved from seq.
+ * @param seq SequenceI
+ * @param operation char[]
+ * @param range int[]
+ */
+ public SeqCigar(SequenceI seq, char operation[], int range[]) {
+ super();
+ if (seq==null)
+ throw new Error("Implementation Bug. Null seq !");
+ if (operation.length!=range.length) {
+ throw new Error("Implementation Bug. Cigar Operation list!= range list");
+ }
+
+ if (operation!=null) {
+ this.operation = new char[operation.length+_inc_length];
+ this.range = new int[operation.length+_inc_length];
+
+ if (_setSeq(seq)) {
+ throw new Error("NOT YET Implemented: Constructing a Cigar object from a cigar string and a gapped sequence.");
+ }
+ for (int i = this.length, j=0; j < operation.length; i++,j++)
+ {
+ char op = operation[j];
+ if (op != M && op != I && op != D)
+ {
+ throw new Error(
+ "Implementation Bug. Cigar Operation '"+j+"' '"+op+"' not one of '"+M+"', '"+I+"', or '"+D+"'.");
+ }
+ this.operation[i] = op;
+ this.range[i] = range[j];
+ }
+ this.length+=operation.length;
+ } else {
+ this.operation = null;
+ this.range = null;
+ this.length=0;
+ if (_setSeq(seq)) {
+ throw new Error("NOT YET Implemented: Constructing a Cigar object from a cigar string and a gapped sequence.");
+ }
+ }
+ }
+ /**
+ * add range matched residues to cigar string
+ * @param range int
+ */
+ public void addMatch(int range) {
+ this.addOperation(M, range);
+ }
+ /**
+ * Deleted regions mean that there will be discontinuous sequence numbering in the
+ * sequence returned by getSeq(char).
+ * @return true if there are non-terminal deletions
+ */
+ public boolean hasDeletedRegions() {
+ for (int i=1, l=length-1; i<l; i++) {
+ if (operation[i]==D)
+ return true;
+ }
+ return false;
+ }
+ protected static void addSequenceOps(CigarBase cigar, SequenceI seq, int startpos, int endpos) {
+ char op = '\0';
+ int range = 0;
+ int p = 0, res = seq.getLength();
+ startpos++;
+ endpos++;
+ while (p<res)
+ {
+ boolean isGap = jalview.util.Comparison.isGap(seq.getCharAt(p++));
+ if ( (startpos <= p) && (p < endpos))
+ {
+ if (isGap)
+ {
+ if (range > 0 && op != I)
+ {
+ cigar.addOperation(op, range);
+ range = 0;
+ }
+ op = I;
+ range++;
+ }
+ else
+ {
+ if (range > 0 && op != M)
+ {
+ cigar.addOperation(op, range);
+ range = 0;
+ }
+ op = M;
+ range++;
+ }
+ }
+ else
+ {
+ if (!isGap)
+ {
+ if (range > 0 && op != D)
+ {
+ cigar.addOperation(op, range);
+ range = 0;
+ }
+ op = D;
+ range++;
+ }
+ else
+ {
+ // do nothing - insertions are not recorded in flanking regions.
+ }
+ }
+ }
+ if (range > 0)
+ {
+ cigar.addOperation(op, range);
+ }
+ }
+ /**
+ * create a cigar string for given sequence
+ * @param seq SequenceI
+ */
+ public SeqCigar(SequenceI seq) {
+ super();
+ if (seq == null)
+ throw new Error("Implementation error for new Cigar(SequenceI)");
+ if (_setSeq(seq))
+ {
+ // there is still work to do
+ addSequenceOps(this, seq, 0, seq.getLength());
+ }
+ }
+ public SeqCigar(SequenceI seq, int start, int end) {
+ super();
+ if (seq == null)
+ throw new Error("Implementation error for new Cigar(SequenceI)");
+ if (_setSeq(seq))
+ {
+ // there is still work to do
+ addSequenceOps(this, seq, start, end);
+ }
+ }
+
+ /**
+ * Create a cigar object from a cigar string like '[<I|D|M><range>]+'
+ * Will fail if the given seq already contains gaps (JBPNote: future implementation will fix)
+ * @param seq SequenceI object resolvable to a dataset sequence
+ * @param cigarString String
+ * @return Cigar
+ */
+ public static SeqCigar parseCigar(SequenceI seq, String cigarString)
+ throws Exception
+ {
+ Object[] opsandrange = parseCigarString(cigarString);
+ return new SeqCigar(seq, (char[]) opsandrange[0], (int[]) opsandrange[1]);
+ }
+ /**
+ * non rigorous testing
+ */
+ /**
+ *
+ * @param seq Sequence
+ * @param ex_cs_gapped String
+ * @return String
+ */
+ public static String testCigar_string(Sequence seq, String ex_cs_gapped) {
+ SeqCigar c_sgapped = new SeqCigar(seq);
+ String cs_gapped = c_sgapped.getCigarstring();
+ if (!cs_gapped.equals(ex_cs_gapped))
+ System.err.println("Failed getCigarstring: incorect string '"+cs_gapped+"' != "+ex_cs_gapped);
+ return cs_gapped;
+ }
+ public static boolean testSeqRecovery(SeqCigar gen_sgapped, SequenceI s_gapped) {
+ SequenceI gen_sgapped_s = gen_sgapped.getSeq('-');
+ if (!gen_sgapped_s.getSequence().equals(s_gapped.getSequence())) {
+ System.err.println("Couldn't reconstruct sequence.\n" +
+ gen_sgapped_s.getSequence() + "\n" +
+ s_gapped.getSequence());
+ return false;
+ }
+ return true;
+ }
+ public static void main(String argv[]) throws Exception {
+ Sequence s=new Sequence("MySeq", "asdfktryasdtqwrtsaslldddptyipqqwaslchvhttt",39,80);
+ String orig_gapped;
+ Sequence s_gapped=new Sequence("MySeq", orig_gapped="----asdf------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhttt", 39,80);
+ String ex_cs_gapped="4I4M6I6M3I11M4I12M4I9M";
+ s_gapped.setDatasetSequence(s);
+ String sub_gapped_s;
+ Sequence s_subsequence_gapped=new Sequence("MySeq", sub_gapped_s="------ktryas---dtqwrtsasll----dddptyipqqwa----slchvh", 43,77);
+
+ s_subsequence_gapped.setDatasetSequence(s);
+ SeqCigar c_null = new SeqCigar(s);
+ String cs_null = c_null.getCigarstring();
+ if (cs_null.length()>0)
+ System.err.println("Failed getCigarstring: Unexpected cigar operations:"+cs_null);
+ testCigar_string(s_gapped, ex_cs_gapped);
+ SeqCigar gen_sgapped = SeqCigar.parseCigar(s, ex_cs_gapped);
+ if (!gen_sgapped.getCigarstring().equals(ex_cs_gapped))
+ System.err.println("Failed parseCigar("+ex_cs_gapped+")->getCigarString()->'"+gen_sgapped.getCigarstring()+"'");
+ testSeqRecovery(gen_sgapped, s_gapped);
+ // Test dataset resolution
+ SeqCigar sub_gapped = new SeqCigar(s_subsequence_gapped);
+ if (!testSeqRecovery(sub_gapped, s_subsequence_gapped))
+ System.err.println("Failed recovery for subsequence of dataset sequence");
+ // width functions
+ if (sub_gapped.getWidth()!=sub_gapped_s.length())
+ System.err.println("Failed getWidth()");
+
+ sub_gapped.getFullWidth();
+ if (sub_gapped.hasDeletedRegions())
+ System.err.println("hasDeletedRegions is incorrect.");
+ // Test start-end region SeqCigar
+ SeqCigar sub_se_gp= new SeqCigar(s_subsequence_gapped, 8, 48);
+ if (sub_se_gp.getWidth()!=40)
+ System.err.println("SeqCigar(seq, start, end) not properly clipped alignsequence.");
+ System.out.println("Original sequence align:\n"+sub_gapped_s+"\nReconstructed window from 8 to 48\n"+"XXXXXXXX"+sub_se_gp.getSequenceString('-')+"...."+"\nCigar String:"+sub_se_gp.getCigarstring()+"");
+ }
+
+}