From 60508bc218cee42c6fa3405db19f7790acafabab Mon Sep 17 00:00:00 2001 From: jprocter Date: Wed, 26 Jul 2006 13:45:15 +0000 Subject: [PATCH] Cigars for representing alignment view as used for calculation. --- src/jalview/datamodel/CigarBase.java | 376 ++++++++++++++++++++++++++++++++ src/jalview/datamodel/CigarCigar.java | 32 +++ src/jalview/datamodel/CigarSimple.java | 24 ++ src/jalview/datamodel/SeqCigar.java | 299 +++++++++++++++++++++++++ 4 files changed, 731 insertions(+) create mode 100644 src/jalview/datamodel/CigarBase.java create mode 100644 src/jalview/datamodel/CigarCigar.java create mode 100644 src/jalview/datamodel/CigarSimple.java create mode 100644 src/jalview/datamodel/SeqCigar.java diff --git a/src/jalview/datamodel/CigarBase.java b/src/jalview/datamodel/CigarBase.java new file mode 100644 index 0000000..2306327 --- /dev/null +++ b/src/jalview/datamodel/CigarBase.java @@ -0,0 +1,376 @@ +package jalview.datamodel; + +public abstract class CigarBase +{ + /** + * Base class for compact idiosyncratic representation of gaps and aligned residues + * Regards to Tom Oldfield for his DynamicArray class. + * 17th July 2006 + * Not thread safe. + */ + public CigarBase() + { + // nothing to be done (probably) + } + + protected int length = 0; + protected int _inc_length = 10; // extension range for addition of new operations + protected char[] operation = null; + protected int[] range = null; + /** + * Range of Hidden residues in seq (translated as deleted in seq) + */ + public static final char D = 'D'; /** + * Range of insertions to seq + */ + public static final char I = 'I'; /** + * Range of aligned residues + */ + public static final char M = 'M'; + static protected final char _case_shift = 'a' - 'A'; + /** + * Ugly function to get edited sequence string, start and end symbol positions and the deletion regions as an array of int pairs + * May return null for an empty cigar string. + * May return null for deletion ranges if there are none. + * @param reference - the symbol sequence to apply the cigar operations to (or null if no sequence) + * @param GapChar - the symbol to use for Insert operations + * @return Object[] { String, int[] {start, startcol, end, endcol}, int[][3] {start, end, col} or null} the gapped sequence, first and last residue index, and the deletion ranges on the reference sequence + */ + public Object[] getSequenceAndDeletions(String reference, char GapChar) + { + int rlength=0; + int[][] deletions = new int[length][]; + int[][] trunc_deletions = null; + StringBuffer sq = new StringBuffer(); + int cursor = 0, alcursor=0,start = 0, startpos=0, end = 0, endpos=0, delcount = -1; + boolean consecutive_del = false; + if (length == 0) + { + return null; + } + if (reference!=null) + rlength=reference.length(); + boolean modstart = true; + for (int i = 0; i < length; i++) + { + switch (operation[i]) + { + case D: + if (!consecutive_del) + { + deletions[++delcount] = new int[] + { + cursor, 0, alcursor}; + } + cursor += range[i]; + deletions[delcount][1] = cursor - 1; + consecutive_del = true; + break; + case I: + consecutive_del = false; + for (int r = 0; r < range[i]; r++) + { + sq.append(GapChar); + alcursor++; + } + break; + case M: + consecutive_del = false; + if (modstart) + { + start = cursor; + startpos=alcursor; + modstart = false; + } + if (reference!=null) { + int sbend = cursor+range[i]; + if (sbend>=rlength) { + sq.append(reference.substring(cursor, rlength)); + while (sbend-- >= rlength) + { + sq.append(GapChar); + } + } else { + sq.append(reference.substring(cursor, sbend)); + } + } + alcursor+=range[i]; + cursor += range[i]; + end = cursor - 1; + endpos = alcursor; + break; + default: + throw new Error("Unknown SeqCigar operation '" + operation[i] + "'"); + } + } + if (++delcount > 0) + { + trunc_deletions = new int[delcount][]; + System.arraycopy(deletions, 0, trunc_deletions, 0, delcount); + } + deletions = null; + return new Object[] + { + ((reference!=null) ? sq.toString() : null), + new int[] { + start, startpos, end, endpos}, trunc_deletions}; + } + + /** + * turn a cigar string into a series of operation range pairs + * @param cigarString String + * @return object[] {char[] operation, int[] range} + * @throws java.lang.Exception for improperly formated cigar strings or ones with unknown operations + */ + public static Object[] parseCigarString(String cigarString) + throws Exception + { + int ops = 0; + for (int i = 0, l = cigarString.length(); i < l; i++) + { + char c = cigarString.charAt(i); + if (c == M || c == (M - _case_shift) || c == I || c == (I - _case_shift) || + c == D || c == (D - _case_shift)) + { + ops++; + } + } + char[] operation = new char[ops]; + int[] range = new int[ops]; + int op = 0; + int i = 0, l = cigarString.length(); + while (i < l) + { + char c; + int j = i; + do + { + c = cigarString.charAt(j++); + } + while (c >= '0' && c <= '9' && j < l); + if (j >= l && c >= '0' && c <= '9') + { + throw new Exception("Unterminated cigar string."); + } + try + { + String rangeint = cigarString.substring(i, j - 1); + range[op] = Integer.parseInt(rangeint); + i = j; + } + catch (Exception e) + { + throw new Error("Implementation bug in parseCigarString"); + } + if (c >= 'a' && c <= 'z') + { + c -= _case_shift; + } + if ( (c == M || c == I || c == D)) + { + operation[op++] = c; + } + else + { + throw new Exception("Unexpected operation '" + c + + "' in cigar string (position " + i + " in '" + + cigarString + "'"); + } + } + return new Object[] + { + operation, range}; + } + + /** + * inefficient add one operation to cigar string + * @param op char + * @param range int + */ + public void addOperation(char op, int range) + { + if (op >= 'a' && op <= 'z') + { + op -= _case_shift; + } + if (op != M && op != D && op != I) + { + throw new Error("Implementation error. Invalid operation string."); + } + if (range<=0) + throw new Error("Invalid range string (must be non-zero positive number)"); + int lngth = 0; + if (operation == null) + { + this.operation = new char[_inc_length]; + this.range = new int[_inc_length]; + } + if (length + 1 == operation.length) + { + char[] ops = this.operation; + this.operation = new char[length + 1 + _inc_length]; + System.arraycopy(ops, 0, this.operation, 0, length); + ops = null; + int[] rng = this.range; + this.range = new int[length + 1 + _inc_length]; + System.arraycopy(rng, 0, this.range, 0, length); + rng = null; + } + if ((length>0) && (operation[length-1]==op)) + length--; // modify existing operation. + else + this.range[length]=0; // reset range + this.operation[length] = op; + this.range[length++] += range; + } + + /** + * semi-efficient insert an operation on the current cigar string set at column pos (from 1) + * NOTE: Insertion operations simply extend width of cigar result - affecting registration of alignment + * Deletion ops will shorten length of result - and affect registration of alignment + * Match ops will also affect length of result - affecting registration of alignment + * (ie "10M".insert(4,I,3)->"4M3I3M") + * (ie "10M".insert(4,D,3)->"4M3D3M") + * (ie "5I5M".insert(4,I,3)->"8I5M") + * (ie "5I5M".insert(4,D,3)->"4I3M") + * if pos is beyond width - I operations are added before the operation + * (ie "10M".insert(4,M,3)->"13M") + * (ie "5I5M".insert(4,M,3)->"4I8M") - effectively shifts sequence left by 1 residue and extends it by 3 + * @param pos int + * @param op char + * @param range int + */ + public void addOperationAt(int pos, char op, int range) + { + + } + + /** + * sum of ranges in cigar string + * @return int number of residues hidden, matched, or gaps inserted into sequence + */ + public int getFullWidth() + { + int w = 0; + if (range != null) + { + for (int i = 0; i < length; i++) + { + w += range[i]; + } + } + return w; + } + + /** + * Visible length of aligned sequence + * @return int length of including gaps and less hidden regions + */ + public int getWidth() + { + int w = 0; + if (range != null) + { + for (int i = 0; i < length; i++) + { + if (operation[i] == M || operation[i] == I) + { + w += range[i]; + } + } + } + return w; + } + /** + * mark a range of inserted residues + * @param range int + */ + public void addInsertion(int range) + { + this.addOperation(I, range); + } + + /** + * mark the next range residues as hidden (not aligned) or deleted + * @param range int + */ + public void addDeleted(int range) + { + this.addOperation(D, range); + } + + /** + * Modifies operation list to delete columns from start to end (inclusive) + * editing will remove insertion operations, and convert matches to deletions + * @param start alignment column + * @param end alignment column + * @return boolean true if residues were marked as deleted. + */ + public boolean deleteRange(int start, int end) + { + boolean deleted = false; + int op = 0, prevop = -1, firstm = -1, + lastm = -1, postop = -1; + int width = 0; // zero'th column + if (length > 0) + { + // find operation bracketing start of the range + do + { + if (operation[op] != D) + { + width += range[prevop = op]; + } + op++; + } + while (op < length && width < start); + } + if (width < start) + { + // run off end - add more operations up to deletion. + addInsertion(start - width); + } + else + { + // edit existing operations. + op = prevop; + width -= range[prevop]; + int[] oldrange = range; + char[] oldops = operation; + range = new int[oldrange.length]; + operation = new char[oldops.length]; + if (op < length) + { + do + { + if (operation[op] != D) + { + width += range[postop = op]; + } + op++; + } + while (op < length && width <= end); + } + } + if (deleted == true) + { + addDeleted(end - start + 1); + } + return deleted; + } + + /** + * Return an ENSEMBL style cigar string where D may indicates excluded parts of seq + * @return String of form ([0-9]+[IMD])+ + */ + public String getCigarstring() + { + StringBuffer cigarString = new StringBuffer(); + for (int i = 0; i < length; i++) + { + cigarString.append("" + range[i]); + cigarString.append(operation[i]); + } + return cigarString.toString(); + } +} diff --git a/src/jalview/datamodel/CigarCigar.java b/src/jalview/datamodel/CigarCigar.java new file mode 100644 index 0000000..5f8f78b --- /dev/null +++ b/src/jalview/datamodel/CigarCigar.java @@ -0,0 +1,32 @@ +package jalview.datamodel; + +public class CigarCigar + extends CigarSimple +{ + SeqCigar refCigar; + /** + * Apply CIGAR operations to the result of another cigar + * @param cigar Cigar + */ + CigarCigar(SeqCigar cigar) { + super(); + refCigar = cigar; + } + /** + * + * @return String formed by applying CIGAR operations to the reference object + * @param GapChar char + * @todo Implement this jalview.datamodel.Cigar method + */ + public String getSequenceString(char GapChar) + { + if (length==0) + return ""; + String refString = refCigar.getSequenceString(GapChar); + if (refString!=null) { + return (length==0) ? "" : (String) getSequenceAndDeletions(refString, GapChar)[0]; + } else + return null; + } + +} diff --git a/src/jalview/datamodel/CigarSimple.java b/src/jalview/datamodel/CigarSimple.java new file mode 100644 index 0000000..4db24a6 --- /dev/null +++ b/src/jalview/datamodel/CigarSimple.java @@ -0,0 +1,24 @@ +package jalview.datamodel; + +/** + *

Title:

+ * + *

Description:

+ * + *

Copyright: Copyright (c) 2004

+ * + *

Company: Dundee University

+ * + * @author not attributable + * @version 1.0 + */ +public abstract class CigarSimple + extends CigarBase +{ + /** + * Return a symbol sequence with edits (gaps, insertions and deletions) applied + * @param GapChar char + * @return String + */ + public abstract String getSequenceString(char GapChar); +} diff --git a/src/jalview/datamodel/SeqCigar.java b/src/jalview/datamodel/SeqCigar.java new file mode 100644 index 0000000..536e4ea --- /dev/null +++ b/src/jalview/datamodel/SeqCigar.java @@ -0,0 +1,299 @@ +package jalview.datamodel; + +import jalview.analysis.AlignSeq; + +public class SeqCigar + extends CigarSimple +{ + + private SequenceI refseq=null; + /** + * Reference dataset sequence for the cigar string + * @return SequenceI + */ + public SequenceI getRefSeq() { + return refseq; + } + /** + * Returns sequence as a string with cigar operations applied to it + * @return String + */ + public String getSequenceString(char GapChar) + { + return (length==0) ? "" : (String) getSequenceAndDeletions(refseq.getSequence(), GapChar)[0]; + } + + /** + * recreates a gapped and edited version of RefSeq or null for an empty cigar string + * @return SequenceI + */ + public SequenceI getSeq(char GapChar) { + Sequence seq; + if (refseq==null || length==0) + return null; + Object[] edit_result=getSequenceAndDeletions(refseq.getSequence(), GapChar); + if (edit_result==null) + throw new Error("Implementation Error - unexpected null from getSequenceAndDeletions"); + + seq = new Sequence(refseq.getName(), (String) edit_result[0], refseq.getStart()+((int[]) edit_result[1])[0], refseq.getStart()+((int[]) edit_result[1])[2]); + seq.setDatasetSequence(refseq); + return seq; + } + /* + We don't allow this - refseq is given at construction time only + public void setSeq(SequenceI seq) { + this.seq = seq; + } + */ + /** + * internal constructor - sets seq to a gapless sequence derived from seq + * and prepends any 'D' operations needed to get to the first residue of seq. + * @param seq SequenceI + * @return true if gaps are present in seq + */ + private boolean _setSeq(SequenceI seq) { + boolean hasgaps=false; + + if (seq==null) + throw new Error("Implementation Error - _setSeq(null)"); + + // Find correct sequence to reference and add initial hidden offset + SequenceI ds = seq.getDatasetSequence(); + if (ds==null) { + ds = new Sequence(seq.getName(), + AlignSeq.extractGaps(jalview.util.Comparison.GapChars, new String(seq.getSequence())), + seq.getStart(), + seq.getEnd()); + } + // check that we haven't just duplicated an ungapped sequence. + if (ds.getLength()==seq.getLength()) { + ds = seq; + } else { + hasgaps = true; + } + this.refseq = ds; + // Adjust offset + if (ds.getStart() 0 && op != I) + { + cigar.addOperation(op, range); + range = 0; + } + op = I; + range++; + } + else + { + if (range > 0 && op != M) + { + cigar.addOperation(op, range); + range = 0; + } + op = M; + range++; + } + } + else + { + if (!isGap) + { + if (range > 0 && op != D) + { + cigar.addOperation(op, range); + range = 0; + } + op = D; + range++; + } + else + { + // do nothing - insertions are not recorded in flanking regions. + } + } + } + if (range > 0) + { + cigar.addOperation(op, range); + } + } + /** + * create a cigar string for given sequence + * @param seq SequenceI + */ + public SeqCigar(SequenceI seq) { + super(); + if (seq == null) + throw new Error("Implementation error for new Cigar(SequenceI)"); + if (_setSeq(seq)) + { + // there is still work to do + addSequenceOps(this, seq, 0, seq.getLength()); + } + } + public SeqCigar(SequenceI seq, int start, int end) { + super(); + if (seq == null) + throw new Error("Implementation error for new Cigar(SequenceI)"); + if (_setSeq(seq)) + { + // there is still work to do + addSequenceOps(this, seq, start, end); + } + } + + /** + * Create a cigar object from a cigar string like '[]+' + * Will fail if the given seq already contains gaps (JBPNote: future implementation will fix) + * @param seq SequenceI object resolvable to a dataset sequence + * @param cigarString String + * @return Cigar + */ + public static SeqCigar parseCigar(SequenceI seq, String cigarString) + throws Exception + { + Object[] opsandrange = parseCigarString(cigarString); + return new SeqCigar(seq, (char[]) opsandrange[0], (int[]) opsandrange[1]); + } + /** + * non rigorous testing + */ + /** + * + * @param seq Sequence + * @param ex_cs_gapped String + * @return String + */ + public static String testCigar_string(Sequence seq, String ex_cs_gapped) { + SeqCigar c_sgapped = new SeqCigar(seq); + String cs_gapped = c_sgapped.getCigarstring(); + if (!cs_gapped.equals(ex_cs_gapped)) + System.err.println("Failed getCigarstring: incorect string '"+cs_gapped+"' != "+ex_cs_gapped); + return cs_gapped; + } + public static boolean testSeqRecovery(SeqCigar gen_sgapped, SequenceI s_gapped) { + SequenceI gen_sgapped_s = gen_sgapped.getSeq('-'); + if (!gen_sgapped_s.getSequence().equals(s_gapped.getSequence())) { + System.err.println("Couldn't reconstruct sequence.\n" + + gen_sgapped_s.getSequence() + "\n" + + s_gapped.getSequence()); + return false; + } + return true; + } + public static void main(String argv[]) throws Exception { + Sequence s=new Sequence("MySeq", "asdfktryasdtqwrtsaslldddptyipqqwaslchvhttt",39,80); + String orig_gapped; + Sequence s_gapped=new Sequence("MySeq", orig_gapped="----asdf------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhttt", 39,80); + String ex_cs_gapped="4I4M6I6M3I11M4I12M4I9M"; + s_gapped.setDatasetSequence(s); + String sub_gapped_s; + Sequence s_subsequence_gapped=new Sequence("MySeq", sub_gapped_s="------ktryas---dtqwrtsasll----dddptyipqqwa----slchvh", 43,77); + + s_subsequence_gapped.setDatasetSequence(s); + SeqCigar c_null = new SeqCigar(s); + String cs_null = c_null.getCigarstring(); + if (cs_null.length()>0) + System.err.println("Failed getCigarstring: Unexpected cigar operations:"+cs_null); + testCigar_string(s_gapped, ex_cs_gapped); + SeqCigar gen_sgapped = SeqCigar.parseCigar(s, ex_cs_gapped); + if (!gen_sgapped.getCigarstring().equals(ex_cs_gapped)) + System.err.println("Failed parseCigar("+ex_cs_gapped+")->getCigarString()->'"+gen_sgapped.getCigarstring()+"'"); + testSeqRecovery(gen_sgapped, s_gapped); + // Test dataset resolution + SeqCigar sub_gapped = new SeqCigar(s_subsequence_gapped); + if (!testSeqRecovery(sub_gapped, s_subsequence_gapped)) + System.err.println("Failed recovery for subsequence of dataset sequence"); + // width functions + if (sub_gapped.getWidth()!=sub_gapped_s.length()) + System.err.println("Failed getWidth()"); + + sub_gapped.getFullWidth(); + if (sub_gapped.hasDeletedRegions()) + System.err.println("hasDeletedRegions is incorrect."); + // Test start-end region SeqCigar + SeqCigar sub_se_gp= new SeqCigar(s_subsequence_gapped, 8, 48); + if (sub_se_gp.getWidth()!=40) + System.err.println("SeqCigar(seq, start, end) not properly clipped alignsequence."); + System.out.println("Original sequence align:\n"+sub_gapped_s+"\nReconstructed window from 8 to 48\n"+"XXXXXXXX"+sub_se_gp.getSequenceString('-')+"...."+"\nCigar String:"+sub_se_gp.getCigarstring()+""); + } + +} -- 1.7.10.2