--- /dev/null
+package jalview.datamodel;
+
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+import htsjdk.samtools.CigarElement;
+import htsjdk.samtools.CigarOperator;
+import htsjdk.samtools.SAMRecord;
+
+public class CigarParser
+{
+ private final char gapChar;
+
+ public CigarParser(char gap)
+ {
+ gapChar = gap;
+ }
+
+ /**
+ * Apply the CIGAR string to a read sequence and return the updated read.
+ *
+ * @param rec
+ * the SAM record for the read
+ * @param insertions
+ * a map of inserted positions and lengths
+ * @param alignmentStart
+ * start of alignment to be used as offset when padding reads with
+ * gaps
+ * @param seq
+ * sequence to be annotated with inserts
+ * @return string representing read with gaps, clipping etc applied
+ */
+ public String parseCigarToSequence(SAMRecord rec,
+ SortedMap<Integer, Integer> insertions, int alignmentStart,
+ SequenceI seq)
+ {
+ StringBuilder newRead = new StringBuilder();
+ Iterator<CigarElement> it = rec.getCigar().getCigarElements()
+ .iterator();
+
+ int next = 0;
+ int refnext = 0;
+
+ // pad with spaces before read
+ // first count gaps needed to pad to reference position
+ // NB position is 1-based so number of gaps = pos-1
+ int gaps = rec.getStart() - alignmentStart;
+
+ // now count gaps to pad for insertions in other reads
+ int insertCount = countInsertionsBeforeRead(rec, insertions);
+ addGaps(newRead, gaps + insertCount);
+
+ int lastinserts = 0;
+ while (it.hasNext())
+ {
+ CigarElement el = it.next();
+ int length = el.getLength();
+
+ // which insertions coincide with current CIGAR operation?
+ SortedMap<Integer, Integer> seqInserts = insertions.subMap(
+ rec.getStart() + refnext, rec.getStart() + refnext + length);
+
+ int[] override = null;
+ if (lastinserts > 0)
+ {
+ if (!seqInserts.containsKey(rec.getStart() + refnext))
+ {
+ // ERROR
+ int pos = rec.getStart() + refnext;
+ System.out.println(
+ "Read " + rec.getReadName() + " has an insertion at "
+ + pos + " which is not in seqInserts");
+ }
+ else
+ {
+ // we just inserted something
+ // remove these inserts now - should be very first entry
+ int count = seqInserts.get(rec.getStart() + refnext);
+ override = new int[] { rec.getStart() + refnext,
+ count - lastinserts };
+ lastinserts = 0;
+ }
+ }
+
+ Iterator<Map.Entry<Integer, Integer>> iit = seqInserts.entrySet()
+ .iterator();
+
+ String nextSegment = applyCigarOp(el, next, rec, iit, override, seq);
+ newRead.append(nextSegment);
+
+ if (el.getOperator().consumesReferenceBases())
+ {
+ refnext += length;
+ }
+ if (el.getOperator().consumesReadBases())
+ {
+ next += length;
+ }
+ if (el.getOperator().equals(CigarOperator.I))
+ {
+ // count how many insertions we made
+ lastinserts = length;
+ }
+ }
+ return newRead.toString();
+ }
+
+ /**
+ * Apply a cigar operation to a segment of a read, accounting for any
+ * insertions to the reference from other reads in the region
+ *
+ * @param el
+ * the current CIGAR element
+ * @param next
+ * location in read to start applying CIGAR op
+ * @param length
+ * length of CIGAR op
+ * @param rec
+ * SAMRecord corresponding to read
+ * @param it
+ * iterator over insertions which coincide with rec
+ * @param override
+ * an optional <insertion position, length> which can override a
+ * <position,length> in it to change the length. Set to null if not
+ * used.
+ * @param seq
+ * @return
+ */
+ private String applyCigarOp(CigarElement el, int next,
+ SAMRecord rec, Iterator<Map.Entry<Integer, Integer>> it,
+ int[] override, SequenceI seq)
+ {
+ StringBuilder newRead = new StringBuilder();
+ String read = rec.getReadString();
+
+ int length = el.getLength();
+ int nextPos = next;
+
+ switch (el.getOperator())
+ {
+ case M:
+ case X:
+ case EQ:
+ // matched or mismatched residues
+ newRead = applyCigarConsumeReadAndRef(next, length, rec, it,
+ override, seq);
+ break;
+ case N: // intron in RNA
+ case D: // deletion
+ int insertCount = 0;
+ while (it.hasNext())
+ {
+ // just count all the insertions we need to add in the deletion/intron
+ // region
+ Entry<Integer, Integer> entry = it.next();
+ insertCount += entry.getValue();
+ }
+ addGaps(newRead, length + insertCount);
+ break;
+ case I:
+ // the reference sequence and other reads should have been gapped for
+ // this insertion, so just add in the residues
+ newRead.append(read.substring(nextPos, nextPos + length));
+ seq.addSequenceFeature(new SequenceFeature("INSERTION", "",
+ nextPos + 1, nextPos + length, "bamfile"));
+ break;
+ case S:
+ // soft clipping - just skip this bit of the read
+ seq.addSequenceFeature(new SequenceFeature("SOFTCLIP", "",
+ nextPos + 1, nextPos + length, "bamfile"));
+ break;
+ case H:
+ // hard clipping - this stretch will not appear in the read
+ seq.addSequenceFeature(new SequenceFeature("HARDCLIP", "",
+ nextPos + 1, nextPos + length, "bamfile"));
+ break;
+ default:
+ break;
+ }
+
+ return newRead.toString();
+ }
+
+ /**
+ * Applies a CIGAR operation which consumes both read and reference i.e. M,=,X
+ *
+ * @param next
+ * next position in read
+ * @param length
+ * length of cigar operation
+ * @param rec
+ * SAMRecord to apply the CIGAR op to
+ * @param it
+ * iterator over insertions in the CIGAR region
+ * @param override
+ * insertions which have already been accounted for
+ * @param seq
+ * @return
+ */
+ private StringBuilder applyCigarConsumeReadAndRef(int next, int length,
+ SAMRecord rec, Iterator<Map.Entry<Integer, Integer>> it,
+ int[] override, SequenceI seq)
+ {
+ StringBuilder newRead = new StringBuilder();
+ String read = rec.getReadString();
+
+ int nextPos = next; // location of next position in read
+ while (nextPos < next + length)
+ {
+ int insertPos = next + length; // location of next insert in read (or end
+ // of read)
+ int insertLen = 0; // no of gaps to insert after insertPos
+ if (it.hasNext())
+ {
+ // account for sequence already having insertion
+ // if an override entry exists it gives the number of positions still to
+ // be inserted at this point, after accounting for insertions driven by
+ // the current read.
+ // e.g. if the current read has 2 inserted positions here, and the
+ // insertions list also has 2 inserted positions, the override will
+ // cancel the insert for this read. If the insertions list had 3
+ // insertions, the override would reduce this to 1 insertion.
+ Entry<Integer, Integer> entry = it.next();
+ int key = entry.getKey();
+ insertLen = entry.getValue();
+ if (override != null && key == override[0])
+ {
+ insertLen = override[1];
+ }
+ if (insertLen > 0)
+ {
+ insertPos = rec.getReadPositionAtReferencePosition(key) - 1;
+ }
+ else
+ {
+ insertPos = nextPos; // bugfix - trigger by override + insertions in
+ // same M stretch
+ }
+ }
+ newRead.append(read.substring(nextPos, insertPos));
+ nextPos = insertPos;
+ addGaps(newRead, insertLen); // add insertions
+ }
+ return newRead;
+ }
+
+ /**
+ * Get list of positions inserted to the reference sequence. Note: assumes all
+ * reads are on the same chromosome.
+ *
+ * @param it
+ * Iterator over all the SAMRecords
+ * @return sorted map of insertion positions <position, insertion size>
+ */
+ /*
+ 1234567
+ ABCDEFG insert 3,4
+
+ R1: insert 3 = gap before 3rd res
+ AB.CDEFG
+
+ R2: insert 4 = gap before 4th res
+ ABC.DEFG
+
+ R3: insert 3,4 = 2 gaps before 3rd res
+ AB..CDEFG
+
+ => AB--C-DEFG
+
+ So need to store insertions as (pos, length) giving here (3,1),(4,1),(3,2) - then can sub longest length at position so (3,2), (4,1)
+ Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added
+
+ */
+ public SortedMap<Integer, Integer>[] getInsertions(Iterator<SAMRecord> it)
+ {
+ return getInsertions(it, null);
+ }
+
+ public int firstAlColumn = -1, firstRposCol = -1;
+
+ public SortedMap<Integer, Integer>[] getInsertions(Iterator<SAMRecord> it,
+ Range[] extent)
+ {
+ SortedMap<Integer, Integer> sinserts[] = new SortedMap[] {
+ new TreeMap<>(), new TreeMap<>() };
+ Range xten = extent != null && extent[0] != null ? extent[0] : null;
+ do
+ {
+ // check extent of read
+ SAMRecord rec = it.next();
+ if (extent != null)
+ {
+
+ int nstart = rec.getAlignmentStart();
+ if (nstart != 0)
+ {
+ nstart = rec.getReferencePositionAtReadPosition(nstart);
+ }
+ int nend = rec.getAlignmentEnd();
+ if (nend != 0)
+ {
+ nend = rec.getReferencePositionAtReadPosition(nend);
+ }
+
+ xten = new Range(
+ nstart <= 0 ? xten.start : Math.min(nstart, xten.start),
+ nend == 0 ? xten.end : Math.max(nend, xten.end));
+ }
+
+ // check each record for insertions in the CIGAR string
+
+ // pick the insert map to update
+ int insmap = rec.getReadNegativeStrandFlag() ? 1 : 0;
+ SortedMap<Integer, Integer> inserts = sinserts[insmap];
+
+ Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
+ .iterator();
+ int refLocation = rec.getAlignmentStart();
+ int rloc = 0;
+ int alcol = 0;
+ int alpos = 0;
+ while (cit.hasNext())
+ {
+ CigarElement el = cit.next();
+ switch (el.getOperator())
+ {
+ case S:
+ case H:
+ // ignore soft/hardclipped
+
+ break;
+ default:
+ if (el.getOperator().consumesReadBases()
+ && !el.getOperator().consumesReferenceBases())
+ {
+ // add to insertions list, and move along the read
+ // location is the start of next CIGAR segment
+ // because getReferencePositionAtReadPosition returns 0 for read
+ // positions which are not in the reference (like insertions)
+
+ // if there's already an insertion at this location, keep the
+ // longest
+ // insertion; if there's no insertion keep this one
+ if (!inserts.containsKey(refLocation)
+ || (inserts.containsKey(refLocation)
+ && inserts.get(refLocation) < el.getLength()))
+ {
+ inserts.put(refLocation, el.getLength());
+ }
+ break;
+ }
+ if (el.getOperator().consumesReferenceBases())
+ {
+ if (el.getOperator().consumesReadBases() && alcol == 0
+ && refLocation + 1 > extent[0].start)
+ {
+ // first match position : last rloc + first match op
+ alcol = rloc + 1;
+ alpos = rec.getReferencePositionAtReadPosition(rloc + 1);
+ }
+ refLocation += el.getLength();
+ }
+
+ }
+ if (el.getOperator().consumesReadBases()
+ || el.getOperator().consumesReferenceBases())
+ {
+ rloc += el.getLength();
+ }
+ }
+ // Wrong: hack that fails to relocate reference to correct place in
+ // sequence
+ if (firstAlColumn < 0)
+ {
+ firstAlColumn = alcol;
+ firstRposCol = alpos;
+ }
+ } while (it.hasNext());
+ if (xten != null)
+ {
+ extent[0] = xten;
+ }
+ return sinserts;
+ }
+
+ /**
+ * Add sequence of gaps to a read
+ *
+ * @param read
+ * read to update
+ * @param length
+ * number of gaps
+ */
+ private void addGaps(StringBuilder read, int length)
+ {
+ if (length > 0)
+ {
+ char[] gaps = new char[length];
+ Arrays.fill(gaps, gapChar);
+ read.append(gaps);
+ }
+ }
+
+ /**
+ * Get the number of insertions before the start of an aligned read (i.e.
+ * insertions in soft clipped region are counted too)
+ *
+ * @param rec
+ * the SAMRecord for the read
+ * @param inserts
+ * the map of insertions
+ * @return number of insertions before the read starts
+ */
+ private int countInsertionsBeforeRead(SAMRecord rec,
+ SortedMap<Integer, Integer> inserts)
+ {
+ int gaps = 0;
+
+ // add in any insertion gaps before read
+ // although we only need to get the submap from alignmentStart, there won't
+ // be any insertions before that so we can just start at 0
+ SortedMap<Integer, Integer> seqInserts = inserts.subMap(0,
+ rec.getStart());
+
+ Iterator<Map.Entry<Integer, Integer>> it = seqInserts.entrySet()
+ .iterator();
+ while (it.hasNext())
+ {
+ Entry<Integer, Integer> entry = it.next();
+ gaps += entry.getValue();
+ }
+ return gaps;
+ }
+}