package jalview.datamodel; import java.util.Arrays; import java.util.Iterator; import java.util.Map; import java.util.Map.Entry; import java.util.SortedMap; import java.util.TreeMap; import htsjdk.samtools.CigarElement; import htsjdk.samtools.CigarOperator; import htsjdk.samtools.SAMRecord; public class CigarParser { private final char gapChar; public CigarParser(char gap) { gapChar = gap; } /** * Apply the CIGAR string to a read sequence and return the updated read. * * @param rec * the SAM record for the read * @param insertions * a map of inserted positions and lengths * @param alignmentStart * start of alignment to be used as offset when padding reads with * gaps * @param seq * sequence to be annotated with inserts * @return string representing read with gaps, clipping etc applied */ public String parseCigarToSequence(SAMRecord rec, SortedMap insertions, int alignmentStart, SequenceI seq) { StringBuilder newRead = new StringBuilder(); Iterator it = rec.getCigar().getCigarElements() .iterator(); int next = 0; int refnext = 0; // pad with spaces before read // first count gaps needed to pad to reference position // NB position is 1-based so number of gaps = pos-1 int gaps = rec.getStart() - alignmentStart; // now count gaps to pad for insertions in other reads int insertCount = countInsertionsBeforeRead(rec, insertions); addGaps(newRead, gaps + insertCount); int lastinserts = 0; while (it.hasNext()) { CigarElement el = it.next(); int length = el.getLength(); // which insertions coincide with current CIGAR operation? SortedMap seqInserts = insertions.subMap( rec.getStart() + refnext, rec.getStart() + refnext + length); int[] override = null; if (lastinserts > 0) { if (!seqInserts.containsKey(rec.getStart() + refnext)) { // ERROR int pos = rec.getStart() + refnext; System.out.println( "Read " + rec.getReadName() + " has an insertion at " + pos + " which is not in seqInserts"); } else { // we just inserted something // remove these inserts now - should be very first entry int count = seqInserts.get(rec.getStart() + refnext); override = new int[] { rec.getStart() + refnext, count - lastinserts }; lastinserts = 0; } } Iterator> iit = seqInserts.entrySet() .iterator(); String nextSegment = applyCigarOp(el, next, rec, iit, override, seq); newRead.append(nextSegment); if (el.getOperator().consumesReferenceBases()) { refnext += length; } if (el.getOperator().consumesReadBases()) { next += length; } if (el.getOperator().equals(CigarOperator.I)) { // count how many insertions we made lastinserts = length; } } return newRead.toString(); } /** * Apply a cigar operation to a segment of a read, accounting for any * insertions to the reference from other reads in the region * * @param el * the current CIGAR element * @param next * location in read to start applying CIGAR op * @param length * length of CIGAR op * @param rec * SAMRecord corresponding to read * @param it * iterator over insertions which coincide with rec * @param override * an optional which can override a * in it to change the length. Set to null if not * used. * @param seq * @return */ private String applyCigarOp(CigarElement el, int next, SAMRecord rec, Iterator> it, int[] override, SequenceI seq) { StringBuilder newRead = new StringBuilder(); String read = rec.getReadString(); int length = el.getLength(); int nextPos = next; switch (el.getOperator()) { case M: case X: case EQ: // matched or mismatched residues newRead = applyCigarConsumeReadAndRef(next, length, rec, it, override, seq); break; case N: // intron in RNA case D: // deletion int insertCount = 0; while (it.hasNext()) { // just count all the insertions we need to add in the deletion/intron // region Entry entry = it.next(); insertCount += entry.getValue(); } addGaps(newRead, length + insertCount); break; case I: // the reference sequence and other reads should have been gapped for // this insertion, so just add in the residues newRead.append(read.substring(nextPos, nextPos + length)); seq.addSequenceFeature(new SequenceFeature("INSERTION", "", nextPos + 1, nextPos + length, "bamfile")); break; case S: // soft clipping - just skip this bit of the read seq.addSequenceFeature(new SequenceFeature("SOFTCLIP", "", nextPos + 1, nextPos + length, "bamfile")); break; case H: // hard clipping - this stretch will not appear in the read seq.addSequenceFeature(new SequenceFeature("HARDCLIP", "", nextPos + 1, nextPos + length, "bamfile")); break; default: break; } return newRead.toString(); } /** * Applies a CIGAR operation which consumes both read and reference i.e. M,=,X * * @param next * next position in read * @param length * length of cigar operation * @param rec * SAMRecord to apply the CIGAR op to * @param it * iterator over insertions in the CIGAR region * @param override * insertions which have already been accounted for * @param seq * @return */ private StringBuilder applyCigarConsumeReadAndRef(int next, int length, SAMRecord rec, Iterator> it, int[] override, SequenceI seq) { StringBuilder newRead = new StringBuilder(); String read = rec.getReadString(); int nextPos = next; // location of next position in read while (nextPos < next + length) { int insertPos = next + length; // location of next insert in read (or end // of read) int insertLen = 0; // no of gaps to insert after insertPos if (it.hasNext()) { // account for sequence already having insertion // if an override entry exists it gives the number of positions still to // be inserted at this point, after accounting for insertions driven by // the current read. // e.g. if the current read has 2 inserted positions here, and the // insertions list also has 2 inserted positions, the override will // cancel the insert for this read. If the insertions list had 3 // insertions, the override would reduce this to 1 insertion. Entry entry = it.next(); int key = entry.getKey(); insertLen = entry.getValue(); if (override != null && key == override[0]) { insertLen = override[1]; } if (insertLen > 0) { insertPos = rec.getReadPositionAtReferencePosition(key) - 1; } else { insertPos = nextPos; // bugfix - trigger by override + insertions in // same M stretch } } newRead.append(read.substring(nextPos, insertPos)); nextPos = insertPos; addGaps(newRead, insertLen); // add insertions } return newRead; } /** * Get list of positions inserted to the reference sequence. Note: assumes all * reads are on the same chromosome. * * @param it * Iterator over all the SAMRecords * @return sorted map of insertion positions */ /* 1234567 ABCDEFG insert 3,4 R1: insert 3 = gap before 3rd res AB.CDEFG R2: insert 4 = gap before 4th res ABC.DEFG R3: insert 3,4 = 2 gaps before 3rd res AB..CDEFG => AB--C-DEFG So need to store insertions as (pos, length) giving here (3,1),(4,1),(3,2) - then can sub longest length at position so (3,2), (4,1) Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added */ public SortedMap[] getInsertions(Iterator it) { return getInsertions(it, null); } public int firstAlColumn = -1, firstRposCol = -1; public SortedMap[] getInsertions(Iterator it, Range[] extent) { SortedMap sinserts[] = new SortedMap[] { new TreeMap<>(), new TreeMap<>() }; Range xten = extent != null && extent[0] != null ? extent[0] : null; do { // check extent of read SAMRecord rec = it.next(); if (extent != null) { int nstart = rec.getAlignmentStart(); if (nstart != 0) { nstart = rec.getReferencePositionAtReadPosition(nstart); } int nend = rec.getAlignmentEnd(); if (nend != 0) { nend = rec.getReferencePositionAtReadPosition(nend); } xten = new Range( nstart <= 0 ? xten.start : Math.min(nstart, xten.start), nend == 0 ? xten.end : Math.max(nend, xten.end)); } // check each record for insertions in the CIGAR string // pick the insert map to update int insmap = rec.getReadNegativeStrandFlag() ? 1 : 0; SortedMap inserts = sinserts[insmap]; Iterator cit = rec.getCigar().getCigarElements() .iterator(); int refLocation = rec.getAlignmentStart(); int rloc = 0; int alcol = 0; int alpos = 0; while (cit.hasNext()) { CigarElement el = cit.next(); switch (el.getOperator()) { case S: case H: // ignore soft/hardclipped break; default: if (el.getOperator().consumesReadBases() && !el.getOperator().consumesReferenceBases()) { // add to insertions list, and move along the read // location is the start of next CIGAR segment // because getReferencePositionAtReadPosition returns 0 for read // positions which are not in the reference (like insertions) // if there's already an insertion at this location, keep the // longest // insertion; if there's no insertion keep this one if (!inserts.containsKey(refLocation) || (inserts.containsKey(refLocation) && inserts.get(refLocation) < el.getLength())) { inserts.put(refLocation, el.getLength()); } break; } if (el.getOperator().consumesReferenceBases()) { if (el.getOperator().consumesReadBases() && alcol == 0 && refLocation + 1 > extent[0].start) { // first match position : last rloc + first match op alcol = rloc + 1; alpos = rec.getReferencePositionAtReadPosition(rloc + 1); } refLocation += el.getLength(); } } if (el.getOperator().consumesReadBases() || el.getOperator().consumesReferenceBases()) { rloc += el.getLength(); } } // Wrong: hack that fails to relocate reference to correct place in // sequence if (firstAlColumn < 0) { firstAlColumn = alcol; firstRposCol = alpos; } } while (it.hasNext()); if (xten != null) { extent[0] = xten; } return sinserts; } /** * Add sequence of gaps to a read * * @param read * read to update * @param length * number of gaps */ private void addGaps(StringBuilder read, int length) { if (length > 0) { char[] gaps = new char[length]; Arrays.fill(gaps, gapChar); read.append(gaps); } } /** * Get the number of insertions before the start of an aligned read (i.e. * insertions in soft clipped region are counted too) * * @param rec * the SAMRecord for the read * @param inserts * the map of insertions * @return number of insertions before the read starts */ private int countInsertionsBeforeRead(SAMRecord rec, SortedMap inserts) { int gaps = 0; // add in any insertion gaps before read // although we only need to get the submap from alignmentStart, there won't // be any insertions before that so we can just start at 0 SortedMap seqInserts = inserts.subMap(0, rec.getStart()); Iterator> it = seqInserts.entrySet() .iterator(); while (it.hasNext()) { Entry entry = it.next(); gaps += entry.getValue(); } return gaps; } }