1 package jalview.datamodel;
3 import java.util.Arrays;
4 import java.util.Iterator;
6 import java.util.Map.Entry;
7 import java.util.SortedMap;
8 import java.util.TreeMap;
10 import htsjdk.samtools.CigarElement;
11 import htsjdk.samtools.CigarOperator;
12 import htsjdk.samtools.SAMRecord;
14 public class CigarParser
16 private final char gapChar;
18 public CigarParser(char gap)
24 * Apply the CIGAR string to a read sequence and return the updated read.
27 * the SAM record for the read
29 * a map of inserted positions and lengths
30 * @param alignmentStart
31 * start of alignment to be used as offset when padding reads with
34 * sequence to be annotated with inserts
35 * @return string representing read with gaps, clipping etc applied
37 public String parseCigarToSequence(SAMRecord rec,
38 SortedMap<Integer, Integer> insertions, int alignmentStart,
41 StringBuilder newRead = new StringBuilder();
42 Iterator<CigarElement> it = rec.getCigar().getCigarElements()
48 // pad with spaces before read
49 // first count gaps needed to pad to reference position
50 // NB position is 1-based so number of gaps = pos-1
51 int gaps = rec.getStart() - alignmentStart;
53 // now count gaps to pad for insertions in other reads
54 int insertCount = countInsertionsBeforeRead(rec, insertions);
55 addGaps(newRead, gaps + insertCount);
60 CigarElement el = it.next();
61 int length = el.getLength();
63 // which insertions coincide with current CIGAR operation?
64 SortedMap<Integer, Integer> seqInserts = insertions.subMap(
65 rec.getStart() + refnext, rec.getStart() + refnext + length);
67 int[] override = null;
70 if (!seqInserts.containsKey(rec.getStart() + refnext))
73 int pos = rec.getStart() + refnext;
75 "Read " + rec.getReadName() + " has an insertion at "
76 + pos + " which is not in seqInserts");
80 // we just inserted something
81 // remove these inserts now - should be very first entry
82 int count = seqInserts.get(rec.getStart() + refnext);
83 override = new int[] { rec.getStart() + refnext,
84 count - lastinserts };
89 Iterator<Map.Entry<Integer, Integer>> iit = seqInserts.entrySet()
92 String nextSegment = applyCigarOp(el, next, rec, iit, override, seq);
93 newRead.append(nextSegment);
95 if (el.getOperator().consumesReferenceBases())
99 if (el.getOperator().consumesReadBases())
103 if (el.getOperator().equals(CigarOperator.I))
105 // count how many insertions we made
106 lastinserts = length;
109 return newRead.toString();
113 * Apply a cigar operation to a segment of a read, accounting for any
114 * insertions to the reference from other reads in the region
117 * the current CIGAR element
119 * location in read to start applying CIGAR op
123 * SAMRecord corresponding to read
125 * iterator over insertions which coincide with rec
127 * an optional <insertion position, length> which can override a
128 * <position,length> in it to change the length. Set to null if not
133 private String applyCigarOp(CigarElement el, int next,
134 SAMRecord rec, Iterator<Map.Entry<Integer, Integer>> it,
135 int[] override, SequenceI seq)
137 StringBuilder newRead = new StringBuilder();
138 String read = rec.getReadString();
140 int length = el.getLength();
143 switch (el.getOperator())
148 // matched or mismatched residues
149 newRead = applyCigarConsumeReadAndRef(next, length, rec, it,
152 case N: // intron in RNA
157 // just count all the insertions we need to add in the deletion/intron
159 Entry<Integer, Integer> entry = it.next();
160 insertCount += entry.getValue();
162 addGaps(newRead, length + insertCount);
165 // the reference sequence and other reads should have been gapped for
166 // this insertion, so just add in the residues
167 newRead.append(read.substring(nextPos, nextPos + length));
168 seq.addSequenceFeature(new SequenceFeature("INSERTION", "",
169 nextPos + 1, nextPos + length, "bamfile"));
172 // soft clipping - just skip this bit of the read
173 seq.addSequenceFeature(new SequenceFeature("SOFTCLIP", "",
174 nextPos + 1, nextPos + length, "bamfile"));
177 // hard clipping - this stretch will not appear in the read
178 seq.addSequenceFeature(new SequenceFeature("HARDCLIP", "",
179 nextPos + 1, nextPos + length, "bamfile"));
185 return newRead.toString();
189 * Applies a CIGAR operation which consumes both read and reference i.e. M,=,X
192 * next position in read
194 * length of cigar operation
196 * SAMRecord to apply the CIGAR op to
198 * iterator over insertions in the CIGAR region
200 * insertions which have already been accounted for
204 private StringBuilder applyCigarConsumeReadAndRef(int next, int length,
205 SAMRecord rec, Iterator<Map.Entry<Integer, Integer>> it,
206 int[] override, SequenceI seq)
208 StringBuilder newRead = new StringBuilder();
209 String read = rec.getReadString();
211 int nextPos = next; // location of next position in read
212 while (nextPos < next + length)
214 int insertPos = next + length; // location of next insert in read (or end
216 int insertLen = 0; // no of gaps to insert after insertPos
219 // account for sequence already having insertion
220 // if an override entry exists it gives the number of positions still to
221 // be inserted at this point, after accounting for insertions driven by
223 // e.g. if the current read has 2 inserted positions here, and the
224 // insertions list also has 2 inserted positions, the override will
225 // cancel the insert for this read. If the insertions list had 3
226 // insertions, the override would reduce this to 1 insertion.
227 Entry<Integer, Integer> entry = it.next();
228 int key = entry.getKey();
229 insertLen = entry.getValue();
230 if (override != null && key == override[0])
232 insertLen = override[1];
236 insertPos = rec.getReadPositionAtReferencePosition(key) - 1;
240 insertPos = nextPos; // bugfix - trigger by override + insertions in
244 newRead.append(read.substring(nextPos, insertPos));
246 addGaps(newRead, insertLen); // add insertions
252 * Get list of positions inserted to the reference sequence. Note: assumes all
253 * reads are on the same chromosome.
256 * Iterator over all the SAMRecords
257 * @return sorted map of insertion positions <position, insertion size>
263 R1: insert 3 = gap before 3rd res
266 R2: insert 4 = gap before 4th res
269 R3: insert 3,4 = 2 gaps before 3rd res
274 So need to store insertions as (pos, length) giving here (3,1),(4,1),(3,2) - then can sub longest length at position so (3,2), (4,1)
275 Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added
278 public SortedMap<Integer, Integer>[] getInsertions(Iterator<SAMRecord> it)
280 return getInsertions(it, null);
283 public int firstAlColumn = -1, firstRposCol = -1;
285 public SortedMap<Integer, Integer>[] getInsertions(Iterator<SAMRecord> it,
288 SortedMap<Integer, Integer> sinserts[] = new SortedMap[] {
289 new TreeMap<>(), new TreeMap<>() };
290 Range xten = extent != null && extent[0] != null ? extent[0] : null;
293 // check extent of read
294 SAMRecord rec = it.next();
298 int nstart = rec.getAlignmentStart();
301 nstart = rec.getReferencePositionAtReadPosition(nstart);
303 int nend = rec.getAlignmentEnd();
306 nend = rec.getReferencePositionAtReadPosition(nend);
310 nstart <= 0 ? xten.start : Math.min(nstart, xten.start),
311 nend == 0 ? xten.end : Math.max(nend, xten.end));
314 // check each record for insertions in the CIGAR string
316 // pick the insert map to update
317 int insmap = rec.getReadNegativeStrandFlag() ? 1 : 0;
318 SortedMap<Integer, Integer> inserts = sinserts[insmap];
320 Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
322 int refLocation = rec.getAlignmentStart();
326 while (cit.hasNext())
328 CigarElement el = cit.next();
329 switch (el.getOperator())
333 // ignore soft/hardclipped
337 if (el.getOperator().consumesReadBases()
338 && !el.getOperator().consumesReferenceBases())
340 // add to insertions list, and move along the read
341 // location is the start of next CIGAR segment
342 // because getReferencePositionAtReadPosition returns 0 for read
343 // positions which are not in the reference (like insertions)
345 // if there's already an insertion at this location, keep the
347 // insertion; if there's no insertion keep this one
348 if (!inserts.containsKey(refLocation)
349 || (inserts.containsKey(refLocation)
350 && inserts.get(refLocation) < el.getLength()))
352 inserts.put(refLocation, el.getLength());
356 if (el.getOperator().consumesReferenceBases())
358 if (el.getOperator().consumesReadBases() && alcol == 0
359 && refLocation + 1 > extent[0].start)
361 // first match position : last rloc + first match op
363 alpos = rec.getReferencePositionAtReadPosition(rloc + 1);
365 refLocation += el.getLength();
369 if (el.getOperator().consumesReadBases()
370 || el.getOperator().consumesReferenceBases())
372 rloc += el.getLength();
375 // Wrong: hack that fails to relocate reference to correct place in
377 if (firstAlColumn < 0)
379 firstAlColumn = alcol;
380 firstRposCol = alpos;
382 } while (it.hasNext());
391 * Add sequence of gaps to a read
398 private void addGaps(StringBuilder read, int length)
402 char[] gaps = new char[length];
403 Arrays.fill(gaps, gapChar);
409 * Get the number of insertions before the start of an aligned read (i.e.
410 * insertions in soft clipped region are counted too)
413 * the SAMRecord for the read
415 * the map of insertions
416 * @return number of insertions before the read starts
418 private int countInsertionsBeforeRead(SAMRecord rec,
419 SortedMap<Integer, Integer> inserts)
423 // add in any insertion gaps before read
424 // although we only need to get the submap from alignmentStart, there won't
425 // be any insertions before that so we can just start at 0
426 SortedMap<Integer, Integer> seqInserts = inserts.subMap(0,
429 Iterator<Map.Entry<Integer, Integer>> it = seqInserts.entrySet()
433 Entry<Integer, Integer> entry = it.next();
434 gaps += entry.getValue();