1 package jalview.datamodel;
3 import java.util.Arrays;
4 import java.util.Iterator;
6 import java.util.Map.Entry;
7 import java.util.SortedMap;
8 import java.util.TreeMap;
10 import htsjdk.samtools.CigarElement;
11 import htsjdk.samtools.CigarOperator;
12 import htsjdk.samtools.SAMRecord;
14 public class CigarParser
16 private final char gapChar;
18 public CigarParser(char gap)
24 * Apply the CIGAR string to a read sequence and return the updated read.
27 * the SAM record for the read
29 * a map of inserted positions and lengths
30 * @param alignmentStart
31 * start of alignment to be used as offset when padding reads with
34 * sequence to be annotated with inserts
35 * @return string representing read with gaps, clipping etc applied
37 public String parseCigarToSequence(SAMRecord rec,
38 SortedMap<Integer, Integer> insertions, int alignmentStart,
41 StringBuilder newRead = new StringBuilder();
42 Iterator<CigarElement> it = rec.getCigar().getCigarElements()
48 // pad with spaces before read
49 // first count gaps needed to pad to reference position
50 // NB position is 1-based so number of gaps = pos-1
51 int gaps = rec.getStart() - alignmentStart;
53 // now count gaps to pad for insertions in other reads
54 int insertCount = countInsertionsBeforeRead(rec, insertions);
55 addGaps(newRead, gaps + insertCount);
60 CigarElement el = it.next();
61 int length = el.getLength();
63 // which insertions coincide with current CIGAR operation?
64 SortedMap<Integer, Integer> seqInserts = insertions.subMap(
65 rec.getStart() + refnext, rec.getStart() + refnext + length);
67 int[] override = null;
70 if (!seqInserts.containsKey(rec.getStart() + refnext))
73 int pos = rec.getStart() + refnext;
75 "Read " + rec.getReadName() + " has an insertion at "
76 + pos + " which is not in seqInserts");
80 // we just inserted something
81 // remove these inserts now - should be very first entry
82 int count = seqInserts.get(rec.getStart() + refnext);
83 override = new int[] { rec.getStart() + refnext,
84 count - lastinserts };
89 Iterator<Map.Entry<Integer, Integer>> iit = seqInserts.entrySet()
92 String nextSegment = applyCigarOp(el, next, rec, iit, override, seq);
93 newRead.append(nextSegment);
95 if (el.getOperator().consumesReferenceBases())
99 if (el.getOperator().consumesReadBases())
103 if (el.getOperator().equals(CigarOperator.I))
105 // count how many insertions we made
106 lastinserts = length;
109 return newRead.toString();
113 * Apply a cigar operation to a segment of a read, accounting for any
114 * insertions to the reference from other reads in the region
117 * the current CIGAR element
119 * location in read to start applying CIGAR op
123 * SAMRecord corresponding to read
125 * iterator over insertions which coincide with rec
127 * an optional <insertion position, length> which can override a
128 * <position,length> in it to change the length. Set to null if not
133 private String applyCigarOp(CigarElement el, int next,
134 SAMRecord rec, Iterator<Map.Entry<Integer, Integer>> it,
135 int[] override, SequenceI seq)
137 StringBuilder newRead = new StringBuilder();
138 String read = rec.getReadString();
140 int length = el.getLength();
143 switch (el.getOperator())
148 // matched or mismatched residues
149 newRead = applyCigarConsumeReadAndRef(next, length, rec, it,
152 case N: // intron in RNA
157 // just count all the insertions we need to add in the deletion/intron
159 Entry<Integer, Integer> entry = it.next();
160 insertCount += entry.getValue();
162 addGaps(newRead, length + insertCount);
165 // the reference sequence and other reads should have been gapped for
166 // this insertion, so just add in the residues
167 newRead.append(read.substring(nextPos, nextPos + length));
168 seq.addSequenceFeature(new SequenceFeature("INSERTION", "",
169 nextPos + 1, nextPos + length, "bamfile"));
172 // soft clipping - just skip this bit of the read
174 // hard clipping - this stretch will not appear in the read
179 return newRead.toString();
183 * Applies a CIGAR operation which consumes both read and reference i.e. M,=,X
186 * next position in read
188 * length of cigar operation
190 * SAMRecord to apply the CIGAR op to
192 * iterator over insertions in the CIGAR region
194 * insertions which have already been accounted for
198 private StringBuilder applyCigarConsumeReadAndRef(int next, int length,
199 SAMRecord rec, Iterator<Map.Entry<Integer, Integer>> it,
200 int[] override, SequenceI seq)
202 StringBuilder newRead = new StringBuilder();
203 String read = rec.getReadString();
205 int nextPos = next; // location of next position in read
206 while (nextPos < next + length)
208 int insertPos = next + length; // location of next insert in read (or end
210 int insertLen = 0; // no of gaps to insert after insertPos
213 // account for sequence already having insertion
214 // if an override entry exists it gives the number of positions still to
215 // be inserted at this point, after accounting for insertions driven by
217 // e.g. if the current read has 2 inserted positions here, and the
218 // insertions list also has 2 inserted positions, the override will
219 // cancel the insert for this read. If the insertions list had 3
220 // insertions, the override would reduce this to 1 insertion.
221 Entry<Integer, Integer> entry = it.next();
222 int key = entry.getKey();
223 insertLen = entry.getValue();
224 if (override != null && key == override[0])
226 insertLen = override[1];
230 insertPos = rec.getReadPositionAtReferencePosition(key) - 1;
234 insertPos = nextPos; // bugfix - trigger by override + insertions in
238 newRead.append(read.substring(nextPos, insertPos));
240 addGaps(newRead, insertLen); // add insertions
246 * Get list of positions inserted to the reference sequence. Note: assumes all
247 * reads are on the same chromosome.
250 * Iterator over all the SAMRecords
251 * @return sorted map of insertion positions <position, insertion size>
257 R1: insert 3 = gap before 3rd res
260 R2: insert 4 = gap before 4th res
263 R3: insert 3,4 = 2 gaps before 3rd res
268 So need to store insertions as (pos, length) giving here (3,1),(4,1),(3,2) - then can sub longest length at position so (3,2), (4,1)
269 Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added
272 public SortedMap<Integer, Integer> getInsertions(Iterator<SAMRecord> it)
274 SortedMap<Integer, Integer> inserts = new TreeMap<>();
277 // check each record for insertions in the CIGAR string
278 SAMRecord rec = it.next();
279 Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
282 while (cit.hasNext())
284 CigarElement el = cit.next();
285 switch (el.getOperator())
288 // add to insertions list, and move along the read
289 // location is the start of next CIGAR segment
290 // because getReferencePositionAtReadPosition returns 0 for read
291 // positions which are not in the reference (like insertions)
292 int refLocation = rec.getReferencePositionAtReadPosition(
293 next + el.getLength());
295 // if there's already an insertion at this location, keep the longest
296 // insertion; if there's no insertion keep this one
297 if (!inserts.containsKey(refLocation)
298 || (inserts.containsKey(refLocation)
299 && inserts.get(refLocation) < el.getLength()))
301 inserts.put(refLocation, el.getLength());
303 next += el.getLength();
307 // match to reference, or soft clip, move along the read
308 next += el.getLength();
311 // deletions, introns etc don't consume any residues from the read
321 * Add sequence of gaps to a read
328 private void addGaps(StringBuilder read, int length)
332 char[] gaps = new char[length];
333 Arrays.fill(gaps, gapChar);
339 * Get the number of insertions before the start of an aligned read (i.e.
340 * insertions in soft clipped region are counted too)
343 * the SAMRecord for the read
345 * the map of insertions
346 * @return number of insertions before the read starts
348 private int countInsertionsBeforeRead(SAMRecord rec,
349 SortedMap<Integer, Integer> inserts)
353 // add in any insertion gaps before read
354 // although we only need to get the submap from alignmentStart, there won't
355 // be any insertions before that so we can just start at 0
356 SortedMap<Integer, Integer> seqInserts = inserts.subMap(0,
359 Iterator<Map.Entry<Integer, Integer>> it = seqInserts.entrySet()
363 Entry<Integer, Integer> entry = it.next();
364 gaps += entry.getValue();