1 package jalview.datamodel;
3 import java.util.Arrays;
4 import java.util.Iterator;
6 import java.util.Map.Entry;
7 import java.util.SortedMap;
8 import java.util.TreeMap;
10 import htsjdk.samtools.CigarElement;
11 import htsjdk.samtools.CigarOperator;
12 import htsjdk.samtools.SAMRecord;
14 public class CigarParser
16 private final char gapChar;
18 public CigarParser(char gap)
24 * Apply the CIGAR string to a read sequence and return the updated read.
27 * the SAM record for the read
29 * a map of inserted positions and lengths
30 * @param alignmentStart
31 * start of alignment to be used as offset when padding reads with
34 * sequence to be annotated with inserts
35 * @return string representing read with gaps, clipping etc applied
37 public String parseCigarToSequence(SAMRecord rec,
38 SortedMap<Integer, Integer> insertions, int alignmentStart,
41 StringBuilder newRead = new StringBuilder();
42 Iterator<CigarElement> it = rec.getCigar().getCigarElements()
48 // pad with spaces before read
49 // first count gaps needed to pad to reference position
50 // NB position is 1-based so number of gaps = pos-1
51 int gaps = rec.getStart() - alignmentStart;
53 // now count gaps to pad for insertions in other reads
54 int insertCount = countInsertionsBeforeRead(rec, insertions);
55 addGaps(newRead, gaps + insertCount);
60 CigarElement el = it.next();
61 int length = el.getLength();
63 // which insertions coincide with current CIGAR operation?
64 SortedMap<Integer, Integer> seqInserts = insertions.subMap(
65 rec.getStart() + refnext, rec.getStart() + refnext + length);
67 int[] override = null;
70 if (!seqInserts.containsKey(rec.getStart() + refnext))
73 int pos = rec.getStart() + refnext;
75 "Read " + rec.getReadName() + " has an insertion at "
76 + pos + " which is not in seqInserts");
80 // we just inserted something
81 // remove these inserts now - should be very first entry
82 int count = seqInserts.get(rec.getStart() + refnext);
83 override = new int[] { rec.getStart() + refnext,
84 count - lastinserts };
89 Iterator<Map.Entry<Integer, Integer>> iit = seqInserts.entrySet()
92 String nextSegment = applyCigarOp(el, next, rec, iit, override, seq);
93 newRead.append(nextSegment);
95 if (el.getOperator().consumesReferenceBases())
99 if (el.getOperator().consumesReadBases())
103 if (el.getOperator().equals(CigarOperator.I))
105 // count how many insertions we made
106 lastinserts = length;
109 return newRead.toString();
113 * Apply a cigar operation to a segment of a read, accounting for any
114 * insertions to the reference from other reads in the region
117 * the current CIGAR element
119 * location in read to start applying CIGAR op
123 * SAMRecord corresponding to read
125 * iterator over insertions which coincide with rec
127 * an optional <insertion position, length> which can override a
128 * <position,length> in it to change the length. Set to null if not
133 private String applyCigarOp(CigarElement el, int next,
134 SAMRecord rec, Iterator<Map.Entry<Integer, Integer>> it,
135 int[] override, SequenceI seq)
137 StringBuilder newRead = new StringBuilder();
138 String read = rec.getReadString();
140 int length = el.getLength();
143 switch (el.getOperator())
148 // matched or mismatched residues
149 newRead = applyCigarConsumeReadAndRef(next, length, rec, it,
152 case N: // intron in RNA
157 // just count all the insertions we need to add in the deletion/intron
159 Entry<Integer, Integer> entry = it.next();
160 insertCount += entry.getValue();
162 addGaps(newRead, length + insertCount);
165 // the reference sequence and other reads should have been gapped for
166 // this insertion, so just add in the residues
167 newRead.append(read.substring(nextPos, nextPos + length));
168 seq.addSequenceFeature(new SequenceFeature("INSERTION", "",
169 nextPos + 1, nextPos + length, "bamfile"));
172 // soft clipping - just skip this bit of the read
173 seq.addSequenceFeature(new SequenceFeature("SOFTCLIP", "",
174 nextPos + 1, nextPos + length, "bamfile"));
177 // hard clipping - this stretch will not appear in the read
178 seq.addSequenceFeature(new SequenceFeature("HARDCLIP", "",
179 nextPos + 1, nextPos + length, "bamfile"));
185 return newRead.toString();
189 * Applies a CIGAR operation which consumes both read and reference i.e. M,=,X
192 * next position in read
194 * length of cigar operation
196 * SAMRecord to apply the CIGAR op to
198 * iterator over insertions in the CIGAR region
200 * insertions which have already been accounted for
204 private StringBuilder applyCigarConsumeReadAndRef(int next, int length,
205 SAMRecord rec, Iterator<Map.Entry<Integer, Integer>> it,
206 int[] override, SequenceI seq)
208 StringBuilder newRead = new StringBuilder();
209 String read = rec.getReadString();
211 int nextPos = next; // location of next position in read
212 while (nextPos < next + length)
214 int insertPos = next + length; // location of next insert in read (or end
216 int insertLen = 0; // no of gaps to insert after insertPos
219 // account for sequence already having insertion
220 // if an override entry exists it gives the number of positions still to
221 // be inserted at this point, after accounting for insertions driven by
223 // e.g. if the current read has 2 inserted positions here, and the
224 // insertions list also has 2 inserted positions, the override will
225 // cancel the insert for this read. If the insertions list had 3
226 // insertions, the override would reduce this to 1 insertion.
227 Entry<Integer, Integer> entry = it.next();
228 int key = entry.getKey();
229 insertLen = entry.getValue();
230 if (override != null && key == override[0])
232 insertLen = override[1];
236 insertPos = rec.getReadPositionAtReferencePosition(key) - 1;
240 insertPos = nextPos; // bugfix - trigger by override + insertions in
244 newRead.append(read.substring(nextPos, insertPos));
246 addGaps(newRead, insertLen); // add insertions
252 * Get list of positions inserted to the reference sequence. Note: assumes all
253 * reads are on the same chromosome.
256 * Iterator over all the SAMRecords
257 * @return sorted map of insertion positions <position, insertion size>
263 R1: insert 3 = gap before 3rd res
266 R2: insert 4 = gap before 4th res
269 R3: insert 3,4 = 2 gaps before 3rd res
274 So need to store insertions as (pos, length) giving here (3,1),(4,1),(3,2) - then can sub longest length at position so (3,2), (4,1)
275 Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added
278 public SortedMap<Integer, Integer> getInsertions(Iterator<SAMRecord> it)
280 SortedMap<Integer, Integer> inserts = new TreeMap<>();
283 // check each record for insertions in the CIGAR string
284 SAMRecord rec = it.next();
285 Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
288 while (cit.hasNext())
290 CigarElement el = cit.next();
291 switch (el.getOperator())
294 // add to insertions list, and move along the read
295 // location is the start of next CIGAR segment
296 // because getReferencePositionAtReadPosition returns 0 for read
297 // positions which are not in the reference (like insertions)
298 int refLocation = rec.getReferencePositionAtReadPosition(
299 next + el.getLength());
301 // if there's already an insertion at this location, keep the longest
302 // insertion; if there's no insertion keep this one
303 if (!inserts.containsKey(refLocation)
304 || (inserts.containsKey(refLocation)
305 && inserts.get(refLocation) < el.getLength()))
307 inserts.put(refLocation, el.getLength());
309 next += el.getLength();
313 // match to reference, or soft clip, move along the read
314 next += el.getLength();
317 // deletions, introns etc don't consume any residues from the read
327 * Add sequence of gaps to a read
334 private void addGaps(StringBuilder read, int length)
338 char[] gaps = new char[length];
339 Arrays.fill(gaps, gapChar);
345 * Get the number of insertions before the start of an aligned read (i.e.
346 * insertions in soft clipped region are counted too)
349 * the SAMRecord for the read
351 * the map of insertions
352 * @return number of insertions before the read starts
354 private int countInsertionsBeforeRead(SAMRecord rec,
355 SortedMap<Integer, Integer> inserts)
359 // add in any insertion gaps before read
360 // although we only need to get the submap from alignmentStart, there won't
361 // be any insertions before that so we can just start at 0
362 SortedMap<Integer, Integer> seqInserts = inserts.subMap(0,
365 Iterator<Map.Entry<Integer, Integer>> it = seqInserts.entrySet()
369 Entry<Integer, Integer> entry = it.next();
370 gaps += entry.getValue();