1 package jalview.datamodel;
3 import java.util.Arrays;
4 import java.util.Iterator;
6 import java.util.Map.Entry;
7 import java.util.SortedMap;
8 import java.util.TreeMap;
10 import htsjdk.samtools.CigarElement;
11 import htsjdk.samtools.CigarOperator;
12 import htsjdk.samtools.SAMRecord;
14 public class CigarParser
16 private final char gapChar;
18 public CigarParser(char gap)
24 * Apply the CIGAR string to a read sequence and return the updated read.
27 * the SAM record for the read
29 * a map of inserted positions and lengths
30 * @param alignmentStart
31 * start of alignment to be used as offset when padding reads with
33 * @return string representing read with gaps, clipping etc applied
35 public String parseCigarToSequence(SAMRecord rec,
36 SortedMap<Integer, Integer> insertions, int alignmentStart)
38 StringBuilder newRead = new StringBuilder();
39 Iterator<CigarElement> it = rec.getCigar().getCigarElements()
45 // pad with spaces before read
46 // first count gaps needed to pad to reference position
47 // NB position is 1-based so number of gaps = pos-1
48 int gaps = rec.getStart() - alignmentStart;
50 // now count gaps to pad for insertions in other reads
51 int insertCount = countInsertionsBeforeRead(rec, insertions);
52 addGaps(newRead, gaps + insertCount);
57 CigarElement el = it.next();
58 int length = el.getLength();
60 // which insertions coincide with current CIGAR operation?
61 SortedMap<Integer, Integer> seqInserts = insertions.subMap(
62 rec.getStart() + refnext, rec.getStart() + refnext + length);
64 int[] override = null;
67 if (!seqInserts.containsKey(rec.getStart() + refnext))
70 int pos = rec.getStart() + refnext;
71 System.out.println("Insertion not added to seqInserts: " + pos);
76 // we just inserted something
77 // remove these inserts now - should be very first entry
78 int count = seqInserts.get(rec.getStart() + refnext);
79 override = new int[] { rec.getStart() + refnext,
80 count - lastinserts };
85 Iterator<Map.Entry<Integer, Integer>> iit = seqInserts.entrySet()
88 String nextSegment = applyCigarOp(el, next, rec, iit, override);
89 newRead.append(nextSegment);
91 if (el.getOperator().consumesReferenceBases())
95 if (el.getOperator().consumesReadBases())
99 if (el.getOperator().equals(CigarOperator.I))
101 // count how many insertions we made
102 lastinserts = length;
105 return newRead.toString();
109 * Apply a cigar operation to a segment of a read, accounting for any
110 * insertions to the reference from other reads in the region
113 * the current CIGAR element
115 * location in read to start applying CIGAR op
119 * SAMRecord corresponding to read
121 * iterator over insertions which coincide with rec
123 * an optional <insertion position, length> which can override a
124 * <position,length> in it to change the length. Set to null if not
128 private String applyCigarOp(CigarElement el, int next,
129 SAMRecord rec, Iterator<Map.Entry<Integer, Integer>> it,
132 StringBuilder newRead = new StringBuilder();
133 String read = rec.getReadString();
135 int length = el.getLength();
138 switch (el.getOperator())
143 // matched or mismatched residues
144 newRead = applyCigarConsumeReadAndRef(next, length, rec, it,
147 case N: // intron in RNA
152 // just count all the insertions we need to add in the deletion/intron
154 Entry<Integer, Integer> entry = it.next();
155 insertCount += entry.getValue();
157 addGaps(newRead, length + insertCount);
160 // the reference sequence and other reads should have been gapped for
161 // this insertion, so just add in the residues
162 newRead.append(read.substring(nextPos, nextPos + length));
165 // soft clipping - just skip this bit of the read
167 // hard clipping - this stretch will not appear in the read
172 return newRead.toString();
176 * Applies a CIGAR operation which consumes both read and reference i.e. M,=,X
179 * next position in read
181 * length of cigar operation
183 * SAMRecord to apply the CIGAR op to
185 * iterator over insertions in the CIGAR region
187 * insertions which have already been accounted for
190 private StringBuilder applyCigarConsumeReadAndRef(int next, int length,
191 SAMRecord rec, Iterator<Map.Entry<Integer, Integer>> it,
194 StringBuilder newRead = new StringBuilder();
195 String read = rec.getReadString();
198 while (nextPos < next + length)
200 int insertPos = next + length;
204 // account for sequence already having insertion
205 // if an override entry exists it gives the number of positions still to
206 // be inserted at this point, after accounting for insertions driven by
208 // e.g. if the current read has 2 inserted positions here, and the
209 // insertions list also has 2 inserted positions, the override will
210 // cancel the insert for this read. If the insertions list had 3
211 // insertions, the override would reduce this to 1 insertion.
212 Entry<Integer, Integer> entry = it.next();
213 int key = entry.getKey();
214 insertLen = entry.getValue();
215 if (override != null && key == override[0])
217 insertLen = override[1];
221 insertPos = rec.getReadPositionAtReferencePosition(key) - 1;
224 newRead.append(read.substring(nextPos, insertPos));
226 addGaps(newRead, insertLen); // add insertions
232 * Get list of positions inserted to the reference sequence. Note: assumes all
233 * reads are on the same chromosome.
236 * Iterator over all the SAMRecords
237 * @return sorted map of insertion positions <position, insertion size>
243 R1: insert 3 = gap before 3rd res
246 R2: insert 4 = gap before 4th res
249 R3: insert 3,4 = 2 gaps before 3rd res
254 So need to store insertions as (pos, length) giving here (3,1),(4,1),(3,2) - then can sub longest length at position so (3,2), (4,1)
255 Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added
258 public SortedMap<Integer, Integer> getInsertions(Iterator<SAMRecord> it)
260 SortedMap<Integer, Integer> inserts = new TreeMap<>();
263 // check each record for insertions in the CIGAR string
264 SAMRecord rec = it.next();
265 Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
268 while (cit.hasNext())
270 CigarElement el = cit.next();
271 switch (el.getOperator())
274 // add to insertions list, and move along the read
275 // location is 1 past the current position of next
276 // (note if we try to retrieve +1 position from reference by calling
277 // getReferencePositionAtReadPosition(next+1) we get 0 because it's an
279 int refLocation = rec.getReferencePositionAtReadPosition(next)
282 // if there's already an insertion at this location, keep the longest
283 // insertion; if there's no insertion keep this one
284 if (!inserts.containsKey(refLocation)
285 || (inserts.containsKey(refLocation)
286 && inserts.get(refLocation) < el.getLength()))
288 inserts.put(refLocation, el.getLength());
290 next += el.getLength();
294 // match to reference, or soft clip, move along the read
295 next += el.getLength();
298 // deletions, introns etc don't consume any residues from the read
308 * Add sequence of gaps to a read
315 private void addGaps(StringBuilder read, int length)
319 char[] gaps = new char[length];
320 Arrays.fill(gaps, gapChar);
326 * Get the number of insertions before the start of an aligned read (i.e.
327 * insertions in soft clipped region are counted too)
330 * the SAMRecord for the read
332 * the map of insertions
333 * @return number of insertions before the read starts
335 private int countInsertionsBeforeRead(SAMRecord rec,
336 SortedMap<Integer, Integer> inserts)
340 // add in any insertion gaps before read
341 // although we only need to get the submap from alignmentStart, there won't
342 // be any insertions before that so we can just start at 0
343 SortedMap<Integer, Integer> seqInserts = inserts.subMap(0,
346 Iterator<Map.Entry<Integer, Integer>> it = seqInserts.entrySet()
350 Entry<Integer, Integer> entry = it.next();
351 gaps += entry.getValue();