1 package jalview.datamodel;
3 import java.util.Arrays;
4 import java.util.Iterator;
6 import java.util.Map.Entry;
7 import java.util.SortedMap;
8 import java.util.TreeMap;
10 import htsjdk.samtools.CigarElement;
11 import htsjdk.samtools.CigarOperator;
12 import htsjdk.samtools.SAMRecord;
14 public class CigarParser
16 private final char gapChar;
18 public CigarParser(char gap)
24 * Apply the CIGAR string to a read sequence and return the updated read.
27 * the SAM record for the read
29 * a map of inserted positions and lengths
30 * @param alignmentStart
31 * start of alignment to be used as offset when padding reads with
33 * @return string representing read with gaps, clipping etc applied
35 public String parseCigarToSequence(SAMRecord rec,
36 SortedMap<Integer, Integer> insertions, int alignmentStart)
38 StringBuilder newRead = new StringBuilder();
39 Iterator<CigarElement> it = rec.getCigar().getCigarElements()
45 // pad with spaces before read
46 // first count gaps needed to pad to reference position
47 // NB position is 1-based so number of gaps = pos-1
48 int gaps = rec.getStart() - alignmentStart;
50 // now count gaps to pad for insertions in other reads
51 int insertCount = countInsertionsBeforeRead(rec, insertions);
52 addGaps(newRead, gaps + insertCount);
57 CigarElement el = it.next();
58 int length = el.getLength();
60 // which insertions coincide with current CIGAR operation?
61 SortedMap<Integer, Integer> seqInserts = insertions.subMap(
62 rec.getStart() + refnext, rec.getStart() + refnext + length);
64 int[] override = null;
67 if (!seqInserts.containsKey(rec.getStart() + refnext))
70 int pos = rec.getStart() + refnext;
71 System.out.println("Insertion not added to seqInserts: " + pos);
76 // we just inserted something
77 // remove these inserts now - should be very first entry
78 int count = seqInserts.get(rec.getStart() + refnext);
79 override = new int[] { rec.getStart() + refnext,
80 count - lastinserts };
85 Iterator<Map.Entry<Integer, Integer>> iit = seqInserts.entrySet()
88 String nextSegment = applyCigarOp(el, next, rec, iit, override);
89 newRead.append(nextSegment);
91 if (el.getOperator().consumesReferenceBases())
95 if (el.getOperator().consumesReadBases())
99 if (el.getOperator().equals(CigarOperator.I))
101 // count how many insertions we made
102 lastinserts = length;
105 return newRead.toString();
109 * Apply a cigar operation to a segment of a read, accounting for any
110 * insertions to the reference from other reads in the region
113 * the current CIGAR element
115 * location in read to start applying CIGAR op
119 * SAMRecord corresponding to read
121 * iterator over insertions which coincide with rec
123 * an optional <insertion position, length> which can override a
124 * <position,length> in it to change the length. Set to null if not
128 private String applyCigarOp(CigarElement el, int next,
129 SAMRecord rec, Iterator<Map.Entry<Integer, Integer>> it,
132 StringBuilder newRead = new StringBuilder();
133 String read = rec.getReadString();
135 int length = el.getLength();
138 switch (el.getOperator())
143 // matched or mismatched residues
144 newRead = applyCigarConsumeReadAndRef(next, length, rec, it,
147 case N: // intron in RNA
152 // just count all the insertions we need to add in the deletion/intron
154 Entry<Integer, Integer> entry = it.next();
155 insertCount += entry.getValue();
157 addGaps(newRead, length + insertCount);
160 // the reference sequence and other reads should have been gapped for
161 // this insertion, so just add in the residues
162 newRead.append(read.substring(nextPos, nextPos + length));
165 // soft clipping - just skip this bit of the read
167 // hard clipping - this stretch will not appear in the read
172 return newRead.toString();
176 * Applies a CIGAR operation which consumes both read and reference i.e. M,=,X
179 * next position in read
181 * length of cigar operation
183 * SAMRecord to apply the CIGAR op to
185 * iterator over insertions in the CIGAR region
187 * insertions which have already been accounted for
190 private StringBuilder applyCigarConsumeReadAndRef(int next, int length,
191 SAMRecord rec, Iterator<Map.Entry<Integer, Integer>> it,
194 StringBuilder newRead = new StringBuilder();
195 String read = rec.getReadString();
197 int nextPos = next; // location of next position in read
198 while (nextPos < next + length)
200 int insertPos = next + length; // location of next insert in read (or end
202 int insertLen = 0; // no of gaps to insert after insertPos
205 // account for sequence already having insertion
206 // if an override entry exists it gives the number of positions still to
207 // be inserted at this point, after accounting for insertions driven by
209 // e.g. if the current read has 2 inserted positions here, and the
210 // insertions list also has 2 inserted positions, the override will
211 // cancel the insert for this read. If the insertions list had 3
212 // insertions, the override would reduce this to 1 insertion.
213 Entry<Integer, Integer> entry = it.next();
214 int key = entry.getKey();
215 insertLen = entry.getValue();
216 if (override != null && key == override[0])
218 insertLen = override[1];
222 insertPos = rec.getReadPositionAtReferencePosition(key) - 1;
226 insertPos = nextPos; // bugfix - trigger by override + insertions in
230 newRead.append(read.substring(nextPos, insertPos));
232 addGaps(newRead, insertLen); // add insertions
238 * Get list of positions inserted to the reference sequence. Note: assumes all
239 * reads are on the same chromosome.
242 * Iterator over all the SAMRecords
243 * @return sorted map of insertion positions <position, insertion size>
249 R1: insert 3 = gap before 3rd res
252 R2: insert 4 = gap before 4th res
255 R3: insert 3,4 = 2 gaps before 3rd res
260 So need to store insertions as (pos, length) giving here (3,1),(4,1),(3,2) - then can sub longest length at position so (3,2), (4,1)
261 Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added
264 public SortedMap<Integer, Integer> getInsertions(Iterator<SAMRecord> it)
266 SortedMap<Integer, Integer> inserts = new TreeMap<>();
269 // check each record for insertions in the CIGAR string
270 SAMRecord rec = it.next();
271 Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
274 while (cit.hasNext())
276 CigarElement el = cit.next();
277 switch (el.getOperator())
280 // add to insertions list, and move along the read
281 // location is 1 past the current position of next
282 // (note if we try to retrieve +1 position from reference by calling
283 // getReferencePositionAtReadPosition(next+1) we get 0 because it's an
285 int refLocation = rec.getReferencePositionAtReadPosition(next)
288 // if there's already an insertion at this location, keep the longest
289 // insertion; if there's no insertion keep this one
290 if (!inserts.containsKey(refLocation)
291 || (inserts.containsKey(refLocation)
292 && inserts.get(refLocation) < el.getLength()))
294 inserts.put(refLocation, el.getLength());
296 next += el.getLength();
300 // match to reference, or soft clip, move along the read
301 next += el.getLength();
304 // deletions, introns etc don't consume any residues from the read
314 * Add sequence of gaps to a read
321 private void addGaps(StringBuilder read, int length)
325 char[] gaps = new char[length];
326 Arrays.fill(gaps, gapChar);
332 * Get the number of insertions before the start of an aligned read (i.e.
333 * insertions in soft clipped region are counted too)
336 * the SAMRecord for the read
338 * the map of insertions
339 * @return number of insertions before the read starts
341 private int countInsertionsBeforeRead(SAMRecord rec,
342 SortedMap<Integer, Integer> inserts)
346 // add in any insertion gaps before read
347 // although we only need to get the submap from alignmentStart, there won't
348 // be any insertions before that so we can just start at 0
349 SortedMap<Integer, Integer> seqInserts = inserts.subMap(0,
352 Iterator<Map.Entry<Integer, Integer>> it = seqInserts.entrySet()
356 Entry<Integer, Integer> entry = it.next();
357 gaps += entry.getValue();