1 package jalview.datamodel;
3 import java.util.Arrays;
4 import java.util.Iterator;
6 import java.util.Map.Entry;
7 import java.util.SortedMap;
8 import java.util.TreeMap;
10 import htsjdk.samtools.CigarElement;
11 import htsjdk.samtools.CigarOperator;
12 import htsjdk.samtools.SAMRecord;
14 public class CigarParser
16 private final char gapChar;
18 public CigarParser(char gapCharacter)
20 gapChar = gapCharacter;
24 * Apply the CIGAR string to a read sequence and return the updated read.
27 * the SAM record for the read
29 * a map of inserted positions and lengths
30 * @return string representing read with gaps, clipping etc applied
32 public String parseCigarToSequence(SAMRecord rec,
33 SortedMap<Integer, Integer> insertions)
35 Iterator<CigarElement> it = rec.getCigar().getCigarElements()
38 StringBuilder newRead = new StringBuilder();
42 // pad with spaces before read
43 // first count gaps needed to pad to reference position
44 // NB position is 1-based so number of gaps = pos-1
45 int gaps = rec.getUnclippedStart() - 1;
46 // now count gaps to pad for insertions in other reads
47 int insertCount = countInsertionsBeforeRead(rec, insertions);
48 addGaps(newRead, gaps + insertCount);
53 CigarElement el = it.next();
54 int length = el.getLength();
56 // which insertions coincide with current CIGAR operation?
57 SortedMap<Integer, Integer> seqInserts = insertions.subMap(
58 rec.getStart() + refnext, rec.getStart() + refnext + length);
60 int[] override = null;
63 // we just inserted something
64 // remove these inserts now - should be very first entry
65 int count = seqInserts.get(rec.getStart() + refnext);
66 override = new int[] { rec.getStart() + refnext,
67 count - lastinserts };
71 Iterator<Map.Entry<Integer, Integer>> iit = seqInserts.entrySet()
74 newRead.append(applyCigarOp(el, next, rec, iit, override));
76 if (el.getOperator().consumesReferenceBases())
80 if (el.getOperator().consumesReadBases())
84 if (el.getOperator().equals(CigarOperator.I))
86 // count how many insertions we made
90 return newRead.toString();
94 * Apply a cigar operation to a segment of a read, accounting for any
95 * insertions to the reference from other reads in the region
98 * the current CIGAR element
100 * location in read to start applying CIGAR op
104 * SAMRecord corresponding to read
106 * iterator over insertions which coincide with rec
108 * an optional <insertion position, length> which can override a
109 * <position,length> in it to change the length. Set to null if not
113 private String applyCigarOp(CigarElement el, int next,
114 SAMRecord rec, Iterator<Map.Entry<Integer, Integer>> it,
117 StringBuilder newRead = new StringBuilder();
118 String read = rec.getReadString();
120 int length = el.getLength();
123 switch (el.getOperator())
128 // matched or mismatched residues
129 newRead = applyCigarConsumeReadAndRef(next, length, rec, it,
132 case N: // intron in RNA
137 // just count all the insertions we need to add in the deletion/intron
139 Entry<Integer, Integer> entry = it.next();
140 insertCount += entry.getValue();
142 addGaps(newRead, length + insertCount);
145 // soft clipping - just skip this bit of the read
149 read.substring(nextPos, nextPos + length).toLowerCase());
150 // nextPos += length;
153 // the reference sequence and other reads should have been gapped for
154 // this insertion, so just add in the residues
155 newRead.append(read.substring(nextPos, nextPos + length));
156 // nextPos += length;
159 // hard clipping - this stretch will not appear in the read
164 return newRead.toString();
168 * Applies a CIGAR operation which consumes both read and reference i.e. M,=,X
171 * next position in read
173 * length of cigar operation
175 * SAMRecord to apply the CIGAR op to
177 * iterator over insertions in the CIGAR region
179 * insertions which have already been accounted for
182 private StringBuilder applyCigarConsumeReadAndRef(int next, int length,
183 SAMRecord rec, Iterator<Map.Entry<Integer, Integer>> it,
186 StringBuilder newRead = new StringBuilder();
187 String read = rec.getReadString();
190 while (nextPos < next + length)
192 int insertPos = next + length;
196 // account for sequence already having insertion
197 // if an override entry exists it gives the number of positions still to
198 // be inserted at this point, after accounting for insertions driven by
200 // e.g. if the current read has 2 inserted positions here, and the
201 // insertions list also has 2 inserted positions, the override will
202 // cancel the insert for this read. If the insertions list had 3
203 // insertions, the override would reduce this to 1 insertion.
204 Entry<Integer, Integer> entry = it.next();
205 int key = entry.getKey();
206 insertLen = entry.getValue();
207 if (override != null && key == override[0])
209 insertLen = override[1];
213 insertPos = rec.getReadPositionAtReferencePosition(key) - 1;
216 newRead.append(read.substring(nextPos, insertPos));
218 addGaps(newRead, insertLen); // add insertions
224 * Get list of positions inserted to the reference sequence. Note: assumes all
225 * reads are on the same chromosome.
228 * Iterator over all the SAMRecords
229 * @return sorted map of insertion positions <position, insertion size>
235 R1: insert 3 = gap before 3rd res
238 R2: insert 4 = gap before 4th res
241 R3: insert 3,4 = 2 gaps before 3rd res
246 So need to store insertions as (pos, length) giving here (3,1),(4,1),(3,2) - then can sub longest length at position so (3,2), (4,1)
247 Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added
250 public SortedMap<Integer, Integer> getInsertions(Iterator<SAMRecord> it)
252 SortedMap<Integer, Integer> insertions = new TreeMap<>();
255 // check each record for insertions in the CIGAR string
256 SAMRecord rec = it.next();
257 Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
260 while (cit.hasNext())
262 CigarElement el = cit.next();
263 switch (el.getOperator())
266 // add to insertions list, and move along the read
267 // location is 1 past the current position of next
268 // (note if we try to retrieve +1 position from reference by calling
269 // getReferencePositionAtReadPosition(next+1) we get 0 because it's an
271 int refLocation = rec.getReferencePositionAtReadPosition(next)
274 // if there's already an insertion at this location, keep the longest
275 // insertion; if there's no insertion keep this one
276 if (!insertions.containsKey(refLocation)
277 || (insertions.containsKey(refLocation)
278 && insertions.get(refLocation) < el.getLength()))
280 insertions.put(refLocation, el.getLength());
282 next += el.getLength();
286 // match to reference, or soft clip, move along the read
287 next += el.getLength();
290 // deletions, introns etc don't consume any residues from the read
300 * Add sequence of gaps to a read
307 private void addGaps(StringBuilder read, int length)
311 char[] gaps = new char[length];
312 Arrays.fill(gaps, gapChar);
318 * Get the number of insertions before the start of an aligned read (i.e.
319 * insertions in soft clipped region are counted too)
322 * the SAMRecord for the read
324 * the map of insertions
325 * @return number of insertions before the read starts
327 private int countInsertionsBeforeRead(SAMRecord rec,
328 SortedMap<Integer, Integer> insertions)
332 // add in any insertion gaps before read
333 // TODO start point should be start of alignment not 0
334 SortedMap<Integer, Integer> seqInserts = insertions.subMap(0,
337 Iterator<Map.Entry<Integer, Integer>> it = seqInserts.entrySet()
341 Entry<Integer, Integer> entry = it.next();
342 gaps += entry.getValue();