1 package jalview.datamodel;
3 import java.util.Arrays;
4 import java.util.Iterator;
6 import java.util.Map.Entry;
7 import java.util.SortedMap;
8 import java.util.TreeMap;
10 import htsjdk.samtools.CigarElement;
11 import htsjdk.samtools.CigarOperator;
12 import htsjdk.samtools.SAMRecord;
14 public class CigarParser
16 private final char gapChar;
18 public CigarParser(char gap)
24 * Apply the CIGAR string to a read sequence and return the updated read.
27 * the SAM record for the read
29 * a map of inserted positions and lengths
30 * @return string representing read with gaps, clipping etc applied
32 public String parseCigarToSequence(SAMRecord rec,
33 SortedMap<Integer, Integer> insertions)
35 StringBuilder newRead = new StringBuilder();
36 Iterator<CigarElement> it = rec.getCigar().getCigarElements()
42 // pad with spaces before read
43 // first count gaps needed to pad to reference position
44 // NB position is 1-based so number of gaps = pos-1
45 int gaps = rec.getStart() - 1; // rec.getUnclippedStart() - 1;
46 // now count gaps to pad for insertions in other reads
47 int insertCount = countInsertionsBeforeRead(rec, insertions);
48 addGaps(newRead, gaps + insertCount);
49 // addGaps(newTrimmedRead, gaps + insertCount);
54 CigarElement el = it.next();
55 int length = el.getLength();
57 // which insertions coincide with current CIGAR operation?
58 SortedMap<Integer, Integer> seqInserts = insertions.subMap(
59 rec.getStart() + refnext, rec.getStart() + refnext + length);
61 int[] override = null;
64 if (!seqInserts.containsKey(rec.getStart() + refnext))
67 int pos = rec.getStart() + refnext;
68 System.out.println("Insertion not added to seqInserts: " + pos);
73 // we just inserted something
74 // remove these inserts now - should be very first entry
75 int count = seqInserts.get(rec.getStart() + refnext);
76 override = new int[] { rec.getStart() + refnext,
77 count - lastinserts };
82 Iterator<Map.Entry<Integer, Integer>> iit = seqInserts.entrySet()
85 String nextSegment = applyCigarOp(el, next, rec, iit, override);
86 newRead.append(nextSegment);
88 if (el.getOperator().consumesReferenceBases())
92 if (el.getOperator().consumesReadBases())
96 if (el.getOperator().equals(CigarOperator.I))
98 // count how many insertions we made
102 return newRead.toString();
106 * Apply a cigar operation to a segment of a read, accounting for any
107 * insertions to the reference from other reads in the region
110 * the current CIGAR element
112 * location in read to start applying CIGAR op
116 * SAMRecord corresponding to read
118 * iterator over insertions which coincide with rec
120 * an optional <insertion position, length> which can override a
121 * <position,length> in it to change the length. Set to null if not
125 private String applyCigarOp(CigarElement el, int next,
126 SAMRecord rec, Iterator<Map.Entry<Integer, Integer>> it,
129 StringBuilder newRead = new StringBuilder();
130 String read = rec.getReadString();
132 int length = el.getLength();
135 switch (el.getOperator())
140 // matched or mismatched residues
141 newRead = applyCigarConsumeReadAndRef(next, length, rec, it,
144 case N: // intron in RNA
149 // just count all the insertions we need to add in the deletion/intron
151 Entry<Integer, Integer> entry = it.next();
152 insertCount += entry.getValue();
154 addGaps(newRead, length + insertCount);
157 // the reference sequence and other reads should have been gapped for
158 // this insertion, so just add in the residues
159 newRead.append(read.substring(nextPos, nextPos + length));
162 // soft clipping - just skip this bit of the read
164 // hard clipping - this stretch will not appear in the read
169 return newRead.toString();
173 * Applies a CIGAR operation which consumes both read and reference i.e. M,=,X
176 * next position in read
178 * length of cigar operation
180 * SAMRecord to apply the CIGAR op to
182 * iterator over insertions in the CIGAR region
184 * insertions which have already been accounted for
187 private StringBuilder applyCigarConsumeReadAndRef(int next, int length,
188 SAMRecord rec, Iterator<Map.Entry<Integer, Integer>> it,
191 StringBuilder newRead = new StringBuilder();
192 String read = rec.getReadString();
195 while (nextPos < next + length)
197 int insertPos = next + length;
201 // account for sequence already having insertion
202 // if an override entry exists it gives the number of positions still to
203 // be inserted at this point, after accounting for insertions driven by
205 // e.g. if the current read has 2 inserted positions here, and the
206 // insertions list also has 2 inserted positions, the override will
207 // cancel the insert for this read. If the insertions list had 3
208 // insertions, the override would reduce this to 1 insertion.
209 Entry<Integer, Integer> entry = it.next();
210 int key = entry.getKey();
211 insertLen = entry.getValue();
212 if (override != null && key == override[0])
214 insertLen = override[1];
218 insertPos = rec.getReadPositionAtReferencePosition(key) - 1;
221 newRead.append(read.substring(nextPos, insertPos));
223 addGaps(newRead, insertLen); // add insertions
229 * Get list of positions inserted to the reference sequence. Note: assumes all
230 * reads are on the same chromosome.
233 * Iterator over all the SAMRecords
234 * @return sorted map of insertion positions <position, insertion size>
240 R1: insert 3 = gap before 3rd res
243 R2: insert 4 = gap before 4th res
246 R3: insert 3,4 = 2 gaps before 3rd res
251 So need to store insertions as (pos, length) giving here (3,1),(4,1),(3,2) - then can sub longest length at position so (3,2), (4,1)
252 Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added
255 public SortedMap<Integer, Integer> getInsertions(Iterator<SAMRecord> it)
257 SortedMap<Integer, Integer> inserts = new TreeMap<>();
260 // check each record for insertions in the CIGAR string
261 SAMRecord rec = it.next();
262 Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
265 while (cit.hasNext())
267 CigarElement el = cit.next();
268 switch (el.getOperator())
271 // add to insertions list, and move along the read
272 // location is 1 past the current position of next
273 // (note if we try to retrieve +1 position from reference by calling
274 // getReferencePositionAtReadPosition(next+1) we get 0 because it's an
276 int refLocation = rec.getReferencePositionAtReadPosition(next)
279 // if there's already an insertion at this location, keep the longest
280 // insertion; if there's no insertion keep this one
281 if (!inserts.containsKey(refLocation)
282 || (inserts.containsKey(refLocation)
283 && inserts.get(refLocation) < el.getLength()))
285 inserts.put(refLocation, el.getLength());
287 next += el.getLength();
291 // match to reference, or soft clip, move along the read
292 next += el.getLength();
295 // deletions, introns etc don't consume any residues from the read
305 * Add sequence of gaps to a read
312 private void addGaps(StringBuilder read, int length)
316 char[] gaps = new char[length];
317 Arrays.fill(gaps, gapChar);
323 * Get the number of insertions before the start of an aligned read (i.e.
324 * insertions in soft clipped region are counted too)
327 * the SAMRecord for the read
329 * the map of insertions
330 * @return number of insertions before the read starts
332 private int countInsertionsBeforeRead(SAMRecord rec,
333 SortedMap<Integer, Integer> inserts)
337 // add in any insertion gaps before read
338 // TODO start point should be start of alignment not 0
339 SortedMap<Integer, Integer> seqInserts = inserts.subMap(0,
342 Iterator<Map.Entry<Integer, Integer>> it = seqInserts.entrySet()
346 Entry<Integer, Integer> entry = it.next();
347 gaps += entry.getValue();