1 package jalview.datamodel;
3 import java.util.Arrays;
4 import java.util.Iterator;
6 import java.util.Map.Entry;
7 import java.util.SortedMap;
8 import java.util.TreeMap;
10 import htsjdk.samtools.CigarElement;
11 import htsjdk.samtools.CigarOperator;
12 import htsjdk.samtools.SAMFormatException;
13 import htsjdk.samtools.SAMRecord;
14 import jalview.bin.Cache;
16 public class CigarParser
18 private final char gapChar;
20 public CigarParser(char gap)
26 * Apply the CIGAR string to a read sequence and return the updated read.
29 * the SAM record for the read
31 * a map of inserted positions and lengths
32 * @param alignmentStart
33 * start of alignment to be used as offset when padding reads with
36 * sequence to be annotated with inserts
37 * @return string representing read with gaps, clipping etc applied
39 public String parseCigarToSequence(SAMRecord rec,
40 SortedMap<Integer, Integer> insertions, int alignmentStart,
43 StringBuilder newRead = new StringBuilder();
44 Iterator<CigarElement> it = rec.getCigar().getCigarElements()
50 // pad with spaces before read
51 // first count gaps needed to pad to reference position
52 // NB position is 1-based so number of gaps = pos-1
53 int gaps = rec.getStart() - alignmentStart;
55 // now count gaps to pad for insertions in other reads
56 int insertCount = countInsertionsBeforeRead(rec, insertions);
57 addGaps(newRead, gaps + insertCount);
62 CigarElement el = it.next();
63 int length = el.getLength();
65 // which insertions coincide with current CIGAR operation?
66 SortedMap<Integer, Integer> seqInserts = insertions.subMap(
67 rec.getStart() + refnext, rec.getStart() + refnext + length);
69 int[] override = null;
72 if (!seqInserts.containsKey(rec.getStart() + refnext))
75 int pos = rec.getStart() + refnext;
77 "Read " + rec.getReadName() + " has an insertion at "
78 + pos + " which is not in seqInserts");
82 // we just inserted something
83 // remove these inserts now - should be very first entry
84 int count = seqInserts.get(rec.getStart() + refnext);
85 override = new int[] { rec.getStart() + refnext,
86 count - lastinserts };
91 Iterator<Map.Entry<Integer, Integer>> iit = seqInserts.entrySet()
94 String nextSegment = applyCigarOp(el, next, rec, iit, override, seq);
95 newRead.append(nextSegment);
97 if (el.getOperator().consumesReferenceBases())
101 if (el.getOperator().consumesReadBases())
105 if (el.getOperator().equals(CigarOperator.I))
107 // count how many insertions we made
108 lastinserts = length;
111 return newRead.toString();
115 * Apply a cigar operation to a segment of a read, accounting for any
116 * insertions to the reference from other reads in the region
119 * the current CIGAR element
121 * location in read to start applying CIGAR op
125 * SAMRecord corresponding to read
127 * iterator over insertions which coincide with rec
129 * an optional <insertion position, length> which can override a
130 * <position,length> in it to change the length. Set to null if not
135 private String applyCigarOp(CigarElement el, int next,
136 SAMRecord rec, Iterator<Map.Entry<Integer, Integer>> it,
137 int[] override, SequenceI seq)
139 StringBuilder newRead = new StringBuilder();
140 String read = rec.getReadString();
142 int length = el.getLength();
145 switch (el.getOperator())
150 // matched or mismatched residues
151 newRead = applyCigarConsumeReadAndRef(next, length, rec, it,
154 case N: // intron in RNA
159 // just count all the insertions we need to add in the deletion/intron
161 Entry<Integer, Integer> entry = it.next();
162 insertCount += entry.getValue();
164 addGaps(newRead, length + insertCount);
167 // the reference sequence and other reads should have been gapped for
168 // this insertion, so just add in the residues
169 newRead.append(read.substring(nextPos, nextPos + length));
170 seq.addSequenceFeature(new SequenceFeature("INSERTION", "",
171 nextPos + 1, nextPos + length, "bamfile"));
174 // soft clipping - just skip this bit of the read
175 seq.addSequenceFeature(new SequenceFeature("SOFTCLIP", "",
176 nextPos + 1, nextPos + length, "bamfile"));
179 // hard clipping - this stretch will not appear in the read
180 seq.addSequenceFeature(new SequenceFeature("HARDCLIP", "",
181 nextPos + 1, nextPos + length, "bamfile"));
187 return newRead.toString();
191 * Applies a CIGAR operation which consumes both read and reference i.e. M,=,X
194 * next position in read
196 * length of cigar operation
198 * SAMRecord to apply the CIGAR op to
200 * iterator over insertions in the CIGAR region
202 * insertions which have already been accounted for
206 private StringBuilder applyCigarConsumeReadAndRef(int next, int length,
207 SAMRecord rec, Iterator<Map.Entry<Integer, Integer>> it,
208 int[] override, SequenceI seq)
210 StringBuilder newRead = new StringBuilder();
211 String read = rec.getReadString();
213 int nextPos = next; // location of next position in read
214 while (nextPos < next + length)
216 int insertPos = next + length; // location of next insert in read (or end
218 int insertLen = 0; // no of gaps to insert after insertPos
221 // account for sequence already having insertion
222 // if an override entry exists it gives the number of positions still to
223 // be inserted at this point, after accounting for insertions driven by
225 // e.g. if the current read has 2 inserted positions here, and the
226 // insertions list also has 2 inserted positions, the override will
227 // cancel the insert for this read. If the insertions list had 3
228 // insertions, the override would reduce this to 1 insertion.
229 Entry<Integer, Integer> entry = it.next();
230 int key = entry.getKey();
231 insertLen = entry.getValue();
232 if (override != null && key == override[0])
234 insertLen = override[1];
238 insertPos = rec.getReadPositionAtReferencePosition(key) - 1;
242 insertPos = nextPos; // bugfix - trigger by override + insertions in
246 newRead.append(read.substring(nextPos, insertPos));
248 addGaps(newRead, insertLen); // add insertions
254 * Get list of positions inserted to the reference sequence. Note: assumes all
255 * reads are on the same chromosome.
258 * Iterator over all the SAMRecords
259 * @return sorted map of insertion positions <position, insertion size>
265 R1: insert 3 = gap before 3rd res
268 R2: insert 4 = gap before 4th res
271 R3: insert 3,4 = 2 gaps before 3rd res
276 So need to store insertions as (pos, length) giving here (3,1),(4,1),(3,2) - then can sub longest length at position so (3,2), (4,1)
277 Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added
280 public SortedMap<Integer, Integer>[] getInsertions(Iterator<SAMRecord> it)
282 return getInsertions(it, null);
285 public int firstAlColumn = -1, firstRposCol = -1;
287 public SortedMap<Integer, Integer>[] getInsertions(Iterator<SAMRecord> it,
290 SortedMap<Integer, Integer> sinserts[] = new SortedMap[] {
291 new TreeMap<>(), new TreeMap<>() };
292 Range xten = extent != null && extent[0] != null ? extent[0] : null;
295 // check extent of read
296 SAMRecord rec = null;
299 } catch (SAMFormatException ex)
301 Cache.log.info("Bailing on parsing SAM File - see error below",ex);
307 int nstart = rec.getAlignmentStart();
310 nstart = rec.getReferencePositionAtReadPosition(nstart);
312 int nend = rec.getAlignmentEnd();
315 nend = rec.getReferencePositionAtReadPosition(nend);
319 nstart <= 0 ? xten.start : Math.min(nstart, xten.start),
320 nend == 0 ? xten.end : Math.max(nend, xten.end));
323 // check each record for insertions in the CIGAR string
325 // pick the insert map to update
326 int insmap = rec.getReadNegativeStrandFlag() ? 1 : 0;
327 SortedMap<Integer, Integer> inserts = sinserts[insmap];
329 Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
331 int refLocation = rec.getAlignmentStart();
335 while (cit.hasNext())
337 CigarElement el = cit.next();
338 switch (el.getOperator())
342 // ignore soft/hardclipped
346 if (el.getOperator().consumesReadBases()
347 && !el.getOperator().consumesReferenceBases())
349 // add to insertions list, and move along the read
350 // location is the start of next CIGAR segment
351 // because getReferencePositionAtReadPosition returns 0 for read
352 // positions which are not in the reference (like insertions)
354 // if there's already an insertion at this location, keep the
356 // insertion; if there's no insertion keep this one
357 if (!inserts.containsKey(refLocation)
358 || (inserts.containsKey(refLocation)
359 && inserts.get(refLocation) < el.getLength()))
361 inserts.put(refLocation, el.getLength());
365 if (el.getOperator().consumesReferenceBases())
367 if (el.getOperator().consumesReadBases() && alcol == 0
368 && refLocation + 1 > extent[0].start)
370 // first match position : last rloc + first match op
372 alpos = rec.getReferencePositionAtReadPosition(rloc + 1);
374 refLocation += el.getLength();
378 if (el.getOperator().consumesReadBases()
379 || el.getOperator().consumesReferenceBases())
381 rloc += el.getLength();
384 // Wrong: hack that fails to relocate reference to correct place in
386 if (firstAlColumn < 0)
388 firstAlColumn = alcol;
389 firstRposCol = alpos;
391 } while (it.hasNext());
400 * Add sequence of gaps to a read
407 private void addGaps(StringBuilder read, int length)
411 char[] gaps = new char[length];
412 Arrays.fill(gaps, gapChar);
418 * Get the number of insertions before the start of an aligned read (i.e.
419 * insertions in soft clipped region are counted too)
422 * the SAMRecord for the read
424 * the map of insertions
425 * @return number of insertions before the read starts
427 private int countInsertionsBeforeRead(SAMRecord rec,
428 SortedMap<Integer, Integer> inserts)
432 // add in any insertion gaps before read
433 // although we only need to get the submap from alignmentStart, there won't
434 // be any insertions before that so we can just start at 0
435 SortedMap<Integer, Integer> seqInserts = inserts.subMap(0,
438 Iterator<Map.Entry<Integer, Integer>> it = seqInserts.entrySet()
442 Entry<Integer, Integer> entry = it.next();
443 gaps += entry.getValue();