* the SAM record for the read
* @param insertions
* a map of inserted positions and lengths
+ * @param alignmentStart
+ * start of alignment to be used as offset when padding reads with
+ * gaps
+ * @param seq
+ * sequence to be annotated with inserts
* @return string representing read with gaps, clipping etc applied
*/
public String parseCigarToSequence(SAMRecord rec,
- SortedMap<Integer, Integer> insertions)
+ SortedMap<Integer, Integer> insertions, int alignmentStart,
+ SequenceI seq)
{
StringBuilder newRead = new StringBuilder();
Iterator<CigarElement> it = rec.getCigar().getCigarElements()
// pad with spaces before read
// first count gaps needed to pad to reference position
// NB position is 1-based so number of gaps = pos-1
- int gaps = rec.getStart() - 1; // rec.getUnclippedStart() - 1;
+ int gaps = rec.getStart() - alignmentStart;
+
// now count gaps to pad for insertions in other reads
int insertCount = countInsertionsBeforeRead(rec, insertions);
addGaps(newRead, gaps + insertCount);
- // addGaps(newTrimmedRead, gaps + insertCount);
int lastinserts = 0;
while (it.hasNext())
{
// ERROR
int pos = rec.getStart() + refnext;
- System.out.println("Insertion not added to seqInserts: " + pos);
+ System.out.println(
+ "Read " + rec.getReadName() + " has an insertion at "
+ + pos + " which is not in seqInserts");
}
else
{
-
// we just inserted something
// remove these inserts now - should be very first entry
int count = seqInserts.get(rec.getStart() + refnext);
Iterator<Map.Entry<Integer, Integer>> iit = seqInserts.entrySet()
.iterator();
- String nextSegment = applyCigarOp(el, next, rec, iit, override);
+ String nextSegment = applyCigarOp(el, next, rec, iit, override, seq);
newRead.append(nextSegment);
if (el.getOperator().consumesReferenceBases())
* an optional <insertion position, length> which can override a
* <position,length> in it to change the length. Set to null if not
* used.
+ * @param seq
* @return
*/
private String applyCigarOp(CigarElement el, int next,
SAMRecord rec, Iterator<Map.Entry<Integer, Integer>> it,
- int[] override)
+ int[] override, SequenceI seq)
{
StringBuilder newRead = new StringBuilder();
String read = rec.getReadString();
case EQ:
// matched or mismatched residues
newRead = applyCigarConsumeReadAndRef(next, length, rec, it,
- override);
+ override, seq);
break;
case N: // intron in RNA
case D: // deletion
// the reference sequence and other reads should have been gapped for
// this insertion, so just add in the residues
newRead.append(read.substring(nextPos, nextPos + length));
+ seq.addSequenceFeature(new SequenceFeature("INSERTION", "",
+ nextPos + 1, nextPos + length, "bamfile"));
break;
case S:
// soft clipping - just skip this bit of the read
+ seq.addSequenceFeature(new SequenceFeature("SOFTCLIP", "",
+ nextPos + 1, nextPos + length, "bamfile"));
+ break;
case H:
// hard clipping - this stretch will not appear in the read
+ seq.addSequenceFeature(new SequenceFeature("HARDCLIP", "",
+ nextPos + 1, nextPos + length, "bamfile"));
+ break;
default:
break;
}
* iterator over insertions in the CIGAR region
* @param override
* insertions which have already been accounted for
+ * @param seq
* @return
*/
private StringBuilder applyCigarConsumeReadAndRef(int next, int length,
SAMRecord rec, Iterator<Map.Entry<Integer, Integer>> it,
- int[] override)
+ int[] override, SequenceI seq)
{
StringBuilder newRead = new StringBuilder();
String read = rec.getReadString();
- int nextPos = next;
+ int nextPos = next; // location of next position in read
while (nextPos < next + length)
{
- int insertPos = next + length;
- int insertLen = 0;
+ int insertPos = next + length; // location of next insert in read (or end
+ // of read)
+ int insertLen = 0; // no of gaps to insert after insertPos
if (it.hasNext())
{
// account for sequence already having insertion
{
insertPos = rec.getReadPositionAtReferencePosition(key) - 1;
}
+ else
+ {
+ insertPos = nextPos; // bugfix - trigger by override + insertions in
+ // same M stretch
+ }
}
newRead.append(read.substring(nextPos, insertPos));
nextPos = insertPos;
Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added
*/
- public SortedMap<Integer, Integer> getInsertions(Iterator<SAMRecord> it)
+ public SortedMap<Integer, Integer>[] getInsertions(Iterator<SAMRecord> it)
{
- SortedMap<Integer, Integer> inserts = new TreeMap<>();
- while (it.hasNext())
+ return getInsertions(it, null);
+ }
+
+ public int firstAlColumn = -1, firstRposCol = -1;
+
+ public SortedMap<Integer, Integer>[] getInsertions(Iterator<SAMRecord> it,
+ Range[] extent)
+ {
+ SortedMap<Integer, Integer> sinserts[] = new SortedMap[] {
+ new TreeMap<>(), new TreeMap<>() };
+ Range xten = extent != null && extent[0] != null ? extent[0] : null;
+ do
{
- // check each record for insertions in the CIGAR string
+ // check extent of read
SAMRecord rec = it.next();
+ if (extent != null)
+ {
+
+ int nstart = rec.getAlignmentStart();
+ if (nstart != 0)
+ {
+ nstart = rec.getReferencePositionAtReadPosition(nstart);
+ }
+ int nend = rec.getAlignmentEnd();
+ if (nend != 0)
+ {
+ nend = rec.getReferencePositionAtReadPosition(nend);
+ }
+
+ xten = new Range(
+ nstart <= 0 ? xten.start : Math.min(nstart, xten.start),
+ nend == 0 ? xten.end : Math.max(nend, xten.end));
+ }
+
+ // check each record for insertions in the CIGAR string
+
+ // pick the insert map to update
+ int insmap = rec.getReadNegativeStrandFlag() ? 1 : 0;
+ SortedMap<Integer, Integer> inserts = sinserts[insmap];
+
Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
.iterator();
- int next = 0;
+ int refLocation = rec.getAlignmentStart();
+ int rloc = 0;
+ int alcol = 0;
+ int alpos = 0;
while (cit.hasNext())
{
CigarElement el = cit.next();
switch (el.getOperator())
{
- case I:
- // add to insertions list, and move along the read
- // location is 1 past the current position of next
- // (note if we try to retrieve +1 position from reference by calling
- // getReferencePositionAtReadPosition(next+1) we get 0 because it's an
- // insertion!)
- int refLocation = rec.getReferencePositionAtReadPosition(next)
- + 1;
-
- // if there's already an insertion at this location, keep the longest
- // insertion; if there's no insertion keep this one
- if (!inserts.containsKey(refLocation)
- || (inserts.containsKey(refLocation)
- && inserts.get(refLocation) < el.getLength()))
- {
- inserts.put(refLocation, el.getLength());
- }
- next += el.getLength();
- break;
- case M:
case S:
- // match to reference, or soft clip, move along the read
- next += el.getLength();
+ case H:
+ // ignore soft/hardclipped
+
break;
default:
- // deletions, introns etc don't consume any residues from the read
- break;
+ if (el.getOperator().consumesReadBases()
+ && !el.getOperator().consumesReferenceBases())
+ {
+ // add to insertions list, and move along the read
+ // location is the start of next CIGAR segment
+ // because getReferencePositionAtReadPosition returns 0 for read
+ // positions which are not in the reference (like insertions)
+
+ // if there's already an insertion at this location, keep the
+ // longest
+ // insertion; if there's no insertion keep this one
+ if (!inserts.containsKey(refLocation)
+ || (inserts.containsKey(refLocation)
+ && inserts.get(refLocation) < el.getLength()))
+ {
+ inserts.put(refLocation, el.getLength());
+ }
+ break;
+ }
+ if (el.getOperator().consumesReferenceBases())
+ {
+ if (el.getOperator().consumesReadBases() && alcol == 0
+ && refLocation + 1 > extent[0].start)
+ {
+ // first match position : last rloc + first match op
+ alcol = rloc + 1;
+ alpos = rec.getReferencePositionAtReadPosition(rloc + 1);
+ }
+ refLocation += el.getLength();
+ }
+
+ }
+ if (el.getOperator().consumesReadBases()
+ || el.getOperator().consumesReferenceBases())
+ {
+ rloc += el.getLength();
}
}
-
+ // Wrong: hack that fails to relocate reference to correct place in
+ // sequence
+ if (firstAlColumn < 0)
+ {
+ firstAlColumn = alcol;
+ firstRposCol = alpos;
+ }
+ } while (it.hasNext());
+ if (xten != null)
+ {
+ extent[0] = xten;
}
- return inserts;
+ return sinserts;
}
/**
int gaps = 0;
// add in any insertion gaps before read
- // TODO start point should be start of alignment not 0
+ // although we only need to get the submap from alignmentStart, there won't
+ // be any insertions before that so we can just start at 0
SortedMap<Integer, Integer> seqInserts = inserts.subMap(0,
rec.getStart());