{
private final char gapChar;
- public CigarParser(char gapCharacter)
+ public CigarParser(char gap)
{
- gapChar = gapCharacter;
+ gapChar = gap;
}
/**
* the SAM record for the read
* @param insertions
* a map of inserted positions and lengths
+ * @param alignmentStart
+ * start of alignment to be used as offset when padding reads with
+ * gaps
* @return string representing read with gaps, clipping etc applied
*/
public String parseCigarToSequence(SAMRecord rec,
- SortedMap<Integer, Integer> insertions)
+ SortedMap<Integer, Integer> insertions, int alignmentStart)
{
+ StringBuilder newRead = new StringBuilder();
Iterator<CigarElement> it = rec.getCigar().getCigarElements()
.iterator();
- StringBuilder newRead = new StringBuilder();
int next = 0;
int refnext = 0;
// pad with spaces before read
// first count gaps needed to pad to reference position
// NB position is 1-based so number of gaps = pos-1
- int gaps = rec.getUnclippedStart() - 1;
+ int gaps = rec.getStart() - alignmentStart;
+
// now count gaps to pad for insertions in other reads
int insertCount = countInsertionsBeforeRead(rec, insertions);
addGaps(newRead, gaps + insertCount);
int[] override = null;
if (lastinserts > 0)
{
- // we just inserted something
- // remove these inserts now - should be very first entry
- int count = seqInserts.get(rec.getStart() + refnext);
- override = new int[] { rec.getStart() + refnext,
- count - lastinserts };
- lastinserts = 0;
+ if (!seqInserts.containsKey(rec.getStart() + refnext))
+ {
+ // ERROR
+ int pos = rec.getStart() + refnext;
+ System.out.println(
+ "Read " + rec.getReadName() + " has an insertion at "
+ + pos + " which is not in seqInserts");
+ }
+ else
+ {
+ // we just inserted something
+ // remove these inserts now - should be very first entry
+ int count = seqInserts.get(rec.getStart() + refnext);
+ override = new int[] { rec.getStart() + refnext,
+ count - lastinserts };
+ lastinserts = 0;
+ }
}
Iterator<Map.Entry<Integer, Integer>> iit = seqInserts.entrySet()
.iterator();
- newRead.append(applyCigarOp(el, next, rec, iit, override));
+ String nextSegment = applyCigarOp(el, next, rec, iit, override);
+ newRead.append(nextSegment);
if (el.getOperator().consumesReferenceBases())
{
}
addGaps(newRead, length + insertCount);
break;
- case S:
- // soft clipping - just skip this bit of the read
- // do nothing
-
- newRead.append(
- read.substring(nextPos, nextPos + length).toLowerCase());
- // nextPos += length;
- break;
case I:
// the reference sequence and other reads should have been gapped for
// this insertion, so just add in the residues
newRead.append(read.substring(nextPos, nextPos + length));
- // nextPos += length;
break;
+ case S:
+ // soft clipping - just skip this bit of the read
case H:
// hard clipping - this stretch will not appear in the read
default:
StringBuilder newRead = new StringBuilder();
String read = rec.getReadString();
- int nextPos = next;
+ int nextPos = next; // location of next position in read
while (nextPos < next + length)
{
- int insertPos = next + length;
- int insertLen = 0;
+ int insertPos = next + length; // location of next insert in read (or end
+ // of read)
+ int insertLen = 0; // no of gaps to insert after insertPos
if (it.hasNext())
{
// account for sequence already having insertion
{
insertPos = rec.getReadPositionAtReferencePosition(key) - 1;
}
+ else
+ {
+ insertPos = nextPos; // bugfix - trigger by override + insertions in
+ // same M stretch
+ }
}
newRead.append(read.substring(nextPos, insertPos));
nextPos = insertPos;
*/
public SortedMap<Integer, Integer> getInsertions(Iterator<SAMRecord> it)
{
- SortedMap<Integer, Integer> insertions = new TreeMap<>();
+ SortedMap<Integer, Integer> inserts = new TreeMap<>();
while (it.hasNext())
{
// check each record for insertions in the CIGAR string
SAMRecord rec = it.next();
Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
.iterator();
- int next = 0;
+ int next = 1;
while (cit.hasNext())
{
CigarElement el = cit.next();
{
case I:
// add to insertions list, and move along the read
- // location is 1 past the current position of next
- // (note if we try to retrieve +1 position from reference by calling
- // getReferencePositionAtReadPosition(next+1) we get 0 because it's an
- // insertion!)
- int refLocation = rec.getReferencePositionAtReadPosition(next)
- + 1;
+ // location is the start of next CIGAR segment
+ // because getReferencePositionAtReadPosition returns 0 for read
+ // positions which are not in the reference (like insertions)
+ int refLocation = rec.getReferencePositionAtReadPosition(
+ next + el.getLength());
// if there's already an insertion at this location, keep the longest
// insertion; if there's no insertion keep this one
- if (!insertions.containsKey(refLocation)
- || (insertions.containsKey(refLocation)
- && insertions.get(refLocation) < el.getLength()))
+ if (!inserts.containsKey(refLocation)
+ || (inserts.containsKey(refLocation)
+ && inserts.get(refLocation) < el.getLength()))
{
- insertions.put(refLocation, el.getLength());
+ inserts.put(refLocation, el.getLength());
}
next += el.getLength();
break;
}
}
- return insertions;
+ return inserts;
}
/**
*
* @param rec
* the SAMRecord for the read
- * @param insertions
+ * @param inserts
* the map of insertions
* @return number of insertions before the read starts
*/
private int countInsertionsBeforeRead(SAMRecord rec,
- SortedMap<Integer, Integer> insertions)
+ SortedMap<Integer, Integer> inserts)
{
int gaps = 0;
// add in any insertion gaps before read
- // TODO start point should be start of alignment not 0
- SortedMap<Integer, Integer> seqInserts = insertions.subMap(0,
+ // although we only need to get the submap from alignmentStart, there won't
+ // be any insertions before that so we can just start at 0
+ SortedMap<Integer, Integer> seqInserts = inserts.subMap(0,
rec.getStart());
Iterator<Map.Entry<Integer, Integer>> it = seqInserts.entrySet()