Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added
*/
- public SortedMap<Integer, Integer> getInsertions(Iterator<SAMRecord> it)
+ public SortedMap<Integer, Integer>[] getInsertions(Iterator<SAMRecord> it)
{
- SortedMap<Integer, Integer> inserts = new TreeMap<>();
- while (it.hasNext())
+ return getInsertions(it, null);
+ }
+
+ public int firstAlColumn = -1, firstRposCol = -1;
+
+ public SortedMap<Integer, Integer>[] getInsertions(Iterator<SAMRecord> it,
+ Range[] extent)
+ {
+ SortedMap<Integer, Integer> sinserts[] = new SortedMap[] {
+ new TreeMap<>(), new TreeMap<>() };
+ Range xten = extent != null && extent[0] != null ? extent[0] : null;
+ do
{
- // check each record for insertions in the CIGAR string
+ // check extent of read
SAMRecord rec = it.next();
+ if (extent != null)
+ {
+
+ int nstart = rec.getAlignmentStart();
+ if (nstart != 0)
+ {
+ nstart = rec.getReferencePositionAtReadPosition(nstart);
+ }
+ int nend = rec.getAlignmentEnd();
+ if (nend != 0)
+ {
+ nend = rec.getReferencePositionAtReadPosition(nend);
+ }
+
+ xten = new Range(
+ nstart <= 0 ? xten.start : Math.min(nstart, xten.start),
+ nend == 0 ? xten.end : Math.max(nend, xten.end));
+ }
+
+ // check each record for insertions in the CIGAR string
+
+ // pick the insert map to update
+ int insmap = rec.getReadNegativeStrandFlag() ? 1 : 0;
+ SortedMap<Integer, Integer> inserts = sinserts[insmap];
+
Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
.iterator();
- int next = 1;
+ int refLocation = rec.getAlignmentStart();
+ int rloc = 0;
+ int alcol = 0;
+ int alpos = 0;
while (cit.hasNext())
{
CigarElement el = cit.next();
switch (el.getOperator())
{
- case I:
- // add to insertions list, and move along the read
- // location is the start of next CIGAR segment
- // because getReferencePositionAtReadPosition returns 0 for read
- // positions which are not in the reference (like insertions)
- int refLocation = rec.getReferencePositionAtReadPosition(
- next + el.getLength());
-
- // if there's already an insertion at this location, keep the longest
- // insertion; if there's no insertion keep this one
- if (!inserts.containsKey(refLocation)
- || (inserts.containsKey(refLocation)
- && inserts.get(refLocation) < el.getLength()))
- {
- inserts.put(refLocation, el.getLength());
- }
- next += el.getLength();
- break;
- case M:
case S:
- // match to reference, or soft clip, move along the read
- next += el.getLength();
+ case H:
+ // ignore soft/hardclipped
+
break;
default:
- // deletions, introns etc don't consume any residues from the read
- break;
+ if (el.getOperator().consumesReadBases()
+ && !el.getOperator().consumesReferenceBases())
+ {
+ // add to insertions list, and move along the read
+ // location is the start of next CIGAR segment
+ // because getReferencePositionAtReadPosition returns 0 for read
+ // positions which are not in the reference (like insertions)
+
+ // if there's already an insertion at this location, keep the
+ // longest
+ // insertion; if there's no insertion keep this one
+ if (!inserts.containsKey(refLocation)
+ || (inserts.containsKey(refLocation)
+ && inserts.get(refLocation) < el.getLength()))
+ {
+ inserts.put(refLocation, el.getLength());
+ }
+ break;
+ }
+ if (el.getOperator().consumesReferenceBases())
+ {
+ if (el.getOperator().consumesReadBases() && alcol == 0
+ && refLocation + 1 > extent[0].start)
+ {
+ // first match position : last rloc + first match op
+ alcol = rloc + 1;
+ alpos = rec.getReferencePositionAtReadPosition(rloc + 1);
+ }
+ refLocation += el.getLength();
+ }
+
+ }
+ if (el.getOperator().consumesReadBases()
+ || el.getOperator().consumesReferenceBases())
+ {
+ rloc += el.getLength();
}
}
-
+ // Wrong: hack that fails to relocate reference to correct place in
+ // sequence
+ if (firstAlColumn < 0)
+ {
+ firstAlColumn = alcol;
+ firstRposCol = alpos;
+ }
+ } while (it.hasNext());
+ if (xten != null)
+ {
+ extent[0] = xten;
}
- return inserts;
+ return sinserts;
}
/**
return null;
}
+ private StringBuilder insertRefSeq(
+ SortedMap<Integer, Integer> insertions, Range xtent)
+ {
+ StringBuilder refseq = new StringBuilder();
+ int inserted = 0;
+ for (int p = xtent.start; p < xtent.end; p++)
+ {
+ refseq.append('N');
+ }
+ for (Map.Entry<Integer, Integer> insert : insertions.entrySet())
+ {
+ int inspos = insert.getKey() - xtent.start + inserted;
+ if (inspos < 0)
+ {
+ System.out.println("Ignoring -ve insert position " + insert.getKey()
+ + " of " + insert.getValue() +
+ " (alpos: " + inspos + ")");
+ inspos = 0;
+ // continue;
+ }
+ for (int i = 0, j = insert.getValue(); i < j; i++)
+ {
+ inserted++;
+ refseq.insert(inspos, '-');
+ }
+ }
+ return refseq;
+ }
+
+ private void padRef(SequenceI ref, CigarParser cig)
+ { // pad to column
+ int padding = cig.firstAlColumn - ref.findIndex(cig.firstRposCol);
+ System.out.println("Padding " + padding + " to move refseq position "
+ + cig.firstRposCol + " to " + cig.firstAlColumn + "col.");
+
+ ref.insertCharAt(0, padding, '-');
+ }
+
@Override
public void parse()
{
SAMRecordIterator it = fileReader.query(chromosome, start, end,
false);
CigarParser parser = new CigarParser('-');
- SortedMap<Integer, Integer> insertions = parser.getInsertions(it);
+ Range[] xtent = new Range[] { new Range(start, end) };
+ SortedMap<Integer, Integer> insertions[] = parser.getInsertions(it,
+ xtent);
it.close();
+ SequenceI refSeq = new Sequence("chr:" + chromosome,
+ insertRefSeq(insertions[0], xtent[0])
+ .toString());
+
+ refSeq.setStart(xtent[0].start);
+ refSeq.setEnd(xtent[0].end);
+ SequenceI revRefSeq = new Sequence("rev:chr:" + chromosome,
+ insertRefSeq(insertions[1], xtent[0])
+ .toString());
+ revRefSeq.setStart(xtent[0].start);
+ revRefSeq.setEnd(xtent[0].end);
+
+ // Hack to move the seqs along
+ padRef(refSeq, parser);
+ padRef(revRefSeq, parser);
+
it = fileReader.query(chromosome, start, end, false);
+
+ ArrayList<SequenceI> fwd = new ArrayList(), rev = new ArrayList();
while (it.hasNext())
{
SAMRecord rec = it.next();
}
// make dataset sequence: start at 1, end at read length
- SequenceI seq = new Sequence(rec.getReadName(),
+ SequenceI seq = new Sequence(
+ "" + (rec.getReadNegativeStrandFlag() ? "rev:" : "")
+ + rec.getReadName(),
rec.getReadString().toLowerCase());
seq.setStart(1);
seq.setEnd(rec.getReadLength());
(float) q.next() - ' ', "bamfile"));
p++;
}
- String newRead = parser.parseCigarToSequence(rec, insertions,
+ String newRead = parser.parseCigarToSequence(rec,
+ insertions[rec.getReadNegativeStrandFlag() ? 1 : 0],
alignmentStart, seq);
// make alignment sequences
// set start relative to soft clip; assume end is set by Sequence code
alsq.setStart(rec.getStart() - rec.getUnclippedStart() + 1);
- seqs.add(alsq);
+ if (rec.getReadNegativeStrandFlag())
+ {
+ rev.add(alsq);
+ }
+ else
+ {
+ fwd.add(alsq);
+ }
+ }
+ // Add forward
+ if (fwd.size() > 0)
+ {
+ seqs.add(refSeq); // FIXME needs to be optional, and properly padded
+ seqs.addAll(fwd);
+ fwd.clear();
+ }
+ // and reverse strand reads.
+ if (rev.size() > 0)
+ {
+ seqs.add(revRefSeq); // FIXME needs to be optional and properly padded
+ seqs.addAll(rev);
+ rev.clear();
}
}
}