From 429bbdae3d5229ec1b92ec59f4b5a25bcfcf13f8 Mon Sep 17 00:00:00 2001 From: Jim Procter Date: Tue, 20 Mar 2018 16:34:10 +0000 Subject: [PATCH] =?utf8?q?JAL-2909=20separate=20reverse=20and=20forward=20st?= =?utf8?q?rand=20reads.=20fix=20subtle=20bug=20in=20handling=20inserts.=20gr?= =?utf8?q?oss=20hack=20&=20WIP=20to=20try=20to=20get=20reference=20sequences?= =?utf8?q?=20added=20to=20alignment=20(doesn=E2=80=99t=20work=20-=20see=20FI?= =?utf8?q?XME)?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit --- src/jalview/datamodel/CigarParser.java | 124 ++++++++++++++++++++++++-------- src/jalview/io/BamFile.java | 90 +++++++++++++++++++++-- 2 files changed, 180 insertions(+), 34 deletions(-) diff --git a/src/jalview/datamodel/CigarParser.java b/src/jalview/datamodel/CigarParser.java index a1124d4..a42c2b1 100644 --- a/src/jalview/datamodel/CigarParser.java +++ b/src/jalview/datamodel/CigarParser.java @@ -275,52 +275,116 @@ public class CigarParser Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added */ - public SortedMap getInsertions(Iterator it) + public SortedMap[] getInsertions(Iterator it) { - SortedMap inserts = new TreeMap<>(); - while (it.hasNext()) + return getInsertions(it, null); + } + + public int firstAlColumn = -1, firstRposCol = -1; + + public SortedMap[] getInsertions(Iterator it, + Range[] extent) + { + SortedMap sinserts[] = new SortedMap[] { + new TreeMap<>(), new TreeMap<>() }; + Range xten = extent != null && extent[0] != null ? extent[0] : null; + do { - // check each record for insertions in the CIGAR string + // check extent of read SAMRecord rec = it.next(); + if (extent != null) + { + + int nstart = rec.getAlignmentStart(); + if (nstart != 0) + { + nstart = rec.getReferencePositionAtReadPosition(nstart); + } + int nend = rec.getAlignmentEnd(); + if (nend != 0) + { + nend = rec.getReferencePositionAtReadPosition(nend); + } + + xten = new Range( + nstart <= 0 ? xten.start : Math.min(nstart, xten.start), + nend == 0 ? xten.end : Math.max(nend, xten.end)); + } + + // check each record for insertions in the CIGAR string + + // pick the insert map to update + int insmap = rec.getReadNegativeStrandFlag() ? 1 : 0; + SortedMap inserts = sinserts[insmap]; + Iterator cit = rec.getCigar().getCigarElements() .iterator(); - int next = 1; + int refLocation = rec.getAlignmentStart(); + int rloc = 0; + int alcol = 0; + int alpos = 0; while (cit.hasNext()) { CigarElement el = cit.next(); switch (el.getOperator()) { - case I: - // add to insertions list, and move along the read - // location is the start of next CIGAR segment - // because getReferencePositionAtReadPosition returns 0 for read - // positions which are not in the reference (like insertions) - int refLocation = rec.getReferencePositionAtReadPosition( - next + el.getLength()); - - // if there's already an insertion at this location, keep the longest - // insertion; if there's no insertion keep this one - if (!inserts.containsKey(refLocation) - || (inserts.containsKey(refLocation) - && inserts.get(refLocation) < el.getLength())) - { - inserts.put(refLocation, el.getLength()); - } - next += el.getLength(); - break; - case M: case S: - // match to reference, or soft clip, move along the read - next += el.getLength(); + case H: + // ignore soft/hardclipped + break; default: - // deletions, introns etc don't consume any residues from the read - break; + if (el.getOperator().consumesReadBases() + && !el.getOperator().consumesReferenceBases()) + { + // add to insertions list, and move along the read + // location is the start of next CIGAR segment + // because getReferencePositionAtReadPosition returns 0 for read + // positions which are not in the reference (like insertions) + + // if there's already an insertion at this location, keep the + // longest + // insertion; if there's no insertion keep this one + if (!inserts.containsKey(refLocation) + || (inserts.containsKey(refLocation) + && inserts.get(refLocation) < el.getLength())) + { + inserts.put(refLocation, el.getLength()); + } + break; + } + if (el.getOperator().consumesReferenceBases()) + { + if (el.getOperator().consumesReadBases() && alcol == 0 + && refLocation + 1 > extent[0].start) + { + // first match position : last rloc + first match op + alcol = rloc + 1; + alpos = rec.getReferencePositionAtReadPosition(rloc + 1); + } + refLocation += el.getLength(); + } + + } + if (el.getOperator().consumesReadBases() + || el.getOperator().consumesReferenceBases()) + { + rloc += el.getLength(); } } - + // Wrong: hack that fails to relocate reference to correct place in + // sequence + if (firstAlColumn < 0) + { + firstAlColumn = alcol; + firstRposCol = alpos; + } + } while (it.hasNext()); + if (xten != null) + { + extent[0] = xten; } - return inserts; + return sinserts; } /** diff --git a/src/jalview/io/BamFile.java b/src/jalview/io/BamFile.java index 8a04867..8d55c53 100644 --- a/src/jalview/io/BamFile.java +++ b/src/jalview/io/BamFile.java @@ -125,6 +125,44 @@ public class BamFile extends AlignFile return null; } + private StringBuilder insertRefSeq( + SortedMap insertions, Range xtent) + { + StringBuilder refseq = new StringBuilder(); + int inserted = 0; + for (int p = xtent.start; p < xtent.end; p++) + { + refseq.append('N'); + } + for (Map.Entry insert : insertions.entrySet()) + { + int inspos = insert.getKey() - xtent.start + inserted; + if (inspos < 0) + { + System.out.println("Ignoring -ve insert position " + insert.getKey() + + " of " + insert.getValue() + + " (alpos: " + inspos + ")"); + inspos = 0; + // continue; + } + for (int i = 0, j = insert.getValue(); i < j; i++) + { + inserted++; + refseq.insert(inspos, '-'); + } + } + return refseq; + } + + private void padRef(SequenceI ref, CigarParser cig) + { // pad to column + int padding = cig.firstAlColumn - ref.findIndex(cig.firstRposCol); + System.out.println("Padding " + padding + " to move refseq position " + + cig.firstRposCol + " to " + cig.firstAlColumn + "col."); + + ref.insertCharAt(0, padding, '-'); + } + @Override public void parse() { @@ -134,10 +172,30 @@ public class BamFile extends AlignFile SAMRecordIterator it = fileReader.query(chromosome, start, end, false); CigarParser parser = new CigarParser('-'); - SortedMap insertions = parser.getInsertions(it); + Range[] xtent = new Range[] { new Range(start, end) }; + SortedMap insertions[] = parser.getInsertions(it, + xtent); it.close(); + SequenceI refSeq = new Sequence("chr:" + chromosome, + insertRefSeq(insertions[0], xtent[0]) + .toString()); + + refSeq.setStart(xtent[0].start); + refSeq.setEnd(xtent[0].end); + SequenceI revRefSeq = new Sequence("rev:chr:" + chromosome, + insertRefSeq(insertions[1], xtent[0]) + .toString()); + revRefSeq.setStart(xtent[0].start); + revRefSeq.setEnd(xtent[0].end); + + // Hack to move the seqs along + padRef(refSeq, parser); + padRef(revRefSeq, parser); + it = fileReader.query(chromosome, start, end, false); + + ArrayList fwd = new ArrayList(), rev = new ArrayList(); while (it.hasNext()) { SAMRecord rec = it.next(); @@ -150,7 +208,9 @@ public class BamFile extends AlignFile } // make dataset sequence: start at 1, end at read length - SequenceI seq = new Sequence(rec.getReadName(), + SequenceI seq = new Sequence( + "" + (rec.getReadNegativeStrandFlag() ? "rev:" : "") + + rec.getReadName(), rec.getReadString().toLowerCase()); seq.setStart(1); seq.setEnd(rec.getReadLength()); @@ -163,7 +223,8 @@ public class BamFile extends AlignFile (float) q.next() - ' ', "bamfile")); p++; } - String newRead = parser.parseCigarToSequence(rec, insertions, + String newRead = parser.parseCigarToSequence(rec, + insertions[rec.getReadNegativeStrandFlag() ? 1 : 0], alignmentStart, seq); // make alignment sequences @@ -172,7 +233,28 @@ public class BamFile extends AlignFile // set start relative to soft clip; assume end is set by Sequence code alsq.setStart(rec.getStart() - rec.getUnclippedStart() + 1); - seqs.add(alsq); + if (rec.getReadNegativeStrandFlag()) + { + rev.add(alsq); + } + else + { + fwd.add(alsq); + } + } + // Add forward + if (fwd.size() > 0) + { + seqs.add(refSeq); // FIXME needs to be optional, and properly padded + seqs.addAll(fwd); + fwd.clear(); + } + // and reverse strand reads. + if (rev.size() > 0) + { + seqs.add(revRefSeq); // FIXME needs to be optional and properly padded + seqs.addAll(rev); + rev.clear(); } } } -- 1.7.10.2