X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fdatamodel%2FCigarParser.java;h=a42c2b1eea4be5b467356391cf30f59b04966bd7;hb=429bbdae3d5229ec1b92ec59f4b5a25bcfcf13f8;hp=95f1f2d289a29b35f1b0b1efacd2242071413a28;hpb=0b0b3d3687479204da2d199042c7bc90f3a6fd43;p=jalview.git diff --git a/src/jalview/datamodel/CigarParser.java b/src/jalview/datamodel/CigarParser.java index 95f1f2d..a42c2b1 100644 --- a/src/jalview/datamodel/CigarParser.java +++ b/src/jalview/datamodel/CigarParser.java @@ -27,10 +27,16 @@ public class CigarParser * the SAM record for the read * @param insertions * a map of inserted positions and lengths + * @param alignmentStart + * start of alignment to be used as offset when padding reads with + * gaps + * @param seq + * sequence to be annotated with inserts * @return string representing read with gaps, clipping etc applied */ public String parseCigarToSequence(SAMRecord rec, - SortedMap insertions) + SortedMap insertions, int alignmentStart, + SequenceI seq) { StringBuilder newRead = new StringBuilder(); Iterator it = rec.getCigar().getCigarElements() @@ -42,11 +48,11 @@ public class CigarParser // pad with spaces before read // first count gaps needed to pad to reference position // NB position is 1-based so number of gaps = pos-1 - int gaps = rec.getStart() - 1; // rec.getUnclippedStart() - 1; + int gaps = rec.getStart() - alignmentStart; + // now count gaps to pad for insertions in other reads int insertCount = countInsertionsBeforeRead(rec, insertions); addGaps(newRead, gaps + insertCount); - // addGaps(newTrimmedRead, gaps + insertCount); int lastinserts = 0; while (it.hasNext()) @@ -65,11 +71,12 @@ public class CigarParser { // ERROR int pos = rec.getStart() + refnext; - System.out.println("Insertion not added to seqInserts: " + pos); + System.out.println( + "Read " + rec.getReadName() + " has an insertion at " + + pos + " which is not in seqInserts"); } else { - // we just inserted something // remove these inserts now - should be very first entry int count = seqInserts.get(rec.getStart() + refnext); @@ -82,7 +89,7 @@ public class CigarParser Iterator> iit = seqInserts.entrySet() .iterator(); - String nextSegment = applyCigarOp(el, next, rec, iit, override); + String nextSegment = applyCigarOp(el, next, rec, iit, override, seq); newRead.append(nextSegment); if (el.getOperator().consumesReferenceBases()) @@ -120,11 +127,12 @@ public class CigarParser * an optional which can override a * in it to change the length. Set to null if not * used. + * @param seq * @return */ private String applyCigarOp(CigarElement el, int next, SAMRecord rec, Iterator> it, - int[] override) + int[] override, SequenceI seq) { StringBuilder newRead = new StringBuilder(); String read = rec.getReadString(); @@ -139,7 +147,7 @@ public class CigarParser case EQ: // matched or mismatched residues newRead = applyCigarConsumeReadAndRef(next, length, rec, it, - override); + override, seq); break; case N: // intron in RNA case D: // deletion @@ -157,11 +165,19 @@ public class CigarParser // the reference sequence and other reads should have been gapped for // this insertion, so just add in the residues newRead.append(read.substring(nextPos, nextPos + length)); + seq.addSequenceFeature(new SequenceFeature("INSERTION", "", + nextPos + 1, nextPos + length, "bamfile")); break; case S: // soft clipping - just skip this bit of the read + seq.addSequenceFeature(new SequenceFeature("SOFTCLIP", "", + nextPos + 1, nextPos + length, "bamfile")); + break; case H: // hard clipping - this stretch will not appear in the read + seq.addSequenceFeature(new SequenceFeature("HARDCLIP", "", + nextPos + 1, nextPos + length, "bamfile")); + break; default: break; } @@ -182,20 +198,22 @@ public class CigarParser * iterator over insertions in the CIGAR region * @param override * insertions which have already been accounted for + * @param seq * @return */ private StringBuilder applyCigarConsumeReadAndRef(int next, int length, SAMRecord rec, Iterator> it, - int[] override) + int[] override, SequenceI seq) { StringBuilder newRead = new StringBuilder(); String read = rec.getReadString(); - int nextPos = next; + int nextPos = next; // location of next position in read while (nextPos < next + length) { - int insertPos = next + length; - int insertLen = 0; + int insertPos = next + length; // location of next insert in read (or end + // of read) + int insertLen = 0; // no of gaps to insert after insertPos if (it.hasNext()) { // account for sequence already having insertion @@ -217,6 +235,11 @@ public class CigarParser { insertPos = rec.getReadPositionAtReferencePosition(key) - 1; } + else + { + insertPos = nextPos; // bugfix - trigger by override + insertions in + // same M stretch + } } newRead.append(read.substring(nextPos, insertPos)); nextPos = insertPos; @@ -252,53 +275,116 @@ public class CigarParser Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added */ - public SortedMap getInsertions(Iterator it) + public SortedMap[] getInsertions(Iterator it) { - SortedMap inserts = new TreeMap<>(); - while (it.hasNext()) + return getInsertions(it, null); + } + + public int firstAlColumn = -1, firstRposCol = -1; + + public SortedMap[] getInsertions(Iterator it, + Range[] extent) + { + SortedMap sinserts[] = new SortedMap[] { + new TreeMap<>(), new TreeMap<>() }; + Range xten = extent != null && extent[0] != null ? extent[0] : null; + do { - // check each record for insertions in the CIGAR string + // check extent of read SAMRecord rec = it.next(); + if (extent != null) + { + + int nstart = rec.getAlignmentStart(); + if (nstart != 0) + { + nstart = rec.getReferencePositionAtReadPosition(nstart); + } + int nend = rec.getAlignmentEnd(); + if (nend != 0) + { + nend = rec.getReferencePositionAtReadPosition(nend); + } + + xten = new Range( + nstart <= 0 ? xten.start : Math.min(nstart, xten.start), + nend == 0 ? xten.end : Math.max(nend, xten.end)); + } + + // check each record for insertions in the CIGAR string + + // pick the insert map to update + int insmap = rec.getReadNegativeStrandFlag() ? 1 : 0; + SortedMap inserts = sinserts[insmap]; + Iterator cit = rec.getCigar().getCigarElements() .iterator(); - int next = 0; + int refLocation = rec.getAlignmentStart(); + int rloc = 0; + int alcol = 0; + int alpos = 0; while (cit.hasNext()) { CigarElement el = cit.next(); switch (el.getOperator()) { - case I: - // add to insertions list, and move along the read - // location is 1 past the current position of next - // (note if we try to retrieve +1 position from reference by calling - // getReferencePositionAtReadPosition(next+1) we get 0 because it's an - // insertion!) - int refLocation = rec.getReferencePositionAtReadPosition(next) - + 1; - - // if there's already an insertion at this location, keep the longest - // insertion; if there's no insertion keep this one - if (!inserts.containsKey(refLocation) - || (inserts.containsKey(refLocation) - && inserts.get(refLocation) < el.getLength())) - { - inserts.put(refLocation, el.getLength()); - } - next += el.getLength(); - break; - case M: case S: - // match to reference, or soft clip, move along the read - next += el.getLength(); + case H: + // ignore soft/hardclipped + break; default: - // deletions, introns etc don't consume any residues from the read - break; + if (el.getOperator().consumesReadBases() + && !el.getOperator().consumesReferenceBases()) + { + // add to insertions list, and move along the read + // location is the start of next CIGAR segment + // because getReferencePositionAtReadPosition returns 0 for read + // positions which are not in the reference (like insertions) + + // if there's already an insertion at this location, keep the + // longest + // insertion; if there's no insertion keep this one + if (!inserts.containsKey(refLocation) + || (inserts.containsKey(refLocation) + && inserts.get(refLocation) < el.getLength())) + { + inserts.put(refLocation, el.getLength()); + } + break; + } + if (el.getOperator().consumesReferenceBases()) + { + if (el.getOperator().consumesReadBases() && alcol == 0 + && refLocation + 1 > extent[0].start) + { + // first match position : last rloc + first match op + alcol = rloc + 1; + alpos = rec.getReferencePositionAtReadPosition(rloc + 1); + } + refLocation += el.getLength(); + } + + } + if (el.getOperator().consumesReadBases() + || el.getOperator().consumesReferenceBases()) + { + rloc += el.getLength(); } } - + // Wrong: hack that fails to relocate reference to correct place in + // sequence + if (firstAlColumn < 0) + { + firstAlColumn = alcol; + firstRposCol = alpos; + } + } while (it.hasNext()); + if (xten != null) + { + extent[0] = xten; } - return inserts; + return sinserts; } /** @@ -335,7 +421,8 @@ public class CigarParser int gaps = 0; // add in any insertion gaps before read - // TODO start point should be start of alignment not 0 + // although we only need to get the submap from alignmentStart, there won't + // be any insertions before that so we can just start at 0 SortedMap seqInserts = inserts.subMap(0, rec.getStart());