From: kiramt Date: Mon, 26 Feb 2018 10:43:10 +0000 (+0000) Subject: JAL-2909 Bam file parsing with insertions, first pass X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=9f118a08c0f093fb30a51a2459e2609b3c959614;p=jalview.git JAL-2909 Bam file parsing with insertions, first pass --- diff --git a/src/jalview/datamodel/CigarParser.java b/src/jalview/datamodel/CigarParser.java new file mode 100644 index 0000000..ee0d523 --- /dev/null +++ b/src/jalview/datamodel/CigarParser.java @@ -0,0 +1,346 @@ +package jalview.datamodel; + +import java.util.Arrays; +import java.util.Iterator; +import java.util.Map; +import java.util.Map.Entry; +import java.util.SortedMap; +import java.util.TreeMap; + +import htsjdk.samtools.CigarElement; +import htsjdk.samtools.CigarOperator; +import htsjdk.samtools.SAMRecord; + +public class CigarParser +{ + private final char gapChar; + + public CigarParser(char gapCharacter) + { + gapChar = gapCharacter; + } + + /** + * Apply the CIGAR string to a read sequence and return the updated read. + * + * @param rec + * the SAM record for the read + * @param insertions + * a map of inserted positions and lengths + * @return string representing read with gaps, clipping etc applied + */ + public String parseCigarToSequence(SAMRecord rec, + SortedMap insertions) + { + Iterator it = rec.getCigar().getCigarElements() + .iterator(); + + StringBuilder newRead = new StringBuilder(); + int next = 0; + int refnext = 0; + + // pad with spaces before read + // first count gaps needed to pad to reference position + // NB position is 1-based so number of gaps = pos-1 + int gaps = rec.getUnclippedStart() - 1; + // now count gaps to pad for insertions in other reads + int insertCount = countInsertionsBeforeRead(rec, insertions); + addGaps(newRead, gaps + insertCount); + + int lastinserts = 0; + while (it.hasNext()) + { + CigarElement el = it.next(); + int length = el.getLength(); + + // which insertions coincide with current CIGAR operation? + SortedMap seqInserts = insertions.subMap( + rec.getStart() + refnext, rec.getStart() + refnext + length); + + int[] override = null; + if (lastinserts > 0) + { + // we just inserted something + // remove these inserts now - should be very first entry + int count = seqInserts.get(rec.getStart() + refnext); + override = new int[] { rec.getStart() + refnext, + count - lastinserts }; + lastinserts = 0; + } + + Iterator> iit = seqInserts.entrySet() + .iterator(); + + newRead.append(applyCigarOp(el, next, rec, iit, override)); + + if (el.getOperator().consumesReferenceBases()) + { + refnext += length; + } + if (el.getOperator().consumesReadBases()) + { + next += length; + } + if (el.getOperator().equals(CigarOperator.I)) + { + // count how many insertions we made + lastinserts = length; + } + } + return newRead.toString(); + } + + /** + * Apply a cigar operation to a segment of a read, accounting for any + * insertions to the reference from other reads in the region + * + * @param el + * the current CIGAR element + * @param next + * location in read to start applying CIGAR op + * @param length + * length of CIGAR op + * @param rec + * SAMRecord corresponding to read + * @param it + * iterator over insertions which coincide with rec + * @param override + * an optional which can override a + * in it to change the length. Set to null if not + * used. + * @return + */ + private String applyCigarOp(CigarElement el, int next, + SAMRecord rec, Iterator> it, + int[] override) + { + StringBuilder newRead = new StringBuilder(); + String read = rec.getReadString(); + + int length = el.getLength(); + int nextPos = next; + + switch (el.getOperator()) + { + case M: + case X: + case EQ: + // matched or mismatched residues + newRead = applyCigarConsumeReadAndRef(next, length, rec, it, + override); + break; + case N: // intron in RNA + case D: // deletion + int insertCount = 0; + while (it.hasNext()) + { + // just count all the insertions we need to add in the deletion/intron + // region + Entry entry = it.next(); + insertCount += entry.getValue(); + } + addGaps(newRead, length + insertCount); + break; + case S: + // soft clipping - just skip this bit of the read + // do nothing + + newRead.append( + read.substring(nextPos, nextPos + length).toLowerCase()); + // nextPos += length; + break; + case I: + // the reference sequence and other reads should have been gapped for + // this insertion, so just add in the residues + newRead.append(read.substring(nextPos, nextPos + length)); + // nextPos += length; + break; + case H: + // hard clipping - this stretch will not appear in the read + default: + break; + } + + return newRead.toString(); + } + + /** + * Applies a CIGAR operation which consumes both read and reference i.e. M,=,X + * + * @param next + * next position in read + * @param length + * length of cigar operation + * @param rec + * SAMRecord to apply the CIGAR op to + * @param it + * iterator over insertions in the CIGAR region + * @param override + * insertions which have already been accounted for + * @return + */ + private StringBuilder applyCigarConsumeReadAndRef(int next, int length, + SAMRecord rec, Iterator> it, + int[] override) + { + StringBuilder newRead = new StringBuilder(); + String read = rec.getReadString(); + + int nextPos = next; + while (nextPos < next + length) + { + int insertPos = next + length; + int insertLen = 0; + if (it.hasNext()) + { + // account for sequence already having insertion + // if an override entry exists it gives the number of positions still to + // be inserted at this point, after accounting for insertions driven by + // the current read. + // e.g. if the current read has 2 inserted positions here, and the + // insertions list also has 2 inserted positions, the override will + // cancel the insert for this read. If the insertions list had 3 + // insertions, the override would reduce this to 1 insertion. + Entry entry = it.next(); + int key = entry.getKey(); + insertLen = entry.getValue(); + if (override != null && key == override[0]) + { + insertLen = override[1]; + } + if (insertLen > 0) + { + insertPos = rec.getReadPositionAtReferencePosition(key) - 1; + } + } + newRead.append(read.substring(nextPos, insertPos)); + nextPos = insertPos; + addGaps(newRead, insertLen); // add insertions + } + return newRead; + } + + /** + * Get list of positions inserted to the reference sequence. Note: assumes all + * reads are on the same chromosome. + * + * @param it + * Iterator over all the SAMRecords + * @return sorted map of insertion positions + */ + /* + 1234567 + ABCDEFG insert 3,4 + + R1: insert 3 = gap before 3rd res + AB.CDEFG + + R2: insert 4 = gap before 4th res + ABC.DEFG + + R3: insert 3,4 = 2 gaps before 3rd res + AB..CDEFG + + => AB--C-DEFG + + So need to store insertions as (pos, length) giving here (3,1),(4,1),(3,2) - then can sub longest length at position so (3,2), (4,1) + Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added + + */ + public SortedMap getInsertions(Iterator it) + { + SortedMap insertions = new TreeMap<>(); + while (it.hasNext()) + { + // check each record for insertions in the CIGAR string + SAMRecord rec = it.next(); + Iterator cit = rec.getCigar().getCigarElements() + .iterator(); + int next = 0; + while (cit.hasNext()) + { + CigarElement el = cit.next(); + switch (el.getOperator()) + { + case I: + // add to insertions list, and move along the read + // location is 1 past the current position of next + // (note if we try to retrieve +1 position from reference by calling + // getReferencePositionAtReadPosition(next+1) we get 0 because it's an + // insertion!) + int refLocation = rec.getReferencePositionAtReadPosition(next) + + 1; + + // if there's already an insertion at this location, keep the longest + // insertion; if there's no insertion keep this one + if (!insertions.containsKey(refLocation) + || (insertions.containsKey(refLocation) + && insertions.get(refLocation) < el.getLength())) + { + insertions.put(refLocation, el.getLength()); + } + next += el.getLength(); + break; + case M: + case S: + // match to reference, or soft clip, move along the read + next += el.getLength(); + break; + default: + // deletions, introns etc don't consume any residues from the read + break; + } + } + + } + return insertions; + } + + /** + * Add sequence of gaps to a read + * + * @param read + * read to update + * @param length + * number of gaps + */ + private void addGaps(StringBuilder read, int length) + { + if (length > 0) + { + char[] gaps = new char[length]; + Arrays.fill(gaps, gapChar); + read.append(gaps); + } + } + + /** + * Get the number of insertions before the start of an aligned read (i.e. + * insertions in soft clipped region are counted too) + * + * @param rec + * the SAMRecord for the read + * @param insertions + * the map of insertions + * @return number of insertions before the read starts + */ + private int countInsertionsBeforeRead(SAMRecord rec, + SortedMap insertions) + { + int gaps = 0; + + // add in any insertion gaps before read + // TODO start point should be start of alignment not 0 + SortedMap seqInserts = insertions.subMap(0, + rec.getStart()); + + Iterator> it = seqInserts.entrySet() + .iterator(); + while (it.hasNext()) + { + Entry entry = it.next(); + gaps += entry.getValue(); + } + return gaps; + } +} diff --git a/src/jalview/io/BamFile.java b/src/jalview/io/BamFile.java index be5e3ee..0569692 100644 --- a/src/jalview/io/BamFile.java +++ b/src/jalview/io/BamFile.java @@ -20,19 +20,14 @@ */ package jalview.io; +import jalview.datamodel.CigarParser; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceI; import java.io.File; import java.io.IOException; -import java.util.Arrays; -import java.util.Iterator; -import java.util.Map; -import java.util.Map.Entry; import java.util.SortedMap; -import java.util.TreeMap; -import htsjdk.samtools.CigarElement; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecordIterator; import htsjdk.samtools.SamReader; @@ -95,22 +90,21 @@ public class BamFile extends AlignFile @Override public void parse() throws IOException { + CigarParser parser = new CigarParser('-'); SAMRecordIterator it = fileReader.iterator(); - SortedMap insertions = getInsertions(it); + SortedMap insertions = parser.getInsertions(it); it.close(); it = fileReader.iterator(); while (it.hasNext()) { SAMRecord rec = it.next(); - String read = rec.getReadString(); int start = rec.getStart(); int end = rec.getEnd(); SequenceI seq = new Sequence(rec.getReadName(), rec.getReadString()); - String cigarredRead = parseCigarToSequence(read, rec, insertions, - '-'); + String cigarredRead = parser.parseCigarToSequence(rec, insertions); SequenceI alsq = seq.deriveSequence(); alsq.setSequence(cigarredRead); @@ -118,277 +112,9 @@ public class BamFile extends AlignFile alsq.setEnd(end); seqs.add(alsq); } - - // put insertions back into each sequence - for (SequenceI seq : seqs) - { - int insertCount = 0; - SortedMap seqInserts = insertions - .subMap(seq.getStart(), - seq.getEnd()); // TODO start point should be start of alignment - // not 0 - - Iterator> iit = seqInserts.entrySet() - .iterator(); - while (iit.hasNext()) - { - // TODO account for sequence already having insertion, and INDELS - Entry entry = iit.next(); - int pos = entry.getKey(); - int length = entry.getValue(); - seq.insertCharAt(pos + insertCount, length, '-'); - insertCount++; - } - } - } - - /* - 1234567 - ABCDEFG insert 3,4 - - R1: insert 3 = gap before 3rd res - AB.CDEFG - - R2: insert 4 = gap before 4th res - ABC.DEFG - - R3: insert 3,4 = 2 gaps before 3rd res - AB..CDEFG - - => AB--C-DEFG - - So need to store insertions as (pos, length) giving here (3,1),(4,1),(3,2) - then can sub longest length at position so (3,2), (4,1) - Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added - - */ - /** - * Apply the CIGAR string to a read sequence and return the updated read. - * - * @param read - * the read to update - * @param rec - * the corresponding SAM record - * @param gapChar - * gap character to use - * @return string representing read with gaps, clipping etc applied - */ - private String parseCigarToSequence(String read, - SAMRecord rec, SortedMap insertions, - char gapChar) - { - Iterator it = rec.getCigar().getCigarElements() - .iterator(); - - StringBuilder newRead = new StringBuilder(); - int next = 0; - - // pad with spaces before read - // first count gaps needed to pad to reference position - int gaps = rec.getReferencePositionAtReadPosition(next + 1); - // now count gaps to pad for insertions in other reads - int insertCount = countInsertionsBeforeRead(rec, insertions); - addGaps(newRead, gaps + insertCount, gapChar); - - SortedMap seqInserts = insertions - .subMap(rec.getStart(), rec.getEnd() + 1); - // TODO start point should be start of alignment not 0 - - Iterator> iit = seqInserts.entrySet() - .iterator(); - if (iit.hasNext()) - { - // TODO account for sequence already having insertion, and INDELS - Entry entry = iit.next(); - int insertPos = entry.getKey(); - int insertLen = entry.getValue(); - // seq.insertCharAt(pos + insertCount, length, '-'); - // insertCount++; - } - - while (it.hasNext()) - { - CigarElement el = it.next(); - int length = el.getLength(); - - - switch (el.getOperator()) - { - case M: - case X: - case EQ: - // matched or mismatched residues - newRead.append(read.substring(next, next + length)); - next += length; - break; - case N: // intron in RNA - case D: // deletion - addGaps(newRead, length, gapChar); - break; - case S: - // soft clipping - just skip this bit of the read - // do nothing - - // work out how many gaps we need before the start of the soft clip - - // don't do this at the end of the read! - if (next == 0) // at start of read - { - addGaps(newRead, rec.getUnclippedStart(), gapChar); - } - - newRead.append(read.substring(next, next + length).toLowerCase()); - next += length; - break; - case I: - // the reference sequence and other reads should have been gapped for - // this insertion, so just add in the residues - newRead.append(read.substring(next, next + length)); - next += length; - break; - case H: - // hard clipping - this stretch will not appear in the read - break; - default: - break; - } - - - } - return newRead.toString(); } - private String applyCigar(String read, CigarElement el, char gapChar, - int next, int length, int softStart) - { - StringBuilder newRead = new StringBuilder(); - switch (el.getOperator()) - { - case M: - case X: - case EQ: - // matched or mismatched residues - newRead.append(read.substring(next, next + length)); - next += length; - break; - case N: // intron in RNA - case D: // deletion - addGaps(newRead, length, gapChar); - break; - case S: - // soft clipping - just skip this bit of the read - // do nothing - - // work out how many gaps we need before the start of the soft clip - - // don't do this at the end of the read! - if (next == 0) // at start of read - { - addGaps(newRead, softStart, gapChar); - } - - newRead.append(read.substring(next, next + length).toLowerCase()); - next += length; - break; - case I: - // the reference sequence and other reads should have been gapped for - // this insertion, so just add in the residues - newRead.append(read.substring(next, next + length)); - next += length; - break; - case H: - // hard clipping - this stretch will not appear in the read - break; - default: - break; - } - return newRead.toString(); - } - - /** - * Get list of positions inserted to the reference sequence - * - * @param it - * @return - */ - private SortedMap getInsertions(Iterator it) - { - SortedMap insertions = new TreeMap<>(); - while (it.hasNext()) - { - // check each record for insertions in the CIGAR string - SAMRecord rec = it.next(); - Iterator cit = rec.getCigar().getCigarElements() - .iterator(); - int next = 0; - while (cit.hasNext()) - { - CigarElement el = cit.next(); - switch (el.getOperator()) - { - case I: - // add to insertions list, and move along the read - int refLocation = rec.getReferencePositionAtReadPosition(next); - // if there's already an insertion at this location, keep the longest - // insertion; if there's no insertion keep this one - if (!insertions.containsKey(refLocation) - || (insertions.containsKey(refLocation) - && insertions.get(refLocation) < el.getLength())) - { - insertions.put(refLocation, el.getLength()); - } - next += el.getLength(); - break; - case M: - // match to reference, move along the read - next += el.getLength(); - break; - default: - // deletions, introns etc don't consume any residues from the read - break; - } - } - } - return insertions; - } - - /** - * Add sequence of gaps to a read - * - * @param read - * read to update - * @param length - * number of gaps - * @param gapChar - * gap character to use - */ - private void addGaps(StringBuilder read, int length, char gapChar) - { - if (length > 0) - { - char[] gaps = new char[length]; - Arrays.fill(gaps, gapChar); - read.append(gaps); - } - } - - private int countInsertionsBeforeRead(SAMRecord rec, - SortedMap insertions) - { - int gaps = 0; - - // add in any insertion gaps before read - // TODO start point should be start of alignment not 0 - SortedMap seqInserts = insertions.subMap(0, - rec.getStart()); - - Iterator> it = seqInserts.entrySet() - .iterator(); - while (it.hasNext()) - { - Entry entry = it.next(); - gaps += entry.getValue(); - } - return gaps; - } }