From 69b00177c988226aace46a0aae533a9904da2535 Mon Sep 17 00:00:00 2001 From: kiramt Date: Wed, 21 Feb 2018 14:15:38 +0000 Subject: [PATCH] JAL-2909 Partial insertions --- src/jalview/io/BamFile.java | 169 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 141 insertions(+), 28 deletions(-) diff --git a/src/jalview/io/BamFile.java b/src/jalview/io/BamFile.java index e9919ff..be5e3ee 100644 --- a/src/jalview/io/BamFile.java +++ b/src/jalview/io/BamFile.java @@ -27,6 +27,8 @@ import java.io.File; import java.io.IOException; import java.util.Arrays; import java.util.Iterator; +import java.util.Map; +import java.util.Map.Entry; import java.util.SortedMap; import java.util.TreeMap; @@ -107,7 +109,8 @@ public class BamFile extends AlignFile SequenceI seq = new Sequence(rec.getReadName(), rec.getReadString()); - String cigarredRead = parseCigarToSequence(read, rec, '-'); + String cigarredRead = parseCigarToSequence(read, rec, insertions, + '-'); SequenceI alsq = seq.deriveSequence(); alsq.setSequence(cigarredRead); @@ -116,17 +119,24 @@ public class BamFile extends AlignFile seqs.add(alsq); } + // put insertions back into each sequence for (SequenceI seq : seqs) { int insertCount = 0; - SortedMap seqInserts = insertions.subMap(0, + SortedMap seqInserts = insertions + .subMap(seq.getStart(), seq.getEnd()); // TODO start point should be start of alignment // not 0 - seqInserts.forEach(action); - while ((nextInsertion != -1) && (nextInsertion < seq.getEnd())) + + Iterator> iit = seqInserts.entrySet() + .iterator(); + while (iit.hasNext()) { - seq.insertCharsAt(nextInsertion + insertCount, '-'); - // nextInsertion = insertions.nextSetBit(nextInsertion + 1); + // TODO account for sequence already having insertion, and INDELS + Entry entry = iit.next(); + int pos = entry.getKey(); + int length = entry.getValue(); + seq.insertCharAt(pos + insertCount, length, '-'); insertCount++; } } @@ -163,43 +173,56 @@ public class BamFile extends AlignFile * @return string representing read with gaps, clipping etc applied */ private String parseCigarToSequence(String read, - SAMRecord rec, char gapChar) + SAMRecord rec, SortedMap insertions, + char gapChar) { Iterator it = rec.getCigar().getCigarElements() .iterator(); StringBuilder newRead = new StringBuilder(); int next = 0; - int ap = 0; // position of query region qi.start; // pad with spaces before read - for (int alp = rec - .getReferencePositionAtReadPosition(next + 1); ap < alp;) + // first count gaps needed to pad to reference position + int gaps = rec.getReferencePositionAtReadPosition(next + 1); + // now count gaps to pad for insertions in other reads + int insertCount = countInsertionsBeforeRead(rec, insertions); + addGaps(newRead, gaps + insertCount, gapChar); + + SortedMap seqInserts = insertions + .subMap(rec.getStart(), rec.getEnd() + 1); + // TODO start point should be start of alignment not 0 + + Iterator> iit = seqInserts.entrySet() + .iterator(); + if (iit.hasNext()) { - newRead.append(" "); - ap++; + // TODO account for sequence already having insertion, and INDELS + Entry entry = iit.next(); + int insertPos = entry.getKey(); + int insertLen = entry.getValue(); + // seq.insertCharAt(pos + insertCount, length, '-'); + // insertCount++; } while (it.hasNext()) { CigarElement el = it.next(); int length = el.getLength(); - char[] gaps; + switch (el.getOperator()) { case M: - // matched residues - newRead.append( - read.substring(next, next + length)); + case X: + case EQ: + // matched or mismatched residues + newRead.append(read.substring(next, next + length)); next += length; break; case N: // intron in RNA case D: // deletion - // add gaps - gaps = new char[length]; - Arrays.fill(gaps, gapChar); - newRead.append(gaps); + addGaps(newRead, length, gapChar); break; case S: // soft clipping - just skip this bit of the read @@ -209,14 +232,10 @@ public class BamFile extends AlignFile // don't do this at the end of the read! if (next == 0) // at start of read { - int numgaps = rec.getUnclippedStart(); - gaps = new char[numgaps]; - Arrays.fill(gaps, ' '); - newRead.append(gaps); + addGaps(newRead, rec.getUnclippedStart(), gapChar); } - newRead.append( - read.substring(next, next + length).toLowerCase()); + newRead.append(read.substring(next, next + length).toLowerCase()); next += length; break; case I: @@ -229,10 +248,56 @@ public class BamFile extends AlignFile // hard clipping - this stretch will not appear in the read break; default: - // P, X EQ don't know what to do with these break; } + + } + return newRead.toString(); + } + + private String applyCigar(String read, CigarElement el, char gapChar, + int next, int length, int softStart) + { + StringBuilder newRead = new StringBuilder(); + switch (el.getOperator()) + { + case M: + case X: + case EQ: + // matched or mismatched residues + newRead.append(read.substring(next, next + length)); + next += length; + break; + case N: // intron in RNA + case D: // deletion + addGaps(newRead, length, gapChar); + break; + case S: + // soft clipping - just skip this bit of the read + // do nothing + + // work out how many gaps we need before the start of the soft clip - + // don't do this at the end of the read! + if (next == 0) // at start of read + { + addGaps(newRead, softStart, gapChar); + } + + newRead.append(read.substring(next, next + length).toLowerCase()); + next += length; + break; + case I: + // the reference sequence and other reads should have been gapped for + // this insertion, so just add in the residues + newRead.append(read.substring(next, next + length)); + next += length; + break; + case H: + // hard clipping - this stretch will not appear in the read + break; + default: + break; } return newRead.toString(); } @@ -261,7 +326,15 @@ public class BamFile extends AlignFile case I: // add to insertions list, and move along the read int refLocation = rec.getReferencePositionAtReadPosition(next); - insertions.put(refLocation, el.getLength()); + + // if there's already an insertion at this location, keep the longest + // insertion; if there's no insertion keep this one + if (!insertions.containsKey(refLocation) + || (insertions.containsKey(refLocation) + && insertions.get(refLocation) < el.getLength())) + { + insertions.put(refLocation, el.getLength()); + } next += el.getLength(); break; case M: @@ -278,4 +351,44 @@ public class BamFile extends AlignFile return insertions; } + /** + * Add sequence of gaps to a read + * + * @param read + * read to update + * @param length + * number of gaps + * @param gapChar + * gap character to use + */ + private void addGaps(StringBuilder read, int length, char gapChar) + { + if (length > 0) + { + char[] gaps = new char[length]; + Arrays.fill(gaps, gapChar); + read.append(gaps); + } + } + + private int countInsertionsBeforeRead(SAMRecord rec, + SortedMap insertions) + { + int gaps = 0; + + // add in any insertion gaps before read + // TODO start point should be start of alignment not 0 + SortedMap seqInserts = insertions.subMap(0, + rec.getStart()); + + Iterator> it = seqInserts.entrySet() + .iterator(); + while (it.hasNext()) + { + Entry entry = it.next(); + gaps += entry.getValue(); + } + return gaps; + } + } -- 1.7.10.2