From: kiramt Date: Tue, 20 Feb 2018 16:37:01 +0000 (+0000) Subject: JAL-2909 Insertions almost X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=932cc89cff4bb8fb8d5885d1c4bdc110f34486d9;p=jalview.git JAL-2909 Insertions almost --- diff --git a/lib/htsjdk-1.133.jar b/lib/htsjdk-1.133.jar deleted file mode 100644 index f084258..0000000 Binary files a/lib/htsjdk-1.133.jar and /dev/null differ diff --git a/lib/htsjdk-2.12.0.jar b/lib/htsjdk-2.12.0.jar new file mode 100644 index 0000000..c9b8ea3 Binary files /dev/null and b/lib/htsjdk-2.12.0.jar differ diff --git a/src/jalview/io/BamFile.java b/src/jalview/io/BamFile.java index c94dc3e..e9919ff 100644 --- a/src/jalview/io/BamFile.java +++ b/src/jalview/io/BamFile.java @@ -20,12 +20,15 @@ */ package jalview.io; +import jalview.datamodel.Sequence; import jalview.datamodel.SequenceI; import java.io.File; import java.io.IOException; import java.util.Arrays; import java.util.Iterator; +import java.util.SortedMap; +import java.util.TreeMap; import htsjdk.samtools.CigarElement; import htsjdk.samtools.SAMRecord; @@ -90,8 +93,11 @@ public class BamFile extends AlignFile @Override public void parse() throws IOException { - SequenceI seq = null; SAMRecordIterator it = fileReader.iterator(); + SortedMap insertions = getInsertions(it); + it.close(); + + it = fileReader.iterator(); while (it.hasNext()) { SAMRecord rec = it.next(); @@ -99,64 +105,125 @@ public class BamFile extends AlignFile int start = rec.getStart(); int end = rec.getEnd(); - Iterator cit = rec.getCigar().getCigarElements() - .iterator(); + SequenceI seq = new Sequence(rec.getReadName(), rec.getReadString()); - seq = parseId(rec.getReadName()); - String cigarredRead = parseCigarToSequence(read, cit, '-'); - seq.setSequence(cigarredRead); - seq.setStart(start); - seq.setEnd(end); - seqs.addElement(seq); + String cigarredRead = parseCigarToSequence(read, rec, '-'); + + SequenceI alsq = seq.deriveSequence(); + alsq.setSequence(cigarredRead); + alsq.setStart(start); + alsq.setEnd(end); + seqs.add(alsq); } + for (SequenceI seq : seqs) + { + int insertCount = 0; + SortedMap seqInserts = insertions.subMap(0, + seq.getEnd()); // TODO start point should be start of alignment + // not 0 + seqInserts.forEach(action); + while ((nextInsertion != -1) && (nextInsertion < seq.getEnd())) + { + seq.insertCharsAt(nextInsertion + insertCount, '-'); + // nextInsertion = insertions.nextSetBit(nextInsertion + 1); + insertCount++; + } + } } + /* + 1234567 + ABCDEFG insert 3,4 + + R1: insert 3 = gap before 3rd res + AB.CDEFG + + R2: insert 4 = gap before 4th res + ABC.DEFG + + R3: insert 3,4 = 2 gaps before 3rd res + AB..CDEFG + + => AB--C-DEFG + + So need to store insertions as (pos, length) giving here (3,1),(4,1),(3,2) - then can sub longest length at position so (3,2), (4,1) + Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added + + */ /** - * Apply the CIGAR string to a read sequence and return the updated read + * Apply the CIGAR string to a read sequence and return the updated read. * * @param read * the read to update - * @param it - * iterator over cigar elements + * @param rec + * the corresponding SAM record * @param gapChar * gap character to use * @return string representing read with gaps, clipping etc applied */ private String parseCigarToSequence(String read, - Iterator it, char gapChar) + SAMRecord rec, char gapChar) { + Iterator it = rec.getCigar().getCigarElements() + .iterator(); + StringBuilder newRead = new StringBuilder(); int next = 0; + int ap = 0; // position of query region qi.start; + + // pad with spaces before read + for (int alp = rec + .getReferencePositionAtReadPosition(next + 1); ap < alp;) + { + newRead.append(" "); + ap++; + } while (it.hasNext()) { CigarElement el = it.next(); int length = el.getLength(); + char[] gaps; + switch (el.getOperator()) { case M: // matched residues - newRead.append(read.substring(next, next + length - 1)); + newRead.append( + read.substring(next, next + length)); next += length; break; case N: // intron in RNA case D: // deletion // add gaps - char[] gaps = new char[length]; + gaps = new char[length]; Arrays.fill(gaps, gapChar); newRead.append(gaps); break; case S: // soft clipping - just skip this bit of the read // do nothing + + // work out how many gaps we need before the start of the soft clip - + // don't do this at the end of the read! + if (next == 0) // at start of read + { + int numgaps = rec.getUnclippedStart(); + gaps = new char[numgaps]; + Arrays.fill(gaps, ' '); + newRead.append(gaps); + } + + newRead.append( + read.substring(next, next + length).toLowerCase()); next += length; break; case I: - // add gaps to the reference sequence, which we know nothing about just - // now - // TODO - newRead.append(read.substring(next, next + length - 1)); + // the reference sequence and other reads should have been gapped for + // this insertion, so just add in the residues + newRead.append(read.substring(next, next + length)); + next += length; break; case H: // hard clipping - this stretch will not appear in the read @@ -170,4 +237,45 @@ public class BamFile extends AlignFile return newRead.toString(); } + /** + * Get list of positions inserted to the reference sequence + * + * @param it + * @return + */ + private SortedMap getInsertions(Iterator it) + { + SortedMap insertions = new TreeMap<>(); + while (it.hasNext()) + { + // check each record for insertions in the CIGAR string + SAMRecord rec = it.next(); + Iterator cit = rec.getCigar().getCigarElements() + .iterator(); + int next = 0; + while (cit.hasNext()) + { + CigarElement el = cit.next(); + switch (el.getOperator()) + { + case I: + // add to insertions list, and move along the read + int refLocation = rec.getReferencePositionAtReadPosition(next); + insertions.put(refLocation, el.getLength()); + next += el.getLength(); + break; + case M: + // match to reference, move along the read + next += el.getLength(); + break; + default: + // deletions, introns etc don't consume any residues from the read + break; + } + } + + } + return insertions; + } + }