X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2FBamFile.java;h=26c14a2f3683c57ac7b2a297db6dabd625580075;hb=0b0b3d3687479204da2d199042c7bc90f3a6fd43;hp=e9919ff11d0098b5c6160953a5e14d57f6b2a642;hpb=932cc89cff4bb8fb8d5885d1c4bdc110f34486d9;p=jalview.git diff --git a/src/jalview/io/BamFile.java b/src/jalview/io/BamFile.java index e9919ff..26c14a2 100644 --- a/src/jalview/io/BamFile.java +++ b/src/jalview/io/BamFile.java @@ -20,17 +20,14 @@ */ package jalview.io; +import jalview.datamodel.CigarParser; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceI; import java.io.File; import java.io.IOException; -import java.util.Arrays; -import java.util.Iterator; import java.util.SortedMap; -import java.util.TreeMap; -import htsjdk.samtools.CigarElement; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecordIterator; import htsjdk.samtools.SamReader; @@ -94,188 +91,30 @@ public class BamFile extends AlignFile public void parse() throws IOException { SAMRecordIterator it = fileReader.iterator(); - SortedMap insertions = getInsertions(it); + CigarParser parser = new CigarParser('-'); + SortedMap insertions = parser.getInsertions(it); it.close(); it = fileReader.iterator(); while (it.hasNext()) { SAMRecord rec = it.next(); - String read = rec.getReadString(); - int start = rec.getStart(); - int end = rec.getEnd(); - SequenceI seq = new Sequence(rec.getReadName(), rec.getReadString()); + // make dataset sequence: start at 1, end at read length + SequenceI seq = new Sequence(rec.getReadName(), + rec.getReadString().toLowerCase()); + seq.setStart(1); + seq.setEnd(rec.getReadLength()); - String cigarredRead = parseCigarToSequence(read, rec, '-'); + String newRead = parser.parseCigarToSequence(rec, insertions); + // make alignment sequences SequenceI alsq = seq.deriveSequence(); - alsq.setSequence(cigarredRead); - alsq.setStart(start); - alsq.setEnd(end); - seqs.add(alsq); - } - - for (SequenceI seq : seqs) - { - int insertCount = 0; - SortedMap seqInserts = insertions.subMap(0, - seq.getEnd()); // TODO start point should be start of alignment - // not 0 - seqInserts.forEach(action); - while ((nextInsertion != -1) && (nextInsertion < seq.getEnd())) - { - seq.insertCharsAt(nextInsertion + insertCount, '-'); - // nextInsertion = insertions.nextSetBit(nextInsertion + 1); - insertCount++; - } - } - } - - /* - 1234567 - ABCDEFG insert 3,4 - - R1: insert 3 = gap before 3rd res - AB.CDEFG - - R2: insert 4 = gap before 4th res - ABC.DEFG - - R3: insert 3,4 = 2 gaps before 3rd res - AB..CDEFG - - => AB--C-DEFG - - So need to store insertions as (pos, length) giving here (3,1),(4,1),(3,2) - then can sub longest length at position so (3,2), (4,1) - Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added - - */ - /** - * Apply the CIGAR string to a read sequence and return the updated read. - * - * @param read - * the read to update - * @param rec - * the corresponding SAM record - * @param gapChar - * gap character to use - * @return string representing read with gaps, clipping etc applied - */ - private String parseCigarToSequence(String read, - SAMRecord rec, char gapChar) - { - Iterator it = rec.getCigar().getCigarElements() - .iterator(); - - StringBuilder newRead = new StringBuilder(); - int next = 0; - int ap = 0; // position of query region qi.start; - - // pad with spaces before read - for (int alp = rec - .getReferencePositionAtReadPosition(next + 1); ap < alp;) - { - newRead.append(" "); - ap++; - } - - while (it.hasNext()) - { - CigarElement el = it.next(); - int length = el.getLength(); - char[] gaps; - - switch (el.getOperator()) - { - case M: - // matched residues - newRead.append( - read.substring(next, next + length)); - next += length; - break; - case N: // intron in RNA - case D: // deletion - // add gaps - gaps = new char[length]; - Arrays.fill(gaps, gapChar); - newRead.append(gaps); - break; - case S: - // soft clipping - just skip this bit of the read - // do nothing - - // work out how many gaps we need before the start of the soft clip - - // don't do this at the end of the read! - if (next == 0) // at start of read - { - int numgaps = rec.getUnclippedStart(); - gaps = new char[numgaps]; - Arrays.fill(gaps, ' '); - newRead.append(gaps); - } - - newRead.append( - read.substring(next, next + length).toLowerCase()); - next += length; - break; - case I: - // the reference sequence and other reads should have been gapped for - // this insertion, so just add in the residues - newRead.append(read.substring(next, next + length)); - next += length; - break; - case H: - // hard clipping - this stretch will not appear in the read - break; - default: - // P, X EQ don't know what to do with these - break; - } - - } - return newRead.toString(); - } + alsq.setSequence(newRead); - /** - * Get list of positions inserted to the reference sequence - * - * @param it - * @return - */ - private SortedMap getInsertions(Iterator it) - { - SortedMap insertions = new TreeMap<>(); - while (it.hasNext()) - { - // check each record for insertions in the CIGAR string - SAMRecord rec = it.next(); - Iterator cit = rec.getCigar().getCigarElements() - .iterator(); - int next = 0; - while (cit.hasNext()) - { - CigarElement el = cit.next(); - switch (el.getOperator()) - { - case I: - // add to insertions list, and move along the read - int refLocation = rec.getReferencePositionAtReadPosition(next); - insertions.put(refLocation, el.getLength()); - next += el.getLength(); - break; - case M: - // match to reference, move along the read - next += el.getLength(); - break; - default: - // deletions, introns etc don't consume any residues from the read - break; - } - } - + // set start relative to soft clip; assume end is set by Sequence code + alsq.setStart(rec.getStart() - rec.getUnclippedStart() + 1); + seqs.add(alsq); } - return insertions; } - }