--- /dev/null
+package jalview.datamodel;
+
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+import htsjdk.samtools.CigarElement;
+import htsjdk.samtools.CigarOperator;
+import htsjdk.samtools.SAMRecord;
+
+public class CigarParser
+{
+ private final char gapChar;
+
+ public CigarParser(char gapCharacter)
+ {
+ gapChar = gapCharacter;
+ }
+
+ /**
+ * Apply the CIGAR string to a read sequence and return the updated read.
+ *
+ * @param rec
+ * the SAM record for the read
+ * @param insertions
+ * a map of inserted positions and lengths
+ * @return string representing read with gaps, clipping etc applied
+ */
+ public String parseCigarToSequence(SAMRecord rec,
+ SortedMap<Integer, Integer> insertions)
+ {
+ Iterator<CigarElement> it = rec.getCigar().getCigarElements()
+ .iterator();
+
+ StringBuilder newRead = new StringBuilder();
+ int next = 0;
+ int refnext = 0;
+
+ // pad with spaces before read
+ // first count gaps needed to pad to reference position
+ // NB position is 1-based so number of gaps = pos-1
+ int gaps = rec.getUnclippedStart() - 1;
+ // now count gaps to pad for insertions in other reads
+ int insertCount = countInsertionsBeforeRead(rec, insertions);
+ addGaps(newRead, gaps + insertCount);
+
+ int lastinserts = 0;
+ while (it.hasNext())
+ {
+ CigarElement el = it.next();
+ int length = el.getLength();
+
+ // which insertions coincide with current CIGAR operation?
+ SortedMap<Integer, Integer> seqInserts = insertions.subMap(
+ rec.getStart() + refnext, rec.getStart() + refnext + length);
+
+ int[] override = null;
+ if (lastinserts > 0)
+ {
+ // we just inserted something
+ // remove these inserts now - should be very first entry
+ int count = seqInserts.get(rec.getStart() + refnext);
+ override = new int[] { rec.getStart() + refnext,
+ count - lastinserts };
+ lastinserts = 0;
+ }
+
+ Iterator<Map.Entry<Integer, Integer>> iit = seqInserts.entrySet()
+ .iterator();
+
+ newRead.append(applyCigarOp(el, next, rec, iit, override));
+
+ if (el.getOperator().consumesReferenceBases())
+ {
+ refnext += length;
+ }
+ if (el.getOperator().consumesReadBases())
+ {
+ next += length;
+ }
+ if (el.getOperator().equals(CigarOperator.I))
+ {
+ // count how many insertions we made
+ lastinserts = length;
+ }
+ }
+ return newRead.toString();
+ }
+
+ /**
+ * Apply a cigar operation to a segment of a read, accounting for any
+ * insertions to the reference from other reads in the region
+ *
+ * @param el
+ * the current CIGAR element
+ * @param next
+ * location in read to start applying CIGAR op
+ * @param length
+ * length of CIGAR op
+ * @param rec
+ * SAMRecord corresponding to read
+ * @param it
+ * iterator over insertions which coincide with rec
+ * @param override
+ * an optional <insertion position, length> which can override a
+ * <position,length> in it to change the length. Set to null if not
+ * used.
+ * @return
+ */
+ private String applyCigarOp(CigarElement el, int next,
+ SAMRecord rec, Iterator<Map.Entry<Integer, Integer>> it,
+ int[] override)
+ {
+ StringBuilder newRead = new StringBuilder();
+ String read = rec.getReadString();
+
+ int length = el.getLength();
+ int nextPos = next;
+
+ switch (el.getOperator())
+ {
+ case M:
+ case X:
+ case EQ:
+ // matched or mismatched residues
+ newRead = applyCigarConsumeReadAndRef(next, length, rec, it,
+ override);
+ break;
+ case N: // intron in RNA
+ case D: // deletion
+ int insertCount = 0;
+ while (it.hasNext())
+ {
+ // just count all the insertions we need to add in the deletion/intron
+ // region
+ Entry<Integer, Integer> entry = it.next();
+ insertCount += entry.getValue();
+ }
+ addGaps(newRead, length + insertCount);
+ break;
+ case S:
+ // soft clipping - just skip this bit of the read
+ // do nothing
+
+ newRead.append(
+ read.substring(nextPos, nextPos + length).toLowerCase());
+ // nextPos += length;
+ break;
+ case I:
+ // the reference sequence and other reads should have been gapped for
+ // this insertion, so just add in the residues
+ newRead.append(read.substring(nextPos, nextPos + length));
+ // nextPos += length;
+ break;
+ case H:
+ // hard clipping - this stretch will not appear in the read
+ default:
+ break;
+ }
+
+ return newRead.toString();
+ }
+
+ /**
+ * Applies a CIGAR operation which consumes both read and reference i.e. M,=,X
+ *
+ * @param next
+ * next position in read
+ * @param length
+ * length of cigar operation
+ * @param rec
+ * SAMRecord to apply the CIGAR op to
+ * @param it
+ * iterator over insertions in the CIGAR region
+ * @param override
+ * insertions which have already been accounted for
+ * @return
+ */
+ private StringBuilder applyCigarConsumeReadAndRef(int next, int length,
+ SAMRecord rec, Iterator<Map.Entry<Integer, Integer>> it,
+ int[] override)
+ {
+ StringBuilder newRead = new StringBuilder();
+ String read = rec.getReadString();
+
+ int nextPos = next;
+ while (nextPos < next + length)
+ {
+ int insertPos = next + length;
+ int insertLen = 0;
+ if (it.hasNext())
+ {
+ // account for sequence already having insertion
+ // if an override entry exists it gives the number of positions still to
+ // be inserted at this point, after accounting for insertions driven by
+ // the current read.
+ // e.g. if the current read has 2 inserted positions here, and the
+ // insertions list also has 2 inserted positions, the override will
+ // cancel the insert for this read. If the insertions list had 3
+ // insertions, the override would reduce this to 1 insertion.
+ Entry<Integer, Integer> entry = it.next();
+ int key = entry.getKey();
+ insertLen = entry.getValue();
+ if (override != null && key == override[0])
+ {
+ insertLen = override[1];
+ }
+ if (insertLen > 0)
+ {
+ insertPos = rec.getReadPositionAtReferencePosition(key) - 1;
+ }
+ }
+ newRead.append(read.substring(nextPos, insertPos));
+ nextPos = insertPos;
+ addGaps(newRead, insertLen); // add insertions
+ }
+ return newRead;
+ }
+
+ /**
+ * Get list of positions inserted to the reference sequence. Note: assumes all
+ * reads are on the same chromosome.
+ *
+ * @param it
+ * Iterator over all the SAMRecords
+ * @return sorted map of insertion positions <position, insertion size>
+ */
+ /*
+ 1234567
+ ABCDEFG insert 3,4
+
+ R1: insert 3 = gap before 3rd res
+ AB.CDEFG
+
+ R2: insert 4 = gap before 4th res
+ ABC.DEFG
+
+ R3: insert 3,4 = 2 gaps before 3rd res
+ AB..CDEFG
+
+ => AB--C-DEFG
+
+ So need to store insertions as (pos, length) giving here (3,1),(4,1),(3,2) - then can sub longest length at position so (3,2), (4,1)
+ Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added
+
+ */
+ public SortedMap<Integer, Integer> getInsertions(Iterator<SAMRecord> it)
+ {
+ SortedMap<Integer, Integer> insertions = new TreeMap<>();
+ while (it.hasNext())
+ {
+ // check each record for insertions in the CIGAR string
+ SAMRecord rec = it.next();
+ Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
+ .iterator();
+ int next = 0;
+ while (cit.hasNext())
+ {
+ CigarElement el = cit.next();
+ switch (el.getOperator())
+ {
+ case I:
+ // add to insertions list, and move along the read
+ // location is 1 past the current position of next
+ // (note if we try to retrieve +1 position from reference by calling
+ // getReferencePositionAtReadPosition(next+1) we get 0 because it's an
+ // insertion!)
+ int refLocation = rec.getReferencePositionAtReadPosition(next)
+ + 1;
+
+ // if there's already an insertion at this location, keep the longest
+ // insertion; if there's no insertion keep this one
+ if (!insertions.containsKey(refLocation)
+ || (insertions.containsKey(refLocation)
+ && insertions.get(refLocation) < el.getLength()))
+ {
+ insertions.put(refLocation, el.getLength());
+ }
+ next += el.getLength();
+ break;
+ case M:
+ case S:
+ // match to reference, or soft clip, move along the read
+ next += el.getLength();
+ break;
+ default:
+ // deletions, introns etc don't consume any residues from the read
+ break;
+ }
+ }
+
+ }
+ return insertions;
+ }
+
+ /**
+ * Add sequence of gaps to a read
+ *
+ * @param read
+ * read to update
+ * @param length
+ * number of gaps
+ */
+ private void addGaps(StringBuilder read, int length)
+ {
+ if (length > 0)
+ {
+ char[] gaps = new char[length];
+ Arrays.fill(gaps, gapChar);
+ read.append(gaps);
+ }
+ }
+
+ /**
+ * Get the number of insertions before the start of an aligned read (i.e.
+ * insertions in soft clipped region are counted too)
+ *
+ * @param rec
+ * the SAMRecord for the read
+ * @param insertions
+ * the map of insertions
+ * @return number of insertions before the read starts
+ */
+ private int countInsertionsBeforeRead(SAMRecord rec,
+ SortedMap<Integer, Integer> insertions)
+ {
+ int gaps = 0;
+
+ // add in any insertion gaps before read
+ // TODO start point should be start of alignment not 0
+ SortedMap<Integer, Integer> seqInserts = insertions.subMap(0,
+ rec.getStart());
+
+ Iterator<Map.Entry<Integer, Integer>> it = seqInserts.entrySet()
+ .iterator();
+ while (it.hasNext())
+ {
+ Entry<Integer, Integer> entry = it.next();
+ gaps += entry.getValue();
+ }
+ return gaps;
+ }
+}
*/
package jalview.io;
+import jalview.datamodel.CigarParser;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceI;
import java.io.File;
import java.io.IOException;
-import java.util.Arrays;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.Map.Entry;
import java.util.SortedMap;
-import java.util.TreeMap;
-import htsjdk.samtools.CigarElement;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.SamReader;
@Override
public void parse() throws IOException
{
+ CigarParser parser = new CigarParser('-');
SAMRecordIterator it = fileReader.iterator();
- SortedMap<Integer,Integer> insertions = getInsertions(it);
+ SortedMap<Integer, Integer> insertions = parser.getInsertions(it);
it.close();
it = fileReader.iterator();
while (it.hasNext())
{
SAMRecord rec = it.next();
- String read = rec.getReadString();
int start = rec.getStart();
int end = rec.getEnd();
SequenceI seq = new Sequence(rec.getReadName(), rec.getReadString());
- String cigarredRead = parseCigarToSequence(read, rec, insertions,
- '-');
+ String cigarredRead = parser.parseCigarToSequence(rec, insertions);
SequenceI alsq = seq.deriveSequence();
alsq.setSequence(cigarredRead);
alsq.setEnd(end);
seqs.add(alsq);
}
-
- // put insertions back into each sequence
- for (SequenceI seq : seqs)
- {
- int insertCount = 0;
- SortedMap<Integer, Integer> seqInserts = insertions
- .subMap(seq.getStart(),
- seq.getEnd()); // TODO start point should be start of alignment
- // not 0
-
- Iterator<Map.Entry<Integer, Integer>> iit = seqInserts.entrySet()
- .iterator();
- while (iit.hasNext())
- {
- // TODO account for sequence already having insertion, and INDELS
- Entry<Integer, Integer> entry = iit.next();
- int pos = entry.getKey();
- int length = entry.getValue();
- seq.insertCharAt(pos + insertCount, length, '-');
- insertCount++;
- }
- }
- }
-
- /*
- 1234567
- ABCDEFG insert 3,4
-
- R1: insert 3 = gap before 3rd res
- AB.CDEFG
-
- R2: insert 4 = gap before 4th res
- ABC.DEFG
-
- R3: insert 3,4 = 2 gaps before 3rd res
- AB..CDEFG
-
- => AB--C-DEFG
-
- So need to store insertions as (pos, length) giving here (3,1),(4,1),(3,2) - then can sub longest length at position so (3,2), (4,1)
- Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added
-
- */
- /**
- * Apply the CIGAR string to a read sequence and return the updated read.
- *
- * @param read
- * the read to update
- * @param rec
- * the corresponding SAM record
- * @param gapChar
- * gap character to use
- * @return string representing read with gaps, clipping etc applied
- */
- private String parseCigarToSequence(String read,
- SAMRecord rec, SortedMap<Integer, Integer> insertions,
- char gapChar)
- {
- Iterator<CigarElement> it = rec.getCigar().getCigarElements()
- .iterator();
-
- StringBuilder newRead = new StringBuilder();
- int next = 0;
-
- // pad with spaces before read
- // first count gaps needed to pad to reference position
- int gaps = rec.getReferencePositionAtReadPosition(next + 1);
- // now count gaps to pad for insertions in other reads
- int insertCount = countInsertionsBeforeRead(rec, insertions);
- addGaps(newRead, gaps + insertCount, gapChar);
-
- SortedMap<Integer, Integer> seqInserts = insertions
- .subMap(rec.getStart(), rec.getEnd() + 1);
- // TODO start point should be start of alignment not 0
-
- Iterator<Map.Entry<Integer, Integer>> iit = seqInserts.entrySet()
- .iterator();
- if (iit.hasNext())
- {
- // TODO account for sequence already having insertion, and INDELS
- Entry<Integer, Integer> entry = iit.next();
- int insertPos = entry.getKey();
- int insertLen = entry.getValue();
- // seq.insertCharAt(pos + insertCount, length, '-');
- // insertCount++;
- }
-
- while (it.hasNext())
- {
- CigarElement el = it.next();
- int length = el.getLength();
-
-
- switch (el.getOperator())
- {
- case M:
- case X:
- case EQ:
- // matched or mismatched residues
- newRead.append(read.substring(next, next + length));
- next += length;
- break;
- case N: // intron in RNA
- case D: // deletion
- addGaps(newRead, length, gapChar);
- break;
- case S:
- // soft clipping - just skip this bit of the read
- // do nothing
-
- // work out how many gaps we need before the start of the soft clip -
- // don't do this at the end of the read!
- if (next == 0) // at start of read
- {
- addGaps(newRead, rec.getUnclippedStart(), gapChar);
- }
-
- newRead.append(read.substring(next, next + length).toLowerCase());
- next += length;
- break;
- case I:
- // the reference sequence and other reads should have been gapped for
- // this insertion, so just add in the residues
- newRead.append(read.substring(next, next + length));
- next += length;
- break;
- case H:
- // hard clipping - this stretch will not appear in the read
- break;
- default:
- break;
- }
-
-
- }
- return newRead.toString();
}
- private String applyCigar(String read, CigarElement el, char gapChar,
- int next, int length, int softStart)
- {
- StringBuilder newRead = new StringBuilder();
- switch (el.getOperator())
- {
- case M:
- case X:
- case EQ:
- // matched or mismatched residues
- newRead.append(read.substring(next, next + length));
- next += length;
- break;
- case N: // intron in RNA
- case D: // deletion
- addGaps(newRead, length, gapChar);
- break;
- case S:
- // soft clipping - just skip this bit of the read
- // do nothing
-
- // work out how many gaps we need before the start of the soft clip -
- // don't do this at the end of the read!
- if (next == 0) // at start of read
- {
- addGaps(newRead, softStart, gapChar);
- }
-
- newRead.append(read.substring(next, next + length).toLowerCase());
- next += length;
- break;
- case I:
- // the reference sequence and other reads should have been gapped for
- // this insertion, so just add in the residues
- newRead.append(read.substring(next, next + length));
- next += length;
- break;
- case H:
- // hard clipping - this stretch will not appear in the read
- break;
- default:
- break;
- }
- return newRead.toString();
- }
-
- /**
- * Get list of positions inserted to the reference sequence
- *
- * @param it
- * @return
- */
- private SortedMap<Integer, Integer> getInsertions(Iterator<SAMRecord> it)
- {
- SortedMap<Integer, Integer> insertions = new TreeMap<>();
- while (it.hasNext())
- {
- // check each record for insertions in the CIGAR string
- SAMRecord rec = it.next();
- Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
- .iterator();
- int next = 0;
- while (cit.hasNext())
- {
- CigarElement el = cit.next();
- switch (el.getOperator())
- {
- case I:
- // add to insertions list, and move along the read
- int refLocation = rec.getReferencePositionAtReadPosition(next);
- // if there's already an insertion at this location, keep the longest
- // insertion; if there's no insertion keep this one
- if (!insertions.containsKey(refLocation)
- || (insertions.containsKey(refLocation)
- && insertions.get(refLocation) < el.getLength()))
- {
- insertions.put(refLocation, el.getLength());
- }
- next += el.getLength();
- break;
- case M:
- // match to reference, move along the read
- next += el.getLength();
- break;
- default:
- // deletions, introns etc don't consume any residues from the read
- break;
- }
- }
- }
- return insertions;
- }
-
- /**
- * Add sequence of gaps to a read
- *
- * @param read
- * read to update
- * @param length
- * number of gaps
- * @param gapChar
- * gap character to use
- */
- private void addGaps(StringBuilder read, int length, char gapChar)
- {
- if (length > 0)
- {
- char[] gaps = new char[length];
- Arrays.fill(gaps, gapChar);
- read.append(gaps);
- }
- }
-
- private int countInsertionsBeforeRead(SAMRecord rec,
- SortedMap<Integer, Integer> insertions)
- {
- int gaps = 0;
-
- // add in any insertion gaps before read
- // TODO start point should be start of alignment not 0
- SortedMap<Integer, Integer> seqInserts = insertions.subMap(0,
- rec.getStart());
-
- Iterator<Map.Entry<Integer, Integer>> it = seqInserts.entrySet()
- .iterator();
- while (it.hasNext())
- {
- Entry<Integer, Integer> entry = it.next();
- gaps += entry.getValue();
- }
- return gaps;
- }
}