*/
package jalview.io;
+import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceI;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
+import java.util.SortedMap;
+import java.util.TreeMap;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.SAMRecord;
@Override
public void parse() throws IOException
{
- SequenceI seq = null;
SAMRecordIterator it = fileReader.iterator();
+ SortedMap<Integer,Integer> insertions = getInsertions(it);
+ it.close();
+
+ it = fileReader.iterator();
while (it.hasNext())
{
SAMRecord rec = it.next();
int start = rec.getStart();
int end = rec.getEnd();
- Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
- .iterator();
+ SequenceI seq = new Sequence(rec.getReadName(), rec.getReadString());
- seq = parseId(rec.getReadName());
- String cigarredRead = parseCigarToSequence(read, cit, '-');
- seq.setSequence(cigarredRead);
- seq.setStart(start);
- seq.setEnd(end);
- seqs.addElement(seq);
+ String cigarredRead = parseCigarToSequence(read, rec, '-');
+
+ SequenceI alsq = seq.deriveSequence();
+ alsq.setSequence(cigarredRead);
+ alsq.setStart(start);
+ alsq.setEnd(end);
+ seqs.add(alsq);
}
+ for (SequenceI seq : seqs)
+ {
+ int insertCount = 0;
+ SortedMap<Integer, Integer> seqInserts = insertions.subMap(0,
+ seq.getEnd()); // TODO start point should be start of alignment
+ // not 0
+ seqInserts.forEach(action);
+ while ((nextInsertion != -1) && (nextInsertion < seq.getEnd()))
+ {
+ seq.insertCharsAt(nextInsertion + insertCount, '-');
+ // nextInsertion = insertions.nextSetBit(nextInsertion + 1);
+ insertCount++;
+ }
+ }
}
+ /*
+ 1234567
+ ABCDEFG insert 3,4
+
+ R1: insert 3 = gap before 3rd res
+ AB.CDEFG
+
+ R2: insert 4 = gap before 4th res
+ ABC.DEFG
+
+ R3: insert 3,4 = 2 gaps before 3rd res
+ AB..CDEFG
+
+ => AB--C-DEFG
+
+ So need to store insertions as (pos, length) giving here (3,1),(4,1),(3,2) - then can sub longest length at position so (3,2), (4,1)
+ Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added
+
+ */
/**
- * Apply the CIGAR string to a read sequence and return the updated read
+ * Apply the CIGAR string to a read sequence and return the updated read.
*
* @param read
* the read to update
- * @param it
- * iterator over cigar elements
+ * @param rec
+ * the corresponding SAM record
* @param gapChar
* gap character to use
* @return string representing read with gaps, clipping etc applied
*/
private String parseCigarToSequence(String read,
- Iterator<CigarElement> it, char gapChar)
+ SAMRecord rec, char gapChar)
{
+ Iterator<CigarElement> it = rec.getCigar().getCigarElements()
+ .iterator();
+
StringBuilder newRead = new StringBuilder();
int next = 0;
+ int ap = 0; // position of query region qi.start;
+
+ // pad with spaces before read
+ for (int alp = rec
+ .getReferencePositionAtReadPosition(next + 1); ap < alp;)
+ {
+ newRead.append(" ");
+ ap++;
+ }
while (it.hasNext())
{
CigarElement el = it.next();
int length = el.getLength();
+ char[] gaps;
+
switch (el.getOperator())
{
case M:
// matched residues
- newRead.append(read.substring(next, next + length - 1));
+ newRead.append(
+ read.substring(next, next + length));
next += length;
break;
case N: // intron in RNA
case D: // deletion
// add gaps
- char[] gaps = new char[length];
+ gaps = new char[length];
Arrays.fill(gaps, gapChar);
newRead.append(gaps);
break;
case S:
// soft clipping - just skip this bit of the read
// do nothing
+
+ // work out how many gaps we need before the start of the soft clip -
+ // don't do this at the end of the read!
+ if (next == 0) // at start of read
+ {
+ int numgaps = rec.getUnclippedStart();
+ gaps = new char[numgaps];
+ Arrays.fill(gaps, ' ');
+ newRead.append(gaps);
+ }
+
+ newRead.append(
+ read.substring(next, next + length).toLowerCase());
next += length;
break;
case I:
- // add gaps to the reference sequence, which we know nothing about just
- // now
- // TODO
- newRead.append(read.substring(next, next + length - 1));
+ // the reference sequence and other reads should have been gapped for
+ // this insertion, so just add in the residues
+ newRead.append(read.substring(next, next + length));
+ next += length;
break;
case H:
// hard clipping - this stretch will not appear in the read
return newRead.toString();
}
+ /**
+ * Get list of positions inserted to the reference sequence
+ *
+ * @param it
+ * @return
+ */
+ private SortedMap<Integer, Integer> getInsertions(Iterator<SAMRecord> it)
+ {
+ SortedMap<Integer, Integer> insertions = new TreeMap<>();
+ while (it.hasNext())
+ {
+ // check each record for insertions in the CIGAR string
+ SAMRecord rec = it.next();
+ Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
+ .iterator();
+ int next = 0;
+ while (cit.hasNext())
+ {
+ CigarElement el = cit.next();
+ switch (el.getOperator())
+ {
+ case I:
+ // add to insertions list, and move along the read
+ int refLocation = rec.getReferencePositionAtReadPosition(next);
+ insertions.put(refLocation, el.getLength());
+ next += el.getLength();
+ break;
+ case M:
+ // match to reference, move along the read
+ next += el.getLength();
+ break;
+ default:
+ // deletions, introns etc don't consume any residues from the read
+ break;
+ }
+ }
+
+ }
+ return insertions;
+ }
+
}