import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
+import java.util.Map;
+import java.util.Map.Entry;
import java.util.SortedMap;
import java.util.TreeMap;
SequenceI seq = new Sequence(rec.getReadName(), rec.getReadString());
- String cigarredRead = parseCigarToSequence(read, rec, '-');
+ String cigarredRead = parseCigarToSequence(read, rec, insertions,
+ '-');
SequenceI alsq = seq.deriveSequence();
alsq.setSequence(cigarredRead);
seqs.add(alsq);
}
+ // put insertions back into each sequence
for (SequenceI seq : seqs)
{
int insertCount = 0;
- SortedMap<Integer, Integer> seqInserts = insertions.subMap(0,
+ SortedMap<Integer, Integer> seqInserts = insertions
+ .subMap(seq.getStart(),
seq.getEnd()); // TODO start point should be start of alignment
// not 0
- seqInserts.forEach(action);
- while ((nextInsertion != -1) && (nextInsertion < seq.getEnd()))
+
+ Iterator<Map.Entry<Integer, Integer>> iit = seqInserts.entrySet()
+ .iterator();
+ while (iit.hasNext())
{
- seq.insertCharsAt(nextInsertion + insertCount, '-');
- // nextInsertion = insertions.nextSetBit(nextInsertion + 1);
+ // TODO account for sequence already having insertion, and INDELS
+ Entry<Integer, Integer> entry = iit.next();
+ int pos = entry.getKey();
+ int length = entry.getValue();
+ seq.insertCharAt(pos + insertCount, length, '-');
insertCount++;
}
}
* @return string representing read with gaps, clipping etc applied
*/
private String parseCigarToSequence(String read,
- SAMRecord rec, char gapChar)
+ SAMRecord rec, SortedMap<Integer, Integer> insertions,
+ char gapChar)
{
Iterator<CigarElement> it = rec.getCigar().getCigarElements()
.iterator();
StringBuilder newRead = new StringBuilder();
int next = 0;
- int ap = 0; // position of query region qi.start;
// pad with spaces before read
- for (int alp = rec
- .getReferencePositionAtReadPosition(next + 1); ap < alp;)
+ // first count gaps needed to pad to reference position
+ int gaps = rec.getReferencePositionAtReadPosition(next + 1);
+ // now count gaps to pad for insertions in other reads
+ int insertCount = countInsertionsBeforeRead(rec, insertions);
+ addGaps(newRead, gaps + insertCount, gapChar);
+
+ SortedMap<Integer, Integer> seqInserts = insertions
+ .subMap(rec.getStart(), rec.getEnd() + 1);
+ // TODO start point should be start of alignment not 0
+
+ Iterator<Map.Entry<Integer, Integer>> iit = seqInserts.entrySet()
+ .iterator();
+ if (iit.hasNext())
{
- newRead.append(" ");
- ap++;
+ // TODO account for sequence already having insertion, and INDELS
+ Entry<Integer, Integer> entry = iit.next();
+ int insertPos = entry.getKey();
+ int insertLen = entry.getValue();
+ // seq.insertCharAt(pos + insertCount, length, '-');
+ // insertCount++;
}
while (it.hasNext())
{
CigarElement el = it.next();
int length = el.getLength();
- char[] gaps;
+
switch (el.getOperator())
{
case M:
- // matched residues
- newRead.append(
- read.substring(next, next + length));
+ case X:
+ case EQ:
+ // matched or mismatched residues
+ newRead.append(read.substring(next, next + length));
next += length;
break;
case N: // intron in RNA
case D: // deletion
- // add gaps
- gaps = new char[length];
- Arrays.fill(gaps, gapChar);
- newRead.append(gaps);
+ addGaps(newRead, length, gapChar);
break;
case S:
// soft clipping - just skip this bit of the read
// don't do this at the end of the read!
if (next == 0) // at start of read
{
- int numgaps = rec.getUnclippedStart();
- gaps = new char[numgaps];
- Arrays.fill(gaps, ' ');
- newRead.append(gaps);
+ addGaps(newRead, rec.getUnclippedStart(), gapChar);
}
- newRead.append(
- read.substring(next, next + length).toLowerCase());
+ newRead.append(read.substring(next, next + length).toLowerCase());
next += length;
break;
case I:
// hard clipping - this stretch will not appear in the read
break;
default:
- // P, X EQ don't know what to do with these
break;
}
+
+ }
+ return newRead.toString();
+ }
+
+ private String applyCigar(String read, CigarElement el, char gapChar,
+ int next, int length, int softStart)
+ {
+ StringBuilder newRead = new StringBuilder();
+ switch (el.getOperator())
+ {
+ case M:
+ case X:
+ case EQ:
+ // matched or mismatched residues
+ newRead.append(read.substring(next, next + length));
+ next += length;
+ break;
+ case N: // intron in RNA
+ case D: // deletion
+ addGaps(newRead, length, gapChar);
+ break;
+ case S:
+ // soft clipping - just skip this bit of the read
+ // do nothing
+
+ // work out how many gaps we need before the start of the soft clip -
+ // don't do this at the end of the read!
+ if (next == 0) // at start of read
+ {
+ addGaps(newRead, softStart, gapChar);
+ }
+
+ newRead.append(read.substring(next, next + length).toLowerCase());
+ next += length;
+ break;
+ case I:
+ // the reference sequence and other reads should have been gapped for
+ // this insertion, so just add in the residues
+ newRead.append(read.substring(next, next + length));
+ next += length;
+ break;
+ case H:
+ // hard clipping - this stretch will not appear in the read
+ break;
+ default:
+ break;
}
return newRead.toString();
}
case I:
// add to insertions list, and move along the read
int refLocation = rec.getReferencePositionAtReadPosition(next);
- insertions.put(refLocation, el.getLength());
+
+ // if there's already an insertion at this location, keep the longest
+ // insertion; if there's no insertion keep this one
+ if (!insertions.containsKey(refLocation)
+ || (insertions.containsKey(refLocation)
+ && insertions.get(refLocation) < el.getLength()))
+ {
+ insertions.put(refLocation, el.getLength());
+ }
next += el.getLength();
break;
case M:
return insertions;
}
+ /**
+ * Add sequence of gaps to a read
+ *
+ * @param read
+ * read to update
+ * @param length
+ * number of gaps
+ * @param gapChar
+ * gap character to use
+ */
+ private void addGaps(StringBuilder read, int length, char gapChar)
+ {
+ if (length > 0)
+ {
+ char[] gaps = new char[length];
+ Arrays.fill(gaps, gapChar);
+ read.append(gaps);
+ }
+ }
+
+ private int countInsertionsBeforeRead(SAMRecord rec,
+ SortedMap<Integer, Integer> insertions)
+ {
+ int gaps = 0;
+
+ // add in any insertion gaps before read
+ // TODO start point should be start of alignment not 0
+ SortedMap<Integer, Integer> seqInserts = insertions.subMap(0,
+ rec.getStart());
+
+ Iterator<Map.Entry<Integer, Integer>> it = seqInserts.entrySet()
+ .iterator();
+ while (it.hasNext())
+ {
+ Entry<Integer, Integer> entry = it.next();
+ gaps += entry.getValue();
+ }
+ return gaps;
+ }
+
}