*/
package jalview.io;
+import jalview.datamodel.CigarParser;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceI;
import java.io.File;
import java.io.IOException;
-import java.util.Arrays;
-import java.util.Iterator;
import java.util.SortedMap;
-import java.util.TreeMap;
-import htsjdk.samtools.CigarElement;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.SamReader;
public void parse() throws IOException
{
SAMRecordIterator it = fileReader.iterator();
- SortedMap<Integer,Integer> insertions = getInsertions(it);
+ CigarParser parser = new CigarParser('-');
+ SortedMap<Integer, Integer> insertions = parser.getInsertions(it);
it.close();
it = fileReader.iterator();
while (it.hasNext())
{
SAMRecord rec = it.next();
- String read = rec.getReadString();
- int start = rec.getStart();
- int end = rec.getEnd();
- SequenceI seq = new Sequence(rec.getReadName(), rec.getReadString());
+ // make dataset sequence: start at 1, end at read length
+ SequenceI seq = new Sequence(rec.getReadName(),
+ rec.getReadString().toLowerCase());
+ seq.setStart(1);
+ seq.setEnd(rec.getReadLength());
- String cigarredRead = parseCigarToSequence(read, rec, '-');
+ String newRead = parser.parseCigarToSequence(rec, insertions);
+ // make alignment sequences
SequenceI alsq = seq.deriveSequence();
- alsq.setSequence(cigarredRead);
- alsq.setStart(start);
- alsq.setEnd(end);
- seqs.add(alsq);
- }
-
- for (SequenceI seq : seqs)
- {
- int insertCount = 0;
- SortedMap<Integer, Integer> seqInserts = insertions.subMap(0,
- seq.getEnd()); // TODO start point should be start of alignment
- // not 0
- seqInserts.forEach(action);
- while ((nextInsertion != -1) && (nextInsertion < seq.getEnd()))
- {
- seq.insertCharsAt(nextInsertion + insertCount, '-');
- // nextInsertion = insertions.nextSetBit(nextInsertion + 1);
- insertCount++;
- }
- }
- }
-
- /*
- 1234567
- ABCDEFG insert 3,4
-
- R1: insert 3 = gap before 3rd res
- AB.CDEFG
-
- R2: insert 4 = gap before 4th res
- ABC.DEFG
-
- R3: insert 3,4 = 2 gaps before 3rd res
- AB..CDEFG
-
- => AB--C-DEFG
-
- So need to store insertions as (pos, length) giving here (3,1),(4,1),(3,2) - then can sub longest length at position so (3,2), (4,1)
- Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added
-
- */
- /**
- * Apply the CIGAR string to a read sequence and return the updated read.
- *
- * @param read
- * the read to update
- * @param rec
- * the corresponding SAM record
- * @param gapChar
- * gap character to use
- * @return string representing read with gaps, clipping etc applied
- */
- private String parseCigarToSequence(String read,
- SAMRecord rec, char gapChar)
- {
- Iterator<CigarElement> it = rec.getCigar().getCigarElements()
- .iterator();
-
- StringBuilder newRead = new StringBuilder();
- int next = 0;
- int ap = 0; // position of query region qi.start;
-
- // pad with spaces before read
- for (int alp = rec
- .getReferencePositionAtReadPosition(next + 1); ap < alp;)
- {
- newRead.append(" ");
- ap++;
- }
-
- while (it.hasNext())
- {
- CigarElement el = it.next();
- int length = el.getLength();
- char[] gaps;
-
- switch (el.getOperator())
- {
- case M:
- // matched residues
- newRead.append(
- read.substring(next, next + length));
- next += length;
- break;
- case N: // intron in RNA
- case D: // deletion
- // add gaps
- gaps = new char[length];
- Arrays.fill(gaps, gapChar);
- newRead.append(gaps);
- break;
- case S:
- // soft clipping - just skip this bit of the read
- // do nothing
-
- // work out how many gaps we need before the start of the soft clip -
- // don't do this at the end of the read!
- if (next == 0) // at start of read
- {
- int numgaps = rec.getUnclippedStart();
- gaps = new char[numgaps];
- Arrays.fill(gaps, ' ');
- newRead.append(gaps);
- }
-
- newRead.append(
- read.substring(next, next + length).toLowerCase());
- next += length;
- break;
- case I:
- // the reference sequence and other reads should have been gapped for
- // this insertion, so just add in the residues
- newRead.append(read.substring(next, next + length));
- next += length;
- break;
- case H:
- // hard clipping - this stretch will not appear in the read
- break;
- default:
- // P, X EQ don't know what to do with these
- break;
- }
-
- }
- return newRead.toString();
- }
+ alsq.setSequence(newRead);
- /**
- * Get list of positions inserted to the reference sequence
- *
- * @param it
- * @return
- */
- private SortedMap<Integer, Integer> getInsertions(Iterator<SAMRecord> it)
- {
- SortedMap<Integer, Integer> insertions = new TreeMap<>();
- while (it.hasNext())
- {
- // check each record for insertions in the CIGAR string
- SAMRecord rec = it.next();
- Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
- .iterator();
- int next = 0;
- while (cit.hasNext())
- {
- CigarElement el = cit.next();
- switch (el.getOperator())
- {
- case I:
- // add to insertions list, and move along the read
- int refLocation = rec.getReferencePositionAtReadPosition(next);
- insertions.put(refLocation, el.getLength());
- next += el.getLength();
- break;
- case M:
- // match to reference, move along the read
- next += el.getLength();
- break;
- default:
- // deletions, introns etc don't consume any residues from the read
- break;
- }
- }
-
+ // set start relative to soft clip; assume end is set by Sequence code
+ alsq.setStart(rec.getStart() - rec.getUnclippedStart() + 1);
+ seqs.add(alsq);
}
- return insertions;
}
-
}