*/
package jalview.io;
+import jalview.datamodel.CigarParser;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceI;
import java.io.File;
import java.io.IOException;
-import java.util.Arrays;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.Map.Entry;
+import java.util.ArrayList;
+import java.util.List;
import java.util.SortedMap;
-import java.util.TreeMap;
-import htsjdk.samtools.CigarElement;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
+import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.ValidationStringency;
public class BamFile extends AlignFile
{
+ // SAM/BAM file reader
+ private SamReader fileReader;
- SamReader fileReader;
+ // start position to read from
+ private int start = -1;
+
+ // end position to read to
+ private int end = -1;
+
+ // chromosome/contig to read
+ private String chromosome = "";
+
+ // first position in alignment
+ private int alignmentStart = -1;
/**
* Creates a new BamFile object.
* Creates a new BamFile object.
*
* @param inFile
- * DOCUMENT ME!
+ * Name of file to read
* @param sourceType
- * DOCUMENT ME!
+ * Whether data source is file, url or other type of data source
*
* @throws IOException
- * DOCUMENT ME!
*/
public BamFile(String inFile, DataSourceType sourceType)
throws IOException
{
- super(inFile, sourceType);
+ super(true, inFile, sourceType);
final SamReaderFactory factory = SamReaderFactory.makeDefault()
.enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS,
SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS)
fileReader = factory.open(new File(inFile));
}
+ /**
+ * Creates a new BamFile object
+ *
+ * @param source
+ * wrapper for datasource
+ * @throws IOException
+ */
public BamFile(FileParse source) throws IOException
{
+ super(true, source);
final SamReaderFactory factory = SamReaderFactory.makeDefault()
.enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS,
SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS)
// File-based bam
fileReader = factory.open(source.inFile);
- parse();
}
@Override
}
@Override
- public void parse() throws IOException
+ public void parse()
{
- SAMRecordIterator it = fileReader.iterator();
- SortedMap<Integer,Integer> insertions = getInsertions(it);
- it.close();
-
- it = fileReader.iterator();
- while (it.hasNext())
+ // only actually parse if params are set
+ if (chromosome != null && chromosome != "")
{
- SAMRecord rec = it.next();
- String read = rec.getReadString();
- int start = rec.getStart();
- int end = rec.getEnd();
-
- SequenceI seq = new Sequence(rec.getReadName(), rec.getReadString());
-
- String cigarredRead = parseCigarToSequence(read, rec, insertions,
- '-');
-
- SequenceI alsq = seq.deriveSequence();
- alsq.setSequence(cigarredRead);
- alsq.setStart(start);
- alsq.setEnd(end);
- seqs.add(alsq);
- }
-
- // put insertions back into each sequence
- for (SequenceI seq : seqs)
- {
- int insertCount = 0;
- SortedMap<Integer, Integer> seqInserts = insertions
- .subMap(seq.getStart(),
- seq.getEnd()); // TODO start point should be start of alignment
- // not 0
-
- Iterator<Map.Entry<Integer, Integer>> iit = seqInserts.entrySet()
- .iterator();
- while (iit.hasNext())
+ SAMRecordIterator it = fileReader.query(chromosome, start, end,
+ false);
+ CigarParser parser = new CigarParser('-');
+ SortedMap<Integer, Integer> insertions = parser.getInsertions(it);
+ it.close();
+
+ it = fileReader.query(chromosome, start, end, false);
+ while (it.hasNext())
{
- // TODO account for sequence already having insertion, and INDELS
- Entry<Integer, Integer> entry = iit.next();
- int pos = entry.getKey();
- int length = entry.getValue();
- seq.insertCharAt(pos + insertCount, length, '-');
- insertCount++;
- }
- }
- }
-
- /*
- 1234567
- ABCDEFG insert 3,4
-
- R1: insert 3 = gap before 3rd res
- AB.CDEFG
-
- R2: insert 4 = gap before 4th res
- ABC.DEFG
-
- R3: insert 3,4 = 2 gaps before 3rd res
- AB..CDEFG
-
- => AB--C-DEFG
-
- So need to store insertions as (pos, length) giving here (3,1),(4,1),(3,2) - then can sub longest length at position so (3,2), (4,1)
- Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added
-
- */
- /**
- * Apply the CIGAR string to a read sequence and return the updated read.
- *
- * @param read
- * the read to update
- * @param rec
- * the corresponding SAM record
- * @param gapChar
- * gap character to use
- * @return string representing read with gaps, clipping etc applied
- */
- private String parseCigarToSequence(String read,
- SAMRecord rec, SortedMap<Integer, Integer> insertions,
- char gapChar)
- {
- Iterator<CigarElement> it = rec.getCigar().getCigarElements()
- .iterator();
-
- StringBuilder newRead = new StringBuilder();
- int next = 0;
-
- // pad with spaces before read
- // first count gaps needed to pad to reference position
- int gaps = rec.getReferencePositionAtReadPosition(next + 1);
- // now count gaps to pad for insertions in other reads
- int insertCount = countInsertionsBeforeRead(rec, insertions);
- addGaps(newRead, gaps + insertCount, gapChar);
-
- SortedMap<Integer, Integer> seqInserts = insertions
- .subMap(rec.getStart(), rec.getEnd() + 1);
- // TODO start point should be start of alignment not 0
-
- Iterator<Map.Entry<Integer, Integer>> iit = seqInserts.entrySet()
- .iterator();
- if (iit.hasNext())
- {
- // TODO account for sequence already having insertion, and INDELS
- Entry<Integer, Integer> entry = iit.next();
- int insertPos = entry.getKey();
- int insertLen = entry.getValue();
- // seq.insertCharAt(pos + insertCount, length, '-');
- // insertCount++;
- }
+ SAMRecord rec = it.next();
- while (it.hasNext())
- {
- CigarElement el = it.next();
- int length = el.getLength();
-
-
- switch (el.getOperator())
- {
- case M:
- case X:
- case EQ:
- // matched or mismatched residues
- newRead.append(read.substring(next, next + length));
- next += length;
- break;
- case N: // intron in RNA
- case D: // deletion
- addGaps(newRead, length, gapChar);
- break;
- case S:
- // soft clipping - just skip this bit of the read
- // do nothing
-
- // work out how many gaps we need before the start of the soft clip -
- // don't do this at the end of the read!
- if (next == 0) // at start of read
+ // set the alignment start to be start of first read (we assume reads
+ // are sorted)
+ if (alignmentStart == -1)
{
- addGaps(newRead, rec.getUnclippedStart(), gapChar);
+ alignmentStart = rec.getAlignmentStart();
}
- newRead.append(read.substring(next, next + length).toLowerCase());
- next += length;
- break;
- case I:
- // the reference sequence and other reads should have been gapped for
- // this insertion, so just add in the residues
- newRead.append(read.substring(next, next + length));
- next += length;
- break;
- case H:
- // hard clipping - this stretch will not appear in the read
- break;
- default:
- break;
- }
+ // make dataset sequence: start at 1, end at read length
+ SequenceI seq = new Sequence(rec.getReadName(),
+ rec.getReadString().toLowerCase());
+ seq.setStart(1);
+ seq.setEnd(rec.getReadLength());
+ String newRead = parser.parseCigarToSequence(rec, insertions,
+ alignmentStart);
- }
- return newRead.toString();
- }
+ // make alignment sequences
+ SequenceI alsq = seq.deriveSequence();
+ alsq.setSequence(newRead);
- private String applyCigar(String read, CigarElement el, char gapChar,
- int next, int length, int softStart)
- {
- StringBuilder newRead = new StringBuilder();
- switch (el.getOperator())
- {
- case M:
- case X:
- case EQ:
- // matched or mismatched residues
- newRead.append(read.substring(next, next + length));
- next += length;
- break;
- case N: // intron in RNA
- case D: // deletion
- addGaps(newRead, length, gapChar);
- break;
- case S:
- // soft clipping - just skip this bit of the read
- // do nothing
-
- // work out how many gaps we need before the start of the soft clip -
- // don't do this at the end of the read!
- if (next == 0) // at start of read
- {
- addGaps(newRead, softStart, gapChar);
+ // set start relative to soft clip; assume end is set by Sequence code
+ alsq.setStart(rec.getStart() - rec.getUnclippedStart() + 1);
+ seqs.add(alsq);
}
-
- newRead.append(read.substring(next, next + length).toLowerCase());
- next += length;
- break;
- case I:
- // the reference sequence and other reads should have been gapped for
- // this insertion, so just add in the residues
- newRead.append(read.substring(next, next + length));
- next += length;
- break;
- case H:
- // hard clipping - this stretch will not appear in the read
- break;
- default:
- break;
}
- return newRead.toString();
}
/**
- * Get list of positions inserted to the reference sequence
+ * Get the list of chromosomes or contigs from the file (listed in SQ entries
+ * in BAM file header)
*
- * @param it
- * @return
+ * @return array of chromosome/contig strings
*/
- private SortedMap<Integer, Integer> getInsertions(Iterator<SAMRecord> it)
+ @Override
+ public Object[] preprocess()
{
- SortedMap<Integer, Integer> insertions = new TreeMap<>();
- while (it.hasNext())
- {
- // check each record for insertions in the CIGAR string
- SAMRecord rec = it.next();
- Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
- .iterator();
- int next = 0;
- while (cit.hasNext())
- {
- CigarElement el = cit.next();
- switch (el.getOperator())
- {
- case I:
- // add to insertions list, and move along the read
- int refLocation = rec.getReferencePositionAtReadPosition(next);
+ List<SAMSequenceRecord> refSeqs = fileReader.getFileHeader()
+ .getSequenceDictionary().getSequences();
+ List<String> chrs = new ArrayList<>();
- // if there's already an insertion at this location, keep the longest
- // insertion; if there's no insertion keep this one
- if (!insertions.containsKey(refLocation)
- || (insertions.containsKey(refLocation)
- && insertions.get(refLocation) < el.getLength()))
- {
- insertions.put(refLocation, el.getLength());
- }
- next += el.getLength();
- break;
- case M:
- // match to reference, move along the read
- next += el.getLength();
- break;
- default:
- // deletions, introns etc don't consume any residues from the read
- break;
- }
- }
-
- }
- return insertions;
- }
-
- /**
- * Add sequence of gaps to a read
- *
- * @param read
- * read to update
- * @param length
- * number of gaps
- * @param gapChar
- * gap character to use
- */
- private void addGaps(StringBuilder read, int length, char gapChar)
- {
- if (length > 0)
+ for (SAMSequenceRecord ref : refSeqs)
{
- char[] gaps = new char[length];
- Arrays.fill(gaps, gapChar);
- read.append(gaps);
+ chrs.add(ref.getSequenceName());
}
+ return chrs.toArray();
}
-
- private int countInsertionsBeforeRead(SAMRecord rec,
- SortedMap<Integer, Integer> insertions)
- {
- int gaps = 0;
-
- // add in any insertion gaps before read
- // TODO start point should be start of alignment not 0
- SortedMap<Integer, Integer> seqInserts = insertions.subMap(0,
- rec.getStart());
- Iterator<Map.Entry<Integer, Integer>> it = seqInserts.entrySet()
- .iterator();
- while (it.hasNext())
- {
- Entry<Integer, Integer> entry = it.next();
- gaps += entry.getValue();
- }
- return gaps;
+ public void setOptions(String chr, int s, int e)
+ {
+ chromosome = chr;
+ start = s;
+ end = e;
}
-
}