X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2FBamFile.java;h=2d079cfb90a1d8738ecf66fceb3e603bfb3fad49;hb=refs%2Fheads%2Ffeatures%2FJAL-2909_bamImport_2_11_2;hp=be5e3eee90e4722578c1ad12000c5e3bc86bc71d;hpb=69b00177c988226aace46a0aae533a9904da2535;p=jalview.git diff --git a/src/jalview/io/BamFile.java b/src/jalview/io/BamFile.java index be5e3ee..2d079cf 100644 --- a/src/jalview/io/BamFile.java +++ b/src/jalview/io/BamFile.java @@ -20,29 +20,49 @@ */ package jalview.io; +import jalview.bin.Cache; +import jalview.datamodel.CigarParser; +import jalview.datamodel.Range; import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import java.io.File; import java.io.IOException; -import java.util.Arrays; -import java.util.Iterator; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; import java.util.Map; -import java.util.Map.Entry; +import java.util.PrimitiveIterator.OfInt; import java.util.SortedMap; -import java.util.TreeMap; -import htsjdk.samtools.CigarElement; +import htsjdk.samtools.SAMFormatException; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.SamInputResource; import htsjdk.samtools.SamReader; import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.ValidationStringency; public class BamFile extends AlignFile { + // SAM/BAM file reader + private SamReader fileReader; - SamReader fileReader; + // start position to read from + private int start = -1; + + // end position to read to + private int end = -1; + + // chromosome/contig to read + private String chromosome = ""; + + // first position in alignment + private int alignmentStart = -1; + + private File _bamFile; /** * Creates a new BamFile object. @@ -55,34 +75,57 @@ public class BamFile extends AlignFile * Creates a new BamFile object. * * @param inFile - * DOCUMENT ME! + * Name of file to read * @param sourceType - * DOCUMENT ME! + * Whether data source is file, url or other type of data source * * @throws IOException - * DOCUMENT ME! */ public BamFile(String inFile, DataSourceType sourceType) throws IOException { - super(inFile, sourceType); - final SamReaderFactory factory = SamReaderFactory.makeDefault() - .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, - SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS) - .validationStringency(ValidationStringency.SILENT); - fileReader = factory.open(new File(inFile)); + super(true, inFile, sourceType); + _bamFile = new File(inFile); + initFileReader(); } - - public BamFile(FileParse source) throws IOException + private void initFileReader() throws IOException { final SamReaderFactory factory = SamReaderFactory.makeDefault() .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS) .validationStringency(ValidationStringency.SILENT); - // File-based bam - fileReader = factory.open(source.inFile); - parse(); + if (_bamFile!=null) + { + fileReader = factory.open(_bamFile); // will need to be adapted for JalviewJS/etc + } + else + { + // try and locate index ? + String index = getDataName() + ".bai"; + fileReader = factory.open(SamInputResource.of(getDataName()) + .index(new URL(index))); + } + } + /** + * Creates a new BamFile object + * + * @param source + * wrapper for datasource + * @throws IOException + */ + public BamFile(FileParse source) throws IOException + { + super(false, source); + parseSuffix(); + if (source.getDataSourceType() == DataSourceType.FILE) + { + _bamFile = source.inFile; + } else { + + } + initFileReader(); + doParse(); } @Override @@ -92,303 +135,230 @@ public class BamFile extends AlignFile return null; } - @Override - public void parse() throws IOException + private StringBuilder insertRefSeq( + SortedMap insertions, Range xtent) { - SAMRecordIterator it = fileReader.iterator(); - SortedMap insertions = getInsertions(it); - it.close(); - - it = fileReader.iterator(); - while (it.hasNext()) + StringBuilder refseq = new StringBuilder(); + int inserted = 0; + for (int p = xtent.start; p < xtent.end; p++) { - SAMRecord rec = it.next(); - String read = rec.getReadString(); - int start = rec.getStart(); - int end = rec.getEnd(); - - SequenceI seq = new Sequence(rec.getReadName(), rec.getReadString()); - - String cigarredRead = parseCigarToSequence(read, rec, insertions, - '-'); - - SequenceI alsq = seq.deriveSequence(); - alsq.setSequence(cigarredRead); - alsq.setStart(start); - alsq.setEnd(end); - seqs.add(alsq); + refseq.append('N'); } - - // put insertions back into each sequence - for (SequenceI seq : seqs) + for (Map.Entry insert : insertions.entrySet()) { - int insertCount = 0; - SortedMap seqInserts = insertions - .subMap(seq.getStart(), - seq.getEnd()); // TODO start point should be start of alignment - // not 0 - - Iterator> iit = seqInserts.entrySet() - .iterator(); - while (iit.hasNext()) + int inspos = insert.getKey() - xtent.start + inserted; + if (inspos < 0) { - // TODO account for sequence already having insertion, and INDELS - Entry entry = iit.next(); - int pos = entry.getKey(); - int length = entry.getValue(); - seq.insertCharAt(pos + insertCount, length, '-'); - insertCount++; + System.out.println("Ignoring -ve insert position " + insert.getKey() + + " of " + insert.getValue() + + " (alpos: " + inspos + ")"); + inspos = 0; + // continue; + } + for (int i = 0, j = insert.getValue(); i < j; i++) + { + inserted++; + refseq.insert(inspos, '-'); } } + return refseq; } - /* - 1234567 - ABCDEFG insert 3,4 - - R1: insert 3 = gap before 3rd res - AB.CDEFG - - R2: insert 4 = gap before 4th res - ABC.DEFG - - R3: insert 3,4 = 2 gaps before 3rd res - AB..CDEFG - - => AB--C-DEFG - - So need to store insertions as (pos, length) giving here (3,1),(4,1),(3,2) - then can sub longest length at position so (3,2), (4,1) - Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added - - */ - /** - * Apply the CIGAR string to a read sequence and return the updated read. - * - * @param read - * the read to update - * @param rec - * the corresponding SAM record - * @param gapChar - * gap character to use - * @return string representing read with gaps, clipping etc applied - */ - private String parseCigarToSequence(String read, - SAMRecord rec, SortedMap insertions, - char gapChar) - { - Iterator it = rec.getCigar().getCigarElements() - .iterator(); - - StringBuilder newRead = new StringBuilder(); - int next = 0; + private void padRef(SequenceI ref, CigarParser cig) + { // pad to column + int padding = cig.firstAlColumn - ref.findIndex(cig.firstRposCol); + System.out.println("Padding " + padding + " to move refseq position " + + cig.firstRposCol + " to " + cig.firstAlColumn + "col."); - // pad with spaces before read - // first count gaps needed to pad to reference position - int gaps = rec.getReferencePositionAtReadPosition(next + 1); - // now count gaps to pad for insertions in other reads - int insertCount = countInsertionsBeforeRead(rec, insertions); - addGaps(newRead, gaps + insertCount, gapChar); - - SortedMap seqInserts = insertions - .subMap(rec.getStart(), rec.getEnd() + 1); - // TODO start point should be start of alignment not 0 - - Iterator> iit = seqInserts.entrySet() - .iterator(); - if (iit.hasNext()) - { - // TODO account for sequence already having insertion, and INDELS - Entry entry = iit.next(); - int insertPos = entry.getKey(); - int insertLen = entry.getValue(); - // seq.insertCharAt(pos + insertCount, length, '-'); - // insertCount++; - } + ref.insertCharAt(0, padding, '-'); + } - while (it.hasNext()) + @Override + public void parse() + { + boolean needToReopen=false; + // only actually parse if params are set + if (chromosome != null && chromosome != "") { - CigarElement el = it.next(); - int length = el.getLength(); - - - switch (el.getOperator()) + SAMRecordIterator it; + try { + it = fileReader.query(chromosome, start, end, + false); + } catch (UnsupportedOperationException ex) { - case M: - case X: - case EQ: - // matched or mismatched residues - newRead.append(read.substring(next, next + length)); - next += length; - break; - case N: // intron in RNA - case D: // deletion - addGaps(newRead, length, gapChar); - break; - case S: - // soft clipping - just skip this bit of the read - // do nothing - - // work out how many gaps we need before the start of the soft clip - - // don't do this at the end of the read! - if (next == 0) // at start of read + needToReopen=true; + // could be a sam text file, so we just iterate through without query + it = fileReader.iterator(); + } + CigarParser parser = new CigarParser('-'); + Range[] xtent = new Range[] { new Range(start, end) }; + SortedMap insertions[] = parser.getInsertions(it, + xtent); + it.close(); + + SequenceI refSeq = new Sequence("chr:" + chromosome, + insertRefSeq(insertions[0], xtent[0]) + .toString()); + + refSeq.setStart(xtent[0].start); + refSeq.setEnd(xtent[0].end); + SequenceI revRefSeq = new Sequence("rev:chr:" + chromosome, + insertRefSeq(insertions[1], xtent[0]) + .toString()); + revRefSeq.setStart(xtent[0].start); + revRefSeq.setEnd(xtent[0].end); + + // Hack to move the seqs along + padRef(refSeq, parser); + padRef(revRefSeq, parser); + + if (needToReopen) + { + try { + initFileReader(); + } catch (IOException x) { - addGaps(newRead, rec.getUnclippedStart(), gapChar); + Cache.log.warn("Couldn't reopen S/BAM file",x); } - - newRead.append(read.substring(next, next + length).toLowerCase()); - next += length; - break; - case I: - // the reference sequence and other reads should have been gapped for - // this insertion, so just add in the residues - newRead.append(read.substring(next, next + length)); - next += length; - break; - case H: - // hard clipping - this stretch will not appear in the read - break; - default: - break; } - - - } - return newRead.toString(); - } - - private String applyCigar(String read, CigarElement el, char gapChar, - int next, int length, int softStart) - { - StringBuilder newRead = new StringBuilder(); - switch (el.getOperator()) - { - case M: - case X: - case EQ: - // matched or mismatched residues - newRead.append(read.substring(next, next + length)); - next += length; - break; - case N: // intron in RNA - case D: // deletion - addGaps(newRead, length, gapChar); - break; - case S: - // soft clipping - just skip this bit of the read - // do nothing - - // work out how many gaps we need before the start of the soft clip - - // don't do this at the end of the read! - if (next == 0) // at start of read + try { + it = fileReader.query(chromosome, start, end, + false); + } catch (UnsupportedOperationException ex) { - addGaps(newRead, softStart, gapChar); + // could be a sam text file, so we just iterate through without query + it = fileReader.iterator(); } - newRead.append(read.substring(next, next + length).toLowerCase()); - next += length; - break; - case I: - // the reference sequence and other reads should have been gapped for - // this insertion, so just add in the residues - newRead.append(read.substring(next, next + length)); - next += length; - break; - case H: - // hard clipping - this stretch will not appear in the read - break; - default: - break; - } - return newRead.toString(); - } - - /** - * Get list of positions inserted to the reference sequence - * - * @param it - * @return - */ - private SortedMap getInsertions(Iterator it) - { - SortedMap insertions = new TreeMap<>(); - while (it.hasNext()) - { - // check each record for insertions in the CIGAR string - SAMRecord rec = it.next(); - Iterator cit = rec.getCigar().getCigarElements() - .iterator(); - int next = 0; - while (cit.hasNext()) + ArrayList fwd = new ArrayList(), rev = new ArrayList(); + while (it.hasNext()) { - CigarElement el = cit.next(); - switch (el.getOperator()) + SAMRecord rec=null; + try { + rec = it.next(); + } catch (SAMFormatException q) { - case I: - // add to insertions list, and move along the read - int refLocation = rec.getReferencePositionAtReadPosition(next); - - // if there's already an insertion at this location, keep the longest - // insertion; if there's no insertion keep this one - if (!insertions.containsKey(refLocation) - || (insertions.containsKey(refLocation) - && insertions.get(refLocation) < el.getLength())) - { - insertions.put(refLocation, el.getLength()); - } - next += el.getLength(); - break; - case M: - // match to reference, move along the read - next += el.getLength(); - break; - default: - // deletions, introns etc don't consume any residues from the read + Cache.log.info("Bailing on bad SAM line again",q); break; } + + // set the alignment start to be start of first read (we assume reads + // are sorted) + if (alignmentStart == -1) + { + alignmentStart = rec.getAlignmentStart(); + } + + // make dataset sequence: start at 1, end at read length + SequenceI seq = new Sequence( + "" + (rec.getReadNegativeStrandFlag() ? "rev:" : "") + + rec.getReadName(), + rec.getReadString().toLowerCase()); + seq.setStart(1); + seq.setEnd(rec.getReadLength()); + OfInt q = rec.getBaseQualityString().chars() + .iterator(); + int p = seq.getStart(); + while (q.hasNext()) + { + seq.addSequenceFeature(new SequenceFeature("QUALITY", "", p, p, + (float) q.next() - ' ', "bamfile")); + p++; + } + String newRead = parser.parseCigarToSequence(rec, + insertions[rec.getReadNegativeStrandFlag() ? 1 : 0], + alignmentStart, seq); + + // make alignment sequences + SequenceI alsq = seq.deriveSequence(); + alsq.setSequence(newRead); + + // set start relative to soft clip; assume end is set by Sequence code + alsq.setStart(rec.getStart() - rec.getUnclippedStart() + 1); + if (rec.getReadNegativeStrandFlag()) + { + rev.add(alsq); + } + else + { + fwd.add(alsq); + } + } + // Add forward + if (fwd.size() > 0) + { + seqs.add(refSeq); // FIXME needs to be optional, and properly padded + seqs.addAll(fwd); + fwd.clear(); + } + // and reverse strand reads. + if (rev.size() > 0) + { + seqs.add(revRefSeq); // FIXME needs to be optional and properly padded + seqs.addAll(rev); + rev.clear(); } - } - return insertions; } /** - * Add sequence of gaps to a read + * Get the list of chromosomes or contigs from the file (listed in SQ entries + * in BAM file header) * - * @param read - * read to update - * @param length - * number of gaps - * @param gapChar - * gap character to use + * @return array of chromosome/contig strings */ - private void addGaps(StringBuilder read, int length, char gapChar) + @Override + public Object[] preprocess() { - if (length > 0) + List refSeqs = fileReader.getFileHeader() + .getSequenceDictionary().getSequences(); + List chrs = new ArrayList<>(); + + for (SAMSequenceRecord ref : refSeqs) { - char[] gaps = new char[length]; - Arrays.fill(gaps, gapChar); - read.append(gaps); + chrs.add(ref.getSequenceName()); } + return chrs.toArray(); } - - private int countInsertionsBeforeRead(SAMRecord rec, - SortedMap insertions) + + public void setOptions(String chr, int s, int e) { - int gaps = 0; - - // add in any insertion gaps before read - // TODO start point should be start of alignment not 0 - SortedMap seqInserts = insertions.subMap(0, - rec.getStart()); + chromosome = chr; + start = s; + end = e; + suffix = chromosome + ":" + start + "-" + end; + } - Iterator> it = seqInserts.entrySet() - .iterator(); - while (it.hasNext()) + public boolean parseSuffix() + { + if (suffix == null) { - Entry entry = it.next(); - gaps += entry.getValue(); + return false; } - return gaps; + int csep = suffix.indexOf(":"); + int rsep = suffix.indexOf("-", csep); + if (csep < 0 || rsep < 0 || suffix.length() - rsep <= 1) + { + return false; + } + String chr, p1, p2; + chr = suffix.substring(0, csep); + p1 = suffix.substring(csep + 1, rsep); + p2 = suffix.substring(rsep + 1); + int cstart, cend; + try + { + cstart = Integer.parseInt(p1); + cend = Integer.parseInt(p2); + } catch (Exception e) + { + warningMessage = (warningMessage == null ? "" : warningMessage + "\n") + + "Couldn't parse range from " + suffix; + return false; + } + chromosome = chr; + start = cstart; + end = cend; + return true; } - }