*/
package jalview.io;
+import jalview.bin.Cache;
+import jalview.datamodel.CigarParser;
+import jalview.datamodel.Range;
+import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import java.io.File;
import java.io.IOException;
-import java.util.Arrays;
-import java.util.Iterator;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.PrimitiveIterator.OfInt;
+import java.util.SortedMap;
-import htsjdk.samtools.CigarElement;
+import htsjdk.samtools.SAMFormatException;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
+import htsjdk.samtools.SAMSequenceRecord;
+import htsjdk.samtools.SamInputResource;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.ValidationStringency;
public class BamFile extends AlignFile
{
+ // SAM/BAM file reader
+ private SamReader fileReader;
- SamReader fileReader;
+ // start position to read from
+ private int start = -1;
+
+ // end position to read to
+ private int end = -1;
+
+ // chromosome/contig to read
+ private String chromosome = "";
+
+ // first position in alignment
+ private int alignmentStart = -1;
+
+ private File _bamFile;
/**
* Creates a new BamFile object.
* Creates a new BamFile object.
*
* @param inFile
- * DOCUMENT ME!
+ * Name of file to read
* @param sourceType
- * DOCUMENT ME!
+ * Whether data source is file, url or other type of data source
*
* @throws IOException
- * DOCUMENT ME!
*/
public BamFile(String inFile, DataSourceType sourceType)
throws IOException
{
- super(inFile, sourceType);
- final SamReaderFactory factory = SamReaderFactory.makeDefault()
- .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS,
- SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS)
- .validationStringency(ValidationStringency.SILENT);
- fileReader = factory.open(new File(inFile));
+ super(true, inFile, sourceType);
+ _bamFile = new File(inFile);
+ initFileReader();
}
-
- public BamFile(FileParse source) throws IOException
+ private void initFileReader() throws IOException
{
final SamReaderFactory factory = SamReaderFactory.makeDefault()
.enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS,
SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS)
.validationStringency(ValidationStringency.SILENT);
-
// File-based bam
- fileReader = factory.open(source.inFile);
- parse();
+ if (_bamFile!=null)
+ {
+ fileReader = factory.open(_bamFile); // will need to be adapted for JalviewJS/etc
+ }
+ else
+ {
+ // try and locate index ?
+ String index = getDataName() + ".bai";
+ fileReader = factory.open(SamInputResource.of(getDataName())
+ .index(new URL(index)));
+ }
+ }
+ /**
+ * Creates a new BamFile object
+ *
+ * @param source
+ * wrapper for datasource
+ * @throws IOException
+ */
+ public BamFile(FileParse source) throws IOException
+ {
+ super(false, source);
+ parseSuffix();
+ if (source.getDataSourceType() == DataSourceType.FILE)
+ {
+ _bamFile = source.inFile;
+ } else {
+
+ }
+ initFileReader();
+ doParse();
}
@Override
return null;
}
- @Override
- public void parse() throws IOException
+ private StringBuilder insertRefSeq(
+ SortedMap<Integer, Integer> insertions, Range xtent)
{
- SequenceI seq = null;
- SAMRecordIterator it = fileReader.iterator();
- while (it.hasNext())
+ StringBuilder refseq = new StringBuilder();
+ int inserted = 0;
+ for (int p = xtent.start; p < xtent.end; p++)
{
- SAMRecord rec = it.next();
- String read = rec.getReadString();
- int start = rec.getStart();
- int end = rec.getEnd();
-
- Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
- .iterator();
-
- seq = parseId(rec.getReadName());
- String cigarredRead = parseCigarToSequence(read, cit, '-');
- seq.setSequence(cigarredRead);
- seq.setStart(start);
- seq.setEnd(end);
- seqs.addElement(seq);
+ refseq.append('N');
+ }
+ for (Map.Entry<Integer, Integer> insert : insertions.entrySet())
+ {
+ int inspos = insert.getKey() - xtent.start + inserted;
+ if (inspos < 0)
+ {
+ System.out.println("Ignoring -ve insert position " + insert.getKey()
+ + " of " + insert.getValue() +
+ " (alpos: " + inspos + ")");
+ inspos = 0;
+ // continue;
+ }
+ for (int i = 0, j = insert.getValue(); i < j; i++)
+ {
+ inserted++;
+ refseq.insert(inspos, '-');
+ }
}
+ return refseq;
+ }
+
+ private void padRef(SequenceI ref, CigarParser cig)
+ { // pad to column
+ int padding = cig.firstAlColumn - ref.findIndex(cig.firstRposCol);
+ System.out.println("Padding " + padding + " to move refseq position "
+ + cig.firstRposCol + " to " + cig.firstAlColumn + "col.");
+
+ ref.insertCharAt(0, padding, '-');
+ }
+
+ @Override
+ public void parse()
+ {
+ boolean needToReopen=false;
+ // only actually parse if params are set
+ if (chromosome != null && chromosome != "")
+ {
+ SAMRecordIterator it;
+ try {
+ it = fileReader.query(chromosome, start, end,
+ false);
+ } catch (UnsupportedOperationException ex)
+ {
+ needToReopen=true;
+ // could be a sam text file, so we just iterate through without query
+ it = fileReader.iterator();
+ }
+ CigarParser parser = new CigarParser('-');
+ Range[] xtent = new Range[] { new Range(start, end) };
+ SortedMap<Integer, Integer> insertions[] = parser.getInsertions(it,
+ xtent);
+ it.close();
+
+ SequenceI refSeq = new Sequence("chr:" + chromosome,
+ insertRefSeq(insertions[0], xtent[0])
+ .toString());
+
+ refSeq.setStart(xtent[0].start);
+ refSeq.setEnd(xtent[0].end);
+ SequenceI revRefSeq = new Sequence("rev:chr:" + chromosome,
+ insertRefSeq(insertions[1], xtent[0])
+ .toString());
+ revRefSeq.setStart(xtent[0].start);
+ revRefSeq.setEnd(xtent[0].end);
+
+ // Hack to move the seqs along
+ padRef(refSeq, parser);
+ padRef(revRefSeq, parser);
+
+ if (needToReopen)
+ {
+ try {
+ initFileReader();
+ } catch (IOException x)
+ {
+ Cache.log.warn("Couldn't reopen S/BAM file",x);
+ }
+ }
+ try {
+ it = fileReader.query(chromosome, start, end,
+ false);
+ } catch (UnsupportedOperationException ex)
+ {
+ // could be a sam text file, so we just iterate through without query
+ it = fileReader.iterator();
+ }
+ ArrayList<SequenceI> fwd = new ArrayList(), rev = new ArrayList();
+ while (it.hasNext())
+ {
+ SAMRecord rec=null;
+ try {
+ rec = it.next();
+ } catch (SAMFormatException q)
+ {
+ Cache.log.info("Bailing on bad SAM line again",q);
+ break;
+ }
+
+ // set the alignment start to be start of first read (we assume reads
+ // are sorted)
+ if (alignmentStart == -1)
+ {
+ alignmentStart = rec.getAlignmentStart();
+ }
+
+ // make dataset sequence: start at 1, end at read length
+ SequenceI seq = new Sequence(
+ "" + (rec.getReadNegativeStrandFlag() ? "rev:" : "")
+ + rec.getReadName(),
+ rec.getReadString().toLowerCase());
+ seq.setStart(1);
+ seq.setEnd(rec.getReadLength());
+ OfInt q = rec.getBaseQualityString().chars()
+ .iterator();
+ int p = seq.getStart();
+ while (q.hasNext())
+ {
+ seq.addSequenceFeature(new SequenceFeature("QUALITY", "", p, p,
+ (float) q.next() - ' ', "bamfile"));
+ p++;
+ }
+ String newRead = parser.parseCigarToSequence(rec,
+ insertions[rec.getReadNegativeStrandFlag() ? 1 : 0],
+ alignmentStart, seq);
+
+ // make alignment sequences
+ SequenceI alsq = seq.deriveSequence();
+ alsq.setSequence(newRead);
+
+ // set start relative to soft clip; assume end is set by Sequence code
+ alsq.setStart(rec.getStart() - rec.getUnclippedStart() + 1);
+ if (rec.getReadNegativeStrandFlag())
+ {
+ rev.add(alsq);
+ }
+ else
+ {
+ fwd.add(alsq);
+ }
+ }
+ // Add forward
+ if (fwd.size() > 0)
+ {
+ seqs.add(refSeq); // FIXME needs to be optional, and properly padded
+ seqs.addAll(fwd);
+ fwd.clear();
+ }
+ // and reverse strand reads.
+ if (rev.size() > 0)
+ {
+ seqs.add(revRefSeq); // FIXME needs to be optional and properly padded
+ seqs.addAll(rev);
+ rev.clear();
+ }
+ }
}
/**
- * Apply the CIGAR string to a read sequence and return the updated read
+ * Get the list of chromosomes or contigs from the file (listed in SQ entries
+ * in BAM file header)
*
- * @param read
- * the read to update
- * @param it
- * iterator over cigar elements
- * @param gapChar
- * gap character to use
- * @return string representing read with gaps, clipping etc applied
+ * @return array of chromosome/contig strings
*/
- private String parseCigarToSequence(String read,
- Iterator<CigarElement> it, char gapChar)
+ @Override
+ public Object[] preprocess()
{
- StringBuilder newRead = new StringBuilder();
- int next = 0;
+ List<SAMSequenceRecord> refSeqs = fileReader.getFileHeader()
+ .getSequenceDictionary().getSequences();
+ List<String> chrs = new ArrayList<>();
- while (it.hasNext())
+ for (SAMSequenceRecord ref : refSeqs)
{
- CigarElement el = it.next();
- int length = el.getLength();
- switch (el.getOperator())
- {
- case M:
- // matched residues
- newRead.append(read.substring(next, next + length - 1));
- next += length;
- break;
- case N: // intron in RNA
- case D: // deletion
- // add gaps
- char[] gaps = new char[length];
- Arrays.fill(gaps, gapChar);
- newRead.append(gaps);
- break;
- case S:
- // soft clipping - just skip this bit of the read
- // do nothing
- next += length;
- break;
- case I:
- // add gaps to the reference sequence, which we know nothing about just
- // now
- // TODO
- newRead.append(read.substring(next, next + length - 1));
- break;
- case H:
- // hard clipping - this stretch will not appear in the read
- break;
- default:
- // P, X EQ don't know what to do with these
- break;
- }
-
+ chrs.add(ref.getSequenceName());
}
- return newRead.toString();
+ return chrs.toArray();
+ }
+
+ public void setOptions(String chr, int s, int e)
+ {
+ chromosome = chr;
+ start = s;
+ end = e;
+ suffix = chromosome + ":" + start + "-" + end;
}
+ public boolean parseSuffix()
+ {
+ if (suffix == null)
+ {
+ return false;
+ }
+ int csep = suffix.indexOf(":");
+ int rsep = suffix.indexOf("-", csep);
+ if (csep < 0 || rsep < 0 || suffix.length() - rsep <= 1)
+ {
+ return false;
+ }
+ String chr, p1, p2;
+ chr = suffix.substring(0, csep);
+ p1 = suffix.substring(csep + 1, rsep);
+ p2 = suffix.substring(rsep + 1);
+ int cstart, cend;
+ try
+ {
+ cstart = Integer.parseInt(p1);
+ cend = Integer.parseInt(p2);
+ } catch (Exception e)
+ {
+ warningMessage = (warningMessage == null ? "" : warningMessage + "\n")
+ + "Couldn't parse range from " + suffix;
+ return false;
+ }
+ chromosome = chr;
+ start = cstart;
+ end = cend;
+ return true;
+ }
}