From: kiramt Date: Fri, 16 Feb 2018 13:15:40 +0000 (+0000) Subject: JAL-2909 Read in a bam file - but strand/location unaware X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=4cc3017f070e0bc1b291d9cfaf572b1c9c5e76d3;hp=c471de0bd58af67bc91d413f3bf1209c4113a601;p=jalview.git JAL-2909 Read in a bam file - but strand/location unaware --- diff --git a/src/jalview/io/BamFile.java b/src/jalview/io/BamFile.java new file mode 100644 index 0000000..c94dc3e --- /dev/null +++ b/src/jalview/io/BamFile.java @@ -0,0 +1,173 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 + * of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Jalview. If not, see . + * The Jalview Authors are detailed in the 'AUTHORS' file. + */ +package jalview.io; + +import jalview.datamodel.SequenceI; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; +import java.util.Iterator; + +import htsjdk.samtools.CigarElement; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.ValidationStringency; + +public class BamFile extends AlignFile +{ + + SamReader fileReader; + + /** + * Creates a new BamFile object. + */ + public BamFile() + { + } + + /** + * Creates a new BamFile object. + * + * @param inFile + * DOCUMENT ME! + * @param sourceType + * DOCUMENT ME! + * + * @throws IOException + * DOCUMENT ME! + */ + public BamFile(String inFile, DataSourceType sourceType) + throws IOException + { + super(inFile, sourceType); + final SamReaderFactory factory = SamReaderFactory.makeDefault() + .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, + SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS) + .validationStringency(ValidationStringency.SILENT); + fileReader = factory.open(new File(inFile)); + } + + public BamFile(FileParse source) throws IOException + { + final SamReaderFactory factory = SamReaderFactory.makeDefault() + .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, + SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS) + .validationStringency(ValidationStringency.SILENT); + + // File-based bam + fileReader = factory.open(source.inFile); + parse(); + } + + @Override + public String print(SequenceI[] seqs, boolean jvsuffix) + { + // TODO Auto-generated method stub + return null; + } + + @Override + public void parse() throws IOException + { + SequenceI seq = null; + SAMRecordIterator it = fileReader.iterator(); + while (it.hasNext()) + { + SAMRecord rec = it.next(); + String read = rec.getReadString(); + int start = rec.getStart(); + int end = rec.getEnd(); + + Iterator cit = rec.getCigar().getCigarElements() + .iterator(); + + seq = parseId(rec.getReadName()); + String cigarredRead = parseCigarToSequence(read, cit, '-'); + seq.setSequence(cigarredRead); + seq.setStart(start); + seq.setEnd(end); + seqs.addElement(seq); + } + + } + + /** + * Apply the CIGAR string to a read sequence and return the updated read + * + * @param read + * the read to update + * @param it + * iterator over cigar elements + * @param gapChar + * gap character to use + * @return string representing read with gaps, clipping etc applied + */ + private String parseCigarToSequence(String read, + Iterator it, char gapChar) + { + StringBuilder newRead = new StringBuilder(); + int next = 0; + + while (it.hasNext()) + { + CigarElement el = it.next(); + int length = el.getLength(); + switch (el.getOperator()) + { + case M: + // matched residues + newRead.append(read.substring(next, next + length - 1)); + next += length; + break; + case N: // intron in RNA + case D: // deletion + // add gaps + char[] gaps = new char[length]; + Arrays.fill(gaps, gapChar); + newRead.append(gaps); + break; + case S: + // soft clipping - just skip this bit of the read + // do nothing + next += length; + break; + case I: + // add gaps to the reference sequence, which we know nothing about just + // now + // TODO + newRead.append(read.substring(next, next + length - 1)); + break; + case H: + // hard clipping - this stretch will not appear in the read + break; + default: + // P, X EQ don't know what to do with these + break; + } + + } + return newRead.toString(); + } + +} diff --git a/src/jalview/io/FileFormat.java b/src/jalview/io/FileFormat.java index 4b33dbf..21fadd2 100644 --- a/src/jalview/io/FileFormat.java +++ b/src/jalview/io/FileFormat.java @@ -347,6 +347,27 @@ public enum FileFormat implements FileFormatI return true; } }, + Bam("bam", "bam", true, true) + { + @Override + public AlignmentFileReaderI getReader(FileParse source) + throws IOException + { + return new BamFile(source); + } + + @Override + public AlignmentFileWriterI getWriter(AlignmentI al) + { + return new BamFile(); + } + + @Override + public boolean isStructureFile() + { + return false; + } + }, Jalview("Jalview", "jar,jvp", true, true) { @Override diff --git a/src/jalview/io/IdentifyFile.java b/src/jalview/io/IdentifyFile.java index ff959b0..38414e3 100755 --- a/src/jalview/io/IdentifyFile.java +++ b/src/jalview/io/IdentifyFile.java @@ -135,6 +135,12 @@ public class IdentifyFile || fileStr.lastIndexOf(".zip") > -1) { reply = FileFormat.Jalview; + // TODO shouldn't there be a break here? + } + else if (fileStr.lastIndexOf(".bam") > -1) + { + reply = FileFormat.Bam; + break; } } if (!lineswereskipped && data.startsWith("PK"))