2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
23 import jalview.datamodel.CigarParser;
24 import jalview.datamodel.Range;
25 import jalview.datamodel.Sequence;
26 import jalview.datamodel.SequenceFeature;
27 import jalview.datamodel.SequenceI;
30 import java.io.IOException;
32 import java.util.ArrayList;
33 import java.util.List;
35 import java.util.PrimitiveIterator.OfInt;
36 import java.util.SortedMap;
38 import htsjdk.samtools.SAMRecord;
39 import htsjdk.samtools.SAMRecordIterator;
40 import htsjdk.samtools.SAMSequenceRecord;
41 import htsjdk.samtools.SamInputResource;
42 import htsjdk.samtools.SamReader;
43 import htsjdk.samtools.SamReaderFactory;
44 import htsjdk.samtools.ValidationStringency;
46 public class BamFile extends AlignFile
48 // SAM/BAM file reader; used for header lookups and for region queries
49 private SamReader fileReader;
51 // start position to read from (passed as the region start to fileReader.query)
52 private int start = -1;
54 // end position to read to
57 // chromosome/contig to read; nothing is parsed while this is null or empty
58 private String chromosome = "";
60 // first position in alignment; stays -1 until taken from the first record read
61 private int alignmentStart = -1;
64 * Creates a new BamFile object.
71 * Creates a new BamFile object.
74 * Name of file to read
76 * Whether data source is file, url or other type of data source
// Creates a BamFile that reads from the named file.
// inFile is the name of the file to read; sourceType is handed unchanged to
// the superclass constructor.
80 public BamFile(String inFile, DataSourceType sourceType)
83 super(true, inFile, sourceType);
// Reader factory settings: keep the source on each record, validate CRC
// checksums, and use SILENT validation stringency.
84 final SamReaderFactory factory = SamReaderFactory.makeDefault()
85 .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS,
86 SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS)
87 .validationStringency(ValidationStringency.SILENT);
// NOTE(review): inFile is always wrapped in a local File here and sourceType
// is not consulted on this line - confirm non-file sources never reach this
// constructor.
88 fileReader = factory.open(new File(inFile));
92 * Creates a new BamFile object
95 * wrapper for datasource
// Creates a BamFile on top of an existing FileParse data source wrapper.
98 public BamFile(FileParse source) throws IOException
// Same reader factory settings as the (String, DataSourceType) constructor:
// source kept on records, CRC checksums validated, SILENT stringency.
102 final SamReaderFactory factory = SamReaderFactory.makeDefault()
103 .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS,
104 SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS)
105 .validationStringency(ValidationStringency.SILENT);
// A FILE data source is opened directly from the wrapper's inFile.
108 if (source.getDataSourceType() == DataSourceType.FILE)
110 fileReader = factory.open(source.inFile);
// Otherwise the resource named by getDataName() is opened together with an
// index located at the same name with ".bai" appended.
// NOTE(review): presumably this is the non-FILE (e.g. URL) branch - the
// branch structure is not visible here, confirm. new URL(index) will reject
// a data name that is not a valid URL.
115 String index = source.getDataName() + ".bai";
116 fileReader = factory.open(SamInputResource.of(source.getDataName())
117 .index(new URL(index)));
// Writing sequences out in this format has no implementation yet: the body
// is still the auto-generated stub flagged by the TODO below.
122 public String print(SequenceI[] seqs, boolean jvsuffix)
124 // TODO Auto-generated method stub
// Builds the text of a reference sequence covering the range xtent and
// opens it up with '-' characters where reads have insertions.
// insertions: each entry's key is used as the insertion position and its
// value as the number of '-' characters to insert.
// xtent: range whose start/end bound the reference positions.
128 private StringBuilder insertRefSeq(
129 SortedMap<Integer, Integer> insertions, Range xtent)
131 StringBuilder refseq = new StringBuilder();
// One iteration per reference position in [xtent.start, xtent.end).
// NOTE(review): the loop body is not visible here - presumably it appends
// one placeholder character per position; confirm.
133 for (int p = xtent.start; p < xtent.end; p++)
137 for (Map.Entry<Integer, Integer> insert : insertions.entrySet())
// Translate the insertion key into an index within refseq: offset from the
// start of the range, shifted by 'inserted'.
// NOTE(review): 'inserted' is declared outside the visible lines -
// presumably a running count of gaps already added; confirm.
139 int inspos = insert.getKey() - xtent.start + inserted;
// Diagnostic for an insertion that maps to a negative position (the guarding
// condition itself is not visible here).
142 System.out.println("Ignoring -ve insert position " + insert.getKey()
143 + " of " + insert.getValue() +
144 " (alpos: " + inspos + ")");
// Insert one '-' at inspos for each unit of the insertion's length.
148 for (int i = 0, j = insert.getValue(); i < j; i++)
151 refseq.insert(inspos, '-');
157 private void padRef(SequenceI ref, CigarParser cig)
159 int padding = cig.firstAlColumn - ref.findIndex(cig.firstRposCol);
160 System.out.println("Padding " + padding + " to move refseq position "
161 + cig.firstRposCol + " to " + cig.firstAlColumn + "col.");
163 ref.insertCharAt(0, padding, '-');
169 // only actually parse if params are set
// NOTE(review): chromosome != "" compares object references, not contents,
// so an empty string that is not the interned literal passes this test.
// The content check would be !chromosome.isEmpty().
170 if (chromosome != null && chromosome != "")
// First pass over the requested region: the records are handed to the
// CIGAR parser to collect insertions.
172 SAMRecordIterator it = fileReader.query(chromosome, start, end,
174 CigarParser parser = new CigarParser('-');
175 Range[] xtent = new Range[] { new Range(start, end) };
// insertions[0] is used for the forward reference and forward-strand reads,
// insertions[1] for the reverse reference and reverse-strand reads.
176 SortedMap<Integer, Integer> insertions[] = parser.getInsertions(it,
// Forward-strand reference sequence, named "chr:" + chromosome.
180 SequenceI refSeq = new Sequence("chr:" + chromosome,
181 insertRefSeq(insertions[0], xtent[0])
184 refSeq.setStart(xtent[0].start);
185 refSeq.setEnd(xtent[0].end);
// Reverse-strand reference sequence, built the same way from insertions[1].
186 SequenceI revRefSeq = new Sequence("rev:chr:" + chromosome,
187 insertRefSeq(insertions[1], xtent[0])
189 revRefSeq.setStart(xtent[0].start);
190 revRefSeq.setEnd(xtent[0].end);
192 // Hack to move the seqs along
193 padRef(refSeq, parser);
194 padRef(revRefSeq, parser);
// Second pass over the same region, this time to build the read sequences.
// NOTE(review): htsjdk permits only one open iterator per reader - confirm
// the first iterator is closed before this query (no close() is visible).
196 it = fileReader.query(chromosome, start, end, false);
// NOTE(review): raw ArrayList constructors; new ArrayList<>() would avoid
// the unchecked warning.
198 ArrayList<SequenceI> fwd = new ArrayList(), rev = new ArrayList();
201 SAMRecord rec = it.next();
203 // set the alignment start to be start of first read (we assume reads
205 if (alignmentStart == -1)
207 alignmentStart = rec.getAlignmentStart();
210 // make dataset sequence: start at 1, end at read length
// Reverse-strand reads get a "rev:" name prefix; bases are lower-cased.
211 SequenceI seq = new Sequence(
212 "" + (rec.getReadNegativeStrandFlag() ? "rev:" : "")
214 rec.getReadString().toLowerCase());
216 seq.setEnd(rec.getReadLength());
// Base qualities are attached as "QUALITY" features covering the single
// position p, scored as the quality character's code minus ' ' (32).
// NOTE(review): SAM quality strings are normally Phred+33 ('!'), so this
// offset yields Phred+1 - confirm that is intended.
217 OfInt q = rec.getBaseQualityString().chars()
219 int p = seq.getStart();
222 seq.addSequenceFeature(new SequenceFeature("QUALITY", "", p, p,
223 (float) q.next() - ' ', "bamfile"));
// Expand the read to alignment coordinates using the insertion map that
// matches its strand.
226 String newRead = parser.parseCigarToSequence(rec,
227 insertions[rec.getReadNegativeStrandFlag() ? 1 : 0],
228 alignmentStart, seq);
230 // make alignment sequences
231 SequenceI alsq = seq.deriveSequence();
232 alsq.setSequence(newRead);
234 // set start relative to soft clip; assume end is set by Sequence code
235 alsq.setStart(rec.getStart() - rec.getUnclippedStart() + 1);
// NOTE(review): presumably routes the read into the rev or fwd list - the
// branch bodies are not visible here; confirm.
236 if (rec.getReadNegativeStrandFlag())
248 seqs.add(refSeq); // FIXME needs to be optional, and properly padded
252 // and reverse strand reads.
255 seqs.add(revRefSeq); // FIXME needs to be optional and properly padded
263 * Get the list of chromosomes or contigs from the file (listed in SQ entries
264 * in BAM file header)
266 * @return array of chromosome/contig strings
269 public Object[] preprocess()
271 List<SAMSequenceRecord> refSeqs = fileReader.getFileHeader()
272 .getSequenceDictionary().getSequences();
273 List<String> chrs = new ArrayList<>();
275 for (SAMSequenceRecord ref : refSeqs)
277 chrs.add(ref.getSequenceName());
279 return chrs.toArray();
// Rebuilds suffix as "chromosome:start-end" from the current values of the
// chromosome, start and end fields.
// NOTE(review): the assignments of chr, s and e to those fields are not
// visible here - confirm they happen before the suffix is rebuilt.
282 public void setOptions(String chr, int s, int e)
287 suffix = chromosome + ":" + start + "-" + end;
290 public boolean parseSuffix()
296 int csep = suffix.indexOf(":");
297 int rsep = suffix.indexOf("-", csep);
298 if (csep < 0 || rsep < 0 || suffix.length() - rsep <= 1)
303 chr = suffix.substring(0, csep);
304 p1 = suffix.substring(csep + 1, rsep);
305 p2 = suffix.substring(rsep + 1);
309 cstart = Integer.parseInt(p1);
310 cend = Integer.parseInt(p2);
311 } catch (Exception e)
313 warningMessage = (warningMessage == null ? "" : warningMessage + "\n")
314 + "Couldn't parse range from " + suffix;