2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
23 import jalview.bin.Cache;
24 import jalview.datamodel.CigarParser;
25 import jalview.datamodel.Range;
26 import jalview.datamodel.Sequence;
27 import jalview.datamodel.SequenceFeature;
28 import jalview.datamodel.SequenceI;
31 import java.io.IOException;
33 import java.util.ArrayList;
34 import java.util.List;
36 import java.util.PrimitiveIterator.OfInt;
37 import java.util.SortedMap;
39 import htsjdk.samtools.SAMFormatException;
40 import htsjdk.samtools.SAMRecord;
41 import htsjdk.samtools.SAMRecordIterator;
42 import htsjdk.samtools.SAMSequenceRecord;
43 import htsjdk.samtools.SamInputResource;
44 import htsjdk.samtools.SamReader;
45 import htsjdk.samtools.SamReaderFactory;
46 import htsjdk.samtools.ValidationStringency;
48 public class BamFile extends AlignFile
50 // SAM/BAM file reader
51 private SamReader fileReader;
53 // start position to read from
54 private int start = -1;
56 // end position to read to
59 // chromosome/contig to read
60 private String chromosome = "";
62 // first position in alignment
63 private int alignmentStart = -1;
65 private File _bamFile;
68 * Creates a new BamFile object.
75 * Creates a new BamFile object.
78 * Name of file to read
80 * Whether data source is file, url or other type of data source
84 public BamFile(String inFile, DataSourceType sourceType)
87 super(true, inFile, sourceType);
88 _bamFile = new File(inFile);
91 private void initFileReader() throws IOException
93 final SamReaderFactory factory = SamReaderFactory.makeDefault()
94 .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS,
95 SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS)
96 .validationStringency(ValidationStringency.SILENT);
100 fileReader = factory.open(_bamFile); // will need to be adapted for JalviewJS/etc
104 // try and locate index ?
105 String index = getDataName() + ".bai";
106 fileReader = factory.open(SamInputResource.of(getDataName())
107 .index(new URL(index)));
111 * Creates a new BamFile object
114 * wrapper for datasource
115 * @throws IOException
117 public BamFile(FileParse source) throws IOException
119 super(false, source);
121 if (source.getDataSourceType() == DataSourceType.FILE)
123 _bamFile = source.inFile;
132 public String print(SequenceI[] seqs, boolean jvsuffix)
134 // TODO Auto-generated method stub
138 private StringBuilder insertRefSeq(
139 SortedMap<Integer, Integer> insertions, Range xtent)
141 StringBuilder refseq = new StringBuilder();
143 for (int p = xtent.start; p < xtent.end; p++)
147 for (Map.Entry<Integer, Integer> insert : insertions.entrySet())
149 int inspos = insert.getKey() - xtent.start + inserted;
152 System.out.println("Ignoring -ve insert position " + insert.getKey()
153 + " of " + insert.getValue() +
154 " (alpos: " + inspos + ")");
158 for (int i = 0, j = insert.getValue(); i < j; i++)
161 refseq.insert(inspos, '-');
167 private void padRef(SequenceI ref, CigarParser cig)
169 int padding = cig.firstAlColumn - ref.findIndex(cig.firstRposCol);
170 System.out.println("Padding " + padding + " to move refseq position "
171 + cig.firstRposCol + " to " + cig.firstAlColumn + "col.");
173 ref.insertCharAt(0, padding, '-');
179 boolean needToReopen=false;
180 // only actually parse if params are set
181 if (chromosome != null && chromosome != "")
183 SAMRecordIterator it;
185 it = fileReader.query(chromosome, start, end,
187 } catch (UnsupportedOperationException ex)
190 // could be a sam text file, so we just iterate through without query
191 it = fileReader.iterator();
193 CigarParser parser = new CigarParser('-');
194 Range[] xtent = new Range[] { new Range(start, end) };
195 SortedMap<Integer, Integer> insertions[] = parser.getInsertions(it,
199 SequenceI refSeq = new Sequence("chr:" + chromosome,
200 insertRefSeq(insertions[0], xtent[0])
203 refSeq.setStart(xtent[0].start);
204 refSeq.setEnd(xtent[0].end);
205 SequenceI revRefSeq = new Sequence("rev:chr:" + chromosome,
206 insertRefSeq(insertions[1], xtent[0])
208 revRefSeq.setStart(xtent[0].start);
209 revRefSeq.setEnd(xtent[0].end);
211 // Hack to move the seqs along
212 padRef(refSeq, parser);
213 padRef(revRefSeq, parser);
219 } catch (IOException x)
221 Cache.log.warn("Couldn't reopen S/BAM file",x);
225 it = fileReader.query(chromosome, start, end,
227 } catch (UnsupportedOperationException ex)
229 // could be a sam text file, so we just iterate through without query
230 it = fileReader.iterator();
233 ArrayList<SequenceI> fwd = new ArrayList(), rev = new ArrayList();
239 } catch (SAMFormatException q)
241 Cache.log.info("Bailing on bad SAM line again",q);
245 // set the alignment start to be start of first read (we assume reads
247 if (alignmentStart == -1)
249 alignmentStart = rec.getAlignmentStart();
252 // make dataset sequence: start at 1, end at read length
253 SequenceI seq = new Sequence(
254 "" + (rec.getReadNegativeStrandFlag() ? "rev:" : "")
256 rec.getReadString().toLowerCase());
258 seq.setEnd(rec.getReadLength());
259 OfInt q = rec.getBaseQualityString().chars()
261 int p = seq.getStart();
264 seq.addSequenceFeature(new SequenceFeature("QUALITY", "", p, p,
265 (float) q.next() - ' ', "bamfile"));
268 String newRead = parser.parseCigarToSequence(rec,
269 insertions[rec.getReadNegativeStrandFlag() ? 1 : 0],
270 alignmentStart, seq);
272 // make alignment sequences
273 SequenceI alsq = seq.deriveSequence();
274 alsq.setSequence(newRead);
276 // set start relative to soft clip; assume end is set by Sequence code
277 alsq.setStart(rec.getStart() - rec.getUnclippedStart() + 1);
278 if (rec.getReadNegativeStrandFlag())
290 seqs.add(refSeq); // FIXME needs to be optional, and properly padded
294 // and reverse strand reads.
297 seqs.add(revRefSeq); // FIXME needs to be optional and properly padded
305 * Get the list of chromosomes or contigs from the file (listed in SQ entries
306 * in BAM file header)
308 * @return array of chromosome/contig strings
311 public Object[] preprocess()
313 List<SAMSequenceRecord> refSeqs = fileReader.getFileHeader()
314 .getSequenceDictionary().getSequences();
315 List<String> chrs = new ArrayList<>();
317 for (SAMSequenceRecord ref : refSeqs)
319 chrs.add(ref.getSequenceName());
321 return chrs.toArray();
324 public void setOptions(String chr, int s, int e)
329 suffix = chromosome + ":" + start + "-" + end;
332 public boolean parseSuffix()
338 int csep = suffix.indexOf(":");
339 int rsep = suffix.indexOf("-", csep);
340 if (csep < 0 || rsep < 0 || suffix.length() - rsep <= 1)
345 chr = suffix.substring(0, csep);
346 p1 = suffix.substring(csep + 1, rsep);
347 p2 = suffix.substring(rsep + 1);
351 cstart = Integer.parseInt(p1);
352 cend = Integer.parseInt(p2);
353 } catch (Exception e)
355 warningMessage = (warningMessage == null ? "" : warningMessage + "\n")
356 + "Couldn't parse range from " + suffix;