2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
23 import jalview.datamodel.Sequence;
24 import jalview.datamodel.SequenceI;
27 import java.io.IOException;
28 import java.util.Arrays;
29 import java.util.Iterator;
31 import java.util.Map.Entry;
32 import java.util.SortedMap;
33 import java.util.TreeMap;
35 import htsjdk.samtools.CigarElement;
36 import htsjdk.samtools.SAMRecord;
37 import htsjdk.samtools.SAMRecordIterator;
38 import htsjdk.samtools.SamReader;
39 import htsjdk.samtools.SamReaderFactory;
40 import htsjdk.samtools.ValidationStringency;
42 public class BamFile extends AlignFile
48 * Creates a new BamFile object.
55 * Creates a new BamFile object.
65 public BamFile(String inFile, DataSourceType sourceType)
68 super(inFile, sourceType);
69 final SamReaderFactory factory = SamReaderFactory.makeDefault()
70 .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS,
71 SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS)
72 .validationStringency(ValidationStringency.SILENT);
73 fileReader = factory.open(new File(inFile));
76 public BamFile(FileParse source) throws IOException
78 final SamReaderFactory factory = SamReaderFactory.makeDefault()
79 .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS,
80 SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS)
81 .validationStringency(ValidationStringency.SILENT);
84 fileReader = factory.open(source.inFile);
89 public String print(SequenceI[] seqs, boolean jvsuffix)
91 // TODO Auto-generated method stub
96 public void parse() throws IOException
98 SAMRecordIterator it = fileReader.iterator();
99 SortedMap<Integer,Integer> insertions = getInsertions(it);
102 it = fileReader.iterator();
105 SAMRecord rec = it.next();
106 String read = rec.getReadString();
107 int start = rec.getStart();
108 int end = rec.getEnd();
110 SequenceI seq = new Sequence(rec.getReadName(), rec.getReadString());
112 String cigarredRead = parseCigarToSequence(read, rec, insertions,
115 SequenceI alsq = seq.deriveSequence();
116 alsq.setSequence(cigarredRead);
117 alsq.setStart(start);
122 // put insertions back into each sequence
123 for (SequenceI seq : seqs)
126 SortedMap<Integer, Integer> seqInserts = insertions
127 .subMap(seq.getStart(),
128 seq.getEnd()); // TODO start point should be start of alignment
131 Iterator<Map.Entry<Integer, Integer>> iit = seqInserts.entrySet()
133 while (iit.hasNext())
135 // TODO account for sequence already having insertion, and INDELS
136 Entry<Integer, Integer> entry = iit.next();
137 int pos = entry.getKey();
138 int length = entry.getValue();
139 seq.insertCharAt(pos + insertCount, length, '-');
149 R1: insert 3 = gap before 3rd res
152 R2: insert 4 = gap before 4th res
155 R3: insert 3,4 = 2 gaps before 3rd res
160 So need to store insertions as (pos, length) giving here (3,1),(4,1),(3,2) - then can sub longest length at position so (3,2), (4,1)
161 Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added
165 * Apply the CIGAR string to a read sequence and return the updated read.
170 * the corresponding SAM record
172 * gap character to use
173 * @return string representing read with gaps, clipping etc applied
175 private String parseCigarToSequence(String read,
176 SAMRecord rec, SortedMap<Integer, Integer> insertions,
179 Iterator<CigarElement> it = rec.getCigar().getCigarElements()
182 StringBuilder newRead = new StringBuilder();
185 // pad with spaces before read
186 // first count gaps needed to pad to reference position
187 int gaps = rec.getReferencePositionAtReadPosition(next + 1);
188 // now count gaps to pad for insertions in other reads
189 int insertCount = countInsertionsBeforeRead(rec, insertions);
190 addGaps(newRead, gaps + insertCount, gapChar);
192 SortedMap<Integer, Integer> seqInserts = insertions
193 .subMap(rec.getStart(), rec.getEnd() + 1);
194 // TODO start point should be start of alignment not 0
196 Iterator<Map.Entry<Integer, Integer>> iit = seqInserts.entrySet()
200 // TODO account for sequence already having insertion, and INDELS
201 Entry<Integer, Integer> entry = iit.next();
202 int insertPos = entry.getKey();
203 int insertLen = entry.getValue();
204 // seq.insertCharAt(pos + insertCount, length, '-');
210 CigarElement el = it.next();
211 int length = el.getLength();
214 switch (el.getOperator())
219 // matched or mismatched residues
220 newRead.append(read.substring(next, next + length));
223 case N: // intron in RNA
225 addGaps(newRead, length, gapChar);
228 // soft clipping - just skip this bit of the read
231 // work out how many gaps we need before the start of the soft clip -
232 // don't do this at the end of the read!
233 if (next == 0) // at start of read
235 addGaps(newRead, rec.getUnclippedStart(), gapChar);
238 newRead.append(read.substring(next, next + length).toLowerCase());
242 // the reference sequence and other reads should have been gapped for
243 // this insertion, so just add in the residues
244 newRead.append(read.substring(next, next + length));
248 // hard clipping - this stretch will not appear in the read
256 return newRead.toString();
259 private String applyCigar(String read, CigarElement el, char gapChar,
260 int next, int length, int softStart)
262 StringBuilder newRead = new StringBuilder();
263 switch (el.getOperator())
268 // matched or mismatched residues
269 newRead.append(read.substring(next, next + length));
272 case N: // intron in RNA
274 addGaps(newRead, length, gapChar);
277 // soft clipping - just skip this bit of the read
280 // work out how many gaps we need before the start of the soft clip -
281 // don't do this at the end of the read!
282 if (next == 0) // at start of read
284 addGaps(newRead, softStart, gapChar);
287 newRead.append(read.substring(next, next + length).toLowerCase());
291 // the reference sequence and other reads should have been gapped for
292 // this insertion, so just add in the residues
293 newRead.append(read.substring(next, next + length));
297 // hard clipping - this stretch will not appear in the read
302 return newRead.toString();
306 * Get list of positions inserted to the reference sequence
311 private SortedMap<Integer, Integer> getInsertions(Iterator<SAMRecord> it)
313 SortedMap<Integer, Integer> insertions = new TreeMap<>();
316 // check each record for insertions in the CIGAR string
317 SAMRecord rec = it.next();
318 Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
321 while (cit.hasNext())
323 CigarElement el = cit.next();
324 switch (el.getOperator())
327 // add to insertions list, and move along the read
328 int refLocation = rec.getReferencePositionAtReadPosition(next);
330 // if there's already an insertion at this location, keep the longest
331 // insertion; if there's no insertion keep this one
332 if (!insertions.containsKey(refLocation)
333 || (insertions.containsKey(refLocation)
334 && insertions.get(refLocation) < el.getLength()))
336 insertions.put(refLocation, el.getLength());
338 next += el.getLength();
341 // match to reference, move along the read
342 next += el.getLength();
345 // deletions, introns etc don't consume any residues from the read
355 * Add sequence of gaps to a read
362 * gap character to use
364 private void addGaps(StringBuilder read, int length, char gapChar)
368 char[] gaps = new char[length];
369 Arrays.fill(gaps, gapChar);
374 private int countInsertionsBeforeRead(SAMRecord rec,
375 SortedMap<Integer, Integer> insertions)
379 // add in any insertion gaps before read
380 // TODO start point should be start of alignment not 0
381 SortedMap<Integer, Integer> seqInserts = insertions.subMap(0,
384 Iterator<Map.Entry<Integer, Integer>> it = seqInserts.entrySet()
388 Entry<Integer, Integer> entry = it.next();
389 gaps += entry.getValue();