2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
23 import jalview.datamodel.Sequence;
24 import jalview.datamodel.SequenceI;
27 import java.io.IOException;
28 import java.util.Arrays;
29 import java.util.Iterator;
30 import java.util.SortedMap;
31 import java.util.TreeMap;
33 import htsjdk.samtools.CigarElement;
34 import htsjdk.samtools.SAMRecord;
35 import htsjdk.samtools.SAMRecordIterator;
36 import htsjdk.samtools.SamReader;
37 import htsjdk.samtools.SamReaderFactory;
38 import htsjdk.samtools.ValidationStringency;
40 public class BamFile extends AlignFile
46 * Creates a new BamFile object.
53 * Creates a new BamFile object.
63 public BamFile(String inFile, DataSourceType sourceType)
66 super(inFile, sourceType);
67 final SamReaderFactory factory = SamReaderFactory.makeDefault()
68 .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS,
69 SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS)
70 .validationStringency(ValidationStringency.SILENT);
71 fileReader = factory.open(new File(inFile));
74 public BamFile(FileParse source) throws IOException
76 final SamReaderFactory factory = SamReaderFactory.makeDefault()
77 .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS,
78 SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS)
79 .validationStringency(ValidationStringency.SILENT);
82 fileReader = factory.open(source.inFile);
87 public String print(SequenceI[] seqs, boolean jvsuffix)
89 // TODO Auto-generated method stub
94 public void parse() throws IOException
96 SAMRecordIterator it = fileReader.iterator();
97 SortedMap<Integer,Integer> insertions = getInsertions(it);
100 it = fileReader.iterator();
103 SAMRecord rec = it.next();
104 String read = rec.getReadString();
105 int start = rec.getStart();
106 int end = rec.getEnd();
108 SequenceI seq = new Sequence(rec.getReadName(), rec.getReadString());
110 String cigarredRead = parseCigarToSequence(read, rec, '-');
112 SequenceI alsq = seq.deriveSequence();
113 alsq.setSequence(cigarredRead);
114 alsq.setStart(start);
119 for (SequenceI seq : seqs)
122 SortedMap<Integer, Integer> seqInserts = insertions.subMap(0,
123 seq.getEnd()); // TODO start point should be start of alignment
125 seqInserts.forEach(action);
126 while ((nextInsertion != -1) && (nextInsertion < seq.getEnd()))
128 seq.insertCharsAt(nextInsertion + insertCount, '-');
129 // nextInsertion = insertions.nextSetBit(nextInsertion + 1);
139 R1: insert 3 = gap before 3rd res
142 R2: insert 4 = gap before 4th res
145 R3: insert 3,4 = 2 gaps before 3rd res
150 So need to store insertions as (pos, length) giving here (3,1),(4,1),(3,2) - then can sub longest length at position so (3,2), (4,1)
151 Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added
155 * Apply the CIGAR string to a read sequence and return the updated read.
160 * the corresponding SAM record
162 * gap character to use
163 * @return string representing read with gaps, clipping etc applied
165 private String parseCigarToSequence(String read,
166 SAMRecord rec, char gapChar)
168 Iterator<CigarElement> it = rec.getCigar().getCigarElements()
171 StringBuilder newRead = new StringBuilder();
173 int ap = 0; // position of query region qi.start;
175 // pad with spaces before read
177 .getReferencePositionAtReadPosition(next + 1); ap < alp;)
185 CigarElement el = it.next();
186 int length = el.getLength();
189 switch (el.getOperator())
194 read.substring(next, next + length));
197 case N: // intron in RNA
200 gaps = new char[length];
201 Arrays.fill(gaps, gapChar);
202 newRead.append(gaps);
205 // soft clipping - just skip this bit of the read
208 // work out how many gaps we need before the start of the soft clip -
209 // don't do this at the end of the read!
210 if (next == 0) // at start of read
212 int numgaps = rec.getUnclippedStart();
213 gaps = new char[numgaps];
214 Arrays.fill(gaps, ' ');
215 newRead.append(gaps);
219 read.substring(next, next + length).toLowerCase());
223 // the reference sequence and other reads should have been gapped for
224 // this insertion, so just add in the residues
225 newRead.append(read.substring(next, next + length));
229 // hard clipping - this stretch will not appear in the read
232 // P, X EQ don't know what to do with these
237 return newRead.toString();
241 * Get list of positions inserted to the reference sequence
246 private SortedMap<Integer, Integer> getInsertions(Iterator<SAMRecord> it)
248 SortedMap<Integer, Integer> insertions = new TreeMap<>();
251 // check each record for insertions in the CIGAR string
252 SAMRecord rec = it.next();
253 Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
256 while (cit.hasNext())
258 CigarElement el = cit.next();
259 switch (el.getOperator())
262 // add to insertions list, and move along the read
263 int refLocation = rec.getReferencePositionAtReadPosition(next);
264 insertions.put(refLocation, el.getLength());
265 next += el.getLength();
268 // match to reference, move along the read
269 next += el.getLength();
272 // deletions, introns etc don't consume any residues from the read