2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.ext.htsjdk;
23 import jalview.datamodel.Sequence;
24 import jalview.datamodel.SequenceI;
27 import java.io.IOException;
28 import java.math.BigInteger;
29 import java.nio.file.Path;
30 import java.security.MessageDigest;
31 import java.security.NoSuchAlgorithmException;
32 import java.util.ArrayList;
33 import java.util.HashSet;
34 import java.util.List;
37 import htsjdk.samtools.SAMException;
38 import htsjdk.samtools.SAMSequenceDictionary;
39 import htsjdk.samtools.SAMSequenceRecord;
40 import htsjdk.samtools.reference.FastaSequenceIndexCreator;
41 import htsjdk.samtools.reference.ReferenceSequence;
42 import htsjdk.samtools.reference.ReferenceSequenceFile;
43 import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
44 import htsjdk.samtools.util.StringUtil;
47 * a source of sequence data accessed via the HTSJDK
52 public class HtsContigDb
56 private File dbLocation;
58 private htsjdk.samtools.reference.ReferenceSequenceFile refFile = null;
60 public static void createFastaSequenceIndex(Path path, boolean overwrite)
65 FastaSequenceIndexCreator.create(path, overwrite);
66 } catch (SAMException e)
68 throw new IOException(e.getMessage());
72 public HtsContigDb(String name, File descriptor)
74 if (descriptor.isFile())
77 dbLocation = descriptor;
89 } catch (IOException e)
96 private void initSource()
103 refFile = ReferenceSequenceFileFactory
104 .getReferenceSequenceFile(dbLocation, true);
105 if (refFile == null || refFile.getSequenceDictionary() == null)
107 // refFile = initSequenceDictionaryFor(dbLocation);
112 SAMSequenceDictionary rrefDict = null;
114 private ReferenceSequenceFile initSequenceDictionaryFor(File dbLocation2)
117 rrefDict = getDictionary(dbLocation2, true);
118 if (rrefDict != null)
120 ReferenceSequenceFile rrefFile = ReferenceSequenceFileFactory
121 .getReferenceSequenceFile(dbLocation2, true);
128 * code below hacked out from picard ----
130 * picard/src/java/picard/sam/CreateSequenceDictionary.java
131 * https://github.com/
132 * broadinstitute/picard/commit/270580d3e28123496576f0b91b3433179bb5d876
138 * Copyright (c) 2009 The Broad Institute
140 * Permission is hereby granted, free of charge, to any person obtaining a
141 * copy of this software and associated documentation files (the "Software"),
142 * to deal in the Software without restriction, including without limitation
143 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
144 * and/or sell copies of the Software, and to permit persons to whom the
145 * Software is furnished to do so, subject to the following conditions:
147 * The above copyright notice and this permission notice shall be included in
148 * all copies or substantial portions of the Software.
150 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
151 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
152 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
153 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
154 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
155 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
156 * DEALINGS IN THE SOFTWARE.
165 SAMSequenceDictionary getDictionary(File f, boolean truncate)
170 initCreateSequenceDictionary();
172 final ReferenceSequenceFile refSeqFile = ReferenceSequenceFileFactory
173 .getReferenceSequenceFile(f, truncate);
174 ReferenceSequence refSeq;
175 List<SAMSequenceRecord> ret = new ArrayList<>();
176 Set<String> sequenceNames = new HashSet<>();
177 for (int numSequences = 0; (refSeq = refSeqFile
178 .nextSequence()) != null; ++numSequences)
180 if (sequenceNames.contains(refSeq.getName()))
183 "Sequence name appears more than once in reference: "
186 sequenceNames.add(refSeq.getName());
187 ret.add(makeSequenceRecord(refSeq));
189 return new SAMSequenceDictionary(ret);
192 public boolean isValid()
194 return dbLocation != null && refFile != null;
198 * Create one SAMSequenceRecord from a single fasta sequence
200 private SAMSequenceRecord makeSequenceRecord(
201 final ReferenceSequence refSeq)
204 final SAMSequenceRecord ret = new SAMSequenceRecord(refSeq.getName(),
207 // Compute MD5 of upcased bases
208 final byte[] bases = refSeq.getBases();
209 for (int i = 0; i < bases.length; ++i)
211 bases[i] = StringUtil.toUpperCase(bases[i]);
214 ret.setAttribute(SAMSequenceRecord.MD5_TAG, md5Hash(bases));
215 // if (GENOME_ASSEMBLY != null) {
216 // ret.setAttribute(SAMSequenceRecord.ASSEMBLY_TAG, GENOME_ASSEMBLY);
218 // ret.setAttribute(SAMSequenceRecord.URI_TAG, URI);
219 // if (SPECIES != null) {
220 // ret.setAttribute(SAMSequenceRecord.SPECIES_TAG, SPECIES);
225 private MessageDigest md5;
227 public void initCreateSequenceDictionary() throws Exception
231 md5 = MessageDigest.getInstance("MD5");
232 } catch (NoSuchAlgorithmException e)
234 throw new Exception("MD5 algorithm not found", e);
238 private String md5Hash(final byte[] bytes)
242 String s = new BigInteger(1, md5.digest()).toString(16);
243 if (s.length() != 32)
245 final String zeros = "00000000000000000000000000000000";
246 s = zeros.substring(0, 32 - s.length()) + s;
251 // ///// end of hts bits.
254 * Reads the contig with the given id and returns as a Jalview SequenceI object.
255 * Note the database must be indexed for this operation to succeed.
260 public SequenceI getSequenceProxy(String id)
262 if (!isValid() || !refFile.isIndexed())
265 "Cannot read contig as file is invalid or not indexed");
269 ReferenceSequence sseq = refFile.getSequence(id);
270 return new Sequence(sseq.getName(), new String(sseq.getBases()));
273 public boolean isIndexed()
275 return refFile != null && refFile.isIndexed();