2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.ext.htsjdk;
23 import htsjdk.samtools.SAMSequenceDictionary;
24 import htsjdk.samtools.SAMSequenceRecord;
25 import htsjdk.samtools.reference.ReferenceSequence;
26 import htsjdk.samtools.reference.ReferenceSequenceFile;
27 import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
28 import htsjdk.samtools.util.StringUtil;
29 import jalview.datamodel.Sequence;
30 import jalview.datamodel.SequenceI;
33 import java.math.BigInteger;
34 import java.security.MessageDigest;
35 import java.security.NoSuchAlgorithmException;
36 import java.util.ArrayList;
37 import java.util.HashSet;
38 import java.util.List;
42 * a source of sequence data accessed via the HTSJDK
47 public class HtsContigDb
52 private File dbLocation;
54 private htsjdk.samtools.reference.ReferenceSequenceFile refFile = null;
56 public HtsContigDb(String name, File descriptor) throws Exception
58 if (descriptor.isFile())
61 dbLocation = descriptor;
66 private void initSource() throws Exception
73 refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(
75 if (refFile == null || refFile.getSequenceDictionary() == null)
77 // refFile = initSequenceDictionaryFor(dbLocation);
82 SAMSequenceDictionary rrefDict = null;
84 private ReferenceSequenceFile initSequenceDictionaryFor(File dbLocation2)
87 rrefDict = getDictionary(dbLocation2, true);
90 ReferenceSequenceFile rrefFile = ReferenceSequenceFileFactory
91 .getReferenceSequenceFile(dbLocation2, true);
98 * code below hacked out from picard ----
100 * picard/src/java/picard/sam/CreateSequenceDictionary.java
101 * https://github.com/
102 * broadinstitute/picard/commit/270580d3e28123496576f0b91b3433179bb5d876
108 * Copyright (c) 2009 The Broad Institute
110 * Permission is hereby granted, free of charge, to any person obtaining a
111 * copy of this software and associated documentation files (the "Software"),
112 * to deal in the Software without restriction, including without limitation
113 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
114 * and/or sell copies of the Software, and to permit persons to whom the
115 * Software is furnished to do so, subject to the following conditions:
117 * The above copyright notice and this permission notice shall be included in
118 * all copies or substantial portions of the Software.
120 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
121 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
122 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
123 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
124 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
125 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
126 * DEALINGS IN THE SOFTWARE.
135 SAMSequenceDictionary getDictionary(File f, boolean truncate)
140 initCreateSequenceDictionary();
142 final ReferenceSequenceFile refSeqFile = ReferenceSequenceFileFactory
143 .getReferenceSequenceFile(f, truncate);
144 ReferenceSequence refSeq;
145 List<SAMSequenceRecord> ret = new ArrayList<SAMSequenceRecord>();
146 Set<String> sequenceNames = new HashSet<String>();
147 for (int numSequences = 0; (refSeq = refSeqFile.nextSequence()) != null; ++numSequences)
149 if (sequenceNames.contains(refSeq.getName()))
152 "Sequence name appears more than once in reference: "
155 sequenceNames.add(refSeq.getName());
156 ret.add(makeSequenceRecord(refSeq));
158 return new SAMSequenceDictionary(ret);
161 public boolean isValid()
163 return dbLocation != null && refFile != null;
167 * Create one SAMSequenceRecord from a single fasta sequence
169 private SAMSequenceRecord makeSequenceRecord(
170 final ReferenceSequence refSeq)
173 final SAMSequenceRecord ret = new SAMSequenceRecord(refSeq.getName(),
176 // Compute MD5 of upcased bases
177 final byte[] bases = refSeq.getBases();
178 for (int i = 0; i < bases.length; ++i)
180 bases[i] = StringUtil.toUpperCase(bases[i]);
183 ret.setAttribute(SAMSequenceRecord.MD5_TAG, md5Hash(bases));
184 // if (GENOME_ASSEMBLY != null) {
185 // ret.setAttribute(SAMSequenceRecord.ASSEMBLY_TAG, GENOME_ASSEMBLY);
187 // ret.setAttribute(SAMSequenceRecord.URI_TAG, URI);
188 // if (SPECIES != null) {
189 // ret.setAttribute(SAMSequenceRecord.SPECIES_TAG, SPECIES);
194 private MessageDigest md5;
196 public void initCreateSequenceDictionary() throws Exception
200 md5 = MessageDigest.getInstance("MD5");
201 } catch (NoSuchAlgorithmException e)
203 throw new Exception("MD5 algorithm not found", e);
207 private String md5Hash(final byte[] bytes)
211 String s = new BigInteger(1, md5.digest()).toString(16);
212 if (s.length() != 32)
214 final String zeros = "00000000000000000000000000000000";
215 s = zeros.substring(0, 32 - s.length()) + s;
220 // ///// end of hts bits.
222 SequenceI getSequenceProxy(String id)
229 ReferenceSequence sseq = refFile.getSequence(id);
230 return new Sequence(sseq.getName(), new String(sseq.getBases()));