1 package jalview.ext.htsjdk;
3 import htsjdk.samtools.SAMSequenceDictionary;
4 import htsjdk.samtools.SAMSequenceRecord;
5 import htsjdk.samtools.reference.ReferenceSequence;
6 import htsjdk.samtools.reference.ReferenceSequenceFile;
7 import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
8 import htsjdk.samtools.util.StringUtil;
9 import jalview.datamodel.Sequence;
10 import jalview.datamodel.SequenceI;
13 import java.math.BigInteger;
14 import java.security.MessageDigest;
15 import java.security.NoSuchAlgorithmException;
16 import java.util.ArrayList;
17 import java.util.HashSet;
18 import java.util.List;
22 * a source of sequence data accessed via the HTSJDK
27 public class HtsContigDb
32 private File dbLocation;
34 private htsjdk.samtools.reference.ReferenceSequenceFile refFile = null;
36 public HtsContigDb(String name, File descriptor) throws Exception
38 if (descriptor.isFile())
41 dbLocation = descriptor;
46 private void initSource() throws Exception
53 refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(
55 if (refFile == null || refFile.getSequenceDictionary() == null)
57 // refFile = initSequenceDictionaryFor(dbLocation);
62 SAMSequenceDictionary rrefDict = null;
64 private ReferenceSequenceFile initSequenceDictionaryFor(File dbLocation2)
67 rrefDict = getDictionary(dbLocation2, true);
70 ReferenceSequenceFile rrefFile = ReferenceSequenceFileFactory
71 .getReferenceSequenceFile(dbLocation2, true);
78 * code below hacked out from picard ----
80 * picard/src/java/picard/sam/CreateSequenceDictionary.java
82 * broadinstitute/picard/commit/270580d3e28123496576f0b91b3433179bb5d876
88 * Copyright (c) 2009 The Broad Institute
90 * Permission is hereby granted, free of charge, to any person obtaining a
91 * copy of this software and associated documentation files (the "Software"),
92 * to deal in the Software without restriction, including without limitation
93 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
94 * and/or sell copies of the Software, and to permit persons to whom the
95 * Software is furnished to do so, subject to the following conditions:
97 * The above copyright notice and this permission notice shall be included in
98 * all copies or substantial portions of the Software.
100 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
101 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
102 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
103 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
104 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
105 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
106 * DEALINGS IN THE SOFTWARE.
115 SAMSequenceDictionary getDictionary(File f, boolean truncate)
120 initCreateSequenceDictionary();
122 final ReferenceSequenceFile refSeqFile = ReferenceSequenceFileFactory
123 .getReferenceSequenceFile(f, truncate);
124 ReferenceSequence refSeq;
125 List<SAMSequenceRecord> ret = new ArrayList<SAMSequenceRecord>();
126 Set<String> sequenceNames = new HashSet<String>();
127 for (int numSequences = 0; (refSeq = refSeqFile.nextSequence()) != null; ++numSequences)
129 if (sequenceNames.contains(refSeq.getName()))
132 "Sequence name appears more than once in reference: "
135 sequenceNames.add(refSeq.getName());
136 ret.add(makeSequenceRecord(refSeq));
138 return new SAMSequenceDictionary(ret);
141 public boolean isValid()
143 return dbLocation != null && refFile != null;
147 * Create one SAMSequenceRecord from a single fasta sequence
149 private SAMSequenceRecord makeSequenceRecord(
150 final ReferenceSequence refSeq)
153 final SAMSequenceRecord ret = new SAMSequenceRecord(refSeq.getName(),
156 // Compute MD5 of upcased bases
157 final byte[] bases = refSeq.getBases();
158 for (int i = 0; i < bases.length; ++i)
160 bases[i] = StringUtil.toUpperCase(bases[i]);
163 ret.setAttribute(SAMSequenceRecord.MD5_TAG, md5Hash(bases));
164 // if (GENOME_ASSEMBLY != null) {
165 // ret.setAttribute(SAMSequenceRecord.ASSEMBLY_TAG, GENOME_ASSEMBLY);
167 // ret.setAttribute(SAMSequenceRecord.URI_TAG, URI);
168 // if (SPECIES != null) {
169 // ret.setAttribute(SAMSequenceRecord.SPECIES_TAG, SPECIES);
174 private MessageDigest md5;
176 public void initCreateSequenceDictionary() throws Exception
180 md5 = MessageDigest.getInstance("MD5");
181 } catch (NoSuchAlgorithmException e)
183 throw new Exception("MD5 algorithm not found", e);
187 private String md5Hash(final byte[] bytes)
191 String s = new BigInteger(1, md5.digest()).toString(16);
192 if (s.length() != 32)
194 final String zeros = "00000000000000000000000000000000";
195 s = zeros.substring(0, 32 - s.length()) + s;
200 // ///// end of hts bits.
202 SequenceI getSequenceProxy(String id)
209 ReferenceSequence sseq = refFile.getSequence(id);
210 return new Sequence(sseq.getName(), new String(sseq.getBases()));