From ad251609d6d0d12aaa6d466b91d719dabf269d7b Mon Sep 17 00:00:00 2001 From: Jim Procter Date: Mon, 8 Jun 2015 11:50:09 +0100 Subject: [PATCH] JAL-1766 test and buggy implementation of fasta index builder for a FASTA contig database --- src/jalview/ext/htsjdk/HtsContigDb.java | 210 ++++++++++++++++++++++++++ test/jalview/ext/htsjdk/TestHtsContigDb.java | 32 ++++ 2 files changed, 242 insertions(+) create mode 100644 src/jalview/ext/htsjdk/HtsContigDb.java create mode 100644 test/jalview/ext/htsjdk/TestHtsContigDb.java diff --git a/src/jalview/ext/htsjdk/HtsContigDb.java b/src/jalview/ext/htsjdk/HtsContigDb.java new file mode 100644 index 0000000..f3b5098 --- /dev/null +++ b/src/jalview/ext/htsjdk/HtsContigDb.java @@ -0,0 +1,210 @@ +package jalview.ext.htsjdk; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.reference.ReferenceSequence; +import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFileFactory; +import htsjdk.samtools.util.StringUtil; +import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceI; + +import java.io.File; +import java.math.BigInteger; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** + * a source of sequence data accessed via the HTSJDK + * + * @author jprocter + * + */ +public class HtsContigDb +{ + + private String name; + + private File dbLocation; + + private htsjdk.samtools.reference.ReferenceSequenceFile refFile = null; + + public HtsContigDb(String name, File descriptor) throws Exception + { + if (descriptor.isFile()) + { + this.name = name; + dbLocation = descriptor; + } + initSource(); + } + + private void initSource() throws Exception + { + if (refFile != null) + { + return; + } + + refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile( + dbLocation, true); + if (refFile == null || refFile.getSequenceDictionary() == null) + { + // refFile = initSequenceDictionaryFor(dbLocation); + } + + } + + + SAMSequenceDictionary rrefDict = null; + private ReferenceSequenceFile initSequenceDictionaryFor(File dbLocation2) throws Exception + { + rrefDict = getDictionary(dbLocation2, true); + if (rrefDict != null) + { + ReferenceSequenceFile rrefFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(dbLocation2, true); + return rrefFile; + } + return null; + } + /** + * code below hacked out from picard ---- + * + * picard/src/java/picard/sam/CreateSequenceDictionary.java + * https://github.com/ + * broadinstitute/picard/commit/270580d3e28123496576f0b91b3433179bb5d876 + */ + + + /* + * The MIT License + * + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + /** + * + * @param f + * @param truncate + * @return + * @throws Exception + */ + SAMSequenceDictionary getDictionary(File f, boolean truncate) + throws Exception + { + if (md5 == null) + { + initCreateSequenceDictionary(); + } + final ReferenceSequenceFile refSeqFile = ReferenceSequenceFileFactory + .getReferenceSequenceFile(f, truncate); + ReferenceSequence refSeq; + List ret = new ArrayList(); + Set sequenceNames = new HashSet(); + for (int numSequences = 0; (refSeq = refSeqFile.nextSequence()) != null; ++numSequences) + { + if (sequenceNames.contains(refSeq.getName())) + { + throw new Exception( + "Sequence name appears more than once in reference: " + + refSeq.getName()); + } + sequenceNames.add(refSeq.getName()); + ret.add(makeSequenceRecord(refSeq)); + } + return new SAMSequenceDictionary(ret); + } + + public boolean isValid() + { + return dbLocation != null && refFile != null; + } + + /** + * Create one SAMSequenceRecord from a single fasta sequence + */ + private SAMSequenceRecord makeSequenceRecord( + final ReferenceSequence refSeq) + { + + final SAMSequenceRecord ret = new SAMSequenceRecord(refSeq.getName(), + refSeq.length()); + + // Compute MD5 of upcased bases + final byte[] bases = refSeq.getBases(); + for (int i = 0; i < bases.length; ++i) + { + bases[i] = StringUtil.toUpperCase(bases[i]); + } + + ret.setAttribute(SAMSequenceRecord.MD5_TAG, md5Hash(bases)); + // if (GENOME_ASSEMBLY != null) { + // ret.setAttribute(SAMSequenceRecord.ASSEMBLY_TAG, GENOME_ASSEMBLY); + // } + // ret.setAttribute(SAMSequenceRecord.URI_TAG, URI); + // if (SPECIES != null) { + // ret.setAttribute(SAMSequenceRecord.SPECIES_TAG, SPECIES); + // } + return ret; + } + + private MessageDigest md5; + + public void initCreateSequenceDictionary() throws Exception + { + try + { + md5 = MessageDigest.getInstance("MD5"); + } catch (NoSuchAlgorithmException e) + { + throw new Exception("MD5 algorithm not found", e); + } + } + + private String md5Hash(final byte[] bytes) + { + md5.reset(); + md5.update(bytes); + String s = new BigInteger(1, md5.digest()).toString(16); + if (s.length() != 32) + { + final String zeros = "00000000000000000000000000000000"; + s = zeros.substring(0, 32 - s.length()) + s; + } + return s; + } + + // ///// end of hts bits. + + SequenceI getSequenceProxy(String id) + { + if (!isValid()) + { + return null; + } + + ReferenceSequence sseq = refFile.getSequence(id); + return new Sequence(sseq.getName(), new String(sseq.getBases())); + } +} diff --git a/test/jalview/ext/htsjdk/TestHtsContigDb.java b/test/jalview/ext/htsjdk/TestHtsContigDb.java new file mode 100644 index 0000000..9fc5fb9 --- /dev/null +++ b/test/jalview/ext/htsjdk/TestHtsContigDb.java @@ -0,0 +1,32 @@ +/** + * + */ +package jalview.ext.htsjdk; + +import jalview.datamodel.SequenceI; + +import java.io.File; + +import org.junit.Assert; +import org.junit.Test; + +/** + * @author jprocter + * + */ +public class TestHtsContigDb +{ + @Test + public final void testHTSReferenceSequence() throws Exception + { + HtsContigDb remmadb = new HtsContigDb("REEMADB", new File( + "D:\\Jims\\Jalview\\Reema\\pgmb.fasta")); + + Assert.assertTrue(remmadb.isValid()); + + SequenceI sq = remmadb.getSequenceProxy("Deminut"); + Assert.assertNotNull(sq); + Assert.assertNotEquals(0, sq.getLength()); + } + +} -- 1.7.10.2