JAL-1766 test and buggy implementation of fasta index builder for a FASTA contig...
authorJim Procter <jprocter@issues.jalview.org>
Mon, 8 Jun 2015 10:50:09 +0000 (11:50 +0100)
committerJim Procter <jprocter@issues.jalview.org>
Mon, 8 Jun 2015 10:50:09 +0000 (11:50 +0100)
src/jalview/ext/htsjdk/HtsContigDb.java [new file with mode: 0644]
test/jalview/ext/htsjdk/TestHtsContigDb.java [new file with mode: 0644]

diff --git a/src/jalview/ext/htsjdk/HtsContigDb.java b/src/jalview/ext/htsjdk/HtsContigDb.java
new file mode 100644 (file)
index 0000000..f3b5098
--- /dev/null
@@ -0,0 +1,210 @@
+package jalview.ext.htsjdk;
+
+import htsjdk.samtools.SAMSequenceDictionary;
+import htsjdk.samtools.SAMSequenceRecord;
+import htsjdk.samtools.reference.ReferenceSequence;
+import htsjdk.samtools.reference.ReferenceSequenceFile;
+import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
+import htsjdk.samtools.util.StringUtil;
+import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceI;
+
+import java.io.File;
+import java.math.BigInteger;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * a source of sequence data accessed via the HTSJDK
+ * 
+ * @author jprocter
+ *
+ */
+public class HtsContigDb
+{
+
+  private String name;
+
+  private File dbLocation;
+
+  private htsjdk.samtools.reference.ReferenceSequenceFile refFile = null;
+
+  public HtsContigDb(String name, File descriptor) throws Exception
+  {
+    if (descriptor.isFile())
+    {
+      this.name = name;
+      dbLocation = descriptor;
+    }
+    initSource();
+  }
+
+  private void initSource() throws Exception
+  {
+    if (refFile != null)
+    {
+      return;
+    }
+
+    refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(
+            dbLocation, true);
+    if (refFile == null || refFile.getSequenceDictionary() == null)
+    {
+      // refFile = initSequenceDictionaryFor(dbLocation);
+    }
+
+  }
+
+
+  SAMSequenceDictionary rrefDict = null;
+  private ReferenceSequenceFile initSequenceDictionaryFor(File dbLocation2) throws Exception
+  {
+    rrefDict = getDictionary(dbLocation2, true);
+    if (rrefDict != null)
+    {
+      ReferenceSequenceFile rrefFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(dbLocation2, true);
+      return rrefFile;
+    }
+    return null;
+  }
+  /**
+   * code below hacked out from picard ----
+   * 
+   * picard/src/java/picard/sam/CreateSequenceDictionary.java
+   * https://github.com/
+   * broadinstitute/picard/commit/270580d3e28123496576f0b91b3433179bb5d876
+   */
+
+
+  /*
+   * The MIT License
+   * 
+   * Copyright (c) 2009 The Broad Institute
+   * 
+   * Permission is hereby granted, free of charge, to any person obtaining a
+   * copy of this software and associated documentation files (the "Software"),
+   * to deal in the Software without restriction, including without limitation
+   * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+   * and/or sell copies of the Software, and to permit persons to whom the
+   * Software is furnished to do so, subject to the following conditions:
+   * 
+   * The above copyright notice and this permission notice shall be included in
+   * all copies or substantial portions of the Software.
+   * 
+   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+   * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+   * DEALINGS IN THE SOFTWARE.
+   */
+  /**
+   * 
+   * @param f
+   * @param truncate
+   * @return
+   * @throws Exception
+   */
+  SAMSequenceDictionary getDictionary(File f, boolean truncate)
+          throws Exception
+  {
+    if (md5 == null)
+    {
+      initCreateSequenceDictionary();
+    }
+    final ReferenceSequenceFile refSeqFile = ReferenceSequenceFileFactory
+            .getReferenceSequenceFile(f, truncate);
+    ReferenceSequence refSeq;
+    List<SAMSequenceRecord> ret = new ArrayList<SAMSequenceRecord>();
+    Set<String> sequenceNames = new HashSet<String>();
+    for (int numSequences = 0; (refSeq = refSeqFile.nextSequence()) != null; ++numSequences)
+    {
+      if (sequenceNames.contains(refSeq.getName()))
+      {
+        throw new Exception(
+                "Sequence name appears more than once in reference: "
+                        + refSeq.getName());
+      }
+      sequenceNames.add(refSeq.getName());
+      ret.add(makeSequenceRecord(refSeq));
+    }
+    return new SAMSequenceDictionary(ret);
+  }
+
+  public boolean isValid()
+  {
+    return dbLocation != null && refFile != null;
+  }
+
+  /**
+   * Create one SAMSequenceRecord from a single fasta sequence
+   */
+  private SAMSequenceRecord makeSequenceRecord(
+          final ReferenceSequence refSeq)
+  {
+
+    final SAMSequenceRecord ret = new SAMSequenceRecord(refSeq.getName(),
+            refSeq.length());
+
+    // Compute MD5 of upcased bases
+    final byte[] bases = refSeq.getBases();
+    for (int i = 0; i < bases.length; ++i)
+    {
+      bases[i] = StringUtil.toUpperCase(bases[i]);
+    }
+
+    ret.setAttribute(SAMSequenceRecord.MD5_TAG, md5Hash(bases));
+    // if (GENOME_ASSEMBLY != null) {
+    // ret.setAttribute(SAMSequenceRecord.ASSEMBLY_TAG, GENOME_ASSEMBLY);
+    // }
+    // ret.setAttribute(SAMSequenceRecord.URI_TAG, URI);
+    // if (SPECIES != null) {
+    // ret.setAttribute(SAMSequenceRecord.SPECIES_TAG, SPECIES);
+    // }
+    return ret;
+  }
+
+  private MessageDigest md5;
+
+  public void initCreateSequenceDictionary() throws Exception
+  {
+    try
+    {
+      md5 = MessageDigest.getInstance("MD5");
+    } catch (NoSuchAlgorithmException e)
+    {
+      throw new Exception("MD5 algorithm not found", e);
+    }
+  }
+
+  private String md5Hash(final byte[] bytes)
+  {
+    md5.reset();
+    md5.update(bytes);
+    String s = new BigInteger(1, md5.digest()).toString(16);
+    if (s.length() != 32)
+    {
+      final String zeros = "00000000000000000000000000000000";
+      s = zeros.substring(0, 32 - s.length()) + s;
+    }
+    return s;
+  }
+
+  // ///// end of hts bits.
+
+  SequenceI getSequenceProxy(String id)
+  {
+    if (!isValid())
+    {
+      return null;
+    }
+
+    ReferenceSequence sseq = refFile.getSequence(id);
+    return new Sequence(sseq.getName(), new String(sseq.getBases()));
+  }
+}
diff --git a/test/jalview/ext/htsjdk/TestHtsContigDb.java b/test/jalview/ext/htsjdk/TestHtsContigDb.java
new file mode 100644 (file)
index 0000000..9fc5fb9
--- /dev/null
@@ -0,0 +1,32 @@
+/**
+ * 
+ */
+package jalview.ext.htsjdk;
+
+import jalview.datamodel.SequenceI;
+
+import java.io.File;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * @author jprocter
+ *
+ */
+public class TestHtsContigDb
+{
+  @Test
+  public final void testHTSReferenceSequence() throws Exception
+  {
+    HtsContigDb remmadb = new HtsContigDb("REEMADB", new File(
+            "D:\\Jims\\Jalview\\Reema\\pgmb.fasta"));
+
+    Assert.assertTrue(remmadb.isValid());
+
+    SequenceI sq = remmadb.getSequenceProxy("Deminut");
+    Assert.assertNotNull(sq);
+    Assert.assertNotEquals(0, sq.getLength());
+  }
+
+}