JAL-2909 Read in a bam file - but strand/location unaware
authorkiramt <k.mourao@dundee.ac.uk>
Fri, 16 Feb 2018 13:15:40 +0000 (13:15 +0000)
committerkiramt <k.mourao@dundee.ac.uk>
Mon, 19 Feb 2018 16:14:42 +0000 (16:14 +0000)
src/jalview/io/BamFile.java [new file with mode: 0644]
src/jalview/io/FileFormat.java
src/jalview/io/IdentifyFile.java

diff --git a/src/jalview/io/BamFile.java b/src/jalview/io/BamFile.java
new file mode 100644 (file)
index 0000000..c94dc3e
--- /dev/null
@@ -0,0 +1,173 @@
+/*
+ * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
+ * Copyright (C) $$Year-Rel$$ The Jalview Authors
+ * 
+ * This file is part of Jalview.
+ * 
+ * Jalview is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License 
+ * as published by the Free Software Foundation, either version 3
+ * of the License, or (at your option) any later version.
+ *  
+ * Jalview is distributed in the hope that it will be useful, but 
+ * WITHOUT ANY WARRANTY; without even the implied warranty 
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR 
+ * PURPOSE.  See the GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
+ * The Jalview Authors are detailed in the 'AUTHORS' file.
+ */
+package jalview.io;
+
+import jalview.datamodel.SequenceI;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Iterator;
+
+import htsjdk.samtools.CigarElement;
+import htsjdk.samtools.SAMRecord;
+import htsjdk.samtools.SAMRecordIterator;
+import htsjdk.samtools.SamReader;
+import htsjdk.samtools.SamReaderFactory;
+import htsjdk.samtools.ValidationStringency;
+
+public class BamFile extends AlignFile
+{
+
+  SamReader fileReader;
+
+  /**
+   * Creates a new BamFile object.
+   */
+  public BamFile()
+  {
+  }
+
+  /**
+   * Creates a new BamFile object.
+   * 
+   * @param inFile
+   *          DOCUMENT ME!
+   * @param sourceType
+   *          DOCUMENT ME!
+   * 
+   * @throws IOException
+   *           DOCUMENT ME!
+   */
+  public BamFile(String inFile, DataSourceType sourceType)
+          throws IOException
+  {
+    super(inFile, sourceType);
+    final SamReaderFactory factory = SamReaderFactory.makeDefault()
+            .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS,
+                    SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS)
+            .validationStringency(ValidationStringency.SILENT);
+    fileReader = factory.open(new File(inFile));
+  }
+
+  public BamFile(FileParse source) throws IOException
+  {
+    final SamReaderFactory factory = SamReaderFactory.makeDefault()
+            .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS,
+                    SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS)
+            .validationStringency(ValidationStringency.SILENT);
+
+    // File-based bam
+    fileReader = factory.open(source.inFile);
+    parse();
+  }
+  
+  @Override
+  public String print(SequenceI[] seqs, boolean jvsuffix)
+  {
+    // TODO Auto-generated method stub
+    return null;
+  }
+
+  @Override
+  public void parse() throws IOException
+  {
+    SequenceI seq = null;
+    SAMRecordIterator it = fileReader.iterator();
+    while (it.hasNext())
+    {
+      SAMRecord rec = it.next();
+      String read = rec.getReadString();
+      int start = rec.getStart();
+      int end = rec.getEnd();
+
+      Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
+              .iterator();
+
+      seq = parseId(rec.getReadName());
+      String cigarredRead = parseCigarToSequence(read, cit, '-');
+      seq.setSequence(cigarredRead);
+      seq.setStart(start);
+      seq.setEnd(end);
+      seqs.addElement(seq);
+    }
+
+  }
+
+  /**
+   * Apply the CIGAR string to a read sequence and return the updated read
+   * 
+   * @param read
+   *          the read to update
+   * @param it
+   *          iterator over cigar elements
+   * @param gapChar
+   *          gap character to use
+   * @return string representing read with gaps, clipping etc applied
+   */
+  private String parseCigarToSequence(String read,
+          Iterator<CigarElement> it, char gapChar)
+  {
+    StringBuilder newRead = new StringBuilder();
+    int next = 0;
+
+    while (it.hasNext())
+    {
+      CigarElement el = it.next();
+      int length = el.getLength();
+      switch (el.getOperator())
+      {
+      case M:
+        // matched residues
+        newRead.append(read.substring(next, next + length - 1));
+        next += length;
+        break;
+      case N: // intron in RNA
+      case D: // deletion
+        // add gaps
+        char[] gaps = new char[length];
+        Arrays.fill(gaps, gapChar);
+        newRead.append(gaps);
+        break;
+      case S:
+        // soft clipping - just skip this bit of the read
+        // do nothing
+        next += length;
+        break;
+      case I:
+        // add gaps to the reference sequence, which we know nothing about just
+        // now
+        // TODO
+        newRead.append(read.substring(next, next + length - 1));
+        break;
+      case H:
+        // hard clipping - this stretch will not appear in the read
+        break;
+      default:
+        // P, X EQ don't know what to do with these
+        break;
+      }
+
+    }
+    return newRead.toString();
+  }
+
+}
index 4b33dbf..21fadd2 100644 (file)
@@ -347,6 +347,27 @@ public enum FileFormat implements FileFormatI
       return true;
     }
   },
+  Bam("bam", "bam", true, true)
+  {
+    @Override
+    public AlignmentFileReaderI getReader(FileParse source)
+            throws IOException
+    {
+      return new BamFile(source);
+    }
+
+    @Override
+    public AlignmentFileWriterI getWriter(AlignmentI al)
+    {
+      return new BamFile();
+    }
+
+    @Override
+    public boolean isStructureFile()
+    {
+      return false;
+    }
+  },
   Jalview("Jalview", "jar,jvp", true, true)
   {
     @Override
index ff959b0..38414e3 100755 (executable)
@@ -135,6 +135,12 @@ public class IdentifyFile
                     || fileStr.lastIndexOf(".zip") > -1)
             {
               reply = FileFormat.Jalview;
+              // TODO shouldn't there be a break here?
+            }
+            else if (fileStr.lastIndexOf(".bam") > -1)
+            {
+              reply = FileFormat.Bam;
+              break;
             }
           }
           if (!lineswereskipped && data.startsWith("PK"))