JAL-2909 Insertions almost
authorkiramt <k.mourao@dundee.ac.uk>
Tue, 20 Feb 2018 16:37:01 +0000 (16:37 +0000)
committerkiramt <k.mourao@dundee.ac.uk>
Tue, 20 Feb 2018 16:37:01 +0000 (16:37 +0000)
lib/htsjdk-1.133.jar [deleted file]
lib/htsjdk-2.12.0.jar [new file with mode: 0644]
src/jalview/io/BamFile.java

diff --git a/lib/htsjdk-1.133.jar b/lib/htsjdk-1.133.jar
deleted file mode 100644 (file)
index f084258..0000000
Binary files a/lib/htsjdk-1.133.jar and /dev/null differ
diff --git a/lib/htsjdk-2.12.0.jar b/lib/htsjdk-2.12.0.jar
new file mode 100644 (file)
index 0000000..c9b8ea3
Binary files /dev/null and b/lib/htsjdk-2.12.0.jar differ
index c94dc3e..e9919ff 100644 (file)
  */
 package jalview.io;
 
+import jalview.datamodel.Sequence;
 import jalview.datamodel.SequenceI;
 
 import java.io.File;
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.Iterator;
+import java.util.SortedMap;
+import java.util.TreeMap;
 
 import htsjdk.samtools.CigarElement;
 import htsjdk.samtools.SAMRecord;
@@ -90,8 +93,11 @@ public class BamFile extends AlignFile
   @Override
   public void parse() throws IOException
   {
-    SequenceI seq = null;
     SAMRecordIterator it = fileReader.iterator();
+    SortedMap<Integer,Integer> insertions = getInsertions(it);
+    it.close();
+
+    it = fileReader.iterator();
     while (it.hasNext())
     {
       SAMRecord rec = it.next();
@@ -99,64 +105,125 @@ public class BamFile extends AlignFile
       int start = rec.getStart();
       int end = rec.getEnd();
 
-      Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
-              .iterator();
+      SequenceI seq = new Sequence(rec.getReadName(), rec.getReadString());
 
-      seq = parseId(rec.getReadName());
-      String cigarredRead = parseCigarToSequence(read, cit, '-');
-      seq.setSequence(cigarredRead);
-      seq.setStart(start);
-      seq.setEnd(end);
-      seqs.addElement(seq);
+      String cigarredRead = parseCigarToSequence(read, rec, '-');
+
+      SequenceI alsq = seq.deriveSequence();
+      alsq.setSequence(cigarredRead);
+      alsq.setStart(start);
+      alsq.setEnd(end);
+      seqs.add(alsq);
     }
 
+    for (SequenceI seq : seqs)
+    {
+      int insertCount = 0;
+      SortedMap<Integer, Integer> seqInserts = insertions.subMap(0,
+              seq.getEnd()); // TODO start point should be start of alignment
+                             // not 0
+      seqInserts.forEach(action);
+      while ((nextInsertion != -1) && (nextInsertion < seq.getEnd()))
+      {
+        seq.insertCharsAt(nextInsertion + insertCount, '-');
+        // nextInsertion = insertions.nextSetBit(nextInsertion + 1);
+        insertCount++;
+      }
+    }
   }
 
+  /*
+  1234567
+  ABCDEFG insert 3,4
+  
+  R1: insert 3 = gap before 3rd res
+  AB.CDEFG
+  
+  R2: insert 4 = gap before 4th res
+  ABC.DEFG
+  
+  R3: insert 3,4 = 2 gaps before 3rd res
+  AB..CDEFG
+  
+  => AB--C-DEFG
+  
+  So need to store insertions as (pos, length) giving here (3,1),(4,1),(3,2) - then can sub longest length at position so (3,2), (4,1)
+  Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added
+  
+  */
   /**
-   * Apply the CIGAR string to a read sequence and return the updated read
+   * Apply the CIGAR string to a read sequence and return the updated read.
    * 
    * @param read
    *          the read to update
-   * @param it
-   *          iterator over cigar elements
+   * @param rec
+   *          the corresponding SAM record
    * @param gapChar
    *          gap character to use
    * @return string representing read with gaps, clipping etc applied
    */
   private String parseCigarToSequence(String read,
-          Iterator<CigarElement> it, char gapChar)
+          SAMRecord rec, char gapChar)
   {
+    Iterator<CigarElement> it = rec.getCigar().getCigarElements()
+            .iterator();
+
     StringBuilder newRead = new StringBuilder();
     int next = 0;
+    int ap = 0; // position of query region qi.start;
+
+    // pad with spaces before read
+    for (int alp = rec
+            .getReferencePositionAtReadPosition(next + 1); ap < alp;)
+    {
+      newRead.append(" ");
+      ap++;
+    }
 
     while (it.hasNext())
     {
       CigarElement el = it.next();
       int length = el.getLength();
+      char[] gaps;
+
       switch (el.getOperator())
       {
       case M:
         // matched residues
-        newRead.append(read.substring(next, next + length - 1));
+        newRead.append(
+                read.substring(next, next + length));
         next += length;
         break;
       case N: // intron in RNA
       case D: // deletion
         // add gaps
-        char[] gaps = new char[length];
+        gaps = new char[length];
         Arrays.fill(gaps, gapChar);
         newRead.append(gaps);
         break;
       case S:
         // soft clipping - just skip this bit of the read
         // do nothing
+
+        // work out how many gaps we need before the start of the soft clip -
+        // don't do this at the end of the read!
+        if (next == 0) // at start of read
+        {
+          int numgaps = rec.getUnclippedStart();
+          gaps = new char[numgaps];
+          Arrays.fill(gaps, ' ');
+          newRead.append(gaps);
+        }
+
+        newRead.append(
+                read.substring(next, next + length).toLowerCase());
         next += length;
         break;
       case I:
-        // add gaps to the reference sequence, which we know nothing about just
-        // now
-        // TODO
-        newRead.append(read.substring(next, next + length - 1));
+        // the reference sequence and other reads should have been gapped for
+        // this insertion, so just add in the residues
+        newRead.append(read.substring(next, next + length));
+        next += length;
         break;
       case H:
         // hard clipping - this stretch will not appear in the read
@@ -170,4 +237,45 @@ public class BamFile extends AlignFile
     return newRead.toString();
   }
 
+  /**
+   * Get list of positions inserted to the reference sequence
+   * 
+   * @param it
+   * @return
+   */
+  private SortedMap<Integer, Integer> getInsertions(Iterator<SAMRecord> it)
+  {
+    SortedMap<Integer, Integer> insertions = new TreeMap<>();
+    while (it.hasNext())
+    {
+      // check each record for insertions in the CIGAR string
+      SAMRecord rec = it.next();
+      Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
+              .iterator();
+      int next = 0;
+      while (cit.hasNext())
+      {
+        CigarElement el = cit.next();
+        switch (el.getOperator())
+        {
+        case I:
+          // add to insertions list, and move along the read
+          int refLocation = rec.getReferencePositionAtReadPosition(next);
+          insertions.put(refLocation, el.getLength());
+          next += el.getLength();
+          break;
+        case M:
+          // match to reference, move along the read
+          next += el.getLength();
+          break;
+        default:
+          // deletions, introns etc don't consume any residues from the read
+          break;
+        }
+      }
+  
+    }
+    return insertions;
+  }
+
 }