JAL-2909 separate reverse and forward strand reads. fix subtle bug in handling insert...
[jalview.git] / src / jalview / datamodel / CigarParser.java
index ee0d523..a42c2b1 100644 (file)
@@ -15,9 +15,9 @@ public class CigarParser
 {
   private final char gapChar;
 
-  public CigarParser(char gapCharacter)
+  public CigarParser(char gap)
   {
-    gapChar = gapCharacter;
+    gapChar = gap;
   }
 
   /**
@@ -27,22 +27,29 @@ public class CigarParser
    *          the SAM record for the read
    * @param insertions
    *          a map of inserted positions and lengths
+   * @param alignmentStart
+   *          start of alignment to be used as offset when padding reads with
+   *          gaps
+   * @param seq
+   *          sequence to be annotated with inserts
    * @return string representing read with gaps, clipping etc applied
    */
   public String parseCigarToSequence(SAMRecord rec,
-          SortedMap<Integer, Integer> insertions)
+          SortedMap<Integer, Integer> insertions, int alignmentStart,
+          SequenceI seq)
   {
+    StringBuilder newRead = new StringBuilder();
     Iterator<CigarElement> it = rec.getCigar().getCigarElements()
             .iterator();
 
-    StringBuilder newRead = new StringBuilder();
     int next = 0;
     int refnext = 0;
 
     // pad with spaces before read
     // first count gaps needed to pad to reference position
     // NB position is 1-based so number of gaps = pos-1
-    int gaps = rec.getUnclippedStart() - 1;
+    int gaps = rec.getStart() - alignmentStart;
+
     // now count gaps to pad for insertions in other reads
     int insertCount = countInsertionsBeforeRead(rec, insertions);
     addGaps(newRead, gaps + insertCount);
@@ -60,18 +67,30 @@ public class CigarParser
       int[] override = null;
       if (lastinserts > 0)
       {
-        // we just inserted something
-        // remove these inserts now - should be very first entry
-        int count = seqInserts.get(rec.getStart() + refnext);
-        override = new int[] { rec.getStart() + refnext,
-            count - lastinserts };
-        lastinserts = 0;
+        if (!seqInserts.containsKey(rec.getStart() + refnext))
+        {
+          // ERROR
+          int pos = rec.getStart() + refnext;
+          System.out.println(
+                  "Read " + rec.getReadName() + " has an insertion at "
+                          + pos + " which is not in seqInserts");
+        }
+        else
+        {
+          // we just inserted something
+          // remove these inserts now - should be very first entry
+          int count = seqInserts.get(rec.getStart() + refnext);
+          override = new int[] { rec.getStart() + refnext,
+              count - lastinserts };
+          lastinserts = 0;
+        }
       }
 
       Iterator<Map.Entry<Integer, Integer>> iit = seqInserts.entrySet()
               .iterator();
 
-      newRead.append(applyCigarOp(el, next, rec, iit, override));
+      String nextSegment = applyCigarOp(el, next, rec, iit, override, seq);
+      newRead.append(nextSegment);
 
       if (el.getOperator().consumesReferenceBases())
       {
@@ -108,11 +127,12 @@ public class CigarParser
    *          an optional <insertion position, length> which can override a
    *          <position,length> in it to change the length. Set to null if not
    *          used.
+   * @param seq
    * @return
    */
   private String applyCigarOp(CigarElement el, int next,
           SAMRecord rec, Iterator<Map.Entry<Integer, Integer>> it,
-          int[] override)
+          int[] override, SequenceI seq)
   {
     StringBuilder newRead = new StringBuilder();
     String read = rec.getReadString();
@@ -127,7 +147,7 @@ public class CigarParser
     case EQ:
       // matched or mismatched residues
       newRead = applyCigarConsumeReadAndRef(next, length, rec, it,
-              override);
+              override, seq);
       break;
     case N: // intron in RNA
     case D: // deletion
@@ -141,22 +161,23 @@ public class CigarParser
       }
       addGaps(newRead, length + insertCount);
       break;
-    case S:
-      // soft clipping - just skip this bit of the read
-      // do nothing
-
-      newRead.append(
-              read.substring(nextPos, nextPos + length).toLowerCase());
-      // nextPos += length;
-      break;
     case I:
       // the reference sequence and other reads should have been gapped for
       // this insertion, so just add in the residues
       newRead.append(read.substring(nextPos, nextPos + length));
-      // nextPos += length;
+      seq.addSequenceFeature(new SequenceFeature("INSERTION", "",
+              nextPos + 1, nextPos + length, "bamfile"));
+      break;
+    case S:
+      // soft clipping - just skip this bit of the read
+      seq.addSequenceFeature(new SequenceFeature("SOFTCLIP", "",
+              nextPos + 1, nextPos + length, "bamfile"));
       break;
     case H:
       // hard clipping - this stretch will not appear in the read
+      seq.addSequenceFeature(new SequenceFeature("HARDCLIP", "",
+              nextPos + 1, nextPos + length, "bamfile"));
+      break;
     default:
       break;
     }
@@ -177,20 +198,22 @@ public class CigarParser
    *          iterator over insertions in the CIGAR region
    * @param override
    *          insertions which have already been accounted for
+   * @param seq
    * @return
    */
   private StringBuilder applyCigarConsumeReadAndRef(int next, int length,
           SAMRecord rec, Iterator<Map.Entry<Integer, Integer>> it,
-          int[] override)
+          int[] override, SequenceI seq)
   {
     StringBuilder newRead = new StringBuilder();
     String read = rec.getReadString();
 
-    int nextPos = next;
+    int nextPos = next; // location of next position in read
     while (nextPos < next + length)
     {
-      int insertPos = next + length;
-      int insertLen = 0;
+      int insertPos = next + length; // location of next insert in read (or end
+                                     // of read)
+      int insertLen = 0; // no of gaps to insert after insertPos
       if (it.hasNext())
       {
         // account for sequence already having insertion
@@ -212,6 +235,11 @@ public class CigarParser
         {
           insertPos = rec.getReadPositionAtReferencePosition(key) - 1;
         }
+        else
+        {
+          insertPos = nextPos; // bugfix - trigger by override + insertions in
+                               // same M stretch
+        }
       }
       newRead.append(read.substring(nextPos, insertPos));
       nextPos = insertPos;
@@ -247,53 +275,116 @@ public class CigarParser
   Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added
   
   */
-  public SortedMap<Integer, Integer> getInsertions(Iterator<SAMRecord> it)
+  public SortedMap<Integer, Integer>[] getInsertions(Iterator<SAMRecord> it)
   {
-    SortedMap<Integer, Integer> insertions = new TreeMap<>();
-    while (it.hasNext())
+    return getInsertions(it, null);
+  }
+
+  public int firstAlColumn = -1, firstRposCol = -1;
+
+  public SortedMap<Integer, Integer>[] getInsertions(Iterator<SAMRecord> it,
+          Range[] extent)
+  {
+    SortedMap<Integer, Integer> sinserts[] = new SortedMap[] {
+        new TreeMap<>(), new TreeMap<>() };
+    Range xten = extent != null && extent[0] != null ? extent[0] : null;
+    do
     {
-      // check each record for insertions in the CIGAR string
+      // check extent of read
       SAMRecord rec = it.next();
+      if (extent != null)
+      {
+
+        int nstart = rec.getAlignmentStart();
+        if (nstart != 0)
+        {
+          nstart = rec.getReferencePositionAtReadPosition(nstart);
+        }
+        int nend = rec.getAlignmentEnd();
+        if (nend != 0)
+        {
+          nend = rec.getReferencePositionAtReadPosition(nend);
+        }
+
+        xten = new Range(
+                nstart <= 0 ? xten.start : Math.min(nstart, xten.start),
+                nend == 0 ? xten.end : Math.max(nend, xten.end));
+      }
+
+      // check each record for insertions in the CIGAR string
+
+      // pick the insert map to update
+      int insmap = rec.getReadNegativeStrandFlag() ? 1 : 0;
+      SortedMap<Integer, Integer> inserts = sinserts[insmap];
+
       Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
               .iterator();
-      int next = 0;
+      int refLocation = rec.getAlignmentStart();
+      int rloc = 0;
+      int alcol = 0;
+      int alpos = 0;
       while (cit.hasNext())
       {
         CigarElement el = cit.next();
         switch (el.getOperator())
         {
-        case I:
-          // add to insertions list, and move along the read
-          // location is 1 past the current position of next
-          // (note if we try to retrieve +1 position from reference by calling
-          // getReferencePositionAtReadPosition(next+1) we get 0 because it's an
-          // insertion!)
-          int refLocation = rec.getReferencePositionAtReadPosition(next)
-                  + 1;
-
-          // if there's already an insertion at this location, keep the longest
-          // insertion; if there's no insertion keep this one
-          if (!insertions.containsKey(refLocation)
-                  || (insertions.containsKey(refLocation)
-                          && insertions.get(refLocation) < el.getLength()))
-          {
-            insertions.put(refLocation, el.getLength());
-          }
-          next += el.getLength();
-          break;
-        case M:
         case S:
-          // match to reference, or soft clip, move along the read
-          next += el.getLength();
+        case H:
+          // ignore soft/hardclipped
+
           break;
         default:
-          // deletions, introns etc don't consume any residues from the read
-          break;
+          if (el.getOperator().consumesReadBases()
+                  && !el.getOperator().consumesReferenceBases())
+          {
+            // add to insertions list, and move along the read
+            // location is the start of next CIGAR segment
+            // because getReferencePositionAtReadPosition returns 0 for read
+            // positions which are not in the reference (like insertions)
+
+            // if there's already an insertion at this location, keep the
+            // longest
+            // insertion; if there's no insertion keep this one
+            if (!inserts.containsKey(refLocation)
+                    || (inserts.containsKey(refLocation)
+                            && inserts.get(refLocation) < el.getLength()))
+            {
+              inserts.put(refLocation, el.getLength());
+            }
+            break;
+          }
+          if (el.getOperator().consumesReferenceBases())
+          {
+            if (el.getOperator().consumesReadBases() && alcol == 0
+                    && refLocation + 1 > extent[0].start)
+            {
+              // first match position : last rloc + first match op
+              alcol = rloc + 1;
+              alpos = rec.getReferencePositionAtReadPosition(rloc + 1);
+            }
+            refLocation += el.getLength();
+          }
+
+        }
+        if (el.getOperator().consumesReadBases()
+                || el.getOperator().consumesReferenceBases())
+        {
+          rloc += el.getLength();
         }
       }
-
+      // Wrong: hack that fails to relocate reference to correct place in
+      // sequence
+      if (firstAlColumn < 0)
+      {
+        firstAlColumn = alcol;
+        firstRposCol = alpos;
+      }
+    } while (it.hasNext());
+    if (xten != null)
+    {
+      extent[0] = xten;
     }
-    return insertions;
+    return sinserts;
   }
 
   /**
@@ -320,18 +411,19 @@ public class CigarParser
    * 
    * @param rec
    *          the SAMRecord for the read
-   * @param insertions
+   * @param inserts
    *          the map of insertions
    * @return number of insertions before the read starts
    */
   private int countInsertionsBeforeRead(SAMRecord rec,
-          SortedMap<Integer, Integer> insertions)
+          SortedMap<Integer, Integer> inserts)
   {
     int gaps = 0;
 
     // add in any insertion gaps before read
-    // TODO start point should be start of alignment not 0
-    SortedMap<Integer, Integer> seqInserts = insertions.subMap(0,
+    // although we only need to get the submap from alignmentStart, there won't
+    // be any insertions before that so we can just start at 0
+    SortedMap<Integer, Integer> seqInserts = inserts.subMap(0,
             rec.getStart());
 
     Iterator<Map.Entry<Integer, Integer>> it = seqInserts.entrySet()