JAL-2909 separate reverse and forward strand reads. fix subtle bug in handling insert... feature/JAL-2909bamfiles_features
authorJim Procter <jprocter@issues.jalview.org>
Tue, 20 Mar 2018 16:34:10 +0000 (16:34 +0000)
committerJim Procter <jprocter@issues.jalview.org>
Tue, 20 Mar 2018 16:34:10 +0000 (16:34 +0000)
src/jalview/datamodel/CigarParser.java
src/jalview/io/BamFile.java

index a1124d4..a42c2b1 100644 (file)
@@ -275,52 +275,116 @@ public class CigarParser
   Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added
   
   */
-  public SortedMap<Integer, Integer> getInsertions(Iterator<SAMRecord> it)
+  public SortedMap<Integer, Integer>[] getInsertions(Iterator<SAMRecord> it)
   {
-    SortedMap<Integer, Integer> inserts = new TreeMap<>();
-    while (it.hasNext())
+    return getInsertions(it, null);
+  }
+
+  public int firstAlColumn = -1, firstRposCol = -1;
+
+  public SortedMap<Integer, Integer>[] getInsertions(Iterator<SAMRecord> it,
+          Range[] extent)
+  {
+    SortedMap<Integer, Integer> sinserts[] = new SortedMap[] {
+        new TreeMap<>(), new TreeMap<>() };
+    Range xten = extent != null && extent[0] != null ? extent[0] : null;
+    do
     {
-      // check each record for insertions in the CIGAR string
+      // check extent of read
       SAMRecord rec = it.next();
+      if (extent != null)
+      {
+
+        int nstart = rec.getAlignmentStart();
+        if (nstart != 0)
+        {
+          nstart = rec.getReferencePositionAtReadPosition(nstart);
+        }
+        int nend = rec.getAlignmentEnd();
+        if (nend != 0)
+        {
+          nend = rec.getReferencePositionAtReadPosition(nend);
+        }
+
+        xten = new Range(
+                nstart <= 0 ? xten.start : Math.min(nstart, xten.start),
+                nend == 0 ? xten.end : Math.max(nend, xten.end));
+      }
+
+      // check each record for insertions in the CIGAR string
+
+      // pick the insert map to update
+      int insmap = rec.getReadNegativeStrandFlag() ? 1 : 0;
+      SortedMap<Integer, Integer> inserts = sinserts[insmap];
+
       Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
               .iterator();
-      int next = 1;
+      int refLocation = rec.getAlignmentStart();
+      int rloc = 0;
+      int alcol = 0;
+      int alpos = 0;
       while (cit.hasNext())
       {
         CigarElement el = cit.next();
         switch (el.getOperator())
         {
-        case I:
-          // add to insertions list, and move along the read
-          // location is the start of next CIGAR segment
-          // because getReferencePositionAtReadPosition returns 0 for read
-          // positions which are not in the reference (like insertions)
-          int refLocation = rec.getReferencePositionAtReadPosition(
-                  next + el.getLength());
-
-          // if there's already an insertion at this location, keep the longest
-          // insertion; if there's no insertion keep this one
-          if (!inserts.containsKey(refLocation)
-                  || (inserts.containsKey(refLocation)
-                          && inserts.get(refLocation) < el.getLength()))
-          {
-            inserts.put(refLocation, el.getLength());
-          }
-          next += el.getLength();
-          break;
-        case M:
         case S:
-          // match to reference, or soft clip, move along the read
-          next += el.getLength();
+        case H:
+          // ignore soft/hardclipped
+
           break;
         default:
-          // deletions, introns etc don't consume any residues from the read
-          break;
+          if (el.getOperator().consumesReadBases()
+                  && !el.getOperator().consumesReferenceBases())
+          {
+            // add to insertions list, and move along the read
+            // location is the start of next CIGAR segment
+            // because getReferencePositionAtReadPosition returns 0 for read
+            // positions which are not in the reference (like insertions)
+
+            // if there's already an insertion at this location, keep the
+            // longest
+            // insertion; if there's no insertion keep this one
+            if (!inserts.containsKey(refLocation)
+                    || (inserts.containsKey(refLocation)
+                            && inserts.get(refLocation) < el.getLength()))
+            {
+              inserts.put(refLocation, el.getLength());
+            }
+            break;
+          }
+          if (el.getOperator().consumesReferenceBases())
+          {
+            if (el.getOperator().consumesReadBases() && alcol == 0
+                    && refLocation + 1 > extent[0].start)
+            {
+              // first match position : last rloc + first match op
+              alcol = rloc + 1;
+              alpos = rec.getReferencePositionAtReadPosition(rloc + 1);
+            }
+            refLocation += el.getLength();
+          }
+
+        }
+        if (el.getOperator().consumesReadBases()
+                || el.getOperator().consumesReferenceBases())
+        {
+          rloc += el.getLength();
         }
       }
-
+      // Wrong: hack that fails to relocate reference to correct place in
+      // sequence
+      if (firstAlColumn < 0)
+      {
+        firstAlColumn = alcol;
+        firstRposCol = alpos;
+      }
+    } while (it.hasNext());
+    if (xten != null)
+    {
+      extent[0] = xten;
     }
-    return inserts;
+    return sinserts;
   }
 
   /**
index 8a04867..8d55c53 100644 (file)
@@ -125,6 +125,44 @@ public class BamFile extends AlignFile
     return null;
   }
 
+  private StringBuilder insertRefSeq(
+          SortedMap<Integer, Integer> insertions, Range xtent)
+  {
+    StringBuilder refseq = new StringBuilder();
+    int inserted = 0;
+    for (int p = xtent.start; p < xtent.end; p++)
+    {
+      refseq.append('N');
+    }
+    for (Map.Entry<Integer, Integer> insert : insertions.entrySet())
+    {
+      int inspos = insert.getKey() - xtent.start + inserted;
+      if (inspos < 0)
+      {
+        System.out.println("Ignoring -ve insert position " + insert.getKey()
+                + " of " + insert.getValue() +
+                " (alpos: " + inspos + ")");
+        inspos = 0;
+        // continue;
+      }
+      for (int i = 0, j = insert.getValue(); i < j; i++)
+      {
+        inserted++;
+        refseq.insert(inspos, '-');
+      }
+    }
+    return refseq;
+  }
+
+  private void padRef(SequenceI ref, CigarParser cig)
+  { // pad to column
+    int padding = cig.firstAlColumn - ref.findIndex(cig.firstRposCol);
+    System.out.println("Padding " + padding + " to move refseq position "
+            + cig.firstRposCol + " to " + cig.firstAlColumn + "col.");
+
+    ref.insertCharAt(0, padding, '-');
+  }
+
   @Override
   public void parse()
   {
@@ -134,10 +172,30 @@ public class BamFile extends AlignFile
       SAMRecordIterator it = fileReader.query(chromosome, start, end,
               false);
       CigarParser parser = new CigarParser('-');
-      SortedMap<Integer, Integer> insertions = parser.getInsertions(it);
+      Range[] xtent = new Range[] { new Range(start, end) };
+      SortedMap<Integer, Integer> insertions[] = parser.getInsertions(it,
+              xtent);
       it.close();
 
+      SequenceI refSeq = new Sequence("chr:" + chromosome,
+              insertRefSeq(insertions[0], xtent[0])
+                      .toString());
+
+      refSeq.setStart(xtent[0].start);
+      refSeq.setEnd(xtent[0].end);
+      SequenceI revRefSeq = new Sequence("rev:chr:" + chromosome,
+              insertRefSeq(insertions[1], xtent[0])
+                      .toString());
+      revRefSeq.setStart(xtent[0].start);
+      revRefSeq.setEnd(xtent[0].end);
+
+      // Hack to move the seqs along
+      padRef(refSeq, parser);
+      padRef(revRefSeq, parser);
+
       it = fileReader.query(chromosome, start, end, false);
+
+      ArrayList<SequenceI> fwd = new ArrayList(), rev = new ArrayList();
       while (it.hasNext())
       {
         SAMRecord rec = it.next();
@@ -150,7 +208,9 @@ public class BamFile extends AlignFile
         }
 
         // make dataset sequence: start at 1, end at read length
-        SequenceI seq = new Sequence(rec.getReadName(),
+        SequenceI seq = new Sequence(
+                "" + (rec.getReadNegativeStrandFlag() ? "rev:" : "")
+                        + rec.getReadName(),
                 rec.getReadString().toLowerCase());
         seq.setStart(1);
         seq.setEnd(rec.getReadLength());
@@ -163,7 +223,8 @@ public class BamFile extends AlignFile
                   (float) q.next() - ' ', "bamfile"));
           p++;
         }
-        String newRead = parser.parseCigarToSequence(rec, insertions,
+        String newRead = parser.parseCigarToSequence(rec,
+                insertions[rec.getReadNegativeStrandFlag() ? 1 : 0],
                 alignmentStart, seq);
 
         // make alignment sequences
@@ -172,7 +233,28 @@ public class BamFile extends AlignFile
 
         // set start relative to soft clip; assume end is set by Sequence code
         alsq.setStart(rec.getStart() - rec.getUnclippedStart() + 1);
-        seqs.add(alsq);
+        if (rec.getReadNegativeStrandFlag())
+        {
+          rev.add(alsq);
+        }
+        else
+        {
+          fwd.add(alsq);
+        }
+      }
+      // Add forward
+      if (fwd.size() > 0)
+      {
+        seqs.add(refSeq); // FIXME needs to be optional, and properly padded
+        seqs.addAll(fwd);
+        fwd.clear();
+      }
+      // and reverse strand reads.
+      if (rev.size() > 0)
+      {
+        seqs.add(revRefSeq); // FIXME needs to be optional and properly padded
+        seqs.addAll(rev);
+        rev.clear();
       }
     }
   }