Merge branch 'develop' into features/JAL-2446NCList
[jalview.git] / src / jalview / analysis / AlignmentUtils.java
index 1b8f84f..2b9b9f9 100644 (file)
@@ -35,14 +35,14 @@ import jalview.datamodel.Sequence;
 import jalview.datamodel.SequenceFeature;
 import jalview.datamodel.SequenceGroup;
 import jalview.datamodel.SequenceI;
-import jalview.io.gff.SequenceOntologyFactory;
+import jalview.datamodel.features.SequenceFeatures;
 import jalview.io.gff.SequenceOntologyI;
 import jalview.schemes.ResidueProperties;
 import jalview.util.Comparison;
 import jalview.util.DBRefUtils;
+import jalview.util.IntRangeComparator;
 import jalview.util.MapList;
 import jalview.util.MappingUtils;
-import jalview.util.RangeComparator;
 import jalview.util.StringUtils;
 
 import java.io.UnsupportedEncodingException;
@@ -51,7 +51,6 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
-import java.util.Comparator;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
@@ -654,15 +653,16 @@ public class AlignmentUtils
     int toOffset = alignTo.getStart() - 1;
     int sourceGapMappedLength = 0;
     boolean inExon = false;
-    final char[] thisSeq = alignTo.getSequence();
-    final char[] thatAligned = alignFrom.getSequence();
-    StringBuilder thisAligned = new StringBuilder(2 * thisSeq.length);
+    final int toLength = alignTo.getLength();
+    final int fromLength = alignFrom.getLength();
+    StringBuilder thisAligned = new StringBuilder(2 * toLength);
 
     /*
      * Traverse the 'model' aligned sequence
      */
-    for (char sourceChar : thatAligned)
+    for (int i = 0; i < fromLength; i++)
     {
+      char sourceChar = alignFrom.getCharAt(i);
       if (sourceChar == sourceGap)
       {
         sourceGapMappedLength += ratio;
@@ -702,9 +702,9 @@ public class AlignmentUtils
        */
       int intronLength = 0;
       while (basesWritten + toOffset < mappedCodonEnd
-              && thisSeqPos < thisSeq.length)
+              && thisSeqPos < toLength)
       {
-        final char c = thisSeq[thisSeqPos++];
+        final char c = alignTo.getCharAt(thisSeqPos++);
         if (c != myGapChar)
         {
           basesWritten++;
@@ -730,7 +730,7 @@ public class AlignmentUtils
             int gapsToAdd = calculateGapsToInsert(preserveMappedGaps,
                     preserveUnmappedGaps, sourceGapMappedLength, inExon,
                     trailingCopiedGap.length(), intronLength, startOfCodon);
-            for (int i = 0; i < gapsToAdd; i++)
+            for (int k = 0; k < gapsToAdd; k++)
             {
               thisAligned.append(myGapChar);
             }
@@ -758,9 +758,9 @@ public class AlignmentUtils
      * At end of model aligned sequence. Copy any remaining target sequence, optionally
      * including (intron) gaps.
      */
-    while (thisSeqPos < thisSeq.length)
+    while (thisSeqPos < toLength)
     {
-      final char c = thisSeq[thisSeqPos++];
+      final char c = alignTo.getCharAt(thisSeqPos++);
       if (c != myGapChar || preserveUnmappedGaps)
       {
         thisAligned.append(c);
@@ -952,7 +952,7 @@ public class AlignmentUtils
       SequenceI peptide = mapping.findAlignedSequence(cdsSeq, protein);
       if (peptide != null)
       {
-        int peptideLength = peptide.getLength();
+        final int peptideLength = peptide.getLength();
         Mapping map = mapping.getMappingBetween(cdsSeq, peptide);
         if (map != null)
         {
@@ -961,9 +961,9 @@ public class AlignmentUtils
           {
             mapList = mapList.getInverse();
           }
-          int cdsLength = cdsDss.getLength();
-          int mappedFromLength = MappingUtils
-                  .getLength(mapList.getFromRanges());
+          final int cdsLength = cdsDss.getLength();
+          int mappedFromLength = MappingUtils.getLength(mapList
+                  .getFromRanges());
           int mappedToLength = MappingUtils
                   .getLength(mapList.getToRanges());
           boolean addStopCodon = (cdsLength == mappedFromLength
@@ -988,14 +988,15 @@ public class AlignmentUtils
            * walk over the aligned peptide sequence and insert mapped 
            * codons for residues in the aligned cds sequence 
            */
-          char[] alignedPeptide = peptide.getSequence();
-          char[] nucleotides = cdsDss.getSequence();
           int copiedBases = 0;
           int cdsStart = cdsDss.getStart();
           int proteinPos = peptide.getStart() - 1;
           int cdsCol = 0;
-          for (char residue : alignedPeptide)
+
+          for (int col = 0; col < peptideLength; col++)
           {
+            char residue = peptide.getCharAt(col);
+
             if (Comparison.isGap(residue))
             {
               cdsCol += CODON_LENGTH;
@@ -1013,7 +1014,7 @@ public class AlignmentUtils
               {
                 for (int j = codon[0]; j <= codon[1]; j++)
                 {
-                  char mappedBase = nucleotides[j - cdsStart];
+                  char mappedBase = cdsDss.getCharAt(j - cdsStart);
                   alignedCds[cdsCol++] = mappedBase;
                   copiedBases++;
                 }
@@ -1025,7 +1026,7 @@ public class AlignmentUtils
            * append stop codon if not mapped from protein,
            * closing it up to the end of the mapped sequence
            */
-          if (copiedBases == nucleotides.length - CODON_LENGTH)
+          if (copiedBases == cdsLength - CODON_LENGTH)
           {
             for (int i = alignedCds.length - 1; i >= 0; i--)
             {
@@ -1035,10 +1036,9 @@ public class AlignmentUtils
                 break;
               }
             }
-            for (int i = nucleotides.length
-                    - CODON_LENGTH; i < nucleotides.length; i++)
+            for (int i = cdsLength - CODON_LENGTH; i < cdsLength; i++)
             {
-              alignedCds[cdsCol++] = nucleotides[i];
+              alignedCds[cdsCol++] = cdsDss.getCharAt(i);
             }
           }
           cdsSeq.setSequence(new String(alignedCds));
@@ -1208,21 +1208,26 @@ public class AlignmentUtils
           List<SequenceI> unmappedProtein)
   {
     /*
-     * Prefill aligned sequences with gaps before inserting aligned protein
-     * residues.
+     * prefill peptide sequences with gaps 
      */
     int alignedWidth = alignedCodons.size();
     char[] gaps = new char[alignedWidth];
     Arrays.fill(gaps, protein.getGapCharacter());
-    String allGaps = String.valueOf(gaps);
+    Map<SequenceI, char[]> peptides = new HashMap<>();
     for (SequenceI seq : protein.getSequences())
     {
       if (!unmappedProtein.contains(seq))
       {
-        seq.setSequence(allGaps);
+        peptides.put(seq, Arrays.copyOf(gaps, gaps.length));
       }
     }
 
+    /*
+     * Traverse the codons left to right (as defined by CodonComparator)
+     * and insert peptides in each column where the sequence is mapped.
+     * This gives a peptide 'alignment' where residues are aligned if their
+     * corresponding codons occupy the same columns in the cdna alignment.
+     */
     int column = 0;
     for (AlignedCodon codon : alignedCodons.keySet())
     {
@@ -1230,12 +1235,20 @@ public class AlignmentUtils
               .get(codon);
       for (Entry<SequenceI, AlignedCodon> entry : columnResidues.entrySet())
       {
-        // place translated codon at its column position in sequence
-        entry.getKey().getSequence()[column] = entry.getValue().product
-                .charAt(0);
+        char residue = entry.getValue().product.charAt(0);
+        peptides.get(entry.getKey())[column] = residue;
       }
       column++;
     }
+
+    /*
+     * and finally set the constructed sequences
+     */
+    for (Entry<SequenceI, char[]> entry : peptides.entrySet())
+    {
+      entry.getKey().setSequence(new String(entry.getValue()));
+    }
+
     return 0;
   }
 
@@ -2061,11 +2074,11 @@ public class AlignmentUtils
    * 
    * @param fromSeq
    * @param toSeq
+   * @param mapping
+   *          the mapping from 'fromSeq' to 'toSeq'
    * @param select
    *          if not null, only features of this type are copied (including
    *          subtypes in the Sequence Ontology)
-   * @param mapping
-   *          the mapping from 'fromSeq' to 'toSeq'
    * @param omitting
    */
   public static int transferFeatures(SequenceI fromSeq, SequenceI toSeq,
@@ -2077,75 +2090,74 @@ public class AlignmentUtils
       copyTo = copyTo.getDatasetSequence();
     }
 
-    SequenceOntologyI so = SequenceOntologyFactory.getInstance();
+    /*
+     * get features, optionally restricted by an ontology term
+     */
+    List<SequenceFeature> sfs = select == null ? fromSeq.getFeatures()
+            .getPositionalFeatures() : fromSeq.getFeatures()
+            .getFeaturesByOntology(select);
+
     int count = 0;
-    SequenceFeature[] sfs = fromSeq.getSequenceFeatures();
-    if (sfs != null)
+    for (SequenceFeature sf : sfs)
     {
-      for (SequenceFeature sf : sfs)
+      String type = sf.getType();
+      boolean omit = false;
+      for (String toOmit : omitting)
       {
-        String type = sf.getType();
-        if (select != null && !so.isA(type, select))
-        {
-          continue;
-        }
-        boolean omit = false;
-        for (String toOmit : omitting)
+        if (type.equals(toOmit))
         {
-          if (type.equals(toOmit))
-          {
-            omit = true;
-          }
-        }
-        if (omit)
-        {
-          continue;
+          omit = true;
         }
+      }
+      if (omit)
+      {
+        continue;
+      }
 
-        /*
-         * locate the mapped range - null if either start or end is
-         * not mapped (no partial overlaps are calculated)
-         */
-        int start = sf.getBegin();
-        int end = sf.getEnd();
-        int[] mappedTo = mapping.locateInTo(start, end);
-        /*
-         * if whole exon range doesn't map, try interpreting it
-         * as 5' or 3' exon overlapping the CDS range
-         */
-        if (mappedTo == null)
-        {
-          mappedTo = mapping.locateInTo(end, end);
-          if (mappedTo != null)
-          {
-            /*
-             * end of exon is in CDS range - 5' overlap
-             * to a range from the start of the peptide
-             */
-            mappedTo[0] = 1;
-          }
-        }
-        if (mappedTo == null)
+      /*
+       * locate the mapped range - null if either start or end is
+       * not mapped (no partial overlaps are calculated)
+       */
+      int start = sf.getBegin();
+      int end = sf.getEnd();
+      int[] mappedTo = mapping.locateInTo(start, end);
+      /*
+       * if whole exon range doesn't map, try interpreting it
+       * as 5' or 3' exon overlapping the CDS range
+       */
+      if (mappedTo == null)
+      {
+        mappedTo = mapping.locateInTo(end, end);
+        if (mappedTo != null)
         {
-          mappedTo = mapping.locateInTo(start, start);
-          if (mappedTo != null)
-          {
-            /*
-             * start of exon is in CDS range - 3' overlap
-             * to a range up to the end of the peptide
-             */
-            mappedTo[1] = toSeq.getLength();
-          }
+          /*
+           * end of exon is in CDS range - 5' overlap
+           * to a range from the start of the peptide
+           */
+          mappedTo[0] = 1;
         }
+      }
+      if (mappedTo == null)
+      {
+        mappedTo = mapping.locateInTo(start, start);
         if (mappedTo != null)
         {
-          SequenceFeature copy = new SequenceFeature(sf);
-          copy.setBegin(Math.min(mappedTo[0], mappedTo[1]));
-          copy.setEnd(Math.max(mappedTo[0], mappedTo[1]));
-          copyTo.addSequenceFeature(copy);
-          count++;
+          /*
+           * start of exon is in CDS range - 3' overlap
+           * to a range up to the end of the peptide
+           */
+          mappedTo[1] = toSeq.getLength();
         }
       }
+      if (mappedTo != null)
+      {
+        int newBegin = Math.min(mappedTo[0], mappedTo[1]);
+        int newEnd = Math.max(mappedTo[0], mappedTo[1]);
+        SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd,
+                sf.getFeatureGroup(), sf.getScore());
+        copyTo.addSequenceFeature(copy);
+        count++;
+      }
     }
     return count;
   }
@@ -2210,49 +2222,44 @@ public class AlignmentUtils
   public static List<int[]> findCdsPositions(SequenceI dnaSeq)
   {
     List<int[]> result = new ArrayList<int[]>();
-    SequenceFeature[] sfs = dnaSeq.getSequenceFeatures();
-    if (sfs == null)
+
+    List<SequenceFeature> sfs = dnaSeq.getFeatures().getFeaturesByOntology(
+            SequenceOntologyI.CDS);
+    if (sfs.isEmpty())
     {
       return result;
     }
-
-    SequenceOntologyI so = SequenceOntologyFactory.getInstance();
+    SequenceFeatures.sortFeatures(sfs, true);
     int startPhase = 0;
 
     for (SequenceFeature sf : sfs)
     {
+      int phase = 0;
+      try
+      {
+        phase = Integer.parseInt(sf.getPhase());
+      } catch (NumberFormatException e)
+      {
+        // ignore
+      }
       /*
-       * process a CDS feature (or a sub-type of CDS)
+       * phase > 0 on first codon means 5' incomplete - skip to the start
+       * of the next codon; example ENST00000496384
        */
-      if (so.isA(sf.getType(), SequenceOntologyI.CDS))
+      int begin = sf.getBegin();
+      int end = sf.getEnd();
+      if (result.isEmpty())
       {
-        int phase = 0;
-        try
-        {
-          phase = Integer.parseInt(sf.getPhase());
-        } catch (NumberFormatException e)
-        {
-          // ignore
-        }
-        /*
-         * phase > 0 on first codon means 5' incomplete - skip to the start
-         * of the next codon; example ENST00000496384
-         */
-        int begin = sf.getBegin();
-        int end = sf.getEnd();
-        if (result.isEmpty())
+        begin += phase;
+        if (begin > end)
         {
-          begin += phase;
-          if (begin > end)
-          {
-            // shouldn't happen!
-            System.err.println(
-                    "Error: start phase extends beyond start CDS in "
-                            + dnaSeq.getName());
-          }
+          // shouldn't happen!
+          System.err
+                  .println("Error: start phase extends beyond start CDS in "
+                          + dnaSeq.getName());
         }
-        result.add(new int[] { begin, end });
       }
+      result.add(new int[] { begin, end });
     }
 
     /*
@@ -2272,7 +2279,7 @@ public class AlignmentUtils
      * ranges are assembled in order. Other cases should not use this method,
      * but instead construct an explicit mapping for CDS (e.g. EMBL parsing).
      */
-    Collections.sort(result, new RangeComparator(true));
+    Collections.sort(result, IntRangeComparator.ASCENDING);
     return result;
   }
 
@@ -2325,24 +2332,6 @@ public class AlignmentUtils
       count += computePeptideVariants(peptide, peptidePos, codonVariants);
     }
 
-    /*
-     * sort to get sequence features in start position order
-     * - would be better to store in Sequence as a TreeSet or NCList?
-     */
-    if (peptide.getSequenceFeatures() != null)
-    {
-      Arrays.sort(peptide.getSequenceFeatures(),
-              new Comparator<SequenceFeature>()
-              {
-                @Override
-                public int compare(SequenceFeature o1, SequenceFeature o2)
-                {
-                  int c = Integer.compare(o1.getBegin(), o2.getBegin());
-                  return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd())
-                          : c;
-                }
-              });
-    }
     return count;
   }
 
@@ -2470,10 +2459,9 @@ public class AlignmentUtils
       String trans3Char = StringUtils
               .toSentenceCase(ResidueProperties.aa2Triplet.get(trans));
       String desc = "p." + residue3Char + peptidePos + trans3Char;
-      // set score to 0f so 'graduated colour' option is offered! JAL-2060
       SequenceFeature sf = new SequenceFeature(
               SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos,
-              peptidePos, 0f, var.getSource());
+              peptidePos, var.getSource());
       StringBuilder attributes = new StringBuilder(32);
       String id = (String) var.variant.getValue(ID);
       if (id != null)
@@ -2531,10 +2519,10 @@ public class AlignmentUtils
      * LinkedHashMap ensures we keep the peptide features in sequence order
      */
     LinkedHashMap<Integer, List<DnaVariant>[]> variants = new LinkedHashMap<Integer, List<DnaVariant>[]>();
-    SequenceOntologyI so = SequenceOntologyFactory.getInstance();
 
-    SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures();
-    if (dnaFeatures == null)
+    List<SequenceFeature> dnaFeatures = dnaSeq.getFeatures()
+            .getFeaturesByOntology(SequenceOntologyI.SEQUENCE_VARIANT);
+    if (dnaFeatures.isEmpty())
     {
       return variants;
     }
@@ -2554,84 +2542,80 @@ public class AlignmentUtils
         // not handling multi-locus variant features
         continue;
       }
-      if (so.isA(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT))
+      int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol);
+      if (mapsTo == null)
       {
-        int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol);
-        if (mapsTo == null)
-        {
-          // feature doesn't lie within coding region
-          continue;
-        }
-        int peptidePosition = mapsTo[0];
-        List<DnaVariant>[] codonVariants = variants.get(peptidePosition);
-        if (codonVariants == null)
-        {
-          codonVariants = new ArrayList[CODON_LENGTH];
-          codonVariants[0] = new ArrayList<DnaVariant>();
-          codonVariants[1] = new ArrayList<DnaVariant>();
-          codonVariants[2] = new ArrayList<DnaVariant>();
-          variants.put(peptidePosition, codonVariants);
-        }
+        // feature doesn't lie within coding region
+        continue;
+      }
+      int peptidePosition = mapsTo[0];
+      List<DnaVariant>[] codonVariants = variants.get(peptidePosition);
+      if (codonVariants == null)
+      {
+        codonVariants = new ArrayList[CODON_LENGTH];
+        codonVariants[0] = new ArrayList<DnaVariant>();
+        codonVariants[1] = new ArrayList<DnaVariant>();
+        codonVariants[2] = new ArrayList<DnaVariant>();
+        variants.put(peptidePosition, codonVariants);
+      }
 
-        /*
-         * extract dna variants to a string array
-         */
-        String alls = (String) sf.getValue("alleles");
-        if (alls == null)
-        {
-          continue;
-        }
-        String[] alleles = alls.toUpperCase().split(",");
-        int i = 0;
-        for (String allele : alleles)
-        {
-          alleles[i++] = allele.trim(); // lose any space characters "A, G"
-        }
+      /*
+       * extract dna variants to a string array
+       */
+      String alls = (String) sf.getValue("alleles");
+      if (alls == null)
+      {
+        continue;
+      }
+      String[] alleles = alls.toUpperCase().split(",");
+      int i = 0;
+      for (String allele : alleles)
+      {
+        alleles[i++] = allele.trim(); // lose any space characters "A, G"
+      }
 
-        /*
-         * get this peptide's codon positions e.g. [3, 4, 5] or [4, 7, 10]
-         */
-        int[] codon = peptidePosition == lastPeptidePostion ? lastCodon
-                : MappingUtils.flattenRanges(dnaToProtein
-                        .locateInFrom(peptidePosition, peptidePosition));
-        lastPeptidePostion = peptidePosition;
-        lastCodon = codon;
+      /*
+       * get this peptide's codon positions e.g. [3, 4, 5] or [4, 7, 10]
+       */
+      int[] codon = peptidePosition == lastPeptidePostion ? lastCodon
+              : MappingUtils.flattenRanges(dnaToProtein.locateInFrom(
+                      peptidePosition, peptidePosition));
+      lastPeptidePostion = peptidePosition;
+      lastCodon = codon;
 
-        /*
-         * save nucleotide (and any variant) for each codon position
-         */
-        for (int codonPos = 0; codonPos < CODON_LENGTH; codonPos++)
+      /*
+       * save nucleotide (and any variant) for each codon position
+       */
+      for (int codonPos = 0; codonPos < CODON_LENGTH; codonPos++)
+      {
+        String nucleotide = String.valueOf(
+                dnaSeq.getCharAt(codon[codonPos] - dnaStart)).toUpperCase();
+        List<DnaVariant> codonVariant = codonVariants[codonPos];
+        if (codon[codonPos] == dnaCol)
         {
-          String nucleotide = String
-                  .valueOf(dnaSeq.getCharAt(codon[codonPos] - dnaStart))
-                  .toUpperCase();
-          List<DnaVariant> codonVariant = codonVariants[codonPos];
-          if (codon[codonPos] == dnaCol)
+          if (!codonVariant.isEmpty()
+                  && codonVariant.get(0).variant == null)
           {
-            if (!codonVariant.isEmpty()
-                    && codonVariant.get(0).variant == null)
-            {
-              /*
-               * already recorded base value, add this variant
-               */
-              codonVariant.get(0).variant = sf;
-            }
-            else
-            {
-              /*
-               * add variant with base value
-               */
-              codonVariant.add(new DnaVariant(nucleotide, sf));
-            }
+            /*
+             * already recorded base value, add this variant
+             */
+            codonVariant.get(0).variant = sf;
           }
-          else if (codonVariant.isEmpty())
+          else
           {
             /*
-             * record (possibly non-varying) base value
+             * add variant with base value
              */
-            codonVariant.add(new DnaVariant(nucleotide));
+            codonVariant.add(new DnaVariant(nucleotide, sf));
           }
         }
+        else if (codonVariant.isEmpty())
+        {
+          /*
+           * record (possibly non-varying) base value
+           */
+          codonVariant.add(new DnaVariant(nucleotide));
+        }
       }
     }
     return variants;
@@ -2910,9 +2894,7 @@ public class AlignmentUtils
               seqMap.getMap().getInverse());
     }
 
-    char[] fromChars = fromSeq.getSequence();
     int toStart = seq.getStart();
-    char[] toChars = seq.getSequence();
 
     /*
      * traverse [start, end, start, end...] ranges in fromSeq
@@ -2943,10 +2925,10 @@ public class AlignmentUtils
          * of the next character of the mapped-to sequence; stop when all
          * the characters of the range have been counted
          */
-        while (mappedCharPos <= range[1] && fromCol <= fromChars.length
+        while (mappedCharPos <= range[1] && fromCol <= fromSeq.getLength()
                 && fromCol >= 0)
         {
-          if (!Comparison.isGap(fromChars[fromCol - 1]))
+          if (!Comparison.isGap(fromSeq.getCharAt(fromCol - 1)))
           {
             /*
              * mapped from sequence has a character in this column
@@ -2958,7 +2940,7 @@ public class AlignmentUtils
               seqsMap = new HashMap<SequenceI, Character>();
               map.put(fromCol, seqsMap);
             }
-            seqsMap.put(seq, toChars[mappedCharPos - toStart]);
+            seqsMap.put(seq, seq.getCharAt(mappedCharPos - toStart));
             mappedCharPos++;
           }
           fromCol += (forward ? 1 : -1);