JAL-1705 tests added, minor bugfix and refactoring

author gmungoc <g.m.carstairs@dundee.ac.uk>

Thu, 28 Jan 2016 15:00:17 +0000 (15:00 +0000)

committer gmungoc <g.m.carstairs@dundee.ac.uk>

Thu, 28 Jan 2016 15:00:17 +0000 (15:00 +0000)
author gmungoc <g.m.carstairs@dundee.ac.uk>
Thu, 28 Jan 2016 15:00:17 +0000 (15:00 +0000)
committer gmungoc <g.m.carstairs@dundee.ac.uk>
Thu, 28 Jan 2016 15:00:17 +0000 (15:00 +0000)
diff --git a/src/jalview/ext/ensembl/EnsemblCds.java b/src/jalview/ext/ensembl/EnsemblCds.java

index 7d0b6fd..22c0a06 100644 (file)
--- a/src/jalview/ext/ensembl/EnsemblCds.java
+++ b/src/jalview/ext/ensembl/EnsemblCds.java
@@ -1,8 +1,11 @@
  package jalview.ext.ensembl;
  
  import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
  import jalview.io.gff.SequenceOntology;
  
+import java.util.List;
+
  public class EnsemblCds extends EnsemblSeqProxy
  {
    /*
@@ -76,4 +79,18 @@ public class EnsemblCds extends EnsemblSeqProxy
      return false;
    }
  
+  /**
+   * Overrides this method to trivially return a range which is the whole of the
+   * nucleotide sequence. This is both faster than scanning for CDS features,
+   * and also means we don't need to keep CDS features on CDS sequence (where
+   * they are redundant information).
+   */
+  @Override
+  protected int getCdsRanges(SequenceI dnaSeq, List<int[]> ranges)
+  {
+    int len = dnaSeq.getLength();
+    ranges.add(new int[] { 1, len });
+    return len;
+  }
+
  }
diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java

index fb0b01c..e241874 100644 (file)
--- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java
+++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java
@@ -296,29 +296,62 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
     */
    protected MapList mapCdsToProtein(SequenceI dnaSeq, SequenceI proteinSeq)
    {
-    SequenceFeature[] sfs = dnaSeq.getSequenceFeatures();
-    if (sfs == null)
+    List<int[]> ranges = new ArrayList<int[]>(50);
+
+    int mappedDnaLength = getCdsRanges(dnaSeq, ranges);
+
+    int proteinLength = proteinSeq.getLength();
+    List<int[]> proteinRange = new ArrayList<int[]>();
+    int proteinStart = 1;
+
+    /*
+     * incomplete start codon may mean X at start of peptide
+     * we ignore both for mapping purposes
+     */
+    if (proteinSeq.getCharAt(0) == 'X')
      {
-      return null;
+      proteinStart = 2;
+      proteinLength--;
      }
+    proteinRange.add(new int[] { proteinStart, proteinLength });
  
-    List<int[]> ranges = new ArrayList<int[]>(50);
-    SequenceOntology so = SequenceOntology.getInstance();
-
-    int mappedDnaLength = 0;
-    
      /*
-     * Map CDS columns of dna to peptide. No need to worry about reverse strand
-     * dna here since the retrieved sequence is as transcribed (reverse
-     * complement for reverse strand), i.e in the same sense as the peptide. 
+     * dna length should map to protein (or protein plus stop codon)
       */
-    boolean fivePrimeIncomplete = false;
+    int codesForResidues = mappedDnaLength / 3;
+    if (codesForResidues == proteinLength
+            || codesForResidues == (proteinLength + 1))
+    {
+      return new MapList(ranges, proteinRange, 3, 1);
+    }
+    return null;
+  }
+
+  /**
+   * Adds CDS ranges to the ranges list, and returns the total length mapped.
+   * 
+   * No need to worry about reverse strand dna here since the retrieved sequence
+   * is as transcribed (reverse complement for reverse strand), i.e in the same
+   * sense as the peptide.
+   * 
+   * @param dnaSeq
+   * @param ranges
+   * @return
+   */
+  protected int getCdsRanges(SequenceI dnaSeq, List<int[]> ranges)
+  {
+    SequenceFeature[] sfs = dnaSeq.getSequenceFeatures();
+    if (sfs == null)
+    {
+      return 0;
+    }
+    int mappedDnaLength = 0;
      for (SequenceFeature sf : sfs)
      {
        /*
         * process a CDS feature (or a sub-type of CDS)
         */
-      if (so.isA(sf.getType(), SequenceOntology.CDS))
+      if (SequenceOntology.getInstance().isA(sf.getType(), SequenceOntology.CDS))
        {
          int phase = 0;
          try {
@@ -335,7 +368,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
          int end = sf.getEnd();
          if (ranges.isEmpty() && phase > 0)
          {
-          fivePrimeIncomplete = true;
            begin += phase;
            if (begin > end)
            {
@@ -346,26 +378,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
          mappedDnaLength += Math.abs(end - begin) + 1;
        }
      }
-    int proteinLength = proteinSeq.getLength();
-    List<int[]> proteinRange = new ArrayList<int[]>();
-    int proteinStart = 1;
-    if (fivePrimeIncomplete && proteinSeq.getCharAt(0) == 'X')
-    {
-      proteinStart = 2;
-      proteinLength--;
-    }
-    proteinRange.add(new int[] { proteinStart, proteinLength });
-
-    /*
-     * dna length should map to protein (or protein plus stop codon)
-     */
-    int codesForResidues = mappedDnaLength / 3;
-    if (codesForResidues == proteinLength
-            || codesForResidues == (proteinLength + 1))
-    {
-      return new MapList(ranges, proteinRange, 3, 1);
-    }
-    return null;
+    return mappedDnaLength;
    }
  
    /**
@@ -834,7 +847,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
        peptide = peptide.getDatasetSequence();
      }
    
-    mapExonsToProtein(dnaSeq, peptide, dnaToProtein);
+    mapExonFeaturesToProtein(dnaSeq, peptide, dnaToProtein);
  
      LinkedHashMap<Integer, String[][]> variants = buildDnaVariantsMap(
              dnaSeq, dnaToProtein);
@@ -852,7 +865,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
                residue);
        if (!peptideVariants.isEmpty())
        {
-        Collections.sort(peptideVariants);
          String desc = StringUtils.listToDelimitedString(peptideVariants,
                  ", ");
          SequenceFeature sf = new SequenceFeature(
@@ -875,7 +887,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
     * @param peptide
     * @param dnaToProtein
     */
-  static int mapExonsToProtein(SequenceI dnaSeq, SequenceI peptide,
+  static int mapExonFeaturesToProtein(SequenceI dnaSeq, SequenceI peptide,
            MapList dnaToProtein)
    {
      SequenceFeature[] sfs = dnaSeq.getSequenceFeatures();
@@ -930,6 +942,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
        return variants;
      }
    
+    int dnaStart = dnaSeq.getStart();
      int[] lastCodon = null;
      int lastPeptidePostion = 0;
    
@@ -985,7 +998,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
          for (int codonPos = 0; codonPos < 3; codonPos++)
          {
            String nucleotide = String.valueOf(dnaSeq
-                  .getCharAt(codon[codonPos] - 1));
+                  .getCharAt(codon[codonPos] - dnaStart));
            if (codon[codonPos] == dnaCol)
            {
              /*
@@ -1011,11 +1024,11 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
    }
  
    /**
-   * Returns a non-redundant list of all peptide translations generated by the
-   * given dna variants, excluding the current residue value
+   * Returns a sorted, non-redundant list of all peptide translations generated
+   * by the given dna variants, excluding the current residue value
     * 
     * @param codonVariants
-   *          an array of base values for codon positions 1, 2, 3
+   *          an array of base values (acgtACGT) for codon positions 1, 2, 3
     * @param residue
     *          the current residue translation
     * @return
@@ -1036,13 +1049,37 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
            String peptide = codon.contains("-") ? "-" : ResidueProperties
                    .codonTranslate(codon);
            if (peptide != null && !result.contains(peptide)
-                  && !peptide.equals(residue))
+                  && !peptide.equalsIgnoreCase(residue))
            {
              result.add(peptide);
            }
          }
        }
      }
+
+    /*
+     * sort alphabetically with STOP at the end
+     */
+    Collections.sort(result, new Comparator<String>()
+    {
+
+      @Override
+      public int compare(String o1, String o2)
+      {
+        if ("STOP".equals(o1))
+        {
+          return 1;
+        }
+        else if ("STOP".equals(o2))
+        {
+          return -1;
+        }
+        else
+        {
+          return o1.compareTo(o2);
+        }
+      }
+    });
      return result;
    }
  
diff --git a/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java b/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java

index 10ecfe0..c525e95 100644 (file)
--- a/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java
+++ b/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java
@@ -1,5 +1,7 @@
  package jalview.ext.ensembl;
  
+import static org.testng.AssertJUnit.assertEquals;
+
  import jalview.datamodel.Alignment;
  import jalview.datamodel.AlignmentI;
  import jalview.datamodel.SequenceI;
@@ -208,5 +210,74 @@ public class EnsemblSeqProxyTest
              + (isAvailable ? "UP!"
                      : "DOWN or unreachable ******************* BAD!"));
    }
-  // todo lots of tests
+
+  /**
+   * Tests for the method that computes all peptide variants given codon
+   * variants
+   */
+  @Test(groups = "Functional")
+  public void testComputePeptideVariants()
+  {
+    String[][] codonVariants = new String[][] { { "A" }, { "G" }, { "T" } };
+
+    /*
+     * AGT codes for S - this is not included in the variants returned
+     */
+    List<String> variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
+    assertEquals("[]", variants.toString());
+
+    // S is reported if it differs from the current value (A):
+    variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "A");
+    assertEquals("[S]", variants.toString());
+
+    /*
+     * synonymous variant is not reported
+     */
+    codonVariants = new String[][] { { "A" }, { "G" }, { "C", "T" } };
+    // AGC and AGT both code for S
+    variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "s");
+    assertEquals("[]", variants.toString());
+
+    /*
+     * equivalent variants are only reported once
+     */
+    codonVariants = new String[][] { { "C" }, { "T" },
+        { "A", "C", "G", "T" } };
+    // CTA CTC CTG CTT all code for L
+    variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
+    assertEquals("[L]", variants.toString());
+
+    /*
+     * vary codons 1 and 2; variant products are sorted and non-redundant
+     */
+    codonVariants = new String[][] { { "a", "C" }, { "g", "T" }, { "A" } };
+    // aga ata cga cta code for R, I, R, L
+    variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
+    assertEquals("[I, L, R]", variants.toString());
+
+    /*
+     * vary codons 2 and 3
+     */
+    codonVariants = new String[][] { { "a" }, { "g", "T" }, { "A", "c" } };
+    // aga agc ata atc code for R, S, I, I
+    variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
+    assertEquals("[I, R]", variants.toString());
+
+    /*
+     * vary codons 1 and 3
+     */
+    codonVariants = new String[][] { { "a", "t" }, { "a" }, { "t", "g" } };
+    // aat aag tat tag code for N, K, Y, STOP - STOP sorted to end
+    variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
+    assertEquals("[K, N, Y, STOP]", variants.toString());
+
+    /*
+     * vary codons 1, 2 and 3
+     */
+    codonVariants = new String[][] { { "a", "t" }, { "G", "C" },
+        { "t", "g" } };
+    // agt agg act acg tgt tgg tct tcg code for S, R, T, T, C, W, S, S
+    variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
+    assertEquals("[C, R, T, W]", variants.toString());
+  }
  }
\ No newline at end of file
author	gmungoc <g.m.carstairs@dundee.ac.uk>
	Thu, 28 Jan 2016 15:00:17 +0000 (15:00 +0000)
committer	gmungoc <g.m.carstairs@dundee.ac.uk>
	Thu, 28 Jan 2016 15:00:17 +0000 (15:00 +0000)
src/jalview/ext/ensembl/EnsemblCds.java		patch \| blob \| history
src/jalview/ext/ensembl/EnsemblSeqProxy.java		patch \| blob \| history
test/jalview/ext/ensembl/EnsemblSeqProxyTest.java		patch \| blob \| history