Merge branch 'bug/JAL-2920_uniprotvariantfeature' into releases/Release_2_10_4_Branch
author Jim Procter <jprocter@issues.jalview.org>
Wed, 6 Jun 2018 15:12:58 +0000 (16:12 +0100)
committer Jim Procter <jprocter@issues.jalview.org>
Wed, 6 Jun 2018 15:12:58 +0000 (16:12 +0100)
resources/uniprot_mapping.xml
src/jalview/datamodel/xdb/uniprot/UniprotFeature.java
src/jalview/ws/dbsources/Uniprot.java
test/jalview/ws/dbsources/UniprotTest.java

index 4c9ad5f..68868c4 100755 (executable)
@@ -18,6 +18,7 @@
  * The Jalview Authors are detailed in the 'AUTHORS' file.
 -->
 <mapping>
+  <!-- see https://www.uniprot.org/docs/uniprot.xsd for latest Uniprot XML schema -->
        <class name="jalview.datamodel.xdb.uniprot.UniprotFile">
                  <map-to xml="uniprot"/>               
                  <field name="UniprotEntries" type="jalview.datamodel.xdb.uniprot.UniprotEntry" collection="vector">
index 3bae87e..8bd5652 100644 (file)
@@ -53,40 +53,7 @@ public class UniprotFeature
 
   public String getDescription()
   {
-    if (description == null && variation == null && original == null)
-    {
-      return null;
-    }
-    StringBuilder sb = new StringBuilder();
-    if (description != null)
-    {
-      sb.append(description);
-    }
-    if (variation != null && variation.size() > 0)
-    {
-      int i = 0;
-      for (String var : variation)
-      {
-        if (i++ > 0)
-        {
-          sb.append(",");
-        }
-        if (sb.length() > 0)
-        {
-          sb.append(" ");
-        }
-        sb.append("Variation: '" + var + "'");
-      }
-    }
-    if (original != null)
-    {
-      if (sb.length() > 0)
-      {
-        sb.append(" ");
-      }
-      sb.append("Original: '" + original + "'");
-    }
-    return sb.toString();
+    return description;
   }
 
   public void setDescription(String d)
index 6b09eb6..c311ea9 100644 (file)
@@ -32,6 +32,8 @@ import jalview.datamodel.SequenceI;
 import jalview.datamodel.xdb.uniprot.UniprotEntry;
 import jalview.datamodel.xdb.uniprot.UniprotFeature;
 import jalview.datamodel.xdb.uniprot.UniprotFile;
+import jalview.schemes.ResidueProperties;
+import jalview.util.StringUtils;
 import jalview.ws.seqfetcher.DbSourceProxyImpl;
 
 import java.io.InputStream;
@@ -40,6 +42,7 @@ import java.io.Reader;
 import java.net.URL;
 import java.net.URLConnection;
 import java.util.ArrayList;
+import java.util.List;
 import java.util.Vector;
 
 import org.exolab.castor.mapping.Mapping;
@@ -278,7 +281,7 @@ public class Uniprot extends DbSourceProxyImpl
       for (UniprotFeature uf : entry.getFeature())
       {
         SequenceFeature copy = new SequenceFeature(uf.getType(),
-                uf.getDescription(), uf.getBegin(), uf.getEnd(), "Uniprot");
+                getDescription(uf), uf.getBegin(), uf.getEnd(), "Uniprot");
         copy.setStatus(uf.getStatus());
         sequence.addSequenceFeature(copy);
       }
@@ -291,6 +294,86 @@ public class Uniprot extends DbSourceProxyImpl
   }
 
   /**
+   * Constructs a feature description from the description and (optionally)
+   * original and variant fields of the Uniprot XML feature
+   * 
+   * @param uf
+   * @return
+   */
+  protected static String getDescription(UniprotFeature uf)
+  {
+    String orig = uf.getOriginal();
+    List<String> variants = uf.getVariation();
+    StringBuilder sb = new StringBuilder();
+
+    /*
+     * append variant in standard format if present
+     * e.g. p.Arg59Lys
+     */
+    if (orig != null && !orig.isEmpty() && variants != null
+            && !variants.isEmpty())
+    {
+      int p = 0;
+      for (String var : variants)
+      {
+        // TODO proper HGVS nomenclature for delins structural variations
+        // http://varnomen.hgvs.org/recommendations/protein/variant/delins/
+        // for now we are pragmatic - any orig/variant sequence longer than
+        // three characters is shown with single-character notation rather than
+        // three-letter notation
+        sb.append("p.");
+        if (orig.length() < 4)
+        {
+          for (int c = 0, clen = orig.length(); c < clen; c++)
+          {
+            char origchar = orig.charAt(c);
+            String orig3 = ResidueProperties.aa2Triplet.get("" + origchar);
+            sb.append(orig3 == null ? origchar
+                    : StringUtils.toSentenceCase(orig3));
+          }
+        }
+        else
+        {
+          sb.append(orig);
+        }
+
+        sb.append(Integer.toString(uf.getPosition()));
+
+        if (var.length() < 4)
+        {
+          for (int c = 0, clen = var.length(); c < clen; c++)
+          {
+            char varchar = var.charAt(c);
+            String var3 = ResidueProperties.aa2Triplet.get("" + varchar);
+
+            sb.append(var3 != null ? StringUtils.toSentenceCase(var3)
+                    : "" + varchar);
+          }
+        }
+        else
+        {
+          sb.append(var);
+        }
+        if (++p != variants.size())
+        {
+          sb.append("\n");
+        }
+        else
+        {
+          sb.append(" ");
+        }
+      }
+    }
+    String description = uf.getDescription();
+    if (description != null)
+    {
+      sb.append(description);
+    }
+
+    return sb.toString();
+  }
+
+  /**
    * 
    * @param entry
    *          UniportEntry
index b70e581..ab79f10 100644 (file)
@@ -35,6 +35,7 @@ import java.io.Reader;
 import java.io.StringReader;
 import java.util.Vector;
 
+import org.testng.Assert;
 import org.testng.annotations.BeforeClass;
 import org.testng.annotations.Test;
 
@@ -67,6 +68,8 @@ public class UniprotTest
           + "<feature type=\"sequence variant\" description=\"Pathogenic\"><original>M</original><variation>L</variation><location><position position=\"41\"/></location></feature>"
           + "<feature type=\"sequence variant\" description=\"Pathogenic\"><original>M</original><location><position position=\"41\"/></location></feature>"
           + "<feature type=\"sequence variant\" description=\"Foo\"><variation>L</variation><variation>LMV</variation><original>M</original><location><position position=\"42\"/></location></feature>"
+          + "<feature type=\"sequence variant\" description=\"Foo\"><variation>LL</variation><variation>LMV</variation><original>ML</original><location><begin position=\"42\"/><end position=\"43\"/></location></feature>"
+          + "<feature type=\"sequence variant\" description=\"Foo Too\"><variation>LL</variation><variation>LMVK</variation><original>MLML</original><location><begin position=\"42\"/><end position=\"45\"/></location></feature>"
           + "<sequence length=\"10\" mass=\"27410\" checksum=\"8CB760AACF88FE6C\" modified=\"2008-01-15\" version=\"1\">MHAPL VSKDL</sequence></entry>"
           + "</uniprot>";
 
@@ -102,7 +105,7 @@ public class UniprotTest
      * Check sequence features
      */
     Vector<UniprotFeature> features = entry.getFeature();
-    assertEquals(7, features.size());
+    assertEquals(9, features.size());
     UniprotFeature sf = features.get(0);
     assertEquals("signal peptide", sf.getType());
     assertNull(sf.getDescription());
@@ -124,33 +127,47 @@ public class UniprotTest
 
     sf = features.get(3);
     assertEquals("sequence variant", sf.getType());
-    assertEquals("Variation: 'L' Original: 'M'", sf.getDescription());
+    assertNull(sf.getDescription());
     assertEquals(41, sf.getPosition());
     assertEquals(41, sf.getBegin());
     assertEquals(41, sf.getEnd());
 
     sf = features.get(4);
     assertEquals("sequence variant", sf.getType());
-    assertEquals("Pathogenic Variation: 'L' Original: 'M'",
-            sf.getDescription());
+    assertEquals("Pathogenic", sf.getDescription());
     assertEquals(41, sf.getPosition());
     assertEquals(41, sf.getBegin());
     assertEquals(41, sf.getEnd());
 
     sf = features.get(5);
     assertEquals("sequence variant", sf.getType());
-    assertEquals("Pathogenic Original: 'M'", sf.getDescription());
+    assertEquals("Pathogenic", sf.getDescription());
     assertEquals(41, sf.getPosition());
     assertEquals(41, sf.getBegin());
     assertEquals(41, sf.getEnd());
 
     sf = features.get(6);
     assertEquals("sequence variant", sf.getType());
-    assertEquals("Foo Variation: 'L', Variation: 'LMV' Original: 'M'",
+    assertEquals("Foo",
             sf.getDescription());
     assertEquals(42, sf.getPosition());
     assertEquals(42, sf.getBegin());
     assertEquals(42, sf.getEnd());
+    Assert.assertEquals(Uniprot.getDescription(sf),
+            "p.Met42Leu" + "\n" + "p.Met42LeuMetVal Foo");
+
+    sf = features.get(7);
+    assertEquals(42, sf.getBegin());
+    assertEquals(43, sf.getEnd());
+    Assert.assertEquals(Uniprot.getDescription(sf),
+            "p.MetLeu42LeuLeu" + "\n" + "p.MetLeu42LeuMetVal Foo");
+
+    sf = features.get(8);
+    assertEquals(42, sf.getBegin());
+    assertEquals(45, sf.getEnd());
+    Assert.assertEquals(Uniprot.getDescription(sf),
+            "p.MLML42LeuLeu" + "\n" + "p.MLML42LMVK Foo Too");
+
     /*
      * Check cross-references
      */
@@ -219,4 +236,29 @@ public class UniprotTest
     assertEquals(expectedDescription,
             Uniprot.getUniprotEntryDescription(entry));
   }
+
+  @Test(groups = { "Functional" })
+  public void testGetDescription()
+  {
+    UniprotFeature uf = new UniprotFeature();
+    assertEquals("", Uniprot.getDescription(uf));
+    // description only - returned unchanged
+    uf.setDescription("Hello");
+    assertEquals("Hello", Uniprot.getDescription(uf));
+    // original and variant present - variant notation precedes the description
+    uf.setPosition(23);
+    uf.setOriginal("K");
+    Vector<String> vars = new Vector<String>();
+    vars.add("y");
+    uf.setVariation(vars);
+    assertEquals("p.Lys23Tyr Hello", Uniprot.getDescription(uf));
+
+    vars.clear();
+    vars.add("z"); // unknown variant - fails gracefully
+    uf.setVariation(vars);
+    assertEquals("p.Lys23z Hello", Uniprot.getDescription(uf));
+
+    uf.setVariation(null); // variant missing - is ignored
+    assertEquals("Hello", Uniprot.getDescription(uf));
+  }
 }