JAL-2029 many-to-many EnsemblCDS-to-Uniprot mappings

author gmungoc <g.m.carstairs@dundee.ac.uk>

Mon, 21 Mar 2016 14:42:51 +0000 (14:42 +0000)

committer gmungoc <g.m.carstairs@dundee.ac.uk>

Mon, 21 Mar 2016 14:42:51 +0000 (14:42 +0000)
author gmungoc <g.m.carstairs@dundee.ac.uk>
Mon, 21 Mar 2016 14:42:51 +0000 (14:42 +0000)
committer gmungoc <g.m.carstairs@dundee.ac.uk>
Mon, 21 Mar 2016 14:42:51 +0000 (14:42 +0000)
diff --git a/src/jalview/analysis/CrossRef.java b/src/jalview/analysis/CrossRef.java

index 7d09a3b..d48c14a 100644 (file)
--- a/src/jalview/analysis/CrossRef.java
+++ b/src/jalview/analysis/CrossRef.java
@@ -361,6 +361,9 @@ public class CrossRef
              {
                updateDbrefMappings(dna, seq, xrfs, retrieved, cf);
  
+              SequenceIdMatcher matcher = new SequenceIdMatcher(
+                      dataset.getSequences());
+              matcher.addAll(addedPeers);
                List<SequenceFeature> copiedFeatures = new ArrayList<SequenceFeature>();
                CrossRef me = new CrossRef();
                for (int rs = 0; rs < retrieved.length; rs++)
@@ -378,8 +381,16 @@ public class CrossRef
                      {
                        if (map.getTo() != null && map.getMap() != null)
                        {
-                        // should search the local dataset to find any existing
-                        // candidates for To !
+                        SequenceI matched = matcher
+                                .findIdMatch(map.getTo());
+                        if (matched != null)
+                        {
+                          map.setTo(matched);
+                        }
+                        else
+                        {
+                          matcher.add(map.getTo());
+                        }
                          try
                          {
                            // compare ms with dss and replace with dss in mapping
@@ -433,7 +444,10 @@ public class CrossRef
                            }
                            else
                            {
-                            addedPeers.add(map.getTo());
+                            if (!addedPeers.contains(map.getTo()))
+                            {
+                              addedPeers.add(map.getTo());
+                            }
                              cf.addMap(retrieved[rs].getDatasetSequence(),
                                      map.getTo(), map.getMap());
                            }
diff --git a/src/jalview/analysis/SequenceIdMatcher.java b/src/jalview/analysis/SequenceIdMatcher.java

index b89287c..70defb0 100755 (executable)
--- a/src/jalview/analysis/SequenceIdMatcher.java
+++ b/src/jalview/analysis/SequenceIdMatcher.java
@@ -46,7 +46,7 @@ public class SequenceIdMatcher
    }
  
    /**
-   * add more sequences to this matcher - also used by the constructor
+   * Adds sequences to this matcher
     * 
     * @param seqs
     */
@@ -54,26 +54,36 @@ public class SequenceIdMatcher
    {
      for (SequenceI seq : seqs)
      {
-      // TODO: deal with ID collisions - SequenceI should be appended to list
-      // associated with this key.
-      names.put(new SeqIdName(seq.getDisplayId(true)), seq);
-      SequenceI dbseq = seq;
-      while (dbseq.getDatasetSequence() != null)
-      {
-        dbseq = dbseq.getDatasetSequence();
-      }
-      // add in any interesting identifiers
-      if (dbseq.getDBRefs() != null)
+      add(seq);
+    }
+  }
+
+  /**
+   * Adds one sequence to this matcher
+   * 
+   * @param seq
+   */
+  public void add(SequenceI seq)
+  {
+    // TODO: deal with ID collisions - SequenceI should be appended to list
+    // associated with this key.
+    names.put(new SeqIdName(seq.getDisplayId(true)), seq);
+    SequenceI dbseq = seq;
+    while (dbseq.getDatasetSequence() != null)
+    {
+      dbseq = dbseq.getDatasetSequence();
+    }
+    // add in any interesting identifiers
+    if (dbseq.getDBRefs() != null)
+    {
+      DBRefEntry dbr[] = dbseq.getDBRefs();
+      SeqIdName sid = null;
+      for (int r = 0; r < dbr.length; r++)
        {
-        DBRefEntry dbr[] = dbseq.getDBRefs();
-        SeqIdName sid = null;
-        for (int r = 0; r < dbr.length; r++)
+        sid = new SeqIdName(dbr[r].getAccessionId());
+        if (!names.containsKey(sid))
          {
-          sid = new SeqIdName(dbr[r].getAccessionId());
-          if (!names.containsKey(sid))
-          {
-            names.put(sid, seq);
-          }
+          names.put(sid, seq);
          }
        }
      }
diff --git a/src/jalview/datamodel/xdb/embl/EmblEntry.java b/src/jalview/datamodel/xdb/embl/EmblEntry.java

index 87e2789..7da6d6c 100644 (file)
--- a/src/jalview/datamodel/xdb/embl/EmblEntry.java
+++ b/src/jalview/datamodel/xdb/embl/EmblEntry.java
@@ -20,6 +20,7 @@
   */
  package jalview.datamodel.xdb.embl;
  
+import jalview.analysis.SequenceIdMatcher;
  import jalview.datamodel.DBRefEntry;
  import jalview.datamodel.DBRefSource;
  import jalview.datamodel.FeatureProperties;
@@ -27,21 +28,32 @@ import jalview.datamodel.Mapping;
  import jalview.datamodel.Sequence;
  import jalview.datamodel.SequenceFeature;
  import jalview.datamodel.SequenceI;
+import jalview.util.DBRefUtils;
+import jalview.util.MapList;
+import jalview.util.MappingUtils;
+import jalview.util.StringUtils;
  
  import java.util.Hashtable;
+import java.util.List;
+import java.util.Map;
  import java.util.Map.Entry;
  import java.util.Vector;
+import java.util.regex.Pattern;
  
  /**
   * Data model for one entry returned from an EMBL query, as marshalled by a
   * Castor binding file
   * 
- * For example: http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/embl/x53828/emblxml
+ * For example:
+ * http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=ena_sequence&id=J03321
+ * &format=emblxml
   * 
   * @see embl_mapping.xml
   */
  public class EmblEntry
  {
+  private static final Pattern SPACE_PATTERN = Pattern.compile(" ");
+
    String accession;
  
    String version;
@@ -251,207 +263,48 @@ public class EmblEntry
      this.version = version;
    }
  
-  /*
-   * EMBL Feature support is limited. The text below is included for the benefit
-   * of any developer working on improving EMBL feature import in Jalview.
-   * Extract from EMBL feature specification see
-   * http://www.embl-ebi.ac.uk/embl/Documentation
-   * /FT_definitions/feature_table.html 3.5 Location 3.5.1 Purpose
-   * 
-   * The location indicates the region of the presented sequence which
-   * corresponds to a feature.
-   * 
-   * 3.5.2 Format and conventions The location contains at least one sequence
-   * location descriptor and may contain one or more operators with one or more
-   * sequence location descriptors. Base numbers refer to the numbering in the
-   * entry. This numbering designates the first base (5' end) of the presented
-   * sequence as base 1. Base locations beyond the range of the presented
-   * sequence may not be used in location descriptors, the only exception being
-   * location in a remote entry (see 3.5.2.1, e).
-   * 
-   * Location operators and descriptors are discussed in more detail below.
-   * 
-   * 3.5.2.1 Location descriptors
-   * 
-   * The location descriptor can be one of the following: (a) a single base
-   * number (b) a site between two indicated adjoining bases (c) a single base
-   * chosen from within a specified range of bases (not allowed for new entries)
-   * (d) the base numbers delimiting a sequence span (e) a remote entry
-   * identifier followed by a local location descriptor (i.e., a-d)
-   * 
-   * A site between two adjoining nucleotides, such as endonucleolytic cleavage
-   * site, is indicated by listing the two points separated by a carat (^). The
-   * permitted formats for this descriptor are n^n+1 (for example 55^56), or,
-   * for circular molecules, n^1, where "n" is the full length of the molecule,
-   * ie 1000^1 for circular molecule with length 1000.
-   * 
-   * A single base chosen from a range of bases is indicated by the first base
-   * number and the last base number of the range separated by a single period
-   * (e.g., '12.21' indicates a single base taken from between the indicated
-   * points). From October 2006 the usage of this descriptor is restricted : it
-   * is illegal to use "a single base from a range" (c) either on its own or in
-   * combination with the "sequence span" (d) descriptor for newly created
-   * entries. The existing entries where such descriptors exist are going to be
-   * retrofitted.
-   * 
-   * Sequence spans are indicated by the starting base number and the ending
-   * base number separated by two periods (e.g., '34..456'). The '<' and '>'
-   * symbols may be used with the starting and ending base numbers to indicate
-   * that an end point is beyond the specified base number. The starting and
-   * ending base positions can be represented as distinct base numbers
-   * ('34..456') or a site between two indicated adjoining bases.
-   * 
-   * A location in a remote entry (not the entry to which the feature table
-   * belongs) can be specified by giving the accession-number and sequence
-   * version of the remote entry, followed by a colon ":", followed by a
-   * location descriptor which applies to that entry's sequence (i.e.
-   * J12345.1:1..15, see also examples below)
-   * 
-   * 3.5.2.2 Operators
-   * 
-   * The location operator is a prefix that specifies what must be done to the
-   * indicated sequence to find or construct the location corresponding to the
-   * feature. A list of operators is given below with their definitions and most
-   * common format.
-   * 
-   * complement(location) Find the complement of the presented sequence in the
-   * span specified by " location" (i.e., read the complement of the presented
-   * strand in its 5'-to-3' direction)
-   * 
-   * join(location,location, ... location) The indicated elements should be
-   * joined (placed end-to-end) to form one contiguous sequence
-   * 
-   * order(location,location, ... location) The elements can be found in the
-   * specified order (5' to 3' direction), but nothing is implied about the
-   * reasonableness about joining them
-   * 
-   * Note : location operator "complement" can be used in combination with
-   * either " join" or "order" within the same location; combinations of "join"
-   * and "order" within the same location (nested operators) are illegal.
-   * 
-   * 
-   * 
-   * 3.5.3 Location examples
-   * 
-   * The following is a list of common location descriptors with their meanings:
-   * 
-   * Location Description
-   * 
-   * 467 Points to a single base in the presented sequence
-   * 
-   * 340..565 Points to a continuous range of bases bounded by and including the
-   * starting and ending bases
-   * 
-   * <345..500 Indicates that the exact lower boundary point of a feature is
-   * unknown. The location begins at some base previous to the first base
-   * specified (which need not be contained in the presented sequence) and
-   * continues to and includes the ending base
-   * 
-   * <1..888 The feature starts before the first sequenced base and continues to
-   * and includes base 888
-   * 
-   * 1..>888 The feature starts at the first sequenced base and continues beyond
-   * base 888
-   * 
-   * 102.110 Indicates that the exact location is unknown but that it is one of
-   * the bases between bases 102 and 110, inclusive
-   * 
-   * 123^124 Points to a site between bases 123 and 124
-   * 
-   * join(12..78,134..202) Regions 12 to 78 and 134 to 202 should be joined to
-   * form one contiguous sequence
-   * 
-   * 
-   * complement(34..126) Start at the base complementary to 126 and finish at
-   * the base complementary to base 34 (the feature is on the strand
-   * complementary to the presented strand)
-   * 
-   * 
-   * complement(join(2691..4571,4918..5163)) Joins regions 2691 to 4571 and 4918
-   * to 5163, then complements the joined segments (the feature is on the strand
-   * complementary to the presented strand)
-   * 
-   * join(complement(4918..5163),complement(2691..4571)) Complements regions
-   * 4918 to 5163 and 2691 to 4571, then joins the complemented segments (the
-   * feature is on the strand complementary to the presented strand)
-   * 
-   * J00194.1:100..202 Points to bases 100 to 202, inclusive, in the entry (in
-   * this database) with primary accession number 'J00194'
-   * 
-   * join(1..100,J00194.1:100..202) Joins region 1..100 of the existing entry
-   * with the region 100..202 of remote entry J00194
-   */
    /**
     * Recover annotated sequences from EMBL file
     * 
-   * @param noNa
-   *          don't return nucleic acid sequences
     * @param sourceDb
-   *          TODO
-   * @param noProtein
-   *          don't return any translated protein sequences marked in features
-   * @return dataset sequences with DBRefs and features - DNA always comes first
+   * @param peptides
+   *          a list of protein products found so far (to add to)
+   * @return dna dataset sequence with DBRefs and features
     */
-  public jalview.datamodel.SequenceI[] getSequences(boolean noNa,
-          boolean noPeptide, String sourceDb)
-  { // TODO: ensure emblEntry.getSequences behaves correctly for returning all
-    // cases of noNa and noPeptide
-    Vector<SequenceI> seqs = new Vector<SequenceI>();
-    Sequence dna = null;
-    if (!noNa)
+  public SequenceI getSequence(String sourceDb, List<SequenceI> peptides)
+  {
+    SequenceI dna = new Sequence(sourceDb + "|" + accession,
+            sequence.getSequence());
+    dna.setDescription(desc);
+    DBRefEntry retrievedref = new DBRefEntry(sourceDb, version, accession);
+    dna.addDBRef(retrievedref);
+    // add map to indicate the sequence is a valid coordinate frame for the
+    // dbref
+    retrievedref.setMap(new Mapping(null, new int[] { 1, dna.getLength() },
+            new int[] { 1, dna.getLength() }, 1, 1));
+    // TODO: transform EMBL Database refs to canonical form
+    if (dbRefs != null)
      {
-      // In theory we still need to create this if noNa is set to avoid a null
-      // pointer exception
-      dna = new Sequence(sourceDb + "|" + accession, sequence.getSequence());
-      dna.setDescription(desc);
-      DBRefEntry retrievedref = new DBRefEntry(sourceDb, version, accession);
-      dna.addDBRef(retrievedref);
-      // add map to indicate the sequence is a valid coordinate frame for the
-      // dbref
-      retrievedref.setMap(new Mapping(null,
-              new int[] { 1, dna.getLength() }, new int[] { 1,
-                  dna.getLength() }, 1, 1));
-      // TODO: transform EMBL Database refs to canonical form
-      if (dbRefs != null)
+      for (DBRefEntry dbref : dbRefs)
        {
-        for (DBRefEntry dbref : dbRefs)
-        {
-          dna.addDBRef(dbref);
-        }
+        dna.addDBRef(dbref);
        }
      }
+
      try
      {
        for (EmblFeature feature : features)
        {
-        if (!noNa)
+        if (feature.dbRefs != null)
          {
-          if (feature.dbRefs != null)
+          for (DBRefEntry dbref : feature.dbRefs)
            {
-            for (DBRefEntry dbref : feature.dbRefs)
-            {
-              dna.addDBRef(dbref);
-            }
+            dna.addDBRef(dbref);
            }
          }
          if (FeatureProperties.isCodingFeature(sourceDb, feature.getName()))
          {
-          parseCodingFeature(feature, sourceDb, seqs, dna, noPeptide);
-        }
-        else
-        {
-          // General feature type.
-          // TODO this is just duplicated code ??
-          if (!noNa)
-          {
-            if (feature.dbRefs != null)
-            {
-              for (DBRefEntry dbref : feature.dbRefs)
-              {
-                dna.addDBRef(dbref);
-              }
-            }
-          }
+          parseCodingFeature(feature, sourceDb, dna, peptides);
          }
        }
      } catch (Exception e)
@@ -463,65 +316,46 @@ public class EmblEntry
        System.err.println("Resulted in exception: " + e.getMessage());
        e.printStackTrace(System.err);
      }
-    if (!noNa && dna != null)
-    {
-      seqs.add(dna);
-    }
-    SequenceI[] sqs = new SequenceI[seqs.size()];
-    for (int i = 0, j = seqs.size(); i < j; i++)
-    {
-      sqs[i] = seqs.elementAt(i);
-      seqs.set(i, null);
-    }
-    return sqs;
+
+    return dna;
    }
  
    /**
-   * attempt to extract coding region and product from a feature and properly
-   * decorate it with annotations.
+   * Extracts coding region and product from a CDS feature and properly
+   * decorates it with annotations.
     * 
     * @param feature
     *          coding feature
     * @param sourceDb
     *          source database for the EMBLXML
-   * @param seqs
-   *          place where sequences go
     * @param dna
     *          parent dna sequence for this record
-   * @param noPeptide
-   *          flag for generation of Peptide sequence objects
+   * @param peptides
+   *          list of protein product sequences for Embl entry
     */
-  private void parseCodingFeature(EmblFeature feature, String sourceDb,
-          Vector<SequenceI> seqs, Sequence dna, boolean noPeptide)
+  void parseCodingFeature(EmblFeature feature, String sourceDb,
+          SequenceI dna, List<SequenceI> peptides)
    {
      boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
-    // extract coding region(s)
-    jalview.datamodel.Mapping map = null;
-    int[] exon = null;
-    if (feature.locations != null)
-    {
-      for (EmblFeatureLocations loc : feature.locations)
-      {
-        int[] se = loc.getElementRanges(accession);
-        if (exon == null)
-        {
-          exon = se;
-        }
-        else
-        {
-          int[] t = new int[exon.length + se.length];
-          System.arraycopy(exon, 0, t, 0, exon.length);
-          System.arraycopy(se, 0, t, exon.length, se.length);
-          exon = t;
-        }
-      }
-    }
+
+    int[] exon = getCdsRanges(feature);
+
      String prseq = null;
-    String prname = new String();
+    String prname = "";
      String prid = null;
-    Hashtable<String, String> vals = new Hashtable<String, String>();
-    int prstart = 1;
-    // get qualifiers
+    Map<String, String> vals = new Hashtable<String, String>();
+    SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
+
+    /*
+     * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
+     * (phase is required for CDS features in GFF3 format)
+     */
+    int codonStart = 1;
+
+    /*
+     * parse qualifiers, saving protein translation, protein id,
+     * codon start position, product (name), and 'other values'
+     */
      if (feature.getQualifiers() != null)
      {
        for (Qualifier q : feature.getQualifiers())
@@ -529,16 +363,8 @@ public class EmblEntry
          String qname = q.getName();
          if (qname.equals("translation"))
          {
-          StringBuilder prsq = new StringBuilder(q.getValues()[0]);
-          int p = prsq.indexOf(" ");
-          while (p > -1)
-          {
-            prsq.deleteCharAt(p);
-            p = prsq.indexOf(" ", p);
-          }
-          prseq = prsq.toString();
-          prsq = null;
-
+          // remove all spaces (precompiled String.replaceAll(" ", ""))
+          prseq = SPACE_PATTERN.matcher(q.getValues()[0]).replaceAll("");
          }
          else if (qname.equals("protein_id"))
          {
@@ -546,46 +372,57 @@ public class EmblEntry
          }
          else if (qname.equals("codon_start"))
          {
-          prstart = Integer.parseInt(q.getValues()[0]);
+          try
+          {
+            codonStart = Integer.parseInt(q.getValues()[0]);
+          } catch (NumberFormatException e)
+          {
+            System.err.println("Invalid codon_start in XML for "
+                    + accession + ": " + e.getMessage());
+          }
          }
          else if (qname.equals("product"))
          {
+          // sometimes name is returned e.g. for V00488
            prname = q.getValues()[0];
          }
          else
          {
            // throw anything else into the additional properties hash
-          String[] s = q.getValues();
-          StringBuilder sb = new StringBuilder();
-          if (s != null)
+          String[] qvals = q.getValues();
+          if (qvals != null)
            {
-            for (int i = 0; i < s.length; i++)
-            {
-              sb.append(s[i]);
-              sb.append("\n");
-            }
+            String commaSeparated = StringUtils.arrayToSeparatorList(qvals,
+                    ",");
+            vals.put(qname, commaSeparated);
            }
-          vals.put(qname, sb.toString());
          }
        }
      }
-    Sequence product = null;
+
+    // SequenceI product = null;
      DBRefEntry protEMBLCDS = null;
-    exon = adjustForPrStart(prstart, exon);
+    exon = MappingUtils.removeStartPositions(codonStart - 1, exon);
      boolean noProteinDbref = true;
  
+    SequenceI product = null;
+    Mapping map = null;
      if (prseq != null && prname != null && prid != null)
      {
-      // extract proteins.
-      product = new Sequence(prid, prseq, 1, prseq.length());
-      product.setDescription(((prname.length() == 0) ? "Protein Product from "
-              + sourceDb
-              : prname));
-      if (!noPeptide)
+      /*
+       * look for product in peptides list, if not found, add it
+       */
+      product = matcher.findIdMatch(prid);
+      if (product == null)
        {
-        // Protein is also added to vector of sequences returned
-        seqs.add(product);
+        product = new Sequence(prid, prseq, 1, prseq.length());
+        product.setDescription(((prname.length() == 0) ? "Protein Product from "
+                + sourceDb
+                : prname));
+        peptides.add(product);
+        matcher.add(product);
        }
+
        // we have everything - create the mapping and perhaps the protein
        // sequence
        if (exon == null || exon.length == 0)
@@ -593,24 +430,24 @@ public class EmblEntry
          System.err
                  .println("Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
                          + sourceDb + ":" + getAccession() + ")");
-        if (prseq.length() * 3 == (1 - prstart + dna.getSequence().length))
+        if (prseq.length() * 3 == (1 - codonStart + dna.getSequence().length))
          {
            System.err
                    .println("Not allowing for additional stop codon at end of cDNA fragment... !");
            // this might occur for CDS sequences where no features are
            // marked.
-          exon = new int[] { dna.getStart() + (prstart - 1), dna.getEnd() };
-          map = new jalview.datamodel.Mapping(product, exon, new int[] { 1,
-              prseq.length() }, 3, 1);
+          exon = new int[] { dna.getStart() + (codonStart - 1), dna.getEnd() };
+          map = new Mapping(product, exon, new int[] { 1, prseq.length() },
+                  3, 1);
          }
-        if ((prseq.length() + 1) * 3 == (1 - prstart + dna.getSequence().length))
+        if ((prseq.length() + 1) * 3 == (1 - codonStart + dna.getSequence().length))
          {
            System.err
                    .println("Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
-          exon = new int[] { dna.getStart() + (prstart - 1),
+          exon = new int[] { dna.getStart() + (codonStart - 1),
                dna.getEnd() - 3 };
-          map = new jalview.datamodel.Mapping(product, exon, new int[] { 1,
-              prseq.length() }, 3, 1);
+          map = new Mapping(product, exon, new int[] { 1, prseq.length() },
+                  3, 1);
          }
        }
        else
@@ -628,11 +465,13 @@ public class EmblEntry
          }
          else
          {
-          // final product length trunctation check
-
-          map = new jalview.datamodel.Mapping(product,
-                  adjustForProteinLength(prseq.length(), exon), new int[] {
-                      1, prseq.length() }, 3, 1);
+          // final product length truncation check
+          // TODO should the 'from' range include the stop codon even if it is
+          // not in the protein, so that the CDS sequence also includes it
+          // (as done for Ensembl)?
+          int[] cdsRanges = adjustForProteinLength(prseq.length(),
+                  exon);
+          map = new Mapping(product, cdsRanges, new int[] { 1, prseq.length() }, 3, 1);
            // reconstruct the EMBLCDS entry
            // TODO: this is only necessary when there codon annotation is
            // complete (I think JBPNote)
@@ -640,12 +479,9 @@ public class EmblEntry
            pcdnaref.setAccessionId(prid);
            pcdnaref.setSource(DBRefSource.EMBLCDS);
            pcdnaref.setVersion(getVersion()); // same as parent EMBL version.
-          jalview.util.MapList mp = new jalview.util.MapList(new int[] { 1,
-              prseq.length() }, new int[] { 1 + (prstart - 1),
-              (prstart - 1) + 3 * prseq.length() }, 1, 3);
-          // { 1 + (prstart - 1) * 3,
-          // 1 + (prstart - 1) * 3 + prseq.length() * 3 - 1 }, new int[]
-          // { 1prstart, prstart + prseq.length() - 1 }, 3, 1);
+          MapList mp = new MapList(new int[] { 1, prseq.length() },
+                  new int[] { 1 + (codonStart - 1),
+                      (codonStart - 1) + 3 * prseq.length() }, 1, 3);
            pcdnaref.setMap(new Mapping(mp));
            if (product != null)
            {
@@ -653,55 +489,60 @@ public class EmblEntry
              protEMBLCDS = new DBRefEntry(pcdnaref);
              protEMBLCDS.setSource(DBRefSource.EMBLCDSProduct);
              product.addDBRef(protEMBLCDS);
-
            }
-
          }
        }
        // add cds feature to dna seq - this may include the stop codon
        for (int xint = 0; exon != null && xint < exon.length; xint += 2)
        {
-        SequenceFeature sf = new SequenceFeature();
-        sf.setBegin(exon[xint]);
-        sf.setEnd(exon[xint + 1]);
-        sf.setType(feature.getName());
+        SequenceFeature sf = makeCdsFeature(exon, xint, prname, prid, vals,
+                codonStart);
+        sf.setType(feature.getName()); // "CDS"
          sf.setFeatureGroup(sourceDb);
-        sf.setDescription("Exon " + (1 + xint / 2) + " for protein '"
-                + prname + "' EMBLCDS:" + prid);
-        sf.setValue(FeatureProperties.EXONPOS, new Integer(1 + xint));
-        sf.setValue(FeatureProperties.EXONPRODUCT, prname);
-        if (vals != null)
-        {
-          for (Entry<String, String> val : vals.entrySet())
-          {
-            sf.setValue(val.getKey(), val.getValue());
-          }
-        }
          dna.addSequenceFeature(sf);
        }
      }
      // add dbRefs to sequence
      if (feature.dbRefs != null)
      {
+      boolean productMapped = false;
        for (DBRefEntry ref : feature.dbRefs)
        {
-        ref.setSource(jalview.util.DBRefUtils.getCanonicalName(ref
-                .getSource()));
+        ref.setSource(DBRefUtils.getCanonicalName(ref.getSource()));
          // Hard code the kind of protein product accessions that EMBL cite
-        if (ref.getSource().equals(jalview.datamodel.DBRefSource.UNIPROT))
+        if (ref.getSource().equals(DBRefSource.UNIPROT))
          {
+          String refSeqName = DBRefSource.UNIPROT + "|"
+                  + ref.getAccessionId();
            ref.setMap(map);
            if (map != null && map.getTo() != null)
            {
-            map.getTo().addDBRef(
-                    new DBRefEntry(ref.getSource(), ref.getVersion(), ref
-                            .getAccessionId())); // don't copy map over.
-            if (map.getTo().getName().indexOf(prid) == 0)
-            {
-              map.getTo().setName(
-                      jalview.datamodel.DBRefSource.UNIPROT + "|"
-                              + ref.getAccessionId());
-            }
+            // if (!productMapped)
+            // {
+            // map.getTo().setName(refSeqName);
+            // map.getTo().addDBRef(
+            // new DBRefEntry(ref.getSource(), ref.getVersion(), ref
+            // .getAccessionId())); // don't copy map over.
+            // // if (map.getTo().getName().startsWith(prid))
+            // productMapped = true;
+            // }
+            // else
+            // {
+              /*
+               * an alternate UNIPROT product for CDS - same mapping
+               * but to a sequence with a different name
+               */
+              SequenceI newSeq = matcher.findIdMatch(refSeqName);
+              if (newSeq == null)
+              {
+                newSeq = new Sequence(refSeqName, map.getTo()
+                      .getSequenceAsString());
+                matcher.add(newSeq);
+                peptides.add(newSeq);
+              }
+              Mapping newMap = new Mapping(newSeq, map.getMap());
+              ref.setMap(newMap);
+            // }
            }
            noProteinDbref = false;
          }
@@ -756,39 +597,86 @@ public class EmblEntry
      }
    }
  
-  private int[] adjustForPrStart(int prstart, int[] exon)
+  /**
+   * Helper method to construct a SequenceFeature for one cds range
+   * 
+   * @param exons
+   *          array of cds [start, end, ...] positions
+   * @param exonStartIndex
+   *          offset into the exons array
+   * @param proteinName
+   * @param proteinAccessionId
+   * @param vals
+   *          map of 'miscellaneous values' for feature
+   * @param codonStart
+   *          codon start position for CDS (1/2/3, normally 1)
+   * @return
+   */
+  protected SequenceFeature makeCdsFeature(int[] exons, int exonStartIndex,
+          String proteinName, String proteinAccessionId,
+          Map<String, String> vals, int codonStart)
    {
-
-    int origxon[], sxpos = -1;
-    int sxstart, sxstop; // unnecessary variables used for debugging
-    // first adjust range for codon start attribute
-    if (prstart > 1)
+    int exonNumber = exonStartIndex / 2 + 1;
+    SequenceFeature sf = new SequenceFeature();
+    sf.setBegin(Math.min(exons[exonStartIndex], exons[exonStartIndex + 1]));
+    sf.setEnd(Math.max(exons[exonStartIndex], exons[exonStartIndex + 1]));
+    sf.setDescription(String.format(
+            "Exon %d for protein '%s' EMBLCDS:%s", exonNumber, proteinName,
+            proteinAccessionId));
+    sf.setPhase(String.valueOf(codonStart - 1));
+    sf.setStrand(exons[exonStartIndex] <= exons[exonStartIndex + 1] ? "+" : "-");
+    sf.setValue(FeatureProperties.EXONPOS, exonNumber);
+    sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
+    if (!vals.isEmpty())
      {
-      origxon = new int[exon.length];
-      System.arraycopy(exon, 0, origxon, 0, exon.length);
-      int cdspos = 0;
-      for (int x = 0; x < exon.length && sxpos == -1; x += 2)
+      StringBuilder sb = new StringBuilder();
+      boolean first = true;
+      for (Entry<String, String> val : vals.entrySet())
        {
-        cdspos += exon[x + 1] - exon[x] + 1;
-        if (prstart <= cdspos)
+        if (!first)
          {
-          sxpos = x;
-          sxstart = exon[x];
-          sxstop = exon[x + 1];
-          // and adjust start boundary of first exon.
-          exon[x] = exon[x + 1] - cdspos + prstart;
-          break;
+          sb.append(";");
          }
+        sb.append(val.getKey()).append("=").append(val.getValue());
+        first = false;
+        sf.setValue(val.getKey(), val.getValue());
        }
+      sf.setAttributes(sb.toString());
+    }
+    return sf;
+  }
  
-      if (sxpos > 0)
-      {
-        int[] nxon = new int[exon.length - sxpos];
-        System.arraycopy(exon, sxpos, nxon, 0, exon.length - sxpos);
-        exon = nxon;
-      }
+  /**
+   * Returns the CDS positions as a list of [start, end, start, end...]
+   * positions. If on the reverse strand, these will be in descending order.
+   * 
+   * @param feature
+   * @return
+   */
+  protected int[] getCdsRanges(EmblFeature feature)
+  {
+    if (feature.locations == null)
+    {
+      return new int[] {};
      }
-    return exon;
+    int cdsBoundaryCount = 0; // count of all start/stop locations
+    int[][] cdsLocations = new int[feature.locations.size()][];
+    int locationNumber = 0;
+    for (EmblFeatureLocations loc : feature.locations)
+    {
+      int[] locationRanges = loc.getElementRanges(accession);
+      cdsLocations[locationNumber++] = locationRanges;
+      cdsBoundaryCount += locationRanges.length;
+    }
+    int[] cdsRanges = new int[cdsBoundaryCount];
+    int copyTo = 0;
+    for (int[] ranges : cdsLocations)
+    {
+      System.arraycopy(ranges, 0, cdsRanges, copyTo, ranges.length);
+      copyTo += ranges.length;
+    }
+    return cdsRanges;
+
    }
  
    /**
@@ -802,7 +690,6 @@ public class EmblEntry
    {
  
      int origxon[], sxpos = -1, endxon = 0, cdslength = prlength * 3;
-    int sxstart, sxstop; // unnecessary variables used for debugging
      // first adjust range for codon start attribute
      if (prlength >= 1 && exon != null)
      {
@@ -811,13 +698,11 @@ public class EmblEntry
        int cdspos = 0;
        for (int x = 0; x < exon.length && sxpos == -1; x += 2)
        {
-        cdspos += exon[x + 1] - exon[x] + 1;
+        cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
          if (cdslength <= cdspos)
          {
            // advanced beyond last codon.
            sxpos = x;
-          sxstart = exon[x];
-          sxstop = exon[x + 1];
            if (cdslength != cdspos)
            {
              System.err
diff --git a/src/jalview/datamodel/xdb/embl/EmblFeatureLocations.java b/src/jalview/datamodel/xdb/embl/EmblFeatureLocations.java

index eb0bee7..9774004 100644 (file)
--- a/src/jalview/datamodel/xdb/embl/EmblFeatureLocations.java
+++ b/src/jalview/datamodel/xdb/embl/EmblFeatureLocations.java
@@ -20,13 +20,18 @@
   */
  package jalview.datamodel.xdb.embl;
  
+import jalview.bin.Cache;
+import jalview.util.ArrayUtils;
+
+import java.util.Arrays;
  import java.util.Vector;
  
  /**
- * Data model for a &lt;loctaion&gt; child element of a &lt;feature&gt; read
+ * Data model for a &lt;location&gt; child element of a &lt;feature&gt; read
   * from an EMBL query reply
   * 
   * @see embl_mapping.xml
+ * @see http://www.insdc.org/files/feature_table.html#3.4.2
   */
  public class EmblFeatureLocations
  {
@@ -101,21 +106,21 @@ public class EmblFeatureLocations
    }
  
    /**
-   * Return all location elements concerning given accession as start-end pairs
-   * TODO: pass back complement and 'less than or more than' range information
-   * TODO: deal with multiple accessions
+   * Return all location elements concerning given accession as start-end pairs.
+   * If the CDS feature is on the forward strand, then start <= end, if on the
+   * reverse strand then start > end.
     * 
     * @param accession
     *          the accession string for which locations are requested, or null
     *          for all locations
-   * @return null or int[] { start1, end1, ... }
+   * @return int[] { start1, end1, ... }
     */
-
-  public int[] getElementRanges(String accession)
+  int[] getElementRanges(String accession)
    {
      int sepos = 0;
      int[] se = new int[locElements.size() * 2];
-    if (locationType.equalsIgnoreCase("single")) // TODO: or "simple" ?
+    if ("single".equalsIgnoreCase(locationType)
+            || "join".equalsIgnoreCase(locationType))
      {
        for (EmblFeatureLocElement loce : locElements)
        {
@@ -125,50 +130,61 @@ public class EmblFeatureLocations
            BasePosition bp[] = loce.getBasePositions();
            if (bp.length == 2)
            {
-            se[sepos++] = Integer.parseInt(bp[0].getPos());
-            se[sepos++] = Integer.parseInt(bp[1].getPos());
+            try
+            {
+              int start = Integer.parseInt(bp[0].getPos());
+              int end = Integer.parseInt(bp[1].getPos());
+              se[sepos++] = start;
+              se[sepos++] = end;
+            } catch (NumberFormatException e)
+            {
+              System.err
+                      .println("format error in EMBL CDS location basePosition: "
+                              + e.getMessage());
+            }
            }
-        }
-      }
-    }
-    else if (locationType.equalsIgnoreCase("join"))
-    {
-      for (EmblFeatureLocElement loce : locElements)
-      {
-        if (accession == null || loce.accession != null
-                && accession.equals(loce.accession))
-        {
-          BasePosition bp[] = loce.getBasePositions();
-          if (bp.length == 2)
+          else
            {
-            se[sepos++] = Integer.parseInt(bp[0].getPos());
-            se[sepos++] = Integer.parseInt(bp[1].getPos());
+            System.err
+                    .println("format error in EMBL CDS location, basePosition count = "
+                            + bp.length);
            }
          }
        }
-      return se;
      }
      else if (locationType != null)
      {
-      if (jalview.bin.Cache.log != null)
+      if (Cache.log != null)
        {
-        jalview.bin.Cache.log
-                .error("EmbleFeatureLocations.getElementRanges cannot deal with locationType=='"
+        Cache.log
+                .error("EmblFeatureLocations.getElementRanges cannot deal with locationType=='"
                          + locationType + "'");
        }
        else
        {
          System.err
-                .println("EmbleFeatureLocations.getElementRanges cannot deal with locationType=='"
+                .println("EmblFeatureLocations.getElementRanges cannot deal with locationType=='"
                          + locationType + "'");
        }
      }
-    // trim range if necessary.
-    if (se != null && sepos != se.length)
+
+    if (sepos != se.length)
+    {
+      /*
+       * fewer ranges than elements (filtered or unparseable) - trim unused slots
+       */
+      se = Arrays.copyOf(se, sepos);
+    }
+
+    /*
+     * If on the complement, reverse the ranges to [endN, startN, ... end1, start1].
+     * For an example of a joined complement, see (tRNA feature) CAGL0B00165r on
+     * http://www.ebi.ac.uk/ena/data/view/CR380948&display=xml
+     * http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/embl/CR380948/emblxml
+     */
+    if (locationComplement)
      {
-      int[] trimmed = new int[sepos];
-      System.arraycopy(se, 0, trimmed, 0, sepos);
-      se = trimmed;
+      ArrayUtils.reverseIntArray(se);
      }
      return se;
    }
diff --git a/src/jalview/util/ArrayUtils.java b/src/jalview/util/ArrayUtils.java

new file mode 100644 (file)

index 0000000..92085c3
--- /dev/null
+++ b/src/jalview/util/ArrayUtils.java
@@ -0,0 +1,27 @@
+package jalview.util;
+
+public class ArrayUtils
+{
+  /**
+   * Reverses the given array 'in situ'; does nothing if the array is null
+   * 
+   * @param arr the array to reverse, modified in place (may be null)
+   */
+  public static void reverseIntArray(int[] arr)
+  {
+    if (arr != null)
+    {
+      /*
+       * swap [k] with [last-k] up to the half way point in the array
+       * if length is odd, the middle entry is left where it is
+       */
+      int last = arr.length - 1;
+      for (int k = 0; k < arr.length / 2; k++)
+      {
+        int temp = arr[k];
+        arr[k] = arr[last - k];
+        arr[last - k] = temp;
+      }
+    }
+  }
+}
diff --git a/src/jalview/util/MappingUtils.java b/src/jalview/util/MappingUtils.java

index 0780b2a..c2cad1f 100644 (file)
--- a/src/jalview/util/MappingUtils.java
+++ b/src/jalview/util/MappingUtils.java
@@ -821,4 +821,66 @@ public final class MappingUtils
      }
      return false;
    }
+
+  /**
+   * Removes a specified number of positions from the start of a ranges list.
+   * For example, could be used to adjust cds ranges to allow for an incomplete
+   * start codon. Subranges are removed completely, or their start positions
+   * adjusted, until the required number of positions has been removed from the
+   * range. Reverse strand ranges (start > end) are supported. The input array
+   * is never modified, but is itself returned if removeCount is not positive.
+   * 
+   * @param removeCount the number of positions to remove
+   * @param ranges
+   *          an array of [start, end, start, end...] positions
+   * @return the ranges with the first removeCount positions removed
+   */
+  public static int[] removeStartPositions(int removeCount,
+          final int[] ranges)
+  {
+    if (removeCount <= 0)
+    {
+      return ranges;
+    }
+  
+    int[] copy = Arrays.copyOf(ranges, ranges.length);
+    int sxpos = -1;
+    int cdspos = 0;
+    for (int x = 0; x < copy.length && sxpos == -1; x += 2)
+    {
+      // add this sub-range's length (abs allows for reverse strand ranges)
+      cdspos += Math.abs(copy[x + 1] - copy[x]) + 1;
+      if (removeCount < cdspos)
+      {
+        /*
+         * this sub-range is only partly removed, so it is the first one kept
+         */
+        sxpos = x;
+
+        /*
+         * shift its start past the removed positions (down if reverse strand)
+         */
+        if (copy[x] <= copy[x + 1])
+        {
+          copy[x] = copy[x + 1] - cdspos + removeCount + 1;
+        }
+        else
+        {
+          copy[x] = copy[x + 1] + cdspos - removeCount - 1;
+        }
+        break;
+      }
+    }
+  
+    if (sxpos > 0)
+    {
+      /*
+       * we dropped at least one entire sub-range - compact the array
+       */
+      int[] nxon = new int[copy.length - sxpos];
+      System.arraycopy(copy, sxpos, nxon, 0, copy.length - sxpos);
+      return nxon;
+    }
+    return copy; // no sub-range dropped; unchanged if removeCount >= total
+  }
  }
diff --git a/src/jalview/ws/dbsources/EmblXmlSource.java b/src/jalview/ws/dbsources/EmblXmlSource.java

index 66ebe1b..4041606 100644 (file)
--- a/src/jalview/ws/dbsources/EmblXmlSource.java
+++ b/src/jalview/ws/dbsources/EmblXmlSource.java
@@ -29,14 +29,15 @@ import jalview.util.MessageManager;
  import jalview.ws.ebi.EBIFetchClient;
  
  import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
  
  public abstract class EmblXmlSource extends EbiFileRetrievedProxy
  {
-
-  /**
-   * Last properly parsed embl file.
+  /*
+   * JAL-1856 Embl returns this text for query not found
     */
-  public EmblFile efile = null;
+  private static final String EMBL_NOT_FOUND_REPLY = "ERROR 12 No entries found.";
  
    public EmblXmlSource()
    {
@@ -88,68 +89,36 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy
    public AlignmentI getEmblSequenceRecords(String emprefx, String query,
            File reply) throws Exception
    {
-    SequenceI seqs[] = null;
-    StringBuffer result = new StringBuffer();
+    EmblFile efile = null;
+    List<SequenceI> seqs = new ArrayList<SequenceI>();
+
      if (reply != null && reply.exists())
      {
-      efile = null;
        file = reply.getAbsolutePath();
-      if (reply.length() > 25)
+      if (reply.length() > EMBL_NOT_FOUND_REPLY.length())
        {
          efile = EmblFile.getEmblFile(reply);
        }
-      else
-      {
-        result.append(MessageManager.formatMessage(
-                "label.no_embl_record_found",
-                new String[] { emprefx.toLowerCase(), query.trim() }));
-      }
      }
+
+    List<SequenceI> peptides = new ArrayList<SequenceI>();
      if (efile != null)
      {
        for (EmblEntry entry : efile.getEntries())
        {
-        SequenceI[] seqparts = entry.getSequences(false, true, emprefx);
-        // TODO: use !fetchNa,!fetchPeptide here instead - see todo in EmblEntry
-        if (seqparts != null)
+        SequenceI seq = entry.getSequence(emprefx, peptides);
+        if (seq != null)
          {
-          SequenceI[] newseqs = null;
-          int si = 0;
-          if (seqs == null)
-          {
-            newseqs = new SequenceI[seqparts.length];
-          }
-          else
-          {
-            newseqs = new SequenceI[seqs.length + seqparts.length];
-
-            for (; si < seqs.length; si++)
-            {
-              newseqs[si] = seqs[si];
-              seqs[si] = null;
-            }
-          }
-          for (int j = 0; j < seqparts.length; si++, j++)
-          {
-            newseqs[si] = seqparts[j].deriveSequence();
-            // place DBReferences on dataset and refer
-          }
-          seqs = newseqs;
-
+          seqs.add(seq.deriveSequence());
+          // place DBReferences on dataset and refer
          }
        }
      }
-    else
-    {
-      result = null;
-    }
+
      AlignmentI al = null;
-    if (seqs != null && seqs.length > 0)
+    if (!seqs.isEmpty())
      {
-      al = new Alignment(seqs);
-      result.append(MessageManager.formatMessage(
-              "label.embl_successfully_parsed", new String[] { emprefx }));
-      results = result;
+      al = new Alignment(seqs.toArray(new SequenceI[seqs.size()]));
      }
      stopQuery();
      return al;
diff --git a/test/jalview/datamodel/xdb/embl/EmblEntryTest.java b/test/jalview/datamodel/xdb/embl/EmblEntryTest.java

new file mode 100644 (file)

index 0000000..9fffc45
--- /dev/null
+++ b/test/jalview/datamodel/xdb/embl/EmblEntryTest.java
@@ -0,0 +1,308 @@
+package jalview.datamodel.xdb.embl;
+
+import static org.testng.AssertJUnit.assertEquals;
+import static org.testng.AssertJUnit.assertSame;
+
+import jalview.util.MappingUtils;
+
+import java.util.Arrays;
+import java.util.Vector;
+
+import org.testng.annotations.Test;
+
+public class EmblEntryTest
+{
+  @Test(groups = "Functional")
+  public void testGetCdsRanges()
+  {
+    EmblEntry testee = new EmblEntry();
+
+    /*
+     * Make a (CDS) Feature with 4 locations
+     */
+    EmblFeature cds = new EmblFeature();
+    Vector<EmblFeatureLocations> locs = new Vector<EmblFeatureLocations>();
+    cds.setLocations(locs);
+
+    /*
+     * single range [10-20]
+     */
+    EmblFeatureLocations loc = new EmblFeatureLocations();
+    loc.setLocationType("single");
+    loc.setLocationComplement(false);
+    Vector<EmblFeatureLocElement> elements = new Vector<EmblFeatureLocElement>();
+    EmblFeatureLocElement locElement = new EmblFeatureLocElement();
+    BasePosition b1 = new BasePosition();
+    b1.setPos("10");
+    BasePosition b2 = new BasePosition();
+    b2.setPos("20");
+    locElement.setBasePositions(new BasePosition[] { b1, b2 });
+    elements.add(locElement);
+    loc.setLocElements(elements);
+    locs.add(loc);
+
+    /*
+     * complement range [30-40]
+     */
+    loc = new EmblFeatureLocations();
+    loc.setLocationType("single");
+    loc.setLocationComplement(true);
+    elements = new Vector<EmblFeatureLocElement>();
+    locElement = new EmblFeatureLocElement();
+    b1 = new BasePosition();
+    b1.setPos("30");
+    b2 = new BasePosition();
+    b2.setPos("40");
+    locElement.setBasePositions(new BasePosition[] { b1, b2 });
+    elements.add(locElement);
+    loc.setLocElements(elements);
+    locs.add(loc);
+
+    /*
+     * join range [50-60], [70-80]
+     */
+    loc = new EmblFeatureLocations();
+    loc.setLocationType("join");
+    loc.setLocationComplement(false);
+    elements = new Vector<EmblFeatureLocElement>();
+    locElement = new EmblFeatureLocElement();
+    b1 = new BasePosition();
+    b1.setPos("50");
+    b2 = new BasePosition();
+    b2.setPos("60");
+    locElement.setBasePositions(new BasePosition[] { b1, b2 });
+    elements.add(locElement);
+    locElement = new EmblFeatureLocElement();
+    b1 = new BasePosition();
+    b1.setPos("70");
+    b2 = new BasePosition();
+    b2.setPos("80");
+    locElement.setBasePositions(new BasePosition[] { b1, b2 });
+    elements.add(locElement);
+    loc.setLocElements(elements);
+    locs.add(loc);
+
+    /*
+     * complement range [90-100], [110-120]
+     * this should be the same as complement(join(90..100,110..120))
+     * which is "join 90-100 and 110-120, then complement"
+     */
+    loc = new EmblFeatureLocations();
+    loc.setLocationType("join");
+    loc.setLocationComplement(true);
+    elements = new Vector<EmblFeatureLocElement>();
+    locElement = new EmblFeatureLocElement();
+    b1 = new BasePosition();
+    b1.setPos("90");
+    b2 = new BasePosition();
+    b2.setPos("100");
+    locElement.setBasePositions(new BasePosition[] { b1, b2 });
+    elements.add(locElement);
+    locElement = new EmblFeatureLocElement();
+    b1 = new BasePosition();
+    b1.setPos("110");
+    b2 = new BasePosition();
+    b2.setPos("120");
+    locElement.setBasePositions(new BasePosition[] { b1, b2 });
+    elements.add(locElement);
+    loc.setLocElements(elements);
+    locs.add(loc);
+
+    int[] exons = testee.getCdsRanges(cds);
+    assertEquals("[10, 20, 40, 30, 50, 60, 70, 80, 120, 110, 100, 90]",
+            Arrays.toString(exons));
+  }
+
+  @Test(groups = "Functional")
+  public void testGetCdsRanges_badData()
+  {
+    EmblEntry testee = new EmblEntry();
+
+    /*
+     * Make a (CDS) Feature with 4 locations
+     */
+    EmblFeature cds = new EmblFeature();
+    Vector<EmblFeatureLocations> locs = new Vector<EmblFeatureLocations>();
+    cds.setLocations(locs);
+
+    /*
+     * single range [10-20]
+     */
+    EmblFeatureLocations loc = new EmblFeatureLocations();
+    loc.setLocationType("single");
+    loc.setLocationComplement(false);
+    Vector<EmblFeatureLocElement> elements = new Vector<EmblFeatureLocElement>();
+    EmblFeatureLocElement locElement = new EmblFeatureLocElement();
+    BasePosition b1 = new BasePosition();
+    b1.setPos("10");
+    BasePosition b2 = new BasePosition();
+    b2.setPos("20");
+    locElement.setBasePositions(new BasePosition[] { b1, b2 });
+    elements.add(locElement);
+    loc.setLocElements(elements);
+    locs.add(loc);
+
+    /*
+     * single range with missing end position - should be skipped
+     */
+    loc = new EmblFeatureLocations();
+    loc.setLocationType("single");
+    loc.setLocationComplement(false);
+    elements = new Vector<EmblFeatureLocElement>();
+    locElement = new EmblFeatureLocElement();
+    b1 = new BasePosition();
+    b1.setPos("30");
+    locElement.setBasePositions(new BasePosition[] { b1 });
+    elements.add(locElement);
+    loc.setLocElements(elements);
+    locs.add(loc);
+
+    /*
+     * single range with extra base position - should be skipped
+     */
+    loc = new EmblFeatureLocations();
+    loc.setLocationType("single");
+    loc.setLocationComplement(false);
+    elements = new Vector<EmblFeatureLocElement>();
+    locElement = new EmblFeatureLocElement();
+    b1 = new BasePosition();
+    b1.setPos("30");
+    locElement.setBasePositions(new BasePosition[] { b1, b1, b1 });
+    elements.add(locElement);
+    loc.setLocElements(elements);
+    locs.add(loc);
+
+    /*
+     * single valid range [50-60] to finish
+     */
+    loc = new EmblFeatureLocations();
+    loc.setLocationType("single");
+    loc.setLocationComplement(false);
+    elements = new Vector<EmblFeatureLocElement>();
+    locElement = new EmblFeatureLocElement();
+    b1 = new BasePosition();
+    b1.setPos("50");
+    b2 = new BasePosition();
+    b2.setPos("60");
+    locElement.setBasePositions(new BasePosition[] { b1, b2 });
+    elements.add(locElement);
+    loc.setLocElements(elements);
+    locs.add(loc);
+
+    int[] exons = testee.getCdsRanges(cds);
+    assertEquals("[10, 20, 50, 60]", Arrays.toString(exons));
+  }
+
+  /**
+   * Test retrieval of exon locations matching an accession id
+   */
+  @Test(groups = "Functional")
+  public void testGetCdsRanges_forAccession()
+  {
+    EmblEntry testee = new EmblEntry();
+    String accession = "A1234";
+    testee.setAccession(accession);
+    /*
+     * Make a (CDS) Feature with 4 locations
+     */
+    EmblFeature cds = new EmblFeature();
+    Vector<EmblFeatureLocations> locs = new Vector<EmblFeatureLocations>();
+    cds.setLocations(locs);
+  
+    /*
+     * single range [10-20] for 'this' accession
+     */
+    EmblFeatureLocations loc = new EmblFeatureLocations();
+    loc.setLocationType("single");
+    loc.setLocationComplement(false);
+    Vector<EmblFeatureLocElement> elements = new Vector<EmblFeatureLocElement>();
+    EmblFeatureLocElement locElement = new EmblFeatureLocElement();
+    locElement.setAccession(accession);
+    BasePosition b1 = new BasePosition();
+    b1.setPos("10");
+    BasePosition b2 = new BasePosition();
+    b2.setPos("20");
+    locElement.setBasePositions(new BasePosition[] { b1, b2 });
+    elements.add(locElement);
+    loc.setLocElements(elements);
+    locs.add(loc);
+  
+    /*
+     * complement range [30-40] - no accession
+     */
+    loc = new EmblFeatureLocations();
+    loc.setLocationType("single");
+    loc.setLocationComplement(true);
+    elements = new Vector<EmblFeatureLocElement>();
+    locElement = new EmblFeatureLocElement();
+    b1 = new BasePosition();
+    b1.setPos("30");
+    b2 = new BasePosition();
+    b2.setPos("40");
+    locElement.setBasePositions(new BasePosition[] { b1, b2 });
+    elements.add(locElement);
+    loc.setLocElements(elements);
+    locs.add(loc);
+  
+    /*
+     * join range [50-60] this accession, [70-80] another
+     */
+    loc = new EmblFeatureLocations();
+    loc.setLocationType("join");
+    loc.setLocationComplement(false);
+    elements = new Vector<EmblFeatureLocElement>();
+    locElement = new EmblFeatureLocElement();
+    locElement.setAccession(accession);
+    b1 = new BasePosition();
+    b1.setPos("50");
+    b2 = new BasePosition();
+    b2.setPos("60");
+    locElement.setBasePositions(new BasePosition[] { b1, b2 });
+    elements.add(locElement);
+    locElement = new EmblFeatureLocElement();
+    locElement.setAccession("notme");
+    b1 = new BasePosition();
+    b1.setPos("70");
+    b2 = new BasePosition();
+    b2.setPos("80");
+    locElement.setBasePositions(new BasePosition[] { b1, b2 });
+    elements.add(locElement);
+    loc.setLocElements(elements);
+    locs.add(loc);
+  
+    /*
+     * complement range [90-100] wrong accession, [110-120] good 
+     * this should be the same as complement(join(90..100,110..120))
+     * which is "join 90-100 and 110-120, then complement"
+     */
+    loc = new EmblFeatureLocations();
+    loc.setLocationType("join");
+    loc.setLocationComplement(true);
+    elements = new Vector<EmblFeatureLocElement>();
+    locElement = new EmblFeatureLocElement();
+    locElement.setAccession("wrong");
+    b1 = new BasePosition();
+    b1.setPos("90");
+    b2 = new BasePosition();
+    b2.setPos("100");
+    locElement.setBasePositions(new BasePosition[] { b1, b2 });
+    elements.add(locElement);
+    locElement = new EmblFeatureLocElement();
+    locElement.setAccession(accession);
+    b1 = new BasePosition();
+    b1.setPos("110");
+    b2 = new BasePosition();
+    b2.setPos("120");
+    locElement.setBasePositions(new BasePosition[] { b1, b2 });
+    elements.add(locElement);
+    loc.setLocElements(elements);
+    locs.add(loc);
+  
+    /*
+     * verify we pick out only ranges for A1234
+     */
+    int[] exons = testee.getCdsRanges(cds);
+    assertEquals("[10, 20, 50, 60, 120, 110]",
+            Arrays.toString(exons));
+  }
+}
diff --git a/test/jalview/util/ArrayUtilsTest.java b/test/jalview/util/ArrayUtilsTest.java

new file mode 100644 (file)

index 0000000..5a2674a
--- /dev/null
+++ b/test/jalview/util/ArrayUtilsTest.java
@@ -0,0 +1,31 @@
+package jalview.util;
+
+import static org.testng.AssertJUnit.assertEquals;
+
+import java.util.Arrays;
+
+import org.testng.annotations.Test;
+
+public class ArrayUtilsTest
+{
+  @Test(groups="Functional")
+  public void testReverseIntArray() {
+
+    // null value: should be no exception
+    ArrayUtils.reverseIntArray((int[]) null);
+
+    // empty array: should be no exception
+    int[] arr = new int[] {};
+    ArrayUtils.reverseIntArray(arr);
+
+    // even length array
+    arr = new int[] { 1, 2, 3, 4 };
+    ArrayUtils.reverseIntArray(arr);
+    assertEquals("[4, 3, 2, 1]", Arrays.toString(arr));
+
+    // odd length array
+    arr = new int[] { 1, 2, 3, 4, 5 };
+    ArrayUtils.reverseIntArray(arr);
+    assertEquals("[5, 4, 3, 2, 1]", Arrays.toString(arr));
+  }
+}
diff --git a/test/jalview/util/MappingUtilsTest.java b/test/jalview/util/MappingUtilsTest.java

index 853ebd5..b53d513 100644 (file)
--- a/test/jalview/util/MappingUtilsTest.java
+++ b/test/jalview/util/MappingUtilsTest.java
@@ -24,7 +24,6 @@ import static org.testng.AssertJUnit.assertEquals;
  import static org.testng.AssertJUnit.assertFalse;
  import static org.testng.AssertJUnit.assertSame;
  import static org.testng.AssertJUnit.assertTrue;
-import static org.testng.internal.junit.ArrayAsserts.assertArrayEquals;
  
  import jalview.api.AlignViewportI;
  import jalview.commands.EditCommand;
@@ -911,4 +910,107 @@ public class MappingUtilsTest
      assertFalse(MappingUtils.contains(ranges, -45));
    }
  
+  /**
+   * Test the method that drops positions from the start of a mapped range
+   */
+  @Test(groups = "Functional")
+  public void testRemoveStartPositions()
+  {
+    int[] ranges = new int[] { 1, 10 };
+    int[] adjusted = MappingUtils.removeStartPositions(0, ranges);
+    assertEquals("[1, 10]", Arrays.toString(adjusted));
+
+    adjusted = MappingUtils.removeStartPositions(1, ranges);
+    assertEquals("[2, 10]", Arrays.toString(adjusted));
+    assertEquals("[1, 10]", Arrays.toString(ranges));
+
+    ranges = adjusted;
+    adjusted = MappingUtils.removeStartPositions(1, ranges);
+    assertEquals("[3, 10]", Arrays.toString(adjusted));
+    assertEquals("[2, 10]", Arrays.toString(ranges));
+
+    ranges = new int[] { 2, 3, 10, 12 };
+    adjusted = MappingUtils.removeStartPositions(1, ranges);
+    assertEquals("[3, 3, 10, 12]", Arrays.toString(adjusted));
+    assertEquals("[2, 3, 10, 12]", Arrays.toString(ranges));
+
+    ranges = new int[] { 2, 2, 8, 12 };
+    adjusted = MappingUtils.removeStartPositions(1, ranges);
+    assertEquals("[8, 12]", Arrays.toString(adjusted));
+    assertEquals("[2, 2, 8, 12]", Arrays.toString(ranges));
+
+    ranges = new int[] { 2, 2, 8, 12 };
+    adjusted = MappingUtils.removeStartPositions(2, ranges);
+    assertEquals("[9, 12]", Arrays.toString(adjusted));
+    assertEquals("[2, 2, 8, 12]", Arrays.toString(ranges));
+
+    ranges = new int[] { 2, 2, 4, 4, 9, 12 };
+    adjusted = MappingUtils.removeStartPositions(1, ranges);
+    assertEquals("[4, 4, 9, 12]", Arrays.toString(adjusted));
+    assertEquals("[2, 2, 4, 4, 9, 12]", Arrays.toString(ranges));
+
+    ranges = new int[] { 2, 2, 4, 4, 9, 12 };
+    adjusted = MappingUtils.removeStartPositions(2, ranges);
+    assertEquals("[9, 12]", Arrays.toString(adjusted));
+    assertEquals("[2, 2, 4, 4, 9, 12]", Arrays.toString(ranges));
+
+    ranges = new int[] { 2, 3, 9, 12 };
+    adjusted = MappingUtils.removeStartPositions(3, ranges);
+    assertEquals("[10, 12]", Arrays.toString(adjusted));
+    assertEquals("[2, 3, 9, 12]", Arrays.toString(ranges));
+  }
+
+  /**
+   * Test the method that drops positions from the start of a mapped range, on
+   * the reverse strand
+   */
+  @Test(groups = "Functional")
+  public void testRemoveStartPositions_reverseStrand()
+  {
+    int[] ranges = new int[] { 10, 1 };
+    int[] adjusted = MappingUtils.removeStartPositions(0, ranges);
+    assertEquals("[10, 1]", Arrays.toString(adjusted));
+    assertEquals("[10, 1]", Arrays.toString(ranges));
+  
+    ranges = adjusted;
+    adjusted = MappingUtils.removeStartPositions(1, ranges);
+    assertEquals("[9, 1]", Arrays.toString(adjusted));
+    assertEquals("[10, 1]", Arrays.toString(ranges));
+  
+    ranges = adjusted;
+    adjusted = MappingUtils.removeStartPositions(1, ranges);
+    assertEquals("[8, 1]", Arrays.toString(adjusted));
+    assertEquals("[9, 1]", Arrays.toString(ranges));
+  
+    ranges = new int[] { 12, 11, 9, 6 };
+    adjusted = MappingUtils.removeStartPositions(1, ranges);
+    assertEquals("[11, 11, 9, 6]", Arrays.toString(adjusted));
+    assertEquals("[12, 11, 9, 6]", Arrays.toString(ranges));
+  
+    ranges = new int[] { 12, 12, 8, 4 };
+    adjusted = MappingUtils.removeStartPositions(1, ranges);
+    assertEquals("[8, 4]", Arrays.toString(adjusted));
+    assertEquals("[12, 12, 8, 4]", Arrays.toString(ranges));
+  
+    ranges = new int[] { 12, 12, 8, 4 };
+    adjusted = MappingUtils.removeStartPositions(2, ranges);
+    assertEquals("[7, 4]", Arrays.toString(adjusted));
+    assertEquals("[12, 12, 8, 4]", Arrays.toString(ranges));
+  
+    ranges = new int[] { 12, 12, 10, 10, 8, 4 };
+    adjusted = MappingUtils.removeStartPositions(1, ranges);
+    assertEquals("[10, 10, 8, 4]", Arrays.toString(adjusted));
+    assertEquals("[12, 12, 10, 10, 8, 4]", Arrays.toString(ranges));
+  
+    ranges = new int[] { 12, 12, 10, 10, 8, 4 };
+    adjusted = MappingUtils.removeStartPositions(2, ranges);
+    assertEquals("[8, 4]", Arrays.toString(adjusted));
+    assertEquals("[12, 12, 10, 10, 8, 4]", Arrays.toString(ranges));
+  
+    ranges = new int[] { 12, 11, 8, 4 };
+    adjusted = MappingUtils.removeStartPositions(3, ranges);
+    assertEquals("[7, 4]", Arrays.toString(adjusted));
+    assertEquals("[12, 11, 8, 4]", Arrays.toString(ranges));
+  }
+
  }
author	gmungoc <g.m.carstairs@dundee.ac.uk>
	Mon, 21 Mar 2016 14:42:51 +0000 (14:42 +0000)
committer	gmungoc <g.m.carstairs@dundee.ac.uk>
	Mon, 21 Mar 2016 14:42:51 +0000 (14:42 +0000)
src/jalview/analysis/CrossRef.java		patch \| blob \| history
src/jalview/analysis/SequenceIdMatcher.java		patch \| blob \| history
src/jalview/datamodel/xdb/embl/EmblEntry.java		patch \| blob \| history
src/jalview/datamodel/xdb/embl/EmblFeatureLocations.java		patch \| blob \| history
src/jalview/util/ArrayUtils.java	[new file with mode: 0644]	patch \| blob
src/jalview/util/MappingUtils.java		patch \| blob \| history
src/jalview/ws/dbsources/EmblXmlSource.java		patch \| blob \| history
test/jalview/datamodel/xdb/embl/EmblEntryTest.java	[new file with mode: 0644]	patch \| blob
test/jalview/util/ArrayUtilsTest.java	[new file with mode: 0644]	patch \| blob
test/jalview/util/MappingUtilsTest.java		patch \| blob \| history