JAL-2110 work in progress

author gmungoc <g.m.carstairs@dundee.ac.uk>

Thu, 9 Jun 2016 13:55:45 +0000 (14:55 +0100)

committer gmungoc <g.m.carstairs@dundee.ac.uk>

Thu, 9 Jun 2016 13:55:45 +0000 (14:55 +0100)
author gmungoc <g.m.carstairs@dundee.ac.uk>
Thu, 9 Jun 2016 13:55:45 +0000 (14:55 +0100)
committer gmungoc <g.m.carstairs@dundee.ac.uk>
Thu, 9 Jun 2016 13:55:45 +0000 (14:55 +0100)
diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java

index 081f446..949c47a 100644 (file)
--- a/src/jalview/analysis/AlignmentUtils.java
+++ b/src/jalview/analysis/AlignmentUtils.java
@@ -1411,6 +1411,12 @@ public class AlignmentUtils
    {
      List<SequenceI> cdsSeqs = new ArrayList<SequenceI>();
      
+    /*
+     * construct CDS sequences from the (cds-to-protein) mappings made earlier;
+     * this makes it possible to model multiple products from dna (e.g. EMBL); 
+     * however it does mean we don't have the EMBL protein_id (a property on 
+     * the CDS features) in order to make the CDS sequence name :-( 
+     */
      for (SequenceI seq : dna)
      {
        AlignedCodonFrame cdsMappings = new AlignedCodonFrame();
@@ -1795,17 +1801,20 @@ public class AlignmentUtils
       * sort to get sequence features in start position order
       * - would be better to store in Sequence as a TreeSet or NCList?
       */
-    Arrays.sort(peptide.getSequenceFeatures(),
-            new Comparator<SequenceFeature>()
-            {
-              @Override
-              public int compare(SequenceFeature o1, SequenceFeature o2)
+    if (peptide.getSequenceFeatures() != null)
+    {
+      Arrays.sort(peptide.getSequenceFeatures(),
+              new Comparator<SequenceFeature>()
                {
-                int c = Integer.compare(o1.getBegin(), o2.getBegin());
-                return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd())
-                        : c;
-              }
-            });
+                @Override
+                public int compare(SequenceFeature o1, SequenceFeature o2)
+                {
+                  int c = Integer.compare(o1.getBegin(), o2.getBegin());
+                  return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd())
+                          : c;
+                }
+              });
+    }
      return count;
    }
  
diff --git a/src/jalview/analysis/CrossRef.java b/src/jalview/analysis/CrossRef.java

index cb664df..4e8f070 100644 (file)
--- a/src/jalview/analysis/CrossRef.java
+++ b/src/jalview/analysis/CrossRef.java
@@ -35,12 +35,11 @@ import jalview.ws.SequenceFetcherFactory;
  import jalview.ws.seqfetcher.ASequenceFetcher;
  
  import java.util.ArrayList;
-import java.util.Arrays;
+import java.util.Iterator;
  import java.util.List;
  
  /**
- * Functions for cross-referencing sequence databases. user must first specify
- * if cross-referencing from protein or dna (set dna==true)
+ * Functions for cross-referencing sequence databases.
   * 
   * @author JimP
   * 
@@ -48,28 +47,40 @@ import java.util.List;
  public class CrossRef
  {
    /*
-   * A sub-class that ignores Parent attribute when comparing sequence 
-   * features. This avoids 'duplicate' CDS features that only
-   * differ in their parent Transcript ids.
+   * the dataset of the alignment for which we are searching for 
+   * cross-references; in some cases we may resolve xrefs by 
+   * searching in the dataset
     */
-  class MySequenceFeature extends SequenceFeature
-  {
-    private SequenceFeature feat;
+  private AlignmentI dataset;
  
-    MySequenceFeature(SequenceFeature sf)
-    {
-      this.feat = sf;
-    }
+  /*
+   * true if we are searching for cross-references from nucleotide,
+   * i.e. for protein sequences, false if the reverse
+   */
+  private boolean fromDna;
  
-    @Override
-    public boolean equals(Object o)
-    {
-      return feat.equals(o, true);
-    }
-  }
+  /*
+   * the sequences for which we are seeking cross-references
+   */
+  private SequenceI[] fromSeqs;
  
    /**
+   * Constructor
+   * 
+   * @param seqs
+   *          the sequences for which we are seeking cross-references
+   * @param ds
+   *          the containing alignment dataset (may be searched to resolve
+   *          cross-references)
+   */
+  public CrossRef(SequenceI[] seqs, AlignmentI ds)
+  {
+    fromSeqs = seqs;
+    fromDna = ds.isNucleotide();
+    dataset = ds.getDataset() == null ? ds : ds.getDataset();
+  }
  
+  /**
     * Returns a list of distinct database sources for which sequences have either
     * <ul>
     * <li>a (dna-to-protein or protein-to-dna) cross-reference</li>
@@ -77,24 +88,16 @@ public class CrossRef
     * reference from another sequence in the dataset which has a cross-reference
     * to a direct DBRefEntry on the given sequence</li>
     * </ul>
-   * 
-   * @param dna
-   *          true if seqs are nucleotide
-   * @param seqs
-   *          sequences whose xrefs we are seeking
-   * @param dataset
-   *          an alignment to search for indirect references
     * @return
     */
-  public static List<String> findXrefSourcesForSequences(boolean dna,
-          SequenceI[] seqs, AlignmentI dataset)
+  public List<String> findXrefSourcesForSequences()
    {
      List<String> sources = new ArrayList<String>();
-    for (SequenceI seq : seqs)
+    for (SequenceI seq : fromSeqs)
      {
        if (seq != null)
        {
-        findXrefSourcesForSequence(seq, dna, dataset, sources);
+        findXrefSourcesForSequence(seq, sources);
        }
      }
      return sources;
@@ -111,41 +114,37 @@ public class CrossRef
     * 
     * @param seq
     *          the sequence whose dbrefs we are searching against
-   * @param dna
-   *          true if the sequence is nucleotide
-   * @param dataset
-   *          an alignment to search for indirect references
     * @param sources
     *          a list of sources to add matches to
     */
-  static void findXrefSourcesForSequence(SequenceI seq, boolean dna,
-          AlignmentI dataset, List<String> sources)
+  void findXrefSourcesForSequence(SequenceI seq, List<String> sources)
    {
      /*
       * first find seq's xrefs (dna-to-peptide or peptide-to-dna)
       */
-    DBRefEntry[] rfs = DBRefUtils.selectDbRefs(!dna, seq.getDBRefs());
+    DBRefEntry[] rfs = DBRefUtils.selectDbRefs(!fromDna, seq.getDBRefs());
      addXrefsToSources(rfs, sources);
      if (dataset != null)
      {
        /*
         * find sequence's direct (dna-to-dna, peptide-to-peptide) xrefs
         */
-      DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(dna, seq.getDBRefs());
+      DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(fromDna, seq.getDBRefs());
        List<SequenceI> rseqs = new ArrayList<SequenceI>();
  
        /*
         * find sequences in the alignment which xref one of these DBRefs
         * i.e. is xref-ed to a common sequence identifier
         */
-      CrossRef.searchDatasetXrefs(seq, !dna, lrfs, dataset, rseqs, null);
+      searchDatasetXrefs(seq, lrfs, rseqs, null);
  
        /*
         * add those sequences' (dna-to-peptide or peptide-to-dna) dbref sources
         */
        for (SequenceI rs : rseqs)
        {
-        DBRefEntry[] xrs = DBRefUtils.selectDbRefs(!dna, rs.getDBRefs());
+        DBRefEntry[] xrs = DBRefUtils
+                .selectDbRefs(!fromDna, rs.getDBRefs());
          addXrefsToSources(xrs, sources);
        }
      }
@@ -158,7 +157,7 @@ public class CrossRef
     * @param xrefs
     * @param sources
     */
-  static void addXrefsToSources(DBRefEntry[] xrefs, List<String> sources)
+  void addXrefsToSources(DBRefEntry[] xrefs, List<String> sources)
    {
      if (xrefs != null)
      {
@@ -185,13 +184,14 @@ public class CrossRef
     *          add to)
     * @return products (as dataset sequences)
     */
-  public static Alignment findXrefSequences(SequenceI[] seqs,
-          final boolean dna, final String source, AlignmentI al)
+  public Alignment findXrefSequences(String source)
    {
-    AlignmentI dataset = al.getDataset() == null ? al : al.getDataset();
      List<SequenceI> rseqs = new ArrayList<SequenceI>();
      AlignedCodonFrame cf = new AlignedCodonFrame();
-    for (SequenceI seq : seqs)
+    SequenceIdMatcher matcher = new SequenceIdMatcher(
+            dataset.getSequences());
+
+    for (SequenceI seq : fromSeqs)
      {
        SequenceI dss = seq;
        while (dss.getDatasetSequence() != null)
@@ -199,42 +199,67 @@ public class CrossRef
          dss = dss.getDatasetSequence();
        }
        boolean found = false;
-      DBRefEntry[] xrfs = DBRefUtils.selectDbRefs(!dna, dss.getDBRefs());
+      DBRefEntry[] xrfs = DBRefUtils
+              .selectDbRefs(!fromDna, dss.getDBRefs());
        if ((xrfs == null || xrfs.length == 0) && dataset != null)
        {
          /*
           * found no suitable dbrefs on sequence - look for sequences in the
           * alignment which share a dbref with this one
           */
-        DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(dna, seq.getDBRefs());
+        DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(fromDna,
+                seq.getDBRefs());
  
          /*
           * find sequences (except this one!), of complementary type,
           *  which have a dbref to an accession id for this sequence,
           *  and add them to the results
           */
-        found = CrossRef.searchDatasetXrefs(dss, !dna, lrfs, dataset,
-                rseqs, cf);
+        found = searchDatasetXrefs(dss, lrfs, rseqs, cf);
        }
-      for (int r = 0; xrfs != null && r < xrfs.length; r++)
+      if (xrfs == null && !found)
        {
-        DBRefEntry xref = xrfs[r];
-        if (source != null && !source.equals(xref.getSource()))
-        {
-          continue;
-        }
+        /*
+         * no dbref to source on this sequence or matched
+         * complementary sequence in the dataset 
+         */
+        continue;
+      }
+      List<DBRefEntry> sourceRefs = DBRefUtils.searchRefsForSource(xrfs,
+              source);
+      Iterator<DBRefEntry> refIterator = sourceRefs.iterator();
+      while (refIterator.hasNext())
+      {
+        DBRefEntry xref = refIterator.next();
+        found = false;
          if (xref.hasMap())
          {
-          if (xref.getMap().getTo() != null)
+          SequenceI mappedTo = xref.getMap().getTo();
+          if (mappedTo != null)
            {
+            /*
+             * dbref contains the sequence it maps to; add it to the
+             * results unless we have done so already (could happen if 
+             * fetching xrefs for sequences which have xrefs in common)
+             * for example: UNIPROT {P0CE19, P0CE20} -> EMBL {J03321, X06707}
+             */
              found = true;
-            SequenceI rsq = new Sequence(xref.getMap().getTo());
+            SequenceI matchInDataset = findInDataset(mappedTo);// matcher.findIdMatch(mappedTo);
+            if (matchInDataset != null)
+            {
+              if (!rseqs.contains(matchInDataset))
+              {
+                rseqs.add(matchInDataset);
+              }
+              continue;
+            }
+            SequenceI rsq = new Sequence(mappedTo);
              rseqs.add(rsq);
-            if (xref.getMap().getMap().getFromRatio() != xref
-                    .getMap().getMap().getToRatio())
+            if (xref.getMap().getMap().getFromRatio() != xref.getMap()
+                    .getMap().getToRatio())
              {
                // get sense of map correct for adding to product alignment.
-              if (dna)
+              if (fromDna)
                {
                  // map is from dna seq to a protein product
                  cf.addMap(dss, rsq, xref.getMap().getMap());
@@ -247,182 +272,162 @@ public class CrossRef
              }
            }
          }
+
          if (!found)
          {
-          // do a bit more work - search for sequences with references matching
-          // xrefs on this sequence.
-          if (dataset != null)
+          SequenceI matchedSeq = matcher.findIdMatch(xref.getSource() + "|"
+                  + xref.getAccessionId());
+          if (matchedSeq != null)
            {
-            found = searchDataset(dss, xref, dataset, rseqs, cf, false,/*true?*/
-                    !dna);
-            if (found)
+            if (constructMapping(seq, matchedSeq, xref, cf))
              {
-              xrfs[r] = null; // we've recovered seqs for this one.
+              found = true;
              }
            }
          }
+
+        if (!found)
+        {
+          // do a bit more work - search for sequences with references matching
+          // xrefs on this sequence.
+          found = searchDataset(dss, xref, rseqs, cf, false);
+        }
+        if (found)
+        {
+          refIterator.remove();
+        }
        }
-      if (!found)
+
+      /*
+       * fetch from source database any dbrefs we haven't resolved up to here
+       */
+      if (!sourceRefs.isEmpty())
        {
-        if (xrfs != null && xrfs.length > 0)
+        ASequenceFetcher sftch = SequenceFetcherFactory
+                .getSequenceFetcher();
+        SequenceI[] retrieved = null;
+        try
          {
-          ASequenceFetcher sftch = SequenceFetcherFactory
-                  .getSequenceFetcher();
-          SequenceI[] retrieved = null;
-          int l = xrfs.length;
-          for (int r = 0; r < xrfs.length; r++)
-          {
-            // filter out any irrelevant or irretrievable references
-            if (xrfs[r] == null
-                    || ((source != null && !source.equals(xrfs[r]
-                            .getSource())) || !sftch.isFetchable(xrfs[r]
-                            .getSource())))
-            {
-              l--;
-              xrfs[r] = null;
-            }
-          }
-          if (l > 0)
-          {
-            // System.out
-            // .println("Attempting to retrieve cross referenced sequences.");
-            DBRefEntry[] t = new DBRefEntry[l];
-            l = 0;
-            for (int r = 0; r < xrfs.length; r++)
-            {
-              if (xrfs[r] != null)
-              {
-                t[l++] = xrfs[r];
-              }
-            }
-            xrfs = t;
-            try
-            {
-              retrieved = sftch.getSequences(Arrays.asList(xrfs), !dna);
-              // problem here is we don't know which of xrfs resulted in which
-              // retrieved element
-            } catch (Exception e)
-            {
-              System.err
-                      .println("Problem whilst retrieving cross references for Sequence : "
-                              + seq.getName());
-              e.printStackTrace();
-            }
+          retrieved = sftch.getSequences(sourceRefs, !fromDna);
+        } catch (Exception e)
+        {
+          System.err
+                  .println("Problem whilst retrieving cross references for Sequence : "
+                          + seq.getName());
+          e.printStackTrace();
+        }
  
-            if (retrieved != null)
+        if (retrieved != null)
+        {
+          updateDbrefMappings(seq, xrfs, retrieved, cf);
+          for (SequenceI retrievedSequence : retrieved)
+          {
+            SequenceI retrievedDss = retrievedSequence.getDatasetSequence() == null ? retrievedSequence
+                    : retrievedSequence.getDatasetSequence();
+            DBRefEntry[] dbr = retrievedSequence.getDBRefs();
+            if (dbr != null)
              {
-              updateDbrefMappings(dna, seq, xrfs, retrieved, cf);
-
-              SequenceIdMatcher matcher = new SequenceIdMatcher(
-                      dataset.getSequences());
-              List<SequenceFeature> copiedFeatures = new ArrayList<SequenceFeature>();
-              CrossRef me = new CrossRef();
-              for (int rs = 0; rs < retrieved.length; rs++)
+              for (DBRefEntry dbref : dbr)
                {
-                // TODO: examine each sequence for 'redundancy'
-                DBRefEntry[] dbr = retrieved[rs].getDBRefs();
-                if (dbr != null && dbr.length > 0)
+                // find any entry where we should put in the sequence being
+                // cross-referenced into the map
+                Mapping map = dbref.getMap();
+                if (map != null)
                  {
-                  for (int di = 0; di < dbr.length; di++)
+                  if (map.getTo() != null && map.getMap() != null)
                    {
-                    // find any entry where we should put in the sequence being
-                    // cross-referenced into the map
-                    Mapping map = dbr[di].getMap();
-                    if (map != null)
+                    // TODO findInDataset requires exact sequence match but
+                    // 'congruent' test only for the mapped part
+                    SequenceI matched = findInDataset(map.getTo());// matcher.findIdMatch(map.getTo());
+                    if (matched != null)
                      {
-                      if (map.getTo() != null && map.getMap() != null)
+                      /*
+                       * already got an xref to this sequence; update this
+                       * map to point to the same sequence, and add
+                       * any new dbrefs to it
+                       */
+                      DBRefEntry[] toRefs = map.getTo().getDBRefs();
+                      if (toRefs != null)
                        {
-                        SequenceI matched = matcher
-                                .findIdMatch(map.getTo());
-                        if (matched != null)
-                        {
-                          /*
-                           * already got an xref to this sequence; update this
-                           * map to point to the same sequence, and add
-                           * any new dbrefs to it
-                           */
-                          for (DBRefEntry ref : map.getTo().getDBRefs())
-                          {
-                            matched.addDBRef(ref); // add or update mapping
-                          }
-                          map.setTo(matched);
-                        }
-                        else
+                        for (DBRefEntry ref : toRefs)
                          {
-                          matcher.add(map.getTo());
+                          matched.addDBRef(ref); // add or update mapping
                          }
-                        try
+                      }
+                      map.setTo(matched);
+                    }
+                    else
+                    {
+                      matcher.add(map.getTo());
+                    }
+                    try
+                    {
+                      // compare ms with dss and replace with dss in mapping
+                      // if map is congruent
+                      SequenceI ms = map.getTo();
+                      int sf = map.getMap().getToLowest();
+                      int st = map.getMap().getToHighest();
+                      SequenceI mappedrg = ms.getSubSequence(sf, st);
+                      // SequenceI loc = dss.getSubSequence(sf, st);
+                      if (mappedrg.getLength() > 0
+                              && ms.getSequenceAsString().equals(
+                                      dss.getSequenceAsString()))
+                      // && mappedrg.getSequenceAsString().equals(
+                      // loc.getSequenceAsString()))
+                      {
+                        String msg = "Mapping updated from " + ms.getName()
+                                + " to retrieved crossreference "
+                                + dss.getName();
+                        System.out.println(msg);
+                        // method to update all refs of existing To on
+                        // retrieved sequence with dss and merge any props
+                        // on To onto dss.
+                        // TODO don't we have to change the mapped to ranges
+                        // if not to the whole sequence?
+                        map.setTo(dss);
+                        /*
+                         * copy sequence features as well, avoiding
+                         * duplication (e.g. same variation from 2 
+                         * transcripts)
+                         */
+                        SequenceFeature[] sfs = ms.getSequenceFeatures();
+                        if (sfs != null)
                          {
-                          // compare ms with dss and replace with dss in mapping
-                          // if map is congruent
-                          SequenceI ms = map.getTo();
-                          int sf = map.getMap().getToLowest();
-                          int st = map.getMap().getToHighest();
-                          SequenceI mappedrg = ms.getSubSequence(sf, st);
-                          // SequenceI loc = dss.getSubSequence(sf, st);
-                          if (mappedrg.getLength() > 0
-                                  && ms.getSequenceAsString().equals(
-                                          dss.getSequenceAsString()))
-                          // && mappedrg.getSequenceAsString().equals(
-                          // loc.getSequenceAsString()))
+                          for (SequenceFeature feat : sfs)
                            {
-                            String msg = "Mapping updated from "
-                                    + ms.getName()
-                                    + " to retrieved crossreference "
-                                    + dss.getName();
-                            System.out.println(msg);
-                            // method to update all refs of existing To on
-                            // retrieved sequence with dss and merge any props
-                            // on To onto dss.
-                            map.setTo(dss);
                              /*
-                             * copy sequence features as well, avoiding
-                             * duplication (e.g. same variation from 2 
-                             * transcripts)
+                             * make a flyweight feature object which ignores Parent
+                             * attribute in equality test, to avoid creating many
+                             * otherwise duplicate exon features on genomic sequence
                               */
-                            SequenceFeature[] sfs = ms
-                                    .getSequenceFeatures();
-                            if (sfs != null)
+                            SequenceFeature newFeature = new SequenceFeature(
+                                    feat)
                              {
-                              for (SequenceFeature feat : sfs)
+                              @Override
+                              public boolean equals(Object o)
                                {
-                                /* 
-                                 * we override SequenceFeature.equals here (but
-                                 * not elsewhere) to ignore Parent attribute
-                                 * TODO not quite working yet!
-                                 */
-                                if (!copiedFeatures
-                                        .contains(me.new MySequenceFeature(
-                                                feat)))
-                                {
-                                  dss.addSequenceFeature(feat);
-                                  copiedFeatures.add(feat);
-                                }
+                                return super.equals(o, true);
                                }
-                            }
-                            cf.addMap(retrieved[rs].getDatasetSequence(),
-                                    dss, map.getMap());
-                          }
-                          // TODO remove this 'else' and the cf.addMap above?
-                          else
-                          {
-                            cf.addMap(retrieved[rs].getDatasetSequence(),
-                                    map.getTo(), map.getMap());
+                            };
+                            dss.addSequenceFeature(newFeature);
                            }
-                        } catch (Exception e)
-                        {
-                          System.err
-                                  .println("Exception when consolidating Mapped sequence set...");
-                          e.printStackTrace(System.err);
                          }
                        }
+                      cf.addMap(retrievedDss, map.getTo(), map.getMap());
+                    } catch (Exception e)
+                    {
+                      System.err
+                              .println("Exception when consolidating Mapped sequence set...");
+                      e.printStackTrace(System.err);
                      }
                    }
                  }
-                retrieved[rs].updatePDBIds();
-                rseqs.add(retrieved[rs]);
                }
              }
+            retrievedSequence.updatePDBIds();
+            rseqs.add(retrievedSequence);
+            dataset.addSequence(retrievedDss);
+            matcher.add(retrievedSequence);
            }
          }
        }
@@ -441,17 +446,81 @@ public class CrossRef
    }
  
    /**
+   * Returns the first identical sequence in the dataset if any, else null
+   * 
+   * @param mappedTo
+   * @return
+   */
+  SequenceI findInDataset(SequenceI mappedTo)
+  {
+    if (mappedTo == null)
+    {
+      return null;
+    }
+    SequenceI dss = mappedTo.getDatasetSequence() == null ? mappedTo
+            : mappedTo.getDatasetSequence();
+    for (SequenceI seq : dataset.getSequences())
+    {
+      if (sameSequence(seq, dss))
+      {
+        return seq;
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Answers true if seq1 and seq2 contain exactly the same characters (ignoring
+   * case), else false. This method compares the lengths, then each character in
+   * turn, in order to 'fail fast'. For case-sensitive comparison, it would be
+   * possible to use Arrays.equals(seq1.getSequence(), seq2.getSequence()).
+   * 
+   * @param seq1
+   * @param seq2
+   * @return
+   */
+  // TODO move to Sequence / SequenceI
+  static boolean sameSequence(SequenceI seq1, SequenceI seq2)
+  {
+    if (seq1 == seq2)
+    {
+      return true;
+    }
+    if (seq1 == null || seq2 == null)
+    {
+      return false;
+    }
+    char[] c1 = seq1.getSequence();
+    char[] c2 = seq2.getSequence();
+    if (c1.length != c2.length)
+    {
+      return false;
+    }
+    for (int i = 0; i < c1.length; i++)
+    {
+      int diff = c1[i] - c2[i];
+      /*
+       * same char or differ in case only ('a'-'A' == 32)
+       */
+      if (diff != 0 && diff != 32 && diff != -32)
+      {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
     * Updates any empty mappings in the cross-references with one to a compatible
     * retrieved sequence if found, and adds any new mappings to the
     * AlignedCodonFrame
     * 
-   * @param dna
     * @param mapFrom
     * @param xrefs
     * @param retrieved
     * @param acf
     */
-  static void updateDbrefMappings(boolean dna, SequenceI mapFrom,
+  void updateDbrefMappings(SequenceI mapFrom,
            DBRefEntry[] xrefs, SequenceI[] retrieved, AlignedCodonFrame acf)
    {
      SequenceIdMatcher matcher = new SequenceIdMatcher(retrieved);
@@ -468,55 +537,69 @@ public class CrossRef
          }
          for (SequenceI seq : matches)
          {
-          MapList mapping = null;
-          if (dna)
-          {
-            mapping = AlignmentUtils.mapCdnaToProtein(seq, mapFrom);
-          }
-          else
-          {
-            mapping = AlignmentUtils.mapCdnaToProtein(mapFrom, seq);
-            if (mapping != null)
-            {
-              mapping = mapping.getInverse();
-            }
-          }
-          if (mapping != null)
-          {
-            xref.setMap(new Mapping(seq, mapping));
-            if (dna)
-            {
-              AlignmentUtils.computeProteinFeatures(mapFrom, seq, mapping);
-            }
-            if (dna)
-            {
-              acf.addMap(mapFrom, seq, mapping);
-            }
-            else
-            {
-              acf.addMap(seq, mapFrom, mapping.getInverse());
-            }
-            continue;
-          }
+          constructMapping(mapFrom, seq, xref, acf);
          }
        }
      }
    }
  
    /**
+   * Tries to make a mapping from dna to protein. If successful, adds the
+   * mapping to the dbref and the mappings collection and answers true,
+   * otherwise answers false.
+   * 
+   * @param mapFrom
+   * @param mapTo
+   * @param xref
+   * @param mappings
+   * @return
+   */
+  boolean constructMapping(SequenceI mapFrom, SequenceI mapTo,
+          DBRefEntry xref, AlignedCodonFrame mappings)
+  {
+    MapList mapping = null;
+    if (fromDna)
+    {
+      mapping = AlignmentUtils.mapCdnaToProtein(mapTo, mapFrom);
+    }
+    else
+    {
+      mapping = AlignmentUtils.mapCdnaToProtein(mapFrom, mapTo);
+      if (mapping != null)
+      {
+        mapping = mapping.getInverse();
+      }
+    }
+    if (mapping == null)
+    {
+      return false;
+    }
+    xref.setMap(new Mapping(mapTo, mapping));
+    if (fromDna)
+    {
+      AlignmentUtils.computeProteinFeatures(mapFrom, mapTo, mapping);
+      mappings.addMap(mapFrom, mapTo, mapping);
+    }
+    else
+    {
+      mappings.addMap(mapTo, mapFrom, mapping.getInverse());
+    }
+
+    return true;
+  }
+
+  /**
     * find references to lrfs in the cross-reference set of each sequence in
     * dataset (that is not equal to sequenceI) Identifies matching DBRefEntry
     * based on source and accession string only - Map and Version are nulled.
     * 
     * @param sequenceI
     * @param lrfs
-   * @param dataset
     * @param rseqs
     * @return true if matches were found.
     */
-  private static boolean searchDatasetXrefs(SequenceI sequenceI,
-          boolean dna, DBRefEntry[] lrfs, AlignmentI dataset,
-          List<SequenceI> rseqs, AlignedCodonFrame cf)
+  private boolean searchDatasetXrefs(SequenceI sequenceI,
+          DBRefEntry[] lrfs, List<SequenceI> rseqs, AlignedCodonFrame cf)
    {
      boolean found = false;
      if (lrfs == null)
@@ -529,8 +612,7 @@ public class CrossRef
        // add in wildcards
        xref.setVersion(null);
        xref.setMap(null);
-      found |= searchDataset(sequenceI, xref, dataset, rseqs, cf, false,
-              dna);
+      found |= searchDataset(sequenceI, xref, rseqs, cf, false);
      }
      return found;
    }
@@ -543,21 +625,16 @@ public class CrossRef
     *          a sequence to ignore (start point of search)
     * @param xrf
     *          a cross-reference to try to match
-   * @param dataset
-   *          sequences to search in
     * @param rseqs
     *          result list to add to
     * @param cf
     *          a set of sequence mappings to add to
     * @param direct
     *          - search all references or only subset
-   * @param dna
-   *          search dna or protein xrefs (if direct=false)
     * @return true if relationship found and sequence added.
     */
-  public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
-          AlignmentI dataset, List<SequenceI> rseqs, AlignedCodonFrame cf,
-          boolean direct, boolean dna)
+  boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
+          List<SequenceI> rseqs, AlignedCodonFrame cf, boolean direct)
    {
      boolean found = false;
      if (dataset == null)
@@ -585,16 +662,14 @@ public class CrossRef
            {
              continue;
            }
-          // check if this is the correct sequence type
+          /*
+           * only look at same molecule type if 'direct', or
+           * complementary type if !direct
+           */
            {
-            // TODO 'direct' is always set to false - remove?
-            // or should it be 'true' from findXrefSequences?
-            // also its Javadoc conflicts with its use:
-            // test below implies 'direct' means find complementary sequences,
-            // !direct means select same molecule type
              boolean isDna = Comparison
                      .isNucleotide(new SequenceI[] { nxt });
-            if ((direct && isDna == dna) || (!direct && isDna != dna))
+            if (direct ? (isDna != fromDna) : (isDna == fromDna))
              {
                // skip this sequence because it is wrong molecule type
                continue;
@@ -616,7 +691,7 @@ public class CrossRef
            }
            else
            {
-            poss = DBRefUtils.selectDbRefs(!dna, poss);
+            poss = DBRefUtils.selectDbRefs(!fromDna, poss);
              cands = DBRefUtils.searchRefs(poss, xrf);
            }
            if (!cands.isEmpty())
@@ -639,7 +714,7 @@ public class CrossRef
                      {
                        // get sense of map correct for adding to product
                        // alignment.
-                      if (dna)
+                      if (fromDna)
                        {
                          // map is from dna seq to a protein product
                          cf.addMap(sequenceI, nxt, map);
diff --git a/src/jalview/analysis/CrossRefs.java b/src/jalview/analysis/CrossRefs.java

index 0f3f425..691e972 100644 (file)
--- a/src/jalview/analysis/CrossRefs.java
+++ b/src/jalview/analysis/CrossRefs.java
@@ -1,5 +1,6 @@
  package jalview.analysis;
  
+import jalview.analysis.CrossRef.MySequenceFeature;
  import jalview.datamodel.AlignedCodonFrame;
  import jalview.datamodel.Alignment;
  import jalview.datamodel.AlignmentI;
@@ -20,6 +21,27 @@ import java.util.List;
  
  public class CrossRefs
  {
+  /*
+   * A sub-class that ignores Parent attribute when comparing sequence 
+   * features. This avoids 'duplicate' CDS features that only
+   * differ in their parent Transcript ids.
+   */
+  class MySequenceFeature extends SequenceFeature
+  {
+    private SequenceFeature feat;
+  
+    MySequenceFeature(SequenceFeature sf)
+    {
+      this.feat = sf;
+    }
+  
+    @Override
+    public boolean equals(Object o)
+    {
+      return feat.equals(o, true);
+    }
+  }
+
    /**
     * Finds cross-references for sequences from a specified source database.
     * These may be found in four ways:
@@ -46,44 +68,71 @@ public class CrossRefs
    public static AlignmentI findXrefSequences(SequenceI[] seqs, boolean dna,
            String source, AlignmentI dataset)
    {
-    List<SequenceI> foundSeqs = new ArrayList<SequenceI>();
-    AlignedCodonFrame mappings = new AlignedCodonFrame();
-
-    List<DBRefEntry> sourceRefs = new ArrayList<DBRefEntry>();
-
+    /*
+     * filter to only those sequences of the right type (nucleotide/protein)
+     */
+    List<SequenceI> fromSeqs = new ArrayList<SequenceI>();
      for (SequenceI seq : seqs)
      {
-      if (dna != Comparison.isNucleotide(seq))
+      if (dna == Comparison.isNucleotide(seq))
        {
-        /*
-         * mixed alignment, and this sequence is of the wrong type
-         */
-        continue;
+        fromSeqs.add(seq);
        }
+    }
+    return findXrefSequences(fromSeqs, dna, source, dataset);
+  }
+
+  /**
+   * Finds cross-references for sequences from a specified source database.
+   * These may be found in four ways:
+   * <ul>
+   * <li>as a DBRefEntry on the known sequence, which has a mapped-to sequence</li>
+   * <li>a sequence of complementary type in the alignment dataset, which has a
+   * DBRefEntry to one of the known sequence's 'direct' DBRefs</li>
+   * <li>a sequence of complementary type in the alignment, which has a
+   * DBRefEntry to one of the known sequence's 'cross-ref' DBRefs</li>
+   * <li>by fetching the accession from the remote database</li>
+   * </ul>
+   * 
+   * @param seqs
+   *          the sequences whose cross-references we are searching for,
+   *          filtered to only those which are of the type denoted by 'dna'
+   * @param dna
+   *          true if the sequences are from a nucleotide alignment, else false
+   * @param source
+   *          the database source we want cross-references to
+   * @param dataset
+   *          the alignment dataset the sequences belong to
+   * @return an alignment containing cross-reference sequences, or null if none
+   *         found
+   */
+  static AlignmentI findXrefSequences(List<SequenceI> fromSeqs,
+          boolean dna, String source, AlignmentI dataset)
+  {
+    List<SequenceI> foundSeqs = new ArrayList<SequenceI>();
+    AlignedCodonFrame mappings = new AlignedCodonFrame();
  
-      /*
-       * get this sequence's dbrefs to source database (if any)
-       */
-      List<DBRefEntry> seqSourceRefs = DBRefUtils.searchRefsForSource(
-              seq.getDBRefs(), source);
+    List<DBRefEntry> unresolvedRefs = new ArrayList<DBRefEntry>();
  
-      /*
-       * first extract any mapped sequences from sourceRefs
-       */
-      findMappedDbrefs(seq, seqSourceRefs, foundSeqs, mappings);
+    /*
+     * first extract any mapped sequences from sourceRefs
+     * if successful, sequence is removed from fromSeqs
+     * if unsuccessful, dbrefs are added to unresolvedRefs
+     */
+    findMappedDbrefs(fromSeqs, source, foundSeqs,
+            unresolvedRefs, mappings);
  
-      /*
-       * for remaining sourceRefs, try to match a 
-       * complementary sequence in the dataset
-       */
-      findIndirectCrossReferences(seq, source, seqSourceRefs, dataset,
-              foundSeqs, mappings);
-    }
+    /*
+     * then search the alignment dataset for dbref resolutions
+     */
+    findIndirectCrossReferences(fromSeqs, source, dataset, foundSeqs,
+            unresolvedRefs, mappings);
  
      /*
       * fetch any remaining sourceRefs from the source database
       */
-    fetchCrossReferences(sourceRefs, foundSeqs, mappings, dna, dataset);
+    fetchCrossReferences(fromSeqs, unresolvedRefs, foundSeqs, mappings,
+            dna, dataset);
  
      if (foundSeqs.isEmpty())
      {
@@ -98,52 +147,90 @@ public class CrossRefs
    /**
     * Looks for DBRefEntrys to 'source' which have a mapping to a sequence. If
     * found, adds the sequence to foundSeqs and removes the dbref from the list.
+   * DBRefs with no mapping are added to the 'unresolvedRefs' list (setting
+   * version number to 0 i.e. use source and accession only).
     * 
-   * @param seq
-   *          the dataset sequence we are searching from
-   * @param sourceRefs
-   *          the sequence's dbrefs to 'source'
+   * @param fromSeqs
+   *          the dataset sequences we are searching from
+   * @param source
+   *          the database source we are searching dbrefs for
     * @param foundSeqs
-   *          a list of cross-references to add to
+   *          a list of found sequences to add to
+   * @param unresolvedRefs
+   *          a list of unresolved cross-references to add to
     * @param mappings
     *          a set of sequence mappings to add to
     * @return
     */
-  static void findMappedDbrefs(SequenceI seq, List<DBRefEntry> sourceRefs,
-          List<SequenceI> foundSeqs, AlignedCodonFrame mappings)
+  static void findMappedDbrefs(List<SequenceI> fromSeqs, String source,
+          List<SequenceI> foundSeqs, List<DBRefEntry> unresolvedRefs,
+          AlignedCodonFrame mappings)
    {
-    Iterator<DBRefEntry> refs = sourceRefs.iterator();
-    while (refs.hasNext())
+    Iterator<SequenceI> it = fromSeqs.iterator();
+    while (it.hasNext())
      {
-      DBRefEntry dbref = refs.next();
-      Mapping map = dbref.getMap();
-      if (map != null)
+      SequenceI seq = it.next();
+      SequenceI dss = seq.getDatasetSequence();
+      dss = dss == null ? seq : dss;
+
+      DBRefEntry[] dbRefs = seq.getDBRefs();
+      if (dbRefs == null)
+      {
+        continue;
+      }
+      boolean resolved = false;
+      for (DBRefEntry dbref : dbRefs)
        {
-        SequenceI mappedTo = map.getTo();
-        if (mappedTo != null)
+        if (!source.equals(dbref.getSource()))
          {
-          foundSeqs.add(new Sequence(mappedTo));
-          refs.remove();
-      
-          /*
-           * check mapping is not 'direct' (it shouldn't be if we reach here)
-           * and add mapping (dna-to-peptide or vice versa) to the set
-           */
-          MapList mapList = map.getMap();
-          int fromRatio = mapList.getFromRatio();
-          int toRatio = mapList.getToRatio();
-          if (fromRatio != toRatio)
+          continue;
+        }
+        DBRefEntry todo = new DBRefEntry(dbref.getSource(), "0",
+                dbref.getAccessionId());
+        Mapping map = dbref.getMap();
+        if (map != null)
+        {
+          unresolvedRefs.remove(todo);
+          resolved = true;
+          SequenceI mappedTo = map.getTo();
+          if (mappedTo != null)
            {
-            if (fromRatio == 3)
-            {
-              mappings.addMap(seq, mappedTo, mapList);
-            }
-            else
+            foundSeqs.add(new Sequence(mappedTo));
+
+            /*
+             * check mapping is not 'direct' (it shouldn't be if we reach here)
+             * and add mapping (dna-to-peptide or vice versa) to the set
+             */
+            MapList mapList = map.getMap();
+            int fromRatio = mapList.getFromRatio();
+            int toRatio = mapList.getToRatio();
+            if (fromRatio != toRatio)
              {
-              mappings.addMap(mappedTo, seq, mapList.getInverse());
+              if (fromRatio == 3)
+              {
+                mappings.addMap(dss, mappedTo, mapList);
+              }
+              else
+              {
+                mappings.addMap(mappedTo, dss, mapList.getInverse());
+              }
              }
            }
          }
+        else
+        {
+          /*
+           * no mapping to resolve dbref - add source+accession to list to resolve
+           */
+          if (!unresolvedRefs.contains(todo))
+          {
+            unresolvedRefs.add(todo);
+          }
+        }
+      }
+      if (resolved)
+      {
+        it.remove();
        }
      }
    }
@@ -153,13 +240,13 @@ public class CrossRefs
     * to the foundSeqs list. If found, tries to make a mapping between seq and
     * the retrieved sequence and insert it into the database reference.
     * 
-   * @param seq
+   * @param fromSeqs
     * @param sourceRefs
     * @param foundSeqs
     * @param mappings
     * @param dna
     */
-  static void fetchCrossReferences(SequenceI seq,
+  static void fetchCrossReferences(List<SequenceI> fromSeqs,
            List<DBRefEntry> sourceRefs, List<SequenceI> foundSeqs,
            AlignedCodonFrame mappings, boolean dna, AlignmentI dataset)
    {
@@ -170,116 +257,116 @@ public class CrossRefs
        retrieved = sftch.getSequences(sourceRefs, !dna);
      } catch (Exception e)
      {
-      System.err
-              .println("Problem whilst retrieving cross references for Sequence : "
-                      + seq.getName());
+      System.err.println("Problem whilst retrieving cross references: "
+              + e.getMessage());
        e.printStackTrace();
        return;
      }
  
-    if (retrieved != null)
+    if (retrieved == null)
      {
-      updateDbrefMappings(dna, seq, sourceRefs, retrieved, mappings);
+      return;
+    }
+    updateDbrefMappings(dna, fromSeqs, sourceRefs, retrieved, mappings);
  
-      SequenceIdMatcher matcher = new SequenceIdMatcher(
-              dataset.getSequences());
-      List<SequenceFeature> copiedFeatures = new ArrayList<SequenceFeature>();
-      CrossRef me = new CrossRef();
-      for (int rs = 0; rs < retrieved.length; rs++)
+    SequenceIdMatcher matcher = new SequenceIdMatcher(
+            dataset.getSequences());
+    List<SequenceFeature> copiedFeatures = new ArrayList<SequenceFeature>();
+    CrossRefs me = new CrossRefs();
+    for (int rs = 0; rs < retrieved.length; rs++)
+    {
+      // TODO: examine each sequence for 'redundancy'
+      DBRefEntry[] dbr = retrieved[rs].getDBRefs();
+      if (dbr != null && dbr.length > 0)
        {
-        // TODO: examine each sequence for 'redundancy'
-        DBRefEntry[] dbr = retrieved[rs].getDBRefs();
-        if (dbr != null && dbr.length > 0)
+        for (int di = 0; di < dbr.length; di++)
          {
-          for (int di = 0; di < dbr.length; di++)
+          // find any entry where we should put in the sequence being
+          // cross-referenced into the map
+          Mapping map = dbr[di].getMap();
+          if (map != null)
            {
-            // find any entry where we should put in the sequence being
-            // cross-referenced into the map
-            Mapping map = dbr[di].getMap();
-            if (map != null)
+            if (map.getTo() != null && map.getMap() != null)
              {
-              if (map.getTo() != null && map.getMap() != null)
+              SequenceI matched = matcher.findIdMatch(map.getTo());
+              if (matched != null)
                {
-                SequenceI matched = matcher.findIdMatch(map.getTo());
-                if (matched != null)
-                {
-                  /*
-                   * already got an xref to this sequence; update this
-                   * map to point to the same sequence, and add
-                   * any new dbrefs to it
-                   */
-                  for (DBRefEntry ref : map.getTo().getDBRefs())
-                  {
-                    matched.addDBRef(ref); // add or update mapping
-                  }
-                  map.setTo(matched);
-                }
-                else
+                /*
+                 * already got an xref to this sequence; update this
+                 * map to point to the same sequence, and add
+                 * any new dbrefs to it
+                 */
+                for (DBRefEntry ref : map.getTo().getDBRefs())
                  {
-                  matcher.add(map.getTo());
+                  matched.addDBRef(ref); // add or update mapping
                  }
-                try
+                map.setTo(matched);
+              }
+              else
+              {
+                matcher.add(map.getTo());
+              }
+              try
+              {
+                // compare ms with dss and replace with dss in mapping
+                // if map is congruent
+                SequenceI ms = map.getTo();
+                int sf = map.getMap().getToLowest();
+                int st = map.getMap().getToHighest();
+                SequenceI mappedrg = ms.getSubSequence(sf, st);
+                // SequenceI loc = dss.getSubSequence(sf, st);
+                if (mappedrg.getLength() > 0
+                        && ms.getSequenceAsString().equals(
+                                fromSeqs.getSequenceAsString()))
+                // && mappedrg.getSequenceAsString().equals(
+                // loc.getSequenceAsString()))
                  {
-                  // compare ms with dss and replace with dss in mapping
-                  // if map is congruent
-                  SequenceI ms = map.getTo();
-                  int sf = map.getMap().getToLowest();
-                  int st = map.getMap().getToHighest();
-                  SequenceI mappedrg = ms.getSubSequence(sf, st);
-                  // SequenceI loc = dss.getSubSequence(sf, st);
-                  if (mappedrg.getLength() > 0
-                          && ms.getSequenceAsString().equals(
-                                  seq.getSequenceAsString()))
-                  // && mappedrg.getSequenceAsString().equals(
-                  // loc.getSequenceAsString()))
+                  String msg = "Mapping updated from " + ms.getName()
+                          + " to retrieved crossreference "
+                          + fromSeqs.getName();
+                  System.out.println(msg);
+                  // method to update all refs of existing To on
+                  // retrieved sequence with dss and merge any props
+                  // on To onto dss.
+                  map.setTo(fromSeqs);
+                  /*
+                   * copy sequence features as well, avoiding
+                   * duplication (e.g. same variation from 2 
+                   * transcripts)
+                   */
+                  SequenceFeature[] sfs = ms.getSequenceFeatures();
+                  if (sfs != null)
                    {
-                    String msg = "Mapping updated from " + ms.getName()
-                            + " to retrieved crossreference "
-                            + seq.getName();
-                    System.out.println(msg);
-                    // method to update all refs of existing To on
-                    // retrieved sequence with dss and merge any props
-                    // on To onto dss.
-                    map.setTo(seq);
-                    /*
-                     * copy sequence features as well, avoiding
-                     * duplication (e.g. same variation from 2 
-                     * transcripts)
-                     */
-                    SequenceFeature[] sfs = ms.getSequenceFeatures();
-                    if (sfs != null)
+                    for (SequenceFeature feat : sfs)
                      {
-                      for (SequenceFeature feat : sfs)
+                      /* 
+                       * we override SequenceFeature.equals here (but
+                       * not elsewhere) to ignore Parent attribute
+                       * TODO not quite working yet!
+                       */
+                      if (!copiedFeatures
+                              .contains(me.new MySequenceFeature(feat)))
                        {
-                        /* 
-                         * we override SequenceFeature.equals here (but
-                         * not elsewhere) to ignore Parent attribute
-                         * TODO not quite working yet!
-                         */
-                        if (!copiedFeatures
-                                .contains(me.new MySequenceFeature(feat)))
-                        {
-                          seq.addSequenceFeature(feat);
-                          copiedFeatures.add(feat);
-                        }
+                        fromSeqs.addSequenceFeature(feat);
+                        copiedFeatures.add(feat);
                        }
                      }
                    }
-                  mappings.addMap(retrieved[rs].getDatasetSequence(),
-                          map.getTo(), map.getMap());
-                } catch (Exception e)
-                {
-                  System.err
-                          .println("Exception when consolidating Mapped sequence set...");
-                  e.printStackTrace(System.err);
                  }
+                mappings.addMap(retrieved[rs].getDatasetSequence(),
+                        map.getTo(), map.getMap());
+              } catch (Exception e)
+              {
+                System.err
+                        .println("Exception when consolidating Mapped sequence set...");
+                e.printStackTrace(System.err);
                }
              }
            }
          }
-        retrieved[rs].updatePDBIds();
-        foundSeqs.add(retrieved[rs]);
        }
+      retrieved[rs].updatePDBIds();
+      foundSeqs.add(retrieved[rs]);
      }
    }
  
@@ -288,24 +375,27 @@ public class CrossRefs
     * shares a DBRefEntry with it. If found, adds the sequence to foundSeqs and
     * removes the resolved sourceRef from the search list.
     * 
-   * @param seq
+   * @param fromSeqs
     * @param source
-   * @param sourceRefs
-   * @param dataset
+   * @param unresolvedRefs
     * @param foundSeqs
+   * @param unresolvedRefs
     * @param mappings
     * @return
     */
-  static void findIndirectCrossReferences(SequenceI seq, String source,
-          List<DBRefEntry> sourceRefs, AlignmentI dataset,
-          List<SequenceI> foundSeqs, AlignedCodonFrame mappings)
+  static void findIndirectCrossReferences(List<SequenceI> fromSeqs,
+          String source, AlignmentI dataset,
+          List<SequenceI> foundSeqs, List<DBRefEntry> unresolvedRefs,
+          AlignedCodonFrame mappings)
    {
-    Iterator<DBRefEntry> refs = sourceRefs.iterator();
+    Iterator<DBRefEntry> refs = unresolvedRefs.iterator();
      while (refs.hasNext())
      {
        DBRefEntry dbref = refs.next();
-      boolean found = searchDatasetForCrossReference(seq, dbref, dataset,
-              foundSeqs, mappings);
+      boolean found = false;
+      // boolean found = searchDatasetForCrossReference(fromSeqs, dbref,
+      // foundSeqs,
+      // unresolvedRefs, mappings);
        if (found)
        {
          refs.remove();
@@ -427,12 +517,12 @@ public class CrossRefs
     * AlignedCodonFrame
     * 
     * @param dna
-   * @param mapFrom
+   * @param fromSeqs
     * @param xrefs
     * @param retrieved
     * @param mappings
     */
-  static void updateDbrefMappings(boolean dna, SequenceI mapFrom,
+  static void updateDbrefMappings(boolean dna, List<SequenceI> fromSeqs,
            List<DBRefEntry> xrefs, SequenceI[] retrieved,
            AlignedCodonFrame mappings)
    {
@@ -453,11 +543,11 @@ public class CrossRefs
            MapList mapping = null;
            if (dna)
            {
-            mapping = AlignmentUtils.mapCdnaToProtein(seq, mapFrom);
+            mapping = AlignmentUtils.mapCdnaToProtein(seq, fromSeqs);
            }
            else
            {
-            mapping = AlignmentUtils.mapCdnaToProtein(mapFrom, seq);
+            mapping = AlignmentUtils.mapCdnaToProtein(fromSeqs, seq);
              if (mapping != null)
              {
                mapping = mapping.getInverse();
@@ -468,15 +558,15 @@ public class CrossRefs
              xref.setMap(new Mapping(seq, mapping));
              if (dna)
              {
-              AlignmentUtils.computeProteinFeatures(mapFrom, seq, mapping);
+              AlignmentUtils.computeProteinFeatures(fromSeqs, seq, mapping);
              }
              if (dna)
              {
-              mappings.addMap(mapFrom, seq, mapping);
+              mappings.addMap(fromSeqs, seq, mapping);
              }
              else
              {
-              mappings.addMap(seq, mapFrom, mapping.getInverse());
+              mappings.addMap(seq, fromSeqs, mapping.getInverse());
              }
              continue;
            }
diff --git a/src/jalview/analysis/SequenceIdMatcher.java b/src/jalview/analysis/SequenceIdMatcher.java

index 3fd0581..c12de4e 100755 (executable)
--- a/src/jalview/analysis/SequenceIdMatcher.java
+++ b/src/jalview/analysis/SequenceIdMatcher.java
@@ -290,7 +290,7 @@ public class SequenceIdMatcher
      {
        if (s != null)
        {
-        id = new String(s.toLowerCase());
+        id = s.toLowerCase();
        }
        else
        {
@@ -357,5 +357,15 @@ public class SequenceIdMatcher
                  .indexOf(s.charAt(id.length())) > -1)) : false;
        }
      }
+
+    /**
+     * toString method returns the wrapped sequence id. For debugging purposes
+     * only, behaviour not guaranteed not to change.
+     */
+    @Override
+    public String toString()
+    {
+      return id;
+    }
    }
  }
diff --git a/src/jalview/datamodel/xdb/embl/EmblEntry.java b/src/jalview/datamodel/xdb/embl/EmblEntry.java

index cfe87d9..a0e1234 100644 (file)
--- a/src/jalview/datamodel/xdb/embl/EmblEntry.java
+++ b/src/jalview/datamodel/xdb/embl/EmblEntry.java
@@ -212,6 +212,10 @@ public class EmblEntry
          {
            for (DBRefEntry dbref : feature.dbRefs)
            {
+            /*
+             * convert UniProtKB/Swiss-Prot to UNIPROT
+             */
+            dbref.setSource(DBRefUtils.getCanonicalName(dbref.getSource()));
              dna.addDBRef(dbref);
            }
          }
@@ -419,14 +423,13 @@ public class EmblEntry
      }
  
      /*
-     * add dbRefs to sequence, and mappings for Uniprot xrefs
+     * add mappings for Uniprot xrefs
       */
      if (feature.dbRefs != null)
      {
        boolean mappingUsed = false;
        for (DBRefEntry ref : feature.dbRefs)
        {
-        ref.setSource(DBRefUtils.getCanonicalName(ref.getSource()));
          if (ref.getSource().equals(DBRefSource.UNIPROT))
          {
            String proteinSeqName = DBRefSource.UNIPROT + "|"
@@ -482,7 +485,6 @@ public class EmblEntry
              }
            }
          }
-        dna.addDBRef(ref);
        }
        if (noProteinDbref && product != null)
        {
diff --git a/src/jalview/gui/AlignFrame.java b/src/jalview/gui/AlignFrame.java

index 39e4bcd..751bf4d 100644 (file)
--- a/src/jalview/gui/AlignFrame.java
+++ b/src/jalview/gui/AlignFrame.java
@@ -23,7 +23,6 @@ package jalview.gui;
  import jalview.analysis.AlignmentSorter;
  import jalview.analysis.AlignmentUtils;
  import jalview.analysis.CrossRef;
-import jalview.analysis.CrossRefs;
  import jalview.analysis.Dna;
  import jalview.analysis.ParseProperties;
  import jalview.analysis.SequenceIdMatcher;
@@ -4634,22 +4633,25 @@ public class AlignFrame extends GAlignFrame implements DropTargetListener,
    }
  
    /**
-   * Searches selected sequences for xRef products and builds the Show
-   * Cross-References menu (formerly called Show Products)
+   * Searches the alignment sequences for xRefs and builds the Show
+   * Cross-References menu (formerly called Show Products), with database
+   * sources for which cross-references are found (protein sources for a
+   * nucleotide alignment and vice versa)
     * 
-   * @return true if Show Cross-references menu should be enabled.
+   * @return true if Show Cross-references menu should be enabled
     */
    public boolean canShowProducts()
    {
-    SequenceI[] selection = viewport.getSequenceSelection();
+    SequenceI[] seqs = viewport.getAlignment().getSequencesArray();
      AlignmentI dataset = viewport.getAlignment().getDataset();
      boolean showp = false;
      try
      {
        showProducts.removeAll();
        final boolean dna = viewport.getAlignment().isNucleotide();
-      List<String> ptypes = (selection == null || selection.length == 0) ? null
-              : CrossRef.findXrefSourcesForSequences(dna, selection, dataset);
+      List<String> ptypes = (seqs == null || seqs.length == 0) ? null
+              : new CrossRef(seqs, dataset)
+                      .findXrefSourcesForSequences();
  
        for (final String source : ptypes)
        {
@@ -4706,8 +4708,8 @@ public class AlignFrame extends GAlignFrame implements DropTargetListener,
          {
            AlignmentI alignment = AlignFrame.this.getViewport()
                    .getAlignment();
-          AlignmentI xrefs = CrossRefs.findXrefSequences(sel, dna, source,
-                  alignment);
+          AlignmentI xrefs = new CrossRef(sel, alignment)
+                  .findXrefSequences(source);
            if (xrefs != null)
            {
              /*
diff --git a/src/jalview/util/MapList.java b/src/jalview/util/MapList.java

index e51442c..ab811a9 100644 (file)
--- a/src/jalview/util/MapList.java
+++ b/src/jalview/util/MapList.java
@@ -215,7 +215,7 @@ public class MapList
      {
        /*
         * note lowest and highest values - bearing in mind the
-       * direction may be revesed
+       * direction may be reversed
         */
        fromLowest = Math.min(fromLowest, Math.min(from[i], from[i + 1]));
        fromHighest = Math.max(fromHighest, Math.max(from[i], from[i + 1]));
diff --git a/test/jalview/analysis/CrossRefTest.java b/test/jalview/analysis/CrossRefTest.java

index 31f9728..b2720f2 100644 (file)
--- a/test/jalview/analysis/CrossRefTest.java
+++ b/test/jalview/analysis/CrossRefTest.java
@@ -22,6 +22,7 @@ package jalview.analysis;
  
  import static org.testng.AssertJUnit.assertEquals;
  import static org.testng.AssertJUnit.assertFalse;
+import static org.testng.AssertJUnit.assertNotNull;
  import static org.testng.AssertJUnit.assertNotSame;
  import static org.testng.AssertJUnit.assertNull;
  import static org.testng.AssertJUnit.assertSame;
@@ -101,7 +102,8 @@ public class CrossRefTest
      /*
       * first with no dbrefs to search
       */
-    CrossRef.findXrefSourcesForSequence(seq, false, al, sources);
+    sources = new CrossRef(new SequenceI[] { seq }, al)
+            .findXrefSourcesForSequences();
      assertTrue(sources.isEmpty());
  
      /*
@@ -117,7 +119,8 @@ public class CrossRefTest
      seq.addDBRef(new DBRefEntry("GENEDB", "0", "E2348"));
      seq.addDBRef(new DBRefEntry("ENSEMBL", "0", "E2349"));
      seq.addDBRef(new DBRefEntry("ENSEMBLGENOMES", "0", "E2350"));
-    CrossRef.findXrefSourcesForSequence(seq, false, al, sources);
+    sources = new CrossRef(new SequenceI[] { seq }, al)
+            .findXrefSourcesForSequences();
      assertEquals(4, sources.size());
      assertEquals("[EMBL, EMBLCDS, GENEDB, ENSEMBL]",
              sources.toString());
@@ -136,7 +139,8 @@ public class CrossRefTest
      seq2.addDBRef(new DBRefEntry("GENEDB", "0", "E2348"));
      // TODO include ENSEMBLGENOMES in DBRefSource.DNACODINGDBS ?
      al.addSequence(seq2);
-    CrossRef.findXrefSourcesForSequence(seq, false, al, sources);
+    sources = new CrossRef(new SequenceI[] { seq }, al)
+            .findXrefSourcesForSequences();
      assertEquals(3, sources.size());
      assertEquals("[EMBLCDS, EMBL, GENEDB]", sources.toString());
    }
@@ -166,8 +170,8 @@ public class CrossRefTest
       * - but peptide with matching nucleotide dbref does, so is returned
       */
      AlignmentI al = new Alignment(new SequenceI[] { emblSeq, uniprotSeq });
-    Alignment xrefs = CrossRef.findXrefSequences(
-            new SequenceI[] { emblSeq }, true, "UNIPROT", al);
+    Alignment xrefs = new CrossRef(new SequenceI[] { emblSeq }, al)
+            .findXrefSequences("UNIPROT");
      assertEquals(1, xrefs.getHeight());
      assertSame(uniprotSeq, xrefs.getSequenceAt(0));
    }
@@ -201,8 +205,8 @@ public class CrossRefTest
       * - but nucleotide with matching peptide dbref does, so is returned
       */
      AlignmentI al = new Alignment(new SequenceI[] { emblSeq, uniprotSeq });
-    Alignment xrefs = CrossRef.findXrefSequences(
-            new SequenceI[] { uniprotSeq }, false, "EMBL", al);
+    Alignment xrefs = new CrossRef(new SequenceI[] { uniprotSeq },
+            al).findXrefSequences("EMBL");
      assertEquals(1, xrefs.getHeight());
      assertSame(emblSeq, xrefs.getSequenceAt(0));
    }
@@ -228,8 +232,8 @@ public class CrossRefTest
       * equatable to it, so no results found
       */
      AlignmentI al = new Alignment(new SequenceI[] { dna1, dna2 });
-    Alignment xrefs = CrossRef.findXrefSequences(new SequenceI[] { dna2 },
-            true, "UNIPROT", al);
+    Alignment xrefs = new CrossRef(new SequenceI[] { dna2 }, al)
+            .findXrefSequences("UNIPROT");
      assertNull(xrefs);
    }
  
@@ -257,8 +261,8 @@ public class CrossRefTest
       * first search for a dbref nowhere on the alignment:
       */
      DBRefEntry dbref = new DBRefEntry("UNIPROT", "0", "P30419");
-    boolean found = CrossRef.searchDataset(dna1, dbref, al, result, null,
-            true, true);
+    CrossRef testee = new CrossRef(al.getSequencesArray(), al);
+    boolean found = testee.searchDataset(dna1, dbref, result, null, true);
      assertFalse(found);
      assertTrue(result.isEmpty());
  
@@ -269,8 +273,7 @@ public class CrossRefTest
       * search for a protein sequence with dbref UNIPROT:Q9ZTS2
       */
      dbref = new DBRefEntry("UNIPROT", "0", "Q9ZTS2");
-    found = CrossRef.searchDataset(dna1, dbref, al, result, null, true,
-            true);
+    found = testee.searchDataset(dna1, dbref, result, null, true);
      assertTrue(found);
      assertEquals(1, result.size());
      assertSame(pep1, result.get(0));
@@ -280,8 +283,7 @@ public class CrossRefTest
       */
      result.clear();
      dbref = new DBRefEntry("UNIPROT", "0", "Q9ZTS2");
-    found = CrossRef.searchDataset(pep1, dbref, al, result, null, true,
-            false);
+    found = testee.searchDataset(pep1, dbref, result, null, false);
      assertTrue(found);
      assertEquals(1, result.size());
      assertSame(dna1, result.get(0));
@@ -329,8 +331,8 @@ public class CrossRefTest
       * mapped sequences
       */
      AlignmentI al = new Alignment(new SequenceI[] { dna1 });
-    Alignment xrefs = CrossRef.findXrefSequences(new SequenceI[] { dna1 },
-            true, "UNIPROT", al);
+    Alignment xrefs = new CrossRef(new SequenceI[] { dna1 }, al)
+            .findXrefSequences("UNIPROT");
      assertEquals(2, xrefs.getHeight());
  
      /*
@@ -391,9 +393,12 @@ public class CrossRefTest
      final SequenceI pep1 = new Sequence("Q9ZTS2", "MYQLIRSSW");
      final SequenceI pep2 = new Sequence("P00314", "MRKLLAASG");
    
-    SequenceFetcher mockFetcher = new SequenceFetcher()
+    /*
+     * argument false suppresses adding DAS sources
+     * todo: define an interface type SequenceFetcherI and mock that
+     */
+    SequenceFetcher mockFetcher = new SequenceFetcher(false)
      {
-
        @Override
        public boolean isFetchable(String source)
        {
@@ -412,8 +417,8 @@ public class CrossRefTest
       * find UNIPROT xrefs for nucleotide sequence
       */
      AlignmentI al = new Alignment(new SequenceI[] { dna1 });
-    Alignment xrefs = CrossRef.findXrefSequences(new SequenceI[] { dna1 },
-            true, "UNIPROT", al);
+    Alignment xrefs = new CrossRef(new SequenceI[] { dna1 }, al)
+            .findXrefSequences("UNIPROT");
      assertEquals(2, xrefs.getHeight());
      assertSame(pep1, xrefs.getSequenceAt(0));
      assertSame(pep2, xrefs.getSequenceAt(1));
@@ -425,4 +430,278 @@ public class CrossRefTest
      SequenceFetcherFactory.setSequenceFetcher(null);
    }
  
+  /**
+   * Test for finding 'product' sequences for the case where both gene and
+   * transcript sequences have dbrefs to Uniprot.
+   */
+  @Test(groups = { "Functional" })
+  public void testFindXrefSequences_forGeneAndTranscripts()
+  {
+    /*
+     * 'gene' sequence
+     */
+    SequenceI gene = new Sequence("ENSG00000157764", "CGCCTCCCTTCCCC");
+    gene.addDBRef(new DBRefEntry("UNIPROT", "0", "P15056"));
+    gene.addDBRef(new DBRefEntry("UNIPROT", "0", "H7C5K3"));
+
+    /*
+     * 'transcript' with CDS feature (supports mapping to protein)
+     */
+    SequenceI braf001 = new Sequence("ENST00000288602", "taagATGGCGGCGCTGa");
+    braf001.addDBRef(new DBRefEntry("UNIPROT", "0", "P15056"));
+    braf001.addSequenceFeature(new SequenceFeature("CDS", "", 5, 16, 0f,
+            null));
+
+    /*
+     * 'spliced transcript' with CDS ranges
+     */
+    SequenceI braf002 = new Sequence("ENST00000497784", "gCAGGCtaTCTGTTCaa");
+    braf002.addDBRef(new DBRefEntry("UNIPROT", "0", "H7C5K3"));
+    braf002.addSequenceFeature(new SequenceFeature("CDS", "", 2, 6, 0f,
+            null));
+    braf002.addSequenceFeature(new SequenceFeature("CDS", "", 9, 15, 0f,
+            null));
+
+    /*
+     * TODO code is fragile - use of SequenceIdMatcher depends on fetched
+     * sequences having a name starting Source|Accession
+     * which happens to be true for Uniprot,PDB,EMBL but not Pfam,Rfam,Ensembl 
+     */
+    final SequenceI pep1 = new Sequence("UNIPROT|P15056", "MAAL");
+    final SequenceI pep2 = new Sequence("UNIPROT|H7C5K3", "QALF");
+  
+    /*
+     * argument false suppresses adding DAS sources
+     * todo: define an interface type SequenceFetcherI and mock that
+     */
+    SequenceFetcher mockFetcher = new SequenceFetcher(false)
+    {
+      @Override
+      public boolean isFetchable(String source)
+      {
+        return true;
+      }
+  
+      @Override
+      public SequenceI[] getSequences(List<DBRefEntry> refs, boolean dna)
+      {
+        return new SequenceI[] { pep1, pep2 };
+      }
+    };
+    SequenceFetcherFactory.setSequenceFetcher(mockFetcher);
+  
+    /*
+     * find UNIPROT xrefs for gene and transcripts
+     * verify that
+     * - the two proteins are retrieved but not duplicated
+     * - mappings are built from transcript (CDS) to proteins
+     * - no mappings from gene to proteins
+     */
+    SequenceI[] seqs = new SequenceI[] { gene, braf001, braf002 };
+    AlignmentI al = new Alignment(seqs);
+    Alignment xrefs = new CrossRef(seqs, al)
+            .findXrefSequences("UNIPROT");
+    assertEquals(2, xrefs.getHeight());
+    assertSame(pep1, xrefs.getSequenceAt(0));
+    assertSame(pep2, xrefs.getSequenceAt(1));
+  }
+
+  /**
+   * <pre>
+   * Test that emulates this (real but simplified) case:
+   * Alignment:          DBrefs
+   *     UNIPROT|P0CE19  EMBL|J03321, EMBL|X06707, EMBL|M19487
+   *     UNIPROT|P0CE20  EMBL|J03321, EMBL|X06707, EMBL|X07547
+   * Find cross-references for EMBL. These are mocked here as
+   *     EMBL|J03321     with mappings to P0CE18, P0CE19, P0CE20
+   *     EMBL|X06707     with mappings to P0CE17, P0CE19, P0CE20
+   *     EMBL|M19487     with mappings to P0CE19, Q46432
+   *     EMBL|X07547     with mappings to P0CE20, B0BCM4
+   * EMBL sequences are first 'fetched' (mocked here) for P0CE19.
+   * The 3 EMBL sequences are added to the alignment dataset.
+   * Their dbrefs to Uniprot products P0CE19 and P0CE20 should be matched in the
+   * alignment dataset and updated to reference the original Uniprot sequences.
+   * For the second Uniprot sequence, the J03321 and X06707 xrefs should be 
+   * resolved from the dataset, and only the X07547 dbref fetched.
+   * So the end state to verify is:
+   * - 4 cross-ref sequences returned: J03321, X06707,  M19487, X07547
+   * - P0CE19/20 dbrefs to EMBL sequences now have mappings
+   * - J03321 dbrefs to P0CE19/20 mapped to original Uniprot sequences
+   * - X06707 dbrefs to P0CE19/20 mapped to original Uniprot sequences
+   * </pre>
+   */
+  @Test(groups = { "Functional" })
+  public void testFindXrefSequences_uniprotEmblManyToMany()
+  {
+    /*
+     * Uniprot sequences, both with xrefs to EMBL|J03321 
+     * and EMBL|X07547
+     * Sequences faked to ensure dna translates to protein
+     * (so that mappings can be made)
+     */
+    SequenceI p0ce19 = new Sequence("UNIPROT|P0CE19", "KPFG");
+    p0ce19.addDBRef(new DBRefEntry("EMBL", "0", "J03321"));
+    p0ce19.addDBRef(new DBRefEntry("EMBL", "0", "X06707"));
+    p0ce19.addDBRef(new DBRefEntry("EMBL", "0", "M19487"));
+    SequenceI p0ce20 = new Sequence("UNIPROT|P0CE20", "KPFG");
+    p0ce20.addDBRef(new DBRefEntry("EMBL", "0", "J03321"));
+    p0ce20.addDBRef(new DBRefEntry("EMBL", "0", "X06707"));
+    p0ce20.addDBRef(new DBRefEntry("EMBL", "0", "X07547"));
+  
+    /*
+     * EMBL sequences to be 'fetched', complete with dbrefs and mappings
+     * to their protein products (CDS location  and translations  are provided
+     * in  EMBL XML); these should be matched to, and replaced with,
+     * the corresponding uniprot sequences after fetching
+     */
+    
+    /*
+     * J03321 with mappings to P0CE19 and P0CE20
+     */
+    final SequenceI j03321 = new Sequence("EMBL|J03321", "AAACCCTTTGGG");
+    DBRefEntry dbref1 = new DBRefEntry("UNIPROT", "0", "P0CE19");
+    MapList mapList = new MapList(new int[] { 1, 18 },
+            new int[] { 1, 6 }, 3, 1);
+    Mapping map = new Mapping(new Sequence("UNIPROT|P0CE19", "KPFG"), mapList);
+    // add a dbref to the mapped to sequence - should get copied to p0ce19
+    map.getTo().addDBRef(new DBRefEntry("PIR", "0", "S01875"));
+    dbref1.setMap(map);
+    j03321.addDBRef(dbref1);
+    DBRefEntry dbref2 = new DBRefEntry("UNIPROT", "0", "P0CE20");
+    dbref2.setMap(new Mapping(new Sequence("UNIPROT|P0CE20", "KPFG"),
+            new MapList(mapList)));
+    j03321.addDBRef(dbref2);
+    
+    /*
+     * X06707 with mappings to P0CE19 and P0CE20
+     */
+    final SequenceI x06707 = new Sequence("EMBL|X06707", "atgAAACCCTTTGGG");
+    // TODO CrossRef.constructMapping ignores the reverse mapping ??
+    // should it not use its inverse if available?
+    // how does this work for real?
+    DBRefEntry dbref3 = new DBRefEntry("UNIPROT", "0", "P0CE19");
+    MapList map2 = new MapList(new int[] { 4, 21 }, new int[] { 1, 6 }, 3,
+            1);
+    dbref3.setMap(new Mapping(new Sequence("UNIPROT|P0CE19", "KPFG"), map2));
+    x06707.addDBRef(dbref3);
+    DBRefEntry dbref4 = new DBRefEntry("UNIPROT", "0", "P0CE20");
+    dbref4.setMap(new Mapping(new Sequence("UNIPROT|P0CE20", "KPFG"),
+            new MapList(mapList)));
+    x06707.addDBRef(dbref4);
+    
+    /*
+     * M19487 with mapping to P0CE19 and Q46432
+     */
+    final SequenceI m19487 = new Sequence("EMBL|M19487", "AAACCCTTTGGG");
+    DBRefEntry dbref5 = new DBRefEntry("UNIPROT", "0", "P0CE19");
+    dbref5.setMap(new Mapping(new Sequence("UNIPROT|P0CE19", "KPFG"),
+            new MapList(mapList)));
+    m19487.addDBRef(dbref5);
+    DBRefEntry dbref6 = new DBRefEntry("UNIPROT", "0", "Q46432");
+    dbref6.setMap(new Mapping(new Sequence("UNIPROT|Q46432", "KPFG"),
+            new MapList(mapList)));
+    m19487.addDBRef(dbref6);
+
+    /*
+     * X07547 with mapping to P0CE20 and B0BCM4
+     */
+    final SequenceI x07547 = new Sequence("EMBL|X07547", "cccAAACCCTTTGGG");
+    DBRefEntry dbref7 = new DBRefEntry("UNIPROT", "0", "P0CE20");
+    dbref7.setMap(new Mapping(new Sequence("UNIPROT|P0CE19", "KPFG"),
+            new MapList(map2)));
+    x07547.addDBRef(dbref7);
+    DBRefEntry dbref8 = new DBRefEntry("UNIPROT", "0", "B0BCM4");
+    dbref8.setMap(new Mapping(new Sequence("UNIPROT|B0BCM4", "KPFG"),
+            new MapList(map2)));
+    x07547.addDBRef(dbref8);
+  
+    /*
+     * mock sequence fetcher to 'return' the EMBL sequences
+     * TODO: Mockito would allow .thenReturn().thenReturn() here, 
+     * and also capture and verification of the parameters
+     * passed in calls to getSequences() 
+     */
+    SequenceFetcher mockFetcher = new SequenceFetcher(false)
+    {
+      int call  = 0;
+      @Override
+      public boolean isFetchable(String source)
+      {
+        return true;
+      }
+      @Override
+      public SequenceI[] getSequences(List<DBRefEntry> refs, boolean dna)
+      {
+        call++;
+        return call == 1 ? new SequenceI[] { j03321, x06707, m19487 }
+                : new SequenceI[] { x07547 };
+      }
+    };
+    SequenceFetcherFactory.setSequenceFetcher(mockFetcher);
+  
+    /*
+     * find EMBL xrefs for Uniprot seqs and verify that
+     * - the EMBL xref'd sequences are retrieved without duplicates
+     * - mappings are added to the Uniprot dbrefs
+     * - mappings in the EMBL-to-Uniprot dbrefs are updated to the 
+     *   alignment sequences
+     * - dbrefs on the EMBL sequences are added to the original dbrefs
+     */
+    SequenceI[] seqs = new SequenceI[] { p0ce19, p0ce20 };
+    AlignmentI al = new Alignment(seqs);
+    Alignment xrefs = new CrossRef(seqs, al)
+            .findXrefSequences("EMBL");
+
+    /*
+     * verify retrieved sequences
+     */
+    assertNotNull(xrefs);
+    assertEquals(4, xrefs.getHeight());
+    assertSame(j03321, xrefs.getSequenceAt(0));
+    assertSame(x06707, xrefs.getSequenceAt(1));
+    assertSame(m19487, xrefs.getSequenceAt(2));
+    assertSame(x07547, xrefs.getSequenceAt(3));
+
+    /*
+     * verify mappings added to Uniprot-to-EMBL dbrefs
+     */
+    Mapping mapping = p0ce19.getDBRefs()[0].getMap();
+    assertSame(j03321, mapping.getTo());
+    mapping = p0ce19.getDBRefs()[1].getMap();
+    assertSame(x06707, mapping.getTo());
+    mapping = p0ce20.getDBRefs()[0].getMap();
+    assertSame(j03321, mapping.getTo());
+    mapping = p0ce20.getDBRefs()[1].getMap();
+    assertSame(x06707, mapping.getTo());
+
+    /*
+     * verify dbrefs on EMBL are mapped to alignment seqs
+     */
+    assertSame(p0ce19, j03321.getDBRefs()[0].getMap().getTo());
+    assertSame(p0ce20, j03321.getDBRefs()[1].getMap().getTo());
+    assertSame(p0ce19, x06707.getDBRefs()[0].getMap().getTo());
+    assertSame(p0ce20, x06707.getDBRefs()[1].getMap().getTo());
+
+    /*
+     * verify new dbref on EMBL dbref mapping is copied to the
+     * original Uniprot sequence
+     */
+    assertEquals(4, p0ce19.getDBRefs().length);
+    assertEquals("PIR", p0ce19.getDBRefs()[3].getSource());
+    assertEquals("S01875", p0ce19.getDBRefs()[3].getAccessionId());
+  }
+
+  @Test(groups = "Functional")
+  public void testSameSequence()
+  {
+    assertTrue(CrossRef.sameSequence(null, null));
+    SequenceI seq1 = new Sequence("seq1", "ABCDEF");
+    assertFalse(CrossRef.sameSequence(seq1, null));
+    assertFalse(CrossRef.sameSequence(null, seq1));
+    assertTrue(CrossRef.sameSequence(seq1, new Sequence("seq2", "ABCDEF")));
+    assertTrue(CrossRef.sameSequence(seq1, new Sequence("seq2", "abcdef")));
+    assertFalse(CrossRef
+            .sameSequence(seq1, new Sequence("seq2", "ABCDE-F")));
+    assertFalse(CrossRef.sameSequence(seq1, new Sequence("seq2", "BCDEF")));
+  }
  }
diff --git a/test/jalview/analysis/SequenceIdMatcherTest.java b/test/jalview/analysis/SequenceIdMatcherTest.java

index 9d3e3b6..a17270d 100644 (file)
--- a/test/jalview/analysis/SequenceIdMatcherTest.java
+++ b/test/jalview/analysis/SequenceIdMatcherTest.java
@@ -90,5 +90,11 @@ public class SequenceIdMatcherTest
       * case insensitive matching
       */
      assertTrue(testee.equals("a12345"));
+
+    testee = sequenceIdMatcher.new SeqIdName("UNIPROT|A12345");
+    assertFalse(testee.equals("A12345"));
+    assertFalse(testee.equals("UNIPROT|B98765"));
+    assertFalse(testee.equals("UNIPROT|"));
+    assertTrue(testee.equals("UNIPROT"));
    }
  }
diff --git a/test/jalview/util/DnaUtilsTest.java b/test/jalview/util/DnaUtilsTest.java

index 6623c13..d7a76b9 100644 (file)
--- a/test/jalview/util/DnaUtilsTest.java
+++ b/test/jalview/util/DnaUtilsTest.java
@@ -89,20 +89,10 @@ public class DnaUtilsTest
      assertEquals(87064, ranges.get(1)[1]);
  
      /*
-     * beyond 5' or 3' locus
-     */
-    ranges = DnaUtils.parseLocation("<34..126");
-    assertEquals(1, ranges.size());
-    assertEquals(34, ranges.get(0)[0]);
-    assertEquals(126, ranges.get(0)[1]);
-    ranges = DnaUtils.parseLocation("35..>127");
-    assertEquals(1, ranges.size());
-    assertEquals(35, ranges.get(0)[0]);
-    assertEquals(127, ranges.get(0)[1]);
-
-    /*
       * valid things we don't yet handle
       */
+    assertNull(DnaUtils.parseLocation("<34..126"));
+    assertNull(DnaUtils.parseLocation("35..>126"));
      assertNull(DnaUtils.parseLocation("34.126"));
      assertNull(DnaUtils.parseLocation("34^126"));
      assertNull(DnaUtils.parseLocation("order(34..126,130..180)"));
diff --git a/test/jalview/ws/SequenceFetcherTest.java b/test/jalview/ws/SequenceFetcherTest.java

index 76ca69b..50fb3c0 100644 (file)
--- a/test/jalview/ws/SequenceFetcherTest.java
+++ b/test/jalview/ws/SequenceFetcherTest.java
@@ -106,8 +106,9 @@ public class SequenceFetcherTest
            {
              boolean dna = sp.isDnaCoding();
              // try and find products
-            List<String> types = CrossRef.findXrefSourcesForSequences(dna,
-                    al.getSequencesArray(), null);
+            CrossRef crossRef = new CrossRef(al.getSequencesArray(),
+                    al);
+            List<String> types = crossRef.findXrefSourcesForSequences();
              if (types != null)
              {
                System.out.println("Xref Types for: "
@@ -115,9 +116,7 @@ public class SequenceFetcherTest
                for (String source : types)
                {
                  System.out.println("Type: " + source);
-                SequenceI[] prod = jalview.analysis.CrossRef
-                        .findXrefSequences(al.getSequencesArray(), dna,
-                                source, null)
+                SequenceI[] prod = crossRef.findXrefSequences(source)
                          .getSequencesArray();
                  System.out.println("Found "
                          + ((prod == null) ? "no" : "" + prod.length)
@@ -200,8 +199,8 @@ public class SequenceFetcherTest
            // have a bash at finding the products amongst all the retrieved
            // sequences.
            SequenceI[] seqs = al.getSequencesArray();
-          Alignment prodal = jalview.analysis.CrossRef.findXrefSequences(
-                  seqs, dna, null, ds);
+          Alignment prodal = new CrossRef(seqs, ds)
+                  .findXrefSequences(null);
            System.out.println("Found "
                    + ((prodal == null) ? "no" : "" + prodal.getHeight())
                    + " products");
diff --git a/test/jalview/ws/seqfetcher/DbRefFetcherTest.java b/test/jalview/ws/seqfetcher/DbRefFetcherTest.java

index 63b1b9c..44cefe2 100644 (file)
--- a/test/jalview/ws/seqfetcher/DbRefFetcherTest.java
+++ b/test/jalview/ws/seqfetcher/DbRefFetcherTest.java
@@ -178,8 +178,8 @@ public class DbRefFetcherTest
              .getMap().getMappedWidth(), 1);
      assertEquals("Expected local reference map to be 3 nucleotides", dr[0]
              .getMap().getWidth(), 3);
-    AlignmentI sprods = CrossRef.findXrefSequences(
-            alsq.getSequencesArray(), true, dr[0].getSource(), alsq);
+    AlignmentI sprods = new CrossRef(alsq.getSequencesArray(), alsq)
+            .findXrefSequences(dr[0].getSource());
      assertNotNull(
              "Couldn't recover cross reference sequence from dataset. Was it ever added ?",
              sprods);
author	gmungoc <g.m.carstairs@dundee.ac.uk>
	Thu, 9 Jun 2016 13:55:45 +0000 (14:55 +0100)
committer	gmungoc <g.m.carstairs@dundee.ac.uk>
	Thu, 9 Jun 2016 13:55:45 +0000 (14:55 +0100)
src/jalview/analysis/AlignmentUtils.java		patch \| blob \| history
src/jalview/analysis/CrossRef.java		patch \| blob \| history
src/jalview/analysis/CrossRefs.java		patch \| blob \| history
src/jalview/analysis/SequenceIdMatcher.java		patch \| blob \| history
src/jalview/datamodel/xdb/embl/EmblEntry.java		patch \| blob \| history
src/jalview/gui/AlignFrame.java		patch \| blob \| history
src/jalview/util/MapList.java		patch \| blob \| history
test/jalview/analysis/CrossRefTest.java		patch \| blob \| history
test/jalview/analysis/SequenceIdMatcherTest.java		patch \| blob \| history
test/jalview/util/DnaUtilsTest.java		patch \| blob \| history
test/jalview/ws/SequenceFetcherTest.java		patch \| blob \| history
test/jalview/ws/seqfetcher/DbRefFetcherTest.java		patch \| blob \| history