JAL-2210 refactor code to remove from a list of dbrefs those which match primary...

[jalview.git] / src / jalview / analysis / CrossRef.java
diff --git a/src/jalview/analysis/CrossRef.java b/src/jalview/analysis/CrossRef.java

index a169ba6..05814c2 100644 (file)
--- a/src/jalview/analysis/CrossRef.java
+++ b/src/jalview/analysis/CrossRef.java
@@ -222,6 +222,9 @@ public class CrossRef
        boolean found = false;
        DBRefEntry[] xrfs = DBRefUtils
                .selectDbRefs(!fromDna, dss.getDBRefs());
+      // ENST & ENSP come in to both protein and nucleotide, so we need to
+      // filter them
+      // out later.
        if ((xrfs == null || xrfs.length == 0) && dataset != null)
        {
          /*
@@ -249,11 +252,15 @@ public class CrossRef
        List<DBRefEntry> sourceRefs = DBRefUtils.searchRefsForSource(xrfs,
                source);
        Iterator<DBRefEntry> refIterator = sourceRefs.iterator();
+      // At this point, if we are retrieving Ensembl, we still don't filter out
+      // ENST when looking for protein crossrefs.
        while (refIterator.hasNext())
        {
          DBRefEntry xref = refIterator.next();
          found = false;
-        if (xref.hasMap())
+        // we're only interested in coding cross-references, not
+        // locus->transcript
+        if (xref.hasMap() && xref.getMap().getMap().isTripletMap())
          {
            SequenceI mappedTo = xref.getMap().getTo();
            if (mappedTo != null)
@@ -271,6 +278,18 @@ public class CrossRef
               * but findInDataset() matches ENSP when looking for Uniprot...
               */
              SequenceI matchInDataset = findInDataset(xref);
+            if (matchInDataset != null && xref.getMap().getTo() != null
+                    && matchInDataset != xref.getMap().getTo())
+            {
+              System.err
+                      .println("Implementation problem (reopen JAL-2154): CrossRef.findInDataset seems to have recovered a different sequence than the one explicitly mapped for xref."
+                              + "Found:"
+                              + matchInDataset
+                              + "\nExpected:"
+                              + xref.getMap().getTo()
+                              + "\nFor xref:"
+                              + xref);
+            }
              /*matcher.findIdMatch(mappedTo);*/
              if (matchInDataset != null)
              {
@@ -278,13 +297,34 @@ public class CrossRef
                {
                  rseqs.add(matchInDataset);
                }
+              // even if rseqs contained matchInDataset - check mappings between
+              // these seqs are added
+              // need to try harder to only add unique mappings
+              if (xref.getMap().getMap().isTripletMap()
+                      && dataset.getMapping(seq, matchInDataset) == null
+                      && cf.getMappingBetween(seq, matchInDataset) == null)
+              {
+                // materialise a mapping for highlighting between these
+                // sequences
+                if (fromDna)
+                {
+                  cf.addMap(dss, matchInDataset, xref.getMap().getMap(),
+                          xref.getMap().getMappedFromId());
+                }
+                else
+                {
+                  cf.addMap(matchInDataset, dss, xref.getMap().getMap()
+                          .getInverse(), xref.getMap().getMappedFromId());
+                }
+              }
+
                refIterator.remove();
                continue;
              }
+            // TODO: need to determine if this should be a deriveSequence
              SequenceI rsq = new Sequence(mappedTo);
              rseqs.add(rsq);
-            if (xref.getMap().getMap().getFromRatio() != xref.getMap()
-                    .getMap().getToRatio())
+            if (xref.getMap().getMap().isTripletMap())
              {
                // get sense of map correct for adding to product alignment.
                if (fromDna)
@@ -307,7 +347,9 @@ public class CrossRef
          {
            SequenceI matchedSeq = matcher.findIdMatch(xref.getSource() + "|"
                    + xref.getAccessionId());
-          if (matchedSeq != null)
+          // if there was a match, check it's at least the right type of
+          // molecule!
+          if (matchedSeq != null && matchedSeq.isProtein() == fromDna)
            {
              if (constructMapping(seq, matchedSeq, xref, cf, fromDna))
              {
@@ -356,6 +398,16 @@ public class CrossRef
      SequenceI[] retrieved = null;
      SequenceI dss = seq.getDatasetSequence() == null ? seq : seq
              .getDatasetSequence();
+    // first filter in case we are retrieving crossrefs that have already been
+    // retrieved. this happens for cases where a database record doesn't yield
+    // protein products for CDS
+    removeAlreadyRetrievedSeqs(sourceRefs, fromDna);
+    if (sourceRefs.size() == 0)
+    {
+      // no more work to do! We already had all requested sequence records in
+      // the dataset.
+      return;
+    }
      try
      {
        retrieved = sftch.getSequences(sourceRefs, !fromDna);
@@ -378,6 +430,43 @@ public class CrossRef
                  : retrievedSequence.getDatasetSequence();
          DBRefEntry[] dbr = retrievedSequence.getDBRefs();
          if (dbr != null)
+
+  /**
+   * Removes from sourceRefs every reference that is matched (via
+   * DBRefUtils.searchRefs) by a primary database reference of a sequence
+   * already present in the dataset, so that only references with no
+   * matching dataset sequence remain. Only dataset sequences for which
+   * isProtein() == fromDna are examined. sourceRefs is modified in place.
+   * 
+   * The array snapshot of sourceRefs used for searching is rebuilt only
+   * after a sequence has caused at least one removal, so all primary
+   * references of one sequence are searched against the same snapshot.
+   * 
+   * @param sourceRefs
+   *          - list of references to filter; matched entries are removed
+   *          from this list.
+   * @param fromDna
+   *          - if true only protein sequences in the dataset are examined,
+   *          if false only non-protein sequences are examined.
+   */
+  private void removeAlreadyRetrievedSeqs(List<DBRefEntry> sourceRefs,
+          boolean fromDna)
+  {
+    DBRefEntry[] dbrSourceSet = sourceRefs.toArray(new DBRefEntry[0]);
+    for (SequenceI sq : dataset.getSequences())
+    {
+      boolean dupeFound = false;
+      // examine only sequences of the requested type: protein when fromDna
+      // is true, nucleotide (non-protein) when fromDna is false
+      if (sq.isProtein() == fromDna)
+      {
+        for (DBRefEntry dbr : sq.getPrimaryDBRefs())
+        {
+          for (DBRefEntry found : DBRefUtils.searchRefs(dbrSourceSet, dbr))
+          {
+            sourceRefs.remove(found);
+            dupeFound = true;
+          }
+        }
+      }
+      if (dupeFound)
+      {
+        dbrSourceSet = sourceRefs.toArray(new DBRefEntry[0]);
+      }
+    }
+  }
+
          {
            for (DBRefEntry dbref : dbr)
            {
@@ -413,7 +502,11 @@ public class CrossRef
                  }
                  else
                  {
-                  matcher.add(map.getTo());
+                  if (dataset.findIndex(map.getTo()) == -1)
+                  {
+                    dataset.addSequence(map.getTo());
+                    matcher.add(map.getTo());
+                  }
                  }
                  try
                  {
@@ -483,8 +576,11 @@ public class CrossRef
          }
          retrievedSequence.updatePDBIds();
          rseqs.add(retrievedDss);
-        dataset.addSequence(retrievedDss);
-        matcher.add(retrievedDss);
+        if (dataset.findIndex(retrievedDss) == -1)
+        {
+          dataset.addSequence(retrievedDss);
+          matcher.add(retrievedDss);
+        }
        }
      }
    }
@@ -546,6 +642,12 @@ public class CrossRef
      String name2 = xref.getSource() + "|" + name;
      SequenceI dss = mapsTo.getDatasetSequence() == null ? mapsTo : mapsTo
              .getDatasetSequence();
+    // first check ds if ds is directly referenced
+    if (dataset.findIndex(dss) > -1)
+    {
+      return dss;
+    }
+    ;
      for (SequenceI seq : dataset.getSequences())
      {
        /*
@@ -662,24 +764,28 @@ public class CrossRef
            DBRefEntry xref, AlignedCodonFrame mappings, boolean fromDna)
    {
      MapList mapping = null;
-
+    SequenceI dsmapFrom = mapFrom.getDatasetSequence() == null ? mapFrom
+            : mapFrom.getDatasetSequence();
+    SequenceI dsmapTo = mapTo.getDatasetSequence() == null ? mapTo
+            : mapTo.getDatasetSequence();
      /*
-     * look for a reverse mapping, if found make its inverse
+     * look for a reverse mapping, if found make its inverse. 
+     * Note - we do this on dataset sequences only.
       */
-    if (mapTo.getDBRefs() != null)
+    if (dsmapTo.getDBRefs() != null)
      {
-      for (DBRefEntry dbref : mapTo.getDBRefs())
+      for (DBRefEntry dbref : dsmapTo.getDBRefs())
        {
          String name = dbref.getSource() + "|" + dbref.getAccessionId();
-        if (dbref.hasMap() && mapFrom.getName().startsWith(name))
+        if (dbref.hasMap() && dsmapFrom.getName().startsWith(name))
          {
            /*
             * looks like we've found a map from 'mapTo' to 'mapFrom'
             * - invert it to make the mapping the other way 
             */
            MapList reverse = dbref.getMap().getMap().getInverse();
-          xref.setMap(new Mapping(mapTo, reverse));
-          mappings.addMap(mapFrom, mapTo, reverse);
+          xref.setMap(new Mapping(dsmapTo, reverse));
+          mappings.addMap(mapFrom, dsmapTo, reverse);
            return true;
          }
        }
@@ -791,8 +897,8 @@ public class CrossRef
     *          </ul>
     * @return true if relationship found and sequence added.
     */
-  boolean searchDataset(boolean fromDna, SequenceI fromSeq,
-          DBRefEntry xrf, List<SequenceI> foundSeqs, AlignedCodonFrame mappings,
+  boolean searchDataset(boolean fromDna, SequenceI fromSeq, DBRefEntry xrf,
+          List<SequenceI> foundSeqs, AlignedCodonFrame mappings,
            boolean direct)
    {
      boolean found = false;
@@ -853,37 +959,38 @@ public class CrossRef
            // }
            if (!cands.isEmpty())
            {
-            if (!foundSeqs.contains(nxt))
+            if (foundSeqs.contains(nxt))
              {
-              found = true;
-              foundSeqs.add(nxt);
-              if (mappings != null && !direct)
+              continue;
+            }
+            found = true;
+            foundSeqs.add(nxt);
+            if (mappings != null && !direct)
+            {
+              /*
+               * if the matched sequence has mapped dbrefs to
+               * protein product / cdna, add equivalent mappings to
+               * our source sequence
+               */
+              for (DBRefEntry candidate : cands)
                {
-                /*
-                 * if the matched sequence has mapped dbrefs to
-                 * protein product / cdna, add equivalent mappings to
-                 * our source sequence
-                 */
-                for (DBRefEntry candidate : cands)
+                Mapping mapping = candidate.getMap();
+                if (mapping != null)
                  {
-                  Mapping mapping = candidate.getMap();
-                  if (mapping != null)
+                  MapList map = mapping.getMap();
+                  if (mapping.getTo() != null
+                          && map.getFromRatio() != map.getToRatio())
                    {
-                    MapList map = mapping.getMap();
-                    if (mapping.getTo() != null
-                            && map.getFromRatio() != map.getToRatio())
+                    /*
+                     * add a mapping, as from dna to peptide sequence
+                     */
+                    if (map.getFromRatio() == 3)
                      {
-                      /*
-                       * add a mapping, as from dna to peptide sequence
-                       */
-                      if (map.getFromRatio() == 3)
-                      {
-                        mappings.addMap(nxt, fromSeq, map);
-                      }
-                      else
-                      {
-                        mappings.addMap(nxt, fromSeq, map.getInverse());
-                      }
+                      mappings.addMap(nxt, fromSeq, map);
+                    }
+                    else
+                    {
+                      mappings.addMap(nxt, fromSeq, map.getInverse());
                      }
                    }
                  }