JAL-2110 work in progress
[jalview.git] / src / jalview / analysis / CrossRefs.java
index 0f3f425..691e972 100644 (file)
@@ -1,5 +1,6 @@
 package jalview.analysis;
 
+import jalview.analysis.CrossRef.MySequenceFeature;
 import jalview.datamodel.AlignedCodonFrame;
 import jalview.datamodel.Alignment;
 import jalview.datamodel.AlignmentI;
@@ -20,6 +21,27 @@ import java.util.List;
 
 public class CrossRefs
 {
+  /*
+   * Wraps a SequenceFeature so that equals() ignores the Parent attribute (by
+   * delegating to feat.equals(o, true)). This avoids 'duplicate' CDS features
+   * that only differ in their parent Transcript ids.
+   */
+  class MySequenceFeature extends SequenceFeature
+  {
+    private SequenceFeature feat;
+  
+    MySequenceFeature(SequenceFeature sf)
+    {
+      this.feat = sf;
+    }
+    // NOTE(review): hashCode() is not overridden to match this equals()
+    @Override
+    public boolean equals(Object o)
+    {
+      return feat.equals(o, true);
+    }
+  }
+
   /**
    * Finds cross-references for sequences from a specified source database.
    * These may be found in four ways:
@@ -46,44 +68,71 @@ public class CrossRefs
   public static AlignmentI findXrefSequences(SequenceI[] seqs, boolean dna,
           String source, AlignmentI dataset)
   {
-    List<SequenceI> foundSeqs = new ArrayList<SequenceI>();
-    AlignedCodonFrame mappings = new AlignedCodonFrame();
-
-    List<DBRefEntry> sourceRefs = new ArrayList<DBRefEntry>();
-
+    /*
+     * filter to only those sequences of the right type (nucleotide/protein)
+     */
+    List<SequenceI> fromSeqs = new ArrayList<SequenceI>();
     for (SequenceI seq : seqs)
     {
-      if (dna != Comparison.isNucleotide(seq))
+      if (dna == Comparison.isNucleotide(seq))
       {
-        /*
-         * mixed alignment, and this sequence is of the wrong type
-         */
-        continue;
+        fromSeqs.add(seq);
       }
+    }
+    return findXrefSequences(fromSeqs, dna, source, dataset);
+  }
+
+  /**
+   * Finds cross-references for sequences from a specified source database.
+   * These may be found in four ways:
+   * <ul>
+   * <li>as a DBRefEntry on the known sequence, which has a mapped-to sequence</li>
+   * <li>a sequence of complementary type in the alignment dataset, which has a
+   * DBRefEntry to one of the known sequence's 'direct' DBRefs</li>
+   * <li>a sequence of complementary type in the alignment, which has a
+   * DBRefEntry to one of the known sequence's 'cross-ref' DBRefs</li>
+   * <li>by fetching the accession from the remote database</li>
+   * </ul>
+   * 
+   * @param seqs
+   *          the sequences whose cross-references we are searching for,
+   *          filtered to only those which are of the type denoted by 'dna'
+   * @param dna
+   *          true if the sequences are from a nucleotide alignment, else false
+   * @param source
+   *          the database source we want cross-references to
+   * @param dataset
+   *          the alignment dataset the sequences belong to
+   * @return an alignment containing cross-reference sequences, or null if none
+   *         found
+   */
+  static AlignmentI findXrefSequences(List<SequenceI> fromSeqs,
+          boolean dna, String source, AlignmentI dataset)
+  {
+    List<SequenceI> foundSeqs = new ArrayList<SequenceI>();
+    AlignedCodonFrame mappings = new AlignedCodonFrame();
 
-      /*
-       * get this sequence's dbrefs to source database (if any)
-       */
-      List<DBRefEntry> seqSourceRefs = DBRefUtils.searchRefsForSource(
-              seq.getDBRefs(), source);
+    List<DBRefEntry> unresolvedRefs = new ArrayList<DBRefEntry>();
 
-      /*
-       * first extract any mapped sequences from sourceRefs
-       */
-      findMappedDbrefs(seq, seqSourceRefs, foundSeqs, mappings);
+    /*
+     * first extract any mapped sequences via each sequence's dbrefs to source
+     * if successful, sequence is removed from fromSeqs
+     * if unsuccessful, dbrefs are added to unresolvedRefs
+     */
+    findMappedDbrefs(fromSeqs, source, foundSeqs,
+            unresolvedRefs, mappings);
 
-      /*
-       * for remaining sourceRefs, try to match a 
-       * complementary sequence in the dataset
-       */
-      findIndirectCrossReferences(seq, source, seqSourceRefs, dataset,
-              foundSeqs, mappings);
-    }
+    /*
+     * then search the alignment dataset for dbref resolutions
+     */
+    findIndirectCrossReferences(fromSeqs, source, dataset, foundSeqs,
+            unresolvedRefs, mappings);
 
     /*
      * fetch any remaining sourceRefs from the source database
      */
-    fetchCrossReferences(sourceRefs, foundSeqs, mappings, dna, dataset);
+    fetchCrossReferences(fromSeqs, unresolvedRefs, foundSeqs, mappings,
+            dna, dataset);
 
     if (foundSeqs.isEmpty())
     {
@@ -98,52 +147,90 @@ public class CrossRefs
   /**
    * Looks for DBRefEntrys to 'source' which have a mapping to a sequence. If
    * found, adds the sequence to foundSeqs and removes the dbref from the list.
+   * DBRefs with no mapping are added to the 'unresolvedRefs' list (setting
+   * version number to 0 i.e. use source and accession only).
    * 
-   * @param seq
-   *          the dataset sequence we are searching from
-   * @param sourceRefs
-   *          the sequence's dbrefs to 'source'
+   * @param fromSeqs
+   *          the dataset sequences we are searching from
+   * @param source
+   *          the database source we are searching dbrefs for
    * @param foundSeqs
-   *          a list of cross-references to add to
+   *          a list of found sequences to add to
+   * @param unresolvedRefs
+   *          a list of unresolved cross-references to add to
    * @param mappings
    *          a set of sequence mappings to add to
    * @return
    */
-  static void findMappedDbrefs(SequenceI seq, List<DBRefEntry> sourceRefs,
-          List<SequenceI> foundSeqs, AlignedCodonFrame mappings)
+  static void findMappedDbrefs(List<SequenceI> fromSeqs, String source,
+          List<SequenceI> foundSeqs, List<DBRefEntry> unresolvedRefs,
+          AlignedCodonFrame mappings)
   {
-    Iterator<DBRefEntry> refs = sourceRefs.iterator();
-    while (refs.hasNext())
+    Iterator<SequenceI> it = fromSeqs.iterator();
+    while (it.hasNext())
     {
-      DBRefEntry dbref = refs.next();
-      Mapping map = dbref.getMap();
-      if (map != null)
+      SequenceI seq = it.next();
+      SequenceI dss = seq.getDatasetSequence();
+      dss = dss == null ? seq : dss;
+
+      DBRefEntry[] dbRefs = seq.getDBRefs();
+      if (dbRefs == null)
+      {
+        continue;
+      }
+      boolean resolved = false;
+      for (DBRefEntry dbref : dbRefs)
       {
-        SequenceI mappedTo = map.getTo();
-        if (mappedTo != null)
+        if (!source.equals(dbref.getSource()))
         {
-          foundSeqs.add(new Sequence(mappedTo));
-          refs.remove();
-      
-          /*
-           * check mapping is not 'direct' (it shouldn't be if we reach here)
-           * and add mapping (dna-to-peptide or vice versa) to the set
-           */
-          MapList mapList = map.getMap();
-          int fromRatio = mapList.getFromRatio();
-          int toRatio = mapList.getToRatio();
-          if (fromRatio != toRatio)
+          continue;
+        }
+        DBRefEntry todo = new DBRefEntry(dbref.getSource(), "0",
+                dbref.getAccessionId());
+        Mapping map = dbref.getMap();
+        if (map != null)
+        {
+          unresolvedRefs.remove(todo);
+          resolved = true;
+          SequenceI mappedTo = map.getTo();
+          if (mappedTo != null)
           {
-            if (fromRatio == 3)
-            {
-              mappings.addMap(seq, mappedTo, mapList);
-            }
-            else
+            foundSeqs.add(new Sequence(mappedTo));
+
+            /*
+             * check mapping is not 'direct' (it shouldn't be if we reach here)
+             * and add mapping (dna-to-peptide or vice versa) to the set
+             */
+            MapList mapList = map.getMap();
+            int fromRatio = mapList.getFromRatio();
+            int toRatio = mapList.getToRatio();
+            if (fromRatio != toRatio)
             {
-              mappings.addMap(mappedTo, seq, mapList.getInverse());
+              if (fromRatio == 3)
+              {
+                mappings.addMap(dss, mappedTo, mapList);
+              }
+              else
+              {
+                mappings.addMap(mappedTo, dss, mapList.getInverse());
+              }
             }
           }
         }
+        else
+        {
+          /*
+           * no mapping to resolve dbref - add source+accession to list to resolve
+           */
+          if (!unresolvedRefs.contains(todo))
+          {
+            unresolvedRefs.add(todo);
+          }
+        }
+      }
+      if (resolved)
+      {
+        it.remove();
       }
     }
   }
@@ -153,13 +240,13 @@ public class CrossRefs
    * to the foundSeqs list. If found, tries to make a mapping between seq and
    * the retrieved sequence and insert it into the database reference.
    * 
-   * @param seq
+   * @param fromSeqs
    * @param sourceRefs
    * @param foundSeqs
    * @param mappings
    * @param dna
    */
-  static void fetchCrossReferences(SequenceI seq,
+  static void fetchCrossReferences(List<SequenceI> fromSeqs,
           List<DBRefEntry> sourceRefs, List<SequenceI> foundSeqs,
           AlignedCodonFrame mappings, boolean dna, AlignmentI dataset)
   {
@@ -170,116 +257,116 @@ public class CrossRefs
       retrieved = sftch.getSequences(sourceRefs, !dna);
     } catch (Exception e)
     {
-      System.err
-              .println("Problem whilst retrieving cross references for Sequence : "
-                      + seq.getName());
+      System.err.println("Problem whilst retrieving cross references: "
+              + e.getMessage());
       e.printStackTrace();
       return;
     }
 
-    if (retrieved != null)
+    if (retrieved == null)
     {
-      updateDbrefMappings(dna, seq, sourceRefs, retrieved, mappings);
+      return;
+    }
+    updateDbrefMappings(dna, fromSeqs, sourceRefs, retrieved, mappings);
 
-      SequenceIdMatcher matcher = new SequenceIdMatcher(
-              dataset.getSequences());
-      List<SequenceFeature> copiedFeatures = new ArrayList<SequenceFeature>();
-      CrossRef me = new CrossRef();
-      for (int rs = 0; rs < retrieved.length; rs++)
+    SequenceIdMatcher matcher = new SequenceIdMatcher(
+            dataset.getSequences());
+    List<SequenceFeature> copiedFeatures = new ArrayList<SequenceFeature>();
+    CrossRefs me = new CrossRefs();
+    for (int rs = 0; rs < retrieved.length; rs++)
+    {
+      // TODO: examine each sequence for 'redundancy'
+      DBRefEntry[] dbr = retrieved[rs].getDBRefs();
+      if (dbr != null && dbr.length > 0)
       {
-        // TODO: examine each sequence for 'redundancy'
-        DBRefEntry[] dbr = retrieved[rs].getDBRefs();
-        if (dbr != null && dbr.length > 0)
+        for (int di = 0; di < dbr.length; di++)
         {
-          for (int di = 0; di < dbr.length; di++)
+          // find any entry where we should put in the sequence being
+          // cross-referenced into the map
+          Mapping map = dbr[di].getMap();
+          if (map != null)
           {
-            // find any entry where we should put in the sequence being
-            // cross-referenced into the map
-            Mapping map = dbr[di].getMap();
-            if (map != null)
+            if (map.getTo() != null && map.getMap() != null)
             {
-              if (map.getTo() != null && map.getMap() != null)
+              SequenceI matched = matcher.findIdMatch(map.getTo());
+              if (matched != null)
               {
-                SequenceI matched = matcher.findIdMatch(map.getTo());
-                if (matched != null)
-                {
-                  /*
-                   * already got an xref to this sequence; update this
-                   * map to point to the same sequence, and add
-                   * any new dbrefs to it
-                   */
-                  for (DBRefEntry ref : map.getTo().getDBRefs())
-                  {
-                    matched.addDBRef(ref); // add or update mapping
-                  }
-                  map.setTo(matched);
-                }
-                else
+                /*
+                 * already got an xref to this sequence; update this
+                 * map to point to the same sequence, and add
+                 * any new dbrefs to it
+                 */
+                for (DBRefEntry ref : map.getTo().getDBRefs())
                 {
-                  matcher.add(map.getTo());
+                  matched.addDBRef(ref); // add or update mapping
                 }
-                try
+                map.setTo(matched);
+              }
+              else
+              {
+                matcher.add(map.getTo());
+              }
+              try
+              {
+                // compare ms with dss and replace with dss in mapping
+                // if map is congruent
+                SequenceI ms = map.getTo();
+                int sf = map.getMap().getToLowest();
+                int st = map.getMap().getToHighest();
+                SequenceI mappedrg = ms.getSubSequence(sf, st);
+                // SequenceI loc = dss.getSubSequence(sf, st);
+                if (mappedrg.getLength() > 0
+                        && ms.getSequenceAsString().equals(
+                                fromSeqs.getSequenceAsString()))
+                // && mappedrg.getSequenceAsString().equals(
+                // loc.getSequenceAsString()))
                 {
-                  // compare ms with dss and replace with dss in mapping
-                  // if map is congruent
-                  SequenceI ms = map.getTo();
-                  int sf = map.getMap().getToLowest();
-                  int st = map.getMap().getToHighest();
-                  SequenceI mappedrg = ms.getSubSequence(sf, st);
-                  // SequenceI loc = dss.getSubSequence(sf, st);
-                  if (mappedrg.getLength() > 0
-                          && ms.getSequenceAsString().equals(
-                                  seq.getSequenceAsString()))
-                  // && mappedrg.getSequenceAsString().equals(
-                  // loc.getSequenceAsString()))
+                  String msg = "Mapping updated from " + ms.getName()
+                          + " to retrieved crossreference "
+                          + fromSeqs.getName();
+                  System.out.println(msg);
+                  // method to update all refs of existing To on
+                  // retrieved sequence with dss and merge any props
+                  // on To onto dss.
+                  map.setTo(fromSeqs);
+                  /*
+                   * copy sequence features as well, avoiding
+                   * duplication (e.g. same variation from 2 
+                   * transcripts)
+                   */
+                  SequenceFeature[] sfs = ms.getSequenceFeatures();
+                  if (sfs != null)
                   {
-                    String msg = "Mapping updated from " + ms.getName()
-                            + " to retrieved crossreference "
-                            + seq.getName();
-                    System.out.println(msg);
-                    // method to update all refs of existing To on
-                    // retrieved sequence with dss and merge any props
-                    // on To onto dss.
-                    map.setTo(seq);
-                    /*
-                     * copy sequence features as well, avoiding
-                     * duplication (e.g. same variation from 2 
-                     * transcripts)
-                     */
-                    SequenceFeature[] sfs = ms.getSequenceFeatures();
-                    if (sfs != null)
+                    for (SequenceFeature feat : sfs)
                     {
-                      for (SequenceFeature feat : sfs)
+                      /* 
+                       * we override SequenceFeature.equals here (but
+                       * not elsewhere) to ignore Parent attribute
+                       * TODO not quite working yet!
+                       */
+                      if (!copiedFeatures
+                              .contains(me.new MySequenceFeature(feat)))
                       {
-                        /* 
-                         * we override SequenceFeature.equals here (but
-                         * not elsewhere) to ignore Parent attribute
-                         * TODO not quite working yet!
-                         */
-                        if (!copiedFeatures
-                                .contains(me.new MySequenceFeature(feat)))
-                        {
-                          seq.addSequenceFeature(feat);
-                          copiedFeatures.add(feat);
-                        }
+                        fromSeqs.addSequenceFeature(feat);
+                        copiedFeatures.add(feat);
                       }
                     }
                   }
-                  mappings.addMap(retrieved[rs].getDatasetSequence(),
-                          map.getTo(), map.getMap());
-                } catch (Exception e)
-                {
-                  System.err
-                          .println("Exception when consolidating Mapped sequence set...");
-                  e.printStackTrace(System.err);
                 }
+                mappings.addMap(retrieved[rs].getDatasetSequence(),
+                        map.getTo(), map.getMap());
+              } catch (Exception e)
+              {
+                System.err
+                        .println("Exception when consolidating Mapped sequence set...");
+                e.printStackTrace(System.err);
               }
             }
           }
         }
-        retrieved[rs].updatePDBIds();
-        foundSeqs.add(retrieved[rs]);
       }
+      retrieved[rs].updatePDBIds();
+      foundSeqs.add(retrieved[rs]);
     }
   }
 
@@ -288,24 +375,27 @@ public class CrossRefs
    * shares a DBRefEntry with it. If found, adds the sequence to foundSeqs and
    * removes the resolved sourceRef from the search list.
    * 
-   * @param seq
+   * @param fromSeqs
    * @param source
-   * @param sourceRefs
-   * @param dataset
+   * @param dataset
    * @param foundSeqs
+   * @param unresolvedRefs
    * @param mappings
    * @return
    */
-  static void findIndirectCrossReferences(SequenceI seq, String source,
-          List<DBRefEntry> sourceRefs, AlignmentI dataset,
-          List<SequenceI> foundSeqs, AlignedCodonFrame mappings)
+  static void findIndirectCrossReferences(List<SequenceI> fromSeqs,
+          String source, AlignmentI dataset,
+          List<SequenceI> foundSeqs, List<DBRefEntry> unresolvedRefs,
+          AlignedCodonFrame mappings)
   {
-    Iterator<DBRefEntry> refs = sourceRefs.iterator();
+    Iterator<DBRefEntry> refs = unresolvedRefs.iterator();
     while (refs.hasNext())
     {
       DBRefEntry dbref = refs.next();
-      boolean found = searchDatasetForCrossReference(seq, dbref, dataset,
-              foundSeqs, mappings);
+      boolean found = false;
+      // boolean found = searchDatasetForCrossReference(fromSeqs, dbref,
+      // foundSeqs,
+      // unresolvedRefs, mappings);
       if (found)
       {
         refs.remove();
@@ -427,12 +517,12 @@ public class CrossRefs
    * AlignedCodonFrame
    * 
    * @param dna
-   * @param mapFrom
+   * @param fromSeqs
    * @param xrefs
    * @param retrieved
    * @param mappings
    */
-  static void updateDbrefMappings(boolean dna, SequenceI mapFrom,
+  static void updateDbrefMappings(boolean dna, List<SequenceI> fromSeqs,
           List<DBRefEntry> xrefs, SequenceI[] retrieved,
           AlignedCodonFrame mappings)
   {
@@ -453,11 +543,11 @@ public class CrossRefs
           MapList mapping = null;
           if (dna)
           {
-            mapping = AlignmentUtils.mapCdnaToProtein(seq, mapFrom);
+            mapping = AlignmentUtils.mapCdnaToProtein(seq, fromSeqs);
           }
           else
           {
-            mapping = AlignmentUtils.mapCdnaToProtein(mapFrom, seq);
+            mapping = AlignmentUtils.mapCdnaToProtein(fromSeqs, seq);
             if (mapping != null)
             {
               mapping = mapping.getInverse();
@@ -468,15 +558,15 @@ public class CrossRefs
             xref.setMap(new Mapping(seq, mapping));
             if (dna)
             {
-              AlignmentUtils.computeProteinFeatures(mapFrom, seq, mapping);
+              AlignmentUtils.computeProteinFeatures(fromSeqs, seq, mapping);
             }
             if (dna)
             {
-              mappings.addMap(mapFrom, seq, mapping);
+              mappings.addMap(fromSeqs, seq, mapping);
             }
             else
             {
-              mappings.addMap(seq, mapFrom, mapping.getInverse());
+              mappings.addMap(seq, fromSeqs, mapping.getInverse());
             }
             continue;
           }