Merge branch 'feature/JAL-3181linkOrdering' into develop
[jalview.git] / src / jalview / ws / DBRefFetcher.java
index 748cb72..ae4207b 100644 (file)
@@ -26,18 +26,14 @@ import jalview.datamodel.AlignmentI;
 import jalview.datamodel.DBRefEntry;
 import jalview.datamodel.DBRefSource;
 import jalview.datamodel.Mapping;
-import jalview.datamodel.SequenceFeature;
 import jalview.datamodel.SequenceI;
 import jalview.gui.CutAndPasteTransfer;
-import jalview.gui.DasSourceBrowser;
 import jalview.gui.Desktop;
 import jalview.gui.FeatureSettings;
 import jalview.gui.IProgressIndicator;
 import jalview.gui.OOMWarning;
 import jalview.util.DBRefUtils;
 import jalview.util.MessageManager;
-import jalview.ws.dbsources.das.api.jalviewSourceI;
-import jalview.ws.dbsources.das.datamodel.DasSequenceSource;
 import jalview.ws.seqfetcher.DbSourceProxy;
 
 import java.util.ArrayList;
@@ -60,6 +56,10 @@ import uk.ac.ebi.www.picr.AccessionMappingService.AccessionMapperServiceLocator;
  */
 public class DBRefFetcher implements Runnable
 {
+  private static final String NEWLINE = System.lineSeparator();
+
+  public static final String TRIM_RETRIEVED_SEQUENCES = "TRIM_FETCHED_DATASET_SEQS";
+
   public interface FetchFinishedListenerI
   {
     void finished();
@@ -71,8 +71,6 @@ public class DBRefFetcher implements Runnable
 
   CutAndPasteTransfer output = new CutAndPasteTransfer();
 
-  StringBuffer sbuffer = new StringBuffer();
-
   boolean running = false;
 
   /**
@@ -115,9 +113,10 @@ public class DBRefFetcher implements Runnable
    */
   public DBRefFetcher(SequenceI[] seqs,
           IProgressIndicator progressIndicatorFrame,
-          DbSourceProxy[] sources, FeatureSettings featureSettings, boolean isNucleotide)
+          DbSourceProxy[] sources, FeatureSettings featureSettings,
+          boolean isNucleotide)
   {
-    listeners = new ArrayList<FetchFinishedListenerI>();
+    listeners = new ArrayList<>();
     this.progressWindow = progressIndicatorFrame;
     alseqs = new SequenceI[seqs.length];
     SequenceI[] ds = new SequenceI[seqs.length];
@@ -139,7 +138,7 @@ public class DBRefFetcher implements Runnable
             .getSequenceFetcherSingleton(progressIndicatorFrame);
     // set default behaviour for transferring excess sequence data to the
     // dataset
-    trimDsSeqs = Cache.getDefault("TRIM_FETCHED_DATASET_SEQS", true);
+    trimDsSeqs = Cache.getDefault(TRIM_RETRIEVED_SEQUENCES, true);
     if (sources == null)
     {
       setDatabaseSources(featureSettings, isNucleotide);
@@ -163,23 +162,7 @@ public class DBRefFetcher implements Runnable
   {
     // af.featureSettings_actionPerformed(null);
     String[] defdb = null;
-    List<DbSourceProxy> selsources = new ArrayList<DbSourceProxy>();
-    Vector<jalviewSourceI> dasselsrc = (featureSettings != null) ? featureSettings
-            .getSelectedSources() : new DasSourceBrowser()
-            .getSelectedSources();
-
-    for (jalviewSourceI src : dasselsrc)
-    {
-      List<DbSourceProxy> sp = src.getSequenceSourceProxies();
-      if (sp != null)
-      {
-        selsources.addAll(sp);
-        if (sp.size() > 1)
-        {
-          Cache.log.debug("Added many Db Sources for :" + src.getTitle());
-        }
-      }
-    }
+    List<DbSourceProxy> selsources = new ArrayList<>();
     // select appropriate databases based on alignFrame context.
     if (forNucleotide)
     {
@@ -189,7 +172,7 @@ public class DBRefFetcher implements Runnable
     {
       defdb = DBRefSource.PROTEINDBS;
     }
-    List<DbSourceProxy> srces = new ArrayList<DbSourceProxy>();
+    List<DbSourceProxy> srces = new ArrayList<>();
     for (String ddb : defdb)
     {
       List<DbSourceProxy> srcesfordb = sfetcher.getSourceProxy(ddb);
@@ -233,29 +216,6 @@ public class DBRefFetcher implements Runnable
   }
 
   /**
-   * retrieve all the das sequence sources and add them to the list of db
-   * sources to retrieve from
-   */
-  public void appendAllDasSources()
-  {
-    if (dbSources == null)
-    {
-      dbSources = new DbSourceProxy[0];
-    }
-    // append additional sources
-    DbSourceProxy[] otherdb = sfetcher
-            .getDbSourceProxyInstances(DasSequenceSource.class);
-    if (otherdb != null && otherdb.length > 0)
-    {
-      DbSourceProxy[] newsrc = new DbSourceProxy[dbSources.length
-              + otherdb.length];
-      System.arraycopy(dbSources, 0, newsrc, 0, dbSources.length);
-      System.arraycopy(otherdb, 0, newsrc, dbSources.length, otherdb.length);
-      dbSources = newsrc;
-    }
-  }
-
-  /**
    * start the fetcher thread
    * 
    * @param waitTillFinished
@@ -308,14 +268,14 @@ public class DBRefFetcher implements Runnable
       }
       else if (seqs == null)
       {
-        seqs = new Vector<SequenceI>();
+        seqs = new Vector<>();
         seqs.addElement(seq);
       }
 
     }
     else
     {
-      seqs = new Vector<SequenceI>();
+      seqs = new Vector<>();
       seqs.addElement(seq);
     }
 
@@ -330,9 +290,8 @@ public class DBRefFetcher implements Runnable
   {
     if (dbSources == null)
     {
-      throw new Error(
-              MessageManager
-                      .getString("error.implementation_error_must_init_dbsources"));
+      throw new Error(MessageManager
+              .getString("error.implementation_error_must_init_dbsources"));
     }
     running = true;
     long startTime = System.currentTimeMillis();
@@ -340,7 +299,7 @@ public class DBRefFetcher implements Runnable
     {
       progressWindow.setProgressBar(
               MessageManager.getString("status.fetching_db_refs"),
-            startTime);
+              startTime);
     }
     try
     {
@@ -355,8 +314,9 @@ public class DBRefFetcher implements Runnable
       e.printStackTrace();
     }
 
-    Vector<SequenceI> sdataset = new Vector<SequenceI>(
+    Vector<SequenceI> sdataset = new Vector<>(
             Arrays.asList(dataset));
+    List<String> warningMessages = new ArrayList<>();
 
     int db = 0;
     while (sdataset.size() > 0 && db < dbSources.length)
@@ -368,8 +328,8 @@ public class DBRefFetcher implements Runnable
       SequenceI[] currSeqs = new SequenceI[sdataset.size()];
       sdataset.copyInto(currSeqs);// seqs that are to be validated against
       // dbSources[db]
-      Vector<String> queries = new Vector<String>(); // generated queries curSeq
-      seqRefs = new Hashtable<String, Vector<SequenceI>>();
+      Vector<String> queries = new Vector<>(); // generated queries curSeq
+      seqRefs = new Hashtable<>();
 
       int seqIndex = 0;
 
@@ -388,16 +348,16 @@ public class DBRefFetcher implements Runnable
           // Still queries to make for current seqIndex
           StringBuffer queryString = new StringBuffer("");
           int numq = 0;
-          int nqSize = (maxqlen > queries.size()) ? queries
-                  .size() : maxqlen;
+          int nqSize = (maxqlen > queries.size()) ? queries.size()
+                  : maxqlen;
 
           while (queries.size() > 0 && numq < nqSize)
           {
             String query = queries.elementAt(0);
             if (dbsource.isValidReference(query))
             {
-              queryString.append((numq == 0) ? "" : dbsource
-                      .getAccessionSeparator());
+              queryString.append(
+                      (numq == 0) ? "" : dbsource.getAccessionSeparator());
               queryString.append(query);
               numq++;
             }
@@ -425,18 +385,19 @@ public class DBRefFetcher implements Runnable
           if (retrieved != null)
           {
             transferReferences(sdataset, dbsource.getDbSource(), retrieved,
-                    trimDsSeqs);
+                    trimDsSeqs, warningMessages);
           }
         }
         else
         {
           // make some more strings for use as queries
-          for (int i = 0; (seqIndex < dataset.length) && (i < 50); seqIndex++, i++)
+          for (int i = 0; (seqIndex < dataset.length)
+                  && (i < 50); seqIndex++, i++)
           {
             SequenceI sequence = dataset[seqIndex];
-            DBRefEntry[] uprefs = DBRefUtils.selectRefs(
-                    sequence.getDBRefs(),
-                    new String[] { dbsource.getDbSource() }); // jalview.datamodel.DBRefSource.UNIPROT
+            DBRefEntry[] uprefs = DBRefUtils
+                    .selectRefs(sequence.getDBRefs(), new String[]
+                    { dbsource.getDbSource() }); // jalview.datamodel.DBRefSource.UNIPROT
             // });
             // check for existing dbrefs to use
             if (uprefs != null && uprefs.length > 0)
@@ -444,7 +405,8 @@ public class DBRefFetcher implements Runnable
               for (int j = 0; j < uprefs.length; j++)
               {
                 addSeqId(sequence, uprefs[j].getAccessionId());
-                queries.addElement(uprefs[j].getAccessionId().toUpperCase());
+                queries.addElement(
+                        uprefs[j].getAccessionId().toUpperCase());
               }
             }
             else
@@ -461,14 +423,13 @@ public class DBRefFetcher implements Runnable
                   // resolve the string against PICR to recover valid IDs
                   try
                   {
-                    presp = picrClient
-                            .getUPIForAccession(token, null,
-                                    picrClient.getMappedDatabaseNames(),
-                                    null, true);
+                    presp = picrClient.getUPIForAccession(token, null,
+                            picrClient.getMappedDatabaseNames(), null,
+                            true);
                   } catch (Exception e)
                   {
-                    System.err.println("Exception with Picr for '" + token
-                            + "'\n");
+                    System.err.println(
+                            "Exception with Picr for '" + token + "'\n");
                     e.printStackTrace();
                   }
                 }
@@ -480,8 +441,8 @@ public class DBRefFetcher implements Runnable
                     // present, and do a transferReferences
                     // otherwise transfer non sequence x-references directly.
                   }
-                  System.out
-                          .println("Validated ID against PICR... (for what its worth):"
+                  System.out.println(
+                          "Validated ID against PICR... (for what its worth):"
                                   + token);
                   addSeqId(sequence, token);
                   queries.addElement(token.toUpperCase());
@@ -489,7 +450,8 @@ public class DBRefFetcher implements Runnable
                 else
                 {
                   // if ()
-                  // System.out.println("Not querying source with token="+token+"\n");
+                  // System.out.println("Not querying source with
+                  // token="+token+"\n");
                   addSeqId(sequence, token);
                   queries.addElement(token.toUpperCase());
                 }
@@ -501,14 +463,20 @@ public class DBRefFetcher implements Runnable
       // advance to next database
       db++;
     } // all databases have been queried
-    if (sbuffer.length() > 0)
+    if (!warningMessages.isEmpty())
     {
-      output.setText(MessageManager
-              .getString("label.your_sequences_have_been_verified")
-              + sbuffer.toString());
+      StringBuilder sb = new StringBuilder(warningMessages.size() * 30);
+      sb.append(MessageManager
+              .getString("label.your_sequences_have_been_verified"));
+      for (String msg : warningMessages)
+      {
+        sb.append(msg).append(NEWLINE);
+      }
+      output.setText(sb.toString());
+
       Desktop.addInternalFrame(output,
-              MessageManager.getString("label.sequence_names_updated"),
-              600, 300);
+              MessageManager.getString("label.sequences_updated"), 600,
+              300);
       // The above is the dataset, we must now find out the index
       // of the viewed sequence
 
@@ -529,38 +497,50 @@ public class DBRefFetcher implements Runnable
 
   /**
    * Verify local sequences in seqRefs against the retrieved sequence database
-   * records.
+   * records. Returns true if any sequence was modified as a result (start/end
+   * changed and/or sequence enlarged), else false.
    * 
+   * @param sdataset
+   *          dataset sequences we are retrieving for
+   * @param dbSource
+   *          database source we are retrieving from
+   * @param retrievedAl
+   *          retrieved sequences as alignment
    * @param trimDatasetSeqs
-   * 
+   *          if true, sequences will not be enlarged to match longer retrieved
+   *          sequences, only their start/end adjusted
+   * @param warningMessages
+   *          list that any warning messages are added to (duplicates skipped)
    */
-  void transferReferences(Vector<SequenceI> sdataset, String dbSource,
-          AlignmentI retrievedAl, boolean trimDatasetSeqs)
+  boolean transferReferences(Vector<SequenceI> sdataset, String dbSource,
+          AlignmentI retrievedAl, boolean trimDatasetSeqs,
+          List<String> warningMessages)
   {
     // System.out.println("trimming ? " + trimDatasetSeqs);
     if (retrievedAl == null || retrievedAl.getHeight() == 0)
     {
-      return;
+      return false;
     }
-    SequenceI[] retrieved = recoverDbSequences(retrievedAl
-            .getSequencesArray());
+
+    boolean modified = false;
+    SequenceI[] retrieved = recoverDbSequences(
+            retrievedAl.getSequencesArray());
     SequenceI sequence = null;
-    boolean transferred = false;
-    StringBuilder messages = new StringBuilder(64);
 
-    for (SequenceI entry : retrieved)
+    for (SequenceI retrievedSeq : retrieved)
     {
       // Work out which sequences this sequence matches,
       // taking into account all accessionIds and names in the file
-      Vector<SequenceI> sequenceMatches = new Vector<SequenceI>();
+      Vector<SequenceI> sequenceMatches = new Vector<>();
       // look for corresponding accession ids
-      DBRefEntry[] entryRefs = jalview.util.DBRefUtils.selectRefs(
-              entry.getDBRefs(), new String[] { dbSource });
+      DBRefEntry[] entryRefs = DBRefUtils
+              .selectRefs(retrievedSeq.getDBRefs(), new String[]
+              { dbSource });
       if (entryRefs == null)
       {
         System.err
                 .println("Dud dbSource string ? no entryrefs selected for "
-                        + dbSource + " on " + entry.getName());
+                        + dbSource + " on " + retrievedSeq.getName());
         continue;
       }
       for (int j = 0; j < entryRefs.length; j++)
@@ -614,7 +594,8 @@ public class DBRefFetcher implements Runnable
        */
       // sequenceMatches now contains the set of all sequences associated with
       // the returned db record
-      String entrySeq = entry.getSequenceAsString().toUpperCase();
+      final String retrievedSeqString = retrievedSeq.getSequenceAsString();
+      String entrySeq = retrievedSeqString.toUpperCase();
       for (int m = 0; m < sequenceMatches.size(); m++)
       {
         sequence = sequenceMatches.elementAt(m);
@@ -627,13 +608,14 @@ public class DBRefFetcher implements Runnable
         // TODO:
         // verify sequence against the entry sequence
 
-        String nonGapped = AlignSeq.extractGaps("-. ",
-                sequence.getSequenceAsString()).toUpperCase();
-
-        int absStart = entrySeq.indexOf(nonGapped);
         Mapping mp;
-
         final int sequenceStart = sequence.getStart();
+
+        boolean remoteEnclosesLocal = false;
+        String nonGapped = AlignSeq
+                .extractGaps("-. ", sequence.getSequenceAsString())
+                .toUpperCase();
+        int absStart = entrySeq.indexOf(nonGapped);
         if (absStart == -1)
         {
           // couldn't find local sequence in sequence from database, so check if
@@ -643,87 +625,89 @@ public class DBRefFetcher implements Runnable
           {
             // verification failed. couldn't find any relationship between
             // entrySeq and local sequence
-            messages.append(sequence.getName()
-                    + " Sequence not 100% match with " + entry.getName()
-                    + "\n");
+            // messages suppressed as many-to-many matches are confusing
+            // String msg = sequence.getName()
+            // + " Sequence not 100% match with "
+            // + retrievedSeq.getName();
+            // addWarningMessage(warningMessages, msg);
             continue;
           }
           /*
-           * found match for the whole of the database sequence within the local
-           * sequence's reference frame. 
+           * retrieved sequence is a proper subsequence of local sequence
            */
-          transferred = true;
-          sbuffer.append(sequence.getName() + " has " + absStart
-                  + " prefixed residues compared to " + entry.getName()
-                  + "\n");
+          String msg = sequence.getName() + " has " + absStart
+                  + " prefixed residues compared to "
+                  + retrievedSeq.getName();
+          addWarningMessage(warningMessages, msg);
 
           /*
            * So create a mapping to the external entry from the matching region of 
            * the local sequence, and leave local start/end untouched. 
            */
-          mp = new Mapping(null, new int[] { sequenceStart + absStart,
-              sequenceStart + absStart + entrySeq.length() - 1 }, new int[]
-          { entry.getStart(), entry.getStart() + entrySeq.length() - 1 },
+          mp = new Mapping(null,
+                  new int[]
+                  { sequenceStart + absStart,
+                      sequenceStart + absStart + entrySeq.length() - 1 },
+                  new int[]
+                  { retrievedSeq.getStart(),
+                      retrievedSeq.getStart() + entrySeq.length() - 1 },
                   1, 1);
           updateRefFrame = false;
         }
         else
         {
           /*
-           * found a match for the local sequence within sequence from 
-           * the external database 
+           * local sequence is a subsequence of (or matches) retrieved sequence
            */
-          transferred = true;
-
-          // update start and end of local sequence to place it in entry's
-          // reference frame.
-          // apply identity map map from whole of local sequence to matching
-          // region of database
-          // sequence
-          mp = null; // Mapping.getIdentityMap();
-          // new Mapping(null,
-          // new int[] { absStart+sequence.getStart(),
-          // absStart+sequence.getStart()+entrySeq.length()-1},
-          // new int[] { entry.getStart(), entry.getEnd() }, 1, 1);
-          // relocate local features for updated start
+          remoteEnclosesLocal = true;
+          mp = null;
 
           if (updateRefFrame)
           {
-            if (sequence.getSequenceFeatures() != null)
+            /*
+             * relocate existing sequence features by offset
+             */
+            int startShift = absStart - sequenceStart + 1;
+            if (startShift != 0)
             {
-              /*
-               * relocate existing sequence features by offset
-               */
-              SequenceFeature[] sf = sequence.getSequenceFeatures();
-              int start = sequenceStart;
-              int end = sequence.getEnd();
-              int startShift = 1 - absStart - start; // how much the features
-                                                     // are
-              // to be shifted by
-              for (int sfi = 0; sfi < sf.length; sfi++)
-              {
-                if (sf[sfi].getBegin() >= start && sf[sfi].getEnd() <= end)
-                {
-                  // shift feature along by absstart
-                  sf[sfi].setBegin(sf[sfi].getBegin() + startShift);
-                  sf[sfi].setEnd(sf[sfi].getEnd() + startShift);
-                }
-              }
+              modified |= sequence.getFeatures().shiftFeatures(1,
+                      startShift);
             }
           }
         }
 
         System.out.println("Adding dbrefs to " + sequence.getName()
-                + " from " + dbSource + " sequence : " + entry.getName());
-        sequence.transferAnnotation(entry, mp);
+                + " from " + dbSource + " sequence : "
+                + retrievedSeq.getName());
+        sequence.transferAnnotation(retrievedSeq, mp);
 
-        absStart += entry.getStart();
+        absStart += retrievedSeq.getStart();
         int absEnd = absStart + nonGapped.length() - 1;
         if (!trimDatasetSeqs)
         {
-          // insert full length sequence from record
-          sequence.setSequence(entry.getSequenceAsString());
-          sequence.setStart(entry.getStart());
+          /*
+           * update start position and/or expand to longer retrieved sequence
+           */
+          if (!retrievedSeqString.equals(sequence.getSequenceAsString())
+                  && remoteEnclosesLocal)
+          {
+            sequence.setSequence(retrievedSeqString);
+            modified = true;
+            addWarningMessage(warningMessages,
+                    "Sequence for " + sequence.getName() + " expanded from "
+                            + retrievedSeq.getName());
+          }
+          if (sequence.getStart() != retrievedSeq.getStart())
+          {
+            sequence.setStart(retrievedSeq.getStart());
+            modified = true;
+            if (absStart != sequenceStart)
+            {
+              addWarningMessage(warningMessages,
+                      "Start/end position for " + sequence.getName()
+                              + " updated from " + retrievedSeq.getName());
+            }
+          }
         }
         if (updateRefFrame)
         {
@@ -731,24 +715,34 @@ public class DBRefFetcher implements Runnable
           if (trimDatasetSeqs)
           {
             // just fix start/end
-            sequence.setStart(absStart);
-            sequence.setEnd(absEnd);
+            if (sequence.getStart() != absStart
+                    || sequence.getEnd() != absEnd)
+            {
+              sequence.setStart(absStart);
+              sequence.setEnd(absEnd);
+              modified = true;
+              addWarningMessage(warningMessages,
+                      "Start/end for " + sequence.getName()
+                              + " updated from " + retrievedSeq.getName());
+            }
           }
           // search for alignment sequences to update coordinate frame for
           for (int alsq = 0; alsq < alseqs.length; alsq++)
           {
             if (alseqs[alsq].getDatasetSequence() == sequence)
             {
-              String ngAlsq = AlignSeq.extractGaps("-. ",
-                      alseqs[alsq].getSequenceAsString()).toUpperCase();
+              String ngAlsq = AlignSeq
+                      .extractGaps("-. ",
+                              alseqs[alsq].getSequenceAsString())
+                      .toUpperCase();
               int oldstrt = alseqs[alsq].getStart();
               alseqs[alsq].setStart(sequence.getSequenceAsString()
-                      .toUpperCase().indexOf(ngAlsq)
-                      + sequence.getStart());
+                      .toUpperCase().indexOf(ngAlsq) + sequence.getStart());
               if (oldstrt != alseqs[alsq].getStart())
               {
-                alseqs[alsq].setEnd(ngAlsq.length()
-                        + alseqs[alsq].getStart() - 1);
+                alseqs[alsq].setEnd(
+                        ngAlsq.length() + alseqs[alsq].getStart() - 1);
+                modified = true;
               }
             }
           }
@@ -761,14 +755,22 @@ public class DBRefFetcher implements Runnable
         // and remove it from the rest
         // TODO: decide if we should remove annotated sequence from set
         sdataset.remove(sequence);
-        // TODO: should we make a note of sequences that have received new DB
-        // ids, so we can query all enabled DAS servers for them ?
       }
     }
-    if (!transferred)
+    return modified;
+  }
+
+  /**
+   * Adds the message to the list unless it already contains it
+   * 
+   * @param messageList the list to add the message to
+   * @param msg the message to add if the list does not already contain it
+   */
+  void addWarningMessage(List<String> messageList, String msg)
+  {
+    if (!messageList.contains(msg))
     {
-      // report the ID/sequence mismatches
-      sbuffer.append(messages);
+      messageList.add(msg);
     }
   }
 
@@ -780,8 +782,9 @@ public class DBRefFetcher implements Runnable
    */
   private SequenceI[] recoverDbSequences(SequenceI[] sequencesArray)
   {
-    Vector<SequenceI> nseq = new Vector<SequenceI>();
-    for (int i = 0; sequencesArray != null && i < sequencesArray.length; i++)
+    Vector<SequenceI> nseq = new Vector<>();
+    for (int i = 0; sequencesArray != null
+            && i < sequencesArray.length; i++)
     {
       nseq.addElement(sequencesArray[i]);
       DBRefEntry[] dbr = sequencesArray[i].getDBRefs();