JAL-1705 EnsemblGene added, and related refactoring
[jalview.git] / src / jalview / ext / ensembl / EnsemblSeqProxy.java
index b805417..7153a9e 100644 (file)
@@ -92,7 +92,12 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
       return (forwards ? 1 : -1) * Integer.compare(o1[0], o2[0]);
     }
 
-  };
+  }
+
+  /*
+   * genomic sequence, with features retrieved from the REST overlap service
+   */
+  private SequenceI genomicSequence;
 
   /**
    * Constructor
@@ -177,24 +182,32 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
       /*
        * get 'dummy' genomic sequence with exon, cds and variation features
        */
-      EnsemblOverlap gffFetcher = new EnsemblOverlap();
-      EnsemblFeatureType[] features = getFeaturesToFetch();
-      AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId,
-              features);
-      if (geneFeatures.getHeight() > 0)
+      if (genomicSequence == null)
+      {
+        EnsemblOverlap gffFetcher = new EnsemblOverlap();
+        EnsemblFeatureType[] features = getFeaturesToFetch();
+        AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId,
+                features);
+        if (geneFeatures.getHeight() > 0)
+        {
+          /*
+           * transfer features to the query sequence
+           */
+          genomicSequence = geneFeatures.getSequenceAt(0);
+        }
+      }
+      if (genomicSequence != null)
       {
-        /*
-         * transfer features to the query sequence
-         */
-        SequenceI genomicSequence = geneFeatures.getSequenceAt(0);
         SequenceI querySeq = alignment.findName(accId);
-        transferFeatures(accId, genomicSequence, querySeq);
+        if (transferFeatures(accId, genomicSequence, querySeq))
+        {
 
-        /*
-         * fetch and map protein product, and add it as a cross-reference
-         * of the retrieved sequence
-         */
-        addProteinProduct(querySeq);
+          /*
+           * fetch and map protein product, and add it as a cross-reference
+           * of the retrieved sequence
+           */
+          addProteinProduct(querySeq);
+        }
       }
     } catch (IOException e)
     {
@@ -398,19 +411,19 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
   }
 
   @Override
-  public boolean useGetRequest()
+  protected boolean useGetRequest()
   {
     return false;
   }
 
   @Override
-  public String getRequestMimeType()
+  protected String getRequestMimeType()
   {
     return "application/json";
   }
 
   @Override
-  public String getResponseMimeType()
+  protected String getResponseMimeType()
   {
     return "text/x-fasta";
   }
@@ -441,17 +454,27 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
    * backwards (for negative strand). Aborts and returns null if both positive
    * and negative strand are found (this should not normally happen).
    * 
-   * @param sfs
+   * @param sourceSequence
    * @param accId
+   * @param start
+   *          the start position of the sequence we are mapping to
    * @return
    */
-  protected MapList getGenomicRanges(SequenceFeature[] sfs, String accId)
+  protected MapList getGenomicRanges(SequenceI sourceSequence,
+          String accId, int start)
   {
+    SequenceFeature[] sfs = sourceSequence.getSequenceFeatures();
+    if (sfs == null)
+    {
+      return null;
+    }
+
     /*
      * generously size for initial number of cds regions
      * (worst case titin Q8WZ42 has c. 313 exons)
      */
     List<int[]> regions = new ArrayList<int[]>(100);
+    int sourceLength = sourceSequence.getLength();
     int mappedLength = 0;
     int direction = 1; // forward
     boolean directionSet = false;
@@ -485,12 +508,28 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
           }
           else
           {
-            regions.add(new int[] { sf.getBegin(), sf.getEnd() });
-          }
-          mappedLength += Math.abs(sf.getEnd() - sf.getBegin() + 1);
+          regions.add(new int[] { sf.getBegin(), sf.getEnd() });
         }
+        mappedLength += Math.abs(sf.getEnd() - sf.getBegin() + 1);
+
+        if (mappedLength >= sourceLength)
+        {
+          /*
+           * break for the case of matching gene features v gene sequence
+           * - only need to locate the 'gene' feature for accId
+           */
+          break;
+        }
+      }
     }
   
+    if (regions.isEmpty())
+    {
+      System.out.println("Failed to identify target sequence for " + accId
+              + " from genomic features");
+      return null;
+    }
+
     /*
      * a final sort is needed since Ensembl returns CDS sorted within source
      * (havana / ensembl_havana)
@@ -498,14 +537,16 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
     Collections.sort(regions, new RangeSorter(direction == 1));
   
     List<int[]> to = new ArrayList<int[]>();
-    to.add(new int[] { 1, mappedLength });
+    to.add(new int[] { start, start + mappedLength - 1 });
   
     return new MapList(regions, to, 1, 1);
   }
 
   /**
-   * Returns true if the sequence feature identifies positions of the genomic
-   * sequence feature which are within the sequence being retrieved.
+   * Returns true if the sequence feature marks positions of the genomic
+   * sequence which are within the sequence being retrieved. For
+   * example, an 'exon' feature whose parent is the target transcript marks the
+   * cDNA positions of the transcript.
    * 
    * @param sf
    * @param accId
@@ -527,13 +568,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
   protected void transferFeature(SequenceFeature sf,
           SequenceI targetSequence, MapList overlap)
   {
-    String parent = (String) sf.getValue(PARENT);
-    if (parent != null && !parent.contains(targetSequence.getName()))
-    {
-      // this genomic feature belongs to a different transcript
-      return;
-    }
-
     int start = sf.getBegin();
     int end = sf.getEnd();
     int[] mappedRange = overlap.locateInTo(start, end);
@@ -541,9 +575,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
     if (mappedRange != null)
     {
       SequenceFeature copy = new SequenceFeature(sf);
-      int offset = targetSequence.getStart() - 1;
-      copy.setBegin(offset + Math.min(mappedRange[0], mappedRange[1]));
-      copy.setEnd(offset + Math.max(mappedRange[0], mappedRange[1]));
+      copy.setBegin(Math.min(mappedRange[0], mappedRange[1]));
+      copy.setEnd(Math.max(mappedRange[0], mappedRange[1]));
       targetSequence.addSequenceFeature(copy);
 
       /*
@@ -570,17 +603,23 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
    * @param accessionId
    * @param sourceSequence
    * @param targetSequence
+   * @return true if any features were transferred, else false
    */
-  protected void transferFeatures(String accessionId,
+  protected boolean transferFeatures(String accessionId,
           SequenceI sourceSequence, SequenceI targetSequence)
   {
     if (sourceSequence == null || targetSequence == null)
     {
-      return;
+      return false;
     }
 
     SequenceFeature[] sfs = sourceSequence.getSequenceFeatures();
-    MapList overlap = getGenomicRanges(sfs, accessionId);
+    MapList overlap = getGenomicRanges(sourceSequence, accessionId,
+            targetSequence.getStart());
+    if (overlap == null)
+    {
+      return false;
+    }
 
     final boolean forwardStrand = overlap.isFromForwardStrand();
 
@@ -598,23 +637,46 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
       }
     });
 
+    boolean transferred = false;
     for (SequenceFeature sf : sfs)
     {
-      if (retainFeature(sf.getType()))
+      if (retainFeature(sf, accessionId))
       {
         transferFeature(sf, targetSequence, overlap);
+        transferred = true;
       }
     }
+    return transferred;
   }
 
   /**
-   * Answers true if the feature type is one to attach to the retrieved sequence
+   * Answers true if the feature is one to attach to the retrieved sequence
    * 
    * @param type
    * @return
    */
-  protected boolean retainFeature(@SuppressWarnings("unused") String type)
+  protected boolean retainFeature(SequenceFeature sf, String accessionId)
+  {
+    String parent = (String) sf.getValue(PARENT);
+    if (parent != null && !parent.contains(accessionId))
+    {
+      // this genomic feature belongs to a different transcript
+      return false;
+    }
+    return true;
+  }
+
+  @Override
+  public String getDescription()
+  {
+    return "Ensembl " + getSourceEnsemblType().getType()
+            + " sequence with variant features";
+  }
+
+  public AlignmentI getSequenceRecords(String transcriptId,
+          SequenceI geneSeq) throws Exception
   {
-    return true; // default is to keep all
+    this.genomicSequence = geneSeq;
+    return getSequenceRecords(transcriptId);
   }
 }