JAL-1705 align CDS and peptide products to transcripts

[jalview.git] / src / jalview / ext / ensembl / EnsemblSeqProxy.java
diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java

index 8c1e972..869a702 100644 (file)
--- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java
+++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java
@@ -38,7 +38,7 @@ import java.util.Map.Entry;
  public abstract class EnsemblSeqProxy extends EnsemblRestClient
  {
    private static final List<String> CROSS_REFERENCES = Arrays
-          .asList(new String[] { "CCDS" });
+          .asList(new String[] { "CCDS", "Uniprot/SWISSPROT" });
  
    protected static final String CONSEQUENCE_TYPE = "consequence_type";
  
@@ -46,33 +46,30 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
  
    protected static final String ID = "ID";
  
-  /*
-   * this needs special handling, as it isA sequence_variant in the
-   * Sequence Ontology, but behaves in Ensembl as if it isA transcript
-   */
-  protected static final String NMD_VARIANT = "NMD_transcript_variant";
-
    protected static final String NAME = "Name";
  
+  /*
+   * enum for 'type' parameter to the /sequence REST service
+   */
    public enum EnsemblSeqType
    {
      /**
-     * type=genomic for the full dna including introns
+     * type=genomic to fetch full dna including introns
       */
      GENOMIC("genomic"),
  
      /**
-     * type=cdna for transcribed dna including UTRs
+     * type=cdna to fetch transcribed dna including UTRs
       */
      CDNA("cdna"),
  
      /**
-     * type=cds for coding dna excluding UTRs
+     * type=cds to fetch coding dna excluding UTRs
       */
      CDS("cds"),
  
      /**
-     * type=protein for the peptide product sequence
+     * type=protein to fetch peptide product sequence
       */
      PROTEIN("protein");
  
@@ -157,17 +154,14 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
                  + ")";
          System.err.println(msg);
          break;
-        // if (alignment != null)
-        // {
-        // break; // return what we got
-        // }
-        // else
-        // {
-        // throw new JalviewException(msg, r);
-        // }
        }
      }
  
+    if (alignment == null)
+    {
+      return null;
+    }
+
      /*
       * fetch and transfer genomic sequence features,
       * fetch protein product and add as cross-reference
@@ -207,7 +201,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
         * get 'dummy' genomic sequence with exon, cds and variation features
         */
        SequenceI genomicSequence = null;
-      EnsemblOverlap gffFetcher = new EnsemblOverlap();
+      EnsemblFeatures gffFetcher = new EnsemblFeatures();
        EnsemblFeatureType[] features = getFeaturesToFetch();
        AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId,
                features);
@@ -274,7 +268,10 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
        MapList mapList = mapCdsToProtein(querySeq, proteinSeq);
        if (mapList != null)
        {
-        Mapping map = new Mapping(proteinSeq.getDatasetSequence(), mapList);
+        // clunky: ensure the Uniprot xref (if we have one) is on the mapped sequence
+        SequenceI ds = proteinSeq.getDatasetSequence();
+        ds.setSourceDBRef(proteinSeq.getSourceDBRef());
+        Mapping map = new Mapping(ds, mapList);
          DBRefEntry dbr = new DBRefEntry(getDbSource(), getDbVersion(),
                  accId, map);
          querySeq.getDatasetSequence().addDBRef(dbr);
@@ -294,8 +291,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
    }
  
    /**
-   * Get Uniprot and PDB xrefs from Ensembl, and attach them to the protein
-   * sequence
+   * Get database xrefs from Ensembl, and attach them to the sequence
     * 
     * @param seq
     */
@@ -347,7 +343,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
      int mappedDnaLength = getCdsRanges(dnaSeq, ranges);
  
      int proteinLength = proteinSeq.getLength();
-    List<int[]> proteinRange = new ArrayList<int[]>();
+    int proteinEnd = proteinLength;
      int proteinStart = 1;
  
      /*
@@ -359,15 +355,20 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
        proteinStart = 2;
        proteinLength--;
      }
-    proteinRange.add(new int[] { proteinStart, proteinLength });
+    List<int[]> proteinRange = new ArrayList<int[]>();
  
      /*
       * dna length should map to protein (or protein plus stop codon)
       */
      int codesForResidues = mappedDnaLength / 3;
-    if (codesForResidues == proteinLength
-            || codesForResidues == (proteinLength + 1))
+    if (codesForResidues == (proteinLength + 1))
      {
+      MappingUtils.unmapStopCodon(ranges, mappedDnaLength);
+      codesForResidues--;
+    }
+    if (codesForResidues == proteinLength)
+    {
+      proteinRange.add(new int[] { proteinStart, proteinEnd });
        return new MapList(ranges, proteinRange, 3, 1);
      }
      return null;
@@ -392,14 +393,14 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
      {
        return 0;
      }
+    SequenceOntologyI so = SequenceOntologyFactory.getInstance();
      int mappedDnaLength = 0;
      for (SequenceFeature sf : sfs)
      {
        /*
         * process a CDS feature (or a sub-type of CDS)
         */
-      if (SequenceOntologyFactory.getInstance().isA(sf.getType(),
-              SequenceOntologyI.CDS))
+      if (so.isA(sf.getType(), SequenceOntologyI.CDS))
        {
          int phase = 0;
          try {
@@ -414,7 +415,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
           */
          int begin = sf.getBegin();
          int end = sf.getEnd();
-        if (ranges.isEmpty() && phase > 0)
+        if (ranges.isEmpty())
          {
            begin += phase;
            if (begin > end)
@@ -719,18 +720,18 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
        /*
         * for sequence_variant, make an additional feature with consequence
         */
-      if (SequenceOntologyFactory.getInstance().isA(sf.getType(),
-              SequenceOntologyI.SEQUENCE_VARIANT))
-      {
-        String consequence = (String) sf.getValue(CONSEQUENCE_TYPE);
-        if (consequence != null)
-        {
-          SequenceFeature sf2 = new SequenceFeature("consequence",
-                  consequence, copy.getBegin(), copy.getEnd(), 0f,
-                  null);
-          targetSequence.addSequenceFeature(sf2);
-        }
-      }
+      // if (SequenceOntologyFactory.getInstance().isA(sf.getType(),
+      // SequenceOntologyI.SEQUENCE_VARIANT))
+      // {
+      // String consequence = (String) sf.getValue(CONSEQUENCE_TYPE);
+      // if (consequence != null)
+      // {
+      // SequenceFeature sf2 = new SequenceFeature("consequence",
+      // consequence, copy.getBegin(), copy.getEnd(), 0f,
+      // null);
+      // targetSequence.addSequenceFeature(sf2);
+      // }
+      // }
      }
    }
  
@@ -750,6 +751,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
        return false;
      }
  
+    // long start = System.currentTimeMillis();
      SequenceFeature[] sfs = sourceSequence.getSequenceFeatures();
      MapList mapping = getGenomicRangesFromFeatures(sourceSequence, accessionId,
              targetSequence.getStart());
@@ -758,7 +760,13 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
        return false;
      }
  
-    return transferFeatures(sfs, targetSequence, mapping, accessionId);
+    boolean result = transferFeatures(sfs, targetSequence, mapping,
+            accessionId);
+    // System.out.println("transferFeatures (" + (sfs.length) + " --> "
+    // + targetSequence.getSequenceFeatures().length + ") to "
+    // + targetSequence.getName()
+    // + " took " + (System.currentTimeMillis() - start) + "ms");
+    return result;
    }
  
    /**
@@ -925,6 +933,22 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
          count++;
        }
      }
+
+    /*
+     * ugly sort to get sequence features in start position order
+     * - would be better to store in Sequence as a TreeSet instead?
+     */
+    Arrays.sort(peptide.getSequenceFeatures(),
+            new Comparator<SequenceFeature>()
+            {
+              @Override
+              public int compare(SequenceFeature o1, SequenceFeature o2)
+              {
+                int c = Integer.compare(o1.getBegin(), o2.getBegin());
+                return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd())
+                        : c;
+              }
+            });
      return count;
    }
  
@@ -1105,7 +1129,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
     */
    public static boolean isTranscript(String featureType)
    {
-    return NMD_VARIANT.equals(featureType)
+    return SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(featureType)
              || SequenceOntologyFactory.getInstance().isA(featureType,
                      SequenceOntologyI.TRANSCRIPT);
    }