JAL-1705 align CDS and peptide products to transcripts
[jalview.git] / src / jalview / ext / ensembl / EnsemblSeqProxy.java
index 8c1e972..869a702 100644 (file)
@@ -38,7 +38,7 @@ import java.util.Map.Entry;
 public abstract class EnsemblSeqProxy extends EnsemblRestClient
 {
   private static final List<String> CROSS_REFERENCES = Arrays
-          .asList(new String[] { "CCDS" });
+          .asList(new String[] { "CCDS", "Uniprot/SWISSPROT" });
 
   protected static final String CONSEQUENCE_TYPE = "consequence_type";
 
@@ -46,33 +46,30 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
 
   protected static final String ID = "ID";
 
-  /*
-   * this needs special handling, as it isA sequence_variant in the
-   * Sequence Ontology, but behaves in Ensembl as if it isA transcript
-   */
-  protected static final String NMD_VARIANT = "NMD_transcript_variant";
-
   protected static final String NAME = "Name";
 
+  /*
+   * enum for 'type' parameter to the /sequence REST service
+   */
   public enum EnsemblSeqType
   {
     /**
-     * type=genomic for the full dna including introns
+     * type=genomic to fetch full dna including introns
      */
     GENOMIC("genomic"),
 
     /**
-     * type=cdna for transcribed dna including UTRs
+     * type=cdna to fetch spliced transcript dna including UTRs
      */
     CDNA("cdna"),
 
     /**
-     * type=cds for coding dna excluding UTRs
+     * type=cds to fetch coding dna excluding UTRs
      */
     CDS("cds"),
 
     /**
-     * type=protein for the peptide product sequence
+     * type=protein to fetch peptide product sequence
      */
     PROTEIN("protein");
 
@@ -157,17 +154,14 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
                 + ")";
         System.err.println(msg);
         break;
-        // if (alignment != null)
-        // {
-        // break; // return what we got
-        // }
-        // else
-        // {
-        // throw new JalviewException(msg, r);
-        // }
       }
     }
 
+    if (alignment == null)
+    {
+      return null;
+    }
+
     /*
      * fetch and transfer genomic sequence features,
      * fetch protein product and add as cross-reference
@@ -207,7 +201,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
        * get 'dummy' genomic sequence with exon, cds and variation features
        */
       SequenceI genomicSequence = null;
-      EnsemblOverlap gffFetcher = new EnsemblOverlap();
+      EnsemblFeatures gffFetcher = new EnsemblFeatures();
       EnsemblFeatureType[] features = getFeaturesToFetch();
       AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId,
               features);
@@ -274,7 +268,10 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
       MapList mapList = mapCdsToProtein(querySeq, proteinSeq);
       if (mapList != null)
       {
-        Mapping map = new Mapping(proteinSeq.getDatasetSequence(), mapList);
+        // clunky: ensure Uniprot xref (if we have one) is on mapped sequence
+        SequenceI ds = proteinSeq.getDatasetSequence();
+        ds.setSourceDBRef(proteinSeq.getSourceDBRef());
+        Mapping map = new Mapping(ds, mapList);
         DBRefEntry dbr = new DBRefEntry(getDbSource(), getDbVersion(),
                 accId, map);
         querySeq.getDatasetSequence().addDBRef(dbr);
@@ -294,8 +291,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
   }
 
   /**
-   * Get Uniprot and PDB xrefs from Ensembl, and attach them to the protein
-   * sequence
+   * Get database xrefs from Ensembl, and attach them to the sequence
    * 
    * @param seq
    */
@@ -347,7 +343,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
     int mappedDnaLength = getCdsRanges(dnaSeq, ranges);
 
     int proteinLength = proteinSeq.getLength();
-    List<int[]> proteinRange = new ArrayList<int[]>();
+    int proteinEnd = proteinLength;
     int proteinStart = 1;
 
     /*
@@ -359,15 +355,20 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
       proteinStart = 2;
       proteinLength--;
     }
-    proteinRange.add(new int[] { proteinStart, proteinLength });
+    List<int[]> proteinRange = new ArrayList<int[]>();
 
     /*
      * dna length should map to protein (or protein plus stop codon)
      */
     int codesForResidues = mappedDnaLength / 3;
-    if (codesForResidues == proteinLength
-            || codesForResidues == (proteinLength + 1))
+    if (codesForResidues == (proteinLength + 1))
     {
+      MappingUtils.unmapStopCodon(ranges, mappedDnaLength);
+      codesForResidues--;
+    }
+    if (codesForResidues == proteinLength)
+    {
+      proteinRange.add(new int[] { proteinStart, proteinEnd });
       return new MapList(ranges, proteinRange, 3, 1);
     }
     return null;
@@ -392,14 +393,14 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
     {
       return 0;
     }
+    SequenceOntologyI so = SequenceOntologyFactory.getInstance();
     int mappedDnaLength = 0;
     for (SequenceFeature sf : sfs)
     {
       /*
        * process a CDS feature (or a sub-type of CDS)
        */
-      if (SequenceOntologyFactory.getInstance().isA(sf.getType(),
-              SequenceOntologyI.CDS))
+      if (so.isA(sf.getType(), SequenceOntologyI.CDS))
       {
         int phase = 0;
         try {
@@ -414,7 +415,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
          */
         int begin = sf.getBegin();
         int end = sf.getEnd();
-        if (ranges.isEmpty() && phase > 0)
+        if (ranges.isEmpty())
         {
           begin += phase;
           if (begin > end)
@@ -719,18 +720,18 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
       /*
        * for sequence_variant, make an additional feature with consequence
        */
-      if (SequenceOntologyFactory.getInstance().isA(sf.getType(),
-              SequenceOntologyI.SEQUENCE_VARIANT))
-      {
-        String consequence = (String) sf.getValue(CONSEQUENCE_TYPE);
-        if (consequence != null)
-        {
-          SequenceFeature sf2 = new SequenceFeature("consequence",
-                  consequence, copy.getBegin(), copy.getEnd(), 0f,
-                  null);
-          targetSequence.addSequenceFeature(sf2);
-        }
-      }
+      // if (SequenceOntologyFactory.getInstance().isA(sf.getType(),
+      // SequenceOntologyI.SEQUENCE_VARIANT))
+      // {
+      // String consequence = (String) sf.getValue(CONSEQUENCE_TYPE);
+      // if (consequence != null)
+      // {
+      // SequenceFeature sf2 = new SequenceFeature("consequence",
+      // consequence, copy.getBegin(), copy.getEnd(), 0f,
+      // null);
+      // targetSequence.addSequenceFeature(sf2);
+      // }
+      // }
     }
   }
 
@@ -750,6 +751,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
       return false;
     }
 
+    // long start = System.currentTimeMillis();
     SequenceFeature[] sfs = sourceSequence.getSequenceFeatures();
     MapList mapping = getGenomicRangesFromFeatures(sourceSequence, accessionId,
             targetSequence.getStart());
@@ -758,7 +760,13 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
       return false;
     }
 
-    return transferFeatures(sfs, targetSequence, mapping, accessionId);
+    boolean result = transferFeatures(sfs, targetSequence, mapping,
+            accessionId);
+    // System.out.println("transferFeatures (" + (sfs.length) + " --> "
+    // + targetSequence.getSequenceFeatures().length + ") to "
+    // + targetSequence.getName()
+    // + " took " + (System.currentTimeMillis() - start) + "ms");
+    return result;
   }
 
   /**
@@ -925,6 +933,22 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
         count++;
       }
     }
+
+    // sort features into start position order (TreeSet in Sequence instead?)
+    // NOTE(review): assumes getSequenceFeatures() may be null if none - confirm
+    if (peptide.getSequenceFeatures() != null)
+    {
+      Arrays.sort(peptide.getSequenceFeatures(),
+              new Comparator<SequenceFeature>()
+              {
+                @Override
+                public int compare(SequenceFeature o1, SequenceFeature o2)
+                {
+                  int c = Integer.compare(o1.getBegin(), o2.getBegin());
+                  return c != 0 ? c : Integer.compare(o1.getEnd(), o2.getEnd());
+                }
+              });
+    }
     return count;
   }
 
@@ -1105,7 +1129,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
    */
   public static boolean isTranscript(String featureType)
   {
-    return NMD_VARIANT.equals(featureType)
+    return SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(featureType)
             || SequenceOntologyFactory.getInstance().isA(featureType,
                     SequenceOntologyI.TRANSCRIPT);
   }