JAL-1705 further unit tests

[jalview.git] / src / jalview / ext / ensembl / EnsemblSeqProxy.java
diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java

index fb0b01c..b2804f2 100644 (file)
--- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java
+++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java
@@ -1,5 +1,6 @@
  package jalview.ext.ensembl;
  
+import jalview.analysis.AlignmentUtils;
  import jalview.datamodel.Alignment;
  import jalview.datamodel.AlignmentI;
  import jalview.datamodel.DBRefEntry;
@@ -10,7 +11,8 @@ import jalview.datamodel.SequenceI;
  import jalview.exceptions.JalviewException;
  import jalview.io.FastaFile;
  import jalview.io.FileParse;
-import jalview.io.gff.SequenceOntology;
+import jalview.io.gff.SequenceOntologyFactory;
+import jalview.io.gff.SequenceOntologyI;
  import jalview.schemes.ResidueProperties;
  import jalview.util.DBRefUtils;
  import jalview.util.MapList;
@@ -35,18 +37,15 @@ import java.util.Map.Entry;
   */
  public abstract class EnsemblSeqProxy extends EnsemblRestClient
  {
+  private static final List<String> CROSS_REFERENCES = Arrays
+          .asList(new String[] { "CCDS" });
+
    protected static final String CONSEQUENCE_TYPE = "consequence_type";
  
    protected static final String PARENT = "Parent";
  
    protected static final String ID = "ID";
  
-  /*
-   * this needs special handling, as it isA sequence_variant in the
-   * Sequence Ontology, but behaves in Ensembl as if it isA transcript
-   */
-  protected static final String NMD_VARIANT = "NMD_transcript_variant";
-
    protected static final String NAME = "Name";
  
    public enum EnsemblSeqType
@@ -123,7 +122,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
    @Override
    public AlignmentI getSequenceRecords(String query) throws Exception
    {
-    long now = System.currentTimeMillis();
      // TODO use a String... query vararg instead?
  
      // danger: accession separator used as a regex here, a string elsewhere
@@ -152,14 +150,15 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
                  + " chunks. Unexpected problem (" + r.getLocalizedMessage()
                  + ")";
          System.err.println(msg);
-        if (alignment != null)
-        {
-          break; // return what we got
-        }
-        else
-        {
-          throw new JalviewException(msg, r);
-        }
+        break;
+        // if (alignment != null)
+        // {
+        // break; // return what we got
+        // }
+        // else
+        // {
+        // throw new JalviewException(msg, r);
+        // }
        }
      }
  
@@ -172,9 +171,11 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
        addFeaturesAndProduct(accId, alignment);
      }
  
-    inProgress = false;
-    System.out.println(getClass().getName() + " took "
-            + (System.currentTimeMillis() - now) + "ms to fetch");
+    for (SequenceI seq : alignment.getSequences())
+    {
+      getCrossReferences(seq);
+    }
+
      return alignment;
    }
  
@@ -287,6 +288,45 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
    }
  
    /**
+   * Gets cross-references from Ensembl for the configured databases (see
+   * getCrossReferenceDatabases), and attaches them to the dataset sequence
+   * 
+   * @param seq
+   */
+  protected void getCrossReferences(SequenceI seq)
+  {
+    while (seq.getDatasetSequence() != null)
+    {
+      seq = seq.getDatasetSequence();
+    }
+
+    EnsemblXref xrefFetcher = new EnsemblXref();
+    List<DBRefEntry> xrefs = xrefFetcher.getCrossReferences(seq.getName(),
+            getCrossReferenceDatabases());
+    for (DBRefEntry xref : xrefs)
+    {
+      seq.addDBRef(xref);
+      /*
+       * Save any Uniprot xref to be the reference for SIFTS mapping
+       */
+      if (DBRefSource.UNIPROT.equals(xref.getSource()))
+      {
+        seq.setSourceDBRef(xref);
+      }
+    }
+  }
+
+  /**
+   * Returns a list of database names to be used when fetching cross-references.
+   * 
+   * @return
+   */
+  protected List<String> getCrossReferenceDatabases()
+  {
+    return CROSS_REFERENCES;
+  }
+
+  /**
     * Returns a mapping from dna to protein by inspecting sequence features of
     * type "CDS" on the dna.
     * 
@@ -296,29 +336,64 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
     */
    protected MapList mapCdsToProtein(SequenceI dnaSeq, SequenceI proteinSeq)
    {
-    SequenceFeature[] sfs = dnaSeq.getSequenceFeatures();
-    if (sfs == null)
+    List<int[]> ranges = new ArrayList<int[]>(50);
+
+    int mappedDnaLength = getCdsRanges(dnaSeq, ranges);
+
+    int proteinLength = proteinSeq.getLength();
+    List<int[]> proteinRange = new ArrayList<int[]>();
+    int proteinStart = 1;
+
+    /*
+     * incomplete start codon may mean X at start of peptide
+     * we ignore both for mapping purposes
+     */
+    if (proteinSeq.getCharAt(0) == 'X')
      {
-      return null;
+      proteinStart = 2;
+      proteinLength--;
      }
+    proteinRange.add(new int[] { proteinStart, proteinLength });
  
-    List<int[]> ranges = new ArrayList<int[]>(50);
-    SequenceOntology so = SequenceOntology.getInstance();
-
-    int mappedDnaLength = 0;
-    
      /*
-     * Map CDS columns of dna to peptide. No need to worry about reverse strand
-     * dna here since the retrieved sequence is as transcribed (reverse
-     * complement for reverse strand), i.e in the same sense as the peptide. 
+     * dna length should map to protein (or protein plus stop codon)
       */
-    boolean fivePrimeIncomplete = false;
+    int codesForResidues = mappedDnaLength / 3;
+    if (codesForResidues == proteinLength
+            || codesForResidues == (proteinLength + 1))
+    {
+      return new MapList(ranges, proteinRange, 3, 1);
+    }
+    return null;
+  }
+
+  /**
+   * Adds CDS ranges to the ranges list, and returns the total length mapped
+   * from.
+   * 
+   * No need to worry about reverse strand dna here, since the retrieved
+   * sequence is as transcribed (reverse complement for reverse strand), i.e.
+   * in the same sense as the peptide.
+   * 
+   * @param dnaSeq
+   * @param ranges
+   * @return
+   */
+  protected int getCdsRanges(SequenceI dnaSeq, List<int[]> ranges)
+  {
+    SequenceFeature[] sfs = dnaSeq.getSequenceFeatures();
+    if (sfs == null)
+    {
+      return 0;
+    }
+    int mappedDnaLength = 0;
      for (SequenceFeature sf : sfs)
      {
        /*
         * process a CDS feature (or a sub-type of CDS)
         */
-      if (so.isA(sf.getType(), SequenceOntology.CDS))
+      if (SequenceOntologyFactory.getInstance().isA(sf.getType(),
+              SequenceOntologyI.CDS))
        {
          int phase = 0;
          try {
@@ -335,7 +410,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
          int end = sf.getEnd();
          if (ranges.isEmpty() && phase > 0)
          {
-          fivePrimeIncomplete = true;
            begin += phase;
            if (begin > end)
            {
@@ -346,26 +420,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
          mappedDnaLength += Math.abs(end - begin) + 1;
        }
      }
-    int proteinLength = proteinSeq.getLength();
-    List<int[]> proteinRange = new ArrayList<int[]>();
-    int proteinStart = 1;
-    if (fivePrimeIncomplete && proteinSeq.getCharAt(0) == 'X')
-    {
-      proteinStart = 2;
-      proteinLength--;
-    }
-    proteinRange.add(new int[] { proteinStart, proteinLength });
-
-    /*
-     * dna length should map to protein (or protein plus stop codon)
-     */
-    int codesForResidues = mappedDnaLength / 3;
-    if (codesForResidues == proteinLength
-            || codesForResidues == (proteinLength + 1))
-    {
-      return new MapList(ranges, proteinRange, 3, 1);
-    }
-    return null;
+    return mappedDnaLength;
    }
  
    /**
@@ -526,7 +581,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
     *          the start position of the sequence we are mapping to
     * @return
     */
-  protected MapList getGenomicRanges(SequenceI sourceSequence,
+  protected MapList getGenomicRangesFromFeatures(SequenceI sourceSequence,
            String accId, int start)
    {
      SequenceFeature[] sfs = sourceSequence.getSequenceFeatures();
@@ -552,11 +607,12 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
         */
        if (identifiesSequence(sf, accId))
        {
-          int strand = sf.getStrand();
-  
-          if (directionSet && strand != direction)
-          {
-            // abort - mix of forward and backward
+        int strand = sf.getStrand();
+        strand = strand == 0 ? 1 : strand; // treat unknown as forward
+
+        if (directionSet && strand != direction)
+        {
+          // abort - mix of forward and backward
            System.err.println("Error: forward and backward strand for "
                    + accId);
              return null;
@@ -601,8 +657,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
       */
      Collections.sort(regions, new RangeSorter(direction == 1));
    
-    List<int[]> to = new ArrayList<int[]>();
-    to.add(new int[] { start, start + mappedLength - 1 });
+    List<int[]> to = Arrays.asList(new int[] { start,
+        start + mappedLength - 1 });
    
      return new MapList(regions, to, 1, 1);
    }
@@ -657,7 +713,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
        /*
         * for sequence_variant, make an additional feature with consequence
         */
-      if (SequenceOntology.getInstance().isSequenceVariant(sf.getType()))
+      if (SequenceOntologyFactory.getInstance().isA(sf.getType(),
+              SequenceOntologyI.SEQUENCE_VARIANT))
        {
          String consequence = (String) sf.getValue(CONSEQUENCE_TYPE);
          if (consequence != null)
@@ -688,7 +745,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
      }
  
      SequenceFeature[] sfs = sourceSequence.getSequenceFeatures();
-    MapList mapping = getGenomicRanges(sourceSequence, accessionId,
+    MapList mapping = getGenomicRangesFromFeatures(sourceSequence, accessionId,
              targetSequence.getStart());
      if (mapping == null)
      {
@@ -797,7 +854,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
      
      SequenceFeature[] sfs = sequence.getSequenceFeatures();
      if (sfs != null) {
-      SequenceOntology so = SequenceOntology.getInstance();
+      SequenceOntologyI so = SequenceOntologyFactory.getInstance();
        for (SequenceFeature sf :sfs) {
          if (so.isA(sf.getType(), type))
          {
@@ -834,7 +891,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
        peptide = peptide.getDatasetSequence();
      }
    
-    mapExonsToProtein(dnaSeq, peptide, dnaToProtein);
+    AlignmentUtils.transferFeatures(dnaSeq, peptide, dnaToProtein,
+            SequenceOntologyI.EXON);
  
      LinkedHashMap<Integer, String[][]> variants = buildDnaVariantsMap(
              dnaSeq, dnaToProtein);
@@ -852,11 +910,10 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
                residue);
        if (!peptideVariants.isEmpty())
        {
-        Collections.sort(peptideVariants);
          String desc = StringUtils.listToDelimitedString(peptideVariants,
                  ", ");
          SequenceFeature sf = new SequenceFeature(
-                SequenceOntology.SEQUENCE_VARIANT, desc, peptidePos,
+                SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos,
                  peptidePos, 0f, null);
          peptide.addSequenceFeature(sf);
          count++;
@@ -866,47 +923,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
    }
  
    /**
-   * Transfers exon features to the corresponding mapped regions of the protein
-   * sequence. This is useful because it allows visualisation of exon boundaries
-   * on the peptide (using 'colour by label' for the exon name). Returns the
-   * number of features written.
-   * 
-   * @param dnaSeq
-   * @param peptide
-   * @param dnaToProtein
-   */
-  static int mapExonsToProtein(SequenceI dnaSeq, SequenceI peptide,
-          MapList dnaToProtein)
-  {
-    SequenceFeature[] sfs = dnaSeq.getSequenceFeatures();
-    if (sfs == null)
-    {
-      return 0;
-    }
-
-    SequenceOntology so = SequenceOntology.getInstance();
-    int count = 0;
-
-    for (SequenceFeature sf : sfs)
-    {
-      if (so.isA(sf.getType(), SequenceOntology.EXON))
-      {
-        int start = sf.getBegin();
-        int end = sf.getEnd();
-        int[] mapsTo = dnaToProtein.locateInTo(start, end);
-        if (mapsTo != null)
-        {
-          SequenceFeature copy = new SequenceFeature(SequenceOntology.EXON,
-                  sf.getDescription(), mapsTo[0], mapsTo[1], 0f, null);
-          peptide.addSequenceFeature(copy);
-          count++;
-        }
-      }
-    }
-    return count;
-  }
-
-  /**
     * Builds a map whose key is position in the protein sequence, and value is an
     * array of all variants for the coding codon positions
     * 
@@ -922,7 +938,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
       * LinkedHashMap ensures we add the peptide features in sequence order
       */
      LinkedHashMap<Integer, String[][]> variants = new LinkedHashMap<Integer, String[][]>();
-    SequenceOntology so = SequenceOntology.getInstance();
+    SequenceOntologyI so = SequenceOntologyFactory.getInstance();
    
      SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures();
      if (dnaFeatures == null)
@@ -930,6 +946,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
        return variants;
      }
    
+    int dnaStart = dnaSeq.getStart();
      int[] lastCodon = null;
      int lastPeptidePostion = 0;
    
@@ -944,7 +961,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
          // not handling multi-locus variant features
          continue;
        }
-      if (so.isSequenceVariant(sf.getType()))
+      if (so.isA(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT))
        {
          int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol);
          if (mapsTo == null)
@@ -985,7 +1002,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
          for (int codonPos = 0; codonPos < 3; codonPos++)
          {
            String nucleotide = String.valueOf(dnaSeq
-                  .getCharAt(codon[codonPos] - 1));
+                  .getCharAt(codon[codonPos] - dnaStart));
            if (codon[codonPos] == dnaCol)
            {
              /*
@@ -1011,11 +1028,11 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
    }
  
    /**
-   * Returns a non-redundant list of all peptide translations generated by the
-   * given dna variants, excluding the current residue value
+   * Returns a sorted, non-redundant list of all peptide translations generated
+   * by the given dna variants, excluding the current residue value
     * 
     * @param codonVariants
-   *          an array of base values for codon positions 1, 2, 3
+   *          an array of base values (acgtACGT) for codon positions 1, 2, 3
     * @param residue
     *          the current residue translation
     * @return
@@ -1036,13 +1053,37 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
            String peptide = codon.contains("-") ? "-" : ResidueProperties
                    .codonTranslate(codon);
            if (peptide != null && !result.contains(peptide)
-                  && !peptide.equals(residue))
+                  && !peptide.equalsIgnoreCase(residue))
            {
              result.add(peptide);
            }
          }
        }
      }
+
+    /*
+     * sort alphabetically with STOP at the end
+     */
+    Collections.sort(result, new Comparator<String>()
+    {
+
+      @Override
+      public int compare(String o1, String o2)
+      {
+        if ("STOP".equals(o1))
+        {
+          return 1;
+        }
+        else if ("STOP".equals(o2))
+        {
+          return -1;
+        }
+        else
+        {
+          return o1.compareTo(o2);
+        }
+      }
+    });
      return result;
    }
  
@@ -1058,7 +1099,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
     */
    public static boolean isTranscript(String featureType)
    {
-    return NMD_VARIANT.equals(featureType)
-            || SequenceOntology.getInstance().isA(featureType, SequenceOntology.TRANSCRIPT);
+    return SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(featureType)
+            || SequenceOntologyFactory.getInstance().isA(featureType,
+                    SequenceOntologyI.TRANSCRIPT);
    }
  }