Merge branch 'develop' into merge/develop_bug/JAL-2154projectMappings
authorJim Procter <jprocter@issues.jalview.org>
Tue, 30 Aug 2016 13:30:34 +0000 (14:30 +0100)
committerJim Procter <jprocter@issues.jalview.org>
Tue, 30 Aug 2016 13:30:34 +0000 (14:30 +0100)
1  2 
src/jalview/analysis/AlignmentUtils.java
src/jalview/ext/ensembl/EnsemblSeqProxy.java
test/jalview/analysis/AlignmentUtilsTests.java

@@@ -22,6 -22,7 +22,6 @@@ package jalview.analysis
  
  import static jalview.io.gff.GffConstants.CLINICAL_SIGNIFICANCE;
  
 -import jalview.api.DBRefEntryI;
  import jalview.datamodel.AlignedCodon;
  import jalview.datamodel.AlignedCodonFrame;
  import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping;
@@@ -72,6 -73,7 +72,7 @@@ import java.util.TreeMap
  public class AlignmentUtils
  {
  
+   private static final int CODON_LENGTH = 3;
    private static final String SEQUENCE_VARIANT = "sequence_variant:";
    private static final String ID = "ID";
  
     * A data model to hold the 'normal' base value at a position, and an optional
     * sequence variant feature
     */
-   static class DnaVariant
+   static final class DnaVariant
    {
-     String base;
+     final String base;
  
      SequenceFeature variant;
  
      DnaVariant(String nuc)
      {
        base = nuc;
+       variant = null;
      }
  
      DnaVariant(String nuc, SequenceFeature var)
        base = nuc;
        variant = var;
      }
+     public String getSource()
+     {
+       return variant == null ? null : variant.getFeatureGroup();
+     }
    }
  
    /**
      /*
       * cdnaStart/End, proteinStartEnd are base 1 (for dataset sequence mapping)
       */
-     final int mappedLength = 3 * aaSeqChars.length;
+     final int mappedLength = CODON_LENGTH * aaSeqChars.length;
      int cdnaLength = cdnaSeqChars.length;
      int cdnaStart = cdnaSeq.getStart();
      int cdnaEnd = cdnaSeq.getEnd();
       */
      if (cdnaLength != mappedLength && cdnaLength > 2)
      {
-       String lastCodon = String.valueOf(cdnaSeqChars, cdnaLength - 3, 3)
+       String lastCodon = String.valueOf(cdnaSeqChars, cdnaLength - CODON_LENGTH, CODON_LENGTH)
                .toUpperCase();
        for (String stop : ResidueProperties.STOP)
        {
          if (lastCodon.equals(stop))
          {
-           cdnaEnd -= 3;
-           cdnaLength -= 3;
+           cdnaEnd -= CODON_LENGTH;
+           cdnaLength -= CODON_LENGTH;
            break;
          }
        }
      int startOffset = 0;
      if (cdnaLength != mappedLength
              && cdnaLength > 2
-             && String.valueOf(cdnaSeqChars, 0, 3).toUpperCase()
+             && String.valueOf(cdnaSeqChars, 0, CODON_LENGTH).toUpperCase()
                      .equals(ResidueProperties.START))
      {
-       startOffset += 3;
-       cdnaStart += 3;
-       cdnaLength -= 3;
+       startOffset += CODON_LENGTH;
+       cdnaStart += CODON_LENGTH;
+       cdnaLength -= CODON_LENGTH;
      }
  
      if (translatesAs(cdnaSeqChars, startOffset, aaSeqChars))
         * protein is translation of dna (+/- start/stop codons)
         */
        MapList map = new MapList(new int[] { cdnaStart, cdnaEnd }, new int[]
-       { proteinStart, proteinEnd }, 3, 1);
+       { proteinStart, proteinEnd }, CODON_LENGTH, 1);
        return map;
      }
  
      int aaPos = 0;
      int dnaPos = cdnaStart;
      for (; dnaPos < cdnaSeqChars.length - 2
-             && aaPos < aaSeqChars.length; dnaPos += 3, aaPos++)
+             && aaPos < aaSeqChars.length; dnaPos += CODON_LENGTH, aaPos++)
      {
-       String codon = String.valueOf(cdnaSeqChars, dnaPos, 3);
+       String codon = String.valueOf(cdnaSeqChars, dnaPos, CODON_LENGTH);
        final String translated = ResidueProperties.codonTranslate(codon);
  
        /*
      {
        return true;
      }
-     if (dnaPos == cdnaSeqChars.length - 3)
+     if (dnaPos == cdnaSeqChars.length - CODON_LENGTH)
      {
-       String codon = String.valueOf(cdnaSeqChars, dnaPos, 3);
+       String codon = String.valueOf(cdnaSeqChars, dnaPos, CODON_LENGTH);
        if ("STOP".equals(ResidueProperties.codonTranslate(codon)))
        {
          return true;
        }
        width = Math.max(dnaSeq.getLength(), width);
      }
-     int oldwidth, diff;
+     int oldwidth;
+     int diff;
      for (SequenceI dnaSeq : dna.getSequences())
      {
        oldwidth = dnaSeq.getLength();
      for (AlignedCodonFrame mapping : dnaMappings)
      {
        SequenceI peptide = mapping.findAlignedSequence(cdsSeq, protein);
-       int peptideLength = peptide.getLength();
        if (peptide != null)
        {
+         int peptideLength = peptide.getLength();
          Mapping map = mapping.getMappingBetween(cdsSeq, peptide);
          if (map != null)
          {
                    .getFromRanges());
            int mappedToLength = MappingUtils
                    .getLength(mapList.getToRanges());
-           boolean addStopCodon = (cdsLength == mappedFromLength * 3 + 3)
+           boolean addStopCodon = (cdsLength == mappedFromLength * CODON_LENGTH + CODON_LENGTH)
                    || (peptide.getDatasetSequence().getLength() == mappedFromLength - 1);
            if (cdsLength != mappedToLength && !addStopCodon)
            {
            /*
             * pre-fill the aligned cds sequence with gaps
             */
-           char[] alignedCds = new char[peptideLength * 3
-                   + (addStopCodon ? 3 : 0)];
+           char[] alignedCds = new char[peptideLength * CODON_LENGTH
+                   + (addStopCodon ? CODON_LENGTH : 0)];
            Arrays.fill(alignedCds, gapChar);
  
            /*
            {
              if (Comparison.isGap(residue))
              {
-               cdsCol += 3;
+               cdsCol += CODON_LENGTH;
              }
              else
              {
                if (codon == null)
                {
                  // e.g. incomplete start codon, X in peptide
-                 cdsCol += 3;
+                 cdsCol += CODON_LENGTH;
                }
                else
                {
             * append stop codon if not mapped from protein,
             * closing it up to the end of the mapped sequence
             */
-           if (copiedBases == nucleotides.length - 3)
+           if (copiedBases == nucleotides.length - CODON_LENGTH)
            {
              for (int i = alignedCds.length - 1; i >= 0; i--)
              {
                  break;
                }
              }
-             for (int i = nucleotides.length - 3; i < nucleotides.length; i++)
+             for (int i = nucleotides.length - CODON_LENGTH; i < nucleotides.length; i++)
              {
                alignedCds[cdsCol++] = nucleotides[i];
              }
             * didn't find mapped CDS sequence - construct it and add
             * its dataset sequence to the dataset
             */
 -          cdsSeq = makeCdsSequence(dnaSeq.getDatasetSequence(), aMapping);
 -          SequenceI cdsSeqDss = cdsSeq.createDatasetSequence();
 +          cdsSeq = makeCdsSequence(dnaSeq.getDatasetSequence(), aMapping,
 +                  dataset).deriveSequence();
 +          // cdsSeq has a name constructed as CDS|<dbref>
 +          // <dbref> will be either the accession for the coding sequence,
 +          // marked in the /via/ dbref to the protein product accession
 +          // or it will be the original nucleotide accession.
 +          SequenceI cdsSeqDss = cdsSeq.getDatasetSequence();
 +
            cdsSeqs.add(cdsSeq);
 +
            if (!dataset.getSequences().contains(cdsSeqDss))
            {
 +            // check if this sequence is a newly created one
 +            // so needs adding to the dataset
              dataset.addSequence(cdsSeqDss);
            }
  
            MapList cdsToProteinMap = new MapList(cdsRange, mapList.getToRanges(),
                    mapList.getFromRatio(), mapList.getToRatio());
            AlignedCodonFrame cdsToProteinMapping = new AlignedCodonFrame();
 -          cdsToProteinMapping.addMap(cdsSeq, proteinProduct, cdsToProteinMap);
 +          cdsToProteinMapping.addMap(cdsSeqDss, proteinProduct,
 +                  cdsToProteinMap);
  
            /*
             * guard against duplicating the mapping if repeating this action
              mappings.add(cdsToProteinMapping);
            }
  
 -          /*
 -           * copy protein's dbrefs to CDS sequence
 -           * this enables Get Cross-References from CDS alignment
 -           */
 -          DBRefEntry[] proteinRefs = DBRefUtils.selectDbRefs(false,
 -                  proteinProduct.getDBRefs());
 -          if (proteinRefs != null)
 -          {
 -            for (DBRefEntry ref : proteinRefs)
 -            {
 -              DBRefEntry cdsToProteinRef = new DBRefEntry(ref);
 -              cdsToProteinRef.setMap(new Mapping(proteinProduct,
 -                      cdsToProteinMap));
 -              cdsSeqDss.addDBRef(cdsToProteinRef);
 -            }
 -          }
 -
 +          propagateDBRefsToCDS(cdsSeqDss, dnaSeq.getDatasetSequence(),
 +                  proteinProduct, aMapping);
            /*
             * add another mapping from original 'from' range to CDS
             */
            MapList dnaToCdsMap = new MapList(mapList.getFromRanges(),
                    cdsRange, 1,
                    1);
 -          dnaToCdsMapping.addMap(dnaSeq.getDatasetSequence(), cdsSeq,
 +          dnaToCdsMapping.addMap(dnaSeq.getDatasetSequence(), cdsSeqDss,
                    dnaToCdsMap);
            if (!mappings.contains(dnaToCdsMapping))
            {
             * same source and accession, so need a different accession for
             * the CDS from the dna sequence
             */
 -          DBRefEntryI dnaRef = dnaDss.getSourceDBRef();
 -          if (dnaRef != null)
 +          
 +          // specific use case:
 +          // Genomic contig ENSCHR:1, contains coding regions for ENSG01,
 +          // ENSG02, ENSG03, with transcripts and products similarly named.
 +          // cannot add distinct dbrefs mapping location on ENSCHR:1 to ENSG01
 +          
 +          // JBPNote: ?? can't actually create an example that demonstrates we
 +          // need to
 +          // synthesize an xref.
 +          
 +          for (DBRefEntry primRef : dnaDss.getPrimaryDBRefs())
            {
 +            // creates a complementary cross-reference to the source sequence's
 +            // primary reference.
 +
 +            DBRefEntry cdsCrossRef = new DBRefEntry(primRef.getSource(),
 +                    primRef.getSource() + ":" + primRef.getVersion(),
 +                    primRef.getAccessionId());
 +            cdsCrossRef
 +                    .setMap(new Mapping(dnaDss, new MapList(dnaToCdsMap)));
 +            cdsSeqDss.addDBRef(cdsCrossRef);
 +
 +            // problem here is that the cross-reference is synthesized -
 +            // cdsSeq.getName() may be like 'CDS|dnaaccession' or
 +            // 'CDS|emblcdsacc'
              // assuming cds version same as dna ?!?
 -            DBRefEntry proteinToCdsRef = new DBRefEntry(dnaRef.getSource(),
 -                    dnaRef.getVersion(), cdsSeq.getName());
 +
 +            DBRefEntry proteinToCdsRef = new DBRefEntry(
 +                    primRef.getSource(), primRef.getVersion(),
 +                    cdsSeq.getName());
 +            //
              proteinToCdsRef.setMap(new Mapping(cdsSeqDss, cdsToProteinMap
                      .getInverse()));
              proteinProduct.addDBRef(proteinToCdsRef);
      int mappedFromLength = MappingUtils.getLength(aMapping.getMap()
              .getFromRanges());
      int dnaLength = seqDss.getLength();
-     if (mappedFromLength == dnaLength || mappedFromLength == dnaLength - 3)
+     if (mappedFromLength == dnaLength || mappedFromLength == dnaLength - CODON_LENGTH)
      {
        return seqDss;
      }
        for (SequenceToSequenceMapping map : acf.getMappings())
        {
          Mapping mapping = map.getMapping();
-         if (mapping != aMapping && mapping.getMap().getFromRatio() == 3
+         if (mapping != aMapping && mapping.getMap().getFromRatio() == CODON_LENGTH
                  && proteinProduct == mapping.getTo()
                  && seqDss != map.getFromSeq())
          {
     * 
     * @param seq
     * @param mapping
 +   * @param dataset
 +   *          - existing dataset. We check for sequences that look like the CDS
 +   *          we are about to construct, if one exists already, then we will
 +   *          just return that one.
     * @return CDS sequence (as a dataset sequence)
     */
 -  static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping)
 +  static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping,
 +          AlignmentI dataset)
    {
      char[] seqChars = seq.getSequence();
      List<int[]> fromRanges = mapping.getMap().getFromRanges();
          }
        }
      }
 -
 +    
      /*
       * assign 'from id' held in the mapping if set (e.g. EMBL protein_id),
       * else generate a sequence name
      String mapFromId = mapping.getMappedFromId();
      String seqId = "CDS|" + (mapFromId != null ? mapFromId : seq.getName());
      SequenceI newSeq = new Sequence(seqId, newSeqChars, 1, newPos);
 +    if (dataset != null)
 +    {
 +      SequenceI[] matches = dataset.findSequenceMatch(newSeq.getName());
 +      if (matches != null)
 +      {
 +        boolean matched = false;
 +        for (SequenceI mtch : matches)
 +        {
 +          if (mtch.getStart() != newSeq.getStart())
 +          {
 +            continue;
 +          }
 +          if (mtch.getEnd() != newSeq.getEnd())
 +          {
 +            continue;
 +          }
 +          if (!Arrays.equals(mtch.getSequence(), newSeq.getSequence()))
 +          {
 +            continue;
 +          }
 +          if (!matched)
 +          {
 +            matched = true;
 +            newSeq = mtch;
 +          }
 +          else
 +          {
 +            System.err
 +                    .println("JAL-2154 regression: warning - found (and ignnored a duplicate CDS sequence):"
 +                            + mtch.toString());
 +          }
 +        }
 +      }
 +    }
      // newSeq.setDescription(mapFromId);
  
      return newSeq;
    }
  
    /**
 +   * add any DBRefEntrys to cdsSeq from contig that have a Mapping congruent to
 +   * the given mapping.
 +   * 
 +   * @param cdsSeq
 +   * @param contig
 +   * @param mapping
 +   * @return list of DBRefEntrys added.
 +   */
 +  public static List<DBRefEntry> propagateDBRefsToCDS(SequenceI cdsSeq,
 +          SequenceI contig, SequenceI proteinProduct, Mapping mapping)
 +  {
 +
 +    // gather direct refs from contig congrent with mapping
 +    List<DBRefEntry> direct = new ArrayList<DBRefEntry>();
 +    HashSet<String> directSources = new HashSet<String>();
 +    if (contig.getDBRefs() != null)
 +    {
 +      for (DBRefEntry dbr : contig.getDBRefs())
 +      {
 +        if (dbr.hasMap() && dbr.getMap().getMap().isTripletMap())
 +        {
 +          MapList map = dbr.getMap().getMap();
 +          // check if map is the CDS mapping
 +          if (mapping.getMap().equals(map))
 +          {
 +            direct.add(dbr);
 +            directSources.add(dbr.getSource());
 +          }
 +        }
 +      }
 +    }
 +    DBRefEntry[] onSource = DBRefUtils.selectRefs(
 +            proteinProduct.getDBRefs(),
 +            directSources.toArray(new String[0]));
 +    List<DBRefEntry> propagated = new ArrayList<DBRefEntry>();
 +
 +    // and generate appropriate mappings
 +    for (DBRefEntry cdsref : direct)
 +    {
 +      // clone maplist and mapping
 +      MapList cdsposmap = new MapList(Arrays.asList(new int[][] { new int[]
 +      { cdsSeq.getStart(), cdsSeq.getEnd() } }), cdsref.getMap().getMap()
 +              .getToRanges(), 3, 1);
 +      Mapping cdsmap = new Mapping(cdsref.getMap().getTo(), cdsref.getMap()
 +              .getMap());
 +
 +      // create dbref
 +      DBRefEntry newref = new DBRefEntry(cdsref.getSource(),
 +              cdsref.getVersion(), cdsref.getAccessionId(), new Mapping(
 +                      cdsmap.getTo(), cdsposmap));
 +
 +      // and see if we can map to the protein product for this mapping.
 +      // onSource is the filtered set of accessions on protein that we are
 +      // tranferring, so we assume accession is the same.
 +      if (cdsmap.getTo() == null && onSource != null)
 +      {
 +        List<DBRefEntry> sourceRefs = DBRefUtils.searchRefs(onSource,
 +                cdsref.getAccessionId());
 +        if (sourceRefs != null)
 +        {
 +          for (DBRefEntry srcref : sourceRefs)
 +          {
 +            if (srcref.getSource().equalsIgnoreCase(cdsref.getSource()))
 +            {
 +              // we have found a complementary dbref on the protein product, so
 +              // update mapping's getTo
 +              newref.getMap().setTo(proteinProduct);
 +            }
 +          }
 +        }
 +      }
 +      cdsSeq.addDBRef(newref);
 +      propagated.add(newref);
 +    }
 +    return propagated;
 +  }
 +
 +  /**
     * Transfers co-located features on 'fromSeq' to 'toSeq', adjusting the
     * feature start/end ranges, optionally omitting specified feature types.
     * Returns the number of features copied.
      /*
       * dna length should map to protein (or protein plus stop codon)
       */
-     int codesForResidues = mappedDnaLength / 3;
+     int codesForResidues = mappedDnaLength / CODON_LENGTH;
      if (codesForResidues == (proteinLength + 1))
      {
        // assuming extra codon is for STOP and not in peptide
      if (codesForResidues == proteinLength)
      {
        proteinRange.add(new int[] { proteinStart, proteinEnd });
-       return new MapList(ranges, proteinRange, 3, 1);
+       return new MapList(ranges, proteinRange, CODON_LENGTH, 1);
      }
      return null;
    }
       * are currently ignored here
       */
      String trans = codon.contains("-") ? "-"
-             : (codon.length() > 3 ? null : ResidueProperties
+             : (codon.length() > CODON_LENGTH ? null : ResidueProperties
                      .codonTranslate(codon));
      if (trans != null && !trans.equals(residue))
      {
        // set score to 0f so 'graduated colour' option is offered! JAL-2060
        SequenceFeature sf = new SequenceFeature(
                SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos,
-               peptidePos, 0f, "Jalview");
+               peptidePos, 0f, var.getSource());
        StringBuilder attributes = new StringBuilder(32);
        String id = (String) var.variant.getValue(ID);
        if (id != null)
          }
          sf.setValue(ID, id);
          attributes.append(ID).append("=").append(id);
-         // TODO handle other species variants
+         // TODO handle other species variants JAL-2064
          StringBuilder link = new StringBuilder(32);
          try
          {
     * @param dnaToProtein
     * @return
     */
+   @SuppressWarnings("unchecked")
    static LinkedHashMap<Integer, List<DnaVariant>[]> buildDnaVariantsMap(
            SequenceI dnaSeq, MapList dnaToProtein)
    {
          List<DnaVariant>[] codonVariants = variants.get(peptidePosition);
          if (codonVariants == null)
          {
-           codonVariants = new ArrayList[3];
+           codonVariants = new ArrayList[CODON_LENGTH];
            codonVariants[0] = new ArrayList<DnaVariant>();
            codonVariants[1] = new ArrayList<DnaVariant>();
            codonVariants[2] = new ArrayList<DnaVariant>();
          /*
           * save nucleotide (and any variant) for each codon position
           */
-         for (int codonPos = 0; codonPos < 3; codonPos++)
+         for (int codonPos = 0; codonPos < CODON_LENGTH; codonPos++)
          {
            String nucleotide = String.valueOf(
                    dnaSeq.getCharAt(codon[codonPos] - dnaStart))
    {
      AlignmentI copy = new Alignment(new Alignment(seqs));
      copy.setDataset(dataset);
 -
 +    boolean isProtein = !copy.isNucleotide();
      SequenceIdMatcher matcher = new SequenceIdMatcher(seqs);
      if (xrefs != null)
      {
          {
            for (DBRefEntry dbref : dbrefs)
            {
 -            if (dbref.getMap() == null || dbref.getMap().getTo() == null)
 +            if (dbref.getMap() == null || dbref.getMap().getTo() == null
 +                    || dbref.getMap().getTo().isProtein() != isProtein)
              {
                continue;
              }
@@@ -276,7 -276,8 +276,7 @@@ public abstract class EnsemblSeqProxy e
        {
          // clunky: ensure Uniprot xref if we have one is on mapped sequence
          SequenceI ds = proteinSeq.getDatasetSequence();
 -        ds.setSourceDBRef(proteinSeq.getSourceDBRef());
 -
 +        // TODO: Verify ensp primary ref is on proteinSeq.getDatasetSequence()
          Mapping map = new Mapping(ds, mapList);
          DBRefEntry dbr = new DBRefEntry(getDbSource(),
                  getEnsemblDataVersion(), proteinSeq.getName(), map);
        seq = seq.getDatasetSequence();
      }
  
 -    EnsemblXref xrefFetcher = new EnsemblXref(getDomain());
 +    EnsemblXref xrefFetcher = new EnsemblXref(getDomain(), getDbSource(),
 +            getEnsemblDataVersion());
      List<DBRefEntry> xrefs = xrefFetcher.getCrossReferences(seq.getName());
      for (DBRefEntry xref : xrefs)
      {
      DBRefEntry self = new DBRefEntry(getDbSource(),
              getEnsemblDataVersion(), seq.getName());
      seq.addDBRef(self);
 -    seq.setSourceDBRef(self);
    }
  
    /**
          {
            DBRefEntry dbref = DBRefUtils.parseToDbRef(sq, getDbSource(),
                    getEnsemblDataVersion(), name);
 -          sq.setSourceDBRef(dbref);
 +          sq.addDBRef(dbref);
          }
        }
        if (alignment == null)
        SequenceFeature copy = new SequenceFeature(sf);
        copy.setBegin(Math.min(mappedRange[0], mappedRange[1]));
        copy.setEnd(Math.max(mappedRange[0], mappedRange[1]));
+       if (".".equals(copy.getFeatureGroup()))
+       {
+         copy.setFeatureGroup(getDbSource());
+       }
        targetSequence.addSequenceFeature(copy);
  
        /*
@@@ -994,44 -994,29 +994,44 @@@ public class AlignmentUtilsTest
  
      /*
       * need a sourceDbRef if we are to construct dbrefs to the CDS
 -     * sequence
 +     * sequence from the dna contig sequences
       */
      DBRefEntry dbref = new DBRefEntry("ENSEMBL", "0", "dna1");
 -    dna1.getDatasetSequence().setSourceDBRef(dbref);
 +    dna1.getDatasetSequence().addDBRef(dbref);
 +    org.testng.Assert.assertEquals(dbref, dna1.getPrimaryDBRefs().get(0));
      dbref = new DBRefEntry("ENSEMBL", "0", "dna2");
 -    dna2.getDatasetSequence().setSourceDBRef(dbref);
 +    dna2.getDatasetSequence().addDBRef(dbref);
 +    org.testng.Assert.assertEquals(dbref, dna2.getPrimaryDBRefs().get(0));
  
      /*
       * CDS sequences are 'discovered' from dna-to-protein mappings on the alignment
       * dataset (e.g. added from dbrefs by CrossRef.findXrefSequences)
       */
 -    MapList map = new MapList(new int[] { 4, 6, 10, 12 },
 +    MapList mapfordna1 = new MapList(new int[] { 4, 6, 10, 12 },
              new int[] { 1, 2 }, 3, 1);
      AlignedCodonFrame acf = new AlignedCodonFrame();
 -    acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
 +    acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(),
 +            mapfordna1);
      dna.addCodonFrame(acf);
 -    map = new MapList(new int[] { 1, 3, 7, 9, 13, 15 }, new int[] { 1, 3 },
 +    MapList mapfordna2 = new MapList(new int[] { 1, 3, 7, 9, 13, 15 },
 +            new int[] { 1, 3 },
              3, 1);
      acf = new AlignedCodonFrame();
 -    acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
 +    acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(),
 +            mapfordna2);
      dna.addCodonFrame(acf);
  
      /*
 +     * In this case, mappings originally came from matching Uniprot accessions - so need an xref on dna involving those regions. These are normally constructed from CDS annotation
 +     */
 +    DBRefEntry dna1xref = new DBRefEntry("UNIPROT", "ENSEMBL", "pep1",
 +            new Mapping(mapfordna1));
 +    dna1.getDatasetSequence().addDBRef(dna1xref);
 +    DBRefEntry dna2xref = new DBRefEntry("UNIPROT", "ENSEMBL", "pep2",
 +            new Mapping(mapfordna2));
 +    dna2.getDatasetSequence().addDBRef(dna2xref);
 +
 +    /*
       * execute method under test:
       */
      AlignmentI cds = AlignmentUtils.makeCdsAlignment(new SequenceI[] {
       * verify CDS has a dbref with mapping to peptide
       */
      assertNotNull(cds1Dss.getDBRefs());
 -    assertEquals(1, cds1Dss.getDBRefs().length);
 +    assertEquals(2, cds1Dss.getDBRefs().length);
      dbref = cds1Dss.getDBRefs()[0];
 -    assertEquals("UNIPROT", dbref.getSource());
 -    assertEquals("0", dbref.getVersion());
 -    assertEquals("pep1", dbref.getAccessionId());
 +    assertEquals(dna1xref.getSource(), dbref.getSource());
 +    // version is via ensembl's primary ref
 +    assertEquals(dna1xref.getVersion(), dbref.getVersion());
 +    assertEquals(dna1xref.getAccessionId(), dbref.getAccessionId());
      assertNotNull(dbref.getMap());
      assertSame(pep1.getDatasetSequence(), dbref.getMap().getTo());
      MapList cdsMapping = new MapList(new int[] { 1, 6 },
       * verify peptide has added a dbref with reverse mapping to CDS
       */
      assertNotNull(pep1.getDBRefs());
 +    // FIXME pep1.getDBRefs() is 1 - is that the correct behaviour ?
      assertEquals(2, pep1.getDBRefs().length);
      dbref = pep1.getDBRefs()[1];
      assertEquals("ENSEMBL", dbref.getSource());
    public void testComputePeptideVariants()
    {
      /*
-      * scenario: AAATTTCCC codes for KFP, with variants
-      *           GAA -> E
-      *           CAA -> Q
-      *           AAG synonymous
-      *           AAT -> N
-      *              TTC synonymous
-      *                 CAC,CGC -> H,R (as one variant)
+      * scenario: AAATTTCCC codes for KFP
+      * variants:
+      *           GAA -> E             source: Ensembl
+      *           CAA -> Q             source: dbSNP
+      *           AAG synonymous       source: COSMIC
+      *           AAT -> N             source: Ensembl
+      *           ...TTC synonymous    source: dbSNP
+      *           ......CAC,CGC -> H,R source: COSMIC
+      *                 (one variant with two alleles)
       */
      SequenceI peptide = new Sequence("pep/10-12", "KFP");
  
       * two distinct variants for codon 1 position 1
       * second one has clinical significance
       */
+     String ensembl = "Ensembl";
+     String dbSnp = "dbSNP";
+     String cosmic = "COSMIC";
      SequenceFeature sf1 = new SequenceFeature("sequence_variant", "", 1, 1,
-             0f, null);
+             0f, ensembl);
      sf1.setValue("alleles", "A,G"); // GAA -> E
      sf1.setValue("ID", "var1.125A>G");
      SequenceFeature sf2 = new SequenceFeature("sequence_variant", "", 1, 1,
-             0f, null);
+             0f, dbSnp);
      sf2.setValue("alleles", "A,C"); // CAA -> Q
      sf2.setValue("ID", "var2");
      sf2.setValue("clinical_significance", "Dodgy");
      SequenceFeature sf3 = new SequenceFeature("sequence_variant", "", 3, 3,
-             0f, null);
+             0f, cosmic);
      sf3.setValue("alleles", "A,G"); // synonymous
      sf3.setValue("ID", "var3");
      sf3.setValue("clinical_significance", "None");
      SequenceFeature sf4 = new SequenceFeature("sequence_variant", "", 3, 3,
-             0f, null);
+             0f, ensembl);
      sf4.setValue("alleles", "A,T"); // AAT -> N
      sf4.setValue("ID", "sequence_variant:var4"); // prefix gets stripped off
      sf4.setValue("clinical_significance", "Benign");
      SequenceFeature sf5 = new SequenceFeature("sequence_variant", "", 6, 6,
-             0f, null);
+             0f, dbSnp);
      sf5.setValue("alleles", "T,C"); // synonymous
      sf5.setValue("ID", "var5");
      sf5.setValue("clinical_significance", "Bad");
      SequenceFeature sf6 = new SequenceFeature("sequence_variant", "", 8, 8,
-             0f, null);
+             0f, cosmic);
      sf6.setValue("alleles", "C,A,G"); // CAC,CGC -> H,R
      sf6.setValue("ID", "var6");
      sf6.setValue("clinical_significance", "Good");
  
      /*
       * verify added sequence features for
-      * var1 K -> E
-      * var2 K -> Q
-      * var4 K -> N
-      * var6 P -> H
-      * var6 P -> R
+      * var1 K -> E Ensembl
+      * var2 K -> Q dbSNP
+      * var4 K -> N Ensembl
+      * var6 P -> H COSMIC
+      * var6 P -> R COSMIC
       */
      SequenceFeature[] sfs = peptide.getSequenceFeatures();
      assertEquals(5, sfs.length);
      SequenceFeature sf = sfs[0];
      assertEquals(1, sf.getBegin());
      assertEquals(1, sf.getEnd());
      assertEquals(
              "p.Lys1Glu var1.125A>G|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var1.125A%3EG",
              sf.links.get(0));
-     assertEquals("Jalview", sf.getFeatureGroup());
+     assertEquals(ensembl, sf.getFeatureGroup());
      sf = sfs[1];
      assertEquals(1, sf.getBegin());
      assertEquals(1, sf.getEnd());
      assertEquals(
              "p.Lys1Gln var2|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var2",
              sf.links.get(0));
-     assertEquals("Jalview", sf.getFeatureGroup());
+     assertEquals(dbSnp, sf.getFeatureGroup());
      sf = sfs[2];
      assertEquals(1, sf.getBegin());
      assertEquals(1, sf.getEnd());
      assertEquals(
              "p.Lys1Asn var4|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var4",
              sf.links.get(0));
-     assertEquals("Jalview", sf.getFeatureGroup());
+     assertEquals(ensembl, sf.getFeatureGroup());
+     // var5 generates two distinct protein variant features
      sf = sfs[3];
      assertEquals(3, sf.getBegin());
      assertEquals(3, sf.getEnd());
      assertEquals(
              "p.Pro3His var6|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var6",
              sf.links.get(0));
-     // var5 generates two distinct protein variant features
-     assertEquals("Jalview", sf.getFeatureGroup());
+     assertEquals(cosmic, sf.getFeatureGroup());
      sf = sfs[4];
      assertEquals(3, sf.getBegin());
      assertEquals(3, sf.getEnd());
      assertEquals(
              "p.Pro3Arg var6|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var6",
              sf.links.get(0));
-     assertEquals("Jalview", sf.getFeatureGroup());
+     assertEquals(cosmic, sf.getFeatureGroup());
    }
  
    /**