import jalview.util.DBRefUtils;
import jalview.util.MapList;
import jalview.util.MappingUtils;
+import jalview.util.StringUtils;
import java.util.ArrayList;
import java.util.Arrays;
}
else
{
- MapList map = mapProteinSequenceToCdna(aaSeq, cdnaSeq);
+ MapList map = mapCdnaToProtein(aaSeq, cdnaSeq);
if (map != null)
{
acf.addMap(cdnaSeq, aaSeq, map);
}
/**
- * Build a mapping (if possible) of a protein to a cDNA sequence. The cDNA
- * must be three times the length of the protein, possibly after ignoring
- * start and/or stop codons, and must translate to the protein. Returns null
- * if no mapping is determined.
+ * Builds a mapping (if possible) of a cDNA to a protein sequence.
+ * <ul>
+ * <li>first checks if the cdna translates exactly to the protein sequence</li>
+ * <li>else checks for translation after removing a STOP codon</li>
+ * <li>else checks for translation after removing a START codon</li>
+ * <li>if that fails, inspect CDS features on the cDNA sequence</li>
+ * </ul>
+ * Returns null if no mapping is determined.
*
- * @param proteinSeqs
+ * @param proteinSeq
+ * the aligned protein sequence
* @param cdnaSeq
+ * the aligned cdna sequence
* @return
*/
- public static MapList mapProteinSequenceToCdna(SequenceI proteinSeq,
+ public static MapList mapCdnaToProtein(SequenceI proteinSeq,
SequenceI cdnaSeq)
{
/*
final int proteinEnd = proteinSeq.getEnd();
/*
- * If lengths don't match, try ignoring stop codon.
+ * If lengths don't match, try ignoring stop codon (if present)
*/
if (cdnaLength != mappedLength && cdnaLength > 2)
{
cdnaLength -= 3;
}
- if (cdnaLength != mappedLength)
- {
- return null;
- }
- if (!translatesAs(cdnaSeqChars, startOffset, aaSeqChars))
+ if (translatesAs(cdnaSeqChars, startOffset, aaSeqChars))
{
- return null;
+ /*
+ * protein is translation of dna (+/- start/stop codons)
+ */
+ MapList map = new MapList(new int[] { cdnaStart, cdnaEnd }, new int[]
+ { proteinStart, proteinEnd }, 3, 1);
+ return map;
}
- MapList map = new MapList(new int[] { cdnaStart, cdnaEnd }, new int[] {
- proteinStart, proteinEnd }, 3, 1);
- return map;
+
+ /*
+ * translation failed - try mapping CDS annotated regions of dna
+ */
+ return mapCdsToProtein(cdnaSeq, proteinSeq);
}
/**
return false;
}
- int aaResidue = 0;
- for (int i = cdnaStart; i < cdnaSeqChars.length - 2
- && aaResidue < aaSeqChars.length; i += 3, aaResidue++)
+ int aaPos = 0;
+ int dnaPos = cdnaStart;
+ for (; dnaPos < cdnaSeqChars.length - 2
+ && aaPos < aaSeqChars.length; dnaPos += 3, aaPos++)
{
- String codon = String.valueOf(cdnaSeqChars, i, 3);
+ String codon = String.valueOf(cdnaSeqChars, dnaPos, 3);
final String translated = ResidueProperties.codonTranslate(codon);
+
/*
* allow * in protein to match untranslatable in dna
*/
- final char aaRes = aaSeqChars[aaResidue];
+ final char aaRes = aaSeqChars[aaPos];
if ((translated == null || "STOP".equals(translated)) && aaRes == '*')
{
continue;
return false;
}
}
- // fail if we didn't match all of the aa sequence
- return (aaResidue == aaSeqChars.length);
+
+ /*
+ * check we matched all of the protein sequence
+ */
+ if (aaPos != aaSeqChars.length)
+ {
+ return false;
+ }
+
+ /*
+ * check we matched all of the dna except
+ * for optional trailing STOP codon
+ */
+ if (dnaPos == cdnaSeqChars.length)
+ {
+ return true;
+ }
+ if (dnaPos == cdnaSeqChars.length - 3)
+ {
+ String codon = String.valueOf(cdnaSeqChars, dnaPos, 3);
+ if ("STOP".equals(ResidueProperties.codonTranslate(codon)))
+ {
+ return true;
+ }
+ }
+ return false;
}
/**
Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
int mappedSequenceCount)
{
- // TODO there must be an easier way! root problem is that our mapping data
- // model does not include phase so can't map part of a codon to a peptide
+ // TODO delete this ugly hack once JAL-2022 is resolved
+ // i.e. we can model startPhase > 0 (incomplete start codon)
+
List<SequenceI> sequencesChecked = new ArrayList<SequenceI>();
AlignedCodon lastCodon = null;
Map<SequenceI, AlignedCodon> toAdd = new HashMap<SequenceI, AlignedCodon>();
* Just try to make a mapping (it is not yet stored), test whether
* successful.
*/
- return mapProteinSequenceToCdna(proteinDs, dnaDs) != null;
+ return mapCdnaToProtein(proteinDs, dnaDs) != null;
}
/**
/**
* Constructs an alignment consisting of the mapped (CDS) regions in the given
* nucleotide sequences, and updates mappings to match. The new sequences are
- * aligned as per the original sequences (with gapped columns omitted).
+ * aligned as per the original sequence, with entirely gapped columns (codon
+ * interrupted by intron) omitted.
*
* @param dna
* aligned dna sequences
to.setName(to.getName() + "|" + cdsAccId);
}
}
+
+  /**
+   * Derives a dna-to-protein mapping from the "CDS" sequence features held on
+   * the dna sequence. The combined length of the CDS ranges, in whole codons,
+   * must equal the protein length, or the protein length plus one (the extra
+   * codon is then assumed to be a STOP codon that is not part of the peptide).
+   * A leading 'X' on the protein is excluded from the mapped protein range.
+   *
+   * @param dnaSeq
+   *          the nucleotide sequence whose CDS features give the coding ranges
+   * @param proteinSeq
+   *          the candidate peptide product
+   * @return a 3:1 mapping of the CDS ranges to the protein range, or null if
+   *         the lengths are not compatible
+   */
+  public static MapList mapCdsToProtein(SequenceI dnaSeq,
+          SequenceI proteinSeq)
+  {
+    final List<int[]> cdsRanges = findCdsPositions(dnaSeq);
+    final int codonCount = MappingUtils.getLength(cdsRanges) / 3;
+
+    int residueCount = proteinSeq.getLength();
+    int firstResidue = proteinSeq.getStart();
+    final int lastResidue = proteinSeq.getEnd();
+
+    /*
+     * an incomplete start codon may show up as X at the start of the
+     * peptide; leave that residue out of the mapping
+     */
+    if (proteinSeq.getCharAt(0) == 'X')
+    {
+      // todo JAL-2022 support startPhase > 0
+      firstResidue += 1;
+      residueCount -= 1;
+    }
+
+    /*
+     * accept an exact fit, or a fit with one surplus codon (taken to be
+     * a trailing STOP that is not present in the peptide)
+     */
+    final boolean exactFit = codonCount == residueCount;
+    final boolean fitsWithStop = codonCount == residueCount + 1;
+    if (!exactFit && !fitsWithStop)
+    {
+      return null;
+    }
+
+    List<int[]> proteinRanges = new ArrayList<int[]>();
+    proteinRanges.add(new int[] { firstResidue, lastResidue });
+    return new MapList(cdsRanges, proteinRanges, 3, 1);
+  }
+
+  /**
+   * Collects the [begin, end] positions of every sequence feature on the dna
+   * whose type is "CDS" (or a sub-type of CDS in the Sequence Ontology), in
+   * the order the features are held on the sequence. For the first range
+   * added, the feature's phase (if it parses as an integer) is added to the
+   * begin position, so an incomplete leading codon is skipped.
+   *
+   * @param dnaSeq
+   *          the nucleotide sequence to inspect
+   * @return the CDS ranges found; empty (never null) if the sequence has no
+   *         features or no CDS features
+   */
+  public static List<int[]> findCdsPositions(SequenceI dnaSeq)
+  {
+    List<int[]> cdsRanges = new ArrayList<int[]>();
+    SequenceFeature[] features = dnaSeq.getSequenceFeatures();
+    if (features == null)
+    {
+      return cdsRanges;
+    }
+
+    SequenceOntologyI ontology = SequenceOntologyFactory.getInstance();
+    for (SequenceFeature feature : features)
+    {
+      if (!ontology.isA(feature.getType(), SequenceOntologyI.CDS))
+      {
+        // not a CDS feature (or sub-type of CDS)
+        continue;
+      }
+
+      int phase;
+      try
+      {
+        phase = Integer.parseInt(feature.getPhase());
+      } catch (NumberFormatException e)
+      {
+        // no usable phase recorded - treat as zero
+        phase = 0;
+      }
+
+      /*
+       * phase > 0 on first codon means 5' incomplete - skip to the start
+       * of the next codon; example ENST00000496384
+       */
+      final boolean firstRange = cdsRanges.isEmpty();
+      int begin = feature.getBegin();
+      final int end = feature.getEnd();
+      if (firstRange)
+      {
+        // TODO JAL-2022 support start phase > 0
+        begin += phase;
+      }
+      if (firstRange && begin > end)
+      {
+        continue; // shouldn't happen?
+      }
+      cdsRanges.add(new int[] { begin, end });
+    }
+    return cdsRanges;
+  }
+
+  /**
+   * Maps exon features from dna to protein, computes the variants in the
+   * peptide product generated by variants in the dna, and adds them as
+   * sequence_variant features on the protein sequence. Both sequences are
+   * first resolved to their root dataset sequence, so features are read from
+   * and added to the dataset sequences.
+   *
+   * @param dnaSeq
+   *          the nucleotide sequence holding exon and variant features
+   * @param peptide
+   *          the protein product that receives the new features
+   * @param dnaToProtein
+   *          the mapping from dna positions to protein positions
+   * @return the number of sequence_variant features added to the peptide
+   */
+  public static int computeProteinFeatures(SequenceI dnaSeq,
+          SequenceI peptide, MapList dnaToProtein)
+  {
+    while (dnaSeq.getDatasetSequence() != null)
+    {
+      dnaSeq = dnaSeq.getDatasetSequence();
+    }
+    while (peptide.getDatasetSequence() != null)
+    {
+      peptide = peptide.getDatasetSequence();
+    }
+
+    transferFeatures(dnaSeq, peptide, dnaToProtein,
+            SequenceOntologyI.EXON);
+
+    LinkedHashMap<Integer, String[][]> variants = buildDnaVariantsMap(
+            dnaSeq, dnaToProtein);
+
+    /*
+     * scan codon variations, compute peptide variants and add to peptide sequence
+     */
+    int count = 0;
+    for (Entry<Integer, String[][]> variant : variants.entrySet())
+    {
+      int peptidePos = variant.getKey();
+      String[][] codonVariants = variant.getValue();
+      String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); // 0-based
+      List<String> peptideVariants = computePeptideVariants(codonVariants,
+              residue);
+      if (!peptideVariants.isEmpty())
+      {
+        String desc = StringUtils.listToDelimitedString(peptideVariants,
+                ", ");
+        SequenceFeature sf = new SequenceFeature(
+                SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos,
+                peptidePos, 0f, null);
+        peptide.addSequenceFeature(sf);
+        count++;
+      }
+    }
+
+    /*
+     * ugly sort to get sequence features in start position order
+     * - would be better to store in Sequence as a TreeSet instead?
+     * The feature array may be null if nothing was transferred or added
+     * (other code in this class checks for that), and Arrays.sort would
+     * throw NullPointerException, so only sort when there is an array.
+     */
+    SequenceFeature[] peptideFeatures = peptide.getSequenceFeatures();
+    if (peptideFeatures != null)
+    {
+      Arrays.sort(peptideFeatures, new Comparator<SequenceFeature>()
+      {
+        @Override
+        public int compare(SequenceFeature o1, SequenceFeature o2)
+        {
+          int c = Integer.compare(o1.getBegin(), o2.getBegin());
+          return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd()) : c;
+        }
+      });
+    }
+    return count;
+  }
+
+  /**
+   * Builds a map whose key is a position in the protein sequence, and whose
+   * value is an array (one element per codon position) of the current dna base
+   * followed by all variant alleles found at that codon position. Only
+   * single-position features whose type is sequence_variant (or a sub-type),
+   * which map to the protein and carry an "alleles" value, contribute.
+   *
+   * @param dnaSeq
+   *          the nucleotide sequence holding variant features
+   * @param dnaToProtein
+   *          the mapping from dna positions to protein positions
+   * @return the map, in the order that peptide positions were first
+   *         encountered; empty if the dna has no features
+   */
+  static LinkedHashMap<Integer, String[][]> buildDnaVariantsMap(
+          SequenceI dnaSeq, MapList dnaToProtein)
+  {
+    /*
+     * map from peptide position to all variant features of the codon for it
+     * LinkedHashMap ensures we add the peptide features in sequence order
+     */
+    LinkedHashMap<Integer, String[][]> variants = new LinkedHashMap<Integer, String[][]>();
+    SequenceOntologyI so = SequenceOntologyFactory.getInstance();
+
+    SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures();
+    if (dnaFeatures == null)
+    {
+      return variants;
+    }
+
+    int dnaStart = dnaSeq.getStart();
+    int[] lastCodon = null;
+    int lastPeptidePosition = 0;
+
+    /*
+     * build a map of codon variations for peptides
+     */
+    for (SequenceFeature sf : dnaFeatures)
+    {
+      int dnaCol = sf.getBegin();
+      if (dnaCol != sf.getEnd())
+      {
+        // not handling multi-locus variant features
+        continue;
+      }
+      if (so.isA(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT))
+      {
+        int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol);
+        if (mapsTo == null)
+        {
+          // feature doesn't lie within coding region
+          continue;
+        }
+
+        /*
+         * extract dna variants to a string array; check this BEFORE
+         * registering the peptide position, otherwise a variant without
+         * alleles would leave an entry whose codon arrays are all null
+         */
+        String alls = (String) sf.getValue("alleles");
+        if (alls == null)
+        {
+          continue;
+        }
+        String[] alleles = alls.toUpperCase().split(",");
+        for (int i = 0; i < alleles.length; i++)
+        {
+          alleles[i] = alleles[i].trim(); // lose any space characters "A, G"
+        }
+
+        int peptidePosition = mapsTo[0];
+        String[][] codonVariants = variants.get(peptidePosition);
+        if (codonVariants == null)
+        {
+          codonVariants = new String[3][];
+          variants.put(peptidePosition, codonVariants);
+        }
+
+        /*
+         * get this peptide's codon positions e.g. [3, 4, 5] or [4, 7, 10]
+         * (reusing the previous lookup if it was for the same peptide)
+         */
+        int[] codon = peptidePosition == lastPeptidePosition ? lastCodon
+                : MappingUtils.flattenRanges(dnaToProtein.locateInFrom(
+                        peptidePosition, peptidePosition));
+        lastPeptidePosition = peptidePosition;
+        lastCodon = codon;
+
+        /*
+         * save nucleotide (and this variant) for each codon position
+         */
+        for (int codonPos = 0; codonPos < 3; codonPos++)
+        {
+          String nucleotide = String.valueOf(
+                  dnaSeq.getCharAt(codon[codonPos] - dnaStart))
+                  .toUpperCase();
+          if (codonVariants[codonPos] == null)
+          {
+            /*
+             * record current dna base
+             */
+            codonVariants[codonPos] = new String[] { nucleotide };
+          }
+          if (codon[codonPos] == dnaCol)
+          {
+            /*
+             * add alleles to dna base (and any previously found alleles)
+             */
+            String[] known = codonVariants[codonPos];
+            String[] dnaVariants = new String[alleles.length + known.length];
+            System.arraycopy(known, 0, dnaVariants, 0, known.length);
+            System.arraycopy(alleles, 0, dnaVariants, known.length,
+                    alleles.length);
+            codonVariants[codonPos] = dnaVariants;
+          }
+        }
+      }
+    }
+    return variants;
+  }
+
+  /**
+   * Translates every combination of the supplied bases for codon positions 1,
+   * 2 and 3, and returns the distinct translations that differ (ignoring case)
+   * from the current residue. The result is sorted alphabetically, except that
+   * "STOP" is placed last.
+   *
+   * @param codonVariants
+   *          an array of base values (acgtACGT) for codon positions 1, 2, 3
+   * @param residue
+   *          the current residue translation
+   * @return the sorted, non-redundant list of alternative translations
+   */
+  static List<String> computePeptideVariants(
+          String[][] codonVariants, String residue)
+  {
+    List<String> translations = new ArrayList<String>();
+    for (String first : codonVariants[0])
+    {
+      for (String second : codonVariants[1])
+      {
+        for (String third : codonVariants[2])
+        {
+          String triplet = first + second + third;
+
+          /*
+           * get peptide translation of codon e.g. GAT -> D
+           * note that variants which are not single alleles,
+           * e.g. multibase variants or HGMD_MUTATION etc
+           * are ignored here
+           */
+          String translated;
+          if (triplet.contains("-"))
+          {
+            translated = "-";
+          }
+          else if (triplet.length() > 3)
+          {
+            translated = null;
+          }
+          else
+          {
+            translated = ResidueProperties.codonTranslate(triplet);
+          }
+
+          if (translated == null || translations.contains(translated)
+                  || translated.equalsIgnoreCase(residue))
+          {
+            // untranslatable, already recorded, or no change of residue
+            continue;
+          }
+          translations.add(translated);
+        }
+      }
+    }
+
+    /*
+     * sort alphabetically with STOP at the end
+     */
+    Collections.sort(translations, new Comparator<String>()
+    {
+      @Override
+      public int compare(String o1, String o2)
+      {
+        if ("STOP".equals(o1))
+        {
+          return 1;
+        }
+        if ("STOP".equals(o2))
+        {
+          return -1;
+        }
+        return o1.compareTo(o2);
+      }
+    });
+    return translations;
+  }
}
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.util.DBRefUtils;
+import jalview.util.MapList;
import jalview.ws.SequenceFetcher;
import jalview.ws.seqfetcher.ASequenceFetcher;
* are found e.g. alternative protein products for a protein's gene
* @return products (as dataset sequences)
*/
- public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
- String source, AlignmentI al, List<SequenceI> addedPeers)
+ public static Alignment findXrefSequences(SequenceI[] seqs,
+ final boolean dna, final String source, AlignmentI al,
+ List<SequenceI> addedPeers)
{
AlignmentI dataset = al.getDataset() == null ? al : al.getDataset();
List<SequenceI> rseqs = new ArrayList<SequenceI>();
// xrefs on this sequence.
if (dataset != null)
{
- found |= searchDataset(dss, xref, dataset, rseqs, cf); // ,false,!dna);
+ found |= searchDataset(dss, xref, dataset, rseqs, cf, false,
+ !dna);
+ // ,false,!dna);
if (found)
{
xrfs[r] = null; // we've recovered seqs for this one.
if (retrieved != null)
{
+ updateDbrefMappings(dna, seq, xrfs, retrieved, cf);
+
List<SequenceFeature> copiedFeatures = new ArrayList<SequenceFeature>();
CrossRef me = new CrossRef();
for (int rs = 0; rs < retrieved.length; rs++)
}
/**
+ * Updates any empty mappings in the cross-references with one to a compatible
+ * retrieved sequence if found, and adds any new mappings to the
+ * AlignedCodonFrame
+ *
+ * @param dna
+ * @param mapFrom
+ * @param xrefs
+ * @param retrieved
+ * @param acf
+ */
+  static void updateDbrefMappings(boolean dna, SequenceI mapFrom,
+          DBRefEntry[] xrefs, SequenceI[] retrieved, AlignedCodonFrame acf)
+  {
+    /*
+     * dna is true when mapFrom is nucleotide and the retrieved sequences
+     * are its protein products, false for the reverse direction
+     */
+    SequenceIdMatcher matcher = new SequenceIdMatcher(retrieved);
+    for (DBRefEntry xref : xrefs)
+    {
+      /*
+       * only cross-references lacking a mapping need attention
+       * NOTE(review): the null check is defensive - findXrefSequences
+       * nulls entries of its xref array once recovered; confirm whether
+       * such an array can reach this method
+       */
+      if (xref == null || xref.hasMap())
+      {
+        continue;
+      }
+
+      String targetSeqName = xref.getSource() + "|"
+              + xref.getAccessionId();
+      SequenceI[] matches = matcher.findAllIdMatches(targetSeqName);
+      if (matches == null)
+      {
+        // no retrieved sequence for this xref - still process the others
+        continue;
+      }
+
+      for (SequenceI seq : matches)
+      {
+        MapList mapping = null;
+        if (dna)
+        {
+          mapping = AlignmentUtils.mapCdnaToProtein(seq, mapFrom);
+        }
+        else
+        {
+          mapping = AlignmentUtils.mapCdnaToProtein(mapFrom, seq);
+          if (mapping != null)
+          {
+            // the xref on a protein holds the protein-to-dna direction
+            mapping = mapping.getInverse();
+          }
+        }
+        if (mapping == null)
+        {
+          // this match is not a compatible translation; try the next one
+          continue;
+        }
+
+        xref.setMap(new Mapping(seq, mapping));
+        if (dna)
+        {
+          AlignmentUtils.computeProteinFeatures(mapFrom, seq, mapping);
+          acf.addMap(mapFrom, seq, mapping);
+        }
+        else
+        {
+          // the codon frame is given the dna sequence first, with a
+          // dna-to-protein mapping
+          acf.addMap(seq, mapFrom, mapping.getInverse());
+        }
+      }
+    }
+  }
+
+ /**
* find references to lrfs in the cross-reference set of each sequence in
* dataset (that is not equal to sequenceI) Identifies matching DBRefEntry
* based on source and accession string only - Map and Version are nulled.
import jalview.io.gff.SequenceOntologyFactory;
import jalview.io.gff.SequenceOntologyI;
+import java.util.Arrays;
import java.util.List;
import com.stevesoft.pat.Regex;
*/
public class EnsemblCdna extends EnsemblSeqProxy
{
+ private static final List<String> CROSS_REFERENCES = Arrays
+ .asList(new String[] { "Uniprot/SWISSPROT", "Uniprot/SPTREMBL" });
+
/*
* accepts ENST or ENSTG with 11 digits
* or ENSMUST or similar for other species
@Override
protected List<String> getCrossReferenceDatabases()
{
- return super.getCrossReferenceDatabases();
+ return CROSS_REFERENCES;
// 30/01/16 also found Vega_transcript, OTTT, ENS_LRG_transcript, UCSC,
// HGNC_trans_name, RefSeq_mRNA, RefSeq_mRNA_predicted
}
* and also means we don't need to keep CDS features on CDS sequence (where
* they are redundant information).
*/
- @Override
protected List<int[]> getCdsRanges(SequenceI dnaSeq)
{
int len = dnaSeq.getLength();
*/
public class EnsemblGene extends EnsemblSeqProxy
{
+ private static final List<String> CROSS_REFERENCES = Arrays
+ .asList(new String[] { "CCDS" });
+
private static final String GENE_PREFIX = "gene:";
/*
/*
* fetch and save cross-references
*/
- super.getCrossReferences(transcript);
+ new EnsemblCdna(getDomain()).getCrossReferences(transcript);
/*
* and finally fetch the protein product and save as a cross-reference
// found these for ENSG00000157764 on 30/01/2016:
// return new String[] {"Vega_gene", "OTTG", "ENS_LRG_gene", "ArrayExpress",
// "EntrezGene", "HGNC", "MIM_GENE", "MIM_MORBID", "WikiGene"};
- return super.getCrossReferenceDatabases();
+ return CROSS_REFERENCES;
}
/**
import jalview.io.FileParse;
import jalview.io.gff.SequenceOntologyFactory;
import jalview.io.gff.SequenceOntologyI;
-import jalview.schemes.ResidueProperties;
import jalview.util.DBRefUtils;
import jalview.util.MapList;
-import jalview.util.MappingUtils;
-import jalview.util.StringUtils;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
-import java.util.LinkedHashMap;
import java.util.List;
-import java.util.Map.Entry;
/**
* Base class for Ensembl sequence fetchers
public abstract class EnsemblSeqProxy extends EnsemblRestClient
{
private static final List<String> CROSS_REFERENCES = Arrays
- .asList(new String[] { "CCDS", "Uniprot/SWISSPROT" });
+ .asList(new String[] { "CCDS", "Uniprot/SWISSPROT",
+ "Uniprot/SPTREMBL" });
protected static final String CONSEQUENCE_TYPE = "consequence_type";
.getSequenceRecords(accId);
if (protein == null || protein.getHeight() == 0)
{
- System.out.println("Failed to retrieve protein for " + accId);
+ System.out.println("No protein product found for " + accId);
return;
}
SequenceI proteinSeq = protein.getSequenceAt(0);
proteinSeq.createDatasetSequence();
querySeq.createDatasetSequence();
- MapList mapList = mapCdsToProtein(querySeq, proteinSeq);
+ MapList mapList = AlignmentUtils.mapCdsToProtein(querySeq, proteinSeq);
if (mapList != null)
{
// clunky: ensure Uniprot xref if we have one is on mapped sequence
querySeq.getDatasetSequence().addDBRef(dbr);
/*
- * compute peptide variants from dna variants and add as
- * sequence features on the protein sequence ta-da
+ * copy exon features to protein, compute peptide variants from dna
+ * variants and add as features on the protein sequence ta-da
*/
- computeProteinFeatures(querySeq, proteinSeq, mapList);
+ AlignmentUtils.computeProteinFeatures(querySeq, proteinSeq, mapList);
}
} catch (Exception e)
{
/**
* Returns a list of database names to be used when fetching cross-references.
+ * Specifically, the names are used to filter data returned by the Ensembl
+ * xrefs REST service on the value in field 'dbname'.
*
* @return
*/
}
/**
- * Returns a mapping from dna to protein by inspecting sequence features of
- * type "CDS" on the dna.
- *
- * @param dnaSeq
- * @param proteinSeq
- * @return
- */
- protected MapList mapCdsToProtein(SequenceI dnaSeq, SequenceI proteinSeq)
- {
- List<int[]> ranges = getCdsRanges(dnaSeq);
- int mappedDnaLength = MappingUtils.getLength(ranges);
-
- int proteinLength = proteinSeq.getLength();
- int proteinEnd = proteinLength;
- int proteinStart = 1;
-
- /*
- * incomplete start codon may mean X at start of peptide
- * we ignore both for mapping purposes
- */
- if (proteinSeq.getCharAt(0) == 'X')
- {
- proteinStart = 2;
- proteinLength--;
- }
- List<int[]> proteinRange = new ArrayList<int[]>();
-
- /*
- * dna length should map to protein (or protein plus stop codon)
- */
- int codesForResidues = mappedDnaLength / 3;
- if (codesForResidues == (proteinLength + 1))
- {
- // assuming extra codon is for STOP and not in peptide
- codesForResidues--;
- }
- if (codesForResidues == proteinLength)
- {
- proteinRange.add(new int[] { proteinStart, proteinEnd });
- return new MapList(ranges, proteinRange, 3, 1);
- }
- return null;
- }
-
- /**
- * Returns a list of CDS ranges found.
- *
- * No need to worry about reverse strand dna, here since the retrieved
- * sequence is as transcribed (reverse complement for reverse strand), i.e in
- * the same sense as the peptide.
- *
- * @param dnaSeq
- * @return
- */
- protected List<int[]> getCdsRanges(SequenceI dnaSeq)
- {
- List<int[]> result = new ArrayList<int[]>();
- SequenceFeature[] sfs = dnaSeq.getSequenceFeatures();
- if (sfs == null)
- {
- return result;
- }
- SequenceOntologyI so = SequenceOntologyFactory.getInstance();
- for (SequenceFeature sf : sfs)
- {
- /*
- * process a CDS feature (or a sub-type of CDS)
- */
- if (so.isA(sf.getType(), SequenceOntologyI.CDS))
- {
- int phase = 0;
- try {
- phase = Integer.parseInt(sf.getPhase());
- } catch (NumberFormatException e)
- {
- // ignore
- }
- /*
- * phase > 0 on first codon means 5' incomplete - skip to the start
- * of the next codon; example ENST00000496384
- */
- int begin = sf.getBegin();
- int end = sf.getEnd();
- if (result.isEmpty())
- {
- begin += phase;
- if (begin > end)
- {
- continue; // shouldn't happen?
- }
- }
- result.add(new int[] { begin, end });
- }
- }
- return result;
- }
-
- /**
* Fetches sequences for the list of accession ids and adds them to the
* alignment. Returns the extended (or created) alignment.
*
}
/**
- * Maps exon features from dna to protein, and computes variants in peptide
- * product generated by variants in dna, and adds them as sequence_variant
- * features on the protein sequence. Returns the number of variant features
- * added.
- *
- * @param dnaSeq
- * @param peptide
- * @param dnaToProtein
- */
- static int computeProteinFeatures(SequenceI dnaSeq,
- SequenceI peptide, MapList dnaToProtein)
- {
- while (dnaSeq.getDatasetSequence() != null)
- {
- dnaSeq = dnaSeq.getDatasetSequence();
- }
- while (peptide.getDatasetSequence() != null)
- {
- peptide = peptide.getDatasetSequence();
- }
-
- AlignmentUtils.transferFeatures(dnaSeq, peptide, dnaToProtein,
- SequenceOntologyI.EXON);
-
- LinkedHashMap<Integer, String[][]> variants = buildDnaVariantsMap(
- dnaSeq, dnaToProtein);
-
- /*
- * scan codon variations, compute peptide variants and add to peptide sequence
- */
- int count = 0;
- for (Entry<Integer, String[][]> variant : variants.entrySet())
- {
- int peptidePos = variant.getKey();
- String[][] codonVariants = variant.getValue();
- String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); // 0-based
- List<String> peptideVariants = computePeptideVariants(codonVariants,
- residue);
- if (!peptideVariants.isEmpty())
- {
- String desc = StringUtils.listToDelimitedString(peptideVariants,
- ", ");
- SequenceFeature sf = new SequenceFeature(
- SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos,
- peptidePos, 0f, null);
- peptide.addSequenceFeature(sf);
- count++;
- }
- }
-
- /*
- * ugly sort to get sequence features in start position order
- * - would be better to store in Sequence as a TreeSet instead?
- */
- Arrays.sort(peptide.getSequenceFeatures(),
- new Comparator<SequenceFeature>()
- {
- @Override
- public int compare(SequenceFeature o1, SequenceFeature o2)
- {
- int c = Integer.compare(o1.getBegin(), o2.getBegin());
- return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd())
- : c;
- }
- });
- return count;
- }
-
- /**
- * Builds a map whose key is position in the protein sequence, and value is an
- * array of all variants for the coding codon positions
- *
- * @param dnaSeq
- * @param dnaToProtein
- * @return
- */
- static LinkedHashMap<Integer, String[][]> buildDnaVariantsMap(
- SequenceI dnaSeq, MapList dnaToProtein)
- {
- /*
- * map from peptide position to all variant features of the codon for it
- * LinkedHashMap ensures we add the peptide features in sequence order
- */
- LinkedHashMap<Integer, String[][]> variants = new LinkedHashMap<Integer, String[][]>();
- SequenceOntologyI so = SequenceOntologyFactory.getInstance();
-
- SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures();
- if (dnaFeatures == null)
- {
- return variants;
- }
-
- int dnaStart = dnaSeq.getStart();
- int[] lastCodon = null;
- int lastPeptidePostion = 0;
-
- /*
- * build a map of codon variations for peptides
- */
- for (SequenceFeature sf : dnaFeatures)
- {
- int dnaCol = sf.getBegin();
- if (dnaCol != sf.getEnd())
- {
- // not handling multi-locus variant features
- continue;
- }
- if (so.isA(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT))
- {
- int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol);
- if (mapsTo == null)
- {
- // feature doesn't lie within coding region
- continue;
- }
- int peptidePosition = mapsTo[0];
- String[][] codonVariants = variants.get(peptidePosition);
- if (codonVariants == null)
- {
- codonVariants = new String[3][];
- variants.put(peptidePosition, codonVariants);
- }
-
- /*
- * extract dna variants to a string array
- */
- String alls = (String) sf.getValue("alleles");
- if (alls == null)
- {
- continue;
- }
- String[] alleles = alls.split(",");
-
- /*
- * get this peptides codon positions e.g. [3, 4, 5] or [4, 7, 10]
- */
- int[] codon = peptidePosition == lastPeptidePostion ? lastCodon
- : MappingUtils.flattenRanges(dnaToProtein.locateInFrom(
- peptidePosition, peptidePosition));
- lastPeptidePostion = peptidePosition;
- lastCodon = codon;
-
- /*
- * save nucleotide (and this variant) for each codon position
- */
- for (int codonPos = 0; codonPos < 3; codonPos++)
- {
- String nucleotide = String.valueOf(dnaSeq
- .getCharAt(codon[codonPos] - dnaStart));
- if (codon[codonPos] == dnaCol)
- {
- /*
- * record current dna base and its alleles
- */
- String[] dnaVariants = new String[alleles.length + 1];
- dnaVariants[0] = nucleotide;
- System.arraycopy(alleles, 0, dnaVariants, 1, alleles.length);
- codonVariants[codonPos] = dnaVariants;
- }
- else if (codonVariants[codonPos] == null)
- {
- /*
- * record current dna base only
- * (at least until we find any variation and overwrite it)
- */
- codonVariants[codonPos] = new String[] { nucleotide };
- }
- }
- }
- }
- return variants;
- }
-
- /**
- * Returns a sorted, non-redundant list of all peptide translations generated
- * by the given dna variants, excluding the current residue value
- *
- * @param codonVariants
- * an array of base values (acgtACGT) for codon positions 1, 2, 3
- * @param residue
- * the current residue translation
- * @return
- */
- static List<String> computePeptideVariants(
- String[][] codonVariants, String residue)
- {
- List<String> result = new ArrayList<String>();
- for (String base1 : codonVariants[0])
- {
- for (String base2 : codonVariants[1])
- {
- for (String base3 : codonVariants[2])
- {
- String codon = base1 + base2 + base3;
- // TODO: report frameshift/insertion/deletion
- // and multiple-base variants?!
- String peptide = codon.contains("-") ? "-" : ResidueProperties
- .codonTranslate(codon);
- if (peptide != null && !result.contains(peptide)
- && !peptide.equalsIgnoreCase(residue))
- {
- result.add(peptide);
- }
- }
- }
- }
-
- /*
- * sort alphabetically with STOP at the end
- */
- Collections.sort(result, new Comparator<String>()
- {
-
- @Override
- public int compare(String o1, String o2)
- {
- if ("STOP".equals(o1))
- {
- return 1;
- }
- else if ("STOP".equals(o2))
- {
- return -1;
- }
- else
- {
- return o1.compareTo(o2);
- }
- }
- });
- return result;
- }
-
- /**
* Answers true if the feature type is either 'NMD_transcript_variant' or
* 'transcript' or one of its sub-types in the Sequence Ontology. This is
* needed because NMD_transcript_variant behaves like 'transcript' in Ensembl
import jalview.datamodel.AlignmentOrder;
import jalview.datamodel.AlignmentView;
import jalview.datamodel.ColumnSelection;
+import jalview.datamodel.DBRefSource;
import jalview.datamodel.HiddenSequences;
import jalview.datamodel.PDBEntry;
import jalview.datamodel.SeqCigar;
Alignment al = makeCrossReferencesAlignment(
alignment.getDataset(), xrefs);
- /*
- * Copy dna-to-protein mappings to new alignment
- */
- // TODO 1: no mappings are set up for EMBL product
- // TODO 2: if they were, should add them to protein alignment, not
- // dna
- // List<AlignedCodonFrame> cf = xrefs.getCodonFrames();
- // for (AlignedCodonFrame acf : cf)
- // {
- // al.addCodonFrame(acf);
- // }
AlignFrame newFrame = new AlignFrame(al, DEFAULT_WIDTH,
DEFAULT_HEIGHT);
String newtitle = String.format("%s %s %s",
{
copyAlignment = AlignmentUtils.makeCdsAlignment(
sequenceSelection, cf, alignment);
+ if (copyAlignment.getHeight() == 0)
+ {
+ System.err.println("Failed to make CDS alignment");
+ }
al.getCodonFrames().clear();
al.getCodonFrames().addAll(cf);
}
copyAlignment.addSequence(peer);
}
- /*
- * align protein to dna
- */
- if (dna)
+ if (copyAlignment.getHeight() > 0)
{
- al.alignAs(copyAlignment);
- }
- else
- {
- copyAlignment.alignAs(al);
- }
-
- AlignFrame copyThis = new AlignFrame(copyAlignment,
- AlignFrame.DEFAULT_WIDTH, AlignFrame.DEFAULT_HEIGHT);
- copyThis.setTitle(AlignFrame.this.getTitle());
-
- boolean showSequenceFeatures = viewport
- .isShowSequenceFeatures();
- newFrame.setShowSeqFeatures(showSequenceFeatures);
- copyThis.setShowSeqFeatures(showSequenceFeatures);
- FeatureRenderer myFeatureStyling = alignPanel.getSeqPanel().seqCanvas
- .getFeatureRenderer();
-
- /*
- * copy feature rendering settings to split frame
- */
- newFrame.alignPanel.getSeqPanel().seqCanvas
- .getFeatureRenderer().transferSettings(
- myFeatureStyling);
- copyThis.alignPanel.getSeqPanel().seqCanvas
- .getFeatureRenderer().transferSettings(
- myFeatureStyling);
+ /*
+ * align protein to dna
+ */
+ // FIXME what if the dna is not aligned :-O
+ if (dna)
+ {
+ al.alignAs(copyAlignment);
+ }
+ else
+ {
+ /*
+ * align cdna to protein - currently only if
+ * fetching and aligning Ensembl transcripts!
+ */
+ if (DBRefSource.ENSEMBL.equalsIgnoreCase(source))
+ {
+ copyAlignment.alignAs(al);
+ }
+ }
- /*
- * apply 'database source' feature configuration
- * if any was found
- */
- newFrame.getViewport()
- .applyFeaturesStyle(featureColourScheme);
- copyThis.getViewport()
- .applyFeaturesStyle(featureColourScheme);
-
- SplitFrame sf = new SplitFrame(dna ? copyThis : newFrame,
- dna ? newFrame : copyThis);
- newFrame.setVisible(true);
- copyThis.setVisible(true);
- String linkedTitle = MessageManager
- .getString("label.linked_view_title");
- Desktop.addInternalFrame(sf, linkedTitle, -1, -1);
- sf.adjustDivider();
+ AlignFrame copyThis = new AlignFrame(copyAlignment,
+ AlignFrame.DEFAULT_WIDTH, AlignFrame.DEFAULT_HEIGHT);
+ copyThis.setTitle(AlignFrame.this.getTitle());
+
+ boolean showSequenceFeatures = viewport
+ .isShowSequenceFeatures();
+ newFrame.setShowSeqFeatures(showSequenceFeatures);
+ copyThis.setShowSeqFeatures(showSequenceFeatures);
+ FeatureRenderer myFeatureStyling = alignPanel.getSeqPanel().seqCanvas
+ .getFeatureRenderer();
+
+ /*
+ * copy feature rendering settings to split frame
+ */
+ newFrame.alignPanel.getSeqPanel().seqCanvas
+ .getFeatureRenderer().transferSettings(
+ myFeatureStyling);
+ copyThis.alignPanel.getSeqPanel().seqCanvas
+ .getFeatureRenderer().transferSettings(
+ myFeatureStyling);
+
+ /*
+ * apply 'database source' feature configuration
+ * if any was found
+ */
+ // TODO is this the feature colouring for the original
+ // alignment or the fetched xrefs? either could be Ensembl
+ newFrame.getViewport().applyFeaturesStyle(
+ featureColourScheme);
+ copyThis.getViewport().applyFeaturesStyle(
+ featureColourScheme);
+
+ SplitFrame sf = new SplitFrame(dna ? copyThis : newFrame,
+ dna ? newFrame : copyThis);
+ newFrame.setVisible(true);
+ copyThis.setVisible(true);
+ String linkedTitle = MessageManager
+ .getString("label.linked_view_title");
+ Desktop.addInternalFrame(sf, linkedTitle, -1, -1);
+ sf.adjustDivider();
+ }
}
else
{
public static String getUniprotEntryId(UniprotEntry entry)
{
StringBuilder name = new StringBuilder(32);
- name.append("UniProt/Swiss-Prot");
+ // name.append("UniProt/Swiss-Prot");
+ // use 'canonicalised' name for optimal id matching
+ name.append(DBRefSource.UNIPROT);
for (String accessionId : entry.getAccession())
{
name.append(BAR_DELIMITER);
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
+import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
   @Test(groups = { "Functional" })
   public void testTranslatesAs()
   {
+    // null arguments check - any null argument should give false
+    assertFalse(AlignmentUtils.translatesAs(null, 0, null));
+    assertFalse(AlignmentUtils.translatesAs(new char[] { 't' }, 0, null));
+    assertFalse(AlignmentUtils.translatesAs(null, 0, new char[] { 'a' }));
+
+    // straight translation
     assertTrue(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 0,
             "FPKG".toCharArray()));
-    // with start codon (not in protein)
+    // with extra start codon (not in protein)
     assertTrue(AlignmentUtils.translatesAs("atgtttcccaaaggg".toCharArray(),
             3, "FPKG".toCharArray()));
     // with stop codon1 (not in protein)
     assertTrue(AlignmentUtils.translatesAs(
             "atgtttcccaaagggtga".toCharArray(), 3, "FPKG".toCharArray()));
-    // with embedded stop codon
+    // with embedded stop codons
     assertTrue(AlignmentUtils.translatesAs(
             "atgtttTAGcccaaaTAAgggtga".toCharArray(), 3,
             "F*PK*G".toCharArray()));
     // wrong protein
     assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
             0, "FPMG".toCharArray()));
+
+    // truncated dna (final codon has only two bases)
+    assertFalse(AlignmentUtils.translatesAs("tttcccaaagg".toCharArray(), 0,
+            "FPKG".toCharArray()));
+
+    // truncated protein
+    assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
+            0, "FPK".toCharArray()));
+
+    // overlong dna (doesn't end in stop codon)
+    assertFalse(AlignmentUtils.translatesAs(
+            "tttcccaaagggttt".toCharArray(), 0, "FPKG".toCharArray()));
+
+    // dna + extra codon + more; NOTE(review): in-frame 'tta' is not a stop codon - presumably 'tag' was intended, confirm
+    assertFalse(AlignmentUtils.translatesAs(
+            "tttcccaaagggttaga".toCharArray(), 0, "FPKG".toCharArray()));
+
+    // overlong protein
+    assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(),
+            0, "FPKGQ".toCharArray()));
   }
/**
* @throws IOException
*/
@Test(groups = { "Functional" })
- public void testMapProteinSequenceToCdna_forSubsequence()
+ public void testMapCdnaToProtein_forSubsequence()
throws IOException
{
SequenceI prot = new Sequence("UNIPROT|V12345", "E-I--Q", 10, 12);
SequenceI dna = new Sequence("EMBL|A33333", "GAA--AT-C-CAG", 40, 48);
dna.createDatasetSequence();
- MapList map = AlignmentUtils.mapProteinSequenceToCdna(prot, dna);
+ MapList map = AlignmentUtils.mapCdnaToProtein(prot, dna);
assertEquals(10, map.getToLowest());
assertEquals(12, map.getToHighest());
assertEquals(40, map.getFromLowest());
assertEquals("--N-G", prot2.getSequenceAsString());
assertEquals("---XG", prot3.getSequenceAsString());
}
+
+  /**
+   * Tests {@code AlignmentUtils.findCdsPositions}, the method that finds the
+   * ranges of a dna sequence that have a CDS (or subtype) feature - case where
+   * the start codon is incomplete. Here the first CDS feature is given phase
+   * 2, and the returned ranges are expected to begin at the first complete
+   * codon (position 7) rather than at the feature start (position 5).
+   */
+  @Test(groups = "Functional")
+  public void testGetCdsRanges_fivePrimeIncomplete()
+  {
+    SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
+    dnaSeq.createDatasetSequence();
+    SequenceI ds = dnaSeq.getDatasetSequence();
+
+    // CDS for dna 5-6 (incomplete codon), 7-9
+    SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
+    sf.setPhase("2"); // skip 2 bases to start of next codon
+    ds.addSequenceFeature(sf);
+    // CDS for dna 13-15 (a CDS subtype, expected to be included)
+    sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null);
+    ds.addSequenceFeature(sf);
+
+    List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
+
+    /*
+     * check the mapping starts with the first complete codon
+     */
+    assertEquals(6, MappingUtils.getLength(ranges));
+    assertEquals(2, ranges.size());
+    assertEquals(7, ranges.get(0)[0]);
+    assertEquals(9, ranges.get(0)[1]);
+    assertEquals(13, ranges.get(1)[0]);
+    assertEquals(15, ranges.get(1)[1]);
+  }
+
+  /**
+   * Tests {@code AlignmentUtils.findCdsPositions}, the method that finds the
+   * ranges of a dna sequence that have a CDS (or subtype) feature. Features
+   * are added to the dataset sequence; only the CDS and CDS_predicted features
+   * are expected to contribute ranges, the exon feature is not.
+   */
+  @Test(groups = "Functional")
+  public void testGetCdsRanges()
+  {
+    SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
+    dnaSeq.createDatasetSequence();
+    SequenceI ds = dnaSeq.getDatasetSequence();
+
+    // CDS for dna 4-6
+    SequenceFeature sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
+    ds.addSequenceFeature(sf);
+    // exon feature should be ignored here
+    sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
+    ds.addSequenceFeature(sf);
+    // CDS for dna 10-12
+    sf = new SequenceFeature("CDS_predicted", "", 10, 12, 0f, null);
+    ds.addSequenceFeature(sf);
+
+    List<int[]> ranges = AlignmentUtils.findCdsPositions(dnaSeq);
+    assertEquals(6, MappingUtils.getLength(ranges));
+    assertEquals(2, ranges.size());
+    assertEquals(4, ranges.get(0)[0]);
+    assertEquals(6, ranges.get(0)[1]);
+    assertEquals(10, ranges.get(1)[0]);
+    assertEquals(12, ranges.get(1)[1]);
+  }
+
+  /**
+   * Test the method that computes a map of codon variants for each protein
+   * position from "sequence_variant" features on dna
+   * ({@code AlignmentUtils.buildDnaVariantsMap}). As the assertions below show,
+   * each map value is expected to hold three arrays, one per codon base; each
+   * array starts with the sequence's own base (in upper case) followed by any
+   * "alleles" values recorded at that position (also in upper case).
+   */
+  @Test(groups = "Functional")
+  public void testBuildDnaVariantsMap()
+  {
+    SequenceI dna = new Sequence("dna", "atgAAATTTGGGCCCtag");
+    MapList map = new MapList(new int[] { 1, 18 }, new int[] { 1, 5 }, 3, 1);
+
+    /*
+     * first with no variants on dna
+     */
+    LinkedHashMap<Integer, String[][]> variantsMap = AlignmentUtils
+            .buildDnaVariantsMap(dna, map);
+    assertTrue(variantsMap.isEmpty());
+
+    // single allele codon 1, on base 1
+    SequenceFeature sf = new SequenceFeature("sequence_variant", "", 1, 1,
+            0f, null);
+    sf.setValue("alleles", "T");
+    dna.addSequenceFeature(sf);
+
+    // two alleles codon 2, on bases 2 and 3
+    sf = new SequenceFeature("sequence_variant", "", 5, 5, 0f, null);
+    sf.setValue("alleles", "T");
+    dna.addSequenceFeature(sf);
+    sf = new SequenceFeature("sequence_variant", "", 6, 6, 0f, null);
+    sf.setValue("alleles", "G");
+    dna.addSequenceFeature(sf);
+
+    // two alleles codon 3, both on base 2
+    sf = new SequenceFeature("sequence_variant", "", 8, 8, 0f, null);
+    sf.setValue("alleles", "C, G");
+    dna.addSequenceFeature(sf);
+
+    // no alleles on codon 4
+    // alleles on codon 5 on all 3 bases
+    sf = new SequenceFeature("sequence_variant", "", 13, 13, 0f, null);
+    sf.setValue("alleles", "C, G"); // (C duplicates given base value)
+    dna.addSequenceFeature(sf);
+    sf = new SequenceFeature("sequence_variant", "", 14, 14, 0f, null);
+    sf.setValue("alleles", "g, a"); // should force to upper-case
+    dna.addSequenceFeature(sf);
+    sf = new SequenceFeature("sequence_variant", "", 15, 15, 0f, null);
+    sf.setValue("alleles", "A, T");
+    dna.addSequenceFeature(sf);
+
+    variantsMap = AlignmentUtils.buildDnaVariantsMap(dna, map);
+    assertEquals(4, variantsMap.size()); // entries for codons 1, 2, 3 and 5 only
+    assertTrue(Arrays.deepEquals(new String[][] { { "A", "T" }, { "T" },
+            { "G" } }, variantsMap.get(1)));
+    assertTrue(Arrays.deepEquals(new String[][] { { "A" }, { "A", "T" },
+            { "A", "G" } }, variantsMap.get(2)));
+    assertTrue(Arrays.deepEquals(new String[][] { { "T" },
+            { "T", "C", "G" }, { "T" } }, variantsMap.get(3)));
+    // duplicated bases are not removed here, handled in computePeptideVariants
+    assertTrue(Arrays.deepEquals(new String[][] { { "C", "C", "G" },
+            { "C", "G", "A" }, { "C", "A", "T" } }, variantsMap.get(5)));
+  }
+
+  /**
+   * Tests for the method that computes all peptide variants given codon
+   * variants ({@code AlignmentUtils.computePeptideVariants}). The input holds
+   * the candidate bases for each of the three positions of a single codon,
+   * plus the current residue. As asserted below, the result is expected to be
+   * a sorted, non-redundant list of the residues coded for by the base
+   * combinations, excluding the current residue (matched regardless of case),
+   * with STOP placed last.
+   */
+  @Test(groups = "Functional")
+  public void testComputePeptideVariants()
+  {
+    String[][] codonVariants = new String[][] { { "A" }, { "G" }, { "T" } };
+
+    /*
+     * AGT codes for S - this is not included in the variants returned
+     */
+    List<String> variants = AlignmentUtils.computePeptideVariants(codonVariants, "S");
+    assertEquals("[]", variants.toString());
+
+    // S is reported if it differs from the current value (A):
+    variants = AlignmentUtils.computePeptideVariants(codonVariants, "A");
+    assertEquals("[S]", variants.toString());
+
+    /*
+     * synonymous variant is not reported
+     * (current residue given in lower case here)
+     */
+    codonVariants = new String[][] { { "A" }, { "G" }, { "C", "T" } };
+    // AGC and AGT both code for S
+    variants = AlignmentUtils.computePeptideVariants(codonVariants, "s");
+    assertEquals("[]", variants.toString());
+
+    /*
+     * equivalent variants are only reported once
+     */
+    codonVariants = new String[][] { { "C" }, { "T" },
+        { "A", "C", "G", "T" } };
+    // CTA CTC CTG CTT all code for L
+    variants = AlignmentUtils.computePeptideVariants(codonVariants, "S");
+    assertEquals("[L]", variants.toString());
+
+    /*
+     * vary codon bases 1 and 2; variant products are sorted and non-redundant
+     */
+    codonVariants = new String[][] { { "a", "C" }, { "g", "T" }, { "A" } };
+    // aga ata cga cta code for R, I, R, L
+    variants = AlignmentUtils.computePeptideVariants(codonVariants, "S");
+    assertEquals("[I, L, R]", variants.toString());
+
+    /*
+     * vary codon bases 2 and 3
+     */
+    codonVariants = new String[][] { { "a" }, { "g", "T" }, { "A", "c" } };
+    // aga agc ata atc code for R, S, I, I
+    variants = AlignmentUtils.computePeptideVariants(codonVariants, "S");
+    assertEquals("[I, R]", variants.toString());
+
+    /*
+     * vary codon bases 1 and 3
+     */
+    codonVariants = new String[][] { { "a", "t" }, { "a" }, { "t", "g" } };
+    // aat aag tat tag code for N, K, Y, STOP - STOP sorted to end
+    variants = AlignmentUtils.computePeptideVariants(codonVariants, "S");
+    assertEquals("[K, N, Y, STOP]", variants.toString());
+
+    /*
+     * vary codon bases 1, 2 and 3
+     */
+    codonVariants = new String[][] { { "a", "t" }, { "G", "C" },
+        { "t", "g" } };
+    // agt agg act acg tgt tgg tct tcg code for S, R, T, T, C, W, S, S
+    variants = AlignmentUtils.computePeptideVariants(codonVariants, "S");
+    assertEquals("[C, R, T, W]", variants.toString());
+  }
}
import static org.testng.AssertJUnit.assertFalse;
import static org.testng.AssertJUnit.assertTrue;
+import jalview.analysis.AlignmentUtils;
import jalview.datamodel.Alignment;
import jalview.datamodel.AlignmentI;
-import jalview.datamodel.Sequence;
-import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.io.AppletFormatAdapter;
import jalview.io.FastaFile;
import jalview.io.FileParse;
import jalview.io.gff.SequenceOntologyFactory;
import jalview.io.gff.SequenceOntologyLite;
-import jalview.util.MappingUtils;
import java.lang.reflect.Method;
import java.net.MalformedURLException;
: "DOWN or unreachable ******************* BAD!"));
}
- /**
- * Tests for the method that computes all peptide variants given codon
- * variants
- */
- @Test(groups = "Functional")
- public void testComputePeptideVariants()
- {
- String[][] codonVariants = new String[][] { { "A" }, { "G" }, { "T" } };
-
- /*
- * AGT codes for S - this is not included in the variants returned
- */
- List<String> variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
- assertEquals("[]", variants.toString());
-
- // S is reported if it differs from the current value (A):
- variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "A");
- assertEquals("[S]", variants.toString());
-
- /*
- * synonymous variant is not reported
- */
- codonVariants = new String[][] { { "A" }, { "G" }, { "C", "T" } };
- // AGC and AGT both code for S
- variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "s");
- assertEquals("[]", variants.toString());
-
- /*
- * equivalent variants are only reported once
- */
- codonVariants = new String[][] { { "C" }, { "T" },
- { "A", "C", "G", "T" } };
- // CTA CTC CTG CTT all code for L
- variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
- assertEquals("[L]", variants.toString());
-
- /*
- * vary codons 1 and 2; variant products are sorted and non-redundant
- */
- codonVariants = new String[][] { { "a", "C" }, { "g", "T" }, { "A" } };
- // aga ata cga cta code for R, I, R, L
- variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
- assertEquals("[I, L, R]", variants.toString());
-
- /*
- * vary codons 2 and 3
- */
- codonVariants = new String[][] { { "a" }, { "g", "T" }, { "A", "c" } };
- // aga agc ata atc code for R, S, I, I
- variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
- assertEquals("[I, R]", variants.toString());
-
- /*
- * vary codons 1 and 3
- */
- codonVariants = new String[][] { { "a", "t" }, { "a" }, { "t", "g" } };
- // aat aag tat tag code for N, K, Y, STOP - STOP sorted to end
- variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
- assertEquals("[K, N, Y, STOP]", variants.toString());
-
- /*
- * vary codons 1, 2 and 3
- */
- codonVariants = new String[][] { { "a", "t" }, { "G", "C" },
- { "t", "g" } };
- // agt agg act acg tgt tgg tct tcg code for S, R, T, T, C, W, S, S
- variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
- assertEquals("[C, R, T, W]", variants.toString());
- }
-
- /**
- * Tests for the method that maps the subset of a dna sequence that has CDS
- * (or subtype) feature.
- */
- @Test(groups = "Functional")
- public void testGetCdsRanges()
- {
- EnsemblSeqProxy testee = new EnsemblSeqProxyAdapter();
-
- SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
- dnaSeq.createDatasetSequence();
- SequenceI ds = dnaSeq.getDatasetSequence();
-
- // CDS for dna 3-6
- SequenceFeature sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
- ds.addSequenceFeature(sf);
- // exon feature should be ignored here
- sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
- ds.addSequenceFeature(sf);
- // CDS for dna 10-12
- sf = new SequenceFeature("CDS_predicted", "", 10, 12, 0f, null);
- ds.addSequenceFeature(sf);
-
- List<int[]> ranges = testee.getCdsRanges(dnaSeq);
- assertEquals(6, MappingUtils.getLength(ranges));
- assertEquals(2, ranges.size());
- assertEquals(4, ranges.get(0)[0]);
- assertEquals(6, ranges.get(0)[1]);
- assertEquals(10, ranges.get(1)[0]);
- assertEquals(12, ranges.get(1)[1]);
-
- }
-
@Test(groups = "Functional")
public void getGenomicRangesFromFeatures()
{
}
- /**
- * Tests for the method that maps the subset of a dna sequence that has CDS
- * (or subtype) feature - case where the start codon is incomplete.
- */
- @Test(groups = "Functional")
- public void testGetCdsRanges_fivePrimeIncomplete()
- {
- EnsemblSeqProxy testee = new EnsemblSeqProxyAdapter();
-
- SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
- dnaSeq.createDatasetSequence();
- SequenceI ds = dnaSeq.getDatasetSequence();
-
- // CDS for dna 5-6 (incomplete codon), 7-9
- SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
- sf.setPhase("2"); // skip 2 bases to start of next codon
- ds.addSequenceFeature(sf);
- // CDS for dna 13-15
- sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null);
- ds.addSequenceFeature(sf);
-
- List<int[]> ranges = testee.getCdsRanges(dnaSeq);
-
- /*
- * check the mapping starts with the first complete codon
- */
- assertEquals(6, MappingUtils.getLength(ranges));
- assertEquals(2, ranges.size());
- assertEquals(7, ranges.get(0)[0]);
- assertEquals(9, ranges.get(0)[1]);
- assertEquals(13, ranges.get(1)[0]);
- assertEquals(15, ranges.get(1)[1]);
- }
-
@Test(groups = "Functional")
public void testIsTranscriptIdentifier()
{