From 6c52cc0b81ae3abdc3c5f6f88a23364a0246351a Mon Sep 17 00:00:00 2001 From: gmungoc Date: Fri, 11 Mar 2016 12:06:31 +0000 Subject: [PATCH] JAL-1705 refactoring etc for fetching Ensembl --> Uniprot cross-references --- src/jalview/analysis/AlignmentUtils.java | 439 +++++++++++++++++++-- src/jalview/analysis/CrossRef.java | 75 +++- src/jalview/ext/ensembl/EnsemblCdna.java | 6 +- src/jalview/ext/ensembl/EnsemblCds.java | 1 - src/jalview/ext/ensembl/EnsemblGene.java | 7 +- src/jalview/ext/ensembl/EnsemblSeqProxy.java | 352 +---------------- src/jalview/gui/AlignFrame.java | 125 +++--- src/jalview/ws/dbsources/Uniprot.java | 4 +- test/jalview/analysis/AlignmentUtilsTests.java | 228 ++++++++++- test/jalview/ext/ensembl/EnsemblSeqProxyTest.java | 141 +------ 10 files changed, 796 insertions(+), 582 deletions(-) diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index 450ae27..db69823 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -42,6 +42,7 @@ import jalview.util.Comparison; import jalview.util.DBRefUtils; import jalview.util.MapList; import jalview.util.MappingUtils; +import jalview.util.StringUtils; import java.util.ArrayList; import java.util.Arrays; @@ -322,7 +323,7 @@ public class AlignmentUtils } else { - MapList map = mapProteinSequenceToCdna(aaSeq, cdnaSeq); + MapList map = mapCdnaToProtein(aaSeq, cdnaSeq); if (map != null) { acf.addMap(cdnaSeq, aaSeq, map); @@ -362,16 +363,22 @@ public class AlignmentUtils } /** - * Build a mapping (if possible) of a protein to a cDNA sequence. The cDNA - * must be three times the length of the protein, possibly after ignoring - * start and/or stop codons, and must translate to the protein. Returns null - * if no mapping is determined. + * Builds a mapping (if possible) of a cDNA to a protein sequence. + * + * Returns null if no mapping is determined. * - * @param proteinSeqs + * @param proteinSeq + * the aligned protein sequence * @param cdnaSeq + * the aligned cdna sequence * @return */ - public static MapList mapProteinSequenceToCdna(SequenceI proteinSeq, + public static MapList mapCdnaToProtein(SequenceI proteinSeq, SequenceI cdnaSeq) { /* @@ -401,7 +408,7 @@ public class AlignmentUtils final int proteinEnd = proteinSeq.getEnd(); /* - * If lengths don't match, try ignoring stop codon. + * If lengths don't match, try ignoring stop codon (if present) */ if (cdnaLength != mappedLength && cdnaLength > 2) { @@ -432,17 +439,20 @@ public class AlignmentUtils cdnaLength -= 3; } - if (cdnaLength != mappedLength) - { - return null; - } - if (!translatesAs(cdnaSeqChars, startOffset, aaSeqChars)) + if (translatesAs(cdnaSeqChars, startOffset, aaSeqChars)) { - return null; + /* + * protein is translation of dna (+/- start/stop codons) + */ + MapList map = new MapList(new int[] { cdnaStart, cdnaEnd }, new int[] + { proteinStart, proteinEnd }, 3, 1); + return map; } - MapList map = new MapList(new int[] { cdnaStart, cdnaEnd }, new int[] { - proteinStart, proteinEnd }, 3, 1); - return map; + + /* + * translation failed - try mapping CDS annotated regions of dna + */ + return mapCdsToProtein(cdnaSeq, proteinSeq); } /** @@ -463,16 +473,18 @@ public class AlignmentUtils return false; } - int aaResidue = 0; - for (int i = cdnaStart; i < cdnaSeqChars.length - 2 - && aaResidue < aaSeqChars.length; i += 3, aaResidue++) + int aaPos = 0; + int dnaPos = cdnaStart; + for (; dnaPos < cdnaSeqChars.length - 2 + && aaPos < aaSeqChars.length; dnaPos += 3, aaPos++) { - String codon = String.valueOf(cdnaSeqChars, i, 3); + String codon = String.valueOf(cdnaSeqChars, dnaPos, 3); final String translated = ResidueProperties.codonTranslate(codon); + /* * allow * in protein to match untranslatable in dna */ - final char aaRes = aaSeqChars[aaResidue]; + final char aaRes = aaSeqChars[aaPos]; if ((translated == null || "STOP".equals(translated)) && aaRes == '*') { continue; @@ -485,8 +497,32 @@ public class AlignmentUtils return false; } } - // fail if we didn't match all of the aa sequence - return (aaResidue == aaSeqChars.length); + + /* + * check we matched all of the protein sequence + */ + if (aaPos != aaSeqChars.length) + { + return false; + } + + /* + * check we matched all of the dna except + * for optional trailing STOP codon + */ + if (dnaPos == cdnaSeqChars.length) + { + return true; + } + if (dnaPos == cdnaSeqChars.length - 3) + { + String codon = String.valueOf(cdnaSeqChars, dnaPos, 3); + if ("STOP".equals(ResidueProperties.codonTranslate(codon))) + { + return true; + } + } + return false; } /** @@ -1011,8 +1047,9 @@ public class AlignmentUtils Map> alignedCodons, int mappedSequenceCount) { - // TODO there must be an easier way! root problem is that our mapping data - // model does not include phase so can't map part of a codon to a peptide + // TODO delete this ugly hack once JAL-2022 is resolved + // i.e. we can model startPhase > 0 (incomplete start codon) + List sequencesChecked = new ArrayList(); AlignedCodon lastCodon = null; Map toAdd = new HashMap(); @@ -1270,7 +1307,7 @@ public class AlignmentUtils * Just try to make a mapping (it is not yet stored), test whether * successful. */ - return mapProteinSequenceToCdna(proteinDs, dnaDs) != null; + return mapCdnaToProtein(proteinDs, dnaDs) != null; } /** @@ -1474,7 +1511,8 @@ public class AlignmentUtils /** * Constructs an alignment consisting of the mapped (CDS) regions in the given * nucleotide sequences, and updates mappings to match. The new sequences are - * aligned as per the original sequences (with gapped columns omitted). + * aligned as per the original sequence, with entirely gapped columns (codon + * interrupted by intron) omitted. * * @param dna * aligned dna sequences @@ -1928,4 +1966,349 @@ public class AlignmentUtils to.setName(to.getName() + "|" + cdsAccId); } } + + /** + * Returns a mapping from dna to protein by inspecting sequence features of + * type "CDS" on the dna. + * + * @param dnaSeq + * @param proteinSeq + * @return + */ + public static MapList mapCdsToProtein(SequenceI dnaSeq, + SequenceI proteinSeq) + { + List ranges = findCdsPositions(dnaSeq); + int mappedDnaLength = MappingUtils.getLength(ranges); + + int proteinLength = proteinSeq.getLength(); + int proteinStart = proteinSeq.getStart(); + int proteinEnd = proteinSeq.getEnd(); + + /* + * incomplete start codon may mean X at start of peptide + * we ignore both for mapping purposes + */ + if (proteinSeq.getCharAt(0) == 'X') + { + // todo JAL-2022 support startPhase > 0 + proteinStart++; + proteinLength--; + } + List proteinRange = new ArrayList(); + + /* + * dna length should map to protein (or protein plus stop codon) + */ + int codesForResidues = mappedDnaLength / 3; + if (codesForResidues == (proteinLength + 1)) + { + // assuming extra codon is for STOP and not in peptide + codesForResidues--; + } + if (codesForResidues == proteinLength) + { + proteinRange.add(new int[] { proteinStart, proteinEnd }); + return new MapList(ranges, proteinRange, 3, 1); + } + return null; + } + + /** + * Returns a list of CDS ranges found (as sequence positions base 1), i.e. of + * start/end positions of sequence features of type "CDS" (or a sub-type of + * CDS in the Sequence Ontology) + * + * @param dnaSeq + * @return + */ + public static List findCdsPositions(SequenceI dnaSeq) + { + List result = new ArrayList(); + SequenceFeature[] sfs = dnaSeq.getSequenceFeatures(); + if (sfs == null) + { + return result; + } + SequenceOntologyI so = SequenceOntologyFactory.getInstance(); + for (SequenceFeature sf : sfs) + { + /* + * process a CDS feature (or a sub-type of CDS) + */ + if (so.isA(sf.getType(), SequenceOntologyI.CDS)) + { + int phase = 0; + try { + phase = Integer.parseInt(sf.getPhase()); + } catch (NumberFormatException e) + { + // ignore + } + /* + * phase > 0 on first codon means 5' incomplete - skip to the start + * of the next codon; example ENST00000496384 + */ + int begin = sf.getBegin(); + int end = sf.getEnd(); + if (result.isEmpty()) + { + // TODO JAL-2022 support start phase > 0 + begin += phase; + if (begin > end) + { + continue; // shouldn't happen? + } + } + result.add(new int[] { begin, end }); + } + } + return result; + } + + /** + * Maps exon features from dna to protein, and computes variants in peptide + * product generated by variants in dna, and adds them as sequence_variant + * features on the protein sequence. Returns the number of variant features + * added. + * + * @param dnaSeq + * @param peptide + * @param dnaToProtein + */ + public static int computeProteinFeatures(SequenceI dnaSeq, + SequenceI peptide, MapList dnaToProtein) + { + while (dnaSeq.getDatasetSequence() != null) + { + dnaSeq = dnaSeq.getDatasetSequence(); + } + while (peptide.getDatasetSequence() != null) + { + peptide = peptide.getDatasetSequence(); + } + + transferFeatures(dnaSeq, peptide, dnaToProtein, + SequenceOntologyI.EXON); + + LinkedHashMap variants = buildDnaVariantsMap( + dnaSeq, dnaToProtein); + + /* + * scan codon variations, compute peptide variants and add to peptide sequence + */ + int count = 0; + for (Entry variant : variants.entrySet()) + { + int peptidePos = variant.getKey(); + String[][] codonVariants = variant.getValue(); + String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); // 0-based + List peptideVariants = computePeptideVariants(codonVariants, + residue); + if (!peptideVariants.isEmpty()) + { + String desc = StringUtils.listToDelimitedString(peptideVariants, + ", "); + SequenceFeature sf = new SequenceFeature( + SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos, + peptidePos, 0f, null); + peptide.addSequenceFeature(sf); + count++; + } + } + + /* + * ugly sort to get sequence features in start position order + * - would be better to store in Sequence as a TreeSet instead? + */ + Arrays.sort(peptide.getSequenceFeatures(), + new Comparator() + { + @Override + public int compare(SequenceFeature o1, SequenceFeature o2) + { + int c = Integer.compare(o1.getBegin(), o2.getBegin()); + return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd()) + : c; + } + }); + return count; + } + + /** + * Builds a map whose key is position in the protein sequence, and value is an + * array of all variants for the coding codon positions + * + * @param dnaSeq + * @param dnaToProtein + * @return + */ + static LinkedHashMap buildDnaVariantsMap( + SequenceI dnaSeq, MapList dnaToProtein) + { + /* + * map from peptide position to all variant features of the codon for it + * LinkedHashMap ensures we add the peptide features in sequence order + */ + LinkedHashMap variants = new LinkedHashMap(); + SequenceOntologyI so = SequenceOntologyFactory.getInstance(); + + SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures(); + if (dnaFeatures == null) + { + return variants; + } + + int dnaStart = dnaSeq.getStart(); + int[] lastCodon = null; + int lastPeptidePostion = 0; + + /* + * build a map of codon variations for peptides + */ + for (SequenceFeature sf : dnaFeatures) + { + int dnaCol = sf.getBegin(); + if (dnaCol != sf.getEnd()) + { + // not handling multi-locus variant features + continue; + } + if (so.isA(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT)) + { + int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol); + if (mapsTo == null) + { + // feature doesn't lie within coding region + continue; + } + int peptidePosition = mapsTo[0]; + String[][] codonVariants = variants.get(peptidePosition); + if (codonVariants == null) + { + codonVariants = new String[3][]; + variants.put(peptidePosition, codonVariants); + } + + /* + * extract dna variants to a string array + */ + String alls = (String) sf.getValue("alleles"); + if (alls == null) + { + continue; + } + String[] alleles = alls.toUpperCase().split(","); + int i = 0; + for (String allele : alleles) + { + alleles[i++] = allele.trim(); // lose any space characters "A, G" + } + + /* + * get this peptide's codon positions e.g. [3, 4, 5] or [4, 7, 10] + */ + int[] codon = peptidePosition == lastPeptidePostion ? lastCodon + : MappingUtils.flattenRanges(dnaToProtein.locateInFrom( + peptidePosition, peptidePosition)); + lastPeptidePostion = peptidePosition; + lastCodon = codon; + + /* + * save nucleotide (and this variant) for each codon position + */ + for (int codonPos = 0; codonPos < 3; codonPos++) + { + String nucleotide = String.valueOf( + dnaSeq.getCharAt(codon[codonPos] - dnaStart)) + .toUpperCase(); + if (codonVariants[codonPos] == null) + { + /* + * record current dna base + */ + codonVariants[codonPos] = new String[] { nucleotide }; + } + if (codon[codonPos] == dnaCol) + { + /* + * add alleles to dna base (and any previously found alleles) + */ + String[] known = codonVariants[codonPos]; + String[] dnaVariants = new String[alleles.length + known.length]; + System.arraycopy(known, 0, dnaVariants, 0, known.length); + System.arraycopy(alleles, 0, dnaVariants, known.length, + alleles.length); + codonVariants[codonPos] = dnaVariants; + } + } + } + } + return variants; + } + + /** + * Returns a sorted, non-redundant list of all peptide translations generated + * by the given dna variants, excluding the current residue value + * + * @param codonVariants + * an array of base values (acgtACGT) for codon positions 1, 2, 3 + * @param residue + * the current residue translation + * @return + */ + static List computePeptideVariants( + String[][] codonVariants, String residue) + { + List result = new ArrayList(); + for (String base1 : codonVariants[0]) + { + for (String base2 : codonVariants[1]) + { + for (String base3 : codonVariants[2]) + { + String codon = base1 + base2 + base3; + /* + * get peptide translation of codon e.g. GAT -> D + * note that variants which are not single alleles, + * e.g. multibase variants or HGMD_MUTATION etc + * are ignored here + */ + String peptide = codon.contains("-") ? "-" + : (codon.length() > 3 ? null : ResidueProperties + .codonTranslate(codon)); + if (peptide != null && !result.contains(peptide) + && !peptide.equalsIgnoreCase(residue)) + { + result.add(peptide); + } + } + } + } + + /* + * sort alphabetically with STOP at the end + */ + Collections.sort(result, new Comparator() + { + + @Override + public int compare(String o1, String o2) + { + if ("STOP".equals(o1)) + { + return 1; + } + else if ("STOP".equals(o2)) + { + return -1; + } + else + { + return o1.compareTo(o2); + } + } + }); + return result; + } } diff --git a/src/jalview/analysis/CrossRef.java b/src/jalview/analysis/CrossRef.java index 2f6076a..7d09a3b 100644 --- a/src/jalview/analysis/CrossRef.java +++ b/src/jalview/analysis/CrossRef.java @@ -30,6 +30,7 @@ import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.util.DBRefUtils; +import jalview.util.MapList; import jalview.ws.SequenceFetcher; import jalview.ws.seqfetcher.ASequenceFetcher; @@ -232,8 +233,9 @@ public class CrossRef * are found e.g. alternative protein products for a protein's gene * @return products (as dataset sequences) */ - public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna, - String source, AlignmentI al, List addedPeers) + public static Alignment findXrefSequences(SequenceI[] seqs, + final boolean dna, final String source, AlignmentI al, + List addedPeers) { AlignmentI dataset = al.getDataset() == null ? al : al.getDataset(); List rseqs = new ArrayList(); @@ -294,7 +296,9 @@ public class CrossRef // xrefs on this sequence. if (dataset != null) { - found |= searchDataset(dss, xref, dataset, rseqs, cf); // ,false,!dna); + found |= searchDataset(dss, xref, dataset, rseqs, cf, false, + !dna); + // ,false,!dna); if (found) { xrfs[r] = null; // we've recovered seqs for this one. @@ -355,6 +359,8 @@ public class CrossRef if (retrieved != null) { + updateDbrefMappings(dna, seq, xrfs, retrieved, cf); + List copiedFeatures = new ArrayList(); CrossRef me = new CrossRef(); for (int rs = 0; rs < retrieved.length; rs++) @@ -463,6 +469,69 @@ public class CrossRef } /** + * Updates any empty mappings in the cross-references with one to a compatible + * retrieved sequence if found, and adds any new mappings to the + * AlignedCodonFrame + * + * @param dna + * @param mapFrom + * @param xrefs + * @param retrieved + * @param acf + */ + static void updateDbrefMappings(boolean dna, SequenceI mapFrom, + DBRefEntry[] xrefs, SequenceI[] retrieved, AlignedCodonFrame acf) + { + SequenceIdMatcher matcher = new SequenceIdMatcher(retrieved); + for (DBRefEntry xref : xrefs) + { + if (!xref.hasMap()) + { + String targetSeqName = xref.getSource() + "|" + + xref.getAccessionId(); + SequenceI[] matches = matcher.findAllIdMatches(targetSeqName); + if (matches == null) + { + return; + } + for (SequenceI seq : matches) + { + MapList mapping = null; + if (dna) + { + mapping = AlignmentUtils.mapCdnaToProtein(seq, mapFrom); + } + else + { + mapping = AlignmentUtils.mapCdnaToProtein(mapFrom, seq); + if (mapping != null) + { + mapping = mapping.getInverse(); + } + } + if (mapping != null) + { + xref.setMap(new Mapping(seq, mapping)); + if (dna) + { + AlignmentUtils.computeProteinFeatures(mapFrom, seq, mapping); + } + if (dna) + { + acf.addMap(mapFrom, seq, mapping); + } + else + { + acf.addMap(seq, mapFrom, mapping.getInverse()); + } + continue; + } + } + } + } + } + + /** * find references to lrfs in the cross-reference set of each sequence in * dataset (that is not equal to sequenceI) Identifies matching DBRefEntry * based on source and accession string only - Map and Version are nulled. diff --git a/src/jalview/ext/ensembl/EnsemblCdna.java b/src/jalview/ext/ensembl/EnsemblCdna.java index 028492e..856be74 100644 --- a/src/jalview/ext/ensembl/EnsemblCdna.java +++ b/src/jalview/ext/ensembl/EnsemblCdna.java @@ -4,6 +4,7 @@ import jalview.datamodel.SequenceFeature; import jalview.io.gff.SequenceOntologyFactory; import jalview.io.gff.SequenceOntologyI; +import java.util.Arrays; import java.util.List; import com.stevesoft.pat.Regex; @@ -18,6 +19,9 @@ import com.stevesoft.pat.Regex; */ public class EnsemblCdna extends EnsemblSeqProxy { + private static final List CROSS_REFERENCES = Arrays + .asList(new String[] { "Uniprot/SWISSPROT", "Uniprot/SPTREMBL" }); + /* * accepts ENST or ENSTG with 11 digits * or ENSMUST or similar for other species @@ -113,7 +117,7 @@ public class EnsemblCdna extends EnsemblSeqProxy @Override protected List getCrossReferenceDatabases() { - return super.getCrossReferenceDatabases(); + return CROSS_REFERENCES; // 30/01/16 also found Vega_transcript, OTTT, ENS_LRG_transcript, UCSC, // HGNC_trans_name, RefSeq_mRNA, RefSeq_mRNA_predicted } diff --git a/src/jalview/ext/ensembl/EnsemblCds.java b/src/jalview/ext/ensembl/EnsemblCds.java index 63df7a7..2086eba 100644 --- a/src/jalview/ext/ensembl/EnsemblCds.java +++ b/src/jalview/ext/ensembl/EnsemblCds.java @@ -107,7 +107,6 @@ public class EnsemblCds extends EnsemblSeqProxy * and also means we don't need to keep CDS features on CDS sequence (where * they are redundant information). */ - @Override protected List getCdsRanges(SequenceI dnaSeq) { int len = dnaSeq.getLength(); diff --git a/src/jalview/ext/ensembl/EnsemblGene.java b/src/jalview/ext/ensembl/EnsemblGene.java index aa5e0ab..3b32797 100644 --- a/src/jalview/ext/ensembl/EnsemblGene.java +++ b/src/jalview/ext/ensembl/EnsemblGene.java @@ -29,6 +29,9 @@ import com.stevesoft.pat.Regex; */ public class EnsemblGene extends EnsemblSeqProxy { + private static final List CROSS_REFERENCES = Arrays + .asList(new String[] { "CCDS" }); + private static final String GENE_PREFIX = "gene:"; /* @@ -332,7 +335,7 @@ public class EnsemblGene extends EnsemblSeqProxy /* * fetch and save cross-references */ - super.getCrossReferences(transcript); + new EnsemblCdna(getDomain()).getCrossReferences(transcript); /* * and finally fetch the protein product and save as a cross-reference @@ -468,7 +471,7 @@ public class EnsemblGene extends EnsemblSeqProxy // found these for ENSG00000157764 on 30/01/2016: // return new String[] {"Vega_gene", "OTTG", "ENS_LRG_gene", "ArrayExpress", // "EntrezGene", "HGNC", "MIM_GENE", "MIM_MORBID", "WikiGene"}; - return super.getCrossReferenceDatabases(); + return CROSS_REFERENCES; } /** diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index 5e27158..4af6525 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -13,11 +13,8 @@ import jalview.io.FastaFile; import jalview.io.FileParse; import jalview.io.gff.SequenceOntologyFactory; import jalview.io.gff.SequenceOntologyI; -import jalview.schemes.ResidueProperties; import jalview.util.DBRefUtils; import jalview.util.MapList; -import jalview.util.MappingUtils; -import jalview.util.StringUtils; import java.io.IOException; import java.net.MalformedURLException; @@ -26,9 +23,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; -import java.util.LinkedHashMap; import java.util.List; -import java.util.Map.Entry; /** * Base class for Ensembl sequence fetchers @@ -39,7 +34,8 @@ import java.util.Map.Entry; public abstract class EnsemblSeqProxy extends EnsemblRestClient { private static final List CROSS_REFERENCES = Arrays - .asList(new String[] { "CCDS", "Uniprot/SWISSPROT" }); + .asList(new String[] { "CCDS", "Uniprot/SWISSPROT", + "Uniprot/SPTREMBL" }); protected static final String CONSEQUENCE_TYPE = "consequence_type"; @@ -267,7 +263,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient .getSequenceRecords(accId); if (protein == null || protein.getHeight() == 0) { - System.out.println("Failed to retrieve protein for " + accId); + System.out.println("No protein product found for " + accId); return; } SequenceI proteinSeq = protein.getSequenceAt(0); @@ -278,7 +274,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient proteinSeq.createDatasetSequence(); querySeq.createDatasetSequence(); - MapList mapList = mapCdsToProtein(querySeq, proteinSeq); + MapList mapList = AlignmentUtils.mapCdsToProtein(querySeq, proteinSeq); if (mapList != null) { // clunky: ensure Uniprot xref if we have one is on mapped sequence @@ -290,10 +286,10 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient querySeq.getDatasetSequence().addDBRef(dbr); /* - * compute peptide variants from dna variants and add as - * sequence features on the protein sequence ta-da + * copy exon features to protein, compute peptide variants from dna + * variants and add as features on the protein sequence ta-da */ - computeProteinFeatures(querySeq, proteinSeq, mapList); + AlignmentUtils.computeProteinFeatures(querySeq, proteinSeq, mapList); } } catch (Exception e) { @@ -333,6 +329,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient /** * Returns a list of database names to be used when fetching cross-references. + * Specifically, the names are used to filter data returned by the Ensembl + * xrefs REST service on the value in field 'dbname'. * * @return */ @@ -342,104 +340,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** - * Returns a mapping from dna to protein by inspecting sequence features of - * type "CDS" on the dna. - * - * @param dnaSeq - * @param proteinSeq - * @return - */ - protected MapList mapCdsToProtein(SequenceI dnaSeq, SequenceI proteinSeq) - { - List ranges = getCdsRanges(dnaSeq); - int mappedDnaLength = MappingUtils.getLength(ranges); - - int proteinLength = proteinSeq.getLength(); - int proteinEnd = proteinLength; - int proteinStart = 1; - - /* - * incomplete start codon may mean X at start of peptide - * we ignore both for mapping purposes - */ - if (proteinSeq.getCharAt(0) == 'X') - { - proteinStart = 2; - proteinLength--; - } - List proteinRange = new ArrayList(); - - /* - * dna length should map to protein (or protein plus stop codon) - */ - int codesForResidues = mappedDnaLength / 3; - if (codesForResidues == (proteinLength + 1)) - { - // assuming extra codon is for STOP and not in peptide - codesForResidues--; - } - if (codesForResidues == proteinLength) - { - proteinRange.add(new int[] { proteinStart, proteinEnd }); - return new MapList(ranges, proteinRange, 3, 1); - } - return null; - } - - /** - * Returns a list of CDS ranges found. - * - * No need to worry about reverse strand dna, here since the retrieved - * sequence is as transcribed (reverse complement for reverse strand), i.e in - * the same sense as the peptide. - * - * @param dnaSeq - * @return - */ - protected List getCdsRanges(SequenceI dnaSeq) - { - List result = new ArrayList(); - SequenceFeature[] sfs = dnaSeq.getSequenceFeatures(); - if (sfs == null) - { - return result; - } - SequenceOntologyI so = SequenceOntologyFactory.getInstance(); - for (SequenceFeature sf : sfs) - { - /* - * process a CDS feature (or a sub-type of CDS) - */ - if (so.isA(sf.getType(), SequenceOntologyI.CDS)) - { - int phase = 0; - try { - phase = Integer.parseInt(sf.getPhase()); - } catch (NumberFormatException e) - { - // ignore - } - /* - * phase > 0 on first codon means 5' incomplete - skip to the start - * of the next codon; example ENST00000496384 - */ - int begin = sf.getBegin(); - int end = sf.getEnd(); - if (result.isEmpty()) - { - begin += phase; - if (begin > end) - { - continue; // shouldn't happen? - } - } - result.add(new int[] { begin, end }); - } - } - return result; - } - - /** * Fetches sequences for the list of accession ids and adds them to the * alignment. Returns the extended (or created) alignment. * @@ -893,240 +793,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** - * Maps exon features from dna to protein, and computes variants in peptide - * product generated by variants in dna, and adds them as sequence_variant - * features on the protein sequence. Returns the number of variant features - * added. - * - * @param dnaSeq - * @param peptide - * @param dnaToProtein - */ - static int computeProteinFeatures(SequenceI dnaSeq, - SequenceI peptide, MapList dnaToProtein) - { - while (dnaSeq.getDatasetSequence() != null) - { - dnaSeq = dnaSeq.getDatasetSequence(); - } - while (peptide.getDatasetSequence() != null) - { - peptide = peptide.getDatasetSequence(); - } - - AlignmentUtils.transferFeatures(dnaSeq, peptide, dnaToProtein, - SequenceOntologyI.EXON); - - LinkedHashMap variants = buildDnaVariantsMap( - dnaSeq, dnaToProtein); - - /* - * scan codon variations, compute peptide variants and add to peptide sequence - */ - int count = 0; - for (Entry variant : variants.entrySet()) - { - int peptidePos = variant.getKey(); - String[][] codonVariants = variant.getValue(); - String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); // 0-based - List peptideVariants = computePeptideVariants(codonVariants, - residue); - if (!peptideVariants.isEmpty()) - { - String desc = StringUtils.listToDelimitedString(peptideVariants, - ", "); - SequenceFeature sf = new SequenceFeature( - SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos, - peptidePos, 0f, null); - peptide.addSequenceFeature(sf); - count++; - } - } - - /* - * ugly sort to get sequence features in start position order - * - would be better to store in Sequence as a TreeSet instead? - */ - Arrays.sort(peptide.getSequenceFeatures(), - new Comparator() - { - @Override - public int compare(SequenceFeature o1, SequenceFeature o2) - { - int c = Integer.compare(o1.getBegin(), o2.getBegin()); - return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd()) - : c; - } - }); - return count; - } - - /** - * Builds a map whose key is position in the protein sequence, and value is an - * array of all variants for the coding codon positions - * - * @param dnaSeq - * @param dnaToProtein - * @return - */ - static LinkedHashMap buildDnaVariantsMap( - SequenceI dnaSeq, MapList dnaToProtein) - { - /* - * map from peptide position to all variant features of the codon for it - * LinkedHashMap ensures we add the peptide features in sequence order - */ - LinkedHashMap variants = new LinkedHashMap(); - SequenceOntologyI so = SequenceOntologyFactory.getInstance(); - - SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures(); - if (dnaFeatures == null) - { - return variants; - } - - int dnaStart = dnaSeq.getStart(); - int[] lastCodon = null; - int lastPeptidePostion = 0; - - /* - * build a map of codon variations for peptides - */ - for (SequenceFeature sf : dnaFeatures) - { - int dnaCol = sf.getBegin(); - if (dnaCol != sf.getEnd()) - { - // not handling multi-locus variant features - continue; - } - if (so.isA(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT)) - { - int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol); - if (mapsTo == null) - { - // feature doesn't lie within coding region - continue; - } - int peptidePosition = mapsTo[0]; - String[][] codonVariants = variants.get(peptidePosition); - if (codonVariants == null) - { - codonVariants = new String[3][]; - variants.put(peptidePosition, codonVariants); - } - - /* - * extract dna variants to a string array - */ - String alls = (String) sf.getValue("alleles"); - if (alls == null) - { - continue; - } - String[] alleles = alls.split(","); - - /* - * get this peptides codon positions e.g. [3, 4, 5] or [4, 7, 10] - */ - int[] codon = peptidePosition == lastPeptidePostion ? lastCodon - : MappingUtils.flattenRanges(dnaToProtein.locateInFrom( - peptidePosition, peptidePosition)); - lastPeptidePostion = peptidePosition; - lastCodon = codon; - - /* - * save nucleotide (and this variant) for each codon position - */ - for (int codonPos = 0; codonPos < 3; codonPos++) - { - String nucleotide = String.valueOf(dnaSeq - .getCharAt(codon[codonPos] - dnaStart)); - if (codon[codonPos] == dnaCol) - { - /* - * record current dna base and its alleles - */ - String[] dnaVariants = new String[alleles.length + 1]; - dnaVariants[0] = nucleotide; - System.arraycopy(alleles, 0, dnaVariants, 1, alleles.length); - codonVariants[codonPos] = dnaVariants; - } - else if (codonVariants[codonPos] == null) - { - /* - * record current dna base only - * (at least until we find any variation and overwrite it) - */ - codonVariants[codonPos] = new String[] { nucleotide }; - } - } - } - } - return variants; - } - - /** - * Returns a sorted, non-redundant list of all peptide translations generated - * by the given dna variants, excluding the current residue value - * - * @param codonVariants - * an array of base values (acgtACGT) for codon positions 1, 2, 3 - * @param residue - * the current residue translation - * @return - */ - static List computePeptideVariants( - String[][] codonVariants, String residue) - { - List result = new ArrayList(); - for (String base1 : codonVariants[0]) - { - for (String base2 : codonVariants[1]) - { - for (String base3 : codonVariants[2]) - { - String codon = base1 + base2 + base3; - // TODO: report frameshift/insertion/deletion - // and multiple-base variants?! - String peptide = codon.contains("-") ? "-" : ResidueProperties - .codonTranslate(codon); - if (peptide != null && !result.contains(peptide) - && !peptide.equalsIgnoreCase(residue)) - { - result.add(peptide); - } - } - } - } - - /* - * sort alphabetically with STOP at the end - */ - Collections.sort(result, new Comparator() - { - - @Override - public int compare(String o1, String o2) - { - if ("STOP".equals(o1)) - { - return 1; - } - else if ("STOP".equals(o2)) - { - return -1; - } - else - { - return o1.compareTo(o2); - } - } - }); - return result; - } - - /** * Answers true if the feature type is either 'NMD_transcript_variant' or * 'transcript' or one of its sub-types in the Sequence Ontology. This is * needed because NMD_transcript_variant behaves like 'transcript' in Ensembl diff --git a/src/jalview/gui/AlignFrame.java b/src/jalview/gui/AlignFrame.java index 433afba..0d6efe4 100644 --- a/src/jalview/gui/AlignFrame.java +++ b/src/jalview/gui/AlignFrame.java @@ -54,6 +54,7 @@ import jalview.datamodel.AlignmentI; import jalview.datamodel.AlignmentOrder; import jalview.datamodel.AlignmentView; import jalview.datamodel.ColumnSelection; +import jalview.datamodel.DBRefSource; import jalview.datamodel.HiddenSequences; import jalview.datamodel.PDBEntry; import jalview.datamodel.SeqCigar; @@ -4750,17 +4751,6 @@ public class AlignFrame extends GAlignFrame implements DropTargetListener, Alignment al = makeCrossReferencesAlignment( alignment.getDataset(), xrefs); - /* - * Copy dna-to-protein mappings to new alignment - */ - // TODO 1: no mappings are set up for EMBL product - // TODO 2: if they were, should add them to protein alignment, not - // dna - // List cf = xrefs.getCodonFrames(); - // for (AlignedCodonFrame acf : cf) - // { - // al.addCodonFrame(acf); - // } AlignFrame newFrame = new AlignFrame(al, DEFAULT_WIDTH, DEFAULT_HEIGHT); String newtitle = String.format("%s %s %s", @@ -4785,6 +4775,10 @@ public class AlignFrame extends GAlignFrame implements DropTargetListener, { copyAlignment = AlignmentUtils.makeCdsAlignment( sequenceSelection, cf, alignment); + if (copyAlignment.getHeight() == 0) + { + System.err.println("Failed to make CDS alignment"); + } al.getCodonFrames().clear(); al.getCodonFrames().addAll(cf); } @@ -4809,56 +4803,69 @@ public class AlignFrame extends GAlignFrame implements DropTargetListener, copyAlignment.addSequence(peer); } - /* - * align protein to dna - */ - if (dna) + if (copyAlignment.getHeight() > 0) { - al.alignAs(copyAlignment); - } - else - { - copyAlignment.alignAs(al); - } - - AlignFrame copyThis = new AlignFrame(copyAlignment, - AlignFrame.DEFAULT_WIDTH, AlignFrame.DEFAULT_HEIGHT); - copyThis.setTitle(AlignFrame.this.getTitle()); - - boolean showSequenceFeatures = viewport - .isShowSequenceFeatures(); - newFrame.setShowSeqFeatures(showSequenceFeatures); - copyThis.setShowSeqFeatures(showSequenceFeatures); - FeatureRenderer myFeatureStyling = alignPanel.getSeqPanel().seqCanvas - .getFeatureRenderer(); - - /* - * copy feature rendering settings to split frame - */ - newFrame.alignPanel.getSeqPanel().seqCanvas - .getFeatureRenderer().transferSettings( - myFeatureStyling); - copyThis.alignPanel.getSeqPanel().seqCanvas - .getFeatureRenderer().transferSettings( - myFeatureStyling); + /* + * align protein to dna + */ + // FIXME what if the dna is not aligned :-O + if (dna) + { + al.alignAs(copyAlignment); + } + else + { + /* + * align cdna to protein - currently only if + * fetching and aligning Ensembl transcripts! + */ + if (DBRefSource.ENSEMBL.equalsIgnoreCase(source)) + { + copyAlignment.alignAs(al); + } + } - /* - * apply 'database source' feature configuration - * if any was found - */ - newFrame.getViewport() - .applyFeaturesStyle(featureColourScheme); - copyThis.getViewport() - .applyFeaturesStyle(featureColourScheme); - - SplitFrame sf = new SplitFrame(dna ? copyThis : newFrame, - dna ? newFrame : copyThis); - newFrame.setVisible(true); - copyThis.setVisible(true); - String linkedTitle = MessageManager - .getString("label.linked_view_title"); - Desktop.addInternalFrame(sf, linkedTitle, -1, -1); - sf.adjustDivider(); + AlignFrame copyThis = new AlignFrame(copyAlignment, + AlignFrame.DEFAULT_WIDTH, AlignFrame.DEFAULT_HEIGHT); + copyThis.setTitle(AlignFrame.this.getTitle()); + + boolean showSequenceFeatures = viewport + .isShowSequenceFeatures(); + newFrame.setShowSeqFeatures(showSequenceFeatures); + copyThis.setShowSeqFeatures(showSequenceFeatures); + FeatureRenderer myFeatureStyling = alignPanel.getSeqPanel().seqCanvas + .getFeatureRenderer(); + + /* + * copy feature rendering settings to split frame + */ + newFrame.alignPanel.getSeqPanel().seqCanvas + .getFeatureRenderer().transferSettings( + myFeatureStyling); + copyThis.alignPanel.getSeqPanel().seqCanvas + .getFeatureRenderer().transferSettings( + myFeatureStyling); + + /* + * apply 'database source' feature configuration + * if any was found + */ + // TODO is this the feature colouring for the original + // alignment or the fetched xrefs? either could be Ensembl + newFrame.getViewport().applyFeaturesStyle( + featureColourScheme); + copyThis.getViewport().applyFeaturesStyle( + featureColourScheme); + + SplitFrame sf = new SplitFrame(dna ? copyThis : newFrame, + dna ? newFrame : copyThis); + newFrame.setVisible(true); + copyThis.setVisible(true); + String linkedTitle = MessageManager + .getString("label.linked_view_title"); + Desktop.addInternalFrame(sf, linkedTitle, -1, -1); + sf.adjustDivider(); + } } else { diff --git a/src/jalview/ws/dbsources/Uniprot.java b/src/jalview/ws/dbsources/Uniprot.java index 455dcb5..12ebe90 100644 --- a/src/jalview/ws/dbsources/Uniprot.java +++ b/src/jalview/ws/dbsources/Uniprot.java @@ -271,7 +271,9 @@ public class Uniprot extends DbSourceProxyImpl public static String getUniprotEntryId(UniprotEntry entry) { StringBuilder name = new StringBuilder(32); - name.append("UniProt/Swiss-Prot"); + // name.append("UniProt/Swiss-Prot"); + // use 'canonicalised' name for optimal id matching + name.append(DBRefSource.UNIPROT); for (String accessionId : entry.getAccession()) { name.append(BAR_DELIMITER); diff --git a/test/jalview/analysis/AlignmentUtilsTests.java b/test/jalview/analysis/AlignmentUtilsTests.java index abe3f55..8bdd740 100644 --- a/test/jalview/analysis/AlignmentUtilsTests.java +++ b/test/jalview/analysis/AlignmentUtilsTests.java @@ -48,6 +48,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.Iterator; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Set; @@ -571,9 +572,15 @@ public class AlignmentUtilsTests @Test(groups = { "Functional" }) public void testTranslatesAs() { + // null arguments check + assertFalse(AlignmentUtils.translatesAs(null, 0, null)); + assertFalse(AlignmentUtils.translatesAs(new char[] { 't' }, 0, null)); + assertFalse(AlignmentUtils.translatesAs(null, 0, new char[] { 'a' })); + + // straight translation assertTrue(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 0, "FPKG".toCharArray())); - // with start codon (not in protein) + // with extra start codon (not in protein) assertTrue(AlignmentUtils.translatesAs("atgtttcccaaaggg".toCharArray(), 3, "FPKG".toCharArray())); // with stop codon1 (not in protein) @@ -601,7 +608,7 @@ public class AlignmentUtilsTests assertTrue(AlignmentUtils.translatesAs( "atgtttcccaaagggtga".toCharArray(), 3, "FPKG".toCharArray())); - // with embedded stop codon + // with embedded stop codons assertTrue(AlignmentUtils.translatesAs( "atgtttTAGcccaaaTAAgggtga".toCharArray(), 3, "F*PK*G".toCharArray())); @@ -609,6 +616,26 @@ public class AlignmentUtilsTests // wrong protein assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), 0, "FPMG".toCharArray())); + + // truncated dna + assertFalse(AlignmentUtils.translatesAs("tttcccaaagg".toCharArray(), 0, + "FPKG".toCharArray())); + + // truncated protein + assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), + 0, "FPK".toCharArray())); + + // overlong dna (doesn't end in stop codon) + assertFalse(AlignmentUtils.translatesAs( + "tttcccaaagggttt".toCharArray(), 0, "FPKG".toCharArray())); + + // dna + stop codon + more + assertFalse(AlignmentUtils.translatesAs( + "tttcccaaagggttaga".toCharArray(), 0, "FPKG".toCharArray())); + + // overlong protein + assertFalse(AlignmentUtils.translatesAs("tttcccaaaggg".toCharArray(), + 0, "FPKGQ".toCharArray())); } /** @@ -1342,7 +1369,7 @@ public class AlignmentUtilsTests * @throws IOException */ @Test(groups = { "Functional" }) - public void testMapProteinSequenceToCdna_forSubsequence() + public void testMapCdnaToProtein_forSubsequence() throws IOException { SequenceI prot = new Sequence("UNIPROT|V12345", "E-I--Q", 10, 12); @@ -1351,7 +1378,7 @@ public class AlignmentUtilsTests SequenceI dna = new Sequence("EMBL|A33333", "GAA--AT-C-CAG", 40, 48); dna.createDatasetSequence(); - MapList map = AlignmentUtils.mapProteinSequenceToCdna(prot, dna); + MapList map = AlignmentUtils.mapCdnaToProtein(prot, dna); assertEquals(10, map.getToLowest()); assertEquals(12, map.getToHighest()); assertEquals(40, map.getFromLowest()); @@ -1786,4 +1813,197 @@ public class AlignmentUtilsTests assertEquals("--N-G", prot2.getSequenceAsString()); assertEquals("---XG", prot3.getSequenceAsString()); } + + /** + * Tests for the method that maps the subset of a dna sequence that has CDS + * (or subtype) feature - case where the start codon is incomplete. + */ + @Test(groups = "Functional") + public void testGetCdsRanges_fivePrimeIncomplete() + { + SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt"); + dnaSeq.createDatasetSequence(); + SequenceI ds = dnaSeq.getDatasetSequence(); + + // CDS for dna 5-6 (incomplete codon), 7-9 + SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null); + sf.setPhase("2"); // skip 2 bases to start of next codon + ds.addSequenceFeature(sf); + // CDS for dna 13-15 + sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null); + ds.addSequenceFeature(sf); + + List ranges = AlignmentUtils.findCdsPositions(dnaSeq); + + /* + * check the mapping starts with the first complete codon + */ + assertEquals(6, MappingUtils.getLength(ranges)); + assertEquals(2, ranges.size()); + assertEquals(7, ranges.get(0)[0]); + assertEquals(9, ranges.get(0)[1]); + assertEquals(13, ranges.get(1)[0]); + assertEquals(15, ranges.get(1)[1]); + } + + /** + * Tests for the method that maps the subset of a dna sequence that has CDS + * (or subtype) feature. + */ + @Test(groups = "Functional") + public void testGetCdsRanges() + { + SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt"); + dnaSeq.createDatasetSequence(); + SequenceI ds = dnaSeq.getDatasetSequence(); + + // CDS for dna 3-6 + SequenceFeature sf = new SequenceFeature("CDS", "", 4, 6, 0f, null); + ds.addSequenceFeature(sf); + // exon feature should be ignored here + sf = new SequenceFeature("exon", "", 7, 9, 0f, null); + ds.addSequenceFeature(sf); + // CDS for dna 10-12 + sf = new SequenceFeature("CDS_predicted", "", 10, 12, 0f, null); + ds.addSequenceFeature(sf); + + List ranges = AlignmentUtils.findCdsPositions(dnaSeq); + assertEquals(6, MappingUtils.getLength(ranges)); + assertEquals(2, ranges.size()); + assertEquals(4, ranges.get(0)[0]); + assertEquals(6, ranges.get(0)[1]); + assertEquals(10, ranges.get(1)[0]); + assertEquals(12, ranges.get(1)[1]); + } + + /** + * Test the method that computes a map of codon variants for each protein + * position from "sequence_variant" features on dna + */ + @Test(groups = "Functional") + public void testBuildDnaVariantsMap() + { + SequenceI dna = new Sequence("dna", "atgAAATTTGGGCCCtag"); + MapList map = new MapList(new int[] { 1, 18 }, new int[] { 1, 5 }, 3, 1); + + /* + * first with no variants on dna + */ + LinkedHashMap variantsMap = AlignmentUtils + .buildDnaVariantsMap(dna, map); + assertTrue(variantsMap.isEmpty()); + + // single allele codon 1, on base 1 + SequenceFeature sf = new SequenceFeature("sequence_variant", "", 1, 1, + 0f, null); + sf.setValue("alleles", "T"); + dna.addSequenceFeature(sf); + + // two alleles codon 2, on bases 2 and 3 + sf = new SequenceFeature("sequence_variant", "", 5, 5, 0f, null); + sf.setValue("alleles", "T"); + dna.addSequenceFeature(sf); + sf = new SequenceFeature("sequence_variant", "", 6, 6, 0f, null); + sf.setValue("alleles", "G"); + dna.addSequenceFeature(sf); + + // two alleles codon 3, both on base 2 + sf = new SequenceFeature("sequence_variant", "", 8, 8, 0f, null); + sf.setValue("alleles", "C, G"); + dna.addSequenceFeature(sf); + + // no alleles on codon 4 + // alleles on codon 5 on all 3 bases + sf = new SequenceFeature("sequence_variant", "", 13, 13, 0f, null); + sf.setValue("alleles", "C, G"); // (C duplicates given base value) + dna.addSequenceFeature(sf); + sf = new SequenceFeature("sequence_variant", "", 14, 14, 0f, null); + sf.setValue("alleles", "g, a"); // should force to upper-case + dna.addSequenceFeature(sf); + sf = new SequenceFeature("sequence_variant", "", 15, 15, 0f, null); + sf.setValue("alleles", "A, T"); + dna.addSequenceFeature(sf); + + variantsMap = AlignmentUtils.buildDnaVariantsMap(dna, map); + assertEquals(4, variantsMap.size()); + assertTrue(Arrays.deepEquals(new String[][] { { "A", "T" }, { "T" }, + { "G" } }, variantsMap.get(1))); + assertTrue(Arrays.deepEquals(new String[][] { { "A" }, { "A", "T" }, + { "A", "G" } }, variantsMap.get(2))); + assertTrue(Arrays.deepEquals(new String[][] { { "T" }, + { "T", "C", "G" }, { "T" } }, variantsMap.get(3))); + // duplicated bases are not removed here, handled in computePeptideVariants + assertTrue(Arrays.deepEquals(new String[][] { { "C", "C", "G" }, + { "C", "G", "A" }, { "C", "A", "T" } }, variantsMap.get(5))); + } + + /** + * Tests for the method that computes all peptide variants given codon + * variants + */ + @Test(groups = "Functional") + public void testComputePeptideVariants() + { + String[][] codonVariants = new String[][] { { "A" }, { "G" }, { "T" } }; + + /* + * AGT codes for S - this is not included in the variants returned + */ + List variants = AlignmentUtils.computePeptideVariants(codonVariants, "S"); + assertEquals("[]", variants.toString()); + + // S is reported if it differs from the current value (A): + variants = AlignmentUtils.computePeptideVariants(codonVariants, "A"); + assertEquals("[S]", variants.toString()); + + /* + * synonymous variant is not reported + */ + codonVariants = new String[][] { { "A" }, { "G" }, { "C", "T" } }; + // AGC and AGT both code for S + variants = AlignmentUtils.computePeptideVariants(codonVariants, "s"); + assertEquals("[]", variants.toString()); + + /* + * equivalent variants are only reported once + */ + codonVariants = new String[][] { { "C" }, { "T" }, + { "A", "C", "G", "T" } }; + // CTA CTC CTG CTT all code for L + variants = AlignmentUtils.computePeptideVariants(codonVariants, "S"); + assertEquals("[L]", variants.toString()); + + /* + * vary codons 1 and 2; variant products are sorted and non-redundant + */ + codonVariants = new String[][] { { "a", "C" }, { "g", "T" }, { "A" } }; + // aga ata cga cta code for R, I, R, L + variants = AlignmentUtils.computePeptideVariants(codonVariants, "S"); + assertEquals("[I, L, R]", variants.toString()); + + /* + * vary codons 2 and 3 + */ + codonVariants = new String[][] { { "a" }, { "g", "T" }, { "A", "c" } }; + // aga agc ata atc code for R, S, I, I + variants = AlignmentUtils.computePeptideVariants(codonVariants, "S"); + assertEquals("[I, R]", variants.toString()); + + /* + * vary codons 1 and 3 + */ + codonVariants = new String[][] { { "a", "t" }, { "a" }, { "t", "g" } }; + // aat aag tat tag code for N, K, Y, STOP - STOP sorted to end + variants = AlignmentUtils.computePeptideVariants(codonVariants, "S"); + assertEquals("[K, N, Y, STOP]", variants.toString()); + + /* + * vary codons 1, 2 and 3 + */ + codonVariants = new String[][] { { "a", "t" }, { "G", "C" }, + { "t", "g" } }; + // agt agg act acg tgt tgg tct tcg code for S, R, T, T, C, W, S, S + variants = AlignmentUtils.computePeptideVariants(codonVariants, "S"); + assertEquals("[C, R, T, W]", variants.toString()); + } } diff --git a/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java b/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java index 5d95a3c..f9c2c4b 100644 --- a/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java +++ b/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java @@ -4,17 +4,15 @@ import static org.testng.AssertJUnit.assertEquals; import static org.testng.AssertJUnit.assertFalse; import static org.testng.AssertJUnit.assertTrue; +import jalview.analysis.AlignmentUtils; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; -import jalview.datamodel.Sequence; -import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.io.AppletFormatAdapter; import jalview.io.FastaFile; import jalview.io.FileParse; import jalview.io.gff.SequenceOntologyFactory; import jalview.io.gff.SequenceOntologyLite; -import jalview.util.MappingUtils; import java.lang.reflect.Method; import java.net.MalformedURLException; @@ -216,149 +214,12 @@ public class EnsemblSeqProxyTest : "DOWN or unreachable ******************* BAD!")); } - /** - * Tests for the method that computes all peptide variants given codon - * variants - */ - @Test(groups = "Functional") - public void testComputePeptideVariants() - { - String[][] codonVariants = new String[][] { { "A" }, { "G" }, { "T" } }; - - /* - * AGT codes for S - this is not included in the variants returned - */ - List variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S"); - assertEquals("[]", variants.toString()); - - // S is reported if it differs from the current value (A): - variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "A"); - assertEquals("[S]", variants.toString()); - - /* - * synonymous variant is not reported - */ - codonVariants = new String[][] { { "A" }, { "G" }, { "C", "T" } }; - // AGC and AGT both code for S - variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "s"); - assertEquals("[]", variants.toString()); - - /* - * equivalent variants are only reported once - */ - codonVariants = new String[][] { { "C" }, { "T" }, - { "A", "C", "G", "T" } }; - // CTA CTC CTG CTT all code for L - variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S"); - assertEquals("[L]", variants.toString()); - - /* - * vary codons 1 and 2; variant products are sorted and non-redundant - */ - codonVariants = new String[][] { { "a", "C" }, { "g", "T" }, { "A" } }; - // aga ata cga cta code for R, I, R, L - variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S"); - assertEquals("[I, L, R]", variants.toString()); - - /* - * vary codons 2 and 3 - */ - codonVariants = new String[][] { { "a" }, { "g", "T" }, { "A", "c" } }; - // aga agc ata atc code for R, S, I, I - variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S"); - assertEquals("[I, R]", variants.toString()); - - /* - * vary codons 1 and 3 - */ - codonVariants = new String[][] { { "a", "t" }, { "a" }, { "t", "g" } }; - // aat aag tat tag code for N, K, Y, STOP - STOP sorted to end - variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S"); - assertEquals("[K, N, Y, STOP]", variants.toString()); - - /* - * vary codons 1, 2 and 3 - */ - codonVariants = new String[][] { { "a", "t" }, { "G", "C" }, - { "t", "g" } }; - // agt agg act acg tgt tgg tct tcg code for S, R, T, T, C, W, S, S - variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S"); - assertEquals("[C, R, T, W]", variants.toString()); - } - - /** - * Tests for the method that maps the subset of a dna sequence that has CDS - * (or subtype) feature. - */ - @Test(groups = "Functional") - public void testGetCdsRanges() - { - EnsemblSeqProxy testee = new EnsemblSeqProxyAdapter(); - - SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt"); - dnaSeq.createDatasetSequence(); - SequenceI ds = dnaSeq.getDatasetSequence(); - - // CDS for dna 3-6 - SequenceFeature sf = new SequenceFeature("CDS", "", 4, 6, 0f, null); - ds.addSequenceFeature(sf); - // exon feature should be ignored here - sf = new SequenceFeature("exon", "", 7, 9, 0f, null); - ds.addSequenceFeature(sf); - // CDS for dna 10-12 - sf = new SequenceFeature("CDS_predicted", "", 10, 12, 0f, null); - ds.addSequenceFeature(sf); - - List ranges = testee.getCdsRanges(dnaSeq); - assertEquals(6, MappingUtils.getLength(ranges)); - assertEquals(2, ranges.size()); - assertEquals(4, ranges.get(0)[0]); - assertEquals(6, ranges.get(0)[1]); - assertEquals(10, ranges.get(1)[0]); - assertEquals(12, ranges.get(1)[1]); - - } - @Test(groups = "Functional") public void getGenomicRangesFromFeatures() { } - /** - * Tests for the method that maps the subset of a dna sequence that has CDS - * (or subtype) feature - case where the start codon is incomplete. - */ - @Test(groups = "Functional") - public void testGetCdsRanges_fivePrimeIncomplete() - { - EnsemblSeqProxy testee = new EnsemblSeqProxyAdapter(); - - SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt"); - dnaSeq.createDatasetSequence(); - SequenceI ds = dnaSeq.getDatasetSequence(); - - // CDS for dna 5-6 (incomplete codon), 7-9 - SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null); - sf.setPhase("2"); // skip 2 bases to start of next codon - ds.addSequenceFeature(sf); - // CDS for dna 13-15 - sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null); - ds.addSequenceFeature(sf); - - List ranges = testee.getCdsRanges(dnaSeq); - - /* - * check the mapping starts with the first complete codon - */ - assertEquals(6, MappingUtils.getLength(ranges)); - assertEquals(2, ranges.size()); - assertEquals(7, ranges.get(0)[0]); - assertEquals(9, ranges.get(0)[1]); - assertEquals(13, ranges.get(1)[0]); - assertEquals(15, ranges.get(1)[1]); - } - @Test(groups = "Functional") public void testIsTranscriptIdentifier() { -- 1.7.10.2