X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=src%2Fjalview%2Fanalysis%2FAlignmentUtils.java;h=bef667dfb140e63241fe91059d1d0cac4bd78cb9;hb=953077c86d17dccbf571fca50ccb86fb4f93c40a;hp=c88a4628302e52f2afb5cb97f2c2462ef532e2d9;hpb=6fe36904fddf9ecb85e67974f48081bba373e8ab;p=jalview.git diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index c88a462..bef667d 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -29,6 +29,7 @@ import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentAnnotation; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; +import jalview.datamodel.GeneLociI; import jalview.datamodel.IncompleteCodonException; import jalview.datamodel.Mapping; import jalview.datamodel.Sequence; @@ -394,7 +395,7 @@ public class AlignmentUtils * Answers true if the mappings include one between the given (dataset) * sequences. */ - public static boolean mappingExists(List mappings, + protected static boolean mappingExists(List mappings, SequenceI aaSeq, SequenceI cdnaSeq) { if (mappings != null) @@ -1633,7 +1634,7 @@ public class AlignmentUtils AlignmentI dataset, SequenceI[] products) { if (dataset == null || dataset.getDataset() != null) - { + { throw new IllegalArgumentException( "IMPLEMENTATION ERROR: dataset.getDataset() must be null!"); } @@ -1645,10 +1646,10 @@ public class AlignmentUtils { productSeqs = new HashSet(); for (SequenceI seq : products) - { + { productSeqs.add(seq.getDatasetSequence() == null ? seq : seq .getDatasetSequence()); - } + } } /* @@ -1670,15 +1671,15 @@ public class AlignmentUtils List seqMappings = MappingUtils .findMappingsForSequence(dnaSeq, mappings); for (AlignedCodonFrame mapping : seqMappings) - { + { List mappingsFromSequence = mapping .getMappingsFromSequence(dnaSeq); for (Mapping aMapping : mappingsFromSequence) - { + { MapList mapList = aMapping.getMap(); if (mapList.getFromRatio() == 1) - { + { /* * not a dna-to-protein mapping (likely dna-to-cds) */ @@ -1704,15 +1705,15 @@ public class AlignmentUtils if (cdsSeq != null) { if (!foundSeqs.contains(cdsSeq)) - { + { foundSeqs.add(cdsSeq); SequenceI derivedSequence = cdsSeq.deriveSequence(); cdsSeqs.add(derivedSequence); if (!dataset.getSequences().contains(cdsSeq)) - { + { dataset.addSequence(cdsSeq); + } } - } continue; } @@ -1763,7 +1764,7 @@ public class AlignmentUtils * add another mapping from original 'from' range to CDS */ AlignedCodonFrame dnaToCdsMapping = new AlignedCodonFrame(); - MapList dnaToCdsMap = new MapList(mapList.getFromRanges(), + final MapList dnaToCdsMap = new MapList(mapList.getFromRanges(), cdsRange, 1, 1); dnaToCdsMapping.addMap(dnaSeq.getDatasetSequence(), cdsSeqDss, dnaToCdsMap); @@ -1773,6 +1774,13 @@ public class AlignmentUtils } /* + * transfer dna chromosomal loci (if known) to the CDS + * sequence (via the mapping) + */ + final MapList cdsToDnaMap = dnaToCdsMap.getInverse(); + transferGeneLoci(dnaSeq, cdsToDnaMap, cdsSeq); + + /* * add DBRef with mapping from protein to CDS * (this enables Get Cross-References from protein alignment) * This is tricky because we can't have two DBRefs with the @@ -1795,13 +1803,11 @@ public class AlignmentUtils * create a cross-reference from CDS to the source sequence's * primary reference and vice versa */ - String source = primRef.getSource(); String version = primRef.getVersion(); DBRefEntry cdsCrossRef = new DBRefEntry(source, source + ":" + version, primRef.getAccessionId()); - cdsCrossRef.setMap(new Mapping(dnaDss, new MapList(dnaToCdsMap - .getInverse()))); + cdsCrossRef.setMap(new Mapping(dnaDss, new MapList(cdsToDnaMap))); cdsSeqDss.addDBRef(cdsCrossRef); dnaSeq.addDBRef(new DBRefEntry(source, version, cdsSeq @@ -1818,16 +1824,16 @@ public class AlignmentUtils proteinToCdsRef.setMap(new Mapping(cdsSeqDss, cdsToProteinMap .getInverse())); proteinProduct.addDBRef(proteinToCdsRef); - } + } /* * transfer any features on dna that overlap the CDS */ transferFeatures(dnaSeq, cdsSeq, dnaToCdsMap, null, SequenceOntologyI.CDS); + } } } - } AlignmentI cds = new Alignment(cdsSeqs.toArray(new SequenceI[cdsSeqs .size()])); @@ -1837,6 +1843,38 @@ public class AlignmentUtils } /** + * Tries to transfer gene loci (dbref to chromosome positions) from fromSeq to + * toSeq, mediated by the given mapping between the sequences + * + * @param fromSeq + * @param targetToFrom + * Map + * @param targetSeq + */ + protected static void transferGeneLoci(SequenceI fromSeq, + MapList targetToFrom, SequenceI targetSeq) + { + if (targetSeq.getGeneLoci() != null) + { + // already have - don't override + return; + } + GeneLociI fromLoci = fromSeq.getGeneLoci(); + if (fromLoci == null) + { + return; + } + + MapList newMap = targetToFrom.traverse(fromLoci.getMap()); + + if (newMap != null) + { + targetSeq.setGeneLoci(fromLoci.getSpeciesId(), + fromLoci.getAssemblyId(), fromLoci.getChromosomeId(), newMap); + } + } + + /** * A helper method that finds a CDS sequence in the alignment dataset that is * mapped to the given protein sequence, and either is, or has a mapping from, * the given dna sequence. @@ -2004,19 +2042,19 @@ public class AlignmentUtils } /** - * add any DBRefEntrys to cdsSeq from contig that have a Mapping congruent to + * Adds any DBRefEntrys to cdsSeq from contig that have a Mapping congruent to * the given mapping. * * @param cdsSeq * @param contig + * @param proteinProduct * @param mapping - * @return list of DBRefEntrys added. + * @return list of DBRefEntrys added */ - public static List propagateDBRefsToCDS(SequenceI cdsSeq, + protected static List propagateDBRefsToCDS(SequenceI cdsSeq, SequenceI contig, SequenceI proteinProduct, Mapping mapping) { - - // gather direct refs from contig congrent with mapping + // gather direct refs from contig congruent with mapping List direct = new ArrayList(); HashSet directSources = new HashSet(); if (contig.getDBRefs() != null) @@ -2096,7 +2134,7 @@ public class AlignmentUtils * subtypes in the Sequence Ontology) * @param omitting */ - public static int transferFeatures(SequenceI fromSeq, SequenceI toSeq, + protected static int transferFeatures(SequenceI fromSeq, SequenceI toSeq, MapList mapping, String select, String... omitting) { SequenceI copyTo = toSeq; @@ -2179,7 +2217,10 @@ public class AlignmentUtils /** * Returns a mapping from dna to protein by inspecting sequence features of - * type "CDS" on the dna. + * type "CDS" on the dna. A mapping is constructed if the total CDS feature + * length is 3 times the peptide length (optionally after dropping a trailing + * stop codon). This method does not check whether the CDS nucleotide sequence + * translates to the peptide sequence. * * @param dnaSeq * @param proteinSeq @@ -2191,6 +2232,15 @@ public class AlignmentUtils List ranges = findCdsPositions(dnaSeq); int mappedDnaLength = MappingUtils.getLength(ranges); + /* + * if not a whole number of codons, something is wrong, + * abort mapping + */ + if (mappedDnaLength % CODON_LENGTH > 0) + { + return null; + } + int proteinLength = proteinSeq.getLength(); int proteinStart = proteinSeq.getStart(); int proteinEnd = proteinSeq.getEnd(); @@ -2214,8 +2264,12 @@ public class AlignmentUtils if (codesForResidues == (proteinLength + 1)) { // assuming extra codon is for STOP and not in peptide + // todo: check trailing codon is indeed a STOP codon codesForResidues--; + mappedDnaLength -= CODON_LENGTH; + MappingUtils.removeEndPositions(CODON_LENGTH, ranges); } + if (codesForResidues == proteinLength) { proteinRange.add(new int[] { proteinStart, proteinEnd }); @@ -2226,7 +2280,7 @@ public class AlignmentUtils /** * Returns a list of CDS ranges found (as sequence positions base 1), i.e. of - * start/end positions of sequence features of type "CDS" (or a sub-type of + * [start, end] positions of sequence features of type "CDS" (or a sub-type of * CDS in the Sequence Ontology). The ranges are sorted into ascending start * position order, so this method is only valid for linear CDS in the same * sense as the protein product. @@ -2234,7 +2288,7 @@ public class AlignmentUtils * @param dnaSeq * @return */ - public static List findCdsPositions(SequenceI dnaSeq) + protected static List findCdsPositions(SequenceI dnaSeq) { List result = new ArrayList(); @@ -2245,7 +2299,6 @@ public class AlignmentUtils return result; } SequenceFeatures.sortFeatures(sfs, true); - int startPhase = 0; for (SequenceFeature sf : sfs) { @@ -2263,7 +2316,7 @@ public class AlignmentUtils */ int begin = sf.getBegin(); int end = sf.getEnd(); - if (result.isEmpty()) + if (result.isEmpty() && phase > 0) { begin += phase; if (begin > end) @@ -2278,16 +2331,6 @@ public class AlignmentUtils } /* - * remove 'startPhase' positions (usually 0) from the first range - * so we begin at the start of a complete codon - */ - if (!result.isEmpty()) - { - // TODO JAL-2022 correctly model start phase > 0 - result.get(0)[0] += startPhase; - } - - /* * Finally sort ranges by start position. This avoids a dependency on * keeping features in order on the sequence (if they are in order anyway, * the sort will have almost no work to do). The implicit assumption is CDS @@ -2519,7 +2562,10 @@ public class AlignmentUtils /** * Builds a map whose key is position in the protein sequence, and value is a - * list of the base and all variants for each corresponding codon position + * list of the base and all variants for each corresponding codon position. + *

+ * This depends on dna variants being held as a comma-separated list as + * property "alleles" on variant features. * * @param dnaSeq * @param dnaToProtein @@ -2559,18 +2605,26 @@ public class AlignmentUtils } /* - * extract dna variants to a string array + * ignore variant if not a SNP */ String alls = (String) sf.getValue(Gff3Helper.ALLELES); if (alls == null) { continue; // non-SNP VCF variant perhaps - can't process this } + String[] alleles = alls.toUpperCase().split(","); - int i = 0; + boolean isSnp = true; for (String allele : alleles) { - alleles[i++] = allele.trim(); // lose any space characters "A, G" + if (allele.trim().length() > 1) + { + isSnp = false; + } + } + if (!isSnp) + { + continue; } int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol);