X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fanalysis%2FAlignmentUtils.java;h=8e0335fee4a161b0414377a99fe60992228f6f12;hb=5c6564f903f75960af960720a8635ab8709afc37;hp=4b4e2a7a767ea7d90aebb072b2248135ffd525ff;hpb=b2eb64f196223039a95348a4026f0453693ee0e7;p=jalview.git diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index 4b4e2a7..8e0335f 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -22,6 +22,7 @@ package jalview.analysis; import static jalview.io.gff.GffConstants.CLINICAL_SIGNIFICANCE; +import jalview.commands.RemoveGapColCommand; import jalview.datamodel.AlignedCodon; import jalview.datamodel.AlignedCodonFrame; import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping; @@ -1107,7 +1108,7 @@ public class AlignmentUtils SequenceI prot = mapping.findAlignedSequence(dnaSeq, protein); if (prot != null) { - Mapping seqMap = mapping.getMappingForSequence(dnaSeq, false); + Mapping seqMap = mapping.getMappingForSequence(dnaSeq); addCodonPositions(dnaSeq, prot, protein.getGapCharacter(), seqMap, alignedCodons); unmappedProtein.remove(prot); @@ -1746,8 +1747,10 @@ public class AlignmentUtils /* * add a mapping from CDS to the (unchanged) mapped to range */ - List cdsRange = Collections.singletonList(new int[] { 1, - cdsSeq.getLength() }); + List cdsRange = Collections + .singletonList(new int[] + { cdsSeq.getStart(), + cdsSeq.getLength() + cdsSeq.getStart() - 1 }); MapList cdsToProteinMap = new MapList(cdsRange, mapList.getToRanges(), mapList.getFromRatio(), mapList.getToRatio()); @@ -1984,39 +1987,61 @@ public class AlignmentUtils static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping, AlignmentI dataset) { - char[] seqChars = seq.getSequence(); - List fromRanges = mapping.getMap().getFromRanges(); - int cdsWidth = MappingUtils.getLength(fromRanges); - char[] newSeqChars = new char[cdsWidth]; + /* + * construct CDS sequence name as "CDS|" with 'from id' held in the mapping + * if set (e.g. EMBL protein_id), else sequence name appended + */ + String mapFromId = mapping.getMappedFromId(); + final String seqId = "CDS|" + + (mapFromId != null ? mapFromId : seq.getName()); + + SequenceI newSeq = null; - int newPos = 0; - for (int[] range : fromRanges) + final MapList maplist = mapping.getMap(); + if (maplist.isContiguous() && maplist.isFromForwardStrand()) { - if (range[0] <= range[1]) - { - // forward strand mapping - just copy the range - int length = range[1] - range[0] + 1; - System.arraycopy(seqChars, range[0] - 1, newSeqChars, newPos, - length); - newPos += length; - } - else + /* + * just a subsequence, keep same dataset sequence + */ + int start = maplist.getFromLowest(); + int end = maplist.getFromHighest(); + newSeq = seq.getSubSequence(start - 1, end); + newSeq.setName(seqId); + } + else + { + /* + * construct by splicing mapped from ranges + */ + char[] seqChars = seq.getSequence(); + List fromRanges = maplist.getFromRanges(); + int cdsWidth = MappingUtils.getLength(fromRanges); + char[] newSeqChars = new char[cdsWidth]; + + int newPos = 0; + for (int[] range : fromRanges) { - // reverse strand mapping - copy and complement one by one - for (int i = range[0]; i >= range[1]; i--) + if (range[0] <= range[1]) + { + // forward strand mapping - just copy the range + int length = range[1] - range[0] + 1; + System.arraycopy(seqChars, range[0] - 1, newSeqChars, newPos, + length); + newPos += length; + } + else { - newSeqChars[newPos++] = Dna.getComplement(seqChars[i - 1]); + // reverse strand mapping - copy and complement one by one + for (int i = range[0]; i >= range[1]; i--) + { + newSeqChars[newPos++] = Dna.getComplement(seqChars[i - 1]); + } } } + + newSeq = new Sequence(seqId, newSeqChars, 1, newPos); } - /* - * assign 'from id' held in the mapping if set (e.g. EMBL protein_id), - * else generate a sequence name - */ - String mapFromId = mapping.getMappedFromId(); - String seqId = "CDS|" + (mapFromId != null ? mapFromId : seq.getName()); - SequenceI newSeq = new Sequence(seqId, newSeqChars, 1, newPos); if (dataset != null) { SequenceI[] matches = dataset.findSequenceMatch(newSeq.getName()); @@ -2159,6 +2184,10 @@ public class AlignmentUtils { copyTo = copyTo.getDatasetSequence(); } + if (fromSeq == copyTo || fromSeq.getDatasetSequence() == copyTo) + { + return 0; // shared dataset sequence + } /* * get features, optionally restricted by an ontology term @@ -2412,17 +2441,17 @@ public class AlignmentUtils } /** - * Computes non-synonymous peptide variants from codon variants and adds them - * as sequence_variant features on the protein sequence (one feature per - * allele variant). Selected attributes (variant id, clinical significance) - * are copied over to the new features. + * Computes non-synonymous peptide variants from codon variants and adds them as + * sequence_variant features on the protein sequence (one feature per allele + * variant). Selected attributes (variant id, clinical significance) are copied + * over to the new features. * * @param peptide - * the protein sequence + * the protein dataset (ungapped) sequence * @param peptidePos - * the position to compute peptide variants for + * the position to compute peptide variants for * @param codonVariants - * a list of dna variants per codon position + * a list of dna variants per codon position * @return the number of features added */ static int computePeptideVariants(SequenceI peptide, int peptidePos, @@ -2880,10 +2909,10 @@ public class AlignmentUtils * true; else returns false * * @param unaligned - * - sequences to be aligned based on aligned + * - sequences to be aligned based on aligned * @param aligned - * - 'guide' alignment containing sequences derived from same dataset - * as unaligned + * - 'guide' alignment containing sequences derived from same + * dataset as unaligned * @return */ static boolean alignAsSameSequences(AlignmentI unaligned, @@ -2907,15 +2936,22 @@ public class AlignmentUtils } /* - * first pass - check whether all sequences to be aligned share a dataset - * sequence with an aligned sequence + * first pass - check whether all sequences to be aligned share a + * dataset sequence with an aligned sequence; also note the leftmost + * ungapped column from which to copy */ + int leftmost = Integer.MAX_VALUE; for (SequenceI seq : unaligned.getSequences()) { - if (!alignedDatasets.containsKey(seq.getDatasetSequence())) + final SequenceI ds = seq.getDatasetSequence(); + if (!alignedDatasets.containsKey(ds)) { return false; } + SequenceI alignedSeq = alignedDatasets.get(ds) + .get(0); + int startCol = alignedSeq.findIndex(seq.getStart()); // 1.. + leftmost = Math.min(leftmost, startCol); } /* @@ -2923,13 +2959,25 @@ public class AlignmentUtils * heuristic rule: pair off sequences in order for the case where * more than one shares the same dataset sequence */ + final char gapCharacter = aligned.getGapCharacter(); for (SequenceI seq : unaligned.getSequences()) { List alignedSequences = alignedDatasets .get(seq.getDatasetSequence()); - // TODO: getSequenceAsString() will be deprecated in the future - // TODO: need to leave to SequenceI implementor to update gaps - seq.setSequence(alignedSequences.get(0).getSequenceAsString()); + SequenceI alignedSeq = alignedSequences.get(0); + + /* + * gap fill for leading (5') UTR if any + */ + // TODO this copies intron columns - wrong! + int startCol = alignedSeq.findIndex(seq.getStart()); // 1.. + int endCol = alignedSeq.findIndex(seq.getEnd()); + char[] seqchars = new char[endCol - leftmost + 1]; + Arrays.fill(seqchars, gapCharacter); + char[] toCopy = alignedSeq.getSequence(startCol - 1, endCol); + System.arraycopy(toCopy, 0, seqchars, startCol - leftmost, + toCopy.length); + seq.setSequence(String.valueOf(seqchars)); if (alignedSequences.size() > 0) { // pop off aligned sequences (except the last one) @@ -2937,6 +2985,12 @@ public class AlignmentUtils } } + /* + * finally remove gapped columns (e.g. introns) + */ + new RemoveGapColCommand("", unaligned.getSequencesArray(), 0, + unaligned.getWidth() - 1, unaligned); + return true; }