From 2b237dfb6fc6d232af79673700b2f57ae8d5a10f Mon Sep 17 00:00:00 2001 From: gmungoc Date: Wed, 6 Jul 2016 16:34:01 +0100 Subject: [PATCH] JAL-2110 AlignmentUtils.alignCdsAsProtein implemented --- src/jalview/analysis/AlignmentUtils.java | 116 ++++++++++++++++++++++++++--- test/jalview/datamodel/AlignmentTest.java | 9 ++- 2 files changed, 112 insertions(+), 13 deletions(-) diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index 35aa91d..20fcf25 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -885,7 +885,8 @@ public class AlignmentUtils int alignedCount = 0; for (SequenceI dnaSeq : dna.getSequences()) { - if (alignCdsSequenceAsProtein(dnaSeq, protein, mappings)) + if (alignCdsSequenceAsProtein(dnaSeq, protein, mappings, + dna.getGapCharacter())) { alignedCount++; } @@ -894,31 +895,124 @@ public class AlignmentUtils } /** - * Helper method to align (if possible) the dna sequence (cds only) to match - * the alignment of a mapped protein sequence + * Helper method to align (if possible) the dna sequence to match the + * alignment of a mapped protein sequence. This is currently limited to + * handling coding sequence only. * - * @param dnaSeq + * @param cdsSeq * @param protein * @param mappings + * @param gapChar * @return */ - static boolean alignCdsSequenceAsProtein(SequenceI dnaSeq, - AlignmentI protein, List mappings) + static boolean alignCdsSequenceAsProtein(SequenceI cdsSeq, + AlignmentI protein, List mappings, char gapChar) { + SequenceI cdsDss = cdsSeq.getDatasetSequence(); + if (cdsDss == null) + { + System.err + .println("alignCdsSequenceAsProtein needs aligned sequence!"); + return false; + } + List dnaMappings = MappingUtils - .findMappingsForSequence(dnaSeq, mappings); + .findMappingsForSequence(cdsSeq, mappings); for (AlignedCodonFrame mapping : dnaMappings) { - SequenceI peptide = mapping.findAlignedSequence(dnaSeq, protein); + SequenceI peptide = mapping.findAlignedSequence(cdsSeq, protein); + int peptideLength = peptide.getLength(); if (peptide != null) { - Mapping map = mapping.getMappingBetween(dnaSeq, peptide); + Mapping map = mapping.getMappingBetween(cdsSeq, peptide); if (map != null) { + MapList mapList = map.getMap(); + if (map.getTo() == peptide.getDatasetSequence()) + { + mapList = mapList.getInverse(); + } + int cdsLength = cdsDss.getLength(); + int mappedFromLength = MappingUtils.getLength(mapList + .getFromRanges()); + int mappedToLength = MappingUtils + .getLength(mapList.getToRanges()); + boolean addStopCodon = (cdsLength == mappedFromLength * 3 + 3) + || (peptide.getDatasetSequence().getLength() == mappedFromLength - 1); + if (cdsLength != mappedToLength && !addStopCodon) + { + System.err + .println(String + .format("Can't align cds as protein (length mismatch %d/%d): %s", + cdsLength, mappedToLength, + cdsSeq.getName())); + } + /* - * traverse peptide adding gaps or codons to new cds sequence + * pre-fill the aligned cds sequence with gaps */ - MapList mapList = map.getMap(); + char[] alignedCds = new char[peptideLength * 3 + + (addStopCodon ? 3 : 0)]; + Arrays.fill(alignedCds, gapChar); + + /* + * walk over the aligned peptide sequence and insert mapped + * codons for residues in the aligned cds sequence + */ + char[] alignedPeptide = peptide.getSequence(); + char[] nucleotides = cdsDss.getSequence(); + int copiedBases = 0; + int cdsStart = cdsDss.getStart(); + int proteinPos = peptide.getStart() - 1; + int cdsCol = 0; + for (char residue : alignedPeptide) + { + if (Comparison.isGap(residue)) + { + cdsCol += 3; + } + else + { + proteinPos++; + int[] codon = mapList.locateInTo(proteinPos, proteinPos); + if (codon == null) + { + // e.g. incomplete start codon, X in peptide + cdsCol += 3; + } + else + { + for (int j = codon[0]; j <= codon[1]; j++) + { + char mappedBase = nucleotides[j - cdsStart]; + alignedCds[cdsCol++] = mappedBase; + copiedBases++; + } + } + } + } + + /* + * append stop codon if not mapped from protein, + * closing it up to the end of the mapped sequence + */ + if (copiedBases == nucleotides.length - 3) + { + for (int i = alignedCds.length - 1; i >= 0; i--) + { + if (!Comparison.isGap(alignedCds[i])) + { + cdsCol = i + 1; // gap just after end of sequence + break; + } + } + for (int i = nucleotides.length - 3; i < nucleotides.length; i++) + { + alignedCds[cdsCol++] = nucleotides[i]; + } + } + cdsSeq.setSequence(new String(alignedCds)); + return true; } } } diff --git a/test/jalview/datamodel/AlignmentTest.java b/test/jalview/datamodel/AlignmentTest.java index 08a9441..07b8abf 100644 --- a/test/jalview/datamodel/AlignmentTest.java +++ b/test/jalview/datamodel/AlignmentTest.java @@ -217,7 +217,7 @@ public class AlignmentTest * * @throws IOException */ - @Test(groups = { "Functional" }, enabled = false) + @Test(groups = { "Functional" }, enabled = true) // TODO review / update this test after redesign of alignAs method public void testAlignAs_cdnaAsProtein() throws IOException { @@ -243,7 +243,7 @@ public class AlignmentTest * * @throws IOException */ - @Test(groups = { "Functional" }, enabled = false) + @Test(groups = { "Functional" }, enabled = true) // TODO review / update this test after redesign of alignAs method public void testAlignAs_cdnaAsProtein_singleSequence() throws IOException { @@ -315,7 +315,12 @@ public class AlignmentTest acf.addMap(seqFrom, seqTo, ml); } + /* + * not sure whether mappings 'belong' or protein or nucleotide + * alignment, so adding to both ;~) + */ alFrom.addCodonFrame(acf); + alTo.addCodonFrame(acf); } /** -- 1.7.10.2