From 933ca77e913c56fc4c71cc0fe534df208bfd742d Mon Sep 17 00:00:00 2001 From: gmungoc Date: Mon, 17 Jun 2019 13:18:44 +0100 Subject: [PATCH] JAL-3187 update alignAs code and unit tests for 'shared dataset' case --- src/jalview/analysis/AlignmentUtils.java | 34 +++++-- test/jalview/analysis/AlignmentUtilsTests.java | 117 ++++++++++++++---------- 2 files changed, 92 insertions(+), 59 deletions(-) diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index 77cc4d6..8e0335f 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -22,6 +22,7 @@ package jalview.analysis; import static jalview.io.gff.GffConstants.CLINICAL_SIGNIFICANCE; +import jalview.commands.RemoveGapColCommand; import jalview.datamodel.AlignedCodon; import jalview.datamodel.AlignedCodonFrame; import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping; @@ -2440,17 +2441,17 @@ public class AlignmentUtils } /** - * Computes non-synonymous peptide variants from codon variants and adds them - * as sequence_variant features on the protein sequence (one feature per - * allele variant). Selected attributes (variant id, clinical significance) - * are copied over to the new features. + * Computes non-synonymous peptide variants from codon variants and adds them as + * sequence_variant features on the protein sequence (one feature per allele + * variant). Selected attributes (variant id, clinical significance) are copied + * over to the new features. * * @param peptide - * the protein sequence + * the protein dataset (ungapped) sequence * @param peptidePos - * the position to compute peptide variants for + * the position to compute peptide variants for * @param codonVariants - * a list of dna variants per codon position + * a list of dna variants per codon position * @return the number of features added */ static int computePeptideVariants(SequenceI peptide, int peptidePos, @@ -2837,6 +2838,17 @@ public class AlignmentUtils */ public static int alignAs(AlignmentI unaligned, AlignmentI aligned) { + /* + * easy case - aligning a copy of aligned sequences + */ + if (alignAsSameSequences(unaligned, aligned)) + { + return unaligned.getHeight(); + } + + /* + * fancy case - aligning via mappings between sequences + */ List unmapped = new ArrayList<>(); Map> columnMap = buildMappedColumnsMap( unaligned, aligned, unmapped); @@ -2902,9 +2914,7 @@ public class AlignmentUtils * - 'guide' alignment containing sequences derived from same * dataset as unaligned * @return - * @deprecated probably obsolete and incomplete */ - @Deprecated static boolean alignAsSameSequences(AlignmentI unaligned, AlignmentI aligned) { @@ -2975,6 +2985,12 @@ public class AlignmentUtils } } + /* + * finally remove gapped columns (e.g. introns) + */ + new RemoveGapColCommand("", unaligned.getSequencesArray(), 0, + unaligned.getWidth() - 1, unaligned); + return true; } diff --git a/test/jalview/analysis/AlignmentUtilsTests.java b/test/jalview/analysis/AlignmentUtilsTests.java index 14559dd..dabd3ee 100644 --- a/test/jalview/analysis/AlignmentUtilsTests.java +++ b/test/jalview/analysis/AlignmentUtilsTests.java @@ -2044,41 +2044,48 @@ public class AlignmentUtilsTests * NB setting "id" (as returned by Ensembl for features in JSON format); * previously "ID" (as returned for GFF3 format) */ - SequenceFeature sf1 = new SequenceFeature("sequence_variant", "", 1, 1, + SequenceFeature sf1 = new SequenceFeature("sequence_variant", "", 10, + 10, 0f, ensembl); sf1.setValue("alleles", "A,G"); // AAA -> GAA -> K/E sf1.setValue("id", "var1.125A>G"); - SequenceFeature sf2 = new SequenceFeature("sequence_variant", "", 1, 1, + SequenceFeature sf2 = new SequenceFeature("sequence_variant", "", 10, + 10, 0f, dbSnp); sf2.setValue("alleles", "A,C"); // AAA -> CAA -> K/Q sf2.setValue("id", "var2"); sf2.setValue("clinical_significance", "Dodgy"); - SequenceFeature sf3 = new SequenceFeature("sequence_variant", "", 1, 1, + SequenceFeature sf3 = new SequenceFeature("sequence_variant", "", 11, + 11, 0f, dbSnp); sf3.setValue("alleles", "A,T"); // AAA -> TAA -> stop codon sf3.setValue("id", "var3"); sf3.setValue("clinical_significance", "Bad"); - SequenceFeature sf4 = new SequenceFeature("sequence_variant", "", 3, 3, + SequenceFeature sf4 = new SequenceFeature("sequence_variant", "", 12, + 12, 0f, cosmic); sf4.setValue("alleles", "A,G"); // AAA -> AAG synonymous sf4.setValue("id", "var4"); sf4.setValue("clinical_significance", "None"); - SequenceFeature sf5 = new SequenceFeature("sequence_variant", "", 3, 3, + SequenceFeature sf5 = new SequenceFeature("sequence_variant", "", 12, + 12, 0f, ensembl); sf5.setValue("alleles", "A,T"); // AAA -> AAT -> K/N sf5.setValue("id", "sequence_variant:var5"); // prefix gets stripped off sf5.setValue("clinical_significance", "Benign"); - SequenceFeature sf6 = new SequenceFeature("sequence_variant", "", 6, 6, + SequenceFeature sf6 = new SequenceFeature("sequence_variant", "", 15, + 15, 0f, dbSnp); sf6.setValue("alleles", "T,C"); // TTT -> TTC synonymous sf6.setValue("id", "var6"); - SequenceFeature sf7 = new SequenceFeature("sequence_variant", "", 8, 8, + SequenceFeature sf7 = new SequenceFeature("sequence_variant", "", 17, + 17, 0f, cosmic); sf7.setValue("alleles", "C,A,G"); // CCC -> CAC,CGC -> P/H/R sf7.setValue("id", "var7"); @@ -2103,7 +2110,7 @@ public class AlignmentUtilsTests // codon2Variants.add(new DnaVariant("A")); codon3Variants.add(new DnaVariant("A", sf4)); codon3Variants.add(new DnaVariant("A", sf5)); - AlignmentUtils.computePeptideVariants(peptide, 1, codonVariants); + AlignmentUtils.computePeptideVariants(peptide, 10, codonVariants); /* * compute variants for protein position 2 @@ -2114,7 +2121,7 @@ public class AlignmentUtilsTests codon1Variants.add(new DnaVariant("T")); codon2Variants.add(new DnaVariant("T")); codon3Variants.add(new DnaVariant("T", sf6)); - AlignmentUtils.computePeptideVariants(peptide, 2, codonVariants); + AlignmentUtils.computePeptideVariants(peptide, 11, codonVariants); /* * compute variants for protein position 3 @@ -2125,7 +2132,7 @@ public class AlignmentUtilsTests codon1Variants.add(new DnaVariant("C")); codon2Variants.add(new DnaVariant("C", sf7)); codon3Variants.add(new DnaVariant("C")); - AlignmentUtils.computePeptideVariants(peptide, 3, codonVariants); + AlignmentUtils.computePeptideVariants(peptide, 12, codonVariants); /* * verify added sequence features for @@ -2149,55 +2156,55 @@ public class AlignmentUtilsTests */ // AAA -> AAT -> K/N SequenceFeature sf = sfs.get(0); - assertEquals(1, sf.getBegin()); - assertEquals(1, sf.getEnd()); + assertEquals(10, sf.getBegin()); + assertEquals(10, sf.getEnd()); assertEquals("nonsynonymous_variant", sf.getType()); - assertEquals("p.Lys1Asn", sf.getDescription()); + assertEquals("p.Lys10Asn", sf.getDescription()); assertEquals("var5", sf.getValue("id")); assertEquals("Benign", sf.getValue("clinical_significance")); assertEquals("id=var5;clinical_significance=Benign", sf.getAttributes()); assertEquals(1, sf.links.size()); assertEquals( - "p.Lys1Asn var5|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var5", + "p.Lys10Asn var5|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var5", sf.links.get(0)); assertEquals(ensembl, sf.getFeatureGroup()); // AAA -> CAA -> K/Q sf = sfs.get(1); - assertEquals(1, sf.getBegin()); - assertEquals(1, sf.getEnd()); + assertEquals(10, sf.getBegin()); + assertEquals(10, sf.getEnd()); assertEquals("nonsynonymous_variant", sf.getType()); - assertEquals("p.Lys1Gln", sf.getDescription()); + assertEquals("p.Lys10Gln", sf.getDescription()); assertEquals("var2", sf.getValue("id")); assertEquals("Dodgy", sf.getValue("clinical_significance")); assertEquals("id=var2;clinical_significance=Dodgy", sf.getAttributes()); assertEquals(1, sf.links.size()); assertEquals( - "p.Lys1Gln var2|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var2", + "p.Lys10Gln var2|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var2", sf.links.get(0)); assertEquals(dbSnp, sf.getFeatureGroup()); // AAA -> GAA -> K/E sf = sfs.get(2); - assertEquals(1, sf.getBegin()); - assertEquals(1, sf.getEnd()); + assertEquals(10, sf.getBegin()); + assertEquals(10, sf.getEnd()); assertEquals("nonsynonymous_variant", sf.getType()); - assertEquals("p.Lys1Glu", sf.getDescription()); + assertEquals("p.Lys10Glu", sf.getDescription()); assertEquals("var1.125A>G", sf.getValue("id")); assertNull(sf.getValue("clinical_significance")); assertEquals("id=var1.125A>G", sf.getAttributes()); assertEquals(1, sf.links.size()); // link to variation is urlencoded assertEquals( - "p.Lys1Glu var1.125A>G|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var1.125A%3EG", + "p.Lys10Glu var1.125A>G|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var1.125A%3EG", sf.links.get(0)); assertEquals(ensembl, sf.getFeatureGroup()); // AAA -> TAA -> stop codon sf = sfs.get(3); - assertEquals(1, sf.getBegin()); - assertEquals(1, sf.getEnd()); + assertEquals(10, sf.getBegin()); + assertEquals(10, sf.getEnd()); assertEquals("stop_gained", sf.getType()); assertEquals("Aaa/Taa", sf.getDescription()); assertEquals("var3", sf.getValue("id")); @@ -2211,8 +2218,8 @@ public class AlignmentUtilsTests // AAA -> AAG synonymous sf = sfs.get(4); - assertEquals(1, sf.getBegin()); - assertEquals(1, sf.getEnd()); + assertEquals(10, sf.getBegin()); + assertEquals(10, sf.getEnd()); assertEquals("synonymous_variant", sf.getType()); assertEquals("aaA/aaG", sf.getDescription()); assertEquals("var4", sf.getValue("id")); @@ -2226,8 +2233,8 @@ public class AlignmentUtilsTests // TTT -> TTC synonymous sf = sfs.get(5); - assertEquals(2, sf.getBegin()); - assertEquals(2, sf.getEnd()); + assertEquals(11, sf.getBegin()); + assertEquals(11, sf.getEnd()); assertEquals("synonymous_variant", sf.getType()); assertEquals("ttT/ttC", sf.getDescription()); assertEquals("var6", sf.getValue("id")); @@ -2242,31 +2249,31 @@ public class AlignmentUtilsTests // var7 generates two distinct protein variant features (two alleles) // CCC -> CGC -> P/R sf = sfs.get(6); - assertEquals(3, sf.getBegin()); - assertEquals(3, sf.getEnd()); + assertEquals(12, sf.getBegin()); + assertEquals(12, sf.getEnd()); assertEquals("nonsynonymous_variant", sf.getType()); - assertEquals("p.Pro3Arg", sf.getDescription()); + assertEquals("p.Pro12Arg", sf.getDescription()); assertEquals("var7", sf.getValue("id")); assertEquals("Good", sf.getValue("clinical_significance")); assertEquals("id=var7;clinical_significance=Good", sf.getAttributes()); assertEquals(1, sf.links.size()); assertEquals( - "p.Pro3Arg var7|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var7", + "p.Pro12Arg var7|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var7", sf.links.get(0)); assertEquals(cosmic, sf.getFeatureGroup()); // CCC -> CAC -> P/H sf = sfs.get(7); - assertEquals(3, sf.getBegin()); - assertEquals(3, sf.getEnd()); + assertEquals(12, sf.getBegin()); + assertEquals(12, sf.getEnd()); assertEquals("nonsynonymous_variant", sf.getType()); - assertEquals("p.Pro3His", sf.getDescription()); + assertEquals("p.Pro12His", sf.getDescription()); assertEquals("var7", sf.getValue("id")); assertEquals("Good", sf.getValue("clinical_significance")); assertEquals("id=var7;clinical_significance=Good", sf.getAttributes()); assertEquals(1, sf.links.size()); assertEquals( - "p.Pro3His var7|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var7", + "p.Pro12His var7|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=var7", sf.links.get(0)); assertEquals(cosmic, sf.getFeatureGroup()); } @@ -2591,9 +2598,16 @@ public class AlignmentUtilsTests AlignmentI al2 = new Alignment(new SequenceI[] { dna3, dna4 }); ((Alignment) al2).createDatasetAlignment(); + /* + * alignment removes gapped columns (two internal, two trailing) + */ assertTrue(AlignmentUtils.alignAsSameSequences(al1, al2)); - assertEquals(seq1, al1.getSequenceAt(0).getSequenceAsString()); - assertEquals(seq2, al1.getSequenceAt(1).getSequenceAsString()); + String aligned1 = "-cc-GG-GTTT-aaa"; + assertEquals(aligned1, + al1.getSequenceAt(0).getSequenceAsString()); + String aligned2 = "C--C-Cgg-gtttAAA"; + assertEquals(aligned2, + al1.getSequenceAt(1).getSequenceAsString()); /* * add another sequence to 'aligned' - should still succeed, since @@ -2603,8 +2617,8 @@ public class AlignmentUtilsTests dna5.createDatasetSequence(); al2.addSequence(dna5); assertTrue(AlignmentUtils.alignAsSameSequences(al1, al2)); - assertEquals(seq1, al1.getSequenceAt(0).getSequenceAsString()); - assertEquals(seq2, al1.getSequenceAt(1).getSequenceAsString()); + assertEquals(aligned1, al1.getSequenceAt(0).getSequenceAsString()); + assertEquals(aligned2, al1.getSequenceAt(1).getSequenceAsString()); /* * add another sequence to 'unaligned' - should fail, since now not @@ -2622,15 +2636,15 @@ public class AlignmentUtilsTests { SequenceI dna1 = new Sequence("dna1", "cccGGGTTTaaa"); SequenceI dna2 = new Sequence("dna2", "CCCgggtttAAA"); - SequenceI as1 = dna1.deriveSequence(); - SequenceI as2 = dna1.deriveSequence().getSubSequence(3, 7); - SequenceI as3 = dna2.deriveSequence(); + SequenceI as1 = dna1.deriveSequence(); // cccGGGTTTaaa/1-12 + SequenceI as2 = dna1.deriveSequence().getSubSequence(3, 7); // GGGT/4-7 + SequenceI as3 = dna2.deriveSequence(); // CCCgggtttAAA/1-12 as1.insertCharAt(6, 5, '-'); - String s_as1 = as1.getSequenceAsString(); + assertEquals("cccGGG-----TTTaaa", as1.getSequenceAsString()); as2.insertCharAt(6, 5, '-'); - String s_as2 = as2.getSequenceAsString(); - as3.insertCharAt(6, 5, '-'); - String s_as3 = as3.getSequenceAsString(); + assertEquals("GGGT-----", as2.getSequenceAsString()); + as3.insertCharAt(3, 5, '-'); + assertEquals("CCC-----gggtttAAA", as3.getSequenceAsString()); AlignmentI aligned = new Alignment(new SequenceI[] { as1, as2, as3 }); // why do we need to cast this still ? @@ -2642,10 +2656,13 @@ public class AlignmentUtilsTests uas3 }); ((Alignment) tobealigned).createDatasetAlignment(); + /* + * alignAs lines up dataset sequences and removes empty columns (two) + */ assertTrue(AlignmentUtils.alignAsSameSequences(tobealigned, aligned)); - assertEquals(s_as1, uas1.getSequenceAsString()); - assertEquals(s_as2, uas2.getSequenceAsString()); - assertEquals(s_as3, uas3.getSequenceAsString()); + assertEquals("cccGGG---TTTaaa", uas1.getSequenceAsString()); + assertEquals("GGGT", uas2.getSequenceAsString()); + assertEquals("CCC---gggtttAAA", uas3.getSequenceAsString()); } @Test(groups = { "Functional" }) -- 1.7.10.2