From 4b24105fbfd942b6ec58a407b9dc68a331ba13d7 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Thu, 28 Sep 2017 11:21:59 +0100 Subject: [PATCH] JAL-1796 GeneLociI as distinguished DBRefEntry --- src/jalview/datamodel/DBRefEntry.java | 25 ++++++++- src/jalview/datamodel/GeneLoci.java | 46 ----------------- src/jalview/datamodel/GeneLociI.java | 38 ++++++++++++++ src/jalview/datamodel/Sequence.java | 83 ++++++++++++++++-------------- src/jalview/datamodel/SequenceI.java | 6 ++- src/jalview/ext/ensembl/EnsemblGene.java | 63 +++++++++++++++++------ src/jalview/io/vcf/VCFLoader.java | 16 +++--- test/jalview/io/vcf/VCFLoaderTest.java | 36 ++++++++----- 8 files changed, 190 insertions(+), 123 deletions(-) delete mode 100644 src/jalview/datamodel/GeneLoci.java create mode 100644 src/jalview/datamodel/GeneLociI.java diff --git a/src/jalview/datamodel/DBRefEntry.java b/src/jalview/datamodel/DBRefEntry.java index f7837f7..98868ce 100755 --- a/src/jalview/datamodel/DBRefEntry.java +++ b/src/jalview/datamodel/DBRefEntry.java @@ -27,7 +27,20 @@ import java.util.List; public class DBRefEntry implements DBRefEntryI { - String source = "", version = "", accessionId = ""; + /* + * the mapping to chromosome (genome) is held as an instance with + * source = speciesId + * version = assemblyId + * accessionId = "chromosome:" + chromosomeId + * map = mapping from sequence to reference assembly + */ + public static final String CHROMOSOME = "chromosome"; + + String source = ""; + + String version = ""; + + String accessionId = ""; /** * maps from associated sequence to the database sequence's coordinate system @@ -331,4 +344,14 @@ public class DBRefEntry implements DBRefEntryI } return true; } + + /** + * Mappings to chromosome are held with accessionId as "chromosome:id" + * + * @return + */ + public boolean isChromosome() + { + return accessionId != null && accessionId.startsWith(CHROMOSOME + ":"); + } } diff --git a/src/jalview/datamodel/GeneLoci.java b/src/jalview/datamodel/GeneLoci.java deleted file mode 100644 index b4bd642..0000000 --- a/src/jalview/datamodel/GeneLoci.java +++ /dev/null @@ -1,46 +0,0 @@ -package jalview.datamodel; - -import jalview.util.MapList; - -/** - * A data bean to model one or more contiguous regions on one chromosome - */ -public class GeneLoci -{ - /* - * an identifier for the species - */ - public final String species; - - /* - * an identifier for a genome assembly, e.g. GRCh38 - */ - public final String assembly; - - /* - * a chromosome identifier, e.g. "5" or "X" - */ - public final String chromosome; - - /* - * mapping from sequence positions to chromosome locations; - * any regions with start > end are on the reverse strand - */ - public final MapList mapping; - - /** - * Constructor - * - * @param taxon - * @param ref - * @param chrId - * @param map - */ - public GeneLoci(String taxon, String ref, String chrId, MapList map) - { - species = taxon; - assembly = ref; - chromosome = chrId; - mapping = map; - } -} diff --git a/src/jalview/datamodel/GeneLociI.java b/src/jalview/datamodel/GeneLociI.java new file mode 100644 index 0000000..f8c7ec5 --- /dev/null +++ b/src/jalview/datamodel/GeneLociI.java @@ -0,0 +1,38 @@ +package jalview.datamodel; + +import jalview.util.MapList; + +/** + * An interface to model one or more contiguous regions on one chromosome + */ +public interface GeneLociI +{ + /** + * Answers the species identifier + * + * @return + */ + String getSpeciesId(); + + /** + * Answers the reference assembly identifier + * + * @return + */ + String getAssemblyId(); + + /** + * Answers the chromosome identifier e.g. "2", "Y", "II" + * + * @return + */ + String getChromosomeId(); + + /** + * Answers the mapping from sequence to chromosome loci. For a reverse strand + * mapping, the chromosomal ranges will have start > end. + * + * @return + */ + MapList getMap(); +} diff --git a/src/jalview/datamodel/Sequence.java b/src/jalview/datamodel/Sequence.java index d09d845..cd743d1 100755 --- a/src/jalview/datamodel/Sequence.java +++ b/src/jalview/datamodel/Sequence.java @@ -106,8 +106,6 @@ public class Sequence extends ASequence implements SequenceI */ private int changeCount; - private GeneLoci geneLoci; - /** * Creates a new Sequence object. * @@ -656,45 +654,14 @@ public class Sequence extends ASequence implements SequenceI public void setDescription(String desc) { this.description = desc; - parseDescription(); - } - - /** - * Parses and saves fields of an Ensembl-style description e.g. - * chromosome:GRCh38:17:45051610:45109016:1 - */ - protected void parseDescription() - { - if (description == null) - { - return; - } - String[] tokens = description.split(":"); - if (tokens.length == 6 && "chromosome".equals(tokens[0])) { - String ref = tokens[1]; - String chrom = tokens[2]; - try { - int chStart = Integer.parseInt(tokens[3]); - int chEnd = Integer.parseInt(tokens[4]); - boolean forwardStrand = "1".equals(tokens[5]); - String species = ""; // dunno yet! - int[] from = new int[] { start, end }; - int[] to = new int[] { forwardStrand ? chStart : chEnd, - forwardStrand ? chEnd : chStart }; - MapList map = new MapList(from, to, 1, 1); - GeneLoci gl = new GeneLoci(species, ref, chrom, map); - setGeneLoci(gl); - } catch (NumberFormatException e) - { - System.err.println("Bad integers in description " + description); - } - } } @Override - public void setGeneLoci(GeneLoci gl) + public void setGeneLoci(String speciesId, String assemblyId, + String chromosomeId, MapList map) { - geneLoci = gl; + addDBRef(new DBRefEntry(speciesId, assemblyId, DBRefEntry.CHROMOSOME + + ":" + chromosomeId, new Mapping(map))); } /** @@ -703,9 +670,47 @@ public class Sequence extends ASequence implements SequenceI * @return */ @Override - public GeneLoci getGeneLoci() + public GeneLociI getGeneLoci() { - return geneLoci; + DBRefEntry[] refs = getDBRefs(); + if (refs != null) + { + for (final DBRefEntry ref : refs) + { + if (ref.isChromosome()) + { + return new GeneLociI() + { + @Override + public String getSpeciesId() + { + return ref.getSource(); + } + + @Override + public String getAssemblyId() + { + return ref.getVersion(); + } + + @Override + public String getChromosomeId() + { + // strip of "chromosome:" prefix to chrId + return ref.getAccessionId().substring( + DBRefEntry.CHROMOSOME.length() + 1); + } + + @Override + public MapList getMap() + { + return ref.getMap().getMap(); + } + }; + } + } + } + return null; } /** diff --git a/src/jalview/datamodel/SequenceI.java b/src/jalview/datamodel/SequenceI.java index 7cbb730..03fc545 100755 --- a/src/jalview/datamodel/SequenceI.java +++ b/src/jalview/datamodel/SequenceI.java @@ -21,6 +21,7 @@ package jalview.datamodel; import jalview.datamodel.features.SequenceFeaturesI; +import jalview.util.MapList; import java.util.BitSet; import java.util.List; @@ -535,7 +536,8 @@ public interface SequenceI extends ASequenceI */ public int replace(char c1, char c2); - public abstract GeneLoci getGeneLoci(); + GeneLociI getGeneLoci(); - public abstract void setGeneLoci(GeneLoci gl); + void setGeneLoci(String speciesId, String assemblyId, + String chromosomeId, MapList map); } diff --git a/src/jalview/ext/ensembl/EnsemblGene.java b/src/jalview/ext/ensembl/EnsemblGene.java index 5e970b1..bc914e5 100644 --- a/src/jalview/ext/ensembl/EnsemblGene.java +++ b/src/jalview/ext/ensembl/EnsemblGene.java @@ -23,7 +23,8 @@ package jalview.ext.ensembl; import jalview.api.FeatureColourI; import jalview.api.FeatureSettingsModelI; import jalview.datamodel.AlignmentI; -import jalview.datamodel.GeneLoci; +import jalview.datamodel.DBRefEntry; +import jalview.datamodel.GeneLociI; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; @@ -145,6 +146,9 @@ public class EnsemblGene extends EnsemblSeqProxy { continue; } + + parseChromosomeLocations(geneAlignment); + if (geneAlignment.getHeight() == 1) { getTranscripts(geneAlignment, geneId); @@ -162,6 +166,45 @@ public class EnsemblGene extends EnsemblSeqProxy } /** + * Parses and saves fields of an Ensembl-style description e.g. + * chromosome:GRCh38:17:45051610:45109016:1 + * + * @param alignment + */ + private void parseChromosomeLocations(AlignmentI alignment) + { + for (SequenceI seq : alignment.getSequences()) + { + String description = seq.getDescription(); + if (description == null) + { + continue; + } + String[] tokens = description.split(":"); + if (tokens.length == 6 && tokens[0].startsWith(DBRefEntry.CHROMOSOME)) + { + String ref = tokens[1]; + String chrom = tokens[2]; + try + { + int chStart = Integer.parseInt(tokens[3]); + int chEnd = Integer.parseInt(tokens[4]); + boolean forwardStrand = "1".equals(tokens[5]); + String species = ""; // dunno yet! + int[] from = new int[] { seq.getStart(), seq.getEnd() }; + int[] to = new int[] { forwardStrand ? chStart : chEnd, + forwardStrand ? chEnd : chStart }; + MapList map = new MapList(from, to, 1, 1); + seq.setGeneLoci(species, ref, chrom, map); + } catch (NumberFormatException e) + { + System.err.println("Bad integers in description " + description); + } + } + } + } + + /** * Converts a query, which may contain one or more gene or transcript * identifiers, into a non-redundant list of gene identifiers. * @@ -429,22 +472,13 @@ public class EnsemblGene extends EnsemblSeqProxy protected void mapTranscriptToChromosome(SequenceI transcript, SequenceI gene, MapList mapping) { - GeneLoci loci = gene.getGeneLoci(); + GeneLociI loci = gene.getGeneLoci(); if (loci == null) { return; } - /* - * patch to ensure gene to chromosome mapping is complete - * (in case created before gene length was known) - */ - MapList geneMapping = loci.mapping; - if (geneMapping.getFromRanges().get(0)[1] == 0) - { - geneMapping.getFromRanges().get(0)[0] = gene.getStart(); - geneMapping.getFromRanges().get(0)[1] = gene.getEnd(); - } + MapList geneMapping = loci.getMap(); List exons = mapping.getFromRanges(); List transcriptLoci = new ArrayList<>(); @@ -457,10 +491,9 @@ public class EnsemblGene extends EnsemblSeqProxy List transcriptRange = Arrays.asList(new int[] { transcript.getStart(), transcript.getEnd() }); MapList mapList = new MapList(transcriptRange, transcriptLoci, 1, 1); - GeneLoci gl = new GeneLoci(loci.species, loci.assembly, - loci.chromosome, mapList); - transcript.setGeneLoci(gl); + transcript.setGeneLoci(loci.getSpeciesId(), loci.getAssemblyId(), + loci.getChromosomeId(), mapList); } /** diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index 6c4526f..78d2ad5 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -11,7 +11,7 @@ import jalview.analysis.Dna; import jalview.api.AlignViewControllerGuiI; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; -import jalview.datamodel.GeneLoci; +import jalview.datamodel.GeneLociI; import jalview.datamodel.Mapping; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; @@ -199,13 +199,13 @@ public class VCFLoader boolean isVcfRefGrch37) { int count = 0; - GeneLoci seqCoords = seq.getGeneLoci(); + GeneLociI seqCoords = seq.getGeneLoci(); if (seqCoords == null) { return 0; } - List seqChromosomalContigs = seqCoords.mapping.getToRanges(); + List seqChromosomalContigs = seqCoords.getMap().getToRanges(); for (int[] range : seqChromosomalContigs) { count += addVcfVariants(seq, reader, range, isVcfRefGrch37); @@ -231,11 +231,11 @@ public class VCFLoader protected int addVcfVariants(SequenceI seq, VCFReader reader, int[] range, boolean isVcfRefGrch37) { - GeneLoci seqCoords = seq.getGeneLoci(); + GeneLociI seqCoords = seq.getGeneLoci(); - String chromosome = seqCoords.chromosome; - String seqRef = seqCoords.assembly; - String species = seqCoords.species; + String chromosome = seqCoords.getChromosomeId(); + String seqRef = seqCoords.getAssemblyId(); + String species = seqCoords.getSpeciesId(); // TODO handle species properly if ("".equals(species)) @@ -273,7 +273,7 @@ public class VCFLoader * (convert a reverse strand range to forwards) */ int count = 0; - MapList mapping = seqCoords.mapping; + MapList mapping = seqCoords.getMap(); int fromLocus = Math.min(range[0], range[1]); int toLocus = Math.max(range[0], range[1]); diff --git a/test/jalview/io/vcf/VCFLoaderTest.java b/test/jalview/io/vcf/VCFLoaderTest.java index cdfe298..7379ecd 100644 --- a/test/jalview/io/vcf/VCFLoaderTest.java +++ b/test/jalview/io/vcf/VCFLoaderTest.java @@ -4,7 +4,6 @@ import static org.testng.Assert.assertEquals; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; -import jalview.datamodel.GeneLoci; import jalview.datamodel.Mapping; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; @@ -100,8 +99,15 @@ public class VCFLoaderTest * verify variant feature(s) computed and added to protein * first codon AGC varies to ACC giving S/T */ - SequenceI peptide = al.getSequenceAt(1) - .getDBRefs()[0].getMap().getTo(); + DBRefEntry[] dbRefs = al.getSequenceAt(1).getDBRefs(); + SequenceI peptide = null; + for (DBRefEntry dbref : dbRefs) + { + if (dbref.getMap().getMap().getFromRatio() == 3) + { + peptide = dbref.getMap().getTo(); + } + } List proteinFeatures = peptide.getSequenceFeatures(); assertEquals(proteinFeatures.size(), 1); sf = proteinFeatures.get(0); @@ -143,8 +149,7 @@ public class VCFLoaderTest SequenceI gene1 = alignment.getSequenceAt(0); int[] to = new int[] { 45051610, 45051634 }; int[] from = new int[] { gene1.getStart(), gene1.getEnd() }; - gene1.setGeneLoci(new GeneLoci("human", "GRCh38", "17", new MapList( - from, to, 1, 1))); + gene1.setGeneLoci("human", "GRCh38", "17", new MapList(from, to, 1, 1)); /* * map 'transcript1' to chromosome via 'gene1' @@ -154,8 +159,8 @@ public class VCFLoaderTest to = new int[] { 45051612, 45051619, 45051624, 45051633 }; SequenceI transcript1 = alignment.getSequenceAt(1); from = new int[] { transcript1.getStart(), transcript1.getEnd() }; - transcript1.setGeneLoci(new GeneLoci("human", "GRCh38", "17", - new MapList(from, to, 1, 1))); + transcript1.setGeneLoci("human", "GRCh38", "17", new MapList(from, to, + 1, 1)); /* * map gene2 to chromosome reverse strand @@ -163,8 +168,7 @@ public class VCFLoaderTest SequenceI gene2 = alignment.getSequenceAt(2); to = new int[] { 45051634, 45051610 }; from = new int[] { gene2.getStart(), gene2.getEnd() }; - gene2.setGeneLoci(new GeneLoci("human", "GRCh38", "17", new MapList( - from, to, 1, 1))); + gene2.setGeneLoci("human", "GRCh38", "17", new MapList(from, to, 1, 1)); /* * map 'transcript2' to chromosome via 'gene2' @@ -174,8 +178,8 @@ public class VCFLoaderTest to = new int[] { 45051633, 45051624, 45051619, 45051612 }; SequenceI transcript2 = alignment.getSequenceAt(3); from = new int[] { transcript2.getStart(), transcript2.getEnd() }; - transcript2.setGeneLoci(new GeneLoci("human", "GRCh38", "17", - new MapList(from, to, 1, 1))); + transcript2.setGeneLoci("human", "GRCh38", "17", new MapList(from, to, + 1, 1)); /* * add a protein product as a DBRef on transcript1 @@ -262,7 +266,15 @@ public class VCFLoaderTest * verify variant feature(s) computed and added to protein * last codon GCT varies to GGT giving A/G in the last peptide position */ - SequenceI peptide = al.getSequenceAt(3).getDBRefs()[0].getMap().getTo(); + DBRefEntry[] dbRefs = al.getSequenceAt(3).getDBRefs(); + SequenceI peptide = null; + for (DBRefEntry dbref : dbRefs) + { + if (dbref.getMap().getMap().getFromRatio() == 3) + { + peptide = dbref.getMap().getTo(); + } + } List proteinFeatures = peptide.getSequenceFeatures(); assertEquals(proteinFeatures.size(), 1); sf = proteinFeatures.get(0); -- 1.7.10.2