From cba69cb983362b41c022745f2af42e54d7e86e5b Mon Sep 17 00:00:00 2001 From: gmungoc Date: Fri, 9 Oct 2015 15:54:38 +0100 Subject: [PATCH] JAL-1499 Gene and Domain parsed to AlignmentAnnotation (currently as well as SequenceFeature) --- src/jalview/io/MegaFile.java | 216 ++++++++++++++++++++++++----- src/jalview/schemes/UserColourScheme.java | 2 +- test/jalview/io/MegaFileTest.java | 54 ++++++++ 3 files changed, 235 insertions(+), 37 deletions(-) diff --git a/src/jalview/io/MegaFile.java b/src/jalview/io/MegaFile.java index 69e7435..b9cc546 100644 --- a/src/jalview/io/MegaFile.java +++ b/src/jalview/io/MegaFile.java @@ -24,8 +24,10 @@ import jalview.datamodel.Annotation; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; +import jalview.schemes.UserColourScheme; import jalview.util.Comparison; +import java.awt.Color; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; @@ -49,15 +51,20 @@ import java.util.Vector; * http://www.megasoftware.net/manual.pdf
* Limitations: * * * @see http://www.megasoftware.net/ */ public class MegaFile extends AlignFile { - private static final String MEGA_ANNOTATION_LABEL = "MEGA Label"; + private static final String MEGA = "MEGA"; + + private static final String MEGA_ANNOTATION_LABEL = MEGA + " Label"; + + private static final String MEGA_ANNOTATION_GENE = MEGA + " Gene"; + + private static final String MEGA_ANNOTATION_DOMAIN = MEGA + " Domain"; private static final char UNDERSCORE = '_'; @@ -97,7 +104,7 @@ public class MegaFile extends AlignFile private static final String EQUALS = "="; - private static final String MEGA_ID = HASHSIGN + "MEGA"; + private static final String MEGA_ID = HASHSIGN + MEGA; private static final String TITLE = "Title"; @@ -113,6 +120,8 @@ public class MegaFile extends AlignFile private static final String CODONSTART = "CodonStart"; + private static final String DOMAINEND = "domainend"; + private static final String LABEL = "Label"; /* @@ -190,21 +199,33 @@ public class MegaFile extends AlignFile // current Gene if any we are parsing private String currentGene; - // start residue (base 1) per sequence of current gene - Map geneStart; + // start position in alignment (base 0) of current Gene + private int currentGeneStartCol; + + // start residue (base 1) per sequence of current Gene + private Map geneStart; // current Domain if any we are parsing private String currentDomain; - // start residue (base 1) per sequence of current domain - Map domainStart; + // start position in alignment (base 0) of current Domain + private int currentDomainStartCol; + + // start residue (base 1) per sequence of current Domain + private Map domainStart; // map of SequenceFeature's by sequence id - Map> sequenceFeatures; + private Map> sequenceFeatures; // each !Label line character becomes an Annotation (except underscores) List labelAnnotations; + // records any declared Gene positions (including null values) + List geneAnnotations; + + // records any declared Domain 
positions (including null values) + List domainAnnotations; + public MegaFile() { } @@ -231,6 +252,10 @@ public class MegaFile extends AlignFile domainStart = new HashMap(); residuesRead = new HashMap(); labelAnnotations = new ArrayList(); + geneAnnotations = new ArrayList(); + domainAnnotations = new ArrayList(); + currentDomainStartCol = -1; + currentGeneStartCol = -1; /* * Read and process MEGA and Title/Format/Description headers if present. @@ -249,7 +274,6 @@ public class MegaFile extends AlignFile */ currentSequenceId = ""; - boolean annotationAdded = false; while (dataLine != null) { dataLine = dataLine.trim(); @@ -264,7 +288,7 @@ public class MegaFile extends AlignFile } else if (upperCased.startsWith(BANG + LABEL.toUpperCase())) { - annotationAdded |= parseLabel(dataLine); + parseLabel(dataLine); } else { @@ -287,33 +311,60 @@ public class MegaFile extends AlignFile createFeature(GENE, currentGene, geneStart); createFeature(DOMAIN, currentDomain, domainStart); + extendAnnotation(geneAnnotations, currentGene, currentGeneStartCol); + extendAnnotation(domainAnnotations, currentDomain, + currentDomainStartCol); + // remember the (longest) line length read in, so we can output the same setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine)); deriveSequencesAndFeatures(); - if (annotationAdded) - { - deriveAnnotations(); - } + deriveAnnotations(); } /** - * If we parsed !Label statements into a list of Annotation objects, create an - * AlignmentAnnotation + * Create AlignmentAnnotation for Label, Gene and Domain (provided at least + * one non-null annotation is present) */ protected void deriveAnnotations() { - if (this.labelAnnotations.size() > 0) + deriveAnnotation(this.labelAnnotations, MEGA_ANNOTATION_LABEL); + deriveAnnotation(this.geneAnnotations, MEGA_ANNOTATION_GENE); + deriveAnnotation(this.domainAnnotations, MEGA_ANNOTATION_DOMAIN); + } + + /** + * Create and add an AlignmentAnnotation (provided at least one non-null + * annotation is 
present) + * + * @param anns + * @param label + */ + protected void deriveAnnotation(List anns, String label) + { + if (anns.size() > 0 && hasNonNullEntry(anns)) { - Annotation[] anns = labelAnnotations - .toArray(new Annotation[labelAnnotations.size()]); - AlignmentAnnotation aa = new AlignmentAnnotation(MEGA_ANNOTATION_LABEL, "", - anns); + Annotation[] annotationArray = anns.toArray(new Annotation[anns + .size()]); + AlignmentAnnotation aa = new AlignmentAnnotation(label, "", + annotationArray); this.annotations.add(aa); } } + protected static boolean hasNonNullEntry(List l) + { + for (Object o : l) + { + if (o != null) + { + return true; + } + } + return false; + } + /** * Parse a !Label line. This contains a single character per position (column) * of the alignment block above. An underscore character represents no label. @@ -363,17 +414,36 @@ public class MegaFile extends AlignFile { this.firstDataBlockRead = true; - /* - * append null annotations to keep the annotations the same length as the - * sequences (in case some blocks have !Label lines and some don't) - */ + padAnnotations(labelAnnotations); + } - int sequenceLength = seqData.isEmpty() ? 0 : seqData.values() - .iterator().next().length(); - int annotationsToAdd = sequenceLength - labelAnnotations.size(); + /** + * Append null annotations to keep the annotations list the same length as the + * sequences. This ensures that when the list is converted to an array it is + * correctly aligned with the alignment columns. It is needed when there are + * gaps in declared 'annotations' in a MEGA file, such as lines with no !Label + * statement, or regions between marked genes or domains. 
+ * + * @param anns + */ + protected void padAnnotations(List anns) + { + addNullAnnotations(anns, getAlignmentWidth()); + } + + /** + * Append null annotations for positions up to (and excluding) the given end + * column (base 0) + * + * @param anns + * @param upTo + */ + protected void addNullAnnotations(List anns, int upTo) + { + int annotationsToAdd = upTo - anns.size(); for (int i = 0; i < annotationsToAdd; i++) { - labelAnnotations.add(null); + anns.add(null); } } @@ -459,8 +529,8 @@ public class MegaFile extends AlignFile String property, String codonStart) { /* - * the order of processing below ensures that we correctly capture where a - * domain is in the context of an enclosing gene + * the order of processing below ensures that we correctly handle a domain + * in the context of an enclosing gene */ processDomainEnd(domain, gene, property); @@ -482,7 +552,7 @@ public class MegaFile extends AlignFile */ protected void processDomainStart(String domain, String property) { - if ("domainend".equalsIgnoreCase(property)) + if (DOMAINEND.equalsIgnoreCase(property)) { currentDomain = null; return; @@ -492,12 +562,26 @@ public class MegaFile extends AlignFile { String verboseDomain = makeVerboseDomainName(domain, property); startSequenceFeature(domainStart); + currentDomainStartCol = getAlignmentWidth(); currentDomain = verboseDomain; } } /** + * Returns the width of alignment parsed so far. Note we assume (as does MEGA) + * that all sequences are the same length, so we can just take the length of + * the first one. + * + * @return + */ + protected int getAlignmentWidth() + { + return seqData.isEmpty() ? 
0 : seqData.values() + .iterator().next().length(); + } + + /** * If we have declared a gene, and it is not continuing, start a sequence * feature for it * @@ -508,6 +592,7 @@ public class MegaFile extends AlignFile if (gene != null && !gene.equals(currentGene)) { startSequenceFeature(geneStart); + currentGeneStartCol = getAlignmentWidth(); } currentGene = gene; } @@ -533,11 +618,15 @@ public class MegaFile extends AlignFile if (this.currentDomain != null) { boolean newDomain = !this.currentDomain.equals(verboseDomain); - boolean domainEnded = "domainend".equalsIgnoreCase(property); + boolean domainEnded = DOMAINEND.equalsIgnoreCase(property); if (newDomain || newGene || domainEnded) { createFeature(DOMAIN, currentDomain, domainStart); + // and/or... create annotations for domain + extendAnnotation(domainAnnotations, currentDomain, + currentDomainStartCol); currentDomain = null; + currentDomainStartCol = -1; return true; } } @@ -561,7 +650,10 @@ public class MegaFile extends AlignFile if (this.currentGene != null && !this.currentGene.equals(gene)) { createFeature(GENE, currentGene, geneStart); + // and/or... add annotations for Gene + extendAnnotation(geneAnnotations, currentGene, currentGeneStartCol); currentGene = null; + currentGeneStartCol = -1; created = true; } @@ -569,6 +661,37 @@ public class MegaFile extends AlignFile } /** + * Helper method to add Annotation elements, with the given description and + * starting at the given start column, up to the end of the sequence length + * parsed so far. Null elements are inserted for any skipped columns since the + * last annotation (if any), i.e. positions with no annotation of this type. + * + * @param anns + * @param description + * @param startColumn + * the start column of the annotations to add, or -1 if nothing to + * add + */ + protected void extendAnnotation(List anns, + String description, int startColumn) + { + int alignmentWidth = getAlignmentWidth(); + addNullAnnotations(anns, startColumn == -1 ? 
alignmentWidth + : startColumn); + + int numberToAdd = alignmentWidth - anns.size(); + if (numberToAdd > 0) + { + Color col = description == null ? Color.black : UserColourScheme + .createColourFromName(description); + for (int i = 0; i < numberToAdd; i++) + { + anns.add(new Annotation("X", description, ' ', 0f, col)); + } + } + } + + /** * Makes an expanded descriptive name for Domain if possible e.g. * "Intron1 (Adh Coding)". Currently incorporates the current gene name (if * any) and the Coding/Noncoding property value (if given). @@ -1632,8 +1755,8 @@ public class MegaFile extends AlignFile { int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10; String none = ""; - if (annotations == null || annotations.isEmpty() - || !MEGA_ANNOTATION_LABEL.equals(annotations.get(0).label)) + AlignmentAnnotation ann = findAnnotation(MEGA_ANNOTATION_LABEL); + if (ann == null) { return none; } @@ -1669,6 +1792,27 @@ public class MegaFile extends AlignFile } /** + * Returns the first stored annotation found with the given label, or null + * + * @param annotationLabel + * @return + */ + protected AlignmentAnnotation findAnnotation(String annotationLabel) + { + if (annotations != null) + { + for (AlignmentAnnotation ann : annotations) + { + if (annotationLabel.equals(ann.label)) + { + return ann; + } + } + } + return null; + } + + /** * Flag this file as interleaved or not, based on data format. Throws an * exception if has previously been determined to be otherwise. 
* diff --git a/src/jalview/schemes/UserColourScheme.java b/src/jalview/schemes/UserColourScheme.java index 92989fb..7ca211c 100755 --- a/src/jalview/schemes/UserColourScheme.java +++ b/src/jalview/schemes/UserColourScheme.java @@ -136,7 +136,7 @@ public class UserColourScheme extends ResidueColourScheme } - public Color createColourFromName(String name) + public static Color createColourFromName(String name) { int r, g, b; diff --git a/test/jalview/io/MegaFileTest.java b/test/jalview/io/MegaFileTest.java index bdae11a..40a7c6e 100644 --- a/test/jalview/io/MegaFileTest.java +++ b/test/jalview/io/MegaFileTest.java @@ -685,6 +685,60 @@ public class MegaFileTest verifySequenceFeature(sfs[7], "MEF2A", "Gene", 31, 36); verifySequenceFeature(sfs[8], "BindingSite", "Domain", 37, 42); } + + /* + * verify gene and domain alignment annotations + */ + assertEquals(2, testee.annotations.size()); + AlignmentAnnotation ann = testee.annotations.get(0); + assertEquals("MEGA Gene", ann.label); + assertEquals(42, ann.annotations.length); + verifyAnnotation(ann, 0, 6, null); + verifyAnnotation(ann, 6, 24, "Adh"); + verifyAnnotation(ann, 24, 30, "Opsin"); + verifyAnnotation(ann, 30, 36, "MEF2A"); + verifyAnnotation(ann, 37, 42, null); + + ann = testee.annotations.get(1); + assertEquals("MEGA Domain", ann.label); + assertEquals(42, ann.annotations.length); + verifyAnnotation(ann, 0, 6, null); + verifyAnnotation(ann, 6, 12, "Exon1 (Adh Coding)"); + verifyAnnotation(ann, 12, 18, "Intron1 (Adh Noncoding)"); + verifyAnnotation(ann, 19, 24, "Exon2 (Adh Coding)"); + verifyAnnotation(ann, 25, 30, "Intron1 (Opsin Noncoding)"); + verifyAnnotation(ann, 31, 36, "Exon1 (MEF2A Coding)"); + verifyAnnotation(ann, 37, 42, "BindingSite"); + + } + + /** + * Helper method to verify a range of annotation positions all have the given + * description + * + * @param ann + * array of annotations to check + * @param from + * start index to check + * @param to + * end index to check (exclusive) + * @param 
description + * value to assert + */ + protected void verifyAnnotation(AlignmentAnnotation ann, int from, + int to, String description) + { + for (int pos = from; pos < to; pos++) + { + if (description == null) + { + assertNull(ann.annotations[pos]); + } + else + { + assertEquals(description, ann.annotations[pos].description); + } + } } /** -- 1.7.10.2