From b44b679b7790d6f2abe4dfd54e488cb1a461c65b Mon Sep 17 00:00:00 2001 From: gmungoc Date: Mon, 19 Oct 2015 16:46:31 +0100 Subject: [PATCH] JAL-1499 output !Domain and !Gene statements from alignment annotations --- src/jalview/io/MegaFile.java | 281 +++++++++++++++++++++++++++++++------ test/jalview/io/MegaFileTest.java | 110 ++++++++++++++- 2 files changed, 341 insertions(+), 50 deletions(-) diff --git a/src/jalview/io/MegaFile.java b/src/jalview/io/MegaFile.java index d4ec4b4..5a76d07 100644 --- a/src/jalview/io/MegaFile.java +++ b/src/jalview/io/MegaFile.java @@ -1492,70 +1492,234 @@ public class MegaFile extends AlignFile { int maxIdLength = getMaxIdLength(s); int maxSequenceLength = getMaxSequenceLength(s); - int numLines = maxSequenceLength / positionsPerLine + 3; // approx - int numDataBlocks = (maxSequenceLength - 1) / positionsPerLine + 1; int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10; int chunksPerLine = (positionsPerLine + spaceEvery - 1) / spaceEvery; /* * Roughly size a buffer to hold the whole output */ + int numLines = maxSequenceLength / positionsPerLine + 3; // approx StringBuilder sb = new StringBuilder(numLines * (maxIdLength + positionsPerLine + chunksPerLine + 10)); /* * Output as: #Seqid CGT AGC ACT ... or blocks of 10 for peptide */ + AlignmentAnnotation geneAnnotation = findAnnotation(MEGA_ANNOTATION_GENE); + AlignmentAnnotation domainAnnotation = findAnnotation(MEGA_ANNOTATION_DOMAIN); + currentGene = null; + currentDomain = null; int from = 0; - for (int i = 0; i < numDataBlocks; i++) + + while (from < maxSequenceLength) { - sb.append(newline); - boolean first = true; - int advancedBy = 0; - for (SequenceI seq : s) + printGeneOrDomainHeader(sb, from, geneAnnotation, domainAnnotation); + int maxCol = from + positionsPerLine; // exclusive + for (int col = from; col <= maxCol; col++) { - int seqFrom = from; - String seqId = String.format("#%-" + maxIdLength + "s", - seq.getName()); - - /* - * output next line for this sequence - */ - sb.append(seqId); - int lastPos = seqFrom + positionsPerLine; // exclusive - for (int j = 0; j < chunksPerLine; j++) + if (geneOrDomainChanged(col, geneAnnotation, domainAnnotation) + || col == maxCol) { - char[] subSequence = seq.getSequence(seqFrom, - Math.min(lastPos, seqFrom + spaceEvery)); - if (subSequence.length > 0) - { - sb.append(SPACE).append(subSequence); - } - seqFrom += subSequence.length; - if (first) + /* + * print a block of sequences up to [col-1] + */ + sb.append(newline); + boolean first = true; + int advancedBy = 0; + for (SequenceI seq : s) { - // all sequences should be the same length in MEGA - advancedBy += subSequence.length; + int seqFrom = from; + String seqId = String.format("#%-" + maxIdLength + "s", + seq.getName()); + + /* + * output next line for this sequence + */ + sb.append(seqId); + for (int j = 0; j < chunksPerLine; j++) + { + char[] subSequence = seq.getSequence(seqFrom, + Math.min(col, seqFrom + spaceEvery)); + if (subSequence.length > 0) + { + sb.append(SPACE).append(subSequence); + } + seqFrom += subSequence.length; + if (first) + { + // all sequences should be the same length in MEGA + advancedBy += subSequence.length; + } + } + // write last position as a comment + if (writePositionNumbers) + { + sb.append(SPACE).append(COMMENT_START) + .append(String.valueOf(from + advancedBy)) + .append(COMMENT_END); + } + sb.append(newline); + first = false; } + sb.append(printLabel(from, advancedBy, maxIdLength)); + from += advancedBy; + break; } - // write last position as a comment - if (writePositionNumbers) - { - sb.append(SPACE).append(COMMENT_START).append(from + advancedBy) - .append(COMMENT_END); - } - sb.append(newline); - first = false; } - sb.append(printLabel(from, advancedBy, maxIdLength)); - from += advancedBy; } return new String(sb); } /** + * Returns true if we detect a change of gene or domain at the given column + * position. Currently done by inspecting any "MEGA Gene" or "MEGA Domain" + * annotation for the column. + * + * @param column + * @param geneAnnotation + * @param domainAnnotation + * @return + */ + protected boolean geneOrDomainChanged(int column, + AlignmentAnnotation geneAnnotation, + AlignmentAnnotation domainAnnotation) + { + String gene = getGeneFromAnnotation(column, geneAnnotation); + String domain = getDomainFromAnnotation(column, domainAnnotation); + boolean domainEnd = domain != null + && domain.toUpperCase().indexOf(DOMAINEND.toUpperCase()) != -1; + + if ((domainEnd || gene == null) && currentGene != null) + { + return true; + } + else if (gene != null && !gene.equals(currentGene)) + { + return true; + } + // todo: Domain + return false; + } + + /** + * Extracts the name of a domain from MEGA Domain annotation at the given + * column position, if any + * + * @param column + * @param domainAnnotation + * @return + */ + protected static String getDomainFromAnnotation(int column, + AlignmentAnnotation domainAnnotation) + { + Annotation domainAnn = domainAnnotation == null ? null + : (column >= domainAnnotation.annotations.length ? null + : domainAnnotation.annotations[column]); + String domain = domainAnn == null ? null : domainAnn.description; + if (domain != null && domain.indexOf("(") > 0) + { + domain = domain.substring(0, domain.indexOf("(")).trim(); + } + return domain; + } + + /** + * Extracts the name of a gene from MEGA Gene annotation at the given column + * position, if any + * + * @param column + * @param geneAnnotation + * @return + */ + protected static String getGeneFromAnnotation(int column, + AlignmentAnnotation geneAnnotation) + { + Annotation geneAnn = geneAnnotation == null ? null + : (column >= geneAnnotation.annotations.length ? null + : geneAnnotation.annotations[column]); + String gene = geneAnn == null ? null : geneAnn.description; + return gene; + } + + /** + * Appends a !Domain header on change of Domain (or Domain/Gene) annotation. + * If it changes to a null value, appends a !Domain Property=domainend; + * statement + * + * @param sb + * buffer to append to + * @param column + * of alignment being output + * @param geneAnnotation + * "MEGA Gene" annotations + * @param domainAnnotation + * "MEGA Domain" annotations + */ + protected void printGeneOrDomainHeader(StringBuilder sb, int column, + AlignmentAnnotation geneAnnotation, + AlignmentAnnotation domainAnnotation) + { + String gene = getGeneFromAnnotation(column, geneAnnotation); + String domain = getDomainFromAnnotation(column, domainAnnotation); + String property = getPropertyFromAnnotation(column, domainAnnotation); + + if ((gene == null && currentGene != null) + || (gene == null && domain == null && currentDomain != null)) + { + /* + * end of Gene or Domain annotation + */ + sb.append(newline).append(BANG).append(DOMAIN).append(EQUALS) + .append(currentDomain).append(SPACE).append(PROPERTY) + .append(EQUALS).append(DOMAINEND).append(SEMICOLON); + } + else if (gene != null && !gene.equals(currentGene)) + { + /* + * start of a new Gene; output as "!Domain Gene=..." if we have domain + * annotation + */ + if (domain != null) + { + sb.append(newline).append(BANG).append(DOMAIN).append(EQUALS) + .append(domain).append(SPACE).append(GENE).append(EQUALS) + .append(gene); + if (property != null) + { + sb.append(SPACE).append(PROPERTY).append(EQUALS).append(property); + } + sb.append(SEMICOLON); + } + else + { + sb.append(newline).append(BANG).append(GENE).append(EQUALS) + .append(gene).append(SEMICOLON); + } + } + else if (domain != null && !domain.equals(currentGene)) + { + /* + * change of domain within same (or no) gene + */ + sb.append(newline).append(BANG).append(DOMAIN).append(EQUALS) + .append(domain); + if (currentGene != null) + { + sb.append(SPACE).append(GENE).append(EQUALS).append(currentGene); + } + if (property != null) + { + sb.append(SPACE).append(PROPERTY).append(EQUALS).append(property); + } + sb.append(SEMICOLON); + } + currentGene = gene; + currentDomain = domain; + } + + /** * Outputs to string the MEGA header and any other known and relevant * alignment properties * @@ -1645,7 +1809,7 @@ public class MegaFile extends AlignFile } /** - * Get the longest sequence length + * Get the longest sequence length (including gaps) * * @param s * @return @@ -1874,20 +2038,15 @@ public class MegaFile extends AlignFile this.nucleotide = al.isNucleotide(); /* - * if the alignment has a "MEGA" annotation, we'll output its values as - * !Label statements; MEGA only supports one of these + * "MEGA Gene", "MEGA Domain" or "MEGA Label" annotations can be output */ AlignmentAnnotation[] anns = al.getAlignmentAnnotation(); if (anns != null) { + this.annotations = new Vector(); for (AlignmentAnnotation ann : anns) { - if (MEGA_ANNOTATION_LABEL.equals(ann.label)) - { - this.annotations = new Vector(); - annotations.add(ann); - break; - } + annotations.add(ann); } } @@ -1933,4 +2092,34 @@ public class MegaFile extends AlignFile { this.positionsPerLine = p; } + + /** + * Extracts the Coding / Noncoding property of a domain from MEGA Domain + * annotation at the given column position, if present + * + * @param column + * @param domainAnnotation + * @return + */ + protected static String getPropertyFromAnnotation(int column, + AlignmentAnnotation domainAnnotation) + { + /* + * currently depends on parsing "Exon1 (Aspx Coding)" or similar + */ + String property = null; + Annotation domainAnn = domainAnnotation == null ? null + : (column >= domainAnnotation.annotations.length ? null + : domainAnnotation.annotations[column]); + String domain = domainAnn == null ? null : domainAnn.description; + if (domain != null && domain.indexOf("(") > 0) + { + domain = domain.substring(domain.indexOf("(") +1); + if (domain.indexOf(SPACE) > -1 && domain.endsWith(")")) { + property = domain.substring(domain.indexOf(SPACE) + 1, + domain.length() - 1); + } + } + return property; + } } diff --git a/test/jalview/io/MegaFileTest.java b/test/jalview/io/MegaFileTest.java index 62dd3e6..4a3918e 100644 --- a/test/jalview/io/MegaFileTest.java +++ b/test/jalview/io/MegaFileTest.java @@ -8,6 +8,7 @@ import static org.testng.AssertJUnit.fail; import jalview.datamodel.AlignmentAnnotation; import jalview.datamodel.AlignmentI; +import jalview.datamodel.Annotation; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; @@ -234,10 +235,14 @@ public class MegaFileTest // normally output should match input // we cheated here with a number of short input lines // nb don't get Title in output if not calling print(AlignmentI) - String expected = "#MEGA\n\n" + "#U455 ABCDEF [6]\n" - + "#CPZANT MNOPQR [6]\n\n" + "#U455 KLMNOP [12]\n" - + "#CPZANT WXYZGC [12]" - + "\n"; + //@formatter:off + String expected = + "#MEGA\n\n" + + "#U455 ABCDEF [6]\n" + + "#CPZANT MNOPQR [6]\n\n" + + "#U455 KLMNOP [12]\n" + + "#CPZANT WXYZGC [12]\n"; + //@formatter:on assertEquals("Print format wrong", expected, printed); } @@ -917,4 +922,101 @@ public class MegaFileTest assertEquals("Roundtrip didn't match", expected, formatted); } + + /** + * Test (parse and) print of MEGA data with !Gene statements. + * + * @throws IOException + */ + @Test(groups = { "Functional" }) + public void testPrint_genes() throws IOException + { + /* + * to keep the test concise, input data is in the exact format that Jalview + * would output it; the important thing is functional equivalence of input + * and output + */ + //@formatter:off + String data = "#MEGA\n\n"+ + "#Seq1 ABCD [4]\n" + + "#Seq2 MNOP [4]\n\n" + + "!Domain=Exon1 Gene=Adh Property=Coding;\n" + + "#Seq1 EFGHI [9]\n" + + "#Seq2 QRSTU [9]\n\n" + + "!Domain=Intron1 Gene=Adh Property=Noncoding;\n" + + "#Seq1 JK [11]\n" + + "#Seq2 VW [11]\n\n" + + "!Domain=Intron1 Property=domainend;\n" + + "#Seq1 LMN [14]\n" + + "#Seq2 XYZ [14]\n"; + //@formatter:on + MegaFile testee = new MegaFile(data, AppletFormatAdapter.PASTE); + String printed = testee.print(); + assertEquals("Print format wrong", data, printed); + } + + @Test(groups = { "Functional" }) + public void testGetDomainFromAnnotation() + { + Annotation[] anns = new Annotation[5]; + anns[1] = new Annotation("", "Intron1", '0', 0f); + anns[2] = new Annotation("", "Intron2 (Aspx)", '0', 0f); + anns[3] = new Annotation("", "Intron3 (Aspy Coding)", '0', 0f); + anns[4] = new Annotation("", "Intron4 (Coding)", '0', 0f); + AlignmentAnnotation aa = new AlignmentAnnotation("", "", anns); + // no annotations: + assertNull(MegaFile.getDomainFromAnnotation(0, null)); + // null annotation: + assertNull(MegaFile.getDomainFromAnnotation(0, aa)); + // column out of range: + assertNull(MegaFile.getDomainFromAnnotation(5, aa)); + // domain with no Gene or Property: + assertEquals("Intron1", MegaFile.getDomainFromAnnotation(1, aa)); + // domain with Gene but no Property: + assertEquals("Intron2", MegaFile.getDomainFromAnnotation(2, aa)); + // domain with Gene and Property: + assertEquals("Intron3", MegaFile.getDomainFromAnnotation(3, aa)); + // domain with Property and no Gene: + assertEquals("Intron4", MegaFile.getDomainFromAnnotation(4, aa)); + } + + @Test(groups = { "Functional" }) + public void testGetGeneFromAnnotation() + { + Annotation[] anns = new Annotation[3]; + anns[1] = new Annotation("", "Aspx", '0', 0f); + AlignmentAnnotation aa = new AlignmentAnnotation("", "", anns); + // no annotations: + assertNull(MegaFile.getGeneFromAnnotation(0, null)); + // null annotation: + assertNull(MegaFile.getGeneFromAnnotation(0, aa)); + // column out of range: + assertNull(MegaFile.getGeneFromAnnotation(3, aa)); + // gene annotation: + assertEquals("Aspx", MegaFile.getGeneFromAnnotation(1, aa)); + } + + @Test(groups = { "Functional" }) + public void testGetPropertyFromAnnotation() + { + Annotation[] anns = new Annotation[5]; + anns[1] = new Annotation("", "Intron1", '0', 0f); + anns[2] = new Annotation("", "Intron2 (Aspx)", '0', 0f); + anns[3] = new Annotation("", "Intron3 (Aspy Noncoding)", '0', 0f); + anns[4] = new Annotation("", "Exon1 (Aspx Coding)", '0', 0f); + AlignmentAnnotation aa = new AlignmentAnnotation("", "", anns); + // no annotations: + assertNull(MegaFile.getPropertyFromAnnotation(0, null)); + // null annotation: + assertNull(MegaFile.getPropertyFromAnnotation(0, aa)); + // column out of range: + assertNull(MegaFile.getPropertyFromAnnotation(5, aa)); + // domain with no Gene or Property: + assertNull(MegaFile.getPropertyFromAnnotation(1, aa)); + // domain with Gene but no Property: + assertNull(MegaFile.getPropertyFromAnnotation(2, aa)); + // domain with Gene and Property: + assertEquals("Noncoding", MegaFile.getPropertyFromAnnotation(3, aa)); + assertEquals("Coding", MegaFile.getPropertyFromAnnotation(4, aa)); + } } -- 1.7.10.2