import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
+import jalview.schemes.UserColourScheme;
import jalview.util.Comparison;
+import java.awt.Color;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
* http://www.megasoftware.net/manual.pdf <br>
* Limitations:
* <ul>
- * <li>nested comments (marked by [ ]) are accepted but not preserved</li>
- * <li>to be completed</li>
+ * <li>any comments (delimited by [ ]) are ignored and not preserved</li>
* </ul>
*
* @see http://www.megasoftware.net/
*/
public class MegaFile extends AlignFile
{
- private static final String MEGA_ANNOTATION_LABEL = "MEGA Label";
+ private static final String MEGA = "MEGA";
+
+ private static final String MEGA_ANNOTATION_LABEL = MEGA + " Label";
+
+ private static final String MEGA_ANNOTATION_GENE = MEGA + " Gene";
+
+ private static final String MEGA_ANNOTATION_DOMAIN = MEGA + " Domain";
private static final char UNDERSCORE = '_';
private static final String EQUALS = "=";
- private static final String MEGA_ID = HASHSIGN + "MEGA";
+ private static final String MEGA_ID = HASHSIGN + MEGA;
private static final String TITLE = "Title";
private static final String CODONSTART = "CodonStart";
+ private static final String DOMAINEND = "domainend";
+
private static final String LABEL = "Label";
/*
// current Gene if any we are parsing
private String currentGene;
- // start residue (base 1) per sequence of current gene
- Map<String, Integer> geneStart;
+ // start position in alignment (base 0) of current Gene
+ private int currentGeneStartCol;
+
+ // start residue (base 1) per sequence of current Gene
+ private Map<String, Integer> geneStart;
// current Domain if any we are parsing
private String currentDomain;
- // start residue (base 1) per sequence of current domain
- Map<String, Integer> domainStart;
+ // start position in alignment (base 0) of current Domain
+ private int currentDomainStartCol;
+
+ // start residue (base 1) per sequence of current Domain
+ private Map<String, Integer> domainStart;
// map of SequenceFeature's by sequence id
- Map<String, List<SequenceFeature>> sequenceFeatures;
+ private Map<String, List<SequenceFeature>> sequenceFeatures;
// each !Label line character becomes an Annotation (except underscores)
List<Annotation> labelAnnotations;
+ // records any declared Gene positions (including null values)
+ List<Annotation> geneAnnotations;
+
+ // records any declared Domain positions (including null values)
+ List<Annotation> domainAnnotations;
+
public MegaFile()
{
}
domainStart = new HashMap<String, Integer>();
residuesRead = new HashMap<String, Integer>();
labelAnnotations = new ArrayList<Annotation>();
+ geneAnnotations = new ArrayList<Annotation>();
+ domainAnnotations = new ArrayList<Annotation>();
+ currentDomainStartCol = -1;
+ currentGeneStartCol = -1;
/*
* Read and process MEGA and Title/Format/Description headers if present.
*/
currentSequenceId = "";
- boolean annotationAdded = false;
while (dataLine != null)
{
dataLine = dataLine.trim();
}
else if (upperCased.startsWith(BANG + LABEL.toUpperCase()))
{
- annotationAdded |= parseLabel(dataLine);
+ parseLabel(dataLine);
}
else
{
createFeature(GENE, currentGene, geneStart);
createFeature(DOMAIN, currentDomain, domainStart);
+ extendAnnotation(geneAnnotations, currentGene, currentGeneStartCol);
+ extendAnnotation(domainAnnotations, currentDomain,
+ currentDomainStartCol);
+
// remember the (longest) line length read in, so we can output the same
setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));
deriveSequencesAndFeatures();
- if (annotationAdded)
- {
- deriveAnnotations();
- }
+ deriveAnnotations();
}
/**
- * If we parsed !Label statements into a list of Annotation objects, create an
- * AlignmentAnnotation
+ * Creates an AlignmentAnnotation for each of Label, Gene and Domain, in that
+ * order. Each one is only created (and added to this.annotations) if its
+ * list holds at least one non-null annotation; see deriveAnnotation.
*/
protected void deriveAnnotations()
{
- if (this.labelAnnotations.size() > 0)
+ deriveAnnotation(this.labelAnnotations, MEGA_ANNOTATION_LABEL);
+ deriveAnnotation(this.geneAnnotations, MEGA_ANNOTATION_GENE);
+ deriveAnnotation(this.domainAnnotations, MEGA_ANNOTATION_DOMAIN);
+ }
+
+ /**
+ * Creates an AlignmentAnnotation from the given list and adds it to
+ * this.annotations, provided the list is non-empty and holds at least one
+ * non-null annotation. Otherwise nothing is added.
+ *
+ * @param anns
+ *          the annotations to convert to an array; the list may contain
+ *          null entries, which are carried over into the array
+ * @param label
+ *          the label given to the new AlignmentAnnotation (its second
+ *          constructor argument is passed as an empty string)
+ */
+ protected void deriveAnnotation(List<Annotation> anns, String label)
+ {
+ if (anns.size() > 0 && hasNonNullEntry(anns))
{
- Annotation[] anns = labelAnnotations
- .toArray(new Annotation[labelAnnotations.size()]);
- AlignmentAnnotation aa = new AlignmentAnnotation(MEGA_ANNOTATION_LABEL, "",
- anns);
+ Annotation[] annotationArray = anns.toArray(new Annotation[anns
+ .size()]);
+ AlignmentAnnotation aa = new AlignmentAnnotation(label, "",
+ annotationArray);
this.annotations.add(aa);
}
}
+ protected static boolean hasNonNullEntry(List<? extends Object> l)
+ { // answers true if the list holds at least one non-null element
+ for (Object o : l) // the list itself must not be null
+ {
+ if (o != null)
+ {
+ return true; // found one, no need to look further
+ }
+ }
+ return false; // list is empty, or every element is null
+ }
+
/**
* Parse a !Label line. This contains a single character per position (column)
* of the alignment block above. An underscore character represents no label.
{
this.firstDataBlockRead = true;
- /*
- * append null annotations to keep the annotations the same length as the
- * sequences (in case some blocks have !Label lines and some don't)
- */
+ padAnnotations(labelAnnotations);
+ }
- int sequenceLength = seqData.isEmpty() ? 0 : seqData.values()
- .iterator().next().length();
- int annotationsToAdd = sequenceLength - labelAnnotations.size();
+ /**
+ * Append null annotations to keep the annotations list the same length as the
+ * sequences. This ensures that when the list is converted to an array it is
+ * correctly aligned with the alignment columns. It is needed when there are
+ * gaps in declared 'annotations' in a MEGA file, such as lines with no !Label
+ * statement, or regions between marked genes or domains.
+ *
+ * @param anns
+ *          the list to pad (modified in place) up to the width returned by
+ *          getAlignmentWidth(); left unchanged if it is already that long
+ */
+ protected void padAnnotations(List<Annotation> anns)
+ {
+ addNullAnnotations(anns, getAlignmentWidth());
+ }
+
+ /**
+ * Append null annotations for positions up to (and excluding) the given end
+ * column (base 0). If the list already has that many entries or more, it is
+ * left unchanged.
+ *
+ * @param anns
+ *          the list to extend (modified in place)
+ * @param upTo
+ *          the size the list should have on return, if it is currently
+ *          shorter
+ */
+ protected void addNullAnnotations(List<Annotation> anns, int upTo)
+ {
+ int annotationsToAdd = upTo - anns.size();
for (int i = 0; i < annotationsToAdd; i++)
{
- labelAnnotations.add(null);
+ anns.add(null);
}
}
String property, String codonStart)
{
/*
- * the order of processing below ensures that we correctly capture where a
- * domain is in the context of an enclosing gene
+ * the order of processing below ensures that we correctly handle a domain
+ * in the context of an enclosing gene
*/
processDomainEnd(domain, gene, property);
*/
protected void processDomainStart(String domain, String property)
{
- if ("domainend".equalsIgnoreCase(property))
+ if (DOMAINEND.equalsIgnoreCase(property))
{
currentDomain = null;
return;
{
String verboseDomain = makeVerboseDomainName(domain, property);
startSequenceFeature(domainStart);
+ currentDomainStartCol = getAlignmentWidth();
currentDomain = verboseDomain;
}
}
/**
+ * Returns the width of alignment parsed so far. Note we assume (as does MEGA)
+ * that all sequences are the same length, so we can just take the length of
+ * the first one.
+ *
+ * @return the length of the first value found when iterating over seqData,
+ *         or 0 if seqData is empty
+ */
+ protected int getAlignmentWidth()
+ {
+ return seqData.isEmpty() ? 0 : seqData.values()
+ .iterator().next().length();
+ }
+
+ /**
* If we have declared a gene, and it is not continuing, start a sequence
* feature for it
*
if (gene != null && !gene.equals(currentGene))
{
startSequenceFeature(geneStart);
+ currentGeneStartCol = getAlignmentWidth();
}
currentGene = gene;
}
if (this.currentDomain != null)
{
boolean newDomain = !this.currentDomain.equals(verboseDomain);
- boolean domainEnded = "domainend".equalsIgnoreCase(property);
+ boolean domainEnded = DOMAINEND.equalsIgnoreCase(property);
if (newDomain || newGene || domainEnded)
{
createFeature(DOMAIN, currentDomain, domainStart);
+ // and/or... create annotations for domain
+ extendAnnotation(domainAnnotations, currentDomain,
+ currentDomainStartCol);
currentDomain = null;
+ currentDomainStartCol = -1;
return true;
}
}
if (this.currentGene != null && !this.currentGene.equals(gene))
{
createFeature(GENE, currentGene, geneStart);
+ // and/or... add annotations for Gene
+ extendAnnotation(geneAnnotations, currentGene, currentGeneStartCol);
currentGene = null;
+ currentGeneStartCol = -1;
created = true;
}
}
/**
+ * Helper method to add Annotation elements, with the given description and
+ * starting at the given start column, up to the end of the sequence length
+ * parsed so far. Null elements are inserted for any skipped columns since the
+ * last annotation (if any), i.e. positions with no annotation of this type.
+ * <p>
+ * If the start column is -1, the list is simply padded with nulls up to the
+ * current alignment width and no Annotation objects are created. If the list
+ * already reaches the current alignment width, nothing is added.
+ * <p>
+ * Each Annotation added is constructed with "X", the description, a space
+ * character, 0f and a colour. The colour is black if the description is
+ * null, otherwise the result of
+ * UserColourScheme.createColourFromName(description).
+ *
+ * @param anns
+ *          the list to extend (modified in place)
+ * @param description
+ *          the description for each Annotation added; may be null
+ * @param startColumn
+ * the start column of the annotations to add, or -1 if nothing to
+ * add
+ */
+ protected void extendAnnotation(List<Annotation> anns,
+ String description, int startColumn)
+ {
+ int alignmentWidth = getAlignmentWidth();
+ addNullAnnotations(anns, startColumn == -1 ? alignmentWidth
+ : startColumn);
+
+ int numberToAdd = alignmentWidth - anns.size();
+ if (numberToAdd > 0)
+ {
+ Color col = description == null ? Color.black : UserColourScheme
+ .createColourFromName(description);
+ for (int i = 0; i < numberToAdd; i++)
+ {
+ anns.add(new Annotation("X", description, ' ', 0f, col));
+ }
+ }
+ }
+
+ /**
* Makes an expanded descriptive name for Domain if possible e.g.
* "Intron1 (Adh Coding)". Currently incorporates the current gene name (if
* any) and the Coding/Noncoding property value (if given).
{
int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10;
String none = "";
- if (annotations == null || annotations.isEmpty()
- || !MEGA_ANNOTATION_LABEL.equals(annotations.get(0).label))
+ AlignmentAnnotation ann = findAnnotation(MEGA_ANNOTATION_LABEL);
+ if (ann == null)
{
return none;
}
}
/**
+ * Returns the first stored annotation found with the given label, or null
+ * if there is no match or no annotations are stored.
+ *
+ * @param annotationLabel
+ *          the label to look for, compared using equals(); must not be
+ *          null
+ * @return the first entry of annotations whose label equals the given label,
+ *         or null if none matches or annotations is null
+ */
+ protected AlignmentAnnotation findAnnotation(String annotationLabel)
+ {
+ if (annotations != null)
+ {
+ for (AlignmentAnnotation ann : annotations)
+ {
+ if (annotationLabel.equals(ann.label))
+ {
+ return ann;
+ }
+ }
+ }
+ return null;
+ }
+
+ /**
* Flag this file as interleaved or not, based on data format. Throws an
* exception if has previously been determined to be otherwise.
*
verifySequenceFeature(sfs[7], "MEF2A", "Gene", 31, 36);
verifySequenceFeature(sfs[8], "BindingSite", "Domain", 37, 42);
}
+
+ /*
+ * verify gene and domain alignment annotations
+ */
+ assertEquals(2, testee.annotations.size());
+ AlignmentAnnotation ann = testee.annotations.get(0);
+ assertEquals("MEGA Gene", ann.label);
+ assertEquals(42, ann.annotations.length);
+ verifyAnnotation(ann, 0, 6, null);
+ verifyAnnotation(ann, 6, 24, "Adh");
+ verifyAnnotation(ann, 24, 30, "Opsin");
+ verifyAnnotation(ann, 30, 36, "MEF2A");
+ verifyAnnotation(ann, 37, 42, null);
+
+ ann = testee.annotations.get(1);
+ assertEquals("MEGA Domain", ann.label);
+ assertEquals(42, ann.annotations.length);
+ verifyAnnotation(ann, 0, 6, null);
+ verifyAnnotation(ann, 6, 12, "Exon1 (Adh Coding)");
+ verifyAnnotation(ann, 12, 18, "Intron1 (Adh Noncoding)");
+ verifyAnnotation(ann, 19, 24, "Exon2 (Adh Coding)");
+ verifyAnnotation(ann, 25, 30, "Intron1 (Opsin Noncoding)");
+ verifyAnnotation(ann, 31, 36, "Exon1 (MEF2A Coding)");
+ verifyAnnotation(ann, 37, 42, "BindingSite");
+
+ }
+
+ /**
+ * Helper method to verify a range of annotation positions all have the given
+ * description. If the description is null, the check is instead that the
+ * Annotation at each position is itself null (i.e. no annotation at that
+ * column).
+ *
+ * @param ann
+ *          the alignment annotation whose annotations array is checked
+ * @param from
+ * start index to check
+ * @param to
+ * end index to check (exclusive)
+ * @param description
+ *          the description expected at every position in the range, or
+ *          null to assert that every position has a null Annotation
+ */
+ protected void verifyAnnotation(AlignmentAnnotation ann, int from,
+ int to, String description)
+ {
+ for (int pos = from; pos < to; pos++)
+ {
+ if (description == null)
+ {
+ assertNull(ann.annotations[pos]);
+ }
+ else
+ {
+ assertEquals(description, ann.annotations[pos].description);
+ }
+ }
}
/**