*/
package jalview.io;
+import jalview.datamodel.AlignmentAnnotation;
import jalview.datamodel.AlignmentI;
+import jalview.datamodel.Annotation;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
*/
public class MegaFile extends AlignFile
{
+ private static final char UNDERSCORE = '_';
+
private static final String WHITESPACE = "\\s+";
private static final int DEFAULT_LINE_LENGTH = 60;
private static final String SPACE = " ";
+ private static final String TAB = "\t";
+
/*
* number of sequence positions output per line
*/
// map of SequenceFeature lists, keyed by sequence id
Map<String, List<SequenceFeature>> sequenceFeatures;
+ // one entry per !Label line character: an Annotation, or null for an underscore
+ List<Annotation> labelAnnotations;
+
/**
 * No-argument constructor; its body is empty.
 */
public MegaFile()
{
}
geneStart = new HashMap<String, Integer>();
domainStart = new HashMap<String, Integer>();
residuesRead = new HashMap<String, Integer>();
+ labelAnnotations = new ArrayList<Annotation>();
/*
* Read and process MEGA and Title/Format/Description headers if present.
dataLine = dataLine.trim();
if (dataLine.length() > 0)
{
- if (dataLine.startsWith(BANG + GENE)
- || dataLine.startsWith(BANG + DOMAIN))
+ dataLine = dataLine.replace(TAB, SPACE);
+ String upperCased = dataLine.toUpperCase();
+ if (upperCased.startsWith(BANG + GENE.toUpperCase())
+ || upperCased.startsWith(BANG + DOMAIN.toUpperCase()))
{
parseGeneOrDomain(dataLine);
}
- else if (dataLine.startsWith(BANG + LABEL))
+ else if (upperCased.startsWith(BANG + LABEL.toUpperCase()))
{
parseLabel(dataLine);
}
// remember the (longest) line length read in, so we can output the same
setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine));
- deriveSequences();
+ deriveSequencesAndFeatures();
+
+ deriveAnnotations();
}
/**
- * Parse a !Label
+ * If we parsed !Label statements into a list of Annotation objects, create an
+ * AlignmentAnnotation from them (constructed with the strings "MEGA" and
+ * "Label") and add it to this.annotations. Does nothing if labelAnnotations
+ * is empty. The array passed on is a copy of the list, so it keeps any null
+ * entries the list holds (parseLabel adds null for an underscore).
+ */
+ protected void deriveAnnotations()
+ {
+ if (this.labelAnnotations.size() > 0)
+ {
+ Annotation[] anns = labelAnnotations
+ .toArray(new Annotation[labelAnnotations.size()]);
+ AlignmentAnnotation aa = new AlignmentAnnotation("MEGA", "Label",
+ anns);
+ this.annotations.add(aa);
+ }
+ }
+
+ /**
+ * Parse a !Label line. This contains a single character per position (column)
+ * of the alignment block above. An underscore character represents no label.
+ * Each character is appended to labelAnnotations (as an Annotation, or as
+ * null for an underscore so that list positions stay aligned with columns);
+ * deriveAnnotations() later turns that list into an AlignmentAnnotation.
+ * A missing trailing semi-colon, or a label count that differs from the
+ * sequence length, is only reported as a warning on System.err.
*
* @param dataLine
+ *          the line to parse; the first LABEL.length() + 1 characters are
+ *          discarded, so it is expected to start with '!' and the Label
+ *          keyword
+ * @throws FileFormatException
+ *           declared in the signature; no statement in this method body
+ *           throws it
*/
- protected void parseLabel(String dataLine)
+ protected void parseLabel(String dataLine) throws FileFormatException
{
- // TODO Auto-generated method stub
+ // strip off leading !Label and following spaces
+ dataLine = dataLine.substring(LABEL.length() + 1).trim();
+
+ // remove internal spacing (only spaces are removed here; any tabs are expected to have been converted to spaces by the caller)
+ String labels = dataLine.replace(SPACE, "");
+ if (labels.endsWith(SEMICOLON))
+ {
+ labels = labels.substring(0, labels.length() - 1);
+ }
+ else
+ {
+ System.err.println("Warning: '" + dataLine
+ + "' should end with semi-colon");
+ }
+ for (char c : labels.toCharArray())
+ {
+ if (c == UNDERSCORE)
+ {
+ this.labelAnnotations.add(null);
+ }
+ else
+ {
+ this.labelAnnotations.add(new Annotation(String.valueOf(c), "",
+ ' ', 0f));
+ }
+ }
+ /*
+ * sanity check - the number of labels added should exactly match the
+ * sequence length so far (taken from the first value in seqData, or 0 if
+ * seqData is empty); a mismatch is reported on System.err but does not
+ * stop parsing
+ */
+ int sequenceLength = seqData.isEmpty() ? 0 : seqData.values()
+ .iterator().next().length();
+ if (labelAnnotations.size() != sequenceLength)
+ {
+ System.err.println("Warning: file inconsistent - "
+ + labelAnnotations.size() + " labels for " + sequenceLength
+ + " positions after " + dataLine);
+ }
}
/**
* the order of processing below ensures that we correctly capture where a
* domain is in the context of an enclosing gene
*/
- processDomainEnd(domain, property);
+ processDomainEnd(domain, gene, property);
processGeneEnd(gene);
/**
* If we have been processing a domain, and it is not being continued, then
- * make a sequence feature for the domain just ended
+ * make a sequence feature for the domain just ended. Criteria for the domain
+ * not being continued are either an explicit new domain or gene name, or a
+ * 'Property=domainend' statement
*
* @param domain
+ * @param gene
* @param property
* @return true if a feature is created, else false
*/
- protected boolean processDomainEnd(String domain, String property)
+ protected boolean processDomainEnd(String domain, String gene,
+ String property)
{
+ boolean newGene = (gene != null && !gene.equals(currentGene));
+
String verboseDomain = makeVerboseDomainName(domain, property);
+
if (this.currentDomain != null)
{
+ boolean newDomain = !this.currentDomain.equals(verboseDomain);
boolean domainEnded = "domainend".equalsIgnoreCase(property);
- if (!this.currentDomain.equals(verboseDomain) || domainEnded)
+ if (newDomain || newGene || domainEnded)
{
createFeature(DOMAIN, currentDomain, domainStart);
+ currentDomain = null;
return true;
}
}
if (this.currentGene != null && !this.currentGene.equals(gene))
{
createFeature(GENE, currentGene, geneStart);
+ currentGene = null;
created = true;
}
/**
* Convert the parsed sequence strings to objects and store them in the model.
*/
- protected void deriveSequences()
+ protected void deriveSequencesAndFeatures()
{
Set<Entry<String, StringBuilder>> datasets = seqData.entrySet();
* @throws IOException
*/
protected void parseNoninterleavedDataLine(String dataLine)
- throws IOException
+ throws FileFormatException
{
if (currentSequenceId == null)
{
/*
* Oops. Data but no sequence id context.
*/
- throw new IOException("No sequence id context at: " + dataLine);
+ throw new FileFormatException("No sequence id context at: "
+ + dataLine);
}
assertInterleaved(false, dataLine);