*/
package jalview.io;
+import jalview.analysis.AlignmentUtils;
import jalview.analysis.SequenceIdMatcher;
import jalview.api.AlignViewportI;
+import jalview.api.FeaturesSourceI;
import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.Alignment;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.SequenceDummy;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
+import jalview.io.gff.GffHelperBase;
+import jalview.io.gff.GffHelperFactory;
+import jalview.io.gff.GffHelperI;
import jalview.schemes.AnnotationColourGradient;
import jalview.schemes.GraduatedColor;
import jalview.schemes.UserColourScheme;
* @author jbprocter
* @author gmcarstairs
*/
-public class FeaturesFile extends AlignFile
+public class FeaturesFile extends AlignFile implements FeaturesSourceI
{
- private static final String NOTE = "Note";
-
- private static final String ALIGN = "Align";
-
- private static final String QUERY = "Query";
-
- private static final String TARGET = "Target";
-
- private static final String SIMILARITY = "similarity";
+ private static final String ID_NOT_SPECIFIED = "ID_NOT_SPECIFIED";
- protected static final String STRAND = "STRAND";
+ private static final String NOTE = "Note";
protected static final String FRAME = "FRAME";
- protected static final String ATTRIBUTES = "ATTRIBUTES";
-
protected static final String TAB = "\t";
protected static final String GFF_VERSION = "##gff-version";
String line = null;
try
{
- StringTokenizer st;
+ String[] gffColumns;
String featureGroup = null;
while ((line = nextLine()) != null)
continue;
}
- st = new StringTokenizer(line, TAB);
- if (st.countTokens() == 1)
+ gffColumns = line.split("\\t"); // tab as regex
+ if (gffColumns.length == 1)
{
if (line.trim().equalsIgnoreCase("GFF"))
{
/*
- * Jalview features file with appendded GFF
- * assume GFF2 (though it may declare gff-version 3)
+ * Jalview features file with appended GFF
+ * assume GFF2 (though it may declare ##gff-version 3)
*/
gffVersion = 2;
continue;
}
}
- if (st.countTokens() > 1 && st.countTokens() < 4)
+ if (gffColumns.length > 1 && gffColumns.length < 4)
{
/*
* if 2 or 3 tokens, we anticipate either 'startgroup', 'endgroup' or
- * a feature type colour specification; not GFF format
+ * a feature type colour specification
*/
- String ft = st.nextToken();
+ String ft = gffColumns[0];
if (ft.equalsIgnoreCase("startgroup"))
{
- featureGroup = st.nextToken();
+ featureGroup = gffColumns[1];
}
else if (ft.equalsIgnoreCase("endgroup"))
{
// We should check whether this is the current group,
// but at present theres no way of showing more than 1 group
- st.nextToken();
featureGroup = null;
}
else
{
- parseFeatureColour(line, ft, st, colours);
+ parseFeatureColour(line, ft, gffColumns, colours);
}
continue;
}
*/
if (gffVersion == 0)
{
- parseJalviewFeature(line, st, align, colours, removeHTML,
+ parseJalviewFeature(line, gffColumns, align, colours, removeHTML,
relaxedIdmatching, featureGroup);
}
else
{
- parseGffFeature(st, align, relaxedIdmatching, newseqs);
+ parseGff(gffColumns, align, relaxedIdmatching, newseqs);
}
}
resetMatcher();
}
/**
- * Try to parse a Jalview format feature specification. Returns true if
- * successful or false if not.
+ * Try to parse a Jalview format feature specification and add it as a
+ * sequence feature to any matching sequences in the alignment. Returns true
+ * if successful (a feature was added), or false if not.
*
* @param line
- * @param st
+ * @param gffColumns
* @param alignment
* @param featureColours
* @param removeHTML
* @param relaxedIdmatching
* @param featureGroup
*/
- protected boolean parseJalviewFeature(String line, StringTokenizer st,
+ protected boolean parseJalviewFeature(String line, String[] gffColumns,
AlignmentI alignment, Map<String, Object> featureColours,
boolean removeHTML, boolean relaxedIdMatching, String featureGroup)
{
/*
- * Jalview: description seqid seqIndex start end type [score]
+ * tokens: description seqid seqIndex start end type [score]
*/
- if (st.countTokens() < 6)
+ if (gffColumns.length < 6)
{
System.err.println("Ignoring feature line '" + line
- + "' with unexpected number of columns (" + st.countTokens()
- + ")");
+ + "' with too few columns (" + gffColumns.length + ")");
return false;
}
- String desc = st.nextToken();
- String seqId = st.nextToken();
- SequenceI seq = findName(alignment, null, relaxedIdMatching, seqId);
+ String desc = gffColumns[0];
+ String seqId = gffColumns[1];
+ SequenceI seq = findSequence(seqId, alignment, null, relaxedIdMatching);
- if (!seqId.equals("ID_NOT_SPECIFIED"))
+ if (!ID_NOT_SPECIFIED.equals(seqId))
{
- seq = findName(alignment, null, relaxedIdMatching, seqId);
- st.nextToken();
+ seq = findSequence(seqId, alignment, null, relaxedIdMatching);
}
else
{
seqId = null;
seq = null;
+ String seqIndex = gffColumns[2];
try
{
- int idx = Integer.parseInt(st.nextToken());
+ int idx = Integer.parseInt(seqIndex);
seq = alignment.getSequenceAt(idx);
} catch (NumberFormatException ex)
{
- // continue
+ System.err.println("Invalid sequence index: " + seqIndex);
}
}
return false;
}
- int startPos = Integer.parseInt(st.nextToken());
- int endPos = Integer.parseInt(st.nextToken());
+ int startPos = Integer.parseInt(gffColumns[3]);
+ int endPos = Integer.parseInt(gffColumns[4]);
- String ft = st.nextToken();
+ String ft = gffColumns[5];
if (!featureColours.containsKey(ft))
{
UserColourScheme ucs = new UserColourScheme(ft);
featureColours.put(ft, ucs.findColour('A'));
}
- SequenceFeature sf = new SequenceFeature(ft, desc, "",
- startPos, endPos, featureGroup);
- if (st.hasMoreTokens())
+ SequenceFeature sf = new SequenceFeature(ft, desc, "", startPos,
+ endPos, featureGroup);
+ if (gffColumns.length > 6)
{
- float score = 0f;
+ float score = Float.NaN;
try
{
- score = new Float(st.nextToken()).floatValue();
+ score = new Float(gffColumns[6]).floatValue();
// update colourgradient bounds if allowed to
} catch (NumberFormatException ex)
{
- // leave as 0
+ // leave as NaN
}
sf.setScore(score);
}
* the current input line (for error messages only)
* @param featureType
* the first token on the line
- * @param st
- * holds remaining tokens on the line
+ * @param gffColumns
+ * holds tokens on the line
* @param colours
* map to which to add derived colour specification
*/
protected void parseFeatureColour(String line, String featureType,
- StringTokenizer st, Map<String, Object> colours)
+ String[] gffColumns, Map<String, Object> colours)
{
Object colour = null;
- String colscheme = st.nextToken();
+ String colscheme = gffColumns[1];
if (colscheme.indexOf("|") > -1
|| colscheme.trim().equalsIgnoreCase("label"))
{
* list, and returns it</li>
* </ul>
*
+ * @param seqId
* @param align
* @param newseqs
* @param relaxedIdMatching
- * @param seqId
+ *
* @return
*/
- protected SequenceI findName(AlignmentI align, List<SequenceI> newseqs,
- boolean relaxedIdMatching, String seqId)
+ protected SequenceI findSequence(String seqId, AlignmentI align,
+ List<SequenceI> newseqs, boolean relaxedIdMatching)
{
+ // TODO encapsulate in SequenceIdMatcher, share the matcher
+ // with the GffHelper (removing code duplication)
SequenceI match = null;
if (relaxedIdMatching)
{
* a map whose keys are the type names of visible features
* @return
*/
- public String printGffFormat(SequenceI[] sequences, Map<String, Object> visible)
+ public String printGffFormat(SequenceI[] sequences,
+ Map<String, Object> visible)
{
return printGffFormat(sequences, visible, true, true);
}
* @param includeNonPositionalFeatures
* @return
*/
- public String printGffFormat(SequenceI[] sequences, Map<String, Object> visible, boolean outputVisibleOnly,
+ public String printGffFormat(SequenceI[] sequences,
+ Map<String, Object> visible, boolean outputVisibleOnly,
boolean includeNonPositionalFeatures)
{
StringBuilder out = new StringBuilder(256);
*/
continue;
}
-
+
source = sf.featureGroup;
if (source == null)
{
source = sf.getDescription();
}
-
+
out.append(seq.getName());
out.append(TAB);
out.append(source);
out.append(TAB);
out.append(sf.score);
out.append(TAB);
-
- out.append(sf.getValue(STRAND, "."));
+
+ int strand = sf.getStrand();
+ out.append(strand == 1 ? "+" : (strand == -1 ? "-" : "."));
out.append(TAB);
-
+
out.append(sf.getValue(FRAME, "."));
-
+
// miscellaneous key-values (GFF column 9)
- String attributes = (String) sf.getValue(ATTRIBUTES);
+ String attributes = sf.getAttributes();
if (attributes != null)
{
out.append(TAB).append(attributes);
}
-
+
out.append(newline);
}
}
}
-
+
return out.toString();
}
toRanges[toRangesIndex++] = toStart;
toRanges[toRangesIndex++] = toStart + (fromCount - 1) / 3;
}
-
+
return new MapList(fromRanges, toRanges, 3, 1);
}
/**
- * Parse a GFF format feature. This may include creating a 'dummy' sequence
- * for the feature or its mapped sequence
+ * Parse a GFF format feature. This may include creating a 'dummy' sequence to
+ * hold the feature, or for its mapped sequence, or both, to be resolved
+ * either later in the GFF file (##FASTA section), or when the user loads
+ * additional sequences.
+ * <p>
+ * If a GFF helper is available for the line and it yields a feature, the
+ * feature is added to the located sequence, and a copy of it to every
+ * further sequence in the alignment that matches the same id.
*
- * @param st
+ * @param gffColumns
+ * the tab-delimited columns of one GFF line
* @param alignment
+ * the alignment searched for sequences matching the GFF seqid
* @param relaxedIdMatching
+ * passed through to the sequence lookup and to the GFF helper
* @param newseqs
+ * passed through to the sequence lookup and to the GFF helper
+ * (presumably collects placeholder sequences created while
+ * parsing - TODO confirm against findSequence)
* @return
+ * the sequence located for column 1 (seqid) of the GFF line, or null
+ * if the line has fewer than 5 columns or the helper failed to
+ * parse it
*/
- protected SequenceI parseGffFeature(StringTokenizer st,
- AlignmentI alignment, boolean relaxedIdMatching,
- List<SequenceI> newseqs)
+ protected SequenceI parseGff(String[] gffColumns, AlignmentI alignment,
+ boolean relaxedIdMatching, List<SequenceI> newseqs)
{
- SequenceI seq;
/*
* GFF: seqid source type start end score strand phase [attributes]
*/
- if (st.countTokens() < 8)
+ if (gffColumns.length < 5)
{
- System.err
- .println("Ignoring GFF feature line with unexpected number of columns ("
- + st.countTokens() + ")");
+ System.err.println("Ignoring GFF feature line with too few columns ("
+ + gffColumns.length + ")");
return null;
}
- String seqId = st.nextToken();
-
+
/*
* locate referenced sequence in alignment _or_
- * as a forward reference (SequenceDummy)
+ * as a forward or external reference (SequenceDummy)
*/
- seq = findName(alignment, newseqs, relaxedIdMatching, seqId);
-
- String desc = st.nextToken();
- String group = null;
- if (desc.indexOf(' ') == -1)
- {
- // could also be a source term rather than description line
- group = desc;
- }
- String ft = st.nextToken();
- int startPos = StringUtils.parseInt(st.nextToken());
- int endPos = StringUtils.parseInt(st.nextToken());
- // TODO: decide if non positional feature assertion for input data
- // where end==0 is generally valid
- if (endPos == 0)
- {
- // treat as non-positional feature, regardless.
- startPos = 0;
- }
- float score = 0f;
- try
- {
- score = new Float(st.nextToken()).floatValue();
- } catch (NumberFormatException ex)
- {
- // leave at 0
- }
-
- SequenceFeature sf = new SequenceFeature(ft, desc, startPos,
- endPos, score, group);
- if (st.hasMoreTokens())
- {
- sf.setValue(STRAND, st.nextToken());
- }
- if (st.hasMoreTokens())
- {
- sf.setValue(FRAME, st.nextToken());
- }
-
- if (st.hasMoreTokens())
- {
- processGffColumnNine(st.nextToken(), sf);
- }
-
- if (processOrAddSeqFeature(alignment, newseqs, seq, sf,
- relaxedIdMatching))
+ String seqId = gffColumns[0];
+ SequenceI seq = findSequence(seqId, alignment, newseqs,
+ relaxedIdMatching);
+
+ SequenceFeature sf = null;
+ GffHelperI helper = GffHelperFactory.getHelper(gffColumns);
+ if (helper != null)
{
- // check whether we should add the sequence feature to any other
- // sequences in the alignment with the same or similar
- while ((seq = alignment.findName(seq, seqId, true)) != null)
+ try
+ {
+ sf = helper.processGff(seq, gffColumns, alignment, newseqs,
+ relaxedIdMatching);
+ if (sf != null)
+ {
+ seq.addSequenceFeature(sf);
+
+ /*
+ * also add a copy of the feature to any further sequences in the
+ * alignment matching the same id; a separate cursor is used so
+ * that 'seq' (the return value) is not overwritten with the null
+ * that terminates this search
+ */
+ SequenceI other = seq;
+ while ((other = alignment.findName(other, seqId, true)) != null)
+ {
+ other.addSequenceFeature(new SequenceFeature(sf));
+ }
+ }
+ } catch (IOException e)
{
- seq.addSequenceFeature(new SequenceFeature(sf));
+ System.err.println("GFF parsing failed with: " + e.getMessage());
+ return null;
}
}
+
return seq;
}
*/
protected void processGffColumnNine(String attributes, SequenceFeature sf)
{
- sf.setValue(ATTRIBUTES, attributes);
-
+ sf.setAttributes(attributes);
+
/*
* Parse attributes in column 9 and add them to the sequence feature's
* 'otherData' table; use Note as a best proxy for description
*/
- char[] nameValueSeparator = new char[] { gffVersion == 3 ? '=' : ' ' };
- Map<String, List<String>> nameValues = StringUtils.parseNameValuePairs(attributes, ";",
- nameValueSeparator);
+ char nameValueSeparator = gffVersion == 3 ? '=' : ' ';
+ // TODO check we don't break GFF2 values which include commas here
+ Map<String, List<String>> nameValues = GffHelperBase
+ .parseNameValuePairs(attributes, ";", nameValueSeparator, ",");
for (Entry<String, List<String>> attr : nameValues.entrySet())
{
String values = StringUtils.listToDelimitedString(attr.getValue(),
}
FastaFile parser = new FastaFile(this);
List<SequenceI> includedseqs = parser.getSeqs();
+
SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs);
- // iterate over includedseqs, and replacing matching ones with newseqs
- // sequences. Generic iterator not used here because we modify includedseqs
- // as we go
+
+ /*
+ * Iterate over includedseqs, replacing any that match a newseqs
+ * sequence with that sequence. An indexed loop is used rather than an
+ * iterator because includedseqs is modified as we go.
+ */
for (int p = 0, pSize = includedseqs.size(); p < pSize; p++)
{
// search for any dummy seqs that this sequence can be used to update
- SequenceI dummyseq = smatcher.findIdMatch(includedseqs.get(p));
- if (dummyseq != null)
+ SequenceI includedSeq = includedseqs.get(p);
+ SequenceI dummyseq = smatcher.findIdMatch(includedSeq);
+ if (dummyseq != null && dummyseq instanceof SequenceDummy)
{
- // dummyseq was created so it could be annotated and referred to in
- // alignments/codon mappings
-
- SequenceI mseq = includedseqs.get(p);
- // mseq is the 'template' imported from the FASTA file which we'll use
- // to coomplete dummyseq
- if (dummyseq instanceof SequenceDummy)
+ // NOTE: this may not be the ideal pattern. The idea is that a
+ // flyweight proxy for a sequence ID can be created to allow
+ // 1. stable reference creation
+ // 2. addition of annotation
+ // 3. future replacement by a real sequence
+ // The current pattern is to create SequenceDummy objects - a
+ // convenience constructor for a Sequence.
+ // The problem is that when a dummy is promoted to a real sequence, all
+ // references to it need to be updated somehow. We avoid that by keeping
+ // the same object.
+ ((SequenceDummy) dummyseq).become(includedSeq);
+ dummyseq.createDatasetSequence();
+
+ /*
+ * Update mappings so they are now to the dataset sequence
+ */
+ for (AlignedCodonFrame mapping : align.getCodonFrames())
{
- // probably have the pattern wrong
- // idea is that a flyweight proxy for a sequence ID can be created for
- // 1. stable reference creation
- // 2. addition of annotation
- // 3. future replacement by a real sequence
- // current pattern is to create SequenceDummy objects - a convenience
- // constructor for a Sequence.
- // problem is that when promoted to a real sequence, all references
- // need
- // to be updated somehow.
- ((SequenceDummy) dummyseq).become(mseq);
- includedseqs.set(p, dummyseq); // template is no longer needed
+ mapping.updateToDataset(dummyseq);
}
+
+ /*
+ * replace parsed sequence with the realised forward reference
+ */
+ includedseqs.set(p, dummyseq);
}
}
- // finally add sequences to the dataset
+
+ /*
+ * finally add sequences to the dataset
+ */
for (SequenceI seq : includedseqs)
{
+ // experimental: mapping-based 'alignment' to query sequence
+ AlignmentUtils.alignSequenceAs(seq, align,
+ String.valueOf(align.getGapCharacter()), false, true);
+
+ // rename sequences if GFF handler requested this
+ // TODO a more elegant way e.g. gffHelper.postProcess(newseqs) ?
+ SequenceFeature[] sfs = seq.getSequenceFeatures();
+ if (sfs != null)
+ {
+ String newName = (String) sfs[0].getValue(GffHelperI.RENAME_TOKEN);
+ if (newName != null)
+ {
+ seq.setName(newName);
+ }
+ }
align.addSequence(seq);
}
}
* @param newseqs
* @throws IOException
*/
- protected void processGffPragma(String line, Map<String, String> gffProps, AlignmentI align,
+ protected void processGffPragma(String line,
+ Map<String, String> gffProps, AlignmentI align,
List<SequenceI> newseqs) throws IOException
{
line = line.trim();
// close off any open 'forward references'
return;
}
-
+
String[] tokens = line.substring(2).split(" ");
String pragma = tokens[0];
String value = tokens.length == 1 ? null : tokens[1];
-
+
if ("gff-version".equalsIgnoreCase(pragma))
{
if (value != null)
}
}
}
+ else if ("sequence-region".equalsIgnoreCase(pragma))
+ {
+ // could capture <seqid start end> if wanted here
+ }
else if ("feature-ontology".equalsIgnoreCase(pragma))
{
// should resolve against the specified feature ontology URI
System.err.println("Ignoring unknown pragma: " + line);
}
}
-
- /**
- * Processes the 'Query' (or 'Target') and 'Align' properties associated with
- * an exonerate GFF similarity feature; these properties define the mapping of
- * the annotated feature (e.g. 'exon') to a related sequence.
- *
- * @param set
- * @param seq
- * @param sf
- * @param align
- * @param newseqs
- * @param relaxedIdMatching
- * @throws IOException
- */
- public void processGffSimilarity(Map<String, List<String>> set, SequenceI seq,
- SequenceFeature sf, AlignmentI align, List<SequenceI> newseqs, boolean relaxedIdMatching)
- throws IOException
- {
- if (!validateExonerateModel(sf))
- {
- return;
- }
-
- int strand = sf.getStrand();
-
- /*
- * exonerate (protein2dna or protein2genome) may be run with
- * --showquerygff outputs
- * Target <dnaseqid> ; Align proteinStartPos dnaStartPos peptideCount
- * --showtargetgff outputs
- * Query <proteinseqid> ; Align dnaStartPos proteinStartPos nucleotideCount
- * where the Align spec may repeat
- */
- boolean mapIsFromCdna = true;
- List<String> mapTo = set.get(QUERY);
- if (mapTo == null)
- {
- mapTo = set.get(TARGET);
- mapIsFromCdna = false;
- }
- if (mapTo == null || mapTo.size() != 1)
- {
- throw new IOException(
- "Expecting exactly one sequence in Query field (got " + mapTo
- + ")");
- }
-
- /*
- * locate the mapped sequence in the alignment or 'new' (GFF file) sequences;
- */
- SequenceI mappedSequence = findName(align, newseqs, relaxedIdMatching,
- mapTo.get(0));
- /*
- * Process the Align maps and create cdna/protein maps;
- * ideally, the query sequences are in the alignment, but maybe not...
- */
- AlignedCodonFrame alco = new AlignedCodonFrame();
- MapList codonmapping = constructCodonMappingFromAlign(set.get(ALIGN),
- mapIsFromCdna, strand);
-
- /*
- * Jalview always maps from dna to protein
- */
- if (mapIsFromCdna)
- {
- alco.addMap(seq, mappedSequence, codonmapping);
- }
- else
- {
- alco.addMap(mappedSequence, seq, codonmapping);
- }
- align.addCodonFrame(alco);
- }
-
- /**
- * Returns true if the exonerate model (saved from column 2 of the GFF as the
- * SequenceFeature's group) is one that we are willing to process, else false
- *
- * @param sf
- */
- protected boolean validateExonerateModel(SequenceFeature sf)
- {
- /*
- * we don't handle protein-to-protein or dna-to-dna alignment here
- */
- String source = sf.getFeatureGroup();
- if (source == null
- || (!source.contains("protein2dna") && !source
- .contains("protein2genome")))
- {
- System.err
- .println("I only accept protein2dna or protein2genome but found "
- + source);
- return false;
- }
- return true;
- }
-
- /**
- * take a sequence feature and examine its attributes to decide how it should
- * be added to a sequence
- *
- * @param seq
- * - the destination sequence constructed or discovered in the
- * current context
- * @param sf
- * - the base feature with ATTRIBUTES property containing any
- * additional attributes
- * @param gFFFile
- * - true if we are processing a GFF annotation file
- * @return true if sf was actually added to the sequence, false if it was
- * processed in another way
- */
- public boolean processOrAddSeqFeature(AlignmentI align, List<SequenceI> newseqs,
- SequenceI seq, SequenceFeature sf, boolean relaxedIdMatching)
- {
- String attr = (String) sf.getValue(ATTRIBUTES);
- boolean addFeature = true;
- if (attr != null)
- {
- for (String attset : attr.split(TAB))
- {
- Map<String, List<String>> set = StringUtils.parseNameValuePairs(
- attset, ";", new char[] { ' ', '-' });
-
- if (SIMILARITY.equals(sf.getType()))
- {
- try
- {
- addFeature = false;
- processGffSimilarity(set, seq, sf, align, newseqs,
- relaxedIdMatching);
- } catch (IOException ivfe)
- {
- System.err.println(ivfe);
- }
- }
- }
- }
- if (addFeature)
- {
- seq.addSequenceFeature(sf);
- }
- return addFeature;
- }
-
}