1 package jalview.io.gff;
3 import jalview.datamodel.AlignedCodonFrame;
4 import jalview.datamodel.AlignmentI;
5 import jalview.datamodel.MappingType;
6 import jalview.datamodel.SequenceFeature;
7 import jalview.datamodel.SequenceI;
8 import jalview.ext.ensembl.EnsemblSeqProxy;
9 import jalview.util.MapList;
10 import jalview.util.StringUtils;
12 import java.io.IOException;
13 import java.util.List;
17 * Base class with generic / common functionality for processing GFF3 data.
18 * Override this as required for any specialisations resulting from
19 * peculiarities of GFF3 generated by particular tools.
21 public class Gff3Helper extends GffHelperBase
// GFF3 column 9 attribute keys used by this helper.
// TARGET is looked up in the parsed attribute map by the match processors
// and is also read back from a built SequenceFeature by getDescription.
23 protected static final String TARGET = "Target";
// Attribute key for the GFF feature ID.
25 protected static final String ID = "ID";
// Attribute key for the feature name.
// NOTE(review): getDescription looks up the literal "Name" instead of this
// constant - consider using NAME there for consistency.
27 private static final String NAME = "Name";
30 * GFF3 uses '=' to delimit name/value pairs in column 9, and comma to
31 * separate multiple values for a name
// Convenience overload for GFF3: delegates to the four-argument
// parseNameValuePairs with ";", the '=' character and "," as the separator
// arguments. Presumably ";" separates the individual name/value pairs
// (confirm against the four-argument overload).
36 public static Map<String, List<String>> parseNameValuePairs(String text)
38 return parseNameValuePairs(text, ";", '=', ",");
42 * Process one GFF feature line (as modelled by SequenceFeature)
45 * the sequence with which this feature is associated
47 * the sequence feature with ATTRIBUTES property containing any
48 * additional attributes
50 * the alignment we are adding GFF to
52 * any new sequences referenced by the GFF
53 * @param relaxedIdMatching
54 * if true, match word tokens in sequence names
55 * @return the sequence feature to be added to the sequence, or null if the
56 * line has been processed in another way (e.g. to generate a
// Entry point for one GFF3 line: decides how the column data becomes a
// SequenceFeature, based on the Sequence Ontology term in the type column.
61 public SequenceFeature processGff(SequenceI seq, String[] gff,
62 AlignmentI align, List<SequenceI> newseqs,
63 boolean relaxedIdMatching) throws IOException
65 SequenceFeature sf = null;
// Read the feature type and the raw attributes text, then parse the
// attributes text into a name to values map.
69 String soTerm = gff[TYPE_COL];
70 String atts = gff[ATTRIBUTES_COL];
71 Map<String, List<String>> attributes = parseNameValuePairs(atts);
// Dispatch on ontology membership: protein matches and nucleotide matches
// get dedicated handling, anything else becomes a plain feature built
// with the parsed attributes.
73 SequenceOntologyI so = SequenceOntologyFactory.getInstance();
74 if (so.isA(soTerm, SequenceOntologyI.PROTEIN_MATCH))
76 sf = processProteinMatch(attributes, seq, gff, align, newseqs,
79 else if (so.isA(soTerm, SequenceOntologyI.NUCLEOTIDE_MATCH))
81 sf = processNucleotideMatch(attributes, seq, gff, align,
82 newseqs, relaxedIdMatching);
86 sf = buildSequenceFeature(gff, attributes);
// Fallback path: no parsed attributes are supplied, so buildSequenceFeature
// (and getDescription, which it calls) receives null for the attribute map.
92 * fall back on generating a sequence feature with no special processing
94 sf = buildSequenceFeature(gff, null);
101 * Processes one GFF3 nucleotide (e.g. cDNA to genome) match.
104 * parsed GFF column 9 key/value(s)
106 * the sequence the GFF feature is on
108 * the GFF column data
110 * the alignment the sequence belongs to, where any new mappings
113 * a list of new 'virtual sequences' generated while parsing GFF
114 * @param relaxedIdMatching
115 * if true allow fuzzy search for a matching target sequence
116 * @return a sequence feature, if one should be added to the sequence, else
118 * @throws IOException
// Handles one nucleotide match line. Outline of the logic:
// - reads the strand column; for a "-" strand it prints a message that
//   mapping from the reverse complement is not yet supported
// - looks up the "Target" attribute values and reports on stderr when
//   Target is missing
// - for each Target value, splits it on single spaces and expects at least
//   three tokens (id, start, end), otherwise reports it as incomplete
// - resolves the target id through findTargetId (an override point) and
//   then locates the sequence with findSequence
// - parses the Target start/end tokens and the GFF start/end columns as
//   integers and builds a NucleotideToNucleotide MapList with
//   constructMappingFromAlign
// - adds the map to the AlignedCodonFrame obtained from getMapping and
//   registers that frame on the alignment
// - catches NumberFormatException and reports an invalid start or end
// Finally a sequence feature is built from the GFF columns and attributes.
// NOTE(review): mappedSequence1 is copied straight into mappedSequence; the
// intermediate local looks redundant.
// NOTE(review): target.split(" ") splits on single spaces only, so a Target
// value with consecutive spaces would yield empty tokens - confirm whether
// such input can occur.
120 protected SequenceFeature processNucleotideMatch(
121 Map<String, List<String>> attributes, SequenceI seq,
122 String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
123 boolean relaxedIdMatching)
126 String strand = gffColumns[STRAND_COL];
129 * (For now) we don't process mappings from reverse complement ; to do
130 * this would require (a) creating a virtual sequence placeholder for
131 * the reverse complement (b) resolving the sequence by its id from some
132 * source (GFF ##FASTA or other) (c) creating the reverse complement
133 * sequence (d) updating the mapping to be to the reverse complement
135 if ("-".equals(strand))
138 .println("Skipping mapping from reverse complement as not yet supported");
142 List<String> targets = attributes.get(TARGET);
145 System.err.println("'Target' missing in GFF");
150 * Typically we only expect one Target per GFF line, but this can handle
151 * multiple matches, to the same or different sequences (e.g. dna variants)
153 for (String target : targets)
156 * Process "seqid start end [strand]"
158 String[] tokens = target.split(" ");
159 if (tokens.length < 3)
161 System.err.println("Incomplete Target: " + target);
166 * Locate the mapped sequence in the alignment, or as a
167 * (new or existing) virtual sequence in the newseqs list
169 String targetId = findTargetId(tokens[0], attributes);
170 SequenceI mappedSequence1 = findSequence(targetId, align,
171 newseqs, relaxedIdMatching);
172 SequenceI mappedSequence = mappedSequence1;
173 if (mappedSequence == null)
179 * get any existing mapping for these sequences (or start one),
180 * and add this mapped range
182 AlignedCodonFrame acf = getMapping(align, seq, mappedSequence);
186 int toStart = Integer.parseInt(tokens[1]);
187 int toEnd = Integer.parseInt(tokens[2]);
188 if (tokens.length > 3 && "-".equals(tokens[3]))
190 // mapping to reverse strand - swap start/end
196 int fromStart = Integer.parseInt(gffColumns[START_COL]);
197 int fromEnd = Integer.parseInt(gffColumns[END_COL]);
198 MapList mapping = constructMappingFromAlign(fromStart, fromEnd,
200 MappingType.NucleotideToNucleotide);
204 acf.addMap(seq, mappedSequence, mapping);
205 align.addCodonFrame(acf);
207 } catch (NumberFormatException nfe)
209 System.err.println("Invalid start or end in Target " + target);
213 SequenceFeature sf = buildSequenceFeature(gffColumns, attributes);
218 * Returns the target sequence id extracted from the GFF name/value pairs.
219 * Default (standard behaviour) is the first token for "Target". This may be
220 * overridden where tools report this in a non-standard way.
223 * first token of a "Target" value from GFF column 9, typically
226 * a map with all parsed column 9 attributes
// Override point used by both match processors to turn a Target value (or
// its first token) into the id of the sequence to look up.
// NOTE(review): the unused-warning suppression is presumably there because
// the default implementation ignores the attribute map parameter - confirm.
229 @SuppressWarnings("unused")
230 protected String findTargetId(String target, Map<String, List<String>> set)
236 * Processes one GFF 'protein_match'; fields of interest are
238 * <li>feature group - the database reporting a match e.g. Pfam</li>
239 * <li>Name - the matched entry's accession id in the database</li>
240 * <li>ID - a sequence identifier for the matched region (which may be
241 * appended as FASTA in the GFF file)</li>
245 * parsed GFF column 9 key/value(s)
247 * the sequence the GFF feature is on
249 * the sequence feature holding GFF data
251 * the alignment the sequence belongs to, where any new mappings
254 * a list of new 'virtual sequences' generated while parsing GFF
255 * @param relaxedIdMatching
256 * if true allow fuzzy search for a matching target sequence
257 * @return a sequence feature for this match (the real or virtual mapped sequence(s) are located as part of processing)
258 * @throws IOException
// Handles one protein match line: builds the feature for the query sequence
// and, for each Target value, copies the feature onto the mapped sequence
// and records a 1:1 mapping between the two ranges.
260 protected SequenceFeature processProteinMatch(
261 Map<String, List<String>> set, SequenceI seq,
262 String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
263 boolean relaxedIdMatching)
265 // This is currently tailored to InterProScan GFF output:
266 // ID holds the ID of the matched sequence, Target references the
267 // query sequence; this looks wrong, as ID should just be the GFF internal
268 // ID of the GFF feature, while Target would normally reference the matched
270 // TODO refactor as needed if other protein-protein GFF varies
272 SequenceFeature sf = buildSequenceFeature(gffColumns, set);
275 * locate the mapped sequence in the alignment, or as a
276 * (new or existing) virtual sequence in the newseqs list
278 List<String> targets = set.get(TARGET);
281 for (String target : targets)
// NOTE(review): unlike processNucleotideMatch, the whole Target value (not
// just its first space-separated token) is handed to findTargetId here.
// NOTE(review): mappedSequence1 is copied straight into mappedSequence; the
// intermediate local looks redundant.
284 SequenceI mappedSequence1 = findSequence(findTargetId(target, set), align,
285 newseqs, relaxedIdMatching);
286 SequenceI mappedSequence = mappedSequence1;
287 if (mappedSequence == null)
293 * give the mapped sequence a copy of the sequence feature, with
294 * start/end range adjusted
// sf2 is created with the SequenceFeature copy constructor; its end is set
// to the length of the original feature range (end - begin + 1).
296 SequenceFeature sf2 = new SequenceFeature(sf);
298 int sequenceFeatureLength = 1 + sf.getEnd() - sf.getBegin();
299 sf2.setEnd(sequenceFeatureLength);
300 mappedSequence.addSequenceFeature(sf2);
303 * add a property to the mapped sequence so that it can eventually be
304 * renamed with its qualified accession id; renaming has to wait until
305 * all sequence reference resolution is complete
// The qualified id has the form database|accession and is stored on the
// copied feature under RENAME_TOKEN, only when the accession is non-empty.
307 String accessionId = StringUtils.listToDelimitedString(
309 if (accessionId.length() > 0)
311 String database = sf.getType(); // TODO InterProScan only??
312 String qualifiedAccId = database + "|" + accessionId;
313 sf2.setValue(RENAME_TOKEN, qualifiedAccId);
317 * get any existing mapping for these sequences (or start one),
318 * and add this mapped range
// Map the feature range on seq onto positions 1..length of the mapped
// sequence, with a ratio of 1 to 1.
320 AlignedCodonFrame alco = getMapping(align, seq, mappedSequence);
321 int[] from = new int[] { sf.getBegin(), sf.getEnd() };
322 int[] to = new int[] { 1, sequenceFeatureLength };
323 MapList mapping = new MapList(from, to, 1, 1);
325 alco.addMap(seq, mappedSequence, mapping);
326 align.addCodonFrame(alco);
334 * Return '=' as the name-value separator used in column 9 attributes.
// Supplies the character that separates a name from its value(s) in
// column 9; for GFF3 this is documented as the '=' character.
337 protected char getNameValueSeparator()
343 * Modifies the default SequenceFeature in order to set the Target sequence id
// Builds the feature with the superclass implementation, then derives a
// description with getDescription and applies it to the feature.
// The attributes argument may be null: processGff passes null on its
// fallback path, and it is forwarded unchanged to both calls below.
347 protected SequenceFeature buildSequenceFeature(String[] gff,
348 Map<String, List<String>> attributes)
350 SequenceFeature sf = super.buildSequenceFeature(gff, attributes);
351 String desc = getDescription(sf, attributes);
354 sf.setDescription(desc);
360 * Apply heuristic rules to try to get the most useful feature description
// Chooses a description for the feature. The rules are applied in order and
// a later rule that matches overwrites the result of an earlier one.
366 protected String getDescription(SequenceFeature sf,
367 Map<String, List<String>> attributes)
// Rule 1: the first space-separated token of the Target value stored on
// the feature.
370 String target = (String) sf.getValue(TARGET);
373 desc = target.split(" ")[0];
// Rule 2: for types that are a SEQUENCE_VARIANT in the ontology, the
// "alleles" attribute values passed through listToDelimitedString with ",".
// NOTE(review): attributes is dereferenced here and in rule 3, but the
// fallback path in processGff passes null attributes down to this method -
// confirm a null guard exists or add one.
376 SequenceOntologyI so = SequenceOntologyFactory.getInstance();
377 String type = sf.getType();
378 if (so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT))
381 * Ensembl returns dna variants as 'alleles'
383 desc = StringUtils.listToDelimitedString(
384 attributes.get("alleles"), ",");
388 * extract 'Name' for a transcript (to show gene name)
389 * or an exon (so 'colour by label' shows exon boundaries)
// Rule 3: for transcripts and exons, the "Name" attribute values passed
// through listToDelimitedString with ",".
// NOTE(review): the literal "Name" duplicates the NAME constant of this
// class.
391 if (EnsemblSeqProxy.isTranscript(type)
392 || so.isA(type, SequenceOntology.EXON))
394 desc = StringUtils.listToDelimitedString(attributes.get("Name"), ",");