1 package jalview.io.gff;
3 import jalview.datamodel.AlignedCodonFrame;
4 import jalview.datamodel.AlignmentI;
5 import jalview.datamodel.MappingType;
6 import jalview.datamodel.SequenceFeature;
7 import jalview.datamodel.SequenceI;
8 import jalview.util.MapList;
9 import jalview.util.StringUtils;
11 import java.io.IOException;
12 import java.util.List;
16 * Base class with generic / common functionality for processing GFF3 data.
17 * Override this as required for any specialisations resulting from
18 * peculiarities of GFF3 generated by particular tools.
20 public class Gff3Helper extends GffHelperBase
/*
 * Column 9 attribute name for match targets. Its values are looked up from
 * the parsed attributes and split on spaces as "seqid start end [strand]";
 * it is also read back from a built SequenceFeature to set its description.
 */
22 protected static final String TARGET = "Target";
/*
 * Attribute name "ID". Not referenced in the visible part of this class;
 * presumably for use by subclasses - TODO confirm.
 */
24 protected static final String ID = "ID";
/*
 * Attribute name "Name". NOTE(review): presumably the key used to look up
 * the matched entry's accession id when processing a protein match - confirm.
 */
26 private static final String NAME = "Name";
29 * GFF3 uses '=' to delimit name/value pairs in column 9, and comma to
30 * separate multiple values for a name
35 public static Map<String, List<String>> parseNameValuePairs(String text)
37 return parseNameValuePairs(text, ";", '=', ",");
41 * Process one GFF feature line (as modelled by SequenceFeature)
44 * the sequence with which this feature is associated
46 * the sequence feature with ATTRIBUTES property containing any
47 * additional attributes
49 * the alignment we are adding GFF to
51 * any new sequences referenced by the GFF
52 * @param relaxedIdMatching
53 * if true, match word tokens in sequence names
54 * @return true if the sequence feature should be added to the sequence, else
55 * false (i.e. it has been processed in another way e.g. to generate a
60 public SequenceFeature processGff(SequenceI seq, String[] gff,
61 AlignmentI align, List<SequenceI> newseqs,
62 boolean relaxedIdMatching) throws IOException
65 * (For now) we don't process mappings from reverse complement ; to do
66 * this would require (a) creating a virtual sequence placeholder for
67 * the reverse complement (b) resolving the sequence by its id from some
68 * source (GFF ##FASTA or other) (c) creating the reverse complement
69 * sequence (d) updating the mapping to be to the reverse complement
71 if ("-".equals(gff[STRAND_COL]))
74 .println("Skipping mapping from reverse complement as not yet supported");
77 SequenceFeature sf = null;
81 String soTerm = gff[TYPE_COL];
82 String atts = gff[ATTRIBUTES_COL];
83 Map<String, List<String>> attributes = parseNameValuePairs(atts);
85 if (SequenceOntology.getInstance().isProteinMatch(soTerm))
87 sf = processProteinMatch(attributes, seq, gff, align,
88 newseqs, relaxedIdMatching);
90 else if (SequenceOntology.getInstance().isNucleotideMatch(soTerm))
92 sf = processNucleotideMatch(attributes, seq, gff, align,
93 newseqs, relaxedIdMatching);
97 sf = buildSequenceFeature(gff, attributes);
103 * fall back on generating a sequence feature with no special processing
105 sf = buildSequenceFeature(gff, null);
112 * Processes one GFF3 nucleotide (e.g. cDNA to genome) match.
115 * parsed GFF column 9 key/value(s)
117 * the sequence the GFF feature is on
119 * the GFF column data
121 * the alignment the sequence belongs to, where any new mappings
124 * a list of new 'virtual sequences' generated while parsing GFF
125 * @param relaxedIdMatching
126 * if true allow fuzzy search for a matching target sequence
127 * @return a sequence feature, if one should be added to the sequence, else
129 * @throws IOException
131 protected SequenceFeature processNucleotideMatch(
132 Map<String, List<String>> attributes, SequenceI seq,
133 String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
134 boolean relaxedIdMatching)
137 String strand = gffColumns[STRAND_COL];
138 if ("-1".equals(strand))
141 .println("Currently ignoring mappings from reverse complement");
145 List<String> targets = attributes.get(TARGET);
148 System.err.println("'Target' missing in GFF");
153 * Typically we only expect one Target per GFF line, but this can handle
154 * multiple matches, to the same or different sequences (e.g. dna variants)
156 for (String target : targets)
159 * Process "seqid start end [strand]"
161 String[] tokens = target.split(" ");
162 if (tokens.length < 3)
164 System.err.println("Incomplete Target: " + target);
169 * Locate the mapped sequence in the alignment, or as a
170 * (new or existing) virtual sequence in the newseqs list
172 String targetId = findTargetId(tokens[0], attributes);
173 SequenceI mappedSequence1 = findSequence(targetId, align,
174 newseqs, relaxedIdMatching);
175 SequenceI mappedSequence = mappedSequence1;
176 if (mappedSequence == null)
182 * get any existing mapping for these sequences (or start one),
183 * and add this mapped range
185 AlignedCodonFrame acf = getMapping(align, seq, mappedSequence);
189 int toStart = Integer.parseInt(tokens[1]);
190 int toEnd = Integer.parseInt(tokens[2]);
191 if (tokens.length > 3 && "-".equals(tokens[3]))
193 // mapping to reverse strand - swap start/end
199 int fromStart = Integer.parseInt(gffColumns[START_COL]);
200 int fromEnd = Integer.parseInt(gffColumns[END_COL]);
201 MapList mapping = constructMappingFromAlign(fromStart, fromEnd,
203 MappingType.NucleotideToNucleotide);
207 acf.addMap(seq, mappedSequence, mapping);
208 align.addCodonFrame(acf);
210 } catch (NumberFormatException nfe)
212 System.err.println("Invalid start or end in Target " + target);
216 SequenceFeature sf = buildSequenceFeature(gffColumns, attributes);
221 * Returns the target sequence id extracted from the GFF name/value pairs.
222 * Default (standard behaviour) is the first token for "Target". This may be
223 * overridden where tools report this in a non-standard way.
226 * first token of a "Target" value from GFF column 9, typically
229 * a map with all parsed column 9 attributes
232 @SuppressWarnings("unused")
233 protected String findTargetId(String target, Map<String, List<String>> set)
239 * Processes one GFF 'protein_match'; fields of interest are
241 * <li>feature group - the database reporting a match e.g. Pfam</li>
242 * <li>Name - the matched entry's accession id in the database</li>
243 * <li>ID - a sequence identifier for the matched region (which may be
244 * appended as FASTA in the GFF file)</li>
248 * parsed GFF column 9 key/value(s)
250 * the sequence the GFF feature is on
252 * the sequence feature holding GFF data
254 * the alignment the sequence belongs to, where any new mappings
257 * a list of new 'virtual sequences' generated while parsing GFF
258 * @param relaxedIdMatching
259 * if true allow fuzzy search for a matching target sequence
260 * @return the (real or virtual) sequence(s) mapped to by this match
261 * @throws IOException
263 protected SequenceFeature processProteinMatch(
264 Map<String, List<String>> set, SequenceI seq,
265 String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
266 boolean relaxedIdMatching)
268 // This is currently tailored to InterProScan GFF output:
269 // ID holds the ID of the matched sequence, Target references the
270 // query sequence; this looks wrong, as ID should just be the GFF internal
271 // ID of the GFF feature, while Target would normally reference the matched
273 // TODO refactor as needed if other protein-protein GFF varies
275 SequenceFeature sf = buildSequenceFeature(gffColumns, set);
278 * locate the mapped sequence in the alignment, or as a
279 * (new or existing) virtual sequence in the newseqs list
281 List<String> targets = set.get(TARGET);
284 for (String target : targets)
287 SequenceI mappedSequence1 = findSequence(findTargetId(target, set), align,
288 newseqs, relaxedIdMatching);
289 SequenceI mappedSequence = mappedSequence1;
290 if (mappedSequence == null)
296 * give the mapped sequence a copy of the sequence feature, with
297 * start/end range adjusted
299 SequenceFeature sf2 = new SequenceFeature(sf);
301 int sequenceFeatureLength = 1 + sf.getEnd() - sf.getBegin();
302 sf2.setEnd(sequenceFeatureLength);
303 mappedSequence.addSequenceFeature(sf2);
306 * add a property to the mapped sequence so that it can eventually be
307 * renamed with its qualified accession id; renaming has to wait until
308 * all sequence reference resolution is complete
310 String accessionId = StringUtils.listToDelimitedString(
312 if (accessionId.length() > 0)
314 String database = sf.getType(); // TODO InterProScan only??
315 String qualifiedAccId = database + "|" + accessionId;
316 sf2.setValue(RENAME_TOKEN, qualifiedAccId);
320 * get any existing mapping for these sequences (or start one),
321 * and add this mapped range
323 AlignedCodonFrame alco = getMapping(align, seq, mappedSequence);
324 int[] from = new int[] { sf.getBegin(), sf.getEnd() };
325 int[] to = new int[] { 1, sequenceFeatureLength };
326 MapList mapping = new MapList(from, to, 1, 1);
328 alco.addMap(seq, mappedSequence, mapping);
329 align.addCodonFrame(alco);
337 * Return '=' as the name-value separator used in column 9 attributes.
340 protected char getNameValueSeparator()
346 * Modifies the default SequenceFeature in order to set the Target sequence id
350 protected SequenceFeature buildSequenceFeature(String[] gff,
351 Map<String, List<String>> attributes)
353 SequenceFeature sf = super.buildSequenceFeature(gff, attributes);
354 String target = (String) sf.getValue(TARGET);
357 sf.setDescription(target.split(" ")[0]);