2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.io.gff;
23 import jalview.datamodel.AlignedCodonFrame;
24 import jalview.datamodel.AlignmentI;
25 import jalview.datamodel.MappingType;
26 import jalview.datamodel.SequenceFeature;
27 import jalview.datamodel.SequenceI;
28 import jalview.util.MapList;
29 import jalview.util.StringUtils;
31 import java.io.IOException;
32 import java.util.List;
36 * Base class with generic / common functionality for processing GFF3 data.
37 * Override this as required for any specialisations resulting from
38 * peculiarities of GFF3 generated by particular tools.
// Gff3Helper specialises GffHelperBase; the constants below are the attribute
// names this class looks up when processing GFF3 lines.
40 public class Gff3Helper extends GffHelperBase
// looked up by getDescription for sequence variant features
42 public static final String ALLELES = "alleles";
// looked up by the match-processing methods and by getDescription
44 protected static final String TARGET = "Target";
// looked up by getDescription as the fallback description
46 protected static final String ID = "ID";
// looked up by processProteinMatch to obtain the matched entry accession id
48 private static final String NAME = "Name";
51 * GFF3 uses ';' to separate name/value pairs in column 9, '=' to delimit a
52 * name from its value(s), and comma to separate multiple values for a name
// Convenience overload for GFF3 text: delegates to the four-argument
// parseNameValuePairs overload (not defined in this listing), passing the
// three GFF3 separator values.
57 public static Map<String, List<String>> parseNameValuePairs(String text)
59 return parseNameValuePairs(text, ";", '=', ",");
63 * Process one GFF feature line (as modelled by SequenceFeature)
66 * the sequence with which this feature is associated
68 * the sequence feature with ATTRIBUTES property containing any
69 * additional attributes
71 * the alignment we are adding GFF to
73 * any new sequences referenced by the GFF
74 * @param relaxedIdMatching
75 * if true, match word tokens in sequence names
76 * @return the sequence feature that should be added to the sequence, if any
77 * (the line may instead be processed in another way, e.g. to generate a mapping)
// Turns one GFF3 line (already split into columns) into a sequence feature.
// The type column is tested against the Sequence Ontology: types that are a
// PROTEIN_MATCH or a NUCLEOTIDE_MATCH are handed to processProteinMatch and
// processNucleotideMatch respectively; any other type goes straight to
// buildSequenceFeature together with the parsed attributes column.
// NOTE(review): several lines are absent from this listing (the condition that
// selects the null-attributes fallback at the end, and the return statement);
// confirm against the complete source before relying on the control flow.
82 public SequenceFeature processGff(SequenceI seq, String[] gff,
83 AlignmentI align, List<SequenceI> newseqs,
84 boolean relaxedIdMatching) throws IOException
86 SequenceFeature sf = null;
// the type term and the raw attributes text are read from fixed columns
90 String soTerm = gff[TYPE_COL];
91 String atts = gff[ATTRIBUTES_COL];
92 Map<String, List<String>> attributes = parseNameValuePairs(atts);
94 SequenceOntologyI so = SequenceOntologyFactory.getInstance();
95 if (so.isA(soTerm, SequenceOntologyI.PROTEIN_MATCH))
97 sf = processProteinMatch(attributes, seq, gff, align, newseqs,
100 else if (so.isA(soTerm, SequenceOntologyI.NUCLEOTIDE_MATCH))
102 sf = processNucleotideMatch(attributes, seq, gff, align, newseqs,
107 sf = buildSequenceFeature(gff, attributes);
113 * fall back on generating a sequence feature with no special processing
// the fallback passes null instead of a parsed attributes map
115 sf = buildSequenceFeature(gff, null);
122 * Processes one GFF3 nucleotide (e.g. cDNA to genome) match.
125 * parsed GFF column 9 key/value(s)
127 * the sequence the GFF feature is on
129 * the GFF column data
131 * the alignment the sequence belongs to, where any new mappings
134 * a list of new 'virtual sequences' generated while parsing GFF
135 * @param relaxedIdMatching
136 * if true allow fuzzy search for a matching target sequence
137 * @return a sequence feature, if one should be added to the sequence, else
139 * @throws IOException
// Handles a nucleotide match line. A minus strand is reported as unsupported
// and a missing Target attribute is reported as an error. Otherwise each
// Target value is split on spaces into id, start, end and an optional strand;
// the target sequence is looked up via findTargetId and findSequence; a
// nucleotide-to-nucleotide MapList is built from the feature start/end columns
// and the Target start/end; the MapList is added to the AlignedCodonFrame
// obtained from getMapping, which is then added to the alignment. Non-numeric
// start/end values are reported through the NumberFormatException handler.
// Finally a sequence feature is built from the columns and attributes.
// NOTE(review): lines are absent from this listing (the statements following
// each error message, the handling of a null mapped sequence, the try keyword,
// the start/end swap and the return); confirm against the complete source.
141 protected SequenceFeature processNucleotideMatch(
142 Map<String, List<String>> attributes, SequenceI seq,
143 String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
144 boolean relaxedIdMatching) throws IOException
// strand of the feature, taken from the strand column of the GFF line
146 String strand = gffColumns[STRAND_COL];
149 * (For now) we don't process mappings from reverse complement ; to do
150 * this would require (a) creating a virtual sequence placeholder for
151 * the reverse complement (b) resolving the sequence by its id from some
152 * source (GFF ##FASTA or other) (c) creating the reverse complement
153 * sequence (d) updating the mapping to be to the reverse complement
155 if ("-".equals(strand))
157 jalview.bin.Console.errPrintln(
158 "Skipping mapping from reverse complement as not yet supported");
162 List<String> targets = attributes.get(TARGET);
165 jalview.bin.Console.errPrintln("'Target' missing in GFF");
170 * Typically we only expect one Target per GFF line, but this can handle
171 * multiple matches, to the same or different sequences (e.g. dna variants)
173 for (String target : targets)
176 * Process "seqid start end [strand]"
178 String[] tokens = target.split(" ");
179 if (tokens.length < 3)
181 jalview.bin.Console.errPrintln("Incomplete Target: " + target);
186 * Locate the mapped sequence in the alignment, or as a
187 * (new or existing) virtual sequence in the newseqs list
189 String targetId = findTargetId(tokens[0], attributes);
190 SequenceI mappedSequence1 = findSequence(targetId, align, newseqs,
192 SequenceI mappedSequence = mappedSequence1;
193 if (mappedSequence == null)
199 * get any existing mapping for these sequences (or start one),
200 * and add this mapped range
202 AlignedCodonFrame acf = getMapping(align, seq, mappedSequence);
206 int toStart = Integer.parseInt(tokens[1]);
207 int toEnd = Integer.parseInt(tokens[2]);
208 if (tokens.length > 3 && "-".equals(tokens[3]))
210 // mapping to reverse strand - swap start/end
216 int fromStart = Integer.parseInt(gffColumns[START_COL]);
217 int fromEnd = Integer.parseInt(gffColumns[END_COL]);
218 MapList mapping = constructMappingFromAlign(fromStart, fromEnd,
219 toStart, toEnd, MappingType.NucleotideToNucleotide);
223 acf.addMap(seq, mappedSequence, mapping);
224 align.addCodonFrame(acf);
226 } catch (NumberFormatException nfe)
229 .errPrintln("Invalid start or end in Target " + target);
233 SequenceFeature sf = buildSequenceFeature(gffColumns, attributes);
238 * Returns the target sequence id extracted from the GFF name/value pairs.
239 * Default (standard behaviour) is the first token for "Target". This may be
240 * overridden where tools report this in a non-standard way.
243 * first token of a "Target" value from GFF column 9, typically
246 * a map with all parsed column 9 attributes
// Hook for resolving the id of the target sequence; both match-processing
// methods call it before looking the sequence up with findSequence.
// NOTE(review): the method body is absent from this listing. Going by the
// unused-warning suppression and the existing description, the default
// presumably returns the given target token unchanged and ignores the
// attribute map - confirm against the complete source.
249 @SuppressWarnings("unused")
250 protected String findTargetId(String target,
251 Map<String, List<String>> set)
257 * Processes one GFF 'protein_match'; fields of interest are
259 * <li>feature group - the database reporting a match e.g. Pfam</li>
260 * <li>Name - the matched entry's accession id in the database</li>
261 * <li>ID - a sequence identifier for the matched region (which may be
262 * appended as FASTA in the GFF file)</li>
266 * parsed GFF column 9 key/value(s)
268 * the sequence the GFF feature is on
270 * the sequence feature holding GFF data
272 * the alignment the sequence belongs to, where any new mappings
275 * a list of new 'virtual sequences' generated while parsing GFF
276 * @param relaxedIdMatching
277 * if true allow fuzzy search for a matching target sequence
278 * @return the sequence feature for this match (the mapped sequence(s) are updated in place)
279 * @throws IOException
// Handles a protein match line. A sequence feature is built from the columns
// and attributes; then, for each Target value, the mapped sequence is looked
// up, given a copy of the feature renumbered to start at position 1, and a
// mapping between the feature range on seq and the copied range is added to
// the alignment.
// NOTE(review): lines are absent from this listing (any null check on the
// Target list, the handling of a null mapped sequence, and the return);
// confirm against the complete source.
281 protected SequenceFeature processProteinMatch(
282 Map<String, List<String>> set, SequenceI seq, String[] gffColumns,
283 AlignmentI align, List<SequenceI> newseqs,
284 boolean relaxedIdMatching)
286 // This is currently tailored to InterProScan GFF output:
287 // ID holds the ID of the matched sequence, Target references the
288 // query sequence; this looks wrong, as ID should just be the GFF internal
289 // ID of the GFF feature, while Target would normally reference the matched
291 // TODO refactor as needed if other protein-protein GFF varies
293 SequenceFeature sf = buildSequenceFeature(gffColumns, set);
296 * locate the mapped sequence in the alignment, or as a
297 * (new or existing) virtual sequence in the newseqs list
299 List<String> targets = set.get(TARGET);
302 for (String target : targets)
305 SequenceI mappedSequence1 = findSequence(findTargetId(target, set),
306 align, newseqs, relaxedIdMatching);
307 SequenceI mappedSequence = mappedSequence1;
308 if (mappedSequence == null)
314 * give the mapped sequence a copy of the sequence feature, with
315 * start/end range adjusted
// length of the feature on seq; the copy below spans 1..length
317 int sequenceFeatureLength = 1 + sf.getEnd() - sf.getBegin();
318 SequenceFeature sf2 = new SequenceFeature(sf, 1,
319 sequenceFeatureLength, sf.getFeatureGroup(), sf.getScore());
320 mappedSequence.addSequenceFeature(sf2);
323 * add a property to the mapped sequence so that it can eventually be
324 * renamed with its qualified accession id; renaming has to wait until
325 * all sequence reference resolution is complete
// all Name values joined with commas; the rename token is only relevant
// when this string is non-empty
327 String accessionId = StringUtils
328 .listToDelimitedString(set.get(NAME), ",");
329 if (accessionId.length() > 0)
// the qualified id is the feature type, a pipe character, then the accession
331 String database = sf.getType(); // TODO InterProScan only??
332 String qualifiedAccId = database + "|" + accessionId;
333 sf2.setValue(RENAME_TOKEN, qualifiedAccId);
337 * get any existing mapping for these sequences (or start one),
338 * and add this mapped range
340 AlignedCodonFrame alco = getMapping(align, seq, mappedSequence);
// from: feature range on seq; to: range of the copy on the mapped sequence
341 int[] from = new int[] { sf.getBegin(), sf.getEnd() };
342 int[] to = new int[] { 1, sequenceFeatureLength };
343 MapList mapping = new MapList(from, to, 1, 1);
345 alco.addMap(seq, mappedSequence, mapping);
346 align.addCodonFrame(alco);
354 * Modifies the default SequenceFeature in order to set the Target sequence id
// Builds the feature via the superclass implementation, then gives it the
// description chosen by getDescription.
// NOTE(review): lines between the getDescription call and setDescription, and
// the return, are absent from this listing, so any guard on the description
// value cannot be seen here - confirm against the complete source.
358 protected SequenceFeature buildSequenceFeature(String[] gff,
359 int typeColumn, String group,
360 Map<String, List<String>> attributes)
362 SequenceFeature sf = super.buildSequenceFeature(gff, typeColumn, group,
364 String desc = getDescription(sf, attributes);
367 sf.setDescription(desc);
373 * Apply heuristic rules to try to get the most useful feature description
// Chooses a description for the feature. Candidates are assigned in sequence,
// so a later rule replaces an earlier one: the first space-separated token of
// the Target value; the comma-joined alleles values when the type is a
// SEQUENCE_VARIANT; the comma-joined Name values when the type is the NMD
// transcript variant term, a TRANSCRIPT or an EXON. The ID value is the
// fallback, and the result is passed through StringUtils.urlDecode.
// NOTE(review): the declaration of desc, the guards around the Target and ID
// assignments, and the return are absent from this listing - confirm against
// the complete source.
// NOTE(review): attributes is dereferenced without a visible null check, while
// processGff passes null attributes to a buildSequenceFeature overload on its
// fallback path - confirm that path cannot reach the attribute lookups below.
379 protected String getDescription(SequenceFeature sf,
380 Map<String, List<String>> attributes)
383 String target = (String) sf.getValue(TARGET);
386 desc = target.split(" ")[0];
389 SequenceOntologyI so = SequenceOntologyFactory.getInstance();
390 String type = sf.getType();
391 if (so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT))
394 * Ensembl returns dna variants as 'alleles'
396 desc = StringUtils.listToDelimitedString(attributes.get(ALLELES),
401 * extract 'Name' for a transcript (to show gene name)
402 * or an exon (so 'colour by label' shows exon boundaries)
404 if (SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(type)
405 || so.isA(type, SequenceOntologyI.TRANSCRIPT)
406 || so.isA(type, SequenceOntologyI.EXON))
408 desc = StringUtils.listToDelimitedString(attributes.get("Name"), ",");
412 * if the above fails, try ID
416 desc = (String) sf.getValue(ID);
420 * and decode comma, equals, semi-colon as required by GFF3 spec
422 desc = StringUtils.urlDecode(desc, GFF_ENCODABLE);