/*
 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
 * Copyright (C) $$Year-Rel$$ The Jalview Authors
 *
 * This file is part of Jalview.
 *
 * Jalview is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *
 * Jalview is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE.  See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
 * The Jalview Authors are detailed in the 'AUTHORS' file.
 */
21 package jalview.io.gff;
23 import jalview.datamodel.AlignedCodonFrame;
24 import jalview.datamodel.AlignmentI;
25 import jalview.datamodel.MappingType;
26 import jalview.datamodel.SequenceFeature;
27 import jalview.datamodel.SequenceI;
28 import jalview.util.MapList;
29 import jalview.util.StringUtils;
31 import java.io.IOException;
32 import java.util.List;
/**
 * Base class with generic / common functionality for processing GFF3 data.
 * Override this as required for any specialisations resulting from
 * peculiarities of GFF3 generated by particular tools.
 */
40 public class Gff3Helper extends GffHelperBase
42 protected static final String TARGET = "Target";
44 protected static final String ID = "ID";
46 private static final String NAME = "Name";
49 * GFF3 uses '=' to delimit name/value pairs in column 9, and comma to
50 * separate multiple values for a name
55 public static Map<String, List<String>> parseNameValuePairs(String text)
57 return parseNameValuePairs(text, ";", '=', ",");
/**
 * Process one GFF feature line (supplied as its column values)
 *
 * @param seq
 *          the sequence with which this feature is associated
 * @param gff
 *          the GFF column data; column 9 holds any additional attributes
 * @param align
 *          the alignment we are adding GFF to
 * @param newseqs
 *          any new sequences referenced by the GFF
 * @param relaxedIdMatching
 *          if true, match word tokens in sequence names
 * @return a sequence feature, if one should be added to the sequence, else
 *         null (i.e. the line has been processed in another way e.g. to
 *         generate a mapping)
 * @throws IOException
 */
// Handles one GFF3 line: the type column's term is checked against the
// Sequence Ontology; protein and nucleotide matches go to their dedicated
// methods, any other type is turned directly into a sequence feature.
// NOTE(review): this listing has lost lines (block delimiters, call
// continuations, the test that selects the fall-back branch and the return
// statement) - restore them from the upstream file before compiling.
80 public SequenceFeature processGff(SequenceI seq, String[] gff,
81 AlignmentI align, List<SequenceI> newseqs,
82 boolean relaxedIdMatching) throws IOException
84 SequenceFeature sf = null;
// the type column supplies the term that is tested against the ontology
88 String soTerm = gff[TYPE_COL];
// attributes column text, parsed with the GFF3 delimiters
89 String atts = gff[ATTRIBUTES_COL];
90 Map<String, List<String>> attributes = parseNameValuePairs(atts);
92 SequenceOntologyI so = SequenceOntologyFactory.getInstance();
// terms the ontology classes as a protein match get match processing
93 if (so.isA(soTerm, SequenceOntologyI.PROTEIN_MATCH))
95 sf = processProteinMatch(attributes, seq, gff, align, newseqs,
// likewise for terms classed as a nucleotide match
98 else if (so.isA(soTerm, SequenceOntologyI.NUCLEOTIDE_MATCH))
100 sf = processNucleotideMatch(attributes, seq, gff, align, newseqs,
// any other type: build the feature, passing along the parsed attributes
105 sf = buildSequenceFeature(gff, attributes);
// NOTE(review): the condition guarding the fall-back below is not visible
// in this listing; here the feature is built without any attributes
111 * fall back on generating a sequence feature with no special processing
113 sf = buildSequenceFeature(gff, null);
/**
 * Processes one GFF3 nucleotide (e.g. cDNA to genome) match.
 *
 * @param attributes
 *          parsed GFF column 9 key/value(s)
 * @param seq
 *          the sequence the GFF feature is on
 * @param gffColumns
 *          the GFF column data
 * @param align
 *          the alignment the sequence belongs to, where any new mappings
 *          should be added
 * @param newseqs
 *          a list of new 'virtual sequences' generated while parsing GFF
 * @param relaxedIdMatching
 *          if true allow fuzzy search for a matching target sequence
 * @return a sequence feature, if one should be added to the sequence, else
 *         null
 * @throws IOException
 */
// Turns each "Target" attribute value of a nucleotide match into a mapping
// between the GFF start/end range on seq and the target range on the mapped
// sequence, registers the mapping with the alignment, then builds the
// sequence feature for the line.
// NOTE(review): this listing has lost lines (block delimiters, the test on
// 'targets', the statements that leave a branch early, the try opener, the
// start/end swap and the return) - restore them from the upstream file.
139 protected SequenceFeature processNucleotideMatch(
140 Map<String, List<String>> attributes, SequenceI seq,
141 String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
142 boolean relaxedIdMatching) throws IOException
// the strand column decides whether this match is processed at all
144 String strand = gffColumns[STRAND_COL];
147 * (For now) we don't process mappings from reverse complement ; to do
148 * this would require (a) creating a virtual sequence placeholder for
149 * the reverse complement (b) resolving the sequence by its id from some
150 * source (GFF ##FASTA or other) (c) creating the reverse complement
151 * sequence (d) updating the mapping to be to the reverse complement
153 if ("-".equals(strand))
156 "Skipping mapping from reverse complement as not yet supported");
// every value of the Target attribute describes one mapped range
160 List<String> targets = attributes.get(TARGET);
163 System.err.println("'Target' missing in GFF");
168 * Typically we only expect one Target per GFF line, but this can handle
169 * multiple matches, to the same or different sequences (e.g. dna variants)
171 for (String target : targets)
174 * Process "seqid start end [strand]"
176 String[] tokens = target.split(" ");
// seqid, start and end are all needed, so fewer than three tokens is reported
177 if (tokens.length < 3)
179 System.err.println("Incomplete Target: " + target);
184 * Locate the mapped sequence in the alignment, or as a
185 * (new or existing) virtual sequence in the newseqs list
// findTargetId is a hook that subclasses may override for non-standard ids
187 String targetId = findTargetId(tokens[0], attributes);
188 SequenceI mappedSequence1 = findSequence(targetId, align, newseqs,
190 SequenceI mappedSequence = mappedSequence1;
191 if (mappedSequence == null)
197 * get any existing mapping for these sequences (or start one),
198 * and add this mapped range
200 AlignedCodonFrame acf = getMapping(align, seq, mappedSequence);
// target range; a non-numeric token is reported by the catch further down
204 int toStart = Integer.parseInt(tokens[1]);
205 int toEnd = Integer.parseInt(tokens[2]);
// an optional fourth token of "-" marks a reverse strand target
206 if (tokens.length > 3 && "-".equals(tokens[3]))
208 // mapping to reverse strand - swap start/end
// source range comes from the start and end columns of the GFF line
214 int fromStart = Integer.parseInt(gffColumns[START_COL]);
215 int fromEnd = Integer.parseInt(gffColumns[END_COL]);
216 MapList mapping = constructMappingFromAlign(fromStart, fromEnd,
217 toStart, toEnd, MappingType.NucleotideToNucleotide);
// NOTE(review): lines just before the next statement are missing here -
// check upstream whether the mapping is validated before it is added
221 acf.addMap(seq, mappedSequence, mapping);
222 align.addCodonFrame(acf);
224 } catch (NumberFormatException nfe)
226 System.err.println("Invalid start or end in Target " + target);
// the feature for the line is built from the columns and parsed attributes
230 SequenceFeature sf = buildSequenceFeature(gffColumns, attributes);
235 * Returns the target sequence id extracted from the GFF name/value pairs.
236 * Default (standard behaviour) is the first token for "Target". This may be
237 * overridden where tools report this in a non-standard way.
240 * first token of a "Target" value from GFF column 9, typically
243 * a map with all parsed column 9 attributes
246 @SuppressWarnings("unused")
247 protected String findTargetId(String target,
248 Map<String, List<String>> set)
/**
 * Processes one GFF 'protein_match'; fields of interest are
 * <ul>
 * <li>feature group - the database reporting a match e.g. Pfam</li>
 * <li>Name - the matched entry's accession id in the database</li>
 * <li>ID - a sequence identifier for the matched region (which may be
 * appended as FASTA in the GFF file)</li>
 * </ul>
 *
 * @param set
 *          parsed GFF column 9 key/value(s)
 * @param seq
 *          the sequence the GFF feature is on
 * @param gffColumns
 *          the GFF column data
 * @param align
 *          the alignment the sequence belongs to, where any new mappings
 *          should be added
 * @param newseqs
 *          a list of new 'virtual sequences' generated while parsing GFF
 * @param relaxedIdMatching
 *          if true allow fuzzy search for a matching target sequence
 * @return the sequence feature built for this match
 */
// Builds the sequence feature for a protein match and, for each "Target"
// value, gives the mapped sequence a repositioned copy of that feature and
// registers a mapping between the feature's range on seq and the copy's
// range on the mapped sequence.
// NOTE(review): this listing has lost lines (block delimiters, whatever
// guards the use of 'targets', the statement that follows the null test on
// the mapped sequence, and the return) - restore them from upstream.
278 protected SequenceFeature processProteinMatch(
279 Map<String, List<String>> set, SequenceI seq, String[] gffColumns,
280 AlignmentI align, List<SequenceI> newseqs,
281 boolean relaxedIdMatching)
283 // This is currently tailored to InterProScan GFF output:
284 // ID holds the ID of the matched sequence, Target references the
285 // query sequence; this looks wrong, as ID should just be the GFF internal
286 // ID of the GFF feature, while Target would normally reference the matched
288 // TODO refactor as needed if other protein-protein GFF varies
290 SequenceFeature sf = buildSequenceFeature(gffColumns, set);
293 * locate the mapped sequence in the alignment, or as a
294 * (new or existing) virtual sequence in the newseqs list
296 List<String> targets = set.get(TARGET);
299 for (String target : targets)
// NOTE(review): the whole Target value is passed to findTargetId here,
// whereas the nucleotide match code passes only its first token - confirm
302 SequenceI mappedSequence1 = findSequence(findTargetId(target, set),
303 align, newseqs, relaxedIdMatching);
304 SequenceI mappedSequence = mappedSequence1;
305 if (mappedSequence == null)
311 * give the mapped sequence a copy of the sequence feature, with
312 * start/end range adjusted
// number of positions the feature spans, counting both ends
314 int sequenceFeatureLength = 1 + sf.getEnd() - sf.getBegin();
// the copy runs from 1 to that length and keeps the group and score
315 SequenceFeature sf2 = new SequenceFeature(sf, 1,
316 sequenceFeatureLength, sf.getFeatureGroup(), sf.getScore());
317 mappedSequence.addSequenceFeature(sf2);
320 * add a property to the mapped sequence so that it can eventually be
321 * renamed with its qualified accession id; renaming has to wait until
322 * all sequence reference resolution is complete
// all values of the Name attribute, joined with commas
324 String accessionId = StringUtils
325 .listToDelimitedString(set.get(NAME), ",");
326 if (accessionId.length() > 0)
// NOTE(review): the feature type is used as the database name, while the
// method description names the feature group as the database - confirm
328 String database = sf.getType(); // TODO InterProScan only??
329 String qualifiedAccId = database + "|" + accessionId;
// the value is stored on the copied feature under RENAME_TOKEN
330 sf2.setValue(RENAME_TOKEN, qualifiedAccId);
334 * get any existing mapping for these sequences (or start one),
335 * and add this mapped range
337 AlignedCodonFrame alco = getMapping(align, seq, mappedSequence);
// the feature's range on seq corresponds to 1..length on the mapped sequence
338 int[] from = new int[] { sf.getBegin(), sf.getEnd() };
339 int[] to = new int[] { 1, sequenceFeatureLength };
// presumably the trailing 1, 1 give a one-to-one ratio - confirm in MapList
340 MapList mapping = new MapList(from, to, 1, 1);
342 alco.addMap(seq, mappedSequence, mapping);
343 align.addCodonFrame(alco);
351 * Return '=' as the name-value separator used in column 9 attributes.
354 protected char getNameValueSeparator()
// Builds the feature with the superclass implementation, then sets its
// description to the value chosen by getDescription.
// NOTE(review): this listing has lost lines (comment and block delimiters,
// the end of the super call, two lines ahead of setDescription - possibly a
// guard on desc - and the return); restore them from the upstream file.
360 * Modifies the default SequenceFeature in order to set the Target sequence id
364 protected SequenceFeature buildSequenceFeature(String[] gff,
365 int typeColumn, String group,
366 Map<String, List<String>> attributes)
// same arguments are handed on to the superclass
368 SequenceFeature sf = super.buildSequenceFeature(gff, typeColumn, group,
// the description is derived from the built feature and the attributes
370 String desc = getDescription(sf, attributes);
373 sf.setDescription(desc);
379 * Apply heuristic rules to try to get the most useful feature description
385 protected String getDescription(SequenceFeature sf,
386 Map<String, List<String>> attributes)
389 String target = (String) sf.getValue(TARGET);
392 desc = target.split(" ")[0];
395 SequenceOntologyI so = SequenceOntologyFactory.getInstance();
396 String type = sf.getType();
397 if (so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT))
400 * Ensembl returns dna variants as 'alleles'
402 desc = StringUtils.listToDelimitedString(attributes.get("alleles"),
407 * extract 'Name' for a transcript (to show gene name)
408 * or an exon (so 'colour by label' shows exon boundaries)
410 if (SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(type)
411 || so.isA(type, SequenceOntologyI.TRANSCRIPT)
412 || so.isA(type, SequenceOntologyI.EXON))
414 desc = StringUtils.listToDelimitedString(attributes.get("Name"), ",");
418 * if the above fails, try ID
422 desc = (String) sf.getValue(ID);