--- /dev/null
+package jalview.io.gff;
+
+import jalview.datamodel.AlignedCodonFrame;
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.MappingType;
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
+import jalview.ext.ensembl.EnsemblSeqProxy;
+import jalview.util.MapList;
+import jalview.util.StringUtils;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Base class with generic / common functionality for processing GFF3 data.
+ * Override this as required for any specialisations resulting from
+ * peculiarities of GFF3 generated by particular tools.
+ */
+public class Gff3Helper extends GffHelperBase
+{
+ protected static final String TARGET = "Target";
+
+ protected static final String ID = "ID";
+
+ private static final String NAME = "Name";
+
+ /**
+ * GFF3 uses '=' to delimit name/value pairs in column 9, and comma to
+ * separate multiple values for a name
+ *
+ * @param text
+ * @return
+ */
+ public static Map<String, List<String>> parseNameValuePairs(String text)
+ {
+ return parseNameValuePairs(text, ";", '=', ",");
+ }
+
+ /**
+ * Process one GFF feature line (as modelled by SequenceFeature)
+ *
+ * @param seq
+ * the sequence with which this feature is associated
+ * @param sf
+ * the sequence feature with ATTRIBUTES property containing any
+ * additional attributes
+ * @param align
+ * the alignment we are adding GFF to
+ * @param newseqs
+ * any new sequences referenced by the GFF
+ * @param relaxedIdMatching
+ * if true, match word tokens in sequence names
+ * @return true if the sequence feature should be added to the sequence, else
+ * false (i.e. it has been processed in another way e.g. to generate a
+ * mapping)
+ * @throws IOException
+ */
+ @Override
+ public SequenceFeature processGff(SequenceI seq, String[] gff,
+ AlignmentI align, List<SequenceI> newseqs,
+ boolean relaxedIdMatching) throws IOException
+ {
+ SequenceFeature sf = null;
+
+ if (gff.length == 9)
+ {
+ String soTerm = gff[TYPE_COL];
+ String atts = gff[ATTRIBUTES_COL];
+ Map<String, List<String>> attributes = parseNameValuePairs(atts);
+
+ SequenceOntologyI so = SequenceOntologyFactory.getInstance();
+ if (so.isA(soTerm, SequenceOntologyI.PROTEIN_MATCH))
+ {
+ sf = processProteinMatch(attributes, seq, gff, align, newseqs,
+ relaxedIdMatching);
+ }
+ else if (so.isA(soTerm, SequenceOntologyI.NUCLEOTIDE_MATCH))
+ {
+ sf = processNucleotideMatch(attributes, seq, gff, align,
+ newseqs, relaxedIdMatching);
+ }
+ else
+ {
+ sf = buildSequenceFeature(gff, attributes);
+ }
+ }
+ else
+ {
+ /*
+ * fall back on generating a sequence feature with no special processing
+ */
+ sf = buildSequenceFeature(gff, null);
+ }
+
+ return sf;
+ }
+
+ /**
+ * Processes one GFF3 nucleotide (e.g. cDNA to genome) match.
+ *
+ * @param attributes
+ * parsed GFF column 9 key/value(s)
+ * @param seq
+ * the sequence the GFF feature is on
+ * @param gffColumns
+ * the GFF column data
+ * @param align
+ * the alignment the sequence belongs to, where any new mappings
+ * should be added
+ * @param newseqs
+ * a list of new 'virtual sequences' generated while parsing GFF
+ * @param relaxedIdMatching
+ * if true allow fuzzy search for a matching target sequence
+ * @return a sequence feature, if one should be added to the sequence, else
+ * null
+ * @throws IOException
+ */
+ protected SequenceFeature processNucleotideMatch(
+ Map<String, List<String>> attributes, SequenceI seq,
+ String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
+ boolean relaxedIdMatching)
+ throws IOException
+ {
+ String strand = gffColumns[STRAND_COL];
+
+ /*
+ * (For now) we don't process mappings from reverse complement ; to do
+ * this would require (a) creating a virtual sequence placeholder for
+ * the reverse complement (b) resolving the sequence by its id from some
+ * source (GFF ##FASTA or other) (c) creating the reverse complement
+ * sequence (d) updating the mapping to be to the reverse complement
+ */
+ if ("-".equals(strand))
+ {
+ System.err
+ .println("Skipping mapping from reverse complement as not yet supported");
+ return null;
+ }
+
+ List<String> targets = attributes.get(TARGET);
+ if (targets == null)
+ {
+ System.err.println("'Target' missing in GFF");
+ return null;
+ }
+
+ /*
+ * Typically we only expect one Target per GFF line, but this can handle
+ * multiple matches, to the same or different sequences (e.g. dna variants)
+ */
+ for (String target : targets)
+ {
+ /*
+ * Process "seqid start end [strand]"
+ */
+ String[] tokens = target.split(" ");
+ if (tokens.length < 3)
+ {
+ System.err.println("Incomplete Target: " + target);
+ continue;
+ }
+
+ /*
+ * Locate the mapped sequence in the alignment, or as a
+ * (new or existing) virtual sequence in the newseqs list
+ */
+ String targetId = findTargetId(tokens[0], attributes);
+ SequenceI mappedSequence1 = findSequence(targetId, align,
+ newseqs, relaxedIdMatching);
+ SequenceI mappedSequence = mappedSequence1;
+ if (mappedSequence == null)
+ {
+ continue;
+ }
+
+ /*
+ * get any existing mapping for these sequences (or start one),
+ * and add this mapped range
+ */
+ AlignedCodonFrame acf = getMapping(align, seq, mappedSequence);
+
+ try
+ {
+ int toStart = Integer.parseInt(tokens[1]);
+ int toEnd = Integer.parseInt(tokens[2]);
+ if (tokens.length > 3 && "-".equals(tokens[3]))
+ {
+ // mapping to reverse strand - swap start/end
+ int temp = toStart;
+ toStart = toEnd;
+ toEnd = temp;
+ }
+
+ int fromStart = Integer.parseInt(gffColumns[START_COL]);
+ int fromEnd = Integer.parseInt(gffColumns[END_COL]);
+ MapList mapping = constructMappingFromAlign(fromStart, fromEnd,
+ toStart, toEnd,
+ MappingType.NucleotideToNucleotide);
+
+ if (mapping != null)
+ {
+ acf.addMap(seq, mappedSequence, mapping);
+ align.addCodonFrame(acf);
+ }
+ } catch (NumberFormatException nfe)
+ {
+ System.err.println("Invalid start or end in Target " + target);
+ }
+ }
+
+ SequenceFeature sf = buildSequenceFeature(gffColumns, attributes);
+ return sf;
+ }
+
+ /**
+ * Returns the target sequence id extracted from the GFF name/value pairs.
+ * Default (standard behaviour) is the first token for "Target". This may be
+ * overridden where tools report this in a non-standard way.
+ *
+ * @param target
+ * first token of a "Target" value from GFF column 9, typically
+ * "seqid start end"
+ * @param set
+ * a map with all parsed column 9 attributes
+ * @return
+ */
+ @SuppressWarnings("unused")
+ protected String findTargetId(String target, Map<String, List<String>> set)
+ {
+ return target;
+ }
+
+ /**
+ * Processes one GFF 'protein_match'; fields of interest are
+ * <ul>
+ * <li>feature group - the database reporting a match e.g. Pfam</li>
+ * <li>Name - the matched entry's accession id in the database</li>
+ * <li>ID - a sequence identifier for the matched region (which may be
+ * appended as FASTA in the GFF file)</li>
+ * </ul>
+ *
+ * @param set
+ * parsed GFF column 9 key/value(s)
+ * @param seq
+ * the sequence the GFF feature is on
+ * @param gffColumns
+ * the sequence feature holding GFF data
+ * @param align
+ * the alignment the sequence belongs to, where any new mappings
+ * should be added
+ * @param newseqs
+ * a list of new 'virtual sequences' generated while parsing GFF
+ * @param relaxedIdMatching
+ * if true allow fuzzy search for a matching target sequence
+ * @return the (real or virtual) sequence(s) mapped to by this match
+ * @throws IOException
+ */
+ protected SequenceFeature processProteinMatch(
+ Map<String, List<String>> set, SequenceI seq,
+ String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
+ boolean relaxedIdMatching)
+ {
+ // This is currently tailored to InterProScan GFF output:
+ // ID holds the ID of the matched sequence, Target references the
+ // query sequence; this looks wrong, as ID should just be the GFF internal
+ // ID of the GFF feature, while Target would normally reference the matched
+ // sequence.
+ // TODO refactor as needed if other protein-protein GFF varies
+
+ SequenceFeature sf = buildSequenceFeature(gffColumns, set);
+
+ /*
+ * locate the mapped sequence in the alignment, or as a
+ * (new or existing) virtual sequence in the newseqs list
+ */
+ List<String> targets = set.get(TARGET);
+ if (targets != null)
+ {
+ for (String target : targets)
+ {
+
+ SequenceI mappedSequence1 = findSequence(findTargetId(target, set), align,
+ newseqs, relaxedIdMatching);
+ SequenceI mappedSequence = mappedSequence1;
+ if (mappedSequence == null)
+ {
+ continue;
+ }
+
+ /*
+ * give the mapped sequence a copy of the sequence feature, with
+ * start/end range adjusted
+ */
+ SequenceFeature sf2 = new SequenceFeature(sf);
+ sf2.setBegin(1);
+ int sequenceFeatureLength = 1 + sf.getEnd() - sf.getBegin();
+ sf2.setEnd(sequenceFeatureLength);
+ mappedSequence.addSequenceFeature(sf2);
+
+ /*
+ * add a property to the mapped sequence so that it can eventually be
+ * renamed with its qualified accession id; renaming has to wait until
+ * all sequence reference resolution is complete
+ */
+ String accessionId = StringUtils.listToDelimitedString(
+ set.get(NAME), ",");
+ if (accessionId.length() > 0)
+ {
+ String database = sf.getType(); // TODO InterProScan only??
+ String qualifiedAccId = database + "|" + accessionId;
+ sf2.setValue(RENAME_TOKEN, qualifiedAccId);
+ }
+
+ /*
+ * get any existing mapping for these sequences (or start one),
+ * and add this mapped range
+ */
+ AlignedCodonFrame alco = getMapping(align, seq, mappedSequence);
+ int[] from = new int[] { sf.getBegin(), sf.getEnd() };
+ int[] to = new int[] { 1, sequenceFeatureLength };
+ MapList mapping = new MapList(from, to, 1, 1);
+
+ alco.addMap(seq, mappedSequence, mapping);
+ align.addCodonFrame(alco);
+ }
+ }
+
+ return sf;
+ }
+
+ /**
+ * Return '=' as the name-value separator used in column 9 attributes.
+ */
+ @Override
+ protected char getNameValueSeparator()
+ {
+ return '=';
+ }
+
+ /**
+ * Modifies the default SequenceFeature in order to set the Target sequence id
+ * as the description
+ */
+ @Override
+ protected SequenceFeature buildSequenceFeature(String[] gff,
+ Map<String, List<String>> attributes)
+ {
+ SequenceFeature sf = super.buildSequenceFeature(gff, attributes);
+ String desc = getDescription(sf, attributes);
+ if (desc != null)
+ {
+ sf.setDescription(desc);
+ }
+ return sf;
+ }
+
+ /**
+ * Apply heuristic rules to try to get the most useful feature description
+ *
+ * @param sf
+ * @param attributes
+ * @return
+ */
+ protected String getDescription(SequenceFeature sf,
+ Map<String, List<String>> attributes)
+ {
+ String desc = null;
+ String target = (String) sf.getValue(TARGET);
+ if (target != null)
+ {
+ desc = target.split(" ")[0];
+ }
+
+ SequenceOntologyI so = SequenceOntologyFactory.getInstance();
+ String type = sf.getType();
+ if (so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT))
+ {
+ /*
+ * Ensembl returns dna variants as 'alleles'
+ */
+ desc = StringUtils.listToDelimitedString(
+ attributes.get("alleles"), ",");
+ }
+
+ /*
+ * extract 'Name' for a transcript (to show gene name)
+ * or an exon (so 'colour by label' shows exon boundaries)
+ */
+ if (EnsemblSeqProxy.isTranscript(type)
+ || so.isA(type, SequenceOntology.EXON))
+ {
+ desc = StringUtils.listToDelimitedString(attributes.get("Name"), ",");
+ }
+
+ /*
+ * if the above fails, try ID
+ */
+ if (desc == null)
+ {
+ desc = (String) sf.getValue(ID);
+ }
+
+ return desc;
+ }
+}