/*
* Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
* Copyright (C) $$Year-Rel$$ The Jalview Authors
*
* This file is part of Jalview.
*
* Jalview is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* Jalview is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Jalview. If not, see <http://www.gnu.org/licenses/>.
* The Jalview Authors are detailed in the 'AUTHORS' file.
*/
package jalview.io.gff;
import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.MappingType;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.util.MapList;
import jalview.util.StringUtils;
import java.io.IOException;
import java.util.List;
import java.util.Map;
/**
* Base class with generic / common functionality for processing GFF3 data.
* Override this as required for any specialisations resulting from
* peculiarities of GFF3 generated by particular tools.
*/
public class Gff3Helper extends GffHelperBase
{
public static final String ALLELES = "alleles";
protected static final String TARGET = "Target";
protected static final String ID = "ID";
private static final String NAME = "Name";
/**
* GFF3 uses '=' to delimit name/value pairs in column 9, and comma to
* separate multiple values for a name
*
* @param text
* @return
*/
public static Map> parseNameValuePairs(String text)
{
return parseNameValuePairs(text, ";", '=', ",");
}
/**
* Process one GFF feature line (as modelled by SequenceFeature)
*
* @param seq
* the sequence with which this feature is associated
* @param sf
* the sequence feature with ATTRIBUTES property containing any
* additional attributes
* @param align
* the alignment we are adding GFF to
* @param newseqs
* any new sequences referenced by the GFF
* @param relaxedIdMatching
* if true, match word tokens in sequence names
* @return true if the sequence feature should be added to the sequence, else
* false (i.e. it has been processed in another way e.g. to generate a
* mapping)
* @throws IOException
*/
@Override
public SequenceFeature processGff(SequenceI seq, String[] gff,
AlignmentI align, List newseqs,
boolean relaxedIdMatching) throws IOException
{
SequenceFeature sf = null;
if (gff.length == 9)
{
String soTerm = gff[TYPE_COL];
String atts = gff[ATTRIBUTES_COL];
Map> attributes = parseNameValuePairs(atts);
SequenceOntologyI so = SequenceOntologyFactory.getSequenceOntology();
if (so.isA(soTerm, SequenceOntologyI.PROTEIN_MATCH))
{
sf = processProteinMatch(attributes, seq, gff, align, newseqs,
relaxedIdMatching);
}
else if (so.isA(soTerm, SequenceOntologyI.NUCLEOTIDE_MATCH))
{
sf = processNucleotideMatch(attributes, seq, gff, align, newseqs,
relaxedIdMatching);
}
else
{
sf = buildSequenceFeature(gff, attributes);
}
}
else
{
/*
* fall back on generating a sequence feature with no special processing
*/
sf = buildSequenceFeature(gff, null);
}
return sf;
}
/**
* Processes one GFF3 nucleotide (e.g. cDNA to genome) match.
*
* @param attributes
* parsed GFF column 9 key/value(s)
* @param seq
* the sequence the GFF feature is on
* @param gffColumns
* the GFF column data
* @param align
* the alignment the sequence belongs to, where any new mappings
* should be added
* @param newseqs
* a list of new 'virtual sequences' generated while parsing GFF
* @param relaxedIdMatching
* if true allow fuzzy search for a matching target sequence
* @return a sequence feature, if one should be added to the sequence, else
* null
* @throws IOException
*/
protected SequenceFeature processNucleotideMatch(
Map> attributes, SequenceI seq,
String[] gffColumns, AlignmentI align, List newseqs,
boolean relaxedIdMatching) throws IOException
{
String strand = gffColumns[STRAND_COL];
/*
* (For now) we don't process mappings from reverse complement ; to do
* this would require (a) creating a virtual sequence placeholder for
* the reverse complement (b) resolving the sequence by its id from some
* source (GFF ##FASTA or other) (c) creating the reverse complement
* sequence (d) updating the mapping to be to the reverse complement
*/
if ("-".equals(strand))
{
System.err.println(
"Skipping mapping from reverse complement as not yet supported");
return null;
}
List targets = attributes.get(TARGET);
if (targets == null)
{
System.err.println("'Target' missing in GFF");
return null;
}
/*
* Typically we only expect one Target per GFF line, but this can handle
* multiple matches, to the same or different sequences (e.g. dna variants)
*/
for (String target : targets)
{
/*
* Process "seqid start end [strand]"
*/
String[] tokens = target.split(" ");
if (tokens.length < 3)
{
System.err.println("Incomplete Target: " + target);
continue;
}
/*
* Locate the mapped sequence in the alignment, or as a
* (new or existing) virtual sequence in the newseqs list
*/
String targetId = findTargetId(tokens[0], attributes);
SequenceI mappedSequence1 = findSequence(targetId, align, newseqs,
relaxedIdMatching);
SequenceI mappedSequence = mappedSequence1;
if (mappedSequence == null)
{
continue;
}
/*
* get any existing mapping for these sequences (or start one),
* and add this mapped range
*/
AlignedCodonFrame acf = getMapping(align, seq, mappedSequence);
try
{
int toStart = Integer.parseInt(tokens[1]);
int toEnd = Integer.parseInt(tokens[2]);
if (tokens.length > 3 && "-".equals(tokens[3]))
{
// mapping to reverse strand - swap start/end
int temp = toStart;
toStart = toEnd;
toEnd = temp;
}
int fromStart = Integer.parseInt(gffColumns[START_COL]);
int fromEnd = Integer.parseInt(gffColumns[END_COL]);
MapList mapping = constructMappingFromAlign(fromStart, fromEnd,
toStart, toEnd, MappingType.NucleotideToNucleotide);
if (mapping != null)
{
acf.addMap(seq, mappedSequence, mapping);
align.addCodonFrame(acf);
}
} catch (NumberFormatException nfe)
{
System.err.println("Invalid start or end in Target " + target);
}
}
SequenceFeature sf = buildSequenceFeature(gffColumns, attributes);
return sf;
}
/**
* Returns the target sequence id extracted from the GFF name/value pairs.
* Default (standard behaviour) is the first token for "Target". This may be
* overridden where tools report this in a non-standard way.
*
* @param target
* first token of a "Target" value from GFF column 9, typically
* "seqid start end"
* @param set
* a map with all parsed column 9 attributes
* @return
*/
@SuppressWarnings("unused")
protected String findTargetId(String target,
Map> set)
{
return target;
}
/**
* Processes one GFF 'protein_match'; fields of interest are
*
* - feature group - the database reporting a match e.g. Pfam
* - Name - the matched entry's accession id in the database
* - ID - a sequence identifier for the matched region (which may be
* appended as FASTA in the GFF file)
*
*
* @param set
* parsed GFF column 9 key/value(s)
* @param seq
* the sequence the GFF feature is on
* @param gffColumns
* the sequence feature holding GFF data
* @param align
* the alignment the sequence belongs to, where any new mappings
* should be added
* @param newseqs
* a list of new 'virtual sequences' generated while parsing GFF
* @param relaxedIdMatching
* if true allow fuzzy search for a matching target sequence
* @return the (real or virtual) sequence(s) mapped to by this match
* @throws IOException
*/
protected SequenceFeature processProteinMatch(
Map> set, SequenceI seq, String[] gffColumns,
AlignmentI align, List newseqs,
boolean relaxedIdMatching)
{
// This is currently tailored to InterProScan GFF output:
// ID holds the ID of the matched sequence, Target references the
// query sequence; this looks wrong, as ID should just be the GFF internal
// ID of the GFF feature, while Target would normally reference the matched
// sequence.
// TODO refactor as needed if other protein-protein GFF varies
SequenceFeature sf = buildSequenceFeature(gffColumns, set);
/*
* locate the mapped sequence in the alignment, or as a
* (new or existing) virtual sequence in the newseqs list
*/
List targets = set.get(TARGET);
if (targets != null)
{
for (String target : targets)
{
SequenceI mappedSequence1 = findSequence(findTargetId(target, set),
align, newseqs, relaxedIdMatching);
SequenceI mappedSequence = mappedSequence1;
if (mappedSequence == null)
{
continue;
}
/*
* give the mapped sequence a copy of the sequence feature, with
* start/end range adjusted
*/
int sequenceFeatureLength = 1 + sf.getEnd() - sf.getBegin();
SequenceFeature sf2 = new SequenceFeature(sf, 1,
sequenceFeatureLength, sf.getFeatureGroup(), sf.getScore());
mappedSequence.addSequenceFeature(sf2);
/*
* add a property to the mapped sequence so that it can eventually be
* renamed with its qualified accession id; renaming has to wait until
* all sequence reference resolution is complete
*/
String accessionId = StringUtils
.listToDelimitedString(set.get(NAME), ",");
if (accessionId.length() > 0)
{
String database = sf.getType(); // TODO InterProScan only??
String qualifiedAccId = database + "|" + accessionId;
sf2.setValue(RENAME_TOKEN, qualifiedAccId);
}
/*
* get any existing mapping for these sequences (or start one),
* and add this mapped range
*/
AlignedCodonFrame alco = getMapping(align, seq, mappedSequence);
int[] from = new int[] { sf.getBegin(), sf.getEnd() };
int[] to = new int[] { 1, sequenceFeatureLength };
MapList mapping = new MapList(from, to, 1, 1);
alco.addMap(seq, mappedSequence, mapping);
align.addCodonFrame(alco);
}
}
return sf;
}
/**
* Return '=' as the name-value separator used in column 9 attributes.
*/
@Override
protected char getNameValueSeparator()
{
return '=';
}
/**
* Modifies the default SequenceFeature in order to set the Target sequence id
* as the description
*/
@Override
protected SequenceFeature buildSequenceFeature(String[] gff,
int typeColumn, String group,
Map> attributes)
{
SequenceFeature sf = super.buildSequenceFeature(gff, typeColumn, group,
attributes);
String desc = getDescription(sf, attributes);
if (desc != null)
{
sf.setDescription(desc);
}
return sf;
}
/**
* Apply heuristic rules to try to get the most useful feature description
*
* @param sf
* @param attributes
* @return
*/
protected String getDescription(SequenceFeature sf,
Map> attributes)
{
String desc = null;
String target = (String) sf.getValue(TARGET);
if (target != null)
{
desc = target.split(" ")[0];
}
SequenceOntologyI so = SequenceOntologyFactory.getSequenceOntology();
String type = sf.getType();
if (so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT))
{
/*
* Ensembl returns dna variants as 'alleles'
*/
desc = StringUtils.listToDelimitedString(attributes.get(ALLELES),
",");
}
/*
* extract 'Name' for a transcript (to show gene name)
* or an exon (so 'colour by label' shows exon boundaries)
*/
if (SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(type)
|| so.isA(type, SequenceOntologyI.TRANSCRIPT)
|| so.isA(type, SequenceOntologyI.EXON))
{
desc = StringUtils.listToDelimitedString(attributes.get("Name"), ",");
}
/*
* if the above fails, try ID
*/
if (desc == null)
{
desc = (String) sf.getValue(ID);
}
return desc;
}
}