X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2Fgff%2FGff3Helper.java;fp=src%2Fjalview%2Fio%2Fgff%2FGff3Helper.java;h=4c67caa7fb4db009038dd878c8f553d96e54a9cf;hb=8f920d337154e092f5f9056ffde3cdf2735eca43;hp=0000000000000000000000000000000000000000;hpb=da768251d307c7ce11283d72e0e522b2c5fac526;p=jalview.git diff --git a/src/jalview/io/gff/Gff3Helper.java b/src/jalview/io/gff/Gff3Helper.java new file mode 100644 index 0000000..4c67caa --- /dev/null +++ b/src/jalview/io/gff/Gff3Helper.java @@ -0,0 +1,361 @@ +package jalview.io.gff; + +import jalview.datamodel.AlignedCodonFrame; +import jalview.datamodel.AlignmentI; +import jalview.datamodel.MappingType; +import jalview.datamodel.SequenceFeature; +import jalview.datamodel.SequenceI; +import jalview.util.MapList; +import jalview.util.StringUtils; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +/** + * Base class with generic / common functionality for processing GFF3 data. + * Override this as required for any specialisations resulting from + * peculiarities of GFF3 generated by particular tools. + */ +public class Gff3Helper extends GffHelperBase +{ + protected static final String TARGET = "Target"; + + protected static final String ID = "ID"; + + private static final String NAME = "Name"; + + /** + * GFF3 uses '=' to delimit name/value pairs in column 9, and comma to + * separate multiple values for a name + * + * @param text + * @return + */ + public static Map> parseNameValuePairs(String text) + { + return parseNameValuePairs(text, ";", '=', ","); + } + + /** + * Process one GFF feature line (as modelled by SequenceFeature) + * + * @param seq + * the sequence with which this feature is associated + * @param sf + * the sequence feature with ATTRIBUTES property containing any + * additional attributes + * @param align + * the alignment we are adding GFF to + * @param newseqs + * any new sequences referenced by the GFF + * @param relaxedIdMatching + * if true, match word tokens in sequence names + * @return true if the sequence feature should be added to the sequence, else + * false (i.e. it has been processed in another way e.g. to generate a + * mapping) + * @throws IOException + */ + @Override + public SequenceFeature processGff(SequenceI seq, String[] gff, + AlignmentI align, List newseqs, + boolean relaxedIdMatching) throws IOException + { + /* + * (For now) we don't process mappings from reverse complement ; to do + * this would require (a) creating a virtual sequence placeholder for + * the reverse complement (b) resolving the sequence by its id from some + * source (GFF ##FASTA or other) (c) creating the reverse complement + * sequence (d) updating the mapping to be to the reverse complement + */ + if ("-".equals(gff[STRAND_COL])) + { + System.err + .println("Skipping mapping from reverse complement as not yet supported"); + return null; + } + SequenceFeature sf = null; + + if (gff.length == 9) + { + String soTerm = gff[TYPE_COL]; + String atts = gff[ATTRIBUTES_COL]; + Map> attributes = parseNameValuePairs(atts); + + if (SequenceOntology.getInstance().isProteinMatch(soTerm)) + { + sf = processProteinMatch(attributes, seq, gff, align, + newseqs, relaxedIdMatching); + } + else if (SequenceOntology.getInstance().isNucleotideMatch(soTerm)) + { + sf = processNucleotideMatch(attributes, seq, gff, align, + newseqs, relaxedIdMatching); + } + else + { + sf = buildSequenceFeature(gff, attributes); + } + } + else + { + /* + * fall back on generating a sequence feature with no special processing + */ + sf = buildSequenceFeature(gff, null); + } + + return sf; + } + + /** + * Processes one GFF3 nucleotide (e.g. cDNA to genome) match. + * + * @param attributes + * parsed GFF column 9 key/value(s) + * @param seq + * the sequence the GFF feature is on + * @param gffColumns + * the GFF column data + * @param align + * the alignment the sequence belongs to, where any new mappings + * should be added + * @param newseqs + * a list of new 'virtual sequences' generated while parsing GFF + * @param relaxedIdMatching + * if true allow fuzzy search for a matching target sequence + * @return a sequence feature, if one should be added to the sequence, else + * null + * @throws IOException + */ + protected SequenceFeature processNucleotideMatch( + Map> attributes, SequenceI seq, + String[] gffColumns, AlignmentI align, List newseqs, + boolean relaxedIdMatching) + throws IOException + { + String strand = gffColumns[STRAND_COL]; + if ("-1".equals(strand)) + { + System.err + .println("Currently ignoring mappings from reverse complement"); + return null; + } + + List targets = attributes.get(TARGET); + if (targets == null) + { + System.err.println("'Target' missing in GFF"); + return null; + } + + /* + * Typically we only expect one Target per GFF line, but this can handle + * multiple matches, to the same or different sequences (e.g. dna variants) + */ + for (String target : targets) + { + /* + * Process "seqid start end [strand]" + */ + String[] tokens = target.split(" "); + if (tokens.length < 3) + { + System.err.println("Incomplete Target: " + target); + continue; + } + + /* + * Locate the mapped sequence in the alignment, or as a + * (new or existing) virtual sequence in the newseqs list + */ + String targetId = findTargetId(tokens[0], attributes); + SequenceI mappedSequence1 = findSequence(targetId, align, + newseqs, relaxedIdMatching); + SequenceI mappedSequence = mappedSequence1; + if (mappedSequence == null) + { + continue; + } + + /* + * get any existing mapping for these sequences (or start one), + * and add this mapped range + */ + AlignedCodonFrame acf = getMapping(align, seq, mappedSequence); + + try + { + int toStart = Integer.parseInt(tokens[1]); + int toEnd = Integer.parseInt(tokens[2]); + if (tokens.length > 3 && "-".equals(tokens[3])) + { + // mapping to reverse strand - swap start/end + int temp = toStart; + toStart = toEnd; + toEnd = temp; + } + + int fromStart = Integer.parseInt(gffColumns[START_COL]); + int fromEnd = Integer.parseInt(gffColumns[END_COL]); + MapList mapping = constructMappingFromAlign(fromStart, fromEnd, + toStart, toEnd, + MappingType.NucleotideToNucleotide); + + if (mapping != null) + { + acf.addMap(seq, mappedSequence, mapping); + align.addCodonFrame(acf); + } + } catch (NumberFormatException nfe) + { + System.err.println("Invalid start or end in Target " + target); + } + } + + SequenceFeature sf = buildSequenceFeature(gffColumns, attributes); + return sf; + } + + /** + * Returns the target sequence id extracted from the GFF name/value pairs. + * Default (standard behaviour) is the first token for "Target". This may be + * overridden where tools report this in a non-standard way. + * + * @param target + * first token of a "Target" value from GFF column 9, typically + * "seqid start end" + * @param set + * a map with all parsed column 9 attributes + * @return + */ + @SuppressWarnings("unused") + protected String findTargetId(String target, Map> set) + { + return target; + } + + /** + * Processes one GFF 'protein_match'; fields of interest are + *
    + *
  • feature group - the database reporting a match e.g. Pfam
  • + *
  • Name - the matched entry's accession id in the database
  • + *
  • ID - a sequence identifier for the matched region (which may be + * appended as FASTA in the GFF file)
  • + *
+ * + * @param set + * parsed GFF column 9 key/value(s) + * @param seq + * the sequence the GFF feature is on + * @param gffColumns + * the sequence feature holding GFF data + * @param align + * the alignment the sequence belongs to, where any new mappings + * should be added + * @param newseqs + * a list of new 'virtual sequences' generated while parsing GFF + * @param relaxedIdMatching + * if true allow fuzzy search for a matching target sequence + * @return the (real or virtual) sequence(s) mapped to by this match + * @throws IOException + */ + protected SequenceFeature processProteinMatch( + Map> set, SequenceI seq, + String[] gffColumns, AlignmentI align, List newseqs, + boolean relaxedIdMatching) + { + // This is currently tailored to InterProScan GFF output: + // ID holds the ID of the matched sequence, Target references the + // query sequence; this looks wrong, as ID should just be the GFF internal + // ID of the GFF feature, while Target would normally reference the matched + // sequence. + // TODO refactor as needed if other protein-protein GFF varies + + SequenceFeature sf = buildSequenceFeature(gffColumns, set); + + /* + * locate the mapped sequence in the alignment, or as a + * (new or existing) virtual sequence in the newseqs list + */ + List targets = set.get(TARGET); + if (targets != null) + { + for (String target : targets) + { + + SequenceI mappedSequence1 = findSequence(findTargetId(target, set), align, + newseqs, relaxedIdMatching); + SequenceI mappedSequence = mappedSequence1; + if (mappedSequence == null) + { + continue; + } + + /* + * give the mapped sequence a copy of the sequence feature, with + * start/end range adjusted + */ + SequenceFeature sf2 = new SequenceFeature(sf); + sf2.setBegin(1); + int sequenceFeatureLength = 1 + sf.getEnd() - sf.getBegin(); + sf2.setEnd(sequenceFeatureLength); + mappedSequence.addSequenceFeature(sf2); + + /* + * add a property to the mapped sequence so that it can eventually be + * renamed with its qualified accession id; renaming has to wait until + * all sequence reference resolution is complete + */ + String accessionId = StringUtils.listToDelimitedString( + set.get(NAME), ","); + if (accessionId.length() > 0) + { + String database = sf.getType(); // TODO InterProScan only?? + String qualifiedAccId = database + "|" + accessionId; + sf2.setValue(RENAME_TOKEN, qualifiedAccId); + } + + /* + * get any existing mapping for these sequences (or start one), + * and add this mapped range + */ + AlignedCodonFrame alco = getMapping(align, seq, mappedSequence); + int[] from = new int[] { sf.getBegin(), sf.getEnd() }; + int[] to = new int[] { 1, sequenceFeatureLength }; + MapList mapping = new MapList(from, to, 1, 1); + + alco.addMap(seq, mappedSequence, mapping); + align.addCodonFrame(alco); + } + } + + return sf; + } + + /** + * Return '=' as the name-value separator used in column 9 attributes. + */ + @Override + protected char getNameValueSeparator() + { + return '='; + } + + /** + * Modifies the default SequenceFeature in order to set the Target sequence id + * as the description + */ + @Override + protected SequenceFeature buildSequenceFeature(String[] gff, + Map> attributes) + { + SequenceFeature sf = super.buildSequenceFeature(gff, attributes); + String target = (String) sf.getValue(TARGET); + if (target != null) + { + sf.setDescription(target.split(" ")[0]); + } + return sf; + } +}