/*
* Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
* Copyright (C) $$Year-Rel$$ The Jalview Authors
*
* This file is part of Jalview.
*
* Jalview is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* Jalview is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Jalview. If not, see <http://www.gnu.org/licenses/>.
* The Jalview Authors are detailed in the 'AUTHORS' file.
*/
package jalview.io.gff;
import jalview.analysis.SequenceIdMatcher;
import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.MappingType;
import jalview.datamodel.SequenceDummy;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.util.MapList;
import jalview.util.StringUtils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
/**
* Base class with common functionality for flavours of GFF handler (GFF2 or
* GFF3)
*/
public abstract class GffHelperBase implements GffHelperI
{
/*
 * prefix of the message written to stderr when a nested attribute string
 * cannot be parsed (see parseAttributeMap)
 */
private static final String INVALID_GFF_ATTRIBUTE_FORMAT = "Invalid GFF attribute format: ";
/*
 * COMMA joins multiple attribute values into one string, and separates the
 * items of a nested attribute list
 */
protected static final String COMMA = ",";
/*
 * EQUALS separates key from value within a nested attribute list
 */
protected static final String EQUALS = "=";
/*
 * name of the attribute whose value is also used as the feature description
 */
protected static final String NOTE = "Note";
/*
 * GFF columns 1-9 (zero-indexed):
 */
protected static final int SEQID_COL = 0;
protected static final int SOURCE_COL = 1;
protected static final int TYPE_COL = 2;
protected static final int START_COL = 3;
protected static final int END_COL = 4;
protected static final int SCORE_COL = 5;
protected static final int STRAND_COL = 6;
protected static final int PHASE_COL = 7;
protected static final int ATTRIBUTES_COL = 8;
/*
 * the alignment that 'matcher' was last built for; findSequence rebuilds the
 * matcher only when it is called with a different alignment object
 */
private AlignmentI lastmatchedAl = null;
/*
 * id matcher used by findSequence for relaxed id matching; null until the
 * first relaxed lookup
 */
private SequenceIdMatcher matcher = null;
/**
 * Builds a MapList between the given 'from' and 'to' ranges. A
 * peptide-to-nucleotide mapping is first inverted (ranges swapped and the
 * inverse mapping type used), since Jalview always models dna to protein.
 * The range lengths are then reconciled with {@code trimMapping}; if that
 * fails, a message is written to stderr and null is returned.
 *
 * @param fromStart
 *          start of the range mapped from
 * @param fromEnd
 *          end of the range mapped from
 * @param toStart
 *          start of the range mapped to
 * @param toEnd
 *          end of the range mapped to
 * @param mappingType
 *          type of mapping (e.g. protein to nucleotide)
 * @return the mapping, or null if the range lengths cannot be reconciled
 */
protected MapList constructMappingFromAlign(int fromStart, int fromEnd,
        int toStart, int toEnd, MappingType mappingType)
{
  /*
   * GFF may describe the mapping as protein to dna; if so, use the ranges
   * the other way round together with the inverse mapping type
   */
  boolean invert = mappingType == MappingType.PeptideToNucleotide;
  MappingType type = invert ? mappingType.getInverse() : mappingType;
  int[] mapFrom = invert ? new int[] { toStart, toEnd }
          : new int[] { fromStart, fromEnd };
  int[] mapTo = invert ? new int[] { fromStart, fromEnd }
          : new int[] { toStart, toEnd };

  int fromRatio = type.getFromRatio();
  int toRatio = type.getToRatio();

  /*
   * check (and if possible enforce) that mapped residue counts match
   * TODO understand why PASA generates mismatched cases...
   */
  if (trimMapping(mapFrom, mapTo, fromRatio, toRatio))
  {
    /*
     * If a codon has an intron gap, there will be contiguous 'toRanges';
     * this is handled for us by the MapList constructor.
     * (It is not clear that exonerate ever generates this case)
     */
    return new MapList(mapFrom, mapTo, fromRatio, toRatio);
  }

  System.err.println("Ignoring mapping from " + Arrays.toString(mapFrom)
          + " to " + Arrays.toString(mapTo) + " as counts don't match!");
  return null;
}
/**
 * Verifies that the 'from' and 'to' ranges cover equivalent lengths once the
 * ratios are applied, i.e. that
 * {@code fromLength * toRatio == toLength * fromRatio}. If they do not, and
 * the surplus is a whole number of units, the second element of the longer
 * range is moved towards its first element to remove the surplus, and the
 * change is reported on stderr.
 * <p>
 * Note that the supplied arrays may be modified.
 *
 * @param from
 *          two-element [start, end] range (may be descending)
 * @param to
 *          two-element [start, end] range (may be descending)
 * @param fromRatio
 *          ratio unit for the 'from' range
 * @param toRatio
 *          ratio unit for the 'to' range
 * @return true if the ranges match, or were trimmed so that they do; false
 *         if they could not be trimmed to an exact match
 */
protected static boolean trimMapping(int[] from, int[] to, int fromRatio,
        int toRatio)
{
  int fromLength = Math.abs(from[1] - from[0]) + 1;
  int toLength = Math.abs(to[1] - to[0]) + 1;

  /*
   * positive: 'from' is too long; negative: 'to' is too long
   */
  int excess = fromLength * toRatio - toLength * fromRatio;
  if (excess == 0)
  {
    return true;
  }

  boolean fromTooLong = excess > 0;
  int unit = fromTooLong ? toRatio : fromRatio;
  if (excess % unit != 0)
  {
    /*
     * Couldn't truncate to an exact match..
     */
    return false;
  }

  /*
   * shorten the longer range so the two match up; it's kind of arbitrary
   * which end we truncate - here it is the end
   */
  int[] range = fromTooLong ? from : to;
  String message = fromTooLong ? "Truncating mapping from "
          : "Truncating mapping to ";
  int trimBy = Math.abs(excess) / unit;

  System.err.print(message + Arrays.toString(range) + " to ");
  boolean ascending = range[1] > range[0];
  range[1] += ascending ? -trimBy : trimBy;
  System.err.println(Arrays.toString(range));
  return true;
}
/**
 * Returns a sequence matching the given id, as follows
 * <ul>
 * <li>strict matching is on exact sequence name</li>
 * <li>relaxed matching allows matching on a token within the sequence name,
 * or a dbxref</li>
 * <li>first tries to find a match in the alignment sequences</li>
 * <li>else tries to find a match in the new sequences already generated
 * while parsing the features file</li>
 * <li>else creates a new placeholder sequence, adds it to the new sequences
 * list, and returns it</li>
 * </ul>
 * For relaxed matching, the id matcher is built from the alignment (plus any
 * new sequences) the first time a given alignment object is passed, and is
 * reused on later calls with the same alignment.
 *
 * @param seqId
 *          the sequence id to look for; if null, null is returned
 * @param align
 *          the alignment to search
 * @param newseqs
 *          sequences created so far while parsing; if null, no placeholder
 *          is created and an unmatched id results in null
 * @param relaxedIdMatching
 *          if true use relaxed matching, else strict
 *
 * @return the matched or newly created sequence, or null
 */
protected SequenceI findSequence(String seqId, AlignmentI align,
        List<SequenceI> newseqs, boolean relaxedIdMatching)
{
  if (seqId == null)
  {
    return null;
  }
  SequenceI match = null;
  if (relaxedIdMatching)
  {
    if (lastmatchedAl != align)
    {
      /*
       * different alignment from last time: rebuild the matcher
       */
      lastmatchedAl = align;
      matcher = new SequenceIdMatcher(align.getSequencesArray());
      if (newseqs != null)
      {
        matcher.addAll(newseqs);
      }
    }
    match = matcher.findIdMatch(seqId);
  }
  else
  {
    match = align.findName(seqId, true);
    if (match == null && newseqs != null)
    {
      for (SequenceI m : newseqs)
      {
        if (seqId.equals(m.getName()))
        {
          return m;
        }
      }
    }
  }
  if (match == null && newseqs != null)
  {
    /*
     * no match anywhere: make a placeholder, and register it with the
     * matcher (if in use) so that later relaxed lookups can find it
     */
    match = new SequenceDummy(seqId);
    if (relaxedIdMatching)
    {
      matcher.addAll(Arrays.asList(match));
    }
    // add dummy sequence to the newseqs list
    newseqs.add(match);
  }
  return match;
}
/**
* Parses the input line to a map of name / value(s) pairs. For example the
* line
*
*
* Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal
*
*
* if parsed with delimiter=";" and separators {' ', '='}
* would return a map with { Notes={Fe=S, Metal}, Method={manual curation,
* prediction}, source={Pfam}}
*
* This method supports parsing of either GFF2 format (which uses space ' ' as
* the name/value delimiter, and allows multiple occurrences of the same
* name), or GFF3 format (which uses '=' as the name/value delimiter, and
* strictly does not allow repeat occurrences of the same name - but does
* allow a comma-separated list of values).
*
* Returns a (possibly empty) map of lists of values by attribute name.
*
* @param text
* @param namesDelimiter
* the major delimiter between name-value pairs
* @param nameValueSeparator
* separator used between name and value
* @param valuesDelimiter
* delimits a list of more than one value
* @return
*/
public static Map> parseNameValuePairs(String text,
String namesDelimiter, char nameValueSeparator,
String valuesDelimiter)
{
Map> map = new HashMap<>();
if (text == null || text.trim().length() == 0)
{
return map;
}
/*
* split by major delimiter (; for GFF3)
*/
for (String nameValuePair : text.trim().split(namesDelimiter))
{
nameValuePair = nameValuePair.trim();
if (nameValuePair.length() == 0)
{
continue;
}
/*
* find name/value separator (= for GFF3)
*/
int sepPos = nameValuePair.indexOf(nameValueSeparator);
if (sepPos == -1)
{
// no name=value found
continue;
}
String name = nameValuePair.substring(0, sepPos).trim();
String values = nameValuePair.substring(sepPos + 1).trim();
if (values.isEmpty())
{
continue;
}
List vals = map.get(name);
if (vals == null)
{
vals = new ArrayList<>();
map.put(name, vals);
}
/*
* if 'values' contains more name/value separators, parse as a map
* (nested sub-attribute values)
*/
if (values.indexOf(nameValueSeparator) != -1)
{
vals.add(values);
}
else
{
for (String val : values.split(valuesDelimiter))
{
vals.add(val);
}
}
}
return map;
}
/**
* Constructs a SequenceFeature from the GFF column data. Subclasses may wish
* to call this method then adjust the SequenceFeature depending on the
* particular usage of different tools that generate GFF.
*
* @param gff
* @param attributes
* @return
*/
protected SequenceFeature buildSequenceFeature(String[] gff,
Map> attributes)
{
return buildSequenceFeature(gff, TYPE_COL, gff[SOURCE_COL], attributes);
}
/**
* @param gff
* @param typeColumn
* @param group
* @param attributes
* @return
*/
protected SequenceFeature buildSequenceFeature(String[] gff,
int typeColumn, String group, Map> attributes)
{
try
{
int start = Integer.parseInt(gff[START_COL]);
int end = Integer.parseInt(gff[END_COL]);
/*
* default 'score' is 0 rather than Float.NaN - see JAL-2554
*/
float score = 0f;
try
{
score = Float.parseFloat(gff[SCORE_COL]);
} catch (NumberFormatException nfe)
{
// e.g. '.' - leave as zero
}
SequenceFeature sf = new SequenceFeature(gff[typeColumn],
gff[SOURCE_COL], start, end, score, group);
sf.setStrand(gff[STRAND_COL]);
sf.setPhase(gff[PHASE_COL]);
if (attributes != null)
{
/*
* Add attributes in column 9 to the sequence feature's
* 'otherData' table; use Note as a best proxy for description;
* decode any encoded comma, equals, semi-colon as per GFF3 spec
*/
for (Entry> attr : attributes.entrySet())
{
String key = attr.getKey();
List values = attr.getValue();
if (values.size() == 1 && values.get(0).contains(EQUALS))
{
/*
* 'value' is actually nested subattributes as x=a,y=b,z=c
*/
Map valueMap = parseAttributeMap(values.get(0));
sf.setValue(key, valueMap);
}
else
{
String csvValues = StringUtils.listToDelimitedString(values,
COMMA);
csvValues = StringUtils.urlDecode(csvValues, GFF_ENCODABLE);
sf.setValue(key, csvValues);
if (NOTE.equals(key))
{
sf.setDescription(csvValues);
}
}
}
}
return sf;
} catch (NumberFormatException nfe)
{
System.err.println("Invalid number in gff: " + nfe.getMessage());
return null;
}
}
/**
 * Parses a (GFF3 format) list of comma-separated key=value pairs into a Map
 * of key to value, where a value may itself contain commas. An input string
 * like {@code a=b,c,d=e,f=g,h} is parsed to
 *
 * <pre>
 * a = "b,c"
 * d = "e"
 * f = "g,h"
 * </pre>
 *
 * Keys are trimmed, and both keys and values are url-decoded. If the input
 * is not in the expected format, a message is written to stderr and an
 * empty map is returned.
 *
 * @param s
 *          the text to parse (must not be null)
 *
 * @return a map (never null, possibly empty) of values by key
 */
protected static Map<String, String> parseAttributeMap(String s)
{
  Map<String, String> map = new HashMap<>();
  String[] fields = s.split(EQUALS);

  /*
   * format validation
   */
  boolean valid = true;
  if (fields.length < 2)
  {
    /*
     * need at least A=B here
     */
    valid = false;
  }
  else if (fields[0].isEmpty() || fields[0].contains(COMMA))
  {
    /*
     * A,B=C is not a valid start, nor is =C
     */
    valid = false;
  }
  else
  {
    for (int i = 1; i < fields.length - 1; i++)
    {
      if (fields[i].isEmpty() || !fields[i].contains(COMMA))
      {
        /*
         * intermediate tokens must include value,name
         */
        valid = false;
      }
    }
  }

  if (!valid)
  {
    System.err.println(INVALID_GFF_ATTRIBUTE_FORMAT + s);
    return map;
  }

  /*
   * each adjacent pair of fields supplies one key (the tail of the first
   * field) and its value (the head of the second field)
   */
  for (int i = 0; i < fields.length - 1; i++)
  {
    boolean lastPair = i == fields.length - 2;
    String before = fields[i];
    String after = fields[i + 1];

    /*
     * if 'key' looks like a,b,c then the last token is the
     * key
     */
    String theKey = before.contains(COMMA)
            ? before.substring(before.lastIndexOf(COMMA) + 1)
            : before;

    theKey = theKey.trim();
    if (theKey.isEmpty())
    {
      System.err.println(INVALID_GFF_ATTRIBUTE_FORMAT + s);
      map.clear();
      return map;
    }

    /*
     * if 'value' looks like a,b,c then all but the last token is the value,
     * unless this is the last field (no more = to follow), in which case
     * all of it makes up the value
     */
    String theValue = after.contains(COMMA) && !lastPair
            ? after.substring(0, after.lastIndexOf(COMMA))
            : after;
    map.put(StringUtils.urlDecode(theKey, GFF_ENCODABLE),
            StringUtils.urlDecode(theValue, GFF_ENCODABLE));
  }

  return map;
}
/**
 * Answers the mapping already held on the alignment between the two given
 * dataset sequences or, if there is none, a new (empty) mapping. This helps
 * when several GFF lines together describe one 'spliced' mapping: the
 * mapping found for the first line can be extended as the rest are read.
 *
 * @param align
 *          the alignment whose mappings are searched
 * @param fromSeq
 *          the sequence mapped from
 * @param toSeq
 *          the sequence mapped to
 * @return the existing mapping if found, else a new AlignedCodonFrame
 */
protected AlignedCodonFrame getMapping(AlignmentI align,
        SequenceI fromSeq, SequenceI toSeq)
{
  AlignedCodonFrame existing = align.getMapping(fromSeq, toSeq);
  if (existing != null)
  {
    return existing;
  }
  return new AlignedCodonFrame();
}
}