1 package jalview.io.gff;
import jalview.analysis.SequenceIdMatcher;
import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.MappingType;
import jalview.datamodel.SequenceDummy;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.util.MapList;
import jalview.util.StringUtils;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
/**
 * Base class with common functionality for flavours of GFF handler (GFF2 or
 * GFF3)
 */
24 public abstract class GffHelperBase implements GffHelperI
26 private static final String NOTE = "Note";
29 * GFF columns 1-9 (zero-indexed):
31 protected static final int SEQID_COL = 0;
33 protected static final int SOURCE_COL = 1;
35 protected static final int TYPE_COL = 2;
37 protected static final int START_COL = 3;
39 protected static final int END_COL = 4;
41 protected static final int SCORE_COL = 5;
43 protected static final int STRAND_COL = 6;
45 protected static final int PHASE_COL = 7;
47 protected static final int ATTRIBUTES_COL = 8;
49 private AlignmentI lastmatchedAl = null;
51 private SequenceIdMatcher matcher = null;
54 * Constructs and returns a mapping, or null if data appear invalid
61 * type of mapping (e.g. protein to nucleotide)
64 protected MapList constructMappingFromAlign(int fromStart, int fromEnd,
65 int toStart, int toEnd, MappingType mappingType)
67 int[] from = new int[] { fromStart, fromEnd };
68 int[] to = new int[] { toStart, toEnd };
71 * Jalview always models from dna to protein, so switch values if the
72 * GFF mapping is from protein to dna
74 if (mappingType == MappingType.PeptideToNucleotide)
79 mappingType = mappingType.getInverse();
82 int fromRatio = mappingType.getFromRatio();
83 int toRatio = mappingType.getToRatio();
86 * sanity check that mapped residue counts match
87 * TODO understand why PASA generates such cases...
89 if (!trimMapping(from, to, fromRatio, toRatio))
91 System.err.println("Ignoring mapping from " + Arrays.toString(from)
92 + " to " + Arrays.toString(to) + " as counts don't match!");
97 * If a codon has an intron gap, there will be contiguous 'toRanges';
98 * this is handled for us by the MapList constructor.
99 * (It is not clear that exonerate ever generates this case)
102 return new MapList(from, to, fromRatio, toRatio);
106 * Checks that the 'from' and 'to' ranges have equivalent lengths. If not,
107 * tries to trim the end of the longer so they do. Returns true if the
108 * mappings could be made equivalent, else false. Note the range array values
109 * may be modified by this method.
117 protected static boolean trimMapping(int[] from, int[] to, int fromRatio,
120 int fromLength = Math.abs(from[1] - from[0]) + 1;
121 int toLength = Math.abs(to[1] - to[0]) + 1;
122 int fromOverlap = fromLength * toRatio - toLength * fromRatio;
123 if (fromOverlap == 0)
127 if (fromOverlap > 0 && fromOverlap % toRatio == 0)
130 * restrict from range to make them match up
131 * it's kind of arbitrary which end we truncate - here it is the end
133 System.err.print("Truncating mapping from " + Arrays.toString(from)
135 if (from[1] > from[0])
137 from[1] -= fromOverlap / toRatio;
141 from[1] += fromOverlap / toRatio;
143 System.err.println(Arrays.toString(from));
146 else if (fromOverlap < 0 && fromOverlap % fromRatio == 0)
148 fromOverlap = -fromOverlap; // > 0
150 * restrict to range to make them match up
152 System.err.print("Truncating mapping to " + Arrays.toString(to)
156 to[1] -= fromOverlap / fromRatio;
160 to[1] += fromOverlap / fromRatio;
162 System.err.println(Arrays.toString(to));
167 * Couldn't truncate to an exact match..
173 * Returns a sequence matching the given id, as follows
175 * <li>strict matching is on exact sequence name</li>
176 * <li>relaxed matching allows matching on a token within the sequence name,
178 * <li>first tries to find a match in the alignment sequences</li>
179 * <li>else tries to find a match in the new sequences already generated while
180 * parsing the features file</li>
181 * <li>else creates a new placeholder sequence, adds it to the new sequences
182 * list, and returns it</li>
188 * @param relaxedIdMatching
192 protected SequenceI findSequence(String seqId, AlignmentI align,
193 List<SequenceI> newseqs, boolean relaxedIdMatching)
199 SequenceI match = null;
200 if (relaxedIdMatching)
202 if (lastmatchedAl != align)
204 lastmatchedAl = align;
205 matcher = new SequenceIdMatcher(align.getSequencesArray());
208 matcher.addAll(newseqs);
211 match = matcher.findIdMatch(seqId);
215 match = align.findName(seqId, true);
216 if (match == null && newseqs != null)
218 for (SequenceI m : newseqs)
220 if (seqId.equals(m.getName()))
228 if (match == null && newseqs != null)
230 match = new SequenceDummy(seqId);
231 if (relaxedIdMatching)
233 matcher.addAll(Arrays.asList(new SequenceI[] { match }));
235 // add dummy sequence to the newseqs list
242 * Parses the input line to a map of name / value(s) pairs. For example the
244 * Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal <br>
245 * if parsed with delimiter=";" and separators {' ', '='} <br>
246 * would return a map with { Notes={Fe=S, Metal}, Method={manual curation,
247 * prediction}, source={Pfam}} <br>
249 * This method supports parsing of either GFF2 format (which uses space ' ' as
250 * the name/value delimiter, and allows multiple occurrences of the same
251 * name), or GFF3 format (which uses '=' as the name/value delimiter, and
252 * strictly does not allow repeat occurrences of the same name - but does
253 * allow a comma-separated list of values).
256 * @param namesDelimiter
257 * the major delimiter between name-value pairs
258 * @param nameValueSeparator
259 * one or more separators used between name and value
260 * @param valuesDelimiter
261 * delimits a list of more than one value
262 * @return the name-values map (which may be empty but never null)
264 public static Map<String, List<String>> parseNameValuePairs(String text,
265 String namesDelimiter, char nameValueSeparator,
266 String valuesDelimiter)
268 Map<String, List<String>> map = new HashMap<String, List<String>>();
269 if (text == null || text.trim().length() == 0)
274 for (String pair : text.trim().split(namesDelimiter))
277 if (pair.length() == 0)
282 int sepPos = pair.indexOf(nameValueSeparator);
285 // no name=value present
289 String key = pair.substring(0, sepPos).trim();
290 String values = pair.substring(sepPos + 1).trim();
291 if (values.length() > 0)
293 List<String> vals = map.get(key);
296 vals = new ArrayList<String>();
299 for (String val : values.split(valuesDelimiter))
309 * Constructs a SequenceFeature from the GFF column data. Subclasses may wish
310 * to call this method then adjust the SequenceFeature depending on the
311 * particular usage of different tools that generate GFF.
317 protected SequenceFeature buildSequenceFeature(String[] gff,
318 Map<String, List<String>> attributes)
322 int start = Integer.parseInt(gff[START_COL]);
323 int end = Integer.parseInt(gff[END_COL]);
326 * default 'score' is 0 rather than Float.NaN as the latter currently
327 * disables the 'graduated colour => colour by label' option
332 score = Float.parseFloat(gff[SCORE_COL]);
333 } catch (NumberFormatException nfe)
335 // e.g. '.' - leave as zero
338 SequenceFeature sf = new SequenceFeature(gff[TYPE_COL],
339 gff[SOURCE_COL], start, end, score, gff[SOURCE_COL]);
341 sf.setStrand(gff[STRAND_COL]);
343 sf.setPhase(gff[PHASE_COL]);
345 if (attributes != null)
348 * save 'raw' column 9 to allow roundtrip output as input
350 sf.setAttributes(gff[ATTRIBUTES_COL]);
353 * Add attributes in column 9 to the sequence feature's
354 * 'otherData' table; use Note as a best proxy for description
356 for (Entry<String, List<String>> attr : attributes.entrySet())
358 String values = StringUtils.listToDelimitedString(
359 attr.getValue(), ",");
360 sf.setValue(attr.getKey(), values);
361 if (NOTE.equals(attr.getKey()))
363 sf.setDescription(values);
369 } catch (NumberFormatException nfe)
371 System.err.println("Invalid number in gff: " + nfe.getMessage());
377 * Returns the character used to separate attributes names from values in GFF
378 * column 9. This is space for GFF2, '=' for GFF3.
382 protected abstract char getNameValueSeparator();
385 * Returns any existing mapping held on the alignment between the given
386 * dataset sequences, or a new one if none found. This is a convenience method
387 * to facilitate processing multiple GFF lines that make up a single 'spliced'
388 * mapping, by extending the first mapping as the others are read.
395 protected AlignedCodonFrame getMapping(AlignmentI align, SequenceI fromSeq, SequenceI toSeq)
397 AlignedCodonFrame acf = align.getMapping(fromSeq, toSeq);
400 acf = new AlignedCodonFrame();