2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.io.gff;
23 import static jalview.io.FeaturesFile.MAP_ATTRIBUTE_PREFIX;
25 import jalview.analysis.SequenceIdMatcher;
26 import jalview.datamodel.AlignedCodonFrame;
27 import jalview.datamodel.AlignmentI;
28 import jalview.datamodel.MappingType;
29 import jalview.datamodel.SequenceDummy;
30 import jalview.datamodel.SequenceFeature;
31 import jalview.datamodel.SequenceI;
32 import jalview.io.FeaturesFile;
33 import jalview.util.MapList;
34 import jalview.util.StringUtils;
36 import java.util.ArrayList;
37 import java.util.Arrays;
38 import java.util.HashMap;
39 import java.util.List;
41 import java.util.Map.Entry;
/**
 * Base class with common functionality for flavours of GFF handler (GFF2 or
 * GFF3)
 */
47 public abstract class GffHelperBase implements GffHelperI
49 private static final String COMMA = ",";
51 private static final String NOTE = "Note";
54 * GFF columns 1-9 (zero-indexed):
56 protected static final int SEQID_COL = 0;
58 protected static final int SOURCE_COL = 1;
60 protected static final int TYPE_COL = 2;
62 protected static final int START_COL = 3;
64 protected static final int END_COL = 4;
66 protected static final int SCORE_COL = 5;
68 protected static final int STRAND_COL = 6;
70 protected static final int PHASE_COL = 7;
72 protected static final int ATTRIBUTES_COL = 8;
74 private AlignmentI lastmatchedAl = null;
76 private SequenceIdMatcher matcher = null;
79 * Constructs and returns a mapping, or null if data appear invalid
86 * type of mapping (e.g. protein to nucleotide)
89 protected MapList constructMappingFromAlign(int fromStart, int fromEnd,
90 int toStart, int toEnd, MappingType mappingType)
92 int[] from = new int[] { fromStart, fromEnd };
93 int[] to = new int[] { toStart, toEnd };
96 * Jalview always models from dna to protein, so switch values if the
97 * GFF mapping is from protein to dna
99 if (mappingType == MappingType.PeptideToNucleotide)
104 mappingType = mappingType.getInverse();
107 int fromRatio = mappingType.getFromRatio();
108 int toRatio = mappingType.getToRatio();
111 * sanity check that mapped residue counts match
112 * TODO understand why PASA generates such cases...
114 if (!trimMapping(from, to, fromRatio, toRatio))
116 System.err.println("Ignoring mapping from " + Arrays.toString(from)
117 + " to " + Arrays.toString(to) + " as counts don't match!");
122 * If a codon has an intron gap, there will be contiguous 'toRanges';
123 * this is handled for us by the MapList constructor.
124 * (It is not clear that exonerate ever generates this case)
127 return new MapList(from, to, fromRatio, toRatio);
131 * Checks that the 'from' and 'to' ranges have equivalent lengths. If not,
132 * tries to trim the end of the longer so they do. Returns true if the
133 * mappings could be made equivalent, else false. Note the range array values
134 * may be modified by this method.
142 protected static boolean trimMapping(int[] from, int[] to, int fromRatio,
145 int fromLength = Math.abs(from[1] - from[0]) + 1;
146 int toLength = Math.abs(to[1] - to[0]) + 1;
147 int fromOverlap = fromLength * toRatio - toLength * fromRatio;
148 if (fromOverlap == 0)
152 if (fromOverlap > 0 && fromOverlap % toRatio == 0)
155 * restrict from range to make them match up
156 * it's kind of arbitrary which end we truncate - here it is the end
159 "Truncating mapping from " + Arrays.toString(from) + " to ");
160 if (from[1] > from[0])
162 from[1] -= fromOverlap / toRatio;
166 from[1] += fromOverlap / toRatio;
168 System.err.println(Arrays.toString(from));
171 else if (fromOverlap < 0 && fromOverlap % fromRatio == 0)
173 fromOverlap = -fromOverlap; // > 0
175 * restrict to range to make them match up
178 "Truncating mapping to " + Arrays.toString(to) + " to ");
181 to[1] -= fromOverlap / fromRatio;
185 to[1] += fromOverlap / fromRatio;
187 System.err.println(Arrays.toString(to));
192 * Couldn't truncate to an exact match..
198 * Returns a sequence matching the given id, as follows
200 * <li>strict matching is on exact sequence name</li>
201 * <li>relaxed matching allows matching on a token within the sequence name,
203 * <li>first tries to find a match in the alignment sequences</li>
204 * <li>else tries to find a match in the new sequences already generated while
205 * parsing the features file</li>
206 * <li>else creates a new placeholder sequence, adds it to the new sequences
207 * list, and returns it</li>
213 * @param relaxedIdMatching
217 protected SequenceI findSequence(String seqId, AlignmentI align,
218 List<SequenceI> newseqs, boolean relaxedIdMatching)
224 SequenceI match = null;
225 if (relaxedIdMatching)
227 if (lastmatchedAl != align)
229 lastmatchedAl = align;
230 matcher = new SequenceIdMatcher(align.getSequencesArray());
233 matcher.addAll(newseqs);
236 match = matcher.findIdMatch(seqId);
240 match = align.findName(seqId, true);
241 if (match == null && newseqs != null)
243 for (SequenceI m : newseqs)
245 if (seqId.equals(m.getName()))
253 if (match == null && newseqs != null)
255 match = new SequenceDummy(seqId);
256 if (relaxedIdMatching)
258 matcher.addAll(Arrays.asList(new SequenceI[] { match }));
260 // add dummy sequence to the newseqs list
267 * Parses the input line to a map of name / value(s) pairs. For example the line
269 * Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal
271 * if parsed with delimiter=";" and separators {' ', '='} <br>
272 * would return a map with { Notes={Fe=S, Metal}, Method={manual curation,
273 * prediction}, source={Pfam}} <br>
275 * This method supports parsing of either GFF2 format (which uses space ' ' as
276 * the name/value delimiter, and allows multiple occurrences of the same name),
277 * or GFF3 format (which uses '=' as the name/value delimiter, and strictly does
278 * not allow repeat occurrences of the same name - but does allow a
279 * comma-separated list of values).
281 * Returns a (possibly empty) map of lists of values by attribute name.
284 * @param namesDelimiter
285 * the major delimiter between name-value pairs
286 * @param nameValueSeparator
287 * separator used between name and value
288 * @param valuesDelimiter
289 * delimits a list of more than one value
292 public static Map<String, List<String>> parseNameValuePairs(String text,
293 String namesDelimiter, char nameValueSeparator,
294 String valuesDelimiter)
296 Map<String, List<String>> map = new HashMap<>();
297 if (text == null || text.trim().length() == 0)
302 for (String pair : text.trim().split(namesDelimiter))
305 if (pair.length() == 0)
310 int sepPos = pair.indexOf(nameValueSeparator);
313 // no name=value found
317 String key = pair.substring(0, sepPos).trim();
318 String values = pair.substring(sepPos + 1).trim();
319 if (values.length() > 0)
321 List<String> vals = map.get(key);
324 vals = new ArrayList<>();
329 * special case: formatted as jvmap_AttName={a=b,c=d,...}
330 * save the value within { } for parsing at a later stage
332 if (key.startsWith(MAP_ATTRIBUTE_PREFIX))
335 if (key.length() > MAP_ATTRIBUTE_PREFIX.length()
336 && values.startsWith("{")
337 && values.endsWith("}"))
339 vals.add(values.substring(1, values.length() - 1));
343 System.err.println("Malformed GFF data '" + values.toString()
349 for (String val : values.split(valuesDelimiter))
360 * Constructs a SequenceFeature from the GFF column data. Subclasses may wish
361 * to call this method then adjust the SequenceFeature depending on the
362 * particular usage of different tools that generate GFF.
368 protected SequenceFeature buildSequenceFeature(String[] gff,
369 Map<String, List<String>> attributes)
371 return buildSequenceFeature(gff, TYPE_COL, gff[SOURCE_COL], attributes);
381 protected SequenceFeature buildSequenceFeature(String[] gff,
382 int typeColumn, String group, Map<String, List<String>> attributes)
386 int start = Integer.parseInt(gff[START_COL]);
387 int end = Integer.parseInt(gff[END_COL]);
390 * default 'score' is 0 rather than Float.NaN - see JAL-2554
395 score = Float.parseFloat(gff[SCORE_COL]);
396 } catch (NumberFormatException nfe)
398 // e.g. '.' - leave as zero
401 SequenceFeature sf = new SequenceFeature(gff[typeColumn],
402 gff[SOURCE_COL], start, end, score, group);
404 sf.setStrand(gff[STRAND_COL]);
406 sf.setPhase(gff[PHASE_COL]);
408 if (attributes != null)
411 * Add attributes in column 9 to the sequence feature's
412 * 'otherData' table; use Note as a best proxy for description;
413 * decode any encoded comma, equals, semi-colon as per GFF3 spec
415 for (Entry<String, List<String>> attr : attributes.entrySet())
417 String key = attr.getKey();
418 List<String> values = attr.getValue();
419 if (key.startsWith(FeaturesFile.MAP_ATTRIBUTE_PREFIX))
421 key = key.substring(FeaturesFile.MAP_ATTRIBUTE_PREFIX.length());
422 Map<String, String> valueMap = parseAttributeMap(values);
423 sf.setValue(key, valueMap);
427 String csvValues = StringUtils.listToDelimitedString(values,
429 csvValues = StringUtils.urlDecode(csvValues, GFF_ENCODABLE);
430 sf.setValue(key, csvValues);
431 if (NOTE.equals(key))
433 sf.setDescription(csvValues);
440 } catch (NumberFormatException nfe)
442 System.err.println("Invalid number in gff: " + nfe.getMessage());
448 * Parses one or more list of comma-separated key=value pairs into a Map of
454 protected Map<String, String> parseAttributeMap(List<String> values)
456 Map<String, String> map = new HashMap<>();
457 for (String entry : values)
459 String[] fields = entry.split(COMMA);
460 for (String field : fields)
462 String[] keyValue = field.split("=");
463 if (keyValue.length == 2)
465 String theKey = StringUtils.urlDecode(keyValue[0],
467 String theValue = StringUtils.urlDecode(keyValue[1],
469 map.put(theKey, theValue);
477 * Returns any existing mapping held on the alignment between the given
478 * dataset sequences, or a new one if none found. This is a convenience method
479 * to facilitate processing multiple GFF lines that make up a single 'spliced'
480 * mapping, by extending the first mapping as the others are read.
487 protected AlignedCodonFrame getMapping(AlignmentI align,
488 SequenceI fromSeq, SequenceI toSeq)
490 AlignedCodonFrame acf = align.getMapping(fromSeq, toSeq);
493 acf = new AlignedCodonFrame();