2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.io.gff;
23 import jalview.analysis.SequenceIdMatcher;
24 import jalview.datamodel.AlignedCodonFrame;
25 import jalview.datamodel.AlignmentI;
26 import jalview.datamodel.MappingType;
27 import jalview.datamodel.SequenceDummy;
28 import jalview.datamodel.SequenceFeature;
29 import jalview.datamodel.SequenceI;
30 import jalview.util.MapList;
31 import jalview.util.StringUtils;
33 import java.util.ArrayList;
34 import java.util.Arrays;
35 import java.util.HashMap;
36 import java.util.List;
38 import java.util.Map.Entry;
/**
 * Base class with common functionality for flavours of GFF handler (GFF2 or
 * GFF3)
 */
44 public abstract class GffHelperBase implements GffHelperI
46 private static final String INVALID_GFF_ATTRIBUTE_FORMAT = "Invalid GFF attribute format: ";
48 protected static final String COMMA = ",";
50 protected static final String EQUALS = "=";
52 protected static final String NOTE = "Note";
55 * GFF columns 1-9 (zero-indexed):
57 protected static final int SEQID_COL = 0;
59 protected static final int SOURCE_COL = 1;
61 protected static final int TYPE_COL = 2;
63 protected static final int START_COL = 3;
65 protected static final int END_COL = 4;
67 protected static final int SCORE_COL = 5;
69 protected static final int STRAND_COL = 6;
71 protected static final int PHASE_COL = 7;
73 protected static final int ATTRIBUTES_COL = 8;
75 private AlignmentI lastmatchedAl = null;
77 private SequenceIdMatcher matcher = null;
80 * Constructs and returns a mapping, or null if data appear invalid
87 * type of mapping (e.g. protein to nucleotide)
90 protected MapList constructMappingFromAlign(int fromStart, int fromEnd,
91 int toStart, int toEnd, MappingType mappingType)
93 int[] from = new int[] { fromStart, fromEnd };
94 int[] to = new int[] { toStart, toEnd };
97 * Jalview always models from dna to protein, so switch values if the
98 * GFF mapping is from protein to dna
100 if (mappingType == MappingType.PeptideToNucleotide)
105 mappingType = mappingType.getInverse();
108 int fromRatio = mappingType.getFromRatio();
109 int toRatio = mappingType.getToRatio();
112 * sanity check that mapped residue counts match
113 * TODO understand why PASA generates such cases...
115 if (!trimMapping(from, to, fromRatio, toRatio))
117 jalview.bin.Console.errPrintln(
118 "Ignoring mapping from " + Arrays.toString(from) + " to "
119 + Arrays.toString(to) + " as counts don't match!");
124 * If a codon has an intron gap, there will be contiguous 'toRanges';
125 * this is handled for us by the MapList constructor.
126 * (It is not clear that exonerate ever generates this case)
129 return new MapList(from, to, fromRatio, toRatio);
133 * Checks that the 'from' and 'to' ranges have equivalent lengths. If not,
134 * tries to trim the end of the longer so they do. Returns true if the
135 * mappings could be made equivalent, else false. Note the range array values
136 * may be modified by this method.
144 protected static boolean trimMapping(int[] from, int[] to, int fromRatio,
147 int fromLength = Math.abs(from[1] - from[0]) + 1;
148 int toLength = Math.abs(to[1] - to[0]) + 1;
149 int fromOverlap = fromLength * toRatio - toLength * fromRatio;
150 if (fromOverlap == 0)
154 if (fromOverlap > 0 && fromOverlap % toRatio == 0)
157 * restrict from range to make them match up
158 * it's kind of arbitrary which end we truncate - here it is the end
161 "Truncating mapping from " + Arrays.toString(from) + " to ");
162 if (from[1] > from[0])
164 from[1] -= fromOverlap / toRatio;
168 from[1] += fromOverlap / toRatio;
170 jalview.bin.Console.errPrintln(Arrays.toString(from));
173 else if (fromOverlap < 0 && fromOverlap % fromRatio == 0)
175 fromOverlap = -fromOverlap; // > 0
177 * restrict to range to make them match up
180 "Truncating mapping to " + Arrays.toString(to) + " to ");
183 to[1] -= fromOverlap / fromRatio;
187 to[1] += fromOverlap / fromRatio;
189 jalview.bin.Console.errPrintln(Arrays.toString(to));
194 * Couldn't truncate to an exact match..
200 * Returns a sequence matching the given id, as follows
202 * <li>strict matching is on exact sequence name</li>
203 * <li>relaxed matching allows matching on a token within the sequence name,
205 * <li>first tries to find a match in the alignment sequences</li>
206 * <li>else tries to find a match in the new sequences already generated while
207 * parsing the features file</li>
208 * <li>else creates a new placeholder sequence, adds it to the new sequences
209 * list, and returns it</li>
215 * @param relaxedIdMatching
219 protected SequenceI findSequence(String seqId, AlignmentI align,
220 List<SequenceI> newseqs, boolean relaxedIdMatching)
226 SequenceI match = null;
227 if (relaxedIdMatching)
229 if (lastmatchedAl != align)
231 lastmatchedAl = align;
232 matcher = new SequenceIdMatcher(align.getSequencesArray());
235 matcher.addAll(newseqs);
238 match = matcher.findIdMatch(seqId);
242 match = align.findName(seqId, true);
243 if (match == null && newseqs != null)
245 for (SequenceI m : newseqs)
247 if (seqId.equals(m.getName()))
255 if (match == null && newseqs != null)
257 match = new SequenceDummy(seqId);
258 if (relaxedIdMatching)
260 matcher.addAll(Arrays.asList(new SequenceI[] { match }));
262 // add dummy sequence to the newseqs list
269 * Parses the input line to a map of name / value(s) pairs. For example the
273 * Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal
276 * if parsed with delimiter=";" and separators {' ', '='} <br>
277 * would return a map with { Notes={Fe=S, Metal}, Method={manual curation,
278 * prediction}, source={Pfam}} <br>
280 * This method supports parsing of either GFF2 format (which uses space ' ' as
281 * the name/value delimiter, and allows multiple occurrences of the same
282 * name), or GFF3 format (which uses '=' as the name/value delimiter, and
283 * strictly does not allow repeat occurrences of the same name - but does
284 * allow a comma-separated list of values).
286 * Returns a (possibly empty) map of lists of values by attribute name.
289 * @param namesDelimiter
290 * the major delimiter between name-value pairs
291 * @param nameValueSeparator
292 * separator used between name and value
293 * @param valuesDelimiter
294 * delimits a list of more than one value
297 public static Map<String, List<String>> parseNameValuePairs(String text,
298 String namesDelimiter, char nameValueSeparator,
299 String valuesDelimiter)
301 Map<String, List<String>> map = new HashMap<>();
302 if (text == null || text.trim().length() == 0)
308 * split by major delimiter (; for GFF3)
310 for (String nameValuePair : text.trim().split(namesDelimiter))
312 nameValuePair = nameValuePair.trim();
313 if (nameValuePair.length() == 0)
319 * find name/value separator (= for GFF3)
321 int sepPos = nameValuePair.indexOf(nameValueSeparator);
324 // no name=value found
328 String name = nameValuePair.substring(0, sepPos).trim();
329 String values = nameValuePair.substring(sepPos + 1).trim();
330 if (values.isEmpty())
335 List<String> vals = map.get(name);
338 vals = new ArrayList<>();
343 * if 'values' contains more name/value separators, parse as a map
344 * (nested sub-attribute values)
346 if (values.indexOf(nameValueSeparator) != -1)
352 for (String val : values.split(valuesDelimiter))
363 * Constructs a SequenceFeature from the GFF column data. Subclasses may wish
364 * to call this method then adjust the SequenceFeature depending on the
365 * particular usage of different tools that generate GFF.
371 protected SequenceFeature buildSequenceFeature(String[] gff,
372 Map<String, List<String>> attributes)
374 return buildSequenceFeature(gff, TYPE_COL, gff[SOURCE_COL], attributes);
384 protected SequenceFeature buildSequenceFeature(String[] gff,
385 int typeColumn, String group,
386 Map<String, List<String>> attributes)
390 int start = Integer.parseInt(gff[START_COL]);
391 int end = Integer.parseInt(gff[END_COL]);
394 * default 'score' is 0 rather than Float.NaN - see JAL-2554
399 score = Float.parseFloat(gff[SCORE_COL]);
400 } catch (NumberFormatException nfe)
402 // e.g. '.' - leave as zero
405 SequenceFeature sf = new SequenceFeature(gff[typeColumn],
406 gff[SOURCE_COL], start, end, score, group);
408 sf.setStrand(gff[STRAND_COL]);
410 sf.setPhase(gff[PHASE_COL]);
412 if (attributes != null)
415 * Add attributes in column 9 to the sequence feature's
416 * 'otherData' table; use Note as a best proxy for description;
417 * decode any encoded comma, equals, semi-colon as per GFF3 spec
419 for (Entry<String, List<String>> attr : attributes.entrySet())
421 String key = attr.getKey();
422 List<String> values = attr.getValue();
423 if (values.size() == 1 && values.get(0).contains(EQUALS))
426 * 'value' is actually nested subattributes as x=a,y=b,z=c
428 Map<String, String> valueMap = parseAttributeMap(values.get(0));
429 sf.setValue(key, valueMap);
433 String csvValues = StringUtils.listToDelimitedString(values,
435 csvValues = StringUtils.urlDecode(csvValues, GFF_ENCODABLE);
436 sf.setValue(key, csvValues);
437 if (NOTE.equals(key))
439 sf.setDescription(csvValues);
446 } catch (NumberFormatException nfe)
449 .errPrintln("Invalid number in gff: " + nfe.getMessage());
455 * Parses a (GFF3 format) list of comma-separated key=value pairs into a Map
458 * An input string like {@code a=b,c,d=e,f=g,h} is parsed to
470 protected static Map<String, String> parseAttributeMap(String s)
472 Map<String, String> map = new HashMap<>();
473 String[] fields = s.split(EQUALS);
478 boolean valid = true;
479 if (fields.length < 2)
482 * need at least A=B here
486 else if (fields[0].isEmpty() || fields[0].contains(COMMA))
489 * A,B=C is not a valid start, nor is =C
495 for (int i = 1; i < fields.length - 1; i++)
497 if (fields[i].isEmpty() || !fields[i].contains(COMMA))
500 * intermediate tokens must include value,name
509 jalview.bin.Console.errPrintln(INVALID_GFF_ATTRIBUTE_FORMAT + s);
514 while (i < fields.length - 1)
516 boolean lastPair = i == fields.length - 2;
517 String before = fields[i];
518 String after = fields[i + 1];
521 * if 'key' looks like a,b,c then the last token is the
524 String theKey = before.contains(COMMA)
525 ? before.substring(before.lastIndexOf(COMMA) + 1)
528 theKey = theKey.trim();
529 if (theKey.isEmpty())
531 jalview.bin.Console.errPrintln(INVALID_GFF_ATTRIBUTE_FORMAT + s);
537 * if 'value' looks like a,b,c then all but the last token is the value,
538 * unless this is the last field (no more = to follow), in which case
539 * all of it makes up the value
541 String theValue = after.contains(COMMA) && !lastPair
542 ? after.substring(0, after.lastIndexOf(COMMA))
544 map.put(StringUtils.urlDecode(theKey, GFF_ENCODABLE),
545 StringUtils.urlDecode(theValue, GFF_ENCODABLE));
553 * Returns any existing mapping held on the alignment between the given
554 * dataset sequences, or a new one if none found. This is a convenience method
555 * to facilitate processing multiple GFF lines that make up a single 'spliced'
556 * mapping, by extending the first mapping as the others are read.
563 protected AlignedCodonFrame getMapping(AlignmentI align,
564 SequenceI fromSeq, SequenceI toSeq)
566 AlignedCodonFrame acf = align.getMapping(fromSeq, toSeq);
569 acf = new AlignedCodonFrame();