2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.io.gff;
23 import jalview.analysis.SequenceIdMatcher;
24 import jalview.datamodel.AlignedCodonFrame;
25 import jalview.datamodel.AlignmentI;
26 import jalview.datamodel.MappingType;
27 import jalview.datamodel.SequenceDummy;
28 import jalview.datamodel.SequenceFeature;
29 import jalview.datamodel.SequenceI;
30 import jalview.io.FeaturesFile;
31 import jalview.util.MapList;
32 import jalview.util.StringUtils;
34 import java.util.ArrayList;
35 import java.util.Arrays;
36 import java.util.HashMap;
37 import java.util.List;
39 import java.util.Map.Entry;
42 * Base class with common functionality for flavours of GFF handler (GFF2 or
45 public abstract class GffHelperBase implements GffHelperI
47 private static final String NOTE = "Note";
50 * GFF columns 1-9 (zero-indexed):
52 protected static final int SEQID_COL = 0;
54 protected static final int SOURCE_COL = 1;
56 protected static final int TYPE_COL = 2;
58 protected static final int START_COL = 3;
60 protected static final int END_COL = 4;
62 protected static final int SCORE_COL = 5;
64 protected static final int STRAND_COL = 6;
66 protected static final int PHASE_COL = 7;
68 protected static final int ATTRIBUTES_COL = 8;
70 private AlignmentI lastmatchedAl = null;
72 private SequenceIdMatcher matcher = null;
75 * Constructs and returns a mapping, or null if data appear invalid
82 * type of mapping (e.g. protein to nucleotide)
85 protected MapList constructMappingFromAlign(int fromStart, int fromEnd,
86 int toStart, int toEnd, MappingType mappingType)
88 int[] from = new int[] { fromStart, fromEnd };
89 int[] to = new int[] { toStart, toEnd };
92 * Jalview always models from dna to protein, so switch values if the
93 * GFF mapping is from protein to dna
95 if (mappingType == MappingType.PeptideToNucleotide)
100 mappingType = mappingType.getInverse();
103 int fromRatio = mappingType.getFromRatio();
104 int toRatio = mappingType.getToRatio();
107 * sanity check that mapped residue counts match
108 * TODO understand why PASA generates such cases...
110 if (!trimMapping(from, to, fromRatio, toRatio))
112 System.err.println("Ignoring mapping from " + Arrays.toString(from)
113 + " to " + Arrays.toString(to) + " as counts don't match!");
118 * If a codon has an intron gap, there will be contiguous 'toRanges';
119 * this is handled for us by the MapList constructor.
120 * (It is not clear that exonerate ever generates this case)
123 return new MapList(from, to, fromRatio, toRatio);
127 * Checks that the 'from' and 'to' ranges have equivalent lengths. If not,
128 * tries to trim the end of the longer so they do. Returns true if the
129 * mappings could be made equivalent, else false. Note the range array values
130 * may be modified by this method.
138 protected static boolean trimMapping(int[] from, int[] to, int fromRatio,
141 int fromLength = Math.abs(from[1] - from[0]) + 1;
142 int toLength = Math.abs(to[1] - to[0]) + 1;
143 int fromOverlap = fromLength * toRatio - toLength * fromRatio;
144 if (fromOverlap == 0)
148 if (fromOverlap > 0 && fromOverlap % toRatio == 0)
151 * restrict from range to make them match up
152 * it's kind of arbitrary which end we truncate - here it is the end
155 "Truncating mapping from " + Arrays.toString(from) + " to ");
156 if (from[1] > from[0])
158 from[1] -= fromOverlap / toRatio;
162 from[1] += fromOverlap / toRatio;
164 System.err.println(Arrays.toString(from));
167 else if (fromOverlap < 0 && fromOverlap % fromRatio == 0)
169 fromOverlap = -fromOverlap; // > 0
171 * restrict to range to make them match up
174 "Truncating mapping to " + Arrays.toString(to) + " to ");
177 to[1] -= fromOverlap / fromRatio;
181 to[1] += fromOverlap / fromRatio;
183 System.err.println(Arrays.toString(to));
188 * Couldn't truncate to an exact match..
194 * Returns a sequence matching the given id, as follows
196 * <li>strict matching is on exact sequence name</li>
197 * <li>relaxed matching allows matching on a token within the sequence name,
199 * <li>first tries to find a match in the alignment sequences</li>
200 * <li>else tries to find a match in the new sequences already generated while
201 * parsing the features file</li>
202 * <li>else creates a new placeholder sequence, adds it to the new sequences
203 * list, and returns it</li>
209 * @param relaxedIdMatching
213 protected SequenceI findSequence(String seqId, AlignmentI align,
214 List<SequenceI> newseqs, boolean relaxedIdMatching)
220 SequenceI match = null;
221 if (relaxedIdMatching)
223 if (lastmatchedAl != align)
225 lastmatchedAl = align;
226 matcher = new SequenceIdMatcher(align.getSequencesArray());
229 matcher.addAll(newseqs);
232 match = matcher.findIdMatch(seqId);
236 match = align.findName(seqId, true);
237 if (match == null && newseqs != null)
239 for (SequenceI m : newseqs)
241 if (seqId.equals(m.getName()))
249 if (match == null && newseqs != null)
251 match = new SequenceDummy(seqId);
252 if (relaxedIdMatching)
254 matcher.addAll(Arrays.asList(new SequenceI[] { match }));
256 // add dummy sequence to the newseqs list
263 * Parses the input line to a map of name / value(s) pairs. For example the
265 * Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal
267 * if parsed with delimiter=";" and separators {' ', '='} <br>
268 * would return a map with { Notes={Fe=S, Metal}, Method={manual curation,
269 * prediction}, source={Pfam}} <br>
271 * This method supports parsing of either GFF2 format (which uses space ' ' as
272 * the name/value delimiter, and allows multiple occurrences of the same
273 * name), or GFF3 format (which uses '=' as the name/value delimiter, and
274 * strictly does not allow repeat occurrences of the same name - but does
275 * allow a comma-separated list of values).
278 * @param namesDelimiter
279 * the major delimiter between name-value pairs
280 * @param nameValueSeparator
281 * one or more separators used between name and value
282 * @param valuesDelimiter
283 * delimits a list of more than one value
284 * @return the name-values map (which may be empty but never null)
286 public static Map<String, List<String>> parseNameValuePairs(String text,
287 String namesDelimiter, char nameValueSeparator,
288 String valuesDelimiter)
290 Map<String, List<String>> map = new HashMap<>();
291 if (text == null || text.trim().length() == 0)
296 for (String pair : text.trim().split(namesDelimiter))
299 if (pair.length() == 0)
304 int sepPos = pair.indexOf(nameValueSeparator);
307 // no name=value present
311 String key = pair.substring(0, sepPos).trim();
312 String values = pair.substring(sepPos + 1).trim();
313 if (values.length() > 0)
315 List<String> vals = map.get(key);
318 vals = new ArrayList<>();
321 for (String val : values.split(valuesDelimiter))
331 * Constructs a SequenceFeature from the GFF column data. Subclasses may wish
332 * to call this method then adjust the SequenceFeature depending on the
333 * particular usage of different tools that generate GFF.
339 protected SequenceFeature buildSequenceFeature(String[] gff,
340 Map<String, List<String>> attributes)
342 return buildSequenceFeature(gff, TYPE_COL, gff[SOURCE_COL], attributes);
352 protected SequenceFeature buildSequenceFeature(String[] gff,
353 int typeColumn, String group, Map<String, List<String>> attributes)
357 int start = Integer.parseInt(gff[START_COL]);
358 int end = Integer.parseInt(gff[END_COL]);
361 * default 'score' is 0 rather than Float.NaN - see JAL-2554
366 score = Float.parseFloat(gff[SCORE_COL]);
367 } catch (NumberFormatException nfe)
369 // e.g. '.' - leave as zero
372 SequenceFeature sf = new SequenceFeature(gff[typeColumn],
373 gff[SOURCE_COL], start, end, score, group);
375 sf.setStrand(gff[STRAND_COL]);
377 sf.setPhase(gff[PHASE_COL]);
379 if (attributes != null)
382 * Add attributes in column 9 to the sequence feature's
383 * 'otherData' table; use Note as a best proxy for description;
384 * decode any encoded comma, equals, semi-colon as per GFF3 spec
386 for (Entry<String, List<String>> attr : attributes.entrySet())
388 String key = attr.getKey();
389 List<String> value = attr.getValue();
390 if (key.startsWith(FeaturesFile.MAP_ATTRIBUTE_PREFIX))
393 * e.g. jvmap_CSQ={ALLELE_NUM=1,CDS_position=249,Codons=caG/caT}
396 .substring(FeaturesFile.MAP_ATTRIBUTE_PREFIX.length());
397 if (trueKey.isEmpty() || value.isEmpty()
398 || !value.get(0).startsWith("{")
399 || !value.get(value.size() - 1).endsWith("}"))
401 System.err.println("Malformed GFF data '" + value.toString()
405 Map<String, String> values = new HashMap<>();
406 for (String entry : value)
408 if (entry.startsWith("{"))
410 entry = entry.substring(1);
412 if (entry.endsWith("}"))
414 entry = entry.substring(0, entry.length() - 1);
416 String[] fields = entry.split(",");
417 for (String field : fields)
419 String[] keyValue = field.split("=");
420 if (keyValue.length == 2)
422 String theKey = StringUtils.urlDecode(keyValue[0],
424 String theValue = StringUtils.urlDecode(keyValue[1],
426 values.put(theKey, theValue);
430 sf.setValue(trueKey, values);
434 String values = StringUtils
435 .listToDelimitedString(value, ",");
436 values = StringUtils.urlDecode(values, GFF_ENCODABLE);
437 sf.setValue(key, values);
438 if (NOTE.equals(key))
440 sf.setDescription(values);
447 } catch (NumberFormatException nfe)
449 System.err.println("Invalid number in gff: " + nfe.getMessage());
455 * Returns the character used to separate attributes names from values in GFF
456 * column 9. This is space for GFF2, '=' for GFF3.
460 protected abstract char getNameValueSeparator();
463 * Returns any existing mapping held on the alignment between the given
464 * dataset sequences, or a new one if none found. This is a convenience method
465 * to facilitate processing multiple GFF lines that make up a single 'spliced'
466 * mapping, by extending the first mapping as the others are read.
473 protected AlignedCodonFrame getMapping(AlignmentI align,
474 SequenceI fromSeq, SequenceI toSeq)
476 AlignedCodonFrame acf = align.getMapping(fromSeq, toSeq);
479 acf = new AlignedCodonFrame();