/*
 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
 * Copyright (C) $$Year-Rel$$ The Jalview Authors
 *
 * This file is part of Jalview.
 *
 * Jalview is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *
 * Jalview is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
 * The Jalview Authors are detailed in the 'AUTHORS' file.
 */
21 package jalview.io.gff;
23 import jalview.analysis.SequenceIdMatcher;
24 import jalview.datamodel.AlignedCodonFrame;
25 import jalview.datamodel.AlignmentI;
26 import jalview.datamodel.MappingType;
27 import jalview.datamodel.SequenceDummy;
28 import jalview.datamodel.SequenceFeature;
29 import jalview.datamodel.SequenceI;
30 import jalview.util.MapList;
31 import jalview.util.StringUtils;
33 import java.util.ArrayList;
34 import java.util.Arrays;
35 import java.util.HashMap;
36 import java.util.List;
38 import java.util.Map.Entry;
/**
 * Base class with common functionality for flavours of GFF handler (GFF2 or
 * GFF3)
 */
44 public abstract class GffHelperBase implements GffHelperI
46 private static final String NOTE = "Note";
49 * GFF columns 1-9 (zero-indexed):
51 protected static final int SEQID_COL = 0;
53 protected static final int SOURCE_COL = 1;
55 protected static final int TYPE_COL = 2;
57 protected static final int START_COL = 3;
59 protected static final int END_COL = 4;
61 protected static final int SCORE_COL = 5;
63 protected static final int STRAND_COL = 6;
65 protected static final int PHASE_COL = 7;
67 protected static final int ATTRIBUTES_COL = 8;
69 private AlignmentI lastmatchedAl = null;
71 private SequenceIdMatcher matcher = null;
74 * Constructs and returns a mapping, or null if data appear invalid
81 * type of mapping (e.g. protein to nucleotide)
84 protected MapList constructMappingFromAlign(int fromStart, int fromEnd,
85 int toStart, int toEnd, MappingType mappingType)
87 int[] from = new int[] { fromStart, fromEnd };
88 int[] to = new int[] { toStart, toEnd };
91 * Jalview always models from dna to protein, so switch values if the
92 * GFF mapping is from protein to dna
94 if (mappingType == MappingType.PeptideToNucleotide)
99 mappingType = mappingType.getInverse();
102 int fromRatio = mappingType.getFromRatio();
103 int toRatio = mappingType.getToRatio();
106 * sanity check that mapped residue counts match
107 * TODO understand why PASA generates such cases...
109 if (!trimMapping(from, to, fromRatio, toRatio))
111 System.err.println("Ignoring mapping from " + Arrays.toString(from)
112 + " to " + Arrays.toString(to) + " as counts don't match!");
117 * If a codon has an intron gap, there will be contiguous 'toRanges';
118 * this is handled for us by the MapList constructor.
119 * (It is not clear that exonerate ever generates this case)
122 return new MapList(from, to, fromRatio, toRatio);
126 * Checks that the 'from' and 'to' ranges have equivalent lengths. If not,
127 * tries to trim the end of the longer so they do. Returns true if the
128 * mappings could be made equivalent, else false. Note the range array values
129 * may be modified by this method.
137 protected static boolean trimMapping(int[] from, int[] to, int fromRatio,
140 int fromLength = Math.abs(from[1] - from[0]) + 1;
141 int toLength = Math.abs(to[1] - to[0]) + 1;
142 int fromOverlap = fromLength * toRatio - toLength * fromRatio;
143 if (fromOverlap == 0)
147 if (fromOverlap > 0 && fromOverlap % toRatio == 0)
150 * restrict from range to make them match up
151 * it's kind of arbitrary which end we truncate - here it is the end
153 System.err.print("Truncating mapping from " + Arrays.toString(from)
155 if (from[1] > from[0])
157 from[1] -= fromOverlap / toRatio;
161 from[1] += fromOverlap / toRatio;
163 System.err.println(Arrays.toString(from));
166 else if (fromOverlap < 0 && fromOverlap % fromRatio == 0)
168 fromOverlap = -fromOverlap; // > 0
170 * restrict to range to make them match up
172 System.err.print("Truncating mapping to " + Arrays.toString(to)
176 to[1] -= fromOverlap / fromRatio;
180 to[1] += fromOverlap / fromRatio;
182 System.err.println(Arrays.toString(to));
187 * Couldn't truncate to an exact match..
193 * Returns a sequence matching the given id, as follows
195 * <li>strict matching is on exact sequence name</li>
196 * <li>relaxed matching allows matching on a token within the sequence name,
198 * <li>first tries to find a match in the alignment sequences</li>
199 * <li>else tries to find a match in the new sequences already generated while
200 * parsing the features file</li>
201 * <li>else creates a new placeholder sequence, adds it to the new sequences
202 * list, and returns it</li>
208 * @param relaxedIdMatching
212 protected SequenceI findSequence(String seqId, AlignmentI align,
213 List<SequenceI> newseqs, boolean relaxedIdMatching)
219 SequenceI match = null;
220 if (relaxedIdMatching)
222 if (lastmatchedAl != align)
224 lastmatchedAl = align;
225 matcher = new SequenceIdMatcher(align.getSequencesArray());
228 matcher.addAll(newseqs);
231 match = matcher.findIdMatch(seqId);
235 match = align.findName(seqId, true);
236 if (match == null && newseqs != null)
238 for (SequenceI m : newseqs)
240 if (seqId.equals(m.getName()))
248 if (match == null && newseqs != null)
250 match = new SequenceDummy(seqId);
251 if (relaxedIdMatching)
253 matcher.addAll(Arrays.asList(new SequenceI[] { match }));
255 // add dummy sequence to the newseqs list
262 * Parses the input line to a map of name / value(s) pairs. For example the
264 * Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal <br>
265 * if parsed with delimiter=";" and separators {' ', '='} <br>
266 * would return a map with { Notes={Fe=S, Metal}, Method={manual curation,
267 * prediction}, source={Pfam}} <br>
269 * This method supports parsing of either GFF2 format (which uses space ' ' as
270 * the name/value delimiter, and allows multiple occurrences of the same
271 * name), or GFF3 format (which uses '=' as the name/value delimiter, and
272 * strictly does not allow repeat occurrences of the same name - but does
273 * allow a comma-separated list of values).
276 * @param namesDelimiter
277 * the major delimiter between name-value pairs
278 * @param nameValueSeparator
279 * one or more separators used between name and value
280 * @param valuesDelimiter
281 * delimits a list of more than one value
282 * @return the name-values map (which may be empty but never null)
284 public static Map<String, List<String>> parseNameValuePairs(String text,
285 String namesDelimiter, char nameValueSeparator,
286 String valuesDelimiter)
288 Map<String, List<String>> map = new HashMap<String, List<String>>();
289 if (text == null || text.trim().length() == 0)
294 for (String pair : text.trim().split(namesDelimiter))
297 if (pair.length() == 0)
302 int sepPos = pair.indexOf(nameValueSeparator);
305 // no name=value present
309 String key = pair.substring(0, sepPos).trim();
310 String values = pair.substring(sepPos + 1).trim();
311 if (values.length() > 0)
313 List<String> vals = map.get(key);
316 vals = new ArrayList<String>();
319 for (String val : values.split(valuesDelimiter))
329 * Constructs a SequenceFeature from the GFF column data. Subclasses may wish
330 * to call this method then adjust the SequenceFeature depending on the
331 * particular usage of different tools that generate GFF.
337 protected SequenceFeature buildSequenceFeature(String[] gff,
338 Map<String, List<String>> attributes)
342 int start = Integer.parseInt(gff[START_COL]);
343 int end = Integer.parseInt(gff[END_COL]);
346 * default 'score' is 0 rather than Float.NaN as the latter currently
347 * disables the 'graduated colour => colour by label' option
352 score = Float.parseFloat(gff[SCORE_COL]);
353 } catch (NumberFormatException nfe)
355 // e.g. '.' - leave as zero
358 SequenceFeature sf = new SequenceFeature(gff[TYPE_COL],
359 gff[SOURCE_COL], start, end, score, gff[SOURCE_COL]);
361 sf.setStrand(gff[STRAND_COL]);
363 sf.setPhase(gff[PHASE_COL]);
365 if (attributes != null)
368 * save 'raw' column 9 to allow roundtrip output as input
370 sf.setAttributes(gff[ATTRIBUTES_COL]);
373 * Add attributes in column 9 to the sequence feature's
374 * 'otherData' table; use Note as a best proxy for description
376 for (Entry<String, List<String>> attr : attributes.entrySet())
378 String values = StringUtils.listToDelimitedString(
379 attr.getValue(), ",");
380 sf.setValue(attr.getKey(), values);
381 if (NOTE.equals(attr.getKey()))
383 sf.setDescription(values);
389 } catch (NumberFormatException nfe)
391 System.err.println("Invalid number in gff: " + nfe.getMessage());
397 * Returns the character used to separate attributes names from values in GFF
398 * column 9. This is space for GFF2, '=' for GFF3.
402 protected abstract char getNameValueSeparator();
405 * Returns any existing mapping held on the alignment between the given
406 * dataset sequences, or a new one if none found. This is a convenience method
407 * to facilitate processing multiple GFF lines that make up a single 'spliced'
408 * mapping, by extending the first mapping as the others are read.
415 protected AlignedCodonFrame getMapping(AlignmentI align,
416 SequenceI fromSeq, SequenceI toSeq)
418 AlignedCodonFrame acf = align.getMapping(fromSeq, toSeq);
421 acf = new AlignedCodonFrame();