/*
 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
 * Copyright (C) $$Year-Rel$$ The Jalview Authors
 *
 * This file is part of Jalview.
 *
 * Jalview is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *
 * Jalview is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
 * The Jalview Authors are detailed in the 'AUTHORS' file.
 */
21 package jalview.io.gff;
import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.MappingType;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.util.MapList;

import java.io.IOException;
import java.util.List;
import java.util.Locale;
import java.util.Map;
/**
 * A handler to parse GFF in the format generated by the exonerate tool
 */
39 public class ExonerateHelper extends Gff2Helper
41 private static final String SIMILARITY = "similarity";
43 private static final String GENOME2GENOME = "genome2genome";
45 private static final String CDNA2GENOME = "cdna2genome";
47 private static final String CODING2GENOME = "coding2genome";
49 private static final String CODING2CODING = "coding2coding";
51 private static final String PROTEIN2GENOME = "protein2genome";
53 private static final String PROTEIN2DNA = "protein2dna";
55 private static final String ALIGN = "Align";
57 private static final String QUERY = "Query";
59 private static final String TARGET = "Target";
62 * Process one GFF feature line (as modelled by SequenceFeature)
65 * the sequence with which this feature is associated
67 * the sequence feature with ATTRIBUTES property containing any
68 * additional attributes
70 * the alignment we are adding GFF to
72 * any new sequences referenced by the GFF
73 * @param relaxedIdMatching
74 * if true, match word tokens in sequence names
75 * @return true if the sequence feature should be added to the sequence, else
76 * false (i.e. it has been processed in another way e.g. to generate a
80 public SequenceFeature processGff(SequenceI seq, String[] gffColumns,
81 AlignmentI align, List<SequenceI> newseqs,
82 boolean relaxedIdMatching)
84 String attr = gffColumns[ATTRIBUTES_COL];
85 Map<String, List<String>> set = parseNameValuePairs(attr);
89 processGffSimilarity(set, seq, gffColumns, align, newseqs,
91 } catch (IOException ivfe)
93 System.err.println(ivfe);
97 * return null to indicate we don't want to add a sequence feature for
98 * similarity (only process it to create mappings)
104 * Processes the 'Query' (or 'Target') and 'Align' properties associated with
105 * an exonerate GFF similarity feature; these properties define the mapping of
106 * the annotated range to a related sequence.
109 * parsed GFF column 9 key/value(s)
111 * the sequence the GFF feature is on
113 * the GFF column data
115 * the alignment the sequence belongs to, where any new mappings
118 * a list of new 'virtual sequences' generated while parsing GFF
119 * @param relaxedIdMatching
120 * if true allow fuzzy search for a matching target sequence
121 * @throws IOException
123 protected void processGffSimilarity(Map<String, List<String>> set,
124 SequenceI seq, String[] gff, AlignmentI align,
125 List<SequenceI> newseqs, boolean relaxedIdMatching)
129 * exonerate may be run with
130 * --showquerygff - outputs 'features on the query' e.g. (protein2genome)
131 * Target <dnaseqid> ; Align proteinStartPos dnaStartPos proteinCount
132 * --showtargetgff - outputs 'features on the target' e.g. (protein2genome)
133 * Query <proteinseqid> ; Align dnaStartPos proteinStartPos nucleotideCount
134 * where the Align spec may repeat
136 // TODO handle coding2coding and similar as well
137 boolean featureIsOnTarget = true;
138 List<String> mapTo = set.get(QUERY);
141 mapTo = set.get(TARGET);
142 featureIsOnTarget = false;
144 MappingType type = getMappingType(gff[SOURCE_COL]);
148 throw new IOException("Sorry, I don't handle " + gff[SOURCE_COL]);
151 if (mapTo == null || mapTo.size() != 1)
153 throw new IOException(
154 "Expecting exactly one sequence in Query or Target field (got "
159 * locate the mapped sequence in the alignment or 'new' (GFF file) sequences;
161 SequenceI mappedSequence = findSequence(mapTo.get(0), align, newseqs,
165 * If mapping is from protein to dna, we store it as dna to protein instead
167 SequenceI mapFromSequence = seq;
168 SequenceI mapToSequence = mappedSequence;
169 if ((type == MappingType.NucleotideToPeptide && featureIsOnTarget)
170 || (type == MappingType.PeptideToNucleotide
171 && !featureIsOnTarget))
173 mapFromSequence = mappedSequence;
178 * Process the Align maps and create mappings.
179 * These may be cdna-genome, cdna-protein, genome-protein.
180 * The mapped sequences may or may not be in the alignment
181 * (they may be included later in the GFF file).
185 * get any existing mapping for these sequences (or start one),
186 * and add this mapped range
188 AlignedCodonFrame acf = getMapping(align, mapFromSequence,
192 * exonerate GFF has the strand of the target in column 7
193 * (differs from GFF3 which has it in the Target descriptor)
195 String strand = gff[STRAND_COL];
196 boolean forwardStrand = true;
197 if ("-".equals(strand))
199 forwardStrand = false;
201 else if (!"+".equals(strand))
203 System.err.println("Strand must be specified for alignment");
207 List<String> alignedRegions = set.get(ALIGN);
208 for (String region : alignedRegions)
210 MapList mapping = buildMapping(region, type, forwardStrand,
211 featureIsOnTarget, gff);
218 acf.addMap(mapFromSequence, mapToSequence, mapping);
220 align.addCodonFrame(acf);
224 * Construct the mapping
228 * @param forwardStrand
229 * @param featureIsOnTarget
233 protected MapList buildMapping(String region, MappingType type,
234 boolean forwardStrand, boolean featureIsOnTarget, String[] gff)
237 * process one "fromStart toStart fromCount" descriptor
239 String[] tokens = region.split(" ");
240 if (tokens.length != 3)
242 System.err.println("Malformed Align descriptor: " + region);
247 * get start/end of from/to mappings
248 * if feature is on the target sequence we have to invert the sense
255 alignFromStart = Integer.parseInt(tokens[0]);
256 alignToStart = Integer.parseInt(tokens[1]);
257 alignCount = Integer.parseInt(tokens[2]);
258 } catch (NumberFormatException nfe)
260 System.err.println(nfe.toString());
269 if (featureIsOnTarget)
271 fromStart = alignToStart;
272 toStart = alignFromStart;
273 toEnd = forwardStrand ? toStart + alignCount - 1
274 : toStart - (alignCount - 1);
275 int toLength = Math.abs(toEnd - toStart) + 1;
276 int fromLength = toLength * type.getFromRatio() / type.getToRatio();
277 fromEnd = fromStart + fromLength - 1;
281 // we use the 'Align' values here not the feature start/end
282 // not clear why they may differ but it seems they can
283 fromStart = alignFromStart;
284 fromEnd = alignFromStart + alignCount - 1;
285 int fromLength = fromEnd - fromStart + 1;
286 int toLength = fromLength * type.getToRatio() / type.getFromRatio();
287 toStart = alignToStart;
290 toEnd = toStart + toLength - 1;
294 toEnd = toStart - (toLength - 1);
298 MapList codonmapping = constructMappingFromAlign(fromStart, fromEnd,
299 toStart, toEnd, type);
304 * Returns a MappingType depending on the exonerate 'model' value.
309 protected static MappingType getMappingType(String model)
311 MappingType result = null;
313 if (model.contains(PROTEIN2DNA) || model.contains(PROTEIN2GENOME))
315 result = MappingType.PeptideToNucleotide;
317 else if (model.contains(CODING2CODING) || model.contains(CODING2GENOME)
318 || model.contains(CDNA2GENOME) || model.contains(GENOME2GENOME))
320 result = MappingType.NucleotideToNucleotide;
326 * Tests whether the GFF data looks like it was generated by exonerate, and is
327 * a format we are willing to handle
332 public static boolean recognises(String[] columns)
334 if (!SIMILARITY.equalsIgnoreCase(columns[TYPE_COL]))
340 * inspect alignment model
342 String model = columns[SOURCE_COL];
343 // e.g. exonerate:protein2genome:local
346 String mdl = model.toLowerCase(Locale.ROOT);
347 if (mdl.contains(PROTEIN2DNA) || mdl.contains(PROTEIN2GENOME)
348 || mdl.contains(CODING2CODING) || mdl.contains(CODING2GENOME)
349 || mdl.contains(CDNA2GENOME) || mdl.contains(GENOME2GENOME))
354 System.err.println("Sorry, I don't handle exonerate model " + model);
/**
 * An override to set feature group to "exonerate" instead of the default GFF
 * source value (column 2)
 */
363 protected SequenceFeature buildSequenceFeature(String[] gff,
364 Map<String, List<String>> set)
366 SequenceFeature sf = super.buildSequenceFeature(gff, TYPE_COL,