/*
 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
 * Copyright (C) $$Year-Rel$$ The Jalview Authors
 *
 * This file is part of Jalview.
 *
 * Jalview is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *
 * Jalview is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
 * The Jalview Authors are detailed in the 'AUTHORS' file.
 */
21 package jalview.io.gff;
import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.MappingType;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.util.MapList;

import java.io.IOException;
import java.util.List;
import java.util.Locale;
import java.util.Map;
/**
 * A handler to parse GFF in the format generated by the exonerate tool
 */
37 public class ExonerateHelper extends Gff2Helper
39 private static final String SIMILARITY = "similarity";
41 private static final String GENOME2GENOME = "genome2genome";
43 private static final String CDNA2GENOME = "cdna2genome";
45 private static final String CODING2GENOME = "coding2genome";
47 private static final String CODING2CODING = "coding2coding";
49 private static final String PROTEIN2GENOME = "protein2genome";
51 private static final String PROTEIN2DNA = "protein2dna";
53 private static final String ALIGN = "Align";
55 private static final String QUERY = "Query";
57 private static final String TARGET = "Target";
60 * Process one GFF feature line (as modelled by SequenceFeature)
63 * the sequence with which this feature is associated
65 * the sequence feature with ATTRIBUTES property containing any
66 * additional attributes
68 * the alignment we are adding GFF to
70 * any new sequences referenced by the GFF
71 * @param relaxedIdMatching
72 * if true, match word tokens in sequence names
73 * @return true if the sequence feature should be added to the sequence, else
74 * false (i.e. it has been processed in another way e.g. to generate a
78 public SequenceFeature processGff(SequenceI seq, String[] gffColumns,
79 AlignmentI align, List<SequenceI> newseqs,
80 boolean relaxedIdMatching)
82 String attr = gffColumns[ATTRIBUTES_COL];
83 Map<String, List<String>> set = parseNameValuePairs(attr);
87 processGffSimilarity(set, seq, gffColumns, align, newseqs,
89 } catch (IOException ivfe)
91 System.err.println(ivfe);
95 * return null to indicate we don't want to add a sequence feature for
96 * similarity (only process it to create mappings)
102 * Processes the 'Query' (or 'Target') and 'Align' properties associated with
103 * an exonerate GFF similarity feature; these properties define the mapping of
104 * the annotated range to a related sequence.
107 * parsed GFF column 9 key/value(s)
109 * the sequence the GFF feature is on
111 * the GFF column data
113 * the alignment the sequence belongs to, where any new mappings
116 * a list of new 'virtual sequences' generated while parsing GFF
117 * @param relaxedIdMatching
118 * if true allow fuzzy search for a matching target sequence
119 * @throws IOException
121 protected void processGffSimilarity(Map<String, List<String>> set,
122 SequenceI seq, String[] gff, AlignmentI align,
123 List<SequenceI> newseqs, boolean relaxedIdMatching)
127 * exonerate may be run with
128 * --showquerygff - outputs 'features on the query' e.g. (protein2genome)
129 * Target <dnaseqid> ; Align proteinStartPos dnaStartPos proteinCount
130 * --showtargetgff - outputs 'features on the target' e.g. (protein2genome)
131 * Query <proteinseqid> ; Align dnaStartPos proteinStartPos nucleotideCount
132 * where the Align spec may repeat
134 // TODO handle coding2coding and similar as well
135 boolean featureIsOnTarget = true;
136 List<String> mapTo = set.get(QUERY);
139 mapTo = set.get(TARGET);
140 featureIsOnTarget = false;
142 MappingType type = getMappingType(gff[SOURCE_COL]);
146 throw new IOException("Sorry, I don't handle " + gff[SOURCE_COL]);
149 if (mapTo == null || mapTo.size() != 1)
151 throw new IOException(
152 "Expecting exactly one sequence in Query or Target field (got "
157 * locate the mapped sequence in the alignment or 'new' (GFF file) sequences;
159 SequenceI mappedSequence = findSequence(mapTo.get(0), align, newseqs,
163 * If mapping is from protein to dna, we store it as dna to protein instead
165 SequenceI mapFromSequence = seq;
166 SequenceI mapToSequence = mappedSequence;
167 if ((type == MappingType.NucleotideToPeptide && featureIsOnTarget)
168 || (type == MappingType.PeptideToNucleotide
169 && !featureIsOnTarget))
171 mapFromSequence = mappedSequence;
176 * Process the Align maps and create mappings.
177 * These may be cdna-genome, cdna-protein, genome-protein.
178 * The mapped sequences may or may not be in the alignment
179 * (they may be included later in the GFF file).
183 * get any existing mapping for these sequences (or start one),
184 * and add this mapped range
186 AlignedCodonFrame acf = getMapping(align, mapFromSequence,
190 * exonerate GFF has the strand of the target in column 7
191 * (differs from GFF3 which has it in the Target descriptor)
193 String strand = gff[STRAND_COL];
194 boolean forwardStrand = true;
195 if ("-".equals(strand))
197 forwardStrand = false;
199 else if (!"+".equals(strand))
201 System.err.println("Strand must be specified for alignment");
205 List<String> alignedRegions = set.get(ALIGN);
206 for (String region : alignedRegions)
208 MapList mapping = buildMapping(region, type, forwardStrand,
209 featureIsOnTarget, gff);
216 acf.addMap(mapFromSequence, mapToSequence, mapping);
218 align.addCodonFrame(acf);
222 * Construct the mapping
226 * @param forwardStrand
227 * @param featureIsOnTarget
231 protected MapList buildMapping(String region, MappingType type,
232 boolean forwardStrand, boolean featureIsOnTarget, String[] gff)
235 * process one "fromStart toStart fromCount" descriptor
237 String[] tokens = region.split(" ");
238 if (tokens.length != 3)
240 System.err.println("Malformed Align descriptor: " + region);
245 * get start/end of from/to mappings
246 * if feature is on the target sequence we have to invert the sense
253 alignFromStart = Integer.parseInt(tokens[0]);
254 alignToStart = Integer.parseInt(tokens[1]);
255 alignCount = Integer.parseInt(tokens[2]);
256 } catch (NumberFormatException nfe)
258 System.err.println(nfe.toString());
267 if (featureIsOnTarget)
269 fromStart = alignToStart;
270 toStart = alignFromStart;
271 toEnd = forwardStrand ? toStart + alignCount - 1
272 : toStart - (alignCount - 1);
273 int toLength = Math.abs(toEnd - toStart) + 1;
274 int fromLength = toLength * type.getFromRatio() / type.getToRatio();
275 fromEnd = fromStart + fromLength - 1;
279 // we use the 'Align' values here not the feature start/end
280 // not clear why they may differ but it seems they can
281 fromStart = alignFromStart;
282 fromEnd = alignFromStart + alignCount - 1;
283 int fromLength = fromEnd - fromStart + 1;
284 int toLength = fromLength * type.getToRatio() / type.getFromRatio();
285 toStart = alignToStart;
288 toEnd = toStart + toLength - 1;
292 toEnd = toStart - (toLength - 1);
296 MapList codonmapping = constructMappingFromAlign(fromStart, fromEnd,
297 toStart, toEnd, type);
302 * Returns a MappingType depending on the exonerate 'model' value.
307 protected static MappingType getMappingType(String model)
309 MappingType result = null;
311 if (model.contains(PROTEIN2DNA) || model.contains(PROTEIN2GENOME))
313 result = MappingType.PeptideToNucleotide;
315 else if (model.contains(CODING2CODING) || model.contains(CODING2GENOME)
316 || model.contains(CDNA2GENOME) || model.contains(GENOME2GENOME))
318 result = MappingType.NucleotideToNucleotide;
324 * Tests whether the GFF data looks like it was generated by exonerate, and is
325 * a format we are willing to handle
330 public static boolean recognises(String[] columns)
332 if (!SIMILARITY.equalsIgnoreCase(columns[TYPE_COL]))
338 * inspect alignment model
340 String model = columns[SOURCE_COL];
341 // e.g. exonerate:protein2genome:local
344 String mdl = model.toLowerCase();
345 if (mdl.contains(PROTEIN2DNA) || mdl.contains(PROTEIN2GENOME)
346 || mdl.contains(CODING2CODING) || mdl.contains(CODING2GENOME)
347 || mdl.contains(CDNA2GENOME) || mdl.contains(GENOME2GENOME))
352 System.err.println("Sorry, I don't handle exonerate model " + model);
357 * An override to set feature group to "exonerate" instead of the default GFF
358 * source value (column 2)
361 protected SequenceFeature buildSequenceFeature(String[] gff,
362 Map<String, List<String>> set)
364 SequenceFeature sf = super.buildSequenceFeature(gff, TYPE_COL,