2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.io.gff;
23 import java.util.Locale;
25 import jalview.datamodel.AlignedCodonFrame;
26 import jalview.datamodel.AlignmentI;
27 import jalview.datamodel.MappingType;
28 import jalview.datamodel.SequenceFeature;
29 import jalview.datamodel.SequenceI;
30 import jalview.util.MapList;
32 import java.io.IOException;
33 import java.util.ArrayList;
34 import java.util.List;
38 * A handler to parse GFF in the format generated by the exonerate tool
40 public class ExonerateHelper extends Gff2Helper
42 private static final String SIMILARITY = "similarity";
44 private static final String GENOME2GENOME = "genome2genome";
46 private static final String CDNA2GENOME = "cdna2genome";
48 private static final String CODING2GENOME = "coding2genome";
50 private static final String CODING2CODING = "coding2coding";
52 private static final String PROTEIN2GENOME = "protein2genome";
54 private static final String PROTEIN2DNA = "protein2dna";
56 private static final String ALIGN = "Align";
58 private static final String QUERY = "Query";
60 private static final String TARGET = "Target";
63 * Process one GFF feature line (as modelled by SequenceFeature)
66 * the sequence with which this feature is associated
68 * the sequence feature with ATTRIBUTES property containing any
69 * additional attributes
71 * the alignment we are adding GFF to
73 * any new sequences referenced by the GFF
74 * @param relaxedIdMatching
75 * if true, match word tokens in sequence names
76 * @return true if the sequence feature should be added to the sequence, else
77 * false (i.e. it has been processed in another way e.g. to generate a
81 public SequenceFeature processGff(SequenceI seq, String[] gffColumns,
82 AlignmentI align, List<SequenceI> newseqs,
83 boolean relaxedIdMatching)
85 String attr = gffColumns[ATTRIBUTES_COL];
86 Map<String, List<String>> set = parseNameValuePairs(attr);
90 processGffSimilarity(set, seq, gffColumns, align, newseqs,
92 } catch (IOException ivfe)
94 System.err.println(ivfe);
98 * return null to indicate we don't want to add a sequence feature for
99 * similarity (only process it to create mappings)
105 * Processes the 'Query' (or 'Target') and 'Align' properties associated with
106 * an exonerate GFF similarity feature; these properties define the mapping of
107 * the annotated range to a related sequence.
110 * parsed GFF column 9 key/value(s)
112 * the sequence the GFF feature is on
114 * the GFF column data
116 * the alignment the sequence belongs to, where any new mappings
119 * a list of new 'virtual sequences' generated while parsing GFF
120 * @param relaxedIdMatching
121 * if true allow fuzzy search for a matching target sequence
122 * @throws IOException
124 protected void processGffSimilarity(Map<String, List<String>> set,
125 SequenceI seq, String[] gff, AlignmentI align,
126 List<SequenceI> newseqs, boolean relaxedIdMatching)
130 * exonerate may be run with
131 * --showquerygff - outputs 'features on the query' e.g. (protein2genome)
132 * Target <dnaseqid> ; Align proteinStartPos dnaStartPos proteinCount
133 * --showtargetgff - outputs 'features on the target' e.g. (protein2genome)
134 * Query <proteinseqid> ; Align dnaStartPos proteinStartPos nucleotideCount
135 * where the Align spec may repeat
137 // TODO handle coding2coding and similar as well
138 boolean featureIsOnTarget = true;
139 List<String> mapTo = set.get(QUERY);
142 mapTo = set.get(TARGET);
143 featureIsOnTarget = false;
145 MappingType type = getMappingType(gff[SOURCE_COL]);
149 throw new IOException("Sorry, I don't handle " + gff[SOURCE_COL]);
152 if (mapTo == null || mapTo.size() != 1)
154 throw new IOException(
155 "Expecting exactly one sequence in Query or Target field (got "
160 * similarity start and end can tell us
161 * which part of the alignment refers to which sequence
163 int similarityFrom,similarityTo;
165 similarityFrom = Integer.parseInt(gff[START_COL]);
166 similarityTo = Integer.parseInt(gff[END_COL]);
167 } catch (Exception x) {
168 throw new IOException("Couldn't parse start/end of the similarity feature",x);
172 * Process the Align maps and create mappings.
173 * These may be cdna-genome, cdna-protein, genome-protein.
174 * The mapped sequences may or may not be in the alignment
175 * (they may be included later in the GFF file).
179 * exonerate GFF has the strand of the target in column 7
180 * (differs from GFF3 which has it in the Target descriptor)
182 String strand = gff[STRAND_COL];
183 boolean forwardStrand = true;
184 if ("-".equals(strand))
186 forwardStrand = false;
188 else if (!"+".equals(strand))
190 System.err.println("Strand must be specified for alignment");
194 List<String> alignedRegions = set.get(ALIGN);
195 List<MapList> mappings = new ArrayList<MapList>();
196 int fromLowest=0, fromHighest=0, toLowest=0, toHighest=0;
197 for (String region : alignedRegions)
199 MapList mapping = buildMapping(region, type, forwardStrand,
200 featureIsOnTarget, gff);
208 * record total extent of aligned region(s) for later
210 if (mappings.size() == 0)
212 if (mapping.getFromLowest() < mapping.getFromHighest())
214 fromLowest = mapping.getFromLowest();
215 fromHighest = mapping.getFromHighest();
219 fromLowest = mapping.getFromHighest();
220 fromHighest = mapping.getFromLowest();
222 if (mapping.getToLowest() < mapping.getToHighest())
224 toLowest = mapping.getToLowest();
225 toHighest = mapping.getToHighest();
229 toLowest = mapping.getToHighest();
230 toHighest = mapping.getToLowest();
235 int fl = mapping.getFromLowest(), fh = mapping.getFromHighest(),
236 tl = mapping.getToLowest(), th = mapping.getToHighest();
240 fh = mapping.getFromLowest();
245 th = mapping.getToLowest();
252 if (fromHighest < fh)
265 mappings.add(mapping);
269 * locate the mapped sequence in the alignment or 'new' (GFF file) sequences;
271 SequenceI mappedSequence = findSequence(mapTo.get(0), align, newseqs,
275 * finally, resolve the sense of the mapping
277 SequenceI mapFromSequence = seq;
278 SequenceI mapToSequence = mappedSequence;
281 * If mapping is from protein to dna, we store it as dna to protein instead
283 if ((type == MappingType.NucleotideToPeptide && featureIsOnTarget)
284 || (type == MappingType.PeptideToNucleotide
285 && !featureIsOnTarget))
287 mapFromSequence = mappedSequence;
291 * the sense of 'align' mappings for nucleotide alignments
292 * from exonerate seem to be ambiguous, so we need to do a bit more work
294 if (type == MappingType.NucleotideToNucleotide || type == MappingType.PeptideToPeptide)
297 * then check whether the aligned region is contained
298 * by the feature to determine sense of mapping
300 if (fromHighest==toHighest && fromLowest==toLowest)
302 // ambiguous case - for simple alignments this doesn't matter, but important for rearrangements or inversions
303 if (featureIsOnTarget)
305 // TODO: raise a warning since we don't have test coverage for this case
306 mapFromSequence=mappedSequence; // Target sequence
307 mapToSequence=seq; // annotated sequence
309 } else if (similarityFrom == fromLowest && similarityTo == fromHighest)
311 mapFromSequence = seq;
312 mapToSequence = mappedSequence;
314 else if (similarityFrom == toLowest && similarityTo == toHighest)
316 mapFromSequence = mappedSequence;
321 throw new IOException(
322 "Couldn't determine sense for similarity feature");
327 * get any existing mapping for these sequences (or start one),
328 * and add this mapped range
330 AlignedCodonFrame acf = getMapping(align, mapFromSequence,
332 for (MapList mapping : mappings)
334 acf.addMap(mapFromSequence, mapToSequence, mapping);
336 align.addCodonFrame(acf);
340 * Construct the mapping
344 * @param forwardStrand
345 * @param featureIsOnTarget
349 protected MapList buildMapping(String region, MappingType type,
350 boolean forwardStrand, boolean featureIsOnTarget, String[] gff)
353 * process one "fromStart toStart fromCount" descriptor
355 String[] tokens = region.split(" ");
356 if (tokens.length != 3)
358 System.err.println("Malformed Align descriptor: " + region);
363 * get start/end of from/to mappings
364 * if feature is on the target sequence we have to invert the sense
371 alignFromStart = Integer.parseInt(tokens[0]);
372 alignToStart = Integer.parseInt(tokens[1]);
373 alignCount = Integer.parseInt(tokens[2]);
374 } catch (NumberFormatException nfe)
376 System.err.println(nfe.toString());
385 if (featureIsOnTarget)
387 fromStart = alignToStart;
388 toStart = alignFromStart;
389 toEnd = forwardStrand ? toStart + alignCount - 1
390 : toStart - (alignCount - 1);
391 int toLength = Math.abs(toEnd - toStart) + 1;
392 int fromLength = toLength * type.getFromRatio() / type.getToRatio();
393 fromEnd = fromStart + fromLength - 1;
397 // we use the 'Align' values here not the feature start/end
398 // not clear why they may differ but it seems they can
399 fromStart = alignFromStart;
400 fromEnd = alignFromStart + alignCount - 1;
401 int fromLength = fromEnd - fromStart + 1;
402 int toLength = fromLength * type.getToRatio() / type.getFromRatio();
403 toStart = alignToStart;
406 toEnd = toStart + toLength - 1;
410 toEnd = toStart - (toLength - 1);
414 MapList codonmapping = constructMappingFromAlign(fromStart, fromEnd,
415 toStart, toEnd, type);
420 * Returns a MappingType depending on the exonerate 'model' value.
425 protected static MappingType getMappingType(String model)
427 MappingType result = null;
429 if (model.contains(PROTEIN2DNA) || model.contains(PROTEIN2GENOME))
431 result = MappingType.PeptideToNucleotide;
433 else if (model.contains(CODING2CODING) || model.contains(CODING2GENOME)
434 || model.contains(CDNA2GENOME) || model.contains(GENOME2GENOME))
436 result = MappingType.NucleotideToNucleotide;
442 * Tests whether the GFF data looks like it was generated by exonerate, and is
443 * a format we are willing to handle
448 public static boolean recognises(String[] columns)
450 if (!SIMILARITY.equalsIgnoreCase(columns[TYPE_COL]))
456 * inspect alignment model
458 String model = columns[SOURCE_COL];
459 // e.g. exonerate:protein2genome:local
462 String mdl = model.toLowerCase(Locale.ROOT);
463 if (mdl.contains(PROTEIN2DNA) || mdl.contains(PROTEIN2GENOME)
464 || mdl.contains(CODING2CODING) || mdl.contains(CODING2GENOME)
465 || mdl.contains(CDNA2GENOME) || mdl.contains(GENOME2GENOME))
470 System.err.println("Sorry, I don't handle exonerate model " + model);
475 * An override to set feature group to "exonerate" instead of the default GFF
476 * source value (column 2)
479 protected SequenceFeature buildSequenceFeature(String[] gff,
480 Map<String, List<String>> set)
482 SequenceFeature sf = super.buildSequenceFeature(gff, TYPE_COL,