2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.io.gff;
23 import jalview.datamodel.AlignedCodonFrame;
24 import jalview.datamodel.AlignmentI;
25 import jalview.datamodel.MappingType;
26 import jalview.datamodel.SequenceFeature;
27 import jalview.datamodel.SequenceI;
28 import jalview.util.MapList;
30 import java.io.IOException;
31 import java.util.ArrayList;
32 import java.util.List;
36 * A handler to parse GFF in the format generated by the exonerate tool
38 public class ExonerateHelper extends Gff2Helper
40 private static final String SIMILARITY = "similarity";
42 private static final String GENOME2GENOME = "genome2genome";
44 private static final String CDNA2GENOME = "cdna2genome";
46 private static final String CODING2GENOME = "coding2genome";
48 private static final String CODING2CODING = "coding2coding";
50 private static final String PROTEIN2GENOME = "protein2genome";
52 private static final String PROTEIN2DNA = "protein2dna";
54 private static final String ALIGN = "Align";
56 private static final String QUERY = "Query";
58 private static final String TARGET = "Target";
61 * Process one GFF feature line (as modelled by SequenceFeature)
64 * the sequence with which this feature is associated
66 * the sequence feature with ATTRIBUTES property containing any
67 * additional attributes
69 * the alignment we are adding GFF to
71 * any new sequences referenced by the GFF
72 * @param relaxedIdMatching
73 * if true, match word tokens in sequence names
74 * @return true if the sequence feature should be added to the sequence, else
75 * false (i.e. it has been processed in another way e.g. to generate a
79 public SequenceFeature processGff(SequenceI seq, String[] gffColumns,
80 AlignmentI align, List<SequenceI> newseqs,
81 boolean relaxedIdMatching)
83 String attr = gffColumns[ATTRIBUTES_COL];
84 Map<String, List<String>> set = parseNameValuePairs(attr);
88 processGffSimilarity(set, seq, gffColumns, align, newseqs,
90 } catch (IOException ivfe)
92 System.err.println(ivfe);
96 * return null to indicate we don't want to add a sequence feature for
97 * similarity (only process it to create mappings)
103 * Processes the 'Query' (or 'Target') and 'Align' properties associated with
104 * an exonerate GFF similarity feature; these properties define the mapping of
105 * the annotated range to a related sequence.
108 * parsed GFF column 9 key/value(s)
110 * the sequence the GFF feature is on
112 * the GFF column data
114 * the alignment the sequence belongs to, where any new mappings
117 * a list of new 'virtual sequences' generated while parsing GFF
118 * @param relaxedIdMatching
119 * if true allow fuzzy search for a matching target sequence
120 * @throws IOException
122 protected void processGffSimilarity(Map<String, List<String>> set,
123 SequenceI seq, String[] gff, AlignmentI align,
124 List<SequenceI> newseqs, boolean relaxedIdMatching)
128 * exonerate may be run with
129 * --showquerygff - outputs 'features on the query' e.g. (protein2genome)
130 * Target <dnaseqid> ; Align proteinStartPos dnaStartPos proteinCount
131 * --showtargetgff - outputs 'features on the target' e.g. (protein2genome)
132 * Query <proteinseqid> ; Align dnaStartPos proteinStartPos nucleotideCount
133 * where the Align spec may repeat
135 // TODO handle coding2coding and similar as well
136 boolean featureIsOnTarget = true;
137 List<String> mapTo = set.get(QUERY);
140 mapTo = set.get(TARGET);
141 featureIsOnTarget = false;
143 MappingType type = getMappingType(gff[SOURCE_COL]);
147 throw new IOException("Sorry, I don't handle " + gff[SOURCE_COL]);
150 if (mapTo == null || mapTo.size() != 1)
152 throw new IOException(
153 "Expecting exactly one sequence in Query or Target field (got "
158 * similarity start and end can tell us
159 * which part of the alignment refers to which sequence
161 int similarityFrom,similarityTo;
163 similarityFrom = Integer.parseInt(gff[START_COL]);
164 similarityTo = Integer.parseInt(gff[END_COL]);
165 } catch (Exception x) {
166 throw new IOException("Couldn't parse start/end of the similarity feature",x);
170 * Process the Align maps and create mappings.
171 * These may be cdna-genome, cdna-protein, genome-protein.
172 * The mapped sequences may or may not be in the alignment
173 * (they may be included later in the GFF file).
177 * exonerate GFF has the strand of the target in column 7
178 * (differs from GFF3 which has it in the Target descriptor)
180 String strand = gff[STRAND_COL];
181 boolean forwardStrand = true;
182 if ("-".equals(strand))
184 forwardStrand = false;
186 else if (!"+".equals(strand))
188 System.err.println("Strand must be specified for alignment");
192 List<String> alignedRegions = set.get(ALIGN);
193 List<MapList> mappings = new ArrayList<MapList>();
194 int fromLowest=0, fromHighest=0, toLowest=0, toHighest=0;
195 for (String region : alignedRegions)
197 MapList mapping = buildMapping(region, type, forwardStrand,
198 featureIsOnTarget, gff);
206 * record total extent of aligned region(s) for later
208 if (mappings.size() == 0)
210 if (mapping.getFromLowest() < mapping.getFromHighest())
212 fromLowest = mapping.getFromLowest();
213 fromHighest = mapping.getFromHighest();
217 fromLowest = mapping.getFromHighest();
218 fromHighest = mapping.getFromLowest();
220 if (mapping.getToLowest() < mapping.getToHighest())
222 toLowest = mapping.getToLowest();
223 toHighest = mapping.getToHighest();
227 toLowest = mapping.getToHighest();
228 toHighest = mapping.getToLowest();
233 int fl = mapping.getFromLowest(), fh = mapping.getFromHighest(),
234 tl = mapping.getToLowest(), th = mapping.getToHighest();
238 fh = mapping.getFromLowest();
243 th = mapping.getToLowest();
250 if (fromHighest < fh)
263 mappings.add(mapping);
267 * locate the mapped sequence in the alignment or 'new' (GFF file) sequences;
269 SequenceI mappedSequence = findSequence(mapTo.get(0), align, newseqs,
273 * finally, resolve the sense of the mapping
275 SequenceI mapFromSequence = seq;
276 SequenceI mapToSequence = mappedSequence;
279 * If mapping is from protein to dna, we store it as dna to protein instead
281 if ((type == MappingType.NucleotideToPeptide && featureIsOnTarget)
282 || (type == MappingType.PeptideToNucleotide
283 && !featureIsOnTarget))
285 mapFromSequence = mappedSequence;
289 * the sense of 'align' mappings for nucleotide alignments
290 * from exonerate seem to be ambiguous, so we need to do a bit more work
292 if (type == MappingType.NucleotideToNucleotide || type == MappingType.PeptideToPeptide)
295 * then check whether the aligned region is contained
296 * by the feature to determine sense of mapping
298 if (fromHighest==toHighest && fromLowest==toLowest)
300 // ambiguous case - for simple alignments this doesn't matter, but important for rearrangements or inversions
301 if (featureIsOnTarget)
303 // TODO: raise a warning since we don't have test coverage for this case
304 mapFromSequence=mappedSequence; // Target sequence
305 mapToSequence=seq; // annotated sequence
307 } else if (similarityFrom == fromLowest && similarityTo == fromHighest)
309 mapFromSequence = seq;
310 mapToSequence = mappedSequence;
312 else if (similarityFrom == toLowest && similarityTo == toHighest)
314 mapFromSequence = mappedSequence;
319 throw new IOException(
320 "Couldn't determine sense for similarity feature");
325 * get any existing mapping for these sequences (or start one),
326 * and add this mapped range
328 AlignedCodonFrame acf = getMapping(align, mapFromSequence,
330 for (MapList mapping : mappings)
332 acf.addMap(mapFromSequence, mapToSequence, mapping);
334 align.addCodonFrame(acf);
338 * Construct the mapping
342 * @param forwardStrand
343 * @param featureIsOnTarget
347 protected MapList buildMapping(String region, MappingType type,
348 boolean forwardStrand, boolean featureIsOnTarget, String[] gff)
351 * process one "fromStart toStart fromCount" descriptor
353 String[] tokens = region.split(" ");
354 if (tokens.length != 3)
356 System.err.println("Malformed Align descriptor: " + region);
361 * get start/end of from/to mappings
362 * if feature is on the target sequence we have to invert the sense
369 alignFromStart = Integer.parseInt(tokens[0]);
370 alignToStart = Integer.parseInt(tokens[1]);
371 alignCount = Integer.parseInt(tokens[2]);
372 } catch (NumberFormatException nfe)
374 System.err.println(nfe.toString());
383 if (featureIsOnTarget)
385 fromStart = alignToStart;
386 toStart = alignFromStart;
387 toEnd = forwardStrand ? toStart + alignCount - 1
388 : toStart - (alignCount - 1);
389 int toLength = Math.abs(toEnd - toStart) + 1;
390 int fromLength = toLength * type.getFromRatio() / type.getToRatio();
391 fromEnd = fromStart + fromLength - 1;
395 // we use the 'Align' values here not the feature start/end
396 // not clear why they may differ but it seems they can
397 fromStart = alignFromStart;
398 fromEnd = alignFromStart + alignCount - 1;
399 int fromLength = fromEnd - fromStart + 1;
400 int toLength = fromLength * type.getToRatio() / type.getFromRatio();
401 toStart = alignToStart;
404 toEnd = toStart + toLength - 1;
408 toEnd = toStart - (toLength - 1);
412 MapList codonmapping = constructMappingFromAlign(fromStart, fromEnd,
413 toStart, toEnd, type);
418 * Returns a MappingType depending on the exonerate 'model' value.
423 protected static MappingType getMappingType(String model)
425 MappingType result = null;
427 if (model.contains(PROTEIN2DNA) || model.contains(PROTEIN2GENOME))
429 result = MappingType.PeptideToNucleotide;
431 else if (model.contains(CODING2CODING) || model.contains(CODING2GENOME)
432 || model.contains(CDNA2GENOME) || model.contains(GENOME2GENOME))
434 result = MappingType.NucleotideToNucleotide;
440 * Tests whether the GFF data looks like it was generated by exonerate, and is
441 * a format we are willing to handle
446 public static boolean recognises(String[] columns)
448 if (!SIMILARITY.equalsIgnoreCase(columns[TYPE_COL]))
454 * inspect alignment model
456 String model = columns[SOURCE_COL];
457 // e.g. exonerate:protein2genome:local
460 String mdl = model.toLowerCase();
461 if (mdl.contains(PROTEIN2DNA) || mdl.contains(PROTEIN2GENOME)
462 || mdl.contains(CODING2CODING) || mdl.contains(CODING2GENOME)
463 || mdl.contains(CDNA2GENOME) || mdl.contains(GENOME2GENOME))
468 System.err.println("Sorry, I don't handle exonerate model " + model);
473 * An override to set feature group to "exonerate" instead of the default GFF
474 * source value (column 2)
477 protected SequenceFeature buildSequenceFeature(String[] gff,
478 Map<String, List<String>> set)
480 SequenceFeature sf = super.buildSequenceFeature(gff, TYPE_COL,