1 package jalview.io.gff;
3 import jalview.datamodel.AlignedCodonFrame;
4 import jalview.datamodel.AlignmentI;
5 import jalview.datamodel.MappingType;
6 import jalview.datamodel.SequenceFeature;
7 import jalview.datamodel.SequenceI;
8 import jalview.util.MapList;
10 import java.io.IOException;
11 import java.util.List;
15 * A handler to parse GFF in the format generated by the exonerate tool
17 public class ExonerateHelper extends Gff2Helper
19 private static final String SIMILARITY = "similarity";
21 private static final String GENOME2GENOME = "genome2genome";
23 private static final String CDNA2GENOME = "cdna2genome";
25 private static final String CODING2GENOME = "coding2genome";
27 private static final String CODING2CODING = "coding2coding";
29 private static final String PROTEIN2GENOME = "protein2genome";
31 private static final String PROTEIN2DNA = "protein2dna";
33 private static final String ALIGN = "Align";
35 private static final String QUERY = "Query";
37 private static final String TARGET = "Target";
40 * Process one GFF feature line (as modelled by SequenceFeature)
43 * the sequence with which this feature is associated
45 * the sequence feature with ATTRIBUTES property containing any
46 * additional attributes
48 * the alignment we are adding GFF to
50 * any new sequences referenced by the GFF
51 * @param relaxedIdMatching
52 * if true, match word tokens in sequence names
53 * @return true if the sequence feature should be added to the sequence, else
54 * false (i.e. it has been processed in another way e.g. to generate a
58 public SequenceFeature processGff(SequenceI seq, String[] gffColumns,
59 AlignmentI align, List<SequenceI> newseqs,
60 boolean relaxedIdMatching)
62 String attr = gffColumns[ATTRIBUTES_COL];
63 Map<String, List<String>> set = parseNameValuePairs(attr);
67 processGffSimilarity(set, seq, gffColumns,
68 align, newseqs, relaxedIdMatching);
69 } catch (IOException ivfe)
71 System.err.println(ivfe);
75 * return null to indicate we don't want to add a sequence feature for
76 * similarity (only process it to create mappings)
82 * Processes the 'Query' (or 'Target') and 'Align' properties associated with
83 * an exonerate GFF similarity feature; these properties define the mapping of
84 * the annotated range to a related sequence.
87 * parsed GFF column 9 key/value(s)
89 * the sequence the GFF feature is on
93 * the alignment the sequence belongs to, where any new mappings
96 * a list of new 'virtual sequences' generated while parsing GFF
97 * @param relaxedIdMatching
98 * if true allow fuzzy search for a matching target sequence
101 protected void processGffSimilarity(
102 Map<String, List<String>> set,
103 SequenceI seq, String[] gff, AlignmentI align,
104 List<SequenceI> newseqs, boolean relaxedIdMatching)
108 * exonerate may be run with
109 * --showquerygff - outputs 'features on the query' e.g. (protein2genome)
110 * Target <dnaseqid> ; Align proteinStartPos dnaStartPos proteinCount
111 * --showtargetgff - outputs 'features on the target' e.g. (protein2genome)
112 * Query <proteinseqid> ; Align dnaStartPos proteinStartPos nucleotideCount
113 * where the Align spec may repeat
115 // TODO handle coding2coding and similar as well
116 boolean featureIsOnTarget = true;
117 List<String> mapTo = set.get(QUERY);
120 mapTo = set.get(TARGET);
121 featureIsOnTarget = false;
123 MappingType type = getMappingType(gff[SOURCE_COL]);
127 throw new IOException("Sorry, I don't handle " + gff[SOURCE_COL]);
130 if (mapTo == null || mapTo.size() != 1)
132 throw new IOException(
133 "Expecting exactly one sequence in Query or Target field (got "
138 * locate the mapped sequence in the alignment or 'new' (GFF file) sequences;
140 SequenceI mappedSequence = findSequence(mapTo.get(0), align, newseqs,
144 * If mapping is from protein to dna, we store it as dna to protein instead
146 SequenceI mapFromSequence = seq;
147 SequenceI mapToSequence = mappedSequence;
148 if ((type == MappingType.NucleotideToPeptide && featureIsOnTarget)
149 || (type == MappingType.PeptideToNucleotide && !featureIsOnTarget))
151 mapFromSequence = mappedSequence;
156 * Process the Align maps and create mappings.
157 * These may be cdna-genome, cdna-protein, genome-protein.
158 * The mapped sequences may or may not be in the alignment
159 * (they may be included later in the GFF file).
163 * get any existing mapping for these sequences (or start one),
164 * and add this mapped range
166 AlignedCodonFrame acf = getMapping(align, mapFromSequence,
170 * exonerate GFF has the strand of the target in column 7
171 * (differs from GFF3 which has it in the Target descriptor)
173 String strand = gff[STRAND_COL];
174 boolean forwardStrand = true;
175 if ("-".equals(strand))
177 forwardStrand = false;
179 else if (!"+".equals(strand))
181 System.err.println("Strand must be specified for alignment");
185 List<String> alignedRegions = set.get(ALIGN);
186 for (String region : alignedRegions)
188 MapList mapping = buildMapping(region, type, forwardStrand,
189 featureIsOnTarget, gff);
196 acf.addMap(mapFromSequence, mapToSequence, mapping);
198 align.addCodonFrame(acf);
202 * Construct the mapping
206 * @param forwardStrand
207 * @param featureIsOnTarget
211 protected MapList buildMapping(String region, MappingType type,
212 boolean forwardStrand, boolean featureIsOnTarget, String[] gff)
215 * process one "fromStart toStart fromCount" descriptor
217 String[] tokens = region.split(" ");
218 if (tokens.length != 3)
220 System.err.println("Malformed Align descriptor: " + region);
225 * get start/end of from/to mappings
226 * if feature is on the target sequence we have to invert the sense
232 alignFromStart = Integer.parseInt(tokens[0]);
233 alignToStart = Integer.parseInt(tokens[1]);
234 alignCount = Integer.parseInt(tokens[2]);
235 } catch (NumberFormatException nfe) {
236 System.err.println(nfe.toString());
245 if (featureIsOnTarget)
247 fromStart = alignToStart;
248 toStart = alignFromStart;
249 toEnd = forwardStrand ? toStart + alignCount - 1 : toStart
251 int toLength = Math.abs(toEnd - toStart) + 1;
252 int fromLength = toLength * type.getFromRatio() / type.getToRatio();
253 fromEnd = fromStart + fromLength - 1;
257 // we use the 'Align' values here not the feature start/end
258 // not clear why they may differ but it seems they can
259 fromStart = alignFromStart;
260 fromEnd = alignFromStart + alignCount - 1;
261 int fromLength = fromEnd - fromStart + 1;
262 int toLength = fromLength * type.getToRatio() / type.getFromRatio();
263 toStart = alignToStart;
266 toEnd = toStart + toLength - 1;
270 toEnd = toStart - (toLength - 1);
274 MapList codonmapping = constructMappingFromAlign(fromStart, fromEnd,
275 toStart, toEnd, type);
280 * Returns a MappingType depending on the exonerate 'model' value.
285 protected static MappingType getMappingType(String model)
287 MappingType result = null;
289 if (model.contains(PROTEIN2DNA) || model.contains(PROTEIN2GENOME))
291 result = MappingType.PeptideToNucleotide;
293 else if (model.contains(CODING2CODING)
294 || model.contains(CODING2GENOME)
295 || model.contains(CDNA2GENOME)
296 || model.contains(GENOME2GENOME))
298 result = MappingType.NucleotideToNucleotide;
304 * Tests whether the GFF data looks like it was generated by exonerate, and is
305 * a format we are willing to handle
310 public static boolean recognises(String[] columns)
312 if (!SIMILARITY.equalsIgnoreCase(columns[TYPE_COL]))
318 * inspect alignment model
320 String model = columns[SOURCE_COL];
321 // e.g. exonerate:protein2genome:local
324 String mdl = model.toLowerCase();
325 if (mdl.contains(PROTEIN2DNA) || mdl.contains(PROTEIN2GENOME)
326 || mdl.contains(CODING2CODING)
327 || mdl.contains(CODING2GENOME)
328 || mdl.contains(CDNA2GENOME)
329 || mdl.contains(GENOME2GENOME))
334 System.err.println("Sorry, I don't handle exonerate model " + model);
339 protected SequenceFeature buildSequenceFeature(String[] gff,
340 Map<String, List<String>> set)
342 SequenceFeature sf = super.buildSequenceFeature(gff, set);
343 sf.setFeatureGroup("exonerate");