--- /dev/null
+package jalview.io.gff;
+
+import jalview.datamodel.AlignedCodonFrame;
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.MappingType;
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
+import jalview.util.MapList;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
+/**
+ * A handler to parse GFF in the format generated by the exonerate tool.
+ * <p>
+ * Exonerate 'similarity' features are not turned into sequence features; they
+ * are used only to derive mappings between the annotated sequence and the
+ * sequence named in the feature's Query or Target attribute, and those
+ * mappings are added to the alignment.
+ */
+public class ExonerateHelper extends Gff2Helper
+{
+  /*
+   * the GFF feature type (column 3) that this helper handles
+   */
+  private static final String SIMILARITY = "similarity";
+
+  /*
+   * exonerate alignment model names, as they appear (lower case) within the
+   * GFF source column, e.g. exonerate:protein2genome:local
+   */
+  private static final String GENOME2GENOME = "genome2genome";
+
+  private static final String CDNA2GENOME = "cdna2genome";
+
+  private static final String CODING2GENOME = "coding2genome";
+
+  private static final String CODING2CODING = "coding2coding";
+
+  private static final String PROTEIN2GENOME = "protein2genome";
+
+  private static final String PROTEIN2DNA = "protein2dna";
+
+  /*
+   * attribute names looked up in the parsed GFF attributes column
+   */
+  private static final String ALIGN = "Align";
+
+  private static final String QUERY = "Query";
+
+  private static final String TARGET = "Target";
+
+  /**
+   * Process one GFF feature line. The attributes column is parsed and used to
+   * build sequence mappings; no sequence feature is created. Any IOException
+   * raised while building the mappings is reported on stderr and otherwise
+   * ignored.
+   * 
+   * @param seq
+   *          the sequence with which this feature is associated
+   * @param gffColumns
+   *          the column values of the GFF line
+   * @param align
+   *          the alignment we are adding GFF to
+   * @param newseqs
+   *          any new sequences referenced by the GFF
+   * @param relaxedIdMatching
+   *          if true, match word tokens in sequence names
+   * @return always null, to indicate that no sequence feature should be added
+   *         for this line (it has been processed to generate a mapping
+   *         instead)
+   */
+  @Override
+  public SequenceFeature processGff(SequenceI seq, String[] gffColumns,
+          AlignmentI align, List<SequenceI> newseqs,
+          boolean relaxedIdMatching)
+  {
+    String attr = gffColumns[ATTRIBUTES_COL];
+    Map<String, List<String>> set = parseNameValuePairs(attr);
+
+    try
+    {
+      processGffSimilarity(set, seq, gffColumns, align, newseqs,
+              relaxedIdMatching);
+    } catch (IOException ivfe)
+    {
+      System.err.println(ivfe);
+    }
+
+    /*
+     * return null to indicate we don't want to add a sequence feature for
+     * similarity (only process it to create mappings)
+     */
+    return null;
+  }
+
+  /**
+   * Processes the 'Query' (or 'Target') and 'Align' properties associated with
+   * an exonerate GFF similarity feature; these properties define the mapping of
+   * the annotated range to a related sequence.
+   * 
+   * @param set
+   *          parsed GFF column 9 key/value(s)
+   * @param seq
+   *          the sequence the GFF feature is on
+   * @param gff
+   *          the GFF column data
+   * @param align
+   *          the alignment the sequence belongs to, where any new mappings
+   *          should be added
+   * @param newseqs
+   *          a list of new 'virtual sequences' generated while parsing GFF
+   * @param relaxedIdMatching
+   *          if true allow fuzzy search for a matching target sequence
+   * @throws IOException
+   *           if the alignment model in the source column is not supported,
+   *           if there is not exactly one Query or Target value, or if there
+   *           is no Align attribute
+   */
+  protected void processGffSimilarity(Map<String, List<String>> set,
+          SequenceI seq, String[] gff, AlignmentI align,
+          List<SequenceI> newseqs, boolean relaxedIdMatching)
+          throws IOException
+  {
+    /*
+     * exonerate may be run with
+     * --showquerygff - outputs 'features on the query' e.g. (protein2genome)
+     *     Target <dnaseqid> ; Align proteinStartPos dnaStartPos proteinCount
+     * --showtargetgff - outputs 'features on the target' e.g. (protein2genome)
+     *     Query <proteinseqid> ; Align dnaStartPos proteinStartPos nucleotideCount
+     * where the Align spec may repeat
+     */
+    // TODO handle coding2coding and similar as well
+    boolean featureIsOnTarget = true;
+    List<String> mapTo = set.get(QUERY);
+    if (mapTo == null)
+    {
+      mapTo = set.get(TARGET);
+      featureIsOnTarget = false;
+    }
+    MappingType type = getMappingType(gff[SOURCE_COL]);
+
+    if (type == null)
+    {
+      throw new IOException("Sorry, I don't handle " + gff[SOURCE_COL]);
+    }
+
+    if (mapTo == null || mapTo.size() != 1)
+    {
+      throw new IOException(
+              "Expecting exactly one sequence in Query or Target field (got "
+                      + mapTo + ")");
+    }
+
+    /*
+     * without any Align descriptor there is nothing to map; reject the line
+     * here, before any sequence lookup or mapping is started, rather than
+     * failing with a NullPointerException when iterating the regions
+     */
+    List<String> alignedRegions = set.get(ALIGN);
+    if (alignedRegions == null)
+    {
+      throw new IOException(
+              "Expecting at least one Align field for similarity feature");
+    }
+
+    /*
+     * locate the mapped sequence in the alignment or 'new' (GFF file) sequences;
+     */
+    SequenceI mappedSequence = findSequence(mapTo.get(0), align, newseqs,
+            relaxedIdMatching);
+
+    /*
+     * If mapping is from protein to dna, we store it as dna to protein instead
+     */
+    SequenceI mapFromSequence = seq;
+    SequenceI mapToSequence = mappedSequence;
+    if ((type == MappingType.NucleotideToPeptide && featureIsOnTarget)
+            || (type == MappingType.PeptideToNucleotide
+                    && !featureIsOnTarget))
+    {
+      mapFromSequence = mappedSequence;
+      mapToSequence = seq;
+    }
+
+    /*
+     * Process the Align maps and create mappings.
+     * These may be cdna-genome, cdna-protein, genome-protein.
+     * The mapped sequences may or may not be in the alignment
+     * (they may be included later in the GFF file).
+     */
+
+    /*
+     * get any existing mapping for these sequences (or start one),
+     * and add this mapped range
+     */
+    AlignedCodonFrame acf = getMapping(align, mapFromSequence,
+            mapToSequence);
+
+    /*
+     * exonerate GFF has the strand of the target in column 7
+     * (differs from GFF3 which has it in the Target descriptor)
+     */
+    String strand = gff[STRAND_COL];
+    boolean forwardStrand = true;
+    if ("-".equals(strand))
+    {
+      forwardStrand = false;
+    }
+    else if (!"+".equals(strand))
+    {
+      System.err.println("Strand must be specified for alignment");
+      return;
+    }
+
+    for (String region : alignedRegions)
+    {
+      MapList mapping = buildMapping(region, type, forwardStrand,
+              featureIsOnTarget, gff);
+
+      /*
+       * a null mapping means the descriptor was malformed (already
+       * reported); skip it and carry on with any remaining descriptors
+       */
+      if (mapping == null)
+      {
+        continue;
+      }
+
+      acf.addMap(mapFromSequence, mapToSequence, mapping);
+    }
+    align.addCodonFrame(acf);
+  }
+
+  /**
+   * Constructs the mapping for one Align descriptor of the form
+   * "fromStart toStart fromCount" (three whitespace-separated integers).
+   * 
+   * @param region
+   *          the text of one Align descriptor
+   * @param type
+   *          the mapping type, which supplies the from:to length ratio
+   * @param forwardStrand
+   *          if false, the 'to' range is built in descending order
+   * @param featureIsOnTarget
+   *          if true, the sense of the descriptor is inverted (its first
+   *          value becomes the 'to' start and its second the 'from' start)
+   * @param gff
+   *          the GFF column data (not currently used here)
+   * @return the mapping, or null if the descriptor does not consist of
+   *         exactly three integers (in which case a message is written to
+   *         stderr)
+   */
+  protected MapList buildMapping(String region, MappingType type,
+          boolean forwardStrand, boolean featureIsOnTarget, String[] gff)
+  {
+    /*
+     * process one "fromStart toStart fromCount" descriptor; tolerate
+     * leading, trailing or repeated whitespace between the values
+     */
+    String[] tokens = region.trim().split("\\s+");
+    if (tokens.length != 3)
+    {
+      System.err.println("Malformed Align descriptor: " + region);
+      return null;
+    }
+
+    /*
+     * get start/end of from/to mappings
+     * if feature is on the target sequence we have to invert the sense
+     */
+    int alignFromStart;
+    int alignToStart;
+    int alignCount;
+    try
+    {
+      alignFromStart = Integer.parseInt(tokens[0]);
+      alignToStart = Integer.parseInt(tokens[1]);
+      alignCount = Integer.parseInt(tokens[2]);
+    } catch (NumberFormatException nfe)
+    {
+      System.err.println(nfe.toString());
+      return null;
+    }
+
+    int fromStart;
+    int fromEnd;
+    int toStart;
+    int toEnd;
+
+    if (featureIsOnTarget)
+    {
+      /*
+       * the count applies to the 'to' range; derive the 'from' length
+       * from it using the mapping type's from:to ratio
+       */
+      fromStart = alignToStart;
+      toStart = alignFromStart;
+      toEnd = forwardStrand ? toStart + alignCount - 1
+              : toStart - (alignCount - 1);
+      int toLength = Math.abs(toEnd - toStart) + 1;
+      int fromLength = toLength * type.getFromRatio() / type.getToRatio();
+      fromEnd = fromStart + fromLength - 1;
+    }
+    else
+    {
+      // we use the 'Align' values here not the feature start/end
+      // not clear why they may differ but it seems they can
+      fromStart = alignFromStart;
+      fromEnd = alignFromStart + alignCount - 1;
+      int fromLength = fromEnd - fromStart + 1;
+      int toLength = fromLength * type.getToRatio() / type.getFromRatio();
+      toStart = alignToStart;
+      if (forwardStrand)
+      {
+        toEnd = toStart + toLength - 1;
+      }
+      else
+      {
+        toEnd = toStart - (toLength - 1);
+      }
+    }
+
+    MapList codonmapping = constructMappingFromAlign(fromStart, fromEnd,
+            toStart, toEnd, type);
+    return codonmapping;
+  }
+
+  /**
+   * Returns a MappingType depending on the exonerate 'model' value. The model
+   * name is matched case-insensitively anywhere within the supplied string
+   * (for example exonerate:protein2genome:local).
+   * 
+   * @param model
+   *          the GFF source column value; may be null
+   * @return PeptideToNucleotide for protein2dna or protein2genome,
+   *         NucleotideToNucleotide for coding2coding, coding2genome,
+   *         cdna2genome or genome2genome, or null if the model is null or is
+   *         none of these
+   */
+  protected static MappingType getMappingType(String model)
+  {
+    if (model == null)
+    {
+      return null;
+    }
+
+    /*
+     * lower case with a fixed locale so that matching does not depend on
+     * the platform's default locale
+     */
+    String mdl = model.toLowerCase(Locale.ROOT);
+
+    MappingType result = null;
+    if (mdl.contains(PROTEIN2DNA) || mdl.contains(PROTEIN2GENOME))
+    {
+      result = MappingType.PeptideToNucleotide;
+    }
+    else if (mdl.contains(CODING2CODING) || mdl.contains(CODING2GENOME)
+            || mdl.contains(CDNA2GENOME) || mdl.contains(GENOME2GENOME))
+    {
+      result = MappingType.NucleotideToNucleotide;
+    }
+    return result;
+  }
+
+  /**
+   * Tests whether the GFF data looks like it was generated by exonerate, and is
+   * a format we are willing to handle: the feature type must be 'similarity'
+   * (ignoring case) and the source column must name an alignment model for
+   * which getMappingType returns a mapping type. A message is written to
+   * stderr if the type is 'similarity' but the model is not handled.
+   * 
+   * @param columns
+   *          the column values of one GFF line
+   * @return true if the line can be handled by this helper, else false
+   */
+  public static boolean recognises(String[] columns)
+  {
+    if (!SIMILARITY.equalsIgnoreCase(columns[TYPE_COL]))
+    {
+      return false;
+    }
+
+    /*
+     * inspect alignment model e.g. exonerate:protein2genome:local;
+     * delegate to getMappingType so that the set of models recognised here
+     * is always the set that can actually be processed
+     */
+    String model = columns[SOURCE_COL];
+    if (getMappingType(model) != null)
+    {
+      return true;
+    }
+    System.err.println("Sorry, I don't handle exonerate model " + model);
+    return false;
+  }
+
+  /**
+   * Builds a sequence feature as the superclass does, and assigns it to the
+   * "exonerate" feature group.
+   * 
+   * @param gff
+   *          the GFF column data
+   * @param set
+   *          parsed GFF attributes key/value(s)
+   * @return the sequence feature built by the superclass, with its feature
+   *         group set
+   */
+  @Override
+  protected SequenceFeature buildSequenceFeature(String[] gff,
+          Map<String, List<String>> set)
+  {
+    SequenceFeature sf = super.buildSequenceFeature(gff, set);
+    sf.setFeatureGroup("exonerate");
+
+    return sf;
+  }
+
+}