/* * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) * Copyright (C) $$Year-Rel$$ The Jalview Authors * * This file is part of Jalview. * * Jalview is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * Jalview is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty * of MERCHANTABILITY or FITNESS FOR A PARTICULAR * PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Jalview. If not, see . * The Jalview Authors are detailed in the 'AUTHORS' file. */ package jalview.io.gff; import jalview.datamodel.AlignedCodonFrame; import jalview.datamodel.AlignmentI; import jalview.datamodel.MappingType; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.util.MapList; import java.io.IOException; import java.util.List; import java.util.Map; /** * A handler to parse GFF in the format generated by the exonerate tool */ public class ExonerateHelper extends Gff2Helper { private static final String SIMILARITY = "similarity"; private static final String GENOME2GENOME = "genome2genome"; private static final String CDNA2GENOME = "cdna2genome"; private static final String CODING2GENOME = "coding2genome"; private static final String CODING2CODING = "coding2coding"; private static final String PROTEIN2GENOME = "protein2genome"; private static final String PROTEIN2DNA = "protein2dna"; private static final String ALIGN = "Align"; private static final String QUERY = "Query"; private static final String TARGET = "Target"; /** * Process one GFF feature line (as modelled by SequenceFeature) * * @param seq * the sequence with which this feature is associated * @param gffColumns * the sequence feature with ATTRIBUTES property containing any * additional attributes * @param align * the alignment we are adding GFF to * @param newseqs * any new sequences referenced by the GFF * @param relaxedIdMatching * if true, match word tokens in sequence names * @return true if the sequence feature should be added to the sequence, else * false (i.e. it has been processed in another way e.g. to generate a * mapping) */ @Override public SequenceFeature processGff(SequenceI seq, String[] gffColumns, AlignmentI align, List newseqs, boolean relaxedIdMatching) { String attr = gffColumns[ATTRIBUTES_COL]; Map> set = parseNameValuePairs(attr); try { processGffSimilarity(set, seq, gffColumns, align, newseqs, relaxedIdMatching); } catch (IOException ivfe) { System.err.println(ivfe); } /* * return null to indicate we don't want to add a sequence feature for * similarity (only process it to create mappings) */ return null; } /** * Processes the 'Query' (or 'Target') and 'Align' properties associated with * an exonerate GFF similarity feature; these properties define the mapping of * the annotated range to a related sequence. * * @param set * parsed GFF column 9 key/value(s) * @param seq * the sequence the GFF feature is on * @param gff * the GFF column data * @param align * the alignment the sequence belongs to, where any new mappings * should be added * @param newseqs * a list of new 'virtual sequences' generated while parsing GFF * @param relaxedIdMatching * if true allow fuzzy search for a matching target sequence * @throws IOException */ protected void processGffSimilarity(Map> set, SequenceI seq, String[] gff, AlignmentI align, List newseqs, boolean relaxedIdMatching) throws IOException { /* * exonerate may be run with * --showquerygff - outputs 'features on the query' e.g. (protein2genome) * Target ; Align proteinStartPos dnaStartPos proteinCount * --showtargetgff - outputs 'features on the target' e.g. (protein2genome) * Query ; Align dnaStartPos proteinStartPos nucleotideCount * where the Align spec may repeat */ // TODO handle coding2coding and similar as well boolean featureIsOnTarget = true; List mapTo = set.get(QUERY); if (mapTo == null) { mapTo = set.get(TARGET); featureIsOnTarget = false; } MappingType type = getMappingType(gff[SOURCE_COL]); if (type == null) { throw new IOException("Sorry, I don't handle " + gff[SOURCE_COL]); } if (mapTo == null || mapTo.size() != 1) { throw new IOException( "Expecting exactly one sequence in Query or Target field (got " + mapTo + ")"); } /* * locate the mapped sequence in the alignment or 'new' (GFF file) sequences; */ SequenceI mappedSequence = findSequence(mapTo.get(0), align, newseqs, relaxedIdMatching); /* * If mapping is from protein to dna, we store it as dna to protein instead */ SequenceI mapFromSequence = seq; SequenceI mapToSequence = mappedSequence; if ((type == MappingType.NucleotideToPeptide && featureIsOnTarget) || (type == MappingType.PeptideToNucleotide && !featureIsOnTarget)) { mapFromSequence = mappedSequence; mapToSequence = seq; } /* * Process the Align maps and create mappings. * These may be cdna-genome, cdna-protein, genome-protein. * The mapped sequences may or may not be in the alignment * (they may be included later in the GFF file). */ /* * get any existing mapping for these sequences (or start one), * and add this mapped range */ AlignedCodonFrame acf = getMapping(align, mapFromSequence, mapToSequence); /* * exonerate GFF has the strand of the target in column 7 * (differs from GFF3 which has it in the Target descriptor) */ String strand = gff[STRAND_COL]; boolean forwardStrand = true; if ("-".equals(strand)) { forwardStrand = false; } else if (!"+".equals(strand)) { System.err.println("Strand must be specified for alignment"); return; } List alignedRegions = set.get(ALIGN); for (String region : alignedRegions) { MapList mapping = buildMapping(region, type, forwardStrand, featureIsOnTarget, gff); if (mapping == null) { continue; } acf.addMap(mapFromSequence, mapToSequence, mapping); } align.addCodonFrame(acf); } /** * Construct the mapping * * @param region * @param type * @param forwardStrand * @param featureIsOnTarget * @param gff * @return */ protected MapList buildMapping(String region, MappingType type, boolean forwardStrand, boolean featureIsOnTarget, String[] gff) { /* * process one "fromStart toStart fromCount" descriptor */ String[] tokens = region.split(" "); if (tokens.length != 3) { System.err.println("Malformed Align descriptor: " + region); return null; } /* * get start/end of from/to mappings * if feature is on the target sequence we have to invert the sense */ int alignFromStart; int alignToStart; int alignCount; try { alignFromStart = Integer.parseInt(tokens[0]); alignToStart = Integer.parseInt(tokens[1]); alignCount = Integer.parseInt(tokens[2]); } catch (NumberFormatException nfe) { System.err.println(nfe.toString()); return null; } int fromStart; int fromEnd; int toStart; int toEnd; if (featureIsOnTarget) { fromStart = alignToStart; toStart = alignFromStart; toEnd = forwardStrand ? toStart + alignCount - 1 : toStart - (alignCount - 1); int toLength = Math.abs(toEnd - toStart) + 1; int fromLength = toLength * type.getFromRatio() / type.getToRatio(); fromEnd = fromStart + fromLength - 1; } else { // we use the 'Align' values here not the feature start/end // not clear why they may differ but it seems they can fromStart = alignFromStart; fromEnd = alignFromStart + alignCount - 1; int fromLength = fromEnd - fromStart + 1; int toLength = fromLength * type.getToRatio() / type.getFromRatio(); toStart = alignToStart; if (forwardStrand) { toEnd = toStart + toLength - 1; } else { toEnd = toStart - (toLength - 1); } } MapList codonmapping = constructMappingFromAlign(fromStart, fromEnd, toStart, toEnd, type); return codonmapping; } /** * Returns a MappingType depending on the exonerate 'model' value. * * @param model * @return */ protected static MappingType getMappingType(String model) { MappingType result = null; if (model.contains(PROTEIN2DNA) || model.contains(PROTEIN2GENOME)) { result = MappingType.PeptideToNucleotide; } else if (model.contains(CODING2CODING) || model.contains(CODING2GENOME) || model.contains(CDNA2GENOME) || model.contains(GENOME2GENOME)) { result = MappingType.NucleotideToNucleotide; } return result; } /** * Tests whether the GFF data looks like it was generated by exonerate, and is * a format we are willing to handle * * @param columns * @return */ public static boolean recognises(String[] columns) { if (!SIMILARITY.equalsIgnoreCase(columns[TYPE_COL])) { return false; } /* * inspect alignment model */ String model = columns[SOURCE_COL]; // e.g. exonerate:protein2genome:local if (model != null) { String mdl = model.toLowerCase(); if (mdl.contains(PROTEIN2DNA) || mdl.contains(PROTEIN2GENOME) || mdl.contains(CODING2CODING) || mdl.contains(CODING2GENOME) || mdl.contains(CDNA2GENOME) || mdl.contains(GENOME2GENOME)) { return true; } } System.err.println("Sorry, I don't handle exonerate model " + model); return false; } @Override protected SequenceFeature buildSequenceFeature(String[] gff, Map> set) { SequenceFeature sf = super.buildSequenceFeature(gff, set); sf.setFeatureGroup("exonerate"); return sf; } }