/*
* Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
* Copyright (C) $$Year-Rel$$ The Jalview Authors
*
* This file is part of Jalview.
*
* Jalview is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* Jalview is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Jalview. If not, see .
* The Jalview Authors are detailed in the 'AUTHORS' file.
*/
package jalview.io.gff;
import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.MappingType;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.util.MapList;
import java.io.IOException;
import java.util.List;
import java.util.Map;
/**
* A handler to parse GFF in the format generated by the exonerate tool
*/
public class ExonerateHelper extends Gff2Helper
{
private static final String SIMILARITY = "similarity";
private static final String GENOME2GENOME = "genome2genome";
private static final String CDNA2GENOME = "cdna2genome";
private static final String CODING2GENOME = "coding2genome";
private static final String CODING2CODING = "coding2coding";
private static final String PROTEIN2GENOME = "protein2genome";
private static final String PROTEIN2DNA = "protein2dna";
private static final String ALIGN = "Align";
private static final String QUERY = "Query";
private static final String TARGET = "Target";
/**
* Process one GFF feature line (as modelled by SequenceFeature)
*
* @param seq
* the sequence with which this feature is associated
* @param gffColumns
* the sequence feature with ATTRIBUTES property containing any
* additional attributes
* @param align
* the alignment we are adding GFF to
* @param newseqs
* any new sequences referenced by the GFF
* @param relaxedIdMatching
* if true, match word tokens in sequence names
* @return true if the sequence feature should be added to the sequence, else
* false (i.e. it has been processed in another way e.g. to generate a
* mapping)
*/
@Override
public SequenceFeature processGff(SequenceI seq, String[] gffColumns,
AlignmentI align, List newseqs,
boolean relaxedIdMatching)
{
String attr = gffColumns[ATTRIBUTES_COL];
Map> set = parseNameValuePairs(attr);
try
{
processGffSimilarity(set, seq, gffColumns, align, newseqs,
relaxedIdMatching);
} catch (IOException ivfe)
{
System.err.println(ivfe);
}
/*
* return null to indicate we don't want to add a sequence feature for
* similarity (only process it to create mappings)
*/
return null;
}
/**
* Processes the 'Query' (or 'Target') and 'Align' properties associated with
* an exonerate GFF similarity feature; these properties define the mapping of
* the annotated range to a related sequence.
*
* @param set
* parsed GFF column 9 key/value(s)
* @param seq
* the sequence the GFF feature is on
* @param gff
* the GFF column data
* @param align
* the alignment the sequence belongs to, where any new mappings
* should be added
* @param newseqs
* a list of new 'virtual sequences' generated while parsing GFF
* @param relaxedIdMatching
* if true allow fuzzy search for a matching target sequence
* @throws IOException
*/
protected void processGffSimilarity(Map> set,
SequenceI seq, String[] gff, AlignmentI align,
List newseqs, boolean relaxedIdMatching)
throws IOException
{
/*
* exonerate may be run with
* --showquerygff - outputs 'features on the query' e.g. (protein2genome)
* Target ; Align proteinStartPos dnaStartPos proteinCount
* --showtargetgff - outputs 'features on the target' e.g. (protein2genome)
* Query ; Align dnaStartPos proteinStartPos nucleotideCount
* where the Align spec may repeat
*/
// TODO handle coding2coding and similar as well
boolean featureIsOnTarget = true;
List mapTo = set.get(QUERY);
if (mapTo == null)
{
mapTo = set.get(TARGET);
featureIsOnTarget = false;
}
MappingType type = getMappingType(gff[SOURCE_COL]);
if (type == null)
{
throw new IOException("Sorry, I don't handle " + gff[SOURCE_COL]);
}
if (mapTo == null || mapTo.size() != 1)
{
throw new IOException(
"Expecting exactly one sequence in Query or Target field (got "
+ mapTo + ")");
}
/*
* locate the mapped sequence in the alignment or 'new' (GFF file) sequences;
*/
SequenceI mappedSequence = findSequence(mapTo.get(0), align, newseqs,
relaxedIdMatching);
/*
* If mapping is from protein to dna, we store it as dna to protein instead
*/
SequenceI mapFromSequence = seq;
SequenceI mapToSequence = mappedSequence;
if ((type == MappingType.NucleotideToPeptide && featureIsOnTarget)
|| (type == MappingType.PeptideToNucleotide
&& !featureIsOnTarget))
{
mapFromSequence = mappedSequence;
mapToSequence = seq;
}
/*
* Process the Align maps and create mappings.
* These may be cdna-genome, cdna-protein, genome-protein.
* The mapped sequences may or may not be in the alignment
* (they may be included later in the GFF file).
*/
/*
* get any existing mapping for these sequences (or start one),
* and add this mapped range
*/
AlignedCodonFrame acf = getMapping(align, mapFromSequence,
mapToSequence);
/*
* exonerate GFF has the strand of the target in column 7
* (differs from GFF3 which has it in the Target descriptor)
*/
String strand = gff[STRAND_COL];
boolean forwardStrand = true;
if ("-".equals(strand))
{
forwardStrand = false;
}
else if (!"+".equals(strand))
{
System.err.println("Strand must be specified for alignment");
return;
}
List alignedRegions = set.get(ALIGN);
for (String region : alignedRegions)
{
MapList mapping = buildMapping(region, type, forwardStrand,
featureIsOnTarget, gff);
if (mapping == null)
{
continue;
}
acf.addMap(mapFromSequence, mapToSequence, mapping);
}
align.addCodonFrame(acf);
}
/**
* Construct the mapping
*
* @param region
* @param type
* @param forwardStrand
* @param featureIsOnTarget
* @param gff
* @return
*/
protected MapList buildMapping(String region, MappingType type,
boolean forwardStrand, boolean featureIsOnTarget, String[] gff)
{
/*
* process one "fromStart toStart fromCount" descriptor
*/
String[] tokens = region.split(" ");
if (tokens.length != 3)
{
System.err.println("Malformed Align descriptor: " + region);
return null;
}
/*
* get start/end of from/to mappings
* if feature is on the target sequence we have to invert the sense
*/
int alignFromStart;
int alignToStart;
int alignCount;
try
{
alignFromStart = Integer.parseInt(tokens[0]);
alignToStart = Integer.parseInt(tokens[1]);
alignCount = Integer.parseInt(tokens[2]);
} catch (NumberFormatException nfe)
{
System.err.println(nfe.toString());
return null;
}
int fromStart;
int fromEnd;
int toStart;
int toEnd;
if (featureIsOnTarget)
{
fromStart = alignToStart;
toStart = alignFromStart;
toEnd = forwardStrand ? toStart + alignCount - 1
: toStart - (alignCount - 1);
int toLength = Math.abs(toEnd - toStart) + 1;
int fromLength = toLength * type.getFromRatio() / type.getToRatio();
fromEnd = fromStart + fromLength - 1;
}
else
{
// we use the 'Align' values here not the feature start/end
// not clear why they may differ but it seems they can
fromStart = alignFromStart;
fromEnd = alignFromStart + alignCount - 1;
int fromLength = fromEnd - fromStart + 1;
int toLength = fromLength * type.getToRatio() / type.getFromRatio();
toStart = alignToStart;
if (forwardStrand)
{
toEnd = toStart + toLength - 1;
}
else
{
toEnd = toStart - (toLength - 1);
}
}
MapList codonmapping = constructMappingFromAlign(fromStart, fromEnd,
toStart, toEnd, type);
return codonmapping;
}
/**
* Returns a MappingType depending on the exonerate 'model' value.
*
* @param model
* @return
*/
protected static MappingType getMappingType(String model)
{
MappingType result = null;
if (model.contains(PROTEIN2DNA) || model.contains(PROTEIN2GENOME))
{
result = MappingType.PeptideToNucleotide;
}
else if (model.contains(CODING2CODING) || model.contains(CODING2GENOME)
|| model.contains(CDNA2GENOME) || model.contains(GENOME2GENOME))
{
result = MappingType.NucleotideToNucleotide;
}
return result;
}
/**
* Tests whether the GFF data looks like it was generated by exonerate, and is
* a format we are willing to handle
*
* @param columns
* @return
*/
public static boolean recognises(String[] columns)
{
if (!SIMILARITY.equalsIgnoreCase(columns[TYPE_COL]))
{
return false;
}
/*
* inspect alignment model
*/
String model = columns[SOURCE_COL];
// e.g. exonerate:protein2genome:local
if (model != null)
{
String mdl = model.toLowerCase();
if (mdl.contains(PROTEIN2DNA) || mdl.contains(PROTEIN2GENOME)
|| mdl.contains(CODING2CODING) || mdl.contains(CODING2GENOME)
|| mdl.contains(CDNA2GENOME) || mdl.contains(GENOME2GENOME))
{
return true;
}
}
System.err.println("Sorry, I don't handle exonerate model " + model);
return false;
}
/**
* An override to set feature group to "exonerate" instead of the default GFF
* source value (column 2)
*/
@Override
protected SequenceFeature buildSequenceFeature(String[] gff,
Map> set)
{
SequenceFeature sf = super.buildSequenceFeature(gff, TYPE_COL,
"exonerate", set);
return sf;
}
}