contig_1146 exonerate:protein2genome:local gene 8534 11269 3652 - . gene_id 0 ; sequence DDB_G0269124 ; gene_orientation .
contig_1146 exonerate:protein2genome:local cds 8534 11269 . - .
contig_1146 exonerate:protein2genome:local exon 8534 11269 . - . insertions 3 ; deletions 6
+# TODO: need to understand why the GFF feature ends at 11269 but the first Align starts at 11270
contig_1146 exonerate:protein2genome:local similarity 8534 11269 3652 - . alignment_id 0 ; Query DDB_G0269124 ; Align 11270 143 120 ; Align 11150 187 282 ; Align 10865 281 888 ; Align 9977 578 1068 ; Align 8909 935 375
# and a made-up alignment to a sequence in exonerateseqs.fa
contig_1146 exonerate:protein2genome:local similarity 8534 11269 3652 - . alignment_id 0 ; Query DDB_G0280897 ; Align 11270 143 120
##date 2015-01-16
##type DNA
#
+# exonerate run with --showtargetgff generates 'features on the target' i.e. mappings to the query
# tab-delimited
# seqname source feature start end score strand frame attributes
#
seq1 exonerate:protein2genome:local gene 8 11 3652 - . gene_id 0 ; sequence seq2 ; gene_orientation .
seq1 exonerate:protein2genome:local cds 9 11 . - .
seq1 exonerate:protein2genome:local exon 9 11 . - . insertions 3 ; deletions 6
-seq1 exonerate:protein2genome:local similarity 8 11 3652 - . alignment_id 0 ; Query seq2 ; Align 11 1 3
+#seq1 exonerate:protein2genome:local similarity 8 11 3652 - . alignment_id 0 ; Query seq2 ; Align 11 1 3
+seq1 exonerate:protein2genome:local similarity 9 11 3652 - . alignment_id 0 ; Query seq2 ; Align 11 1 3
#
# appending FASTA sequences is strictly a GFF3 format feature
# but Jalview is able to handle this mixture of GFF2 / GFF3 :-)
*
* @see java.lang.Object#finalize()
*/
+ @Override
protected void finalize() throws Throwable
{
map = null;
--- /dev/null
+package jalview.datamodel;
+
+/**
+ * An enumeration of the kinds of mapping (from nucleotide or peptide, to
+ * nucleotide or peptide), and the corresponding word lengths. Each constant
+ * holds a 'from' and a 'to' ratio (e.g. 3:1 for nucleotide to peptide) and
+ * knows which constant describes the mapping in the opposite direction.
+ */
+public enum MappingType
+{
+  NucleotideToPeptide(3, 1)
+  {
+    @Override
+    public MappingType getInverse()
+    {
+      return PeptideToNucleotide;
+    }
+  },
+  PeptideToNucleotide(1, 3)
+  {
+    @Override
+    public MappingType getInverse()
+    {
+      return NucleotideToPeptide;
+    }
+  },
+  NucleotideToNucleotide(1, 1)
+  {
+    @Override
+    public MappingType getInverse()
+    {
+      return NucleotideToNucleotide;
+    }
+  },
+  PeptideToPeptide(1, 1)
+  {
+    @Override
+    public MappingType getInverse()
+    {
+      return PeptideToPeptide;
+    }
+  };
+
+  /*
+   * ratios are fixed per constant, so held as final fields
+   */
+  private final int fromRatio;
+
+  private final int toRatio;
+
+  /**
+   * Constructor
+   * 
+   * @param fromSize
+   *          word length on the 'from' side of the mapping
+   * @param toSize
+   *          word length on the 'to' side of the mapping
+   */
+  private MappingType(int fromSize, int toSize)
+  {
+    fromRatio = fromSize;
+    toRatio = toSize;
+  }
+
+  /**
+   * Returns the mapping type for the reverse direction (the 'from' and 'to'
+   * sides swapped)
+   * 
+   * @return
+   */
+  public abstract MappingType getInverse();
+
+  /**
+   * Returns the word length on the 'from' side of the mapping
+   * 
+   * @return
+   */
+  public int getFromRatio()
+  {
+    return fromRatio;
+  }
+
+  /**
+   * Returns the word length on the 'to' side of the mapping
+   * 
+   * @return
+   */
+  public int getToRatio()
+  {
+    return toRatio;
+  }
+}
*/
package jalview.io;
+import jalview.analysis.AlignmentUtils;
import jalview.analysis.SequenceIdMatcher;
import jalview.api.AlignViewportI;
+import jalview.api.FeaturesSourceI;
import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.Alignment;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.SequenceDummy;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
+import jalview.io.gff.GffHelperBase;
+import jalview.io.gff.GffHelperFactory;
+import jalview.io.gff.GffHelperI;
import jalview.schemes.AnnotationColourGradient;
import jalview.schemes.GraduatedColor;
import jalview.schemes.UserColourScheme;
* @author jbprocter
* @author gmcarstairs
*/
-public class FeaturesFile extends AlignFile
+public class FeaturesFile extends AlignFile implements FeaturesSourceI
{
- private static final String NOTE = "Note";
-
- private static final String ALIGN = "Align";
-
- private static final String QUERY = "Query";
-
- private static final String TARGET = "Target";
-
- private static final String SIMILARITY = "similarity";
+ private static final String ID_NOT_SPECIFIED = "ID_NOT_SPECIFIED";
- protected static final String STRAND = "STRAND";
+ private static final String NOTE = "Note";
protected static final String FRAME = "FRAME";
- protected static final String ATTRIBUTES = "ATTRIBUTES";
-
protected static final String TAB = "\t";
protected static final String GFF_VERSION = "##gff-version";
String line = null;
try
{
- StringTokenizer st;
+ String[] gffColumns;
String featureGroup = null;
while ((line = nextLine()) != null)
continue;
}
- st = new StringTokenizer(line, TAB);
- if (st.countTokens() == 1)
+ gffColumns = line.split("\\t"); // tab as regex
+ if (gffColumns.length == 1)
{
if (line.trim().equalsIgnoreCase("GFF"))
{
/*
- * Jalview features file with appendded GFF
- * assume GFF2 (though it may declare gff-version 3)
+ * Jalview features file with appended GFF
+ * assume GFF2 (though it may declare ##gff-version 3)
*/
gffVersion = 2;
continue;
}
}
- if (st.countTokens() > 1 && st.countTokens() < 4)
+ if (gffColumns.length > 1 && gffColumns.length < 4)
{
/*
* if 2 or 3 tokens, we anticipate either 'startgroup', 'endgroup' or
- * a feature type colour specification; not GFF format
+ * a feature type colour specification
*/
- String ft = st.nextToken();
+ String ft = gffColumns[0];
if (ft.equalsIgnoreCase("startgroup"))
{
- featureGroup = st.nextToken();
+ featureGroup = gffColumns[1];
}
else if (ft.equalsIgnoreCase("endgroup"))
{
// We should check whether this is the current group,
// but at present theres no way of showing more than 1 group
- st.nextToken();
featureGroup = null;
}
else
{
- parseFeatureColour(line, ft, st, colours);
+ parseFeatureColour(line, ft, gffColumns, colours);
}
continue;
}
*/
if (gffVersion == 0)
{
- parseJalviewFeature(line, st, align, colours, removeHTML,
+ parseJalviewFeature(line, gffColumns, align, colours, removeHTML,
relaxedIdmatching, featureGroup);
}
else
{
- parseGffFeature(st, align, relaxedIdmatching, newseqs);
+ parseGff(gffColumns, align, relaxedIdmatching, newseqs);
}
}
resetMatcher();
}
/**
- * Try to parse a Jalview format feature specification. Returns true if
- * successful or false if not.
+ * Try to parse a Jalview format feature specification and add it as a
+ * sequence feature to any matching sequences in the alignment. Returns true
+ * if successful (a feature was added), or false if not.
*
* @param line
- * @param st
+ * @param gffColumns
* @param alignment
* @param featureColours
* @param removeHTML
* @param relaxedIdmatching
* @param featureGroup
*/
- protected boolean parseJalviewFeature(String line, StringTokenizer st,
+ protected boolean parseJalviewFeature(String line, String[] gffColumns,
AlignmentI alignment, Map<String, Object> featureColours,
boolean removeHTML, boolean relaxedIdMatching, String featureGroup)
{
/*
- * Jalview: description seqid seqIndex start end type [score]
+ * tokens: description seqid seqIndex start end type [score]
*/
- if (st.countTokens() < 6)
+ if (gffColumns.length < 6)
{
System.err.println("Ignoring feature line '" + line
- + "' with unexpected number of columns (" + st.countTokens()
- + ")");
+ + "' with too few columns (" + gffColumns.length + ")");
return false;
}
- String desc = st.nextToken();
- String seqId = st.nextToken();
- SequenceI seq = findName(alignment, null, relaxedIdMatching, seqId);
+ String desc = gffColumns[0];
+ String seqId = gffColumns[1];
+ SequenceI seq = findSequence(seqId, alignment, null, relaxedIdMatching);
- if (!seqId.equals("ID_NOT_SPECIFIED"))
+ if (!ID_NOT_SPECIFIED.equals(seqId))
{
- seq = findName(alignment, null, relaxedIdMatching, seqId);
- st.nextToken();
+ seq = findSequence(seqId, alignment, null, relaxedIdMatching);
}
else
{
seqId = null;
seq = null;
+ String seqIndex = gffColumns[2];
try
{
- int idx = Integer.parseInt(st.nextToken());
+ int idx = Integer.parseInt(seqIndex);
seq = alignment.getSequenceAt(idx);
} catch (NumberFormatException ex)
{
- // continue
+ System.err.println("Invalid sequence index: " + seqIndex);
}
}
return false;
}
- int startPos = Integer.parseInt(st.nextToken());
- int endPos = Integer.parseInt(st.nextToken());
+ int startPos = Integer.parseInt(gffColumns[3]);
+ int endPos = Integer.parseInt(gffColumns[4]);
- String ft = st.nextToken();
+ String ft = gffColumns[5];
if (!featureColours.containsKey(ft))
{
UserColourScheme ucs = new UserColourScheme(ft);
featureColours.put(ft, ucs.findColour('A'));
}
- SequenceFeature sf = new SequenceFeature(ft, desc, "",
- startPos, endPos, featureGroup);
- if (st.hasMoreTokens())
+ SequenceFeature sf = new SequenceFeature(ft, desc, "", startPos,
+ endPos, featureGroup);
+ if (gffColumns.length > 6)
{
- float score = 0f;
+ float score = Float.NaN;
try
{
- score = new Float(st.nextToken()).floatValue();
+ score = new Float(gffColumns[6]).floatValue();
// update colourgradient bounds if allowed to
} catch (NumberFormatException ex)
{
- // leave as 0
+ // leave as NaN
}
sf.setScore(score);
}
* the current input line (for error messages only)
* @param featureType
* the first token on the line
- * @param st
- * holds remaining tokens on the line
+ * @param gffColumns
+ * holds tokens on the line
* @param colours
* map to which to add derived colour specification
*/
protected void parseFeatureColour(String line, String featureType,
- StringTokenizer st, Map<String, Object> colours)
+ String[] gffColumns, Map<String, Object> colours)
{
Object colour = null;
- String colscheme = st.nextToken();
+ String colscheme = gffColumns[1];
if (colscheme.indexOf("|") > -1
|| colscheme.trim().equalsIgnoreCase("label"))
{
* list, and returns it</li>
* </ul>
*
+ * @param seqId
* @param align
* @param newseqs
* @param relaxedIdMatching
- * @param seqId
+ *
* @return
*/
- protected SequenceI findName(AlignmentI align, List<SequenceI> newseqs,
- boolean relaxedIdMatching, String seqId)
+ protected SequenceI findSequence(String seqId, AlignmentI align,
+ List<SequenceI> newseqs, boolean relaxedIdMatching)
{
+ // TODO encapsulate in SequenceIdMatcher, share the matcher
+ // with the GffHelper (removing code duplication)
SequenceI match = null;
if (relaxedIdMatching)
{
* a map whose keys are the type names of visible features
* @return
*/
- public String printGffFormat(SequenceI[] sequences, Map<String, Object> visible)
+ public String printGffFormat(SequenceI[] sequences,
+ Map<String, Object> visible)
{
return printGffFormat(sequences, visible, true, true);
}
* @param includeNonPositionalFeatures
* @return
*/
- public String printGffFormat(SequenceI[] sequences, Map<String, Object> visible, boolean outputVisibleOnly,
+ public String printGffFormat(SequenceI[] sequences,
+ Map<String, Object> visible, boolean outputVisibleOnly,
boolean includeNonPositionalFeatures)
{
StringBuilder out = new StringBuilder(256);
*/
continue;
}
-
+
source = sf.featureGroup;
if (source == null)
{
source = sf.getDescription();
}
-
+
out.append(seq.getName());
out.append(TAB);
out.append(source);
out.append(TAB);
out.append(sf.score);
out.append(TAB);
-
- out.append(sf.getValue(STRAND, "."));
+
+ int strand = sf.getStrand();
+ out.append(strand == 1 ? "+" : (strand == -1 ? "-" : "."));
out.append(TAB);
-
+
out.append(sf.getValue(FRAME, "."));
-
+
// miscellaneous key-values (GFF column 9)
- String attributes = (String) sf.getValue(ATTRIBUTES);
+ String attributes = sf.getAttributes();
if (attributes != null)
{
out.append(TAB).append(attributes);
}
-
+
out.append(newline);
}
}
}
-
+
return out.toString();
}
toRanges[toRangesIndex++] = toStart;
toRanges[toRangesIndex++] = toStart + (fromCount - 1) / 3;
}
-
+
return new MapList(fromRanges, toRanges, 3, 1);
}
/**
- * Parse a GFF format feature. This may include creating a 'dummy' sequence
- * for the feature or its mapped sequence
+ * Parse a GFF format feature. This may include creating a 'dummy' sequence to
+ * hold the feature, or for its mapped sequence, or both, to be resolved
+ * either later in the GFF file (##FASTA section), or when the user loads
+ * additional sequences.
*
- * @param st
+ * @param gffColumns
* @param alignment
* @param relaxedIdMatching
* @param newseqs
* @return
*/
- protected SequenceI parseGffFeature(StringTokenizer st,
- AlignmentI alignment, boolean relaxedIdMatching,
- List<SequenceI> newseqs)
+ protected SequenceI parseGff(String[] gffColumns, AlignmentI alignment,
+ boolean relaxedIdMatching, List<SequenceI> newseqs)
{
- SequenceI seq;
/*
* GFF: seqid source type start end score strand phase [attributes]
*/
- if (st.countTokens() < 8)
+ if (gffColumns.length < 5)
{
- System.err
- .println("Ignoring GFF feature line with unexpected number of columns ("
- + st.countTokens() + ")");
+ System.err.println("Ignoring GFF feature line with too few columns ("
+ + gffColumns.length + ")");
return null;
}
- String seqId = st.nextToken();
-
+
/*
* locate referenced sequence in alignment _or_
- * as a forward reference (SequenceDummy)
+ * as a forward or external reference (SequenceDummy)
*/
- seq = findName(alignment, newseqs, relaxedIdMatching, seqId);
-
- String desc = st.nextToken();
- String group = null;
- if (desc.indexOf(' ') == -1)
- {
- // could also be a source term rather than description line
- group = desc;
- }
- String ft = st.nextToken();
- int startPos = StringUtils.parseInt(st.nextToken());
- int endPos = StringUtils.parseInt(st.nextToken());
- // TODO: decide if non positional feature assertion for input data
- // where end==0 is generally valid
- if (endPos == 0)
- {
- // treat as non-positional feature, regardless.
- startPos = 0;
- }
- float score = 0f;
- try
- {
- score = new Float(st.nextToken()).floatValue();
- } catch (NumberFormatException ex)
- {
- // leave at 0
- }
-
- SequenceFeature sf = new SequenceFeature(ft, desc, startPos,
- endPos, score, group);
- if (st.hasMoreTokens())
- {
- sf.setValue(STRAND, st.nextToken());
- }
- if (st.hasMoreTokens())
- {
- sf.setValue(FRAME, st.nextToken());
- }
-
- if (st.hasMoreTokens())
- {
- processGffColumnNine(st.nextToken(), sf);
- }
-
- if (processOrAddSeqFeature(alignment, newseqs, seq, sf,
- relaxedIdMatching))
+ String seqId = gffColumns[0];
+ SequenceI seq = findSequence(seqId, alignment, newseqs,
+ relaxedIdMatching);
+
+ SequenceFeature sf = null;
+ GffHelperI helper = GffHelperFactory.getHelper(gffColumns);
+ if (helper != null)
{
- // check whether we should add the sequence feature to any other
- // sequences in the alignment with the same or similar
- while ((seq = alignment.findName(seq, seqId, true)) != null)
+ try
+ {
+ sf = helper.processGff(seq, gffColumns, alignment, newseqs,
+ relaxedIdMatching);
+ if (sf != null)
+ {
+ seq.addSequenceFeature(sf);
+ while ((seq = alignment.findName(seq, seqId, true)) != null)
+ {
+ seq.addSequenceFeature(new SequenceFeature(sf));
+ }
+ }
+ } catch (IOException e)
{
- seq.addSequenceFeature(new SequenceFeature(sf));
+ System.err.println("GFF parsing failed with: " + e.getMessage());
+ return null;
}
}
+
return seq;
}
*/
protected void processGffColumnNine(String attributes, SequenceFeature sf)
{
- sf.setValue(ATTRIBUTES, attributes);
-
+ sf.setAttributes(attributes);
+
/*
* Parse attributes in column 9 and add them to the sequence feature's
* 'otherData' table; use Note as a best proxy for description
*/
- char[] nameValueSeparator = new char[] { gffVersion == 3 ? '=' : ' ' };
- Map<String, List<String>> nameValues = StringUtils.parseNameValuePairs(attributes, ";",
- nameValueSeparator);
+ char nameValueSeparator = gffVersion == 3 ? '=' : ' ';
+ // TODO check we don't break GFF2 values which include commas here
+ Map<String, List<String>> nameValues = GffHelperBase
+ .parseNameValuePairs(attributes, ";", nameValueSeparator, ",");
for (Entry<String, List<String>> attr : nameValues.entrySet())
{
String values = StringUtils.listToDelimitedString(attr.getValue(),
}
FastaFile parser = new FastaFile(this);
List<SequenceI> includedseqs = parser.getSeqs();
+
SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs);
- // iterate over includedseqs, and replacing matching ones with newseqs
- // sequences. Generic iterator not used here because we modify includedseqs
- // as we go
+
+ /*
+ * iterate over includedseqs, and replacing matching ones with newseqs
+ * sequences. Generic iterator not used here because we modify
+ * includedseqs as we go
+ */
for (int p = 0, pSize = includedseqs.size(); p < pSize; p++)
{
// search for any dummy seqs that this sequence can be used to update
- SequenceI dummyseq = smatcher.findIdMatch(includedseqs.get(p));
- if (dummyseq != null)
+ SequenceI includedSeq = includedseqs.get(p);
+ SequenceI dummyseq = smatcher.findIdMatch(includedSeq);
+ if (dummyseq != null && dummyseq instanceof SequenceDummy)
{
- // dummyseq was created so it could be annotated and referred to in
- // alignments/codon mappings
-
- SequenceI mseq = includedseqs.get(p);
- // mseq is the 'template' imported from the FASTA file which we'll use
- // to coomplete dummyseq
- if (dummyseq instanceof SequenceDummy)
+ // probably have the pattern wrong
+ // idea is that a flyweight proxy for a sequence ID can be created for
+ // 1. stable reference creation
+ // 2. addition of annotation
+ // 3. future replacement by a real sequence
+ // current pattern is to create SequenceDummy objects - a convenience
+ // constructor for a Sequence.
+ // problem is that when promoted to a real sequence, all references
+ // need to be updated somehow. We avoid that by keeping the same object.
+ ((SequenceDummy) dummyseq).become(includedSeq);
+ dummyseq.createDatasetSequence();
+
+ /*
+ * Update mappings so they are now to the dataset sequence
+ */
+ for (AlignedCodonFrame mapping : align.getCodonFrames())
{
- // probably have the pattern wrong
- // idea is that a flyweight proxy for a sequence ID can be created for
- // 1. stable reference creation
- // 2. addition of annotation
- // 3. future replacement by a real sequence
- // current pattern is to create SequenceDummy objects - a convenience
- // constructor for a Sequence.
- // problem is that when promoted to a real sequence, all references
- // need
- // to be updated somehow.
- ((SequenceDummy) dummyseq).become(mseq);
- includedseqs.set(p, dummyseq); // template is no longer needed
+ mapping.updateToDataset(dummyseq);
}
+
+ /*
+ * replace parsed sequence with the realised forward reference
+ */
+ includedseqs.set(p, dummyseq);
}
}
- // finally add sequences to the dataset
+
+ /*
+ * finally add sequences to the dataset
+ */
for (SequenceI seq : includedseqs)
{
+ // experimental: mapping-based 'alignment' to query sequence
+ AlignmentUtils.alignSequenceAs(seq, align,
+ String.valueOf(align.getGapCharacter()), false, true);
+
+ // rename sequences if GFF handler requested this
+ // TODO a more elegant way e.g. gffHelper.postProcess(newseqs) ?
+ SequenceFeature[] sfs = seq.getSequenceFeatures();
+ if (sfs != null)
+ {
+ String newName = (String) sfs[0].getValue(GffHelperI.RENAME_TOKEN);
+ if (newName != null)
+ {
+ seq.setName(newName);
+ }
+ }
align.addSequence(seq);
}
}
* @param newseqs
* @throws IOException
*/
- protected void processGffPragma(String line, Map<String, String> gffProps, AlignmentI align,
+ protected void processGffPragma(String line,
+ Map<String, String> gffProps, AlignmentI align,
List<SequenceI> newseqs) throws IOException
{
line = line.trim();
// close off any open 'forward references'
return;
}
-
+
String[] tokens = line.substring(2).split(" ");
String pragma = tokens[0];
String value = tokens.length == 1 ? null : tokens[1];
-
+
if ("gff-version".equalsIgnoreCase(pragma))
{
if (value != null)
}
}
}
+ else if ("sequence-region".equalsIgnoreCase(pragma))
+ {
+ // could capture <seqid start end> if wanted here
+ }
else if ("feature-ontology".equalsIgnoreCase(pragma))
{
// should resolve against the specified feature ontology URI
System.err.println("Ignoring unknown pragma: " + line);
}
}
-
- /**
- * Processes the 'Query' (or 'Target') and 'Align' properties associated with
- * an exonerate GFF similarity feature; these properties define the mapping of
- * the annotated feature (e.g. 'exon') to a related sequence.
- *
- * @param set
- * @param seq
- * @param sf
- * @param align
- * @param newseqs
- * @param relaxedIdMatching
- * @throws IOException
- */
- public void processGffSimilarity(Map<String, List<String>> set, SequenceI seq,
- SequenceFeature sf, AlignmentI align, List<SequenceI> newseqs, boolean relaxedIdMatching)
- throws IOException
- {
- if (!validateExonerateModel(sf))
- {
- return;
- }
-
- int strand = sf.getStrand();
-
- /*
- * exonerate (protein2dna or protein2genome) may be run with
- * --showquerygff outputs
- * Target <dnaseqid> ; Align proteinStartPos dnaStartPos peptideCount
- * --showtargetgff outputs
- * Query <proteinseqid> ; Align dnaStartPos proteinStartPos nucleotideCount
- * where the Align spec may repeat
- */
- boolean mapIsFromCdna = true;
- List<String> mapTo = set.get(QUERY);
- if (mapTo == null)
- {
- mapTo = set.get(TARGET);
- mapIsFromCdna = false;
- }
- if (mapTo == null || mapTo.size() != 1)
- {
- throw new IOException(
- "Expecting exactly one sequence in Query field (got " + mapTo
- + ")");
- }
-
- /*
- * locate the mapped sequence in the alignment or 'new' (GFF file) sequences;
- */
- SequenceI mappedSequence = findName(align, newseqs, relaxedIdMatching,
- mapTo.get(0));
- /*
- * Process the Align maps and create cdna/protein maps;
- * ideally, the query sequences are in the alignment, but maybe not...
- */
- AlignedCodonFrame alco = new AlignedCodonFrame();
- MapList codonmapping = constructCodonMappingFromAlign(set.get(ALIGN),
- mapIsFromCdna, strand);
-
- /*
- * Jalview always maps from dna to protein
- */
- if (mapIsFromCdna)
- {
- alco.addMap(seq, mappedSequence, codonmapping);
- }
- else
- {
- alco.addMap(mappedSequence, seq, codonmapping);
- }
- align.addCodonFrame(alco);
- }
-
- /**
- * Returns true if the exonerate model (saved from column 2 of the GFF as the
- * SequenceFeature's group) is one that we are willing to process, else false
- *
- * @param sf
- */
- protected boolean validateExonerateModel(SequenceFeature sf)
- {
- /*
- * we don't handle protein-to-protein or dna-to-dna alignment here
- */
- String source = sf.getFeatureGroup();
- if (source == null
- || (!source.contains("protein2dna") && !source
- .contains("protein2genome")))
- {
- System.err
- .println("I only accept protein2dna or protein2genome but found "
- + source);
- return false;
- }
- return true;
- }
-
- /**
- * take a sequence feature and examine its attributes to decide how it should
- * be added to a sequence
- *
- * @param seq
- * - the destination sequence constructed or discovered in the
- * current context
- * @param sf
- * - the base feature with ATTRIBUTES property containing any
- * additional attributes
- * @param gFFFile
- * - true if we are processing a GFF annotation file
- * @return true if sf was actually added to the sequence, false if it was
- * processed in another way
- */
- public boolean processOrAddSeqFeature(AlignmentI align, List<SequenceI> newseqs,
- SequenceI seq, SequenceFeature sf, boolean relaxedIdMatching)
- {
- String attr = (String) sf.getValue(ATTRIBUTES);
- boolean addFeature = true;
- if (attr != null)
- {
- for (String attset : attr.split(TAB))
- {
- Map<String, List<String>> set = StringUtils.parseNameValuePairs(
- attset, ";", new char[] { ' ', '-' });
-
- if (SIMILARITY.equals(sf.getType()))
- {
- try
- {
- addFeature = false;
- processGffSimilarity(set, seq, sf, align, newseqs,
- relaxedIdMatching);
- } catch (IOException ivfe)
- {
- System.err.println(ivfe);
- }
- }
- }
- }
- if (addFeature)
- {
- seq.addSequenceFeature(sf);
- }
- return addFeature;
- }
-
}
--- /dev/null
+package jalview.io.gff;
+
+import jalview.datamodel.AlignedCodonFrame;
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.MappingType;
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
+import jalview.util.MapList;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * A handler to parse GFF in the format generated by the exonerate tool
+ */
+public class ExonerateHelper extends Gff2Helper
+{
+ // GFF feature type that marks a line as an exonerate alignment
+ private static final String SIMILARITY = "similarity";
+
+ // exonerate model names, searched for within the GFF 'source' column
+ // (e.g. exonerate:protein2genome:local)
+ private static final String GENOME2GENOME = "genome2genome";
+
+ private static final String CDNA2GENOME = "cdna2genome";
+
+ private static final String CODING2GENOME = "coding2genome";
+
+ private static final String CODING2CODING = "coding2coding";
+
+ private static final String PROTEIN2GENOME = "protein2genome";
+
+ private static final String PROTEIN2DNA = "protein2dna";
+
+ // names of the attributes read from a similarity line
+ private static final String ALIGN = "Align";
+
+ private static final String QUERY = "Query";
+
+ private static final String TARGET = "Target";
+
+  /**
+   * Process one exonerate 'similarity' GFF line. The attributes column is
+   * parsed and used to add sequence mapping(s) to the alignment; no sequence
+   * feature is created for the line. Problems found while processing are
+   * reported on stderr and the line is otherwise ignored.
+   * 
+   * @param seq
+   *          the sequence with which this feature is associated
+   * @param gffColumns
+   *          the column values of the GFF line
+   * @param align
+   *          the alignment we are adding GFF to
+   * @param newseqs
+   *          any new sequences referenced by the GFF
+   * @param relaxedIdMatching
+   *          if true, match word tokens in sequence names
+   * @return always null, to indicate that no sequence feature should be added
+   *         for a similarity line (it is only processed to create mappings)
+   */
+  @Override
+  public SequenceFeature processGff(SequenceI seq, String[] gffColumns,
+          AlignmentI align, List<SequenceI> newseqs,
+          boolean relaxedIdMatching)
+  {
+    /*
+     * guard against a truncated line; without the attributes column there
+     * is no Query/Target/Align data to build a mapping from
+     */
+    if (gffColumns.length <= ATTRIBUTES_COL)
+    {
+      System.err
+              .println("Ignoring exonerate similarity feature with no attributes column");
+      return null;
+    }
+
+    String attr = gffColumns[ATTRIBUTES_COL];
+    Map<String, List<String>> set = parseNameValuePairs(attr);
+
+    try
+    {
+      processGffSimilarity(set, seq, gffColumns, align, newseqs,
+              relaxedIdMatching);
+    } catch (IOException ivfe)
+    {
+      System.err.println(ivfe);
+    }
+
+    /*
+     * return null to indicate we don't want to add a sequence feature for
+     * similarity (only process it to create mappings)
+     */
+    return null;
+  }
+
+  /**
+   * Processes the 'Query' (or 'Target') and 'Align' properties associated with
+   * an exonerate GFF similarity feature; these properties define the mapping of
+   * the annotated range to a related sequence.
+   * 
+   * @param set
+   *          parsed GFF column 9 key/value(s)
+   * @param seq
+   *          the sequence the GFF feature is on
+   * @param gff
+   *          the GFF column data
+   * @param align
+   *          the alignment the sequence belongs to, where any new mappings
+   *          should be added
+   * @param newseqs
+   *          a list of new 'virtual sequences' generated while parsing GFF
+   * @param relaxedIdMatching
+   *          if true allow fuzzy search for a matching target sequence
+   * @throws IOException
+   *           if the exonerate model is not one we handle, if there is not
+   *           exactly one Query or Target sequence, or if there is no Align
+   *           descriptor
+   */
+  protected void processGffSimilarity(
+          Map<String, List<String>> set,
+          SequenceI seq, String[] gff, AlignmentI align,
+          List<SequenceI> newseqs, boolean relaxedIdMatching)
+          throws IOException
+  {
+    /*
+     * exonerate may be run with
+     * --showquerygff - outputs 'features on the query' e.g. (protein2genome)  
+     *    Target <dnaseqid> ; Align proteinStartPos dnaStartPos proteinCount  
+     * --showtargetgff - outputs 'features on the target' e.g. (protein2genome)
+     *    Query <proteinseqid> ; Align dnaStartPos proteinStartPos nucleotideCount
+     * where the Align spec may repeat 
+     */
+    // TODO handle coding2coding and similar as well
+    boolean featureIsOnTarget = true;
+    List<String> mapTo = set.get(QUERY);
+    if (mapTo == null)
+    {
+      mapTo = set.get(TARGET);
+      featureIsOnTarget = false;
+    }
+    MappingType type = getMappingType(gff[SOURCE_COL]);
+
+    if (type == null)
+    {
+      throw new IOException("Sorry, I don't handle " + gff[SOURCE_COL]);
+    }
+
+    if (mapTo == null || mapTo.size() != 1)
+    {
+      throw new IOException(
+              "Expecting exactly one sequence in Query or Target field (got "
+                      + mapTo + ")");
+    }
+
+    /*
+     * check we have something to map before looking up (and possibly
+     * creating) the mapped sequence and its mapping
+     */
+    List<String> alignedRegions = set.get(ALIGN);
+    if (alignedRegions == null)
+    {
+      throw new IOException(
+              "Expecting at least one Align descriptor for similarity to "
+                      + mapTo.get(0));
+    }
+
+    /*
+     * locate the mapped sequence in the alignment or 'new' (GFF file) sequences; 
+     */
+    SequenceI mappedSequence = findSequence(mapTo.get(0), align, newseqs,
+            relaxedIdMatching);
+
+    /*
+     * If mapping is from protein to dna, we store it as dna to protein instead
+     */
+    SequenceI mapFromSequence = seq;
+    SequenceI mapToSequence = mappedSequence;
+    if ((type == MappingType.NucleotideToPeptide && featureIsOnTarget)
+            || (type == MappingType.PeptideToNucleotide && !featureIsOnTarget))
+    {
+      mapFromSequence = mappedSequence;
+      mapToSequence = seq;
+    }
+
+    /*
+     * Process the Align maps and create mappings.
+     * These may be cdna-genome, cdna-protein, genome-protein.
+     * The mapped sequences may or may not be in the alignment
+     * (they may be included later in the GFF file).
+     */
+
+    /*
+     * get any existing mapping for these sequences (or start one),
+     * and add this mapped range
+     */
+    AlignedCodonFrame acf = getMapping(align, mapFromSequence,
+            mapToSequence);
+
+    /*
+     * exonerate GFF has the strand of the target in column 7
+     * (this differs from GFF3, which has it in the Target descriptor)
+     */
+    String strand = gff[STRAND_COL];
+    boolean forwardStrand = true;
+    if ("-".equals(strand))
+    {
+      forwardStrand = false;
+    }
+    else if (!"+".equals(strand))
+    {
+      System.err.println("Strand must be specified for alignment");
+      return;
+    }
+
+    for (String region : alignedRegions)
+    {
+      MapList mapping = buildMapping(region, type, forwardStrand,
+              featureIsOnTarget, gff);
+
+      if (mapping == null)
+      {
+        // malformed descriptor (already reported) - skip it
+        continue;
+      }
+
+      acf.addMap(mapFromSequence, mapToSequence, mapping);
+    }
+    align.addCodonFrame(acf);
+  }
+
+  /**
+   * Builds the mapping described by one exonerate 'Align' descriptor, which
+   * holds three space-separated integers "fromStart toStart fromCount". A
+   * message is written to stderr, and null returned, if the descriptor does
+   * not have exactly three fields or any of them is not an integer.
+   * 
+   * @param region
+   *          the Align descriptor text
+   * @param type
+   *          supplies the from:to ratio used to scale the aligned length
+   * @param forwardStrand
+   *          if false, the 'to' range runs downwards from its start position
+   * @param featureIsOnTarget
+   *          if true, the first two descriptor values swap roles (the first
+   *          is used as the 'to' start, the second as the 'from' start)
+   * @param gff
+   *          the GFF column data (not used here)
+   * @return the mapping, or null if the descriptor could not be parsed
+   */
+  protected MapList buildMapping(String region, MappingType type,
+          boolean forwardStrand, boolean featureIsOnTarget, String[] gff)
+  {
+    String[] fields = region.split(" ");
+    if (fields.length != 3)
+    {
+      System.err.println("Malformed Align descriptor: " + region);
+      return null;
+    }
+
+    int firstPos;
+    int secondPos;
+    int count;
+    try
+    {
+      firstPos = Integer.parseInt(fields[0]);
+      secondPos = Integer.parseInt(fields[1]);
+      count = Integer.parseInt(fields[2]);
+    } catch (NumberFormatException nfe)
+    {
+      System.err.println(nfe.toString());
+      return null;
+    }
+
+    /*
+     * +1 to step up the 'to' sequence, -1 to step down it (reverse strand)
+     */
+    int step = forwardStrand ? 1 : -1;
+
+    int fromStart;
+    int fromEnd;
+    int toStart;
+    int toEnd;
+
+    if (!featureIsOnTarget)
+    {
+      // the 'Align' values are used here, not the feature start/end
+      // (it is not clear why they may differ but it seems they can)
+      fromStart = firstPos;
+      fromEnd = firstPos + count - 1;
+      int toLength = count * type.getToRatio() / type.getFromRatio();
+      toStart = secondPos;
+      toEnd = toStart + step * (toLength - 1);
+    }
+    else
+    {
+      // feature is on the target sequence, so invert the sense
+      fromStart = secondPos;
+      toStart = firstPos;
+      toEnd = toStart + step * (count - 1);
+      int toLength = Math.abs(toEnd - toStart) + 1;
+      int fromLength = toLength * type.getFromRatio() / type.getToRatio();
+      fromEnd = fromStart + fromLength - 1;
+    }
+
+    return constructMappingFromAlign(fromStart, fromEnd, toStart, toEnd,
+            type);
+  }
+
+  /**
+   * Returns a MappingType depending on the exonerate 'model' value, or null
+   * if the model is null or not one we handle. Matching ignores case, so that
+   * this method accepts the same models as {@code recognises}.
+   * 
+   * @param model
+   *          the GFF source column value e.g. exonerate:protein2genome:local
+   * @return
+   */
+  protected static MappingType getMappingType(String model)
+  {
+    if (model == null)
+    {
+      return null;
+    }
+
+    MappingType result = null;
+    String mdl = model.toLowerCase();
+
+    if (mdl.contains(PROTEIN2DNA) || mdl.contains(PROTEIN2GENOME))
+    {
+      result = MappingType.PeptideToNucleotide;
+    }
+    else if (mdl.contains(CODING2CODING)
+            || mdl.contains(CODING2GENOME)
+            || mdl.contains(CDNA2GENOME)
+            || mdl.contains(GENOME2GENOME))
+    {
+      result = MappingType.NucleotideToNucleotide;
+    }
+    return result;
+  }
+
+ /**
+ * Tests whether the GFF data looks like it was generated by exonerate, and is
+ * a format we are willing to handle
+ *
+ * @param sf
+ * @return
+ */
+ public static boolean recognises(String[] columns)
+ {
+ if (!SIMILARITY.equalsIgnoreCase(columns[TYPE_COL]))
+ {
+ return false;
+ }
+
+ /*
+ * inspect alignment model
+ */
+ String model = columns[SOURCE_COL];
+ // e.g. exonerate:protein2genome:local
+ if (model != null)
+ {
+ String mdl = model.toLowerCase();
+ if (mdl.contains(PROTEIN2DNA) || mdl.contains(PROTEIN2GENOME)
+ || mdl.contains(CODING2CODING)
+ || mdl.contains(CODING2GENOME)
+ || mdl.contains(CDNA2GENOME)
+ || mdl.contains(GENOME2GENOME))
+ {
+ return true;
+ }
+ }
+ System.err.println("Sorry, I don't handle exonerate model " + model);
+ return false;
+ }
+
+  /**
+   * Builds the sequence feature as the superclass does, then sets its feature
+   * group to "exonerate".
+   * 
+   * @param gff
+   *          the GFF column data
+   * @param set
+   *          parsed GFF column 9 name/value(s), or null
+   * @return the sequence feature, or null if the superclass could not build
+   *         one (GffHelperBase returns null if start or end is not a valid
+   *         integer)
+   */
+  @Override
+  protected SequenceFeature buildSequenceFeature(String[] gff,
+          Map<String, List<String>> set)
+  {
+    SequenceFeature sf = super.buildSequenceFeature(gff, set);
+    if (sf != null)
+    {
+      sf.setFeatureGroup("exonerate");
+    }
+    return sf;
+  }
+
+}
--- /dev/null
+package jalview.io.gff;
+
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+public class Gff2Helper extends GffHelperBase
+{
+ /**
+ * GFF2 uses space character to delimit name/value pairs on column 9
+ *
+ * @param text
+ * @return
+ */
+ public static Map<String, List<String>> parseNameValuePairs(String text)
+ {
+ // TODO: can a value include a comma? if so it will be broken by this
+ return parseNameValuePairs(text, ";", ' ', ",");
+ }
+
+ /**
+ * Return ' ' as the name-value separator used in column 9 attributes.
+ */
+ @Override
+ protected char getNameValueSeparator()
+ {
+ return ' ';
+ }
+
+ /**
+ * Default processing if not overridden is just to construct a sequence
+ * feature
+ */
+ @Override
+ public SequenceFeature processGff(SequenceI seq, String[] gff,
+ AlignmentI align, List<SequenceI> newseqs,
+ boolean relaxedIdMatching) throws IOException
+ {
+ Map<String, List<String>> attributes = null;
+ if (gff.length > ATTRIBUTES_COL)
+ {
+ attributes = parseNameValuePairs(gff[ATTRIBUTES_COL]);
+ }
+ return buildSequenceFeature(gff, attributes);
+ }
+
+}
--- /dev/null
+package jalview.io.gff;
+
+import jalview.datamodel.AlignedCodonFrame;
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.MappingType;
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
+import jalview.util.MapList;
+import jalview.util.StringUtils;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Base class with generic / common functionality for processing GFF3 data.
+ * Override this as required for any specialisations resulting from
+ * peculiarities of GFF3 generated by particular tools.
+ */
+public class Gff3Helper extends GffHelperBase
+{
+ // name of the column 9 attribute holding the match target ("seqid start end [strand]")
+ protected static final String TARGET = "Target";
+
+ // name of the column 9 'ID' attribute
+ protected static final String ID = "ID";
+
+ // name of the column 9 'Name' attribute (read as the accession id of a protein match)
+ private static final String NAME = "Name";
+
+ /**
+ * Parses GFF3 column 9 text into a map of name to value(s). GFF3 uses ';' to
+ * delimit name/value pairs, '=' to separate a name from its value(s), and
+ * comma to separate multiple values for a name.
+ *
+ * @param text
+ *          the column 9 attributes text
+ * @return a map of values keyed by attribute name, as built by
+ *         GffHelperBase.parseNameValuePairs
+ */
+ public static Map<String, List<String>> parseNameValuePairs(String text)
+ {
+ return parseNameValuePairs(text, ";", '=', ",");
+ }
+
+  /**
+   * Processes one line of GFF3 column data. Features on the reverse strand are
+   * skipped. Otherwise, a line with exactly 9 columns has its attributes
+   * parsed and is handled as a protein match, a nucleotide match, or a plain
+   * sequence feature, depending on its Sequence Ontology type; a line with any
+   * other number of columns just becomes a sequence feature without
+   * attributes.
+   * 
+   * @param seq
+   *          the sequence with which this feature is associated
+   * @param gff
+   *          the GFF column data
+   * @param align
+   *          the alignment we are adding GFF to
+   * @param newseqs
+   *          any new sequences referenced by the GFF
+   * @param relaxedIdMatching
+   *          if true, match word tokens in sequence names
+   * @return a sequence feature if one should be added to the sequence, else
+   *         null
+   * @throws IOException
+   */
+  @Override
+  public SequenceFeature processGff(SequenceI seq, String[] gff,
+          AlignmentI align, List<SequenceI> newseqs,
+          boolean relaxedIdMatching) throws IOException
+  {
+    /*
+     * (For now) we don't process mappings from reverse complement ; to do
+     * this would require (a) creating a virtual sequence placeholder for
+     * the reverse complement (b) resolving the sequence by its id from some
+     * source (GFF ##FASTA or other) (c) creating the reverse complement
+     * sequence (d) updating the mapping to be to the reverse complement
+     */
+    // NOTE(review): this guard also discards reverse strand features that are
+    // not matches - confirm that is intended
+    if ("-".equals(gff[STRAND_COL]))
+    {
+      System.err
+              .println("Skipping mapping from reverse complement as not yet supported");
+      return null;
+    }
+
+    if (gff.length != 9)
+    {
+      /*
+       * fall back on generating a sequence feature with no special processing
+       */
+      return buildSequenceFeature(gff, null);
+    }
+
+    String featureType = gff[TYPE_COL];
+    Map<String, List<String>> attributes = parseNameValuePairs(gff[ATTRIBUTES_COL]);
+
+    if (SequenceOntology.getInstance().isProteinMatch(featureType))
+    {
+      return processProteinMatch(attributes, seq, gff, align, newseqs,
+              relaxedIdMatching);
+    }
+    if (SequenceOntology.getInstance().isNucleotideMatch(featureType))
+    {
+      return processNucleotideMatch(attributes, seq, gff, align, newseqs,
+              relaxedIdMatching);
+    }
+    return buildSequenceFeature(gff, attributes);
+  }
+
+  /**
+   * Processes one GFF3 nucleotide (e.g. cDNA to genome) match. For each
+   * 'Target' value ("seqid start end [strand]") a mapping is added between the
+   * feature's sequence range and the target range. Matches on the reverse
+   * strand of the feature's sequence are not (yet) supported and are skipped.
+   * 
+   * @param attributes
+   *          parsed GFF column 9 key/value(s)
+   * @param seq
+   *          the sequence the GFF feature is on
+   * @param gffColumns
+   *          the GFF column data
+   * @param align
+   *          the alignment the sequence belongs to, where any new mappings
+   *          should be added
+   * @param newseqs
+   *          a list of new 'virtual sequences' generated while parsing GFF
+   * @param relaxedIdMatching
+   *          if true allow fuzzy search for a matching target sequence
+   * @return a sequence feature, if one should be added to the sequence, else
+   *         null (reverse strand feature, or no 'Target' attribute)
+   * @throws IOException
+   */
+  protected SequenceFeature processNucleotideMatch(
+          Map<String, List<String>> attributes, SequenceI seq,
+          String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
+          boolean relaxedIdMatching) throws IOException
+  {
+    /*
+     * the GFF strand column holds '+', '-' or '.', so test for '-';
+     * the non-standard '-1' is still accepted, as it was previously
+     */
+    String strand = gffColumns[STRAND_COL];
+    if ("-".equals(strand) || "-1".equals(strand))
+    {
+      System.err
+              .println("Currently ignoring mappings from reverse complement");
+      return null;
+    }
+
+    List<String> targets = attributes.get(TARGET);
+    if (targets == null)
+    {
+      System.err.println("'Target' missing in GFF");
+      return null;
+    }
+
+    /*
+     * Typically we only expect one Target per GFF line, but this can handle
+     * multiple matches, to the same or different sequences (e.g. dna variants)
+     */
+    for (String target : targets)
+    {
+      /*
+       * Process "seqid start end [strand]"
+       */
+      String[] tokens = target.split(" ");
+      if (tokens.length < 3)
+      {
+        System.err.println("Incomplete Target: " + target);
+        continue;
+      }
+
+      /*
+       * Locate the mapped sequence in the alignment, or as a
+       * (new or existing) virtual sequence in the newseqs list
+       */
+      String targetId = findTargetId(tokens[0], attributes);
+      SequenceI mappedSequence = findSequence(targetId, align, newseqs,
+              relaxedIdMatching);
+      if (mappedSequence == null)
+      {
+        continue;
+      }
+
+      /*
+       * get any existing mapping for these sequences (or start one),
+       * and add this mapped range
+       */
+      AlignedCodonFrame acf = getMapping(align, seq, mappedSequence);
+
+      try
+      {
+        int toStart = Integer.parseInt(tokens[1]);
+        int toEnd = Integer.parseInt(tokens[2]);
+        if (tokens.length > 3 && "-".equals(tokens[3]))
+        {
+          // mapping to reverse strand - swap start/end
+          int temp = toStart;
+          toStart = toEnd;
+          toEnd = temp;
+        }
+
+        int fromStart = Integer.parseInt(gffColumns[START_COL]);
+        int fromEnd = Integer.parseInt(gffColumns[END_COL]);
+        MapList mapping = constructMappingFromAlign(fromStart, fromEnd,
+                toStart, toEnd, MappingType.NucleotideToNucleotide);
+
+        // mapping is null if the range lengths could not be reconciled
+        if (mapping != null)
+        {
+          acf.addMap(seq, mappedSequence, mapping);
+          align.addCodonFrame(acf);
+        }
+      } catch (NumberFormatException nfe)
+      {
+        System.err.println("Invalid start or end in Target " + target);
+      }
+    }
+
+    return buildSequenceFeature(gffColumns, attributes);
+  }
+
+ /**
+ * Returns the target sequence id extracted from the GFF name/value pairs.
+ * Default (standard behaviour) is the first token for "Target". This may be
+ * overridden where tools report this in a non-standard way.
+ *
+ * @param target
+ * first token of a "Target" value from GFF column 9, typically
+ * "seqid start end"
+ * @param set
+ * a map with all parsed column 9 attributes (not used by this
+ * default implementation)
+ * @return the target value as supplied
+ */
+ @SuppressWarnings("unused")
+ protected String findTargetId(String target, Map<String, List<String>> set)
+ {
+ return target;
+ }
+
+  /**
+   * Processes one GFF 'protein_match'; fields of interest are
+   * <ul>
+   * <li>feature group - the database reporting a match e.g. Pfam</li>
+   * <li>Name - the matched entry's accession id in the database</li>
+   * <li>ID - a sequence identifier for the matched region (which may be
+   * appended as FASTA in the GFF file)</li>
+   * </ul>
+   * For each 'Target' whose sequence can be resolved, a copy of the feature
+   * (with range adjusted to start at 1) is added to the mapped sequence, and a
+   * 1:1 mapping from the feature's range is added to the alignment.
+   * 
+   * @param set
+   *          parsed GFF column 9 key/value(s)
+   * @param seq
+   *          the sequence the GFF feature is on
+   * @param gffColumns
+   *          the GFF column data
+   * @param align
+   *          the alignment the sequence belongs to, where any new mappings
+   *          should be added
+   * @param newseqs
+   *          a list of new 'virtual sequences' generated while parsing GFF
+   * @param relaxedIdMatching
+   *          if true allow fuzzy search for a matching target sequence
+   * @return the sequence feature built from the GFF columns, or null if one
+   *         could not be built (e.g. invalid start or end position)
+   */
+  protected SequenceFeature processProteinMatch(
+          Map<String, List<String>> set, SequenceI seq,
+          String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
+          boolean relaxedIdMatching)
+  {
+    // This is currently tailored to InterProScan GFF output:
+    // ID holds the ID of the matched sequence, Target references the
+    // query sequence; this looks wrong, as ID should just be the GFF internal
+    // ID of the GFF feature, while Target would normally reference the matched
+    // sequence.
+    // TODO refactor as needed if other protein-protein GFF varies
+
+    SequenceFeature sf = buildSequenceFeature(gffColumns, set);
+    if (sf == null)
+    {
+      // invalid GFF data (already reported) - nothing to copy or map
+      return null;
+    }
+
+    List<String> targets = set.get(TARGET);
+    if (targets == null)
+    {
+      return sf;
+    }
+
+    int sequenceFeatureLength = 1 + sf.getEnd() - sf.getBegin();
+
+    for (String target : targets)
+    {
+      /*
+       * locate the mapped sequence in the alignment, or as a
+       * (new or existing) virtual sequence in the newseqs list
+       */
+      SequenceI mappedSequence = findSequence(findTargetId(target, set),
+              align, newseqs, relaxedIdMatching);
+      if (mappedSequence == null)
+      {
+        continue;
+      }
+
+      /*
+       * give the mapped sequence a copy of the sequence feature, with
+       * start/end range adjusted
+       */
+      SequenceFeature sf2 = new SequenceFeature(sf);
+      sf2.setBegin(1);
+      sf2.setEnd(sequenceFeatureLength);
+      mappedSequence.addSequenceFeature(sf2);
+
+      /*
+       * add a property to the mapped sequence so that it can eventually be
+       * renamed with its qualified accession id; renaming has to wait until
+       * all sequence reference resolution is complete
+       */
+      String accessionId = StringUtils.listToDelimitedString(
+              set.get(NAME), ",");
+      if (accessionId.length() > 0)
+      {
+        String database = sf.getType(); // TODO InterProScan only??
+        String qualifiedAccId = database + "|" + accessionId;
+        sf2.setValue(RENAME_TOKEN, qualifiedAccId);
+      }
+
+      /*
+       * get any existing mapping for these sequences (or start one),
+       * and add this mapped range
+       */
+      AlignedCodonFrame alco = getMapping(align, seq, mappedSequence);
+      int[] from = new int[] { sf.getBegin(), sf.getEnd() };
+      int[] to = new int[] { 1, sequenceFeatureLength };
+      MapList mapping = new MapList(from, to, 1, 1);
+
+      alco.addMap(seq, mappedSequence, mapping);
+      align.addCodonFrame(alco);
+    }
+
+    return sf;
+  }
+
+ /**
+ * Return '=' as the name-value separator used in column 9 attributes.
+ *
+ * @return the '=' character
+ */
+ @Override
+ protected char getNameValueSeparator()
+ {
+ return '=';
+ }
+
+  /**
+   * Modifies the default SequenceFeature in order to set the Target sequence
+   * id (the first space-delimited token of the 'Target' value, if any) as the
+   * description.
+   * 
+   * @param gff
+   *          the GFF column data
+   * @param attributes
+   *          parsed GFF column 9 name/value(s), or null
+   * @return the sequence feature, or null if the superclass could not build
+   *         one (e.g. invalid start or end position)
+   */
+  @Override
+  protected SequenceFeature buildSequenceFeature(String[] gff,
+          Map<String, List<String>> attributes)
+  {
+    SequenceFeature sf = super.buildSequenceFeature(gff, attributes);
+    if (sf == null)
+    {
+      return null;
+    }
+
+    String target = (String) sf.getValue(TARGET);
+    if (target != null)
+    {
+      sf.setDescription(target.split(" ")[0]);
+    }
+    return sf;
+  }
+}
--- /dev/null
+package jalview.io.gff;
+
+import jalview.analysis.SequenceIdMatcher;
+import jalview.datamodel.AlignedCodonFrame;
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.MappingType;
+import jalview.datamodel.SequenceDummy;
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
+import jalview.util.MapList;
+import jalview.util.StringUtils;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+
+/**
+ * Base class with common functionality for flavours of GFF handler (GFF2 or
+ * GFF3)
+ */
+public abstract class GffHelperBase implements GffHelperI
+{
+ // name of the attribute whose value(s) are used as the feature description
+ private static final String NOTE = "Note";
+
+ /*
+ * GFF columns 1-9 (zero-indexed):
+ */
+ protected static final int SEQID_COL = 0;
+
+ protected static final int SOURCE_COL = 1;
+
+ protected static final int TYPE_COL = 2;
+
+ protected static final int START_COL = 3;
+
+ protected static final int END_COL = 4;
+
+ protected static final int SCORE_COL = 5;
+
+ protected static final int STRAND_COL = 6;
+
+ protected static final int PHASE_COL = 7;
+
+ protected static final int ATTRIBUTES_COL = 8;
+
+ // the alignment for which 'matcher' was last constructed (see findSequence)
+ private AlignmentI lastmatchedAl = null;
+
+ // sequence id matcher used for relaxed id matching; rebuilt by findSequence
+ // when it is called with a different alignment
+ private SequenceIdMatcher matcher = null;
+
+  /**
+   * Builds a MapList for the given 'from' and 'to' ranges, or answers null
+   * (after reporting to stderr) if the range lengths cannot be reconciled with
+   * the mapping ratios.
+   * 
+   * @param fromStart
+   *          start of the range mapped from
+   * @param fromEnd
+   *          end of the range mapped from
+   * @param toStart
+   *          start of the range mapped to
+   * @param toEnd
+   *          end of the range mapped to
+   * @param mappingType
+   *          type of mapping (e.g. protein to nucleotide)
+   * @return the mapping, or null if the ranges are inconsistent
+   */
+  protected MapList constructMappingFromAlign(int fromStart, int fromEnd,
+          int toStart, int toEnd, MappingType mappingType)
+  {
+    int[] gffFrom = new int[] { fromStart, fromEnd };
+    int[] gffTo = new int[] { toStart, toEnd };
+
+    /*
+     * Jalview always models from dna to protein, so switch values if the
+     * GFF mapping is from protein to dna
+     */
+    boolean proteinToDna = mappingType == MappingType.PeptideToNucleotide;
+    int[] from = proteinToDna ? gffTo : gffFrom;
+    int[] to = proteinToDna ? gffFrom : gffTo;
+    MappingType modelledType = proteinToDna ? mappingType.getInverse()
+            : mappingType;
+
+    int fromRatio = modelledType.getFromRatio();
+    int toRatio = modelledType.getToRatio();
+
+    /*
+     * sanity check that mapped residue counts match (trimming if possible)
+     * TODO understand why PASA generates such cases...
+     */
+    if (trimMapping(from, to, fromRatio, toRatio))
+    {
+      /*
+       * If a codon has an intron gap, there will be contiguous 'toRanges';
+       * this is handled for us by the MapList constructor.
+       * (It is not clear that exonerate ever generates this case)
+       */
+      return new MapList(from, to, fromRatio, toRatio);
+    }
+
+    System.err.println("Ignoring mapping from " + Arrays.toString(from)
+            + " to " + Arrays.toString(to) + " as counts don't match!");
+    return null;
+  }
+
+  /**
+   * Verifies that the 'from' and 'to' ranges are of equivalent length, given
+   * the mapping ratios. If they are not, the end (second value) of the longer
+   * range is moved towards its start, provided that an exact match can be
+   * achieved; the truncation is reported on stderr. Note the supplied array
+   * may be modified.
+   * 
+   * @param from
+   *          [start, end] of the 'from' range (end may be less than start)
+   * @param to
+   *          [start, end] of the 'to' range (end may be less than start)
+   * @param fromRatio
+   *          number of 'from' positions per mapped unit
+   * @param toRatio
+   *          number of 'to' positions per mapped unit
+   * @return true if the ranges are (now) equivalent, false if they could not
+   *         be made so
+   */
+  protected static boolean trimMapping(int[] from, int[] to, int fromRatio,
+          int toRatio)
+  {
+    int fromSpan = 1 + Math.abs(from[1] - from[0]);
+    int toSpan = 1 + Math.abs(to[1] - to[0]);
+    int excess = fromSpan * toRatio - toSpan * fromRatio;
+    if (excess == 0)
+    {
+      return true;
+    }
+
+    /*
+     * decide which range is too long, and by how much; give up if it
+     * can't be cut back to an exact match
+     */
+    int[] tooLong;
+    int trimBy;
+    String message;
+    if (excess > 0)
+    {
+      if (excess % toRatio != 0)
+      {
+        return false;
+      }
+      tooLong = from;
+      trimBy = excess / toRatio;
+      message = "Truncating mapping from ";
+    }
+    else
+    {
+      if (excess % fromRatio != 0)
+      {
+        return false;
+      }
+      tooLong = to;
+      trimBy = -excess / fromRatio;
+      message = "Truncating mapping to ";
+    }
+
+    /*
+     * it's kind of arbitrary which end we truncate - here it is the end
+     */
+    System.err.print(message + Arrays.toString(tooLong) + " to ");
+    boolean ascending = tooLong[1] > tooLong[0];
+    tooLong[1] += ascending ? -trimBy : trimBy;
+    System.err.println(Arrays.toString(tooLong));
+    return true;
+  }
+
+  /**
+   * Finds, or creates a placeholder for, the sequence with the given id:
+   * <ul>
+   * <li>with strict matching, looks up the alignment by name, then looks for
+   * a sequence with exactly this name in the new sequences list</li>
+   * <li>with relaxed matching, uses a SequenceIdMatcher over the alignment
+   * sequences and new sequences; the matcher is rebuilt only when called with
+   * a different alignment than last time</li>
+   * <li>if nothing is found, and a new sequences list was supplied, creates a
+   * placeholder (dummy) sequence, adds it to that list, and returns it</li>
+   * </ul>
+   * 
+   * @param seqId
+   *          the sequence id to look for
+   * @param align
+   *          the alignment to search
+   * @param newseqs
+   *          sequences generated so far while parsing (may be null)
+   * @param relaxedIdMatching
+   *          if true, match using SequenceIdMatcher rather than exact name
+   * 
+   * @return the matched or placeholder sequence, or null if seqId is null, or
+   *         if there is no match and newseqs is null
+   */
+  protected SequenceI findSequence(String seqId, AlignmentI align,
+          List<SequenceI> newseqs, boolean relaxedIdMatching)
+  {
+    if (seqId == null)
+    {
+      return null;
+    }
+
+    SequenceI found;
+    if (!relaxedIdMatching)
+    {
+      found = align.findName(seqId, true);
+      if (found == null && newseqs != null)
+      {
+        for (SequenceI candidate : newseqs)
+        {
+          if (seqId.equals(candidate.getName()))
+          {
+            return candidate;
+          }
+        }
+      }
+    }
+    else
+    {
+      if (lastmatchedAl != align)
+      {
+        // first call for this alignment - (re)build the id matcher
+        lastmatchedAl = align;
+        matcher = new SequenceIdMatcher(align.getSequencesArray());
+        if (newseqs != null)
+        {
+          matcher.addAll(newseqs);
+        }
+      }
+      found = matcher.findIdMatch(seqId);
+    }
+
+    if (found != null || newseqs == null)
+    {
+      return found;
+    }
+
+    /*
+     * no match - make a placeholder sequence, known to the matcher (if in
+     * use) and recorded in the new sequences list
+     */
+    SequenceI placeholder = new SequenceDummy(seqId);
+    if (relaxedIdMatching)
+    {
+      matcher.addAll(Arrays.asList(new SequenceI[] { placeholder }));
+    }
+    newseqs.add(placeholder);
+    return placeholder;
+  }
+
+  /**
+   * Parses the input line to a map of name / value(s) pairs. For example the
+   * line <br>
+   * Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal <br>
+   * if parsed with namesDelimiter=";", nameValueSeparator='=' and
+   * valuesDelimiter="," <br>
+   * would return a map with { Notes={Fe-S, Metal}, Method={manual curation,
+   * prediction}, source={Pfam}} <br>
+   * Names, and the text following the separator, are trimmed of leading and
+   * trailing spaces; the individual values of a delimited list are not (so
+   * the second 'Method' value above keeps its leading space). Pairs with no
+   * separator, or with nothing after the separator, are ignored.
+   * 
+   * This method supports parsing of either GFF2 format (which uses space ' ' as
+   * the name/value delimiter, and allows multiple occurrences of the same
+   * name), or GFF3 format (which uses '=' as the name/value delimiter, and
+   * strictly does not allow repeat occurrences of the same name - but does
+   * allow a comma-separated list of values).
+   * 
+   * @param text
+   *          the text to parse (may be null)
+   * @param namesDelimiter
+   *          the major delimiter between name-value pairs (used as a regular
+   *          expression by String.split)
+   * @param nameValueSeparator
+   *          the separator between name and value; the first occurrence in
+   *          each pair is used
+   * @param valuesDelimiter
+   *          delimits a list of more than one value (used as a regular
+   *          expression by String.split)
+   * @return the name-values map (which may be empty but never null)
+   */
+  public static Map<String, List<String>> parseNameValuePairs(String text,
+          String namesDelimiter, char nameValueSeparator,
+          String valuesDelimiter)
+  {
+    Map<String, List<String>> parsed = new HashMap<String, List<String>>();
+    if (text == null)
+    {
+      return parsed;
+    }
+    String trimmedText = text.trim();
+    if (trimmedText.length() == 0)
+    {
+      return parsed;
+    }
+
+    for (String token : trimmedText.split(namesDelimiter))
+    {
+      String pair = token.trim();
+      int sepPos = pair.indexOf(nameValueSeparator);
+      if (pair.length() == 0 || sepPos == -1)
+      {
+        // empty, or no name=value present
+        continue;
+      }
+
+      String name = pair.substring(0, sepPos).trim();
+      String valueText = pair.substring(sepPos + 1).trim();
+      if (valueText.length() == 0)
+      {
+        continue;
+      }
+
+      List<String> valueList = parsed.get(name);
+      if (valueList == null)
+      {
+        valueList = new ArrayList<String>();
+        parsed.put(name, valueList);
+      }
+      valueList.addAll(Arrays.asList(valueText.split(valuesDelimiter)));
+    }
+    return parsed;
+  }
+
+  /**
+   * Constructs a SequenceFeature from the GFF column data: type and source
+   * become the feature type and description, source is also passed as the
+   * final constructor argument, and score is NaN if the score column is not
+   * numeric. If attributes are supplied, the raw column 9 text is saved on the
+   * feature, each attribute is stored as a value (multiple values joined with
+   * "; "), and any 'Note' attribute replaces the description. Subclasses may
+   * wish to call this method then adjust the SequenceFeature depending on the
+   * particular usage of different tools that generate GFF.
+   * 
+   * @param gff
+   *          the GFF column data
+   * @param attributes
+   *          parsed column 9 name/value(s), or null
+   * @return the sequence feature, or null (after reporting to stderr) if a
+   *         NumberFormatException occurs e.g. for an invalid start or end
+   */
+  protected SequenceFeature buildSequenceFeature(String[] gff,
+          Map<String, List<String>> attributes)
+  {
+    try
+    {
+      int begin = Integer.parseInt(gff[START_COL]);
+      int finish = Integer.parseInt(gff[END_COL]);
+
+      float score;
+      try
+      {
+        score = Float.parseFloat(gff[SCORE_COL]);
+      } catch (NumberFormatException noScore)
+      {
+        // e.g. '.' - use NaN to indicate no score
+        score = Float.NaN;
+      }
+
+      String source = gff[SOURCE_COL];
+      SequenceFeature feature = new SequenceFeature(gff[TYPE_COL], source,
+              begin, finish, score, source);
+      if (attributes == null)
+      {
+        return feature;
+      }
+
+      /*
+       * save 'raw' column 9 to allow roundtrip output as input
+       */
+      feature.setAttributes(gff[ATTRIBUTES_COL]);
+
+      /*
+       * Add attributes in column 9 to the sequence feature's
+       * 'otherData' table; use Note as a best proxy for description
+       */
+      for (Entry<String, List<String>> attribute : attributes.entrySet())
+      {
+        String name = attribute.getKey();
+        String joinedValues = StringUtils.listToDelimitedString(
+                attribute.getValue(), "; ");
+        feature.setValue(name, joinedValues);
+        if (NOTE.equals(name))
+        {
+          feature.setDescription(joinedValues);
+        }
+      }
+      return feature;
+    } catch (NumberFormatException nfe)
+    {
+      System.err.println("Invalid number in gff: " + nfe.getMessage());
+      return null;
+    }
+  }
+
+ /**
+ * Returns the character used to separate attribute names from values in GFF
+ * column 9. This is space for GFF2, '=' for GFF3.
+ *
+ * @return the name-value separator character for this flavour of GFF
+ */
+ protected abstract char getNameValueSeparator();
+
+  /**
+   * Answers the mapping the alignment already holds between the two given
+   * sequences or, if it has none, a new (empty) one. This lets several GFF
+   * lines that together describe one 'spliced' mapping extend the same
+   * mapping object as each line is read. Note a new mapping is not added to
+   * the alignment by this method.
+   * 
+   * @param align
+   *          the alignment which may hold an existing mapping
+   * @param fromSeq
+   *          the sequence mapped from
+   * @param toSeq
+   *          the sequence mapped to
+   * @return the existing mapping if there is one, else a new one
+   */
+  protected AlignedCodonFrame getMapping(AlignmentI align, SequenceI fromSeq, SequenceI toSeq)
+  {
+    AlignedCodonFrame existing = align.getMapping(fromSeq, toSeq);
+    return existing != null ? existing : new AlignedCodonFrame();
+  }
+
+}
--- /dev/null
+package jalview.io.gff;
+
+
+/**
+ * A factory that chooses the GFF helper class to use for a line of GFF data
+ */
+public class GffHelperFactory
+{
+
+  /**
+   * Answers a helper to process the GFF line, chosen by inspecting its column
+   * data. Tool-specific helpers (exonerate, then InterProScan) are tried
+   * first; failing those, a general-purpose GFF3 or GFF2 helper is returned.
+   * 
+   * @param gff
+   *          the GFF column data
+   * @return a helper, or null if the data is null or has fewer than 6 columns
+   */
+  public static GffHelperI getHelper(String[] gff)
+  {
+    if (gff == null || gff.length < 6)
+    {
+      return null;
+    }
+
+    if (ExonerateHelper.recognises(gff))
+    {
+      return new ExonerateHelper();
+    }
+    if (InterProScanHelper.recognises(gff))
+    {
+      return new InterProScanHelper();
+    }
+    if (looksLikeGff3(gff))
+    {
+      return new Gff3Helper();
+    }
+    return new Gff2Helper();
+  }
+
+  /**
+   * Heuristic rule: if column 9 seems to have Name=Value entries, assume this
+   * is GFF3. GFF3 uses '=' as name-value separator, GFF2 uses space ' '.
+   * 
+   * @param gff
+   *          the GFF column data
+   * @return true if there are at least 9 columns, and column 9 contains an
+   *         '=' that is not preceded by a ';'
+   */
+  protected static boolean looksLikeGff3(String[] gff)
+  {
+    if (gff.length < 9)
+    {
+      return false;
+    }
+
+    String column9 = gff[8].trim();
+    int firstEquals = column9.indexOf('=');
+    if (firstEquals == -1)
+    {
+      return false;
+    }
+
+    // want an '=' before the first ';' (if any)
+    // not foolproof as theoretically GFF2 could be like "Name Value=123;"
+    int firstSemicolon = column9.indexOf(';');
+    return firstSemicolon == -1 || firstEquals < firstSemicolon;
+  }
+
+}
--- /dev/null
+package jalview.io.gff;
+
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
+
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * An interface to describe common functionality of different flavours of GFF
+ *
+ * @author gmcarstairs
+ *
+ */
+public interface GffHelperI
+{
+
+ // key of a sequence feature property holding the name (qualified accession
+ // id) that a mapped sequence should eventually be renamed to
+ final String RENAME_TOKEN = "$RENAME_TO$";
+
+ /**
+ * Process one GFF feature line
+ *
+ * @param seq
+ * the sequence with which this feature is associated
+ * @param gffColumns
+ * the GFF column data
+ * @param align
+ * the alignment we are adding GFF to
+ * @param newseqs
+ * any new sequences referenced by the GFF
+ * @param relaxedIdMatching
+ * if true, match word tokens in sequence names
+ * @return a SequenceFeature if one should be created, else null
+ * @throws IOException
+ */
+ SequenceFeature processGff(SequenceI seq, String[] gffColumns,
+ AlignmentI align,
+ List<SequenceI> newseqs, boolean relaxedIdMatching)
+ throws IOException;
+
+ // java 8 will allow static methods in interfaces:
+ // static boolean recognises(String [] columns);
+}
--- /dev/null
+package jalview.io.gff;
+
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
+import jalview.util.StringUtils;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * A handler to parse GFF in the format generated by InterProScan
+ */
+public class InterProScanHelper extends Gff3Helper
+{
+ private static final String INTER_PRO_SCAN = "InterProScan";
+
+ private static final String SIGNATURE_DESC = "signature_desc";
+
+ /**
+ * Process one GFF feature line (as modelled by SequenceFeature)
+ *
+ * @param seq
+ * the sequence with which this feature is associated
+ * @param gff
+ * the gff column data
+ * @param align
+ * the alignment we are adding GFF to
+ * @param newseqs
+ * any new sequences referenced by the GFF
+ * @param relaxedIdMatching
+ * if true, match word tokens in sequence names
+ * @return a sequence feature if one should be added to the sequence, else
+ * null (i.e. it has been processed in another way e.g. to generate a
+ * mapping)
+ * @throws IOException
+ */
+ @Override
+ public SequenceFeature processGff(SequenceI seq, String[] gff,
+ AlignmentI align, List<SequenceI> newseqs,
+ boolean relaxedIdMatching) throws IOException
+ {
+ /*
+ * ignore the 'polypeptide' match of the whole sequence
+ */
+ if (".".equals(gff[SOURCE_COL]))
+ {
+ return null;
+ }
+
+ return super.processGff(seq, gff, align, newseqs, relaxedIdMatching);
+ }
+
+ /**
+ *
+ */
+ @Override
+ protected SequenceFeature buildSequenceFeature(String[] gff,
+ Map<String, List<String>> attributes)
+ {
+ SequenceFeature sf = super.buildSequenceFeature(gff, attributes);
+
+ /*
+ * signature_desc is a more informative source of description
+ */
+ List<String> desc = attributes.get(SIGNATURE_DESC);
+ String description = StringUtils.listToDelimitedString(desc, ", ");
+ if (description.length() > 0)
+ {
+ sf.setDescription(description);
+ }
+
+ /*
+ * Set sequence feature group as 'InterProScan', and type as the source
+ * database for this match (e.g. 'Pfam')
+ */
+ sf.setType(gff[SOURCE_COL]);
+ sf.setFeatureGroup(INTER_PRO_SCAN);
+
+ return sf;
+ }
+
+ /**
+ * Tests whether the GFF data looks like it was generated by InterProScan
+ *
+ * @param columns
+ * @return
+ */
+ public static boolean recognises(String[] columns)
+ {
+ SequenceOntology so = SequenceOntology.getInstance();
+ String type = columns[TYPE_COL];
+ if (so.isProteinMatch(type)
+ || (".".equals(columns[SOURCE_COL]) && so.isPolypeptide(type)))
+ {
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Overriden method, because InterProScan GFF has the target sequence id in
+ * GFF field 'ID' rather than the usual 'Target' :-O
+ */
+ @Override
+ protected String findTargetId(String target, Map<String, List<String>> set)
+ {
+ List<String> ids = set.get(ID);
+ if (ids == null || ids.size() != 1)
+ {
+ return null;
+ }
+ return ids.get(0);
+ }
+
+}
package jalview.util;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.List;
-import java.util.Map;
import java.util.regex.Pattern;
public class StringUtils
}
/**
- * Parses the input line to a map of name / value(s) pairs. For example the
- * line <br>
- * Notes=Fe-S;Method=manual curation; source = Pfam; Notes = Metal <br>
- * if parsed with delimiter=";" and separators {' ', '='} <br>
- * would return a map with { Notes={Fe=S, Metal}, Method={manual curation},
- * source={Pfam}} <br>
- * Note the name/value strings are trimmed of leading / trailing spaces; the
- * first separator encountered is used
- *
- * @param line
- * @param delimiter
- * the major delimiter between name-value pairs
- * @param separators
- * one or more separators used between name and value
- * @return the name-values map (which may be empty but never null)
- */
- public static Map<String, List<String>> parseNameValuePairs(String line,
- String delimiter, char[] separators)
- {
- Map<String, List<String>> map = new HashMap<String, List<String>>();
- if (line == null || line.trim().length() == 0)
- {
- return map;
- }
-
- for (String pair : line.trim().split(delimiter))
- {
- pair = pair.trim();
- if (pair.length() == 0)
- {
- continue;
- }
-
- int sepPos = -1;
- for (char sep : separators)
- {
- int pos = pair.indexOf(sep);
- if (pos > -1 && (sepPos == -1 || pos < sepPos))
- {
- sepPos = pos;
- }
- }
-
- if (sepPos == -1)
- {
- // no name=value detected
- continue;
- }
-
- String key = pair.substring(0, sepPos).trim();
- String value = pair.substring(sepPos + 1).trim();
- if (value.length() > 0)
- {
- List<String> vals = map.get(key);
- if (vals == null)
- {
- vals = new ArrayList<String>();
- map.put(key, vals);
- }
- vals.add(value);
- }
- }
- return map;
- }
-
- /**
* Converts a list to a string with a delimiter before each term except the
* first. Returns an empty string given a null or zero-length argument. This
* can be replaced with StringJoiner in Java 8.
--- /dev/null
+package jalview.datamodel;
+
+import static org.testng.AssertJUnit.assertEquals;
+import static org.testng.AssertJUnit.assertSame;
+
+import jalview.datamodel.MappingType;
+
+import org.testng.annotations.Test;
+
+/**
+ * Unit tests for the MappingType enumeration
+ */
+public class MappingTypeTest
+{
+
+  /**
+   * The inverse of a mapping type swaps its 'from' and 'to' molecule types
+   */
+  @Test(groups = "Functional")
+  public void testGetInverse()
+  {
+    MappingType dnaToProtein = MappingType.NucleotideToPeptide;
+    MappingType proteinToDna = MappingType.PeptideToNucleotide;
+    MappingType dnaToDna = MappingType.NucleotideToNucleotide;
+    MappingType proteinToProtein = MappingType.PeptideToPeptide;
+
+    assertSame(proteinToDna, dnaToProtein.getInverse());
+    assertSame(dnaToProtein, proteinToDna.getInverse());
+    assertSame(dnaToDna, dnaToDna.getInverse());
+    assertSame(proteinToProtein, proteinToProtein.getInverse());
+  }
+
+  /**
+   * The 'from' ratio is 3 only when mapping from nucleotide to peptide
+   */
+  @Test(groups = "Functional")
+  public void testGetFromRatio()
+  {
+    final int one = 1;
+    final int codonLength = 3;
+    assertEquals(one, MappingType.NucleotideToNucleotide.getFromRatio());
+    assertEquals(one, MappingType.PeptideToNucleotide.getFromRatio());
+    assertEquals(one, MappingType.PeptideToPeptide.getFromRatio());
+    assertEquals(codonLength, MappingType.NucleotideToPeptide.getFromRatio());
+  }
+
+  /**
+   * The 'to' ratio is 3 only when mapping from peptide to nucleotide
+   */
+  @Test(groups = "Functional")
+  public void testGetToRatio()
+  {
+    final int one = 1;
+    final int codonLength = 3;
+    assertEquals(one, MappingType.NucleotideToNucleotide.getToRatio());
+    assertEquals(codonLength, MappingType.PeptideToNucleotide.getToRatio());
+    assertEquals(one, MappingType.PeptideToPeptide.getToRatio());
+    assertEquals(one, MappingType.NucleotideToPeptide.getToRatio());
+  }
+}
String gffData = "##gff-version 3\n"
+ "FER_CAPAA\tuniprot\tMETAL\t39\t39\t0.0\t.\t.\t"
+ "Note=Iron-sulfur (2Fe-2S);Note=another note;evidence=ECO:0000255|PROSITE-ProRule:PRU00465\n"
- + "FER1_SOLLC\tuniprot\tPfam\t55\t130\t3.0\t.\t.";
+ + "FER1_SOLLC\tuniprot\tPfam\t55\t130\t3.0\t.\t.\tID=$23";
FeaturesFile featuresFile = new FeaturesFile(gffData,
FormatAdapter.PASTE);
assertTrue("Failed to parse features file",
assertEquals("no sequences extracted from GFF3 file", 2,
dataset.getHeight());
- SequenceI seq1 = dataset.findName("seq1"), seq2 = dataset
- .findName("seq2");
+ SequenceI seq1 = dataset.findName("seq1");
+ SequenceI seq2 = dataset.findName("seq2");
assertNotNull(seq1);
assertNotNull(seq2);
assertFalse(
assertTrue(
"Didn't read the alignment into an alignframe from Gff3 File",
af != null);
- // FIXME codon mappings are on the alignment but not on the dataset
- checkDatasetfromSimpleGff3(af.getViewport().getAlignment()/* .getDataset() */);
+ checkDatasetfromSimpleGff3(af.getViewport().getAlignment());
}
@Test(groups = { "Functional" })
parseResult);
checkDatasetfromSimpleGff3(dataset);
}
-
- /**
- * Tests loading exonerate GFF2 output, including 'similarity' alignment
- * feature, on to sequences
- */
- @Test(groups = { "Functional" })
- public void testExonerateImport()
- {
- FileLoader loader = new FileLoader(false);
- AlignFrame af = loader.LoadFileWaitTillLoaded(
- "examples/testdata/exonerateseqs.fa",
- FormatAdapter.FILE);
-
- af.loadJalviewDataFile("examples/testdata/exonerateoutput.gff",
- FormatAdapter.FILE, null, null);
-
- /*
- * verify one mapping to a dummy sequence, one to a real one
- */
- Set<AlignedCodonFrame> mappings = af
- .getViewport().getAlignment().getDataset().getCodonFrames();
- assertEquals(2, mappings.size());
- Iterator<AlignedCodonFrame> iter = mappings.iterator();
-
- // first mapping is to dummy sequence
- AlignedCodonFrame mapping = iter.next();
- Mapping[] mapList = mapping.getProtMappings();
- assertEquals(1, mapList.length);
- assertTrue(mapList[0].getTo() instanceof SequenceDummy);
- assertEquals("DDB_G0269124", mapList[0].getTo().getName());
-
- // second mapping is to a sequence in the alignment
- mapping = iter.next();
- mapList = mapping.getProtMappings();
- assertEquals(1, mapList.length);
- SequenceI proteinSeq = af.getViewport().getAlignment()
- .findName("DDB_G0280897");
- assertSame(proteinSeq.getDatasetSequence(), mapList[0].getTo());
- assertEquals(1, mapping.getdnaToProt().length);
-
- // 143 in protein should map to codon [11270, 11269, 11268] in dna
- int[] mappedRegion = mapList[0].getMap().locateInFrom(143, 143);
- assertArrayEquals(new int[] { 11270, 11268 }, mappedRegion);
-
- // 182 in protein should map to codon [11153, 11152, 11151] in dna
- mappedRegion = mapList[0].getMap().locateInFrom(182, 182);
- assertArrayEquals(new int[] { 11153, 11151 }, mappedRegion);
-
- // and the reverse mapping:
- mappedRegion = mapList[0].getMap().locateInTo(11151, 11153);
- assertArrayEquals(new int[] { 182, 182 }, mappedRegion);
-
- // 11150 in dna should _not_ map to protein
- mappedRegion = mapList[0].getMap().locateInTo(11150, 11150);
- assertNull(mappedRegion);
-
- // similarly 183 in protein should _not_ map to dna
- mappedRegion = mapList[0].getMap().locateInFrom(183, 183);
- assertNull(mappedRegion);
- }
}
--- /dev/null
+package jalview.io.gff;
+
+import static org.testng.AssertJUnit.assertEquals;
+import static org.testng.AssertJUnit.assertNull;
+import static org.testng.AssertJUnit.assertSame;
+import static org.testng.AssertJUnit.assertTrue;
+import static org.testng.internal.junit.ArrayAsserts.assertArrayEquals;
+
+import jalview.datamodel.AlignedCodonFrame;
+import jalview.datamodel.Alignment;
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.Mapping;
+import jalview.datamodel.MappingType;
+import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceDummy;
+import jalview.datamodel.SequenceI;
+import jalview.gui.AlignFrame;
+import jalview.io.FileLoader;
+import jalview.io.FormatAdapter;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.testng.annotations.Test;
+
+public class ExonerateHelperTest
+{
+ @Test(groups = "Functional")
+ public void testGetMappingType()
+ {
+ // protein-to-dna: a model name containing protein2genome or protein2dna (here with 'exonerate:' prefix and ':local' suffix) gives PeptideToNucleotide
+ assertSame(MappingType.PeptideToNucleotide,
+ ExonerateHelper
+ .getMappingType("exonerate:protein2genome:local"));
+ assertSame(MappingType.PeptideToNucleotide,
+ ExonerateHelper.getMappingType("exonerate:protein2dna:local"));
+
+ // dna-to-dna: bare model names (no prefix or suffix) are also accepted and give NucleotideToNucleotide
+ assertSame(MappingType.NucleotideToNucleotide,
+ ExonerateHelper.getMappingType("coding2coding"));
+ assertSame(MappingType.NucleotideToNucleotide,
+ ExonerateHelper.getMappingType("coding2genome"));
+ assertSame(MappingType.NucleotideToNucleotide,
+ ExonerateHelper.getMappingType("cdna2genome"));
+ assertSame(MappingType.NucleotideToNucleotide,
+ ExonerateHelper.getMappingType("genome2genome"));
+ assertNull(ExonerateHelper.getMappingType("affine:local")); // not one of the model names tested above, so null is expected
+ }
+
+ /**
+ * Test processing one exonerate GFF line for the case where the mapping is
+ * protein2dna, similarity feature is on the query (the protein), match to the
+ * forward strand, target sequence is in neither the alignment nor the 'new
+ * sequences'
+ * <p>
+ * The GFF attribute 'Align 3 400 8' is read here as: start at protein
+ * position 3, start at dna position 400, for 8 residues - so protein 3-10
+ * pairs with dna 400-423. The test expects a placeholder (SequenceDummy)
+ * named dna1 to be added to newseqs, and a single mapping on the alignment
+ * held with the dna range as 'from' and the protein range as 'to'.
+ *
+ * @throws IOException
+ * not expected; propagated so that the test fails if it occurs
+ */
+ @Test(groups = "Functional")
+ public void testProcessGffSimilarity_protein2dna_forward_querygff()
+ throws IOException
+ {
+ ExonerateHelper testee = new ExonerateHelper();
+ List<SequenceI> newseqs = new ArrayList<SequenceI>();
+ String[] gff = "Seq\texonerate:protein2dna:local\tsimilarity\t3\t10\t.\t+\t.\talignment_id 0 ; Target dna1 ; Align 3 400 8"
+ .split("\\t");
+ SequenceI seq = new Sequence("Seq", "PQRASTGKEEDVMIWCHQN");
+ seq.createDatasetSequence();
+ AlignmentI align = new Alignment(new SequenceI[] {});
+ Map<String, List<String>> set = Gff2Helper.parseNameValuePairs(gff[8]);
+
+ /*
+ * this should create a mapping from Seq/3-10 to virtual sequence
+ * dna1 (added to newseqs) positions 400-423
+ */
+ testee.processGffSimilarity(set, seq, gff, align, newseqs, false);
+ assertEquals(1, newseqs.size());
+ assertTrue(newseqs.get(0) instanceof SequenceDummy);
+ assertEquals("dna1", newseqs.get(0).getName());
+ assertEquals(1, align.getCodonFrames().size());
+ AlignedCodonFrame mapping = align.getCodonFrames().iterator().next();
+ assertEquals(1, mapping.getAaSeqs().length);
+ assertSame(seq.getDatasetSequence(), mapping.getAaSeqs()[0]);
+ assertEquals(1, mapping.getdnaSeqs().length);
+ assertSame(newseqs.get(0), mapping.getdnaSeqs()[0]);
+ assertEquals(1, mapping.getdnaToProt().length);
+ assertEquals(1, mapping.getdnaToProt()[0].getFromRanges().size());
+ assertArrayEquals(new int[] { 400, 423 }, mapping.getdnaToProt()[0]
+ .getFromRanges().get(0));
+ assertEquals(1, mapping.getdnaToProt()[0].getToRanges().size());
+ assertArrayEquals(new int[] { 3, 10 }, mapping.getdnaToProt()[0]
+ .getToRanges().get(0));
+ }
+
+ /**
+ * Test processing one exonerate GFF line for the case where the mapping is
+ * protein2dna, similarity feature is on the query (the protein), match to the
+ * reverse strand
+ * <p>
+ * Same input as the forward-strand query test except that GFF column 7 is
+ * '-'. The dna range is then expected to run backwards from 400 to 377,
+ * while the protein range stays 3-10.
+ *
+ * @throws IOException
+ * not expected; propagated so that the test fails if it occurs
+ */
+ @Test(groups = "Functional")
+ public void testProcessGffSimilarity_protein2dna_reverse_querygff()
+ throws IOException
+ {
+ ExonerateHelper testee = new ExonerateHelper();
+ List<SequenceI> newseqs = new ArrayList<SequenceI>();
+ String[] gff = "Seq\texonerate:protein2dna:local\tsimilarity\t3\t10\t0\t-\t.\talignment_id 0 ; Target dna1 ; Align 3 400 8"
+ .split("\\t");
+ SequenceI seq = new Sequence("Seq", "PQRASTGKEEDVMIWCHQN");
+ seq.createDatasetSequence();
+ AlignmentI align = new Alignment(new SequenceI[] {});
+ Map<String, List<String>> set = Gff2Helper.parseNameValuePairs(gff[8]);
+
+ /*
+ * this should create a mapping from Seq/3-10 to virtual sequence
+ * dna1 (added to newseqs) positions 400-377 (reverse)
+ */
+ testee.processGffSimilarity(set, seq, gff, align, newseqs, false);
+ assertEquals(1, newseqs.size());
+ assertTrue(newseqs.get(0) instanceof SequenceDummy);
+ assertEquals("dna1", newseqs.get(0).getName());
+ assertEquals(1, align.getCodonFrames().size());
+ AlignedCodonFrame mapping = align.getCodonFrames().iterator().next();
+ assertEquals(1, mapping.getAaSeqs().length);
+ assertSame(seq.getDatasetSequence(), mapping.getAaSeqs()[0]);
+ assertEquals(1, mapping.getdnaSeqs().length);
+ assertSame(newseqs.get(0), mapping.getdnaSeqs()[0]);
+ assertEquals(1, mapping.getdnaToProt().length);
+ assertEquals(1, mapping.getdnaToProt()[0].getFromRanges().size());
+ assertArrayEquals(new int[] { 400, 377 }, mapping.getdnaToProt()[0]
+ .getFromRanges().get(0));
+ assertEquals(1, mapping.getdnaToProt()[0].getToRanges().size());
+ assertArrayEquals(new int[] { 3, 10 }, mapping.getdnaToProt()[0]
+ .getToRanges().get(0));
+ }
+
+ /**
+ * Test processing one exonerate GFF line for the case where the mapping is
+ * protein2dna, similarity feature is on the target (the dna), match to the
+ * forward strand
+ * <p>
+ * Here the dna sequence (dna1/391-430) is the real sequence, and is in the
+ * alignment; the protein named by the 'Query' attribute (Prot1) is unknown,
+ * so a placeholder (SequenceDummy) for it is expected in newseqs.
+ *
+ * @throws IOException
+ * not expected; propagated so that the test fails if it occurs
+ */
+ @Test(groups = "Functional")
+ public void testProcessGffSimilarity_protein2dna_forward_targetgff()
+ throws IOException
+ {
+ ExonerateHelper testee = new ExonerateHelper();
+ List<SequenceI> newseqs = new ArrayList<SequenceI>();
+ String[] gff = "dna1\texonerate:protein2dna:local\tsimilarity\t400\t423\t0\t+\t.\talignment_id 0 ; Query Prot1 ; Align 400 3 24"
+ .split("\\t");
+ SequenceI seq = new Sequence("dna1/391-430",
+ "CGATCCGATCCGATCCGATCCGATCCGATCCGATCCGATC");
+ seq.createDatasetSequence();
+ AlignmentI align = new Alignment(new SequenceI[] { seq });
+ // GFF feature on the target describes mapping from base 400 for
+ // count 24 (bases, i.e. 8 codons) to protein position 3
+ Map<String, List<String>> set = Gff2Helper.parseNameValuePairs(gff[8]);
+
+ /*
+ * this should create a mapping from dna1 (the sequence in the alignment)
+ * positions 400-423 to virtual sequence Prot1 (added to newseqs)
+ * positions 3-10
+ */
+ testee.processGffSimilarity(set, seq, gff, align, newseqs, false);
+ assertEquals(1, newseqs.size());
+ assertTrue(newseqs.get(0) instanceof SequenceDummy);
+ assertEquals("Prot1", newseqs.get(0).getName());
+ assertEquals(1, align.getCodonFrames().size());
+ AlignedCodonFrame mapping = align.getCodonFrames().iterator().next();
+ assertEquals(1, mapping.getAaSeqs().length);
+ assertSame(newseqs.get(0), mapping.getAaSeqs()[0]);
+ assertSame(seq.getDatasetSequence(), mapping.getdnaSeqs()[0]);
+ assertEquals(1, mapping.getdnaSeqs().length);
+ assertEquals(1, mapping.getdnaToProt().length);
+ assertEquals(1, mapping.getdnaToProt()[0].getFromRanges().size());
+ assertArrayEquals(new int[] { 400, 423 }, mapping.getdnaToProt()[0]
+ .getFromRanges().get(0));
+ assertEquals(1, mapping.getdnaToProt()[0].getToRanges().size());
+ assertArrayEquals(new int[] { 3, 10 }, mapping.getdnaToProt()[0]
+ .getToRanges().get(0));
+ }
+
+ /**
+ * Test processing one exonerate GFF line for the case where the mapping is
+ * protein2dna, similarity feature is on the target (the dna), match to the
+ * reverse strand
+ * <p>
+ * The feature extent in GFF columns 4-5 is given low-to-high (377-400) with
+ * strand '-', while the 'Align' attribute starts at 400; the dna range of
+ * the mapping is expected to run backwards from 400 to 377. As in the
+ * forward-strand target test, dna1 is the real sequence in the alignment and
+ * Prot1 is the placeholder expected in newseqs.
+ *
+ * @throws IOException
+ * not expected; propagated so that the test fails if it occurs
+ */
+ @Test(groups = "Functional")
+ public void testProcessGffSimilarity_protein2dna_reverse_targetgff()
+ throws IOException
+ {
+ ExonerateHelper testee = new ExonerateHelper();
+ List<SequenceI> newseqs = new ArrayList<SequenceI>();
+ String[] gff = "dna1\texonerate:protein2dna:local\tsimilarity\t377\t400\t0\t-\t.\talignment_id 0 ; Query Prot1 ; Align 400 3 24"
+ .split("\\t");
+ SequenceI seq = new Sequence("dna1/371-410",
+ "CGATCCGATCCGATCCGATCCGATCCGATCCGATCCGATC");
+ seq.createDatasetSequence();
+ AlignmentI align = new Alignment(new SequenceI[] { seq });
+ // GFF feature on the target describes mapping from base 400 for
+ // count 24 (bases, i.e. 8 codons) to protein position 3
+ Map<String, List<String>> set = Gff2Helper.parseNameValuePairs(gff[8]);
+
+ /*
+ * this should create a mapping from dna1 (the sequence in the alignment)
+ * positions 400-377 (reverse) to virtual sequence Prot1 (added to
+ * newseqs) positions 3-10
+ */
+ testee.processGffSimilarity(set, seq, gff, align, newseqs, false);
+ assertEquals(1, newseqs.size());
+ assertTrue(newseqs.get(0) instanceof SequenceDummy);
+ assertEquals("Prot1", newseqs.get(0).getName());
+ assertEquals(1, align.getCodonFrames().size());
+ AlignedCodonFrame mapping = align.getCodonFrames().iterator().next();
+ assertEquals(1, mapping.getAaSeqs().length);
+ assertSame(newseqs.get(0), mapping.getAaSeqs()[0]);
+ assertSame(seq.getDatasetSequence(), mapping.getdnaSeqs()[0]);
+ assertEquals(1, mapping.getdnaSeqs().length);
+ assertEquals(1, mapping.getdnaToProt().length);
+ assertEquals(1, mapping.getdnaToProt()[0].getFromRanges().size());
+ assertArrayEquals(new int[] { 400, 377 }, mapping.getdnaToProt()[0]
+ .getFromRanges().get(0));
+ assertEquals(1, mapping.getdnaToProt()[0].getToRanges().size());
+ assertArrayEquals(new int[] { 3, 10 }, mapping.getdnaToProt()[0]
+ .getToRanges().get(0));
+ }
+
+ /**
+ * Tests loading exonerate GFF2 output, including 'similarity' alignment
+ * feature, on to sequences
+ * <p>
+ * Loads examples/testdata/exonerateseqs.fa into an alignment frame, then
+ * applies examples/testdata/exonerateoutput.gff to it, and inspects the
+ * codon frame mappings held on the alignment's dataset. The checks assume
+ * the mappings are returned in the order of the GFF 'similarity' lines
+ * (unresolved protein DDB_G0269124 first, then DDB_G0280897 which is in the
+ * alignment).
+ */
+ @Test(groups = { "Functional" })
+ public void testAddExonerateGffToAlignment()
+ {
+ FileLoader loader = new FileLoader(false);
+ AlignFrame af = loader.LoadFileWaitTillLoaded(
+ "examples/testdata/exonerateseqs.fa",
+ FormatAdapter.FILE);
+
+ af.loadJalviewDataFile("examples/testdata/exonerateoutput.gff",
+ FormatAdapter.FILE, null, null);
+
+ /*
+ * verify one mapping to a dummy sequence, one to a real one
+ */
+ List<AlignedCodonFrame> mappings = af
+ .getViewport().getAlignment().getDataset().getCodonFrames();
+ assertEquals(2, mappings.size());
+ Iterator<AlignedCodonFrame> iter = mappings.iterator();
+
+ // first mapping is to dummy sequence (a placeholder, as this protein is not in the alignment)
+ AlignedCodonFrame mapping = iter.next();
+ Mapping[] mapList = mapping.getProtMappings();
+ assertEquals(1, mapList.length);
+ assertTrue(mapList[0].getTo() instanceof SequenceDummy);
+ assertEquals("DDB_G0269124", mapList[0].getTo().getName());
+
+ // 143 in protein should map to codon [11270, 11269, 11268] in dna (reverse strand, so start > end)
+ int[] mappedRegion = mapList[0].getMap().locateInFrom(143, 143);
+ assertArrayEquals(new int[] { 11270, 11268 }, mappedRegion);
+
+ // second mapping is to a sequence in the alignment
+ mapping = iter.next();
+ mapList = mapping.getProtMappings();
+ assertEquals(1, mapList.length);
+ SequenceI proteinSeq = af.getViewport().getAlignment()
+ .findName("DDB_G0280897");
+ assertSame(proteinSeq.getDatasetSequence(), mapList[0].getTo());
+ assertEquals(1, mapping.getdnaToProt().length);
+
+ // 143 in protein should map to codon [11270, 11269, 11268] in dna
+ mappedRegion = mapList[0].getMap().locateInFrom(143, 143);
+ assertArrayEquals(new int[] { 11270, 11268 }, mappedRegion);
+
+ // 182 in protein should map to codon [11153, 11152, 11151] in dna
+ mappedRegion = mapList[0].getMap().locateInFrom(182, 182);
+ assertArrayEquals(new int[] { 11153, 11151 }, mappedRegion);
+
+ // and the reverse mapping:
+ mappedRegion = mapList[0].getMap().locateInTo(11151, 11153);
+ assertArrayEquals(new int[] { 182, 182 }, mappedRegion);
+
+ // 11150 in dna should _not_ map to protein (one base beyond the mapped range)
+ mappedRegion = mapList[0].getMap().locateInTo(11150, 11150);
+ assertNull(mappedRegion);
+
+ // similarly 183 in protein should _not_ map to dna
+ mappedRegion = mapList[0].getMap().locateInFrom(183, 183);
+ assertNull(mappedRegion);
+ }
+}
--- /dev/null
+package jalview.io.gff;
+
+import static org.testng.AssertJUnit.assertEquals;
+import static org.testng.AssertJUnit.assertNull;
+import static org.testng.AssertJUnit.assertSame;
+import static org.testng.AssertJUnit.assertTrue;
+import static org.testng.internal.junit.ArrayAsserts.assertArrayEquals;
+
+import jalview.datamodel.AlignedCodonFrame;
+import jalview.datamodel.Alignment;
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceDummy;
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.testng.annotations.Test;
+
+public class Gff3HelperTest
+{
+
+ /**
+ * Test processing one PASA GFF line giving a match from forward strand to
+ * forward strand
+ * <p>
+ * The GFF3 line is a cDNA_match feature on gi|68711 (12923-13060, strand +)
+ * whose 'Target' attribute names gi|N37351 positions 1-138, strand +. The
+ * target is not in the (empty) alignment, so a placeholder (SequenceDummy)
+ * for it is expected in newseqs, plus one mapping on the alignment.
+ *
+ * @throws IOException
+ * not expected; propagated so that the test fails if it occurs
+ */
+ @Test(groups = "Functional")
+ public void testProcessCdnaMatch_forwardToForward() throws IOException
+ {
+ GffHelperBase testee = new Gff3Helper();
+ List<SequenceI> newseqs = new ArrayList<SequenceI>();
+ String[] gff = "gi|68711\tblat-pasa\tcDNA_match\t12923\t13060\t98.55\t+\t.\tID=align_68;Target=gi|N37351 1 138 +"
+ .split("\\t");
+ SequenceI seq = new Sequence("gi|68711",
+ "GAATTCGTTCATGTAGGTTGATTTTTATT");
+ seq.createDatasetSequence();
+ AlignmentI align = new Alignment(new SequenceI[] {});
+
+ /*
+ * this should create a mapping from gi|68711/12923-13060
+ * to virtual sequence gi|N37351 (added to newseqs) positions 1-138
+ */
+ testee.processGff(seq, gff, align, newseqs, false);
+ assertEquals(1, newseqs.size());
+ assertTrue(newseqs.get(0) instanceof SequenceDummy);
+ assertEquals("gi|N37351", newseqs.get(0).getName());
+ assertEquals(1, align.getCodonFrames().size());
+ AlignedCodonFrame mapping = align.getCodonFrames().iterator().next();
+
+ /*
+ * 'dnaseqs' (map from) is here [gi|68711]
+ * 'aaseqs' (map to) is here [gi|N37351]
+ * (both are nucleotide in this case, despite the accessor names)
+ */
+ // TODO use more suitable naming in AlignedCodonFrame
+ assertEquals(1, mapping.getAaSeqs().length);
+ assertSame(seq.getDatasetSequence(), mapping.getdnaSeqs()[0]);
+ assertEquals(1, mapping.getdnaSeqs().length);
+ assertSame(newseqs.get(0), mapping.getAaSeqs()[0]);
+ assertEquals(1, mapping.getdnaToProt().length);
+ assertEquals(1, mapping.getdnaToProt()[0].getFromRanges().size());
+ assertArrayEquals(new int[] { 12923, 13060 }, mapping.getdnaToProt()[0]
+ .getFromRanges().get(0));
+ assertEquals(1, mapping.getdnaToProt()[0].getToRanges().size());
+ assertArrayEquals(new int[] { 1, 138 }, mapping.getdnaToProt()[0]
+ .getToRanges().get(0));
+ }
+
+ /**
+ * Test processing one PASA GFF line giving a match from forward strand to
+ * reverse strand
+ * <p>
+ * Identical to the forward-to-forward case except that the strand in the
+ * 'Target' attribute is '-'; the 'to' range of the mapping is then expected
+ * to be reversed (138 down to 1) while the 'from' range is unchanged.
+ *
+ * @throws IOException
+ * not expected; propagated so that the test fails if it occurs
+ */
+ @Test(groups = "Functional")
+ public void testProcessCdnaMatch_forwardToReverse() throws IOException
+ {
+ GffHelperBase testee = new Gff3Helper();
+ List<SequenceI> newseqs = new ArrayList<SequenceI>();
+ String[] gff = "gi|68711\tblat-pasa\tcDNA_match\t12923\t13060\t98.55\t+\t.\tID=align_68;Target=gi|N37351 1 138 -"
+ .split("\\t");
+ SequenceI seq = new Sequence("gi|68711",
+ "GAATTCGTTCATGTAGGTTGATTTTTATT");
+ seq.createDatasetSequence();
+ AlignmentI align = new Alignment(new SequenceI[] {});
+
+ /*
+ * this should create a mapping from gi|68711/12923-13060
+ * to virtual sequence gi|N37351 (added to newseqs) positions 138-1
+ */
+ testee.processGff(seq, gff, align, newseqs, false);
+ assertEquals(1, newseqs.size());
+ assertTrue(newseqs.get(0) instanceof SequenceDummy);
+ assertEquals("gi|N37351", newseqs.get(0).getName());
+ assertEquals(1, align.getCodonFrames().size());
+ AlignedCodonFrame mapping = align.getCodonFrames().iterator().next();
+
+ /*
+ * 'dnaseqs' (map from) is here [gi|68711]
+ * 'aaseqs' (map to) is here [gi|N37351]
+ * (both are nucleotide in this case, despite the accessor names)
+ */
+ // TODO use more suitable naming in AlignedCodonFrame
+ assertEquals(1, mapping.getAaSeqs().length);
+ assertSame(seq.getDatasetSequence(), mapping.getdnaSeqs()[0]);
+ assertEquals(1, mapping.getdnaSeqs().length);
+ assertSame(newseqs.get(0), mapping.getAaSeqs()[0]);
+ assertEquals(1, mapping.getdnaToProt().length);
+ assertEquals(1, mapping.getdnaToProt()[0].getFromRanges().size());
+ assertArrayEquals(new int[] { 12923, 13060 }, mapping.getdnaToProt()[0]
+ .getFromRanges().get(0));
+ assertEquals(1, mapping.getdnaToProt()[0].getToRanges().size());
+ assertArrayEquals(new int[] { 138, 1 }, mapping.getdnaToProt()[0]
+ .getToRanges().get(0));
+ }
+
+ /**
+ * Test processing one PASA GFF line giving a match from reverse complement
+ * strand to forward strand
+ * <p>
+ * The feature's own strand (GFF column 7) is '-'. Such lines are expected to
+ * be skipped: processGff should return null and add no placeholder sequence.
+ *
+ * @throws IOException
+ * not expected; propagated so that the test fails if it occurs
+ */
+ @Test(groups = "Functional")
+ public void testProcessCdnaMatch_reverseToForward() throws IOException
+ {
+ GffHelperBase testee = new Gff3Helper();
+ List<SequenceI> newseqs = new ArrayList<SequenceI>();
+ String[] gff = "gi|68711\tblat-pasa\tcDNA_match\t12923\t13060\t98.55\t-\t.\tID=align_68;Target=gi|N37351 1 138 +"
+ .split("\\t");
+ SequenceI seq = new Sequence("gi|68711",
+ "GAATTCGTTCATGTAGGTTGATTTTTATT");
+ seq.createDatasetSequence();
+ AlignmentI align = new Alignment(new SequenceI[] {});
+
+ /*
+ * (For now) we don't process reverse complement mappings; to do this
+ * would require (a) creating a virtual sequence placeholder for the
+ * reverse complement (b) resolving the sequence by its id from some
+ * source (GFF ##FASTA or other) (c) creating the reverse complement
+ * sequence (d) updating the mapping to be to the reverse complement
+ */
+ SequenceFeature sf = testee.processGff(seq, gff, align, newseqs, false);
+ assertNull(sf);
+ assertTrue(newseqs.isEmpty());
+ }
+
+ /**
+ * Test processing two PASA GFF lines representing a spliced mapping
+ * <p>
+ * Both lines share the same ID (align_68) and Target sequence (gi|N37351),
+ * and are processed by the same helper instance against the same alignment.
+ * The expected result is a single placeholder sequence and a single
+ * AlignedCodonFrame whose one MapList has two 'from' ranges (the two genomic
+ * segments) and one merged 'to' range (1-278, as the target ranges 1-138 and
+ * 139-278 are contiguous).
+ *
+ * @throws IOException
+ * not expected; propagated so that the test fails if it occurs
+ */
+ @Test(groups = "Functional")
+ public void testProcessCdnaMatch_spliced() throws IOException
+ {
+ GffHelperBase testee = new Gff3Helper();
+ List<SequenceI> newseqs = new ArrayList<SequenceI>();
+ SequenceI seq = new Sequence("gi|68711",
+ "GAATTCGTTCATGTAGGTTGATTTTTATT");
+ seq.createDatasetSequence();
+ AlignmentI align = new Alignment(new SequenceI[] {});
+
+ // mapping from gi|68711 12923-13060 to gi|N37351 1-138
+ String[] gff = "gi|68711\tblat-pasa\tcDNA_match\t12923\t13060\t98.55\t+\t.\tID=align_68;Target=gi|N37351 1 138 +"
+ .split("\\t");
+ testee.processGff(seq, gff, align, newseqs, false);
+ // mapping from gi|68711 13411-13550 to gi|N37351 139-278
+ gff = "gi|68711\tblat-pasa\tcDNA_match\t13411\t13550\t98.55\t+\t.\tID=align_68;Target=gi|N37351 139 278 +"
+ .split("\\t");
+ testee.processGff(seq, gff, align, newseqs, false);
+
+ // the second line reuses the placeholder created for the first
+ assertEquals(1, newseqs.size());
+ assertTrue(newseqs.get(0) instanceof SequenceDummy);
+ assertEquals("gi|N37351", newseqs.get(0).getName());
+
+ // only 1 AlignedCodonFrame added to the alignment with both mappings!
+ // (this is important for 'align cdna to genome' to work correctly)
+ assertEquals(1, align.getCodonFrames().size());
+ AlignedCodonFrame mapping = align.getCodonFrames().get(0);
+
+ /*
+ * 'dnaseqs' (map from) is here [gi|68711]
+ * 'aaseqs' (map to) is here [gi|N37351]
+ * (both are nucleotide in this case, despite the accessor names)
+ */
+ // TODO use more suitable naming in AlignedCodonFrame
+ assertEquals(1, mapping.getAaSeqs().length);
+ assertSame(seq.getDatasetSequence(), mapping.getdnaSeqs()[0]);
+ assertEquals(1, mapping.getdnaSeqs().length);
+ assertSame(newseqs.get(0), mapping.getAaSeqs()[0]);
+ assertEquals(1, mapping.getdnaToProt().length);
+ assertEquals(2, mapping.getdnaToProt()[0].getFromRanges().size());
+ // the two spliced dna ranges are combined in one MapList
+ assertArrayEquals(new int[] { 12923, 13060 },
+ mapping.getdnaToProt()[0]
+ .getFromRanges().get(0));
+ assertArrayEquals(new int[] { 13411, 13550 }, mapping.getdnaToProt()[0]
+ .getFromRanges().get(1));
+ assertEquals(1, mapping.getdnaToProt()[0].getToRanges().size());
+ // the two cdna ranges are merged into one contiguous region
+ assertArrayEquals(new int[] { 1, 278 }, mapping.getdnaToProt()[0]
+ .getToRanges().get(0));
+ }
+
+}
--- /dev/null
+package jalview.io.gff;
+
+import static org.testng.AssertJUnit.assertEquals;
+import static org.testng.AssertJUnit.assertFalse;
+import static org.testng.AssertJUnit.assertTrue;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+
+import org.testng.annotations.Test;
+
+public class GffHelperBaseTest
+{
+
+ /**
+ * Test the method that parses lines like <br>
+ * ID=2345;Name=Something,Another thing;Notes=Hello;Notes=World
+ * <p>
+ * As exercised here, the arguments after the text are the separator between
+ * name-value pairs, the character separating a name from its value(s), and
+ * the separator between multiple values for one name. Cases covered:
+ * <ul>
+ * <li>null text, empty text, and text that does not contain the name-value
+ * separator all give an empty map</li>
+ * <li>a single 'name value' pair separated by a space</li>
+ * <li>several pairs separated by ';' with '=' between name and value:
+ * names and values are trimmed, repeated names accumulate their values,
+ * comma-separated values are split, and entries with no separator or no
+ * value are dropped</li>
+ * </ul>
+ */
+ @Test(groups = { "Functional" })
+ public void testParseNameValuePairs()
+ {
+ assertTrue(GffHelperBase.parseNameValuePairs(null, ";", ' ', ",")
+ .isEmpty());
+ assertTrue(GffHelperBase.parseNameValuePairs("", ";", ' ', ",")
+ .isEmpty());
+ assertTrue(GffHelperBase.parseNameValuePairs("hello=world", ";", ' ',
+ ",").isEmpty());
+
+ Map<String, List<String>> map = GffHelperBase.parseNameValuePairs(
+ "hello world", ";", ' ', ", ");
+ assertEquals(1, map.size());
+ assertEquals(1, map.get("hello").size());
+ assertEquals("world", map.get("hello").get(0));
+
+ map = GffHelperBase
+ .parseNameValuePairs(
+ "Method= manual curation ;nothing; Notes=F2 S ; Notes=Metal,Shiny; Type=",
+ ";", '=', ",");
+
+ // only Method and Notes are kept: 'nothing' has no '=' and Type is ignored as no value was supplied
+ assertEquals(2, map.size());
+
+ assertEquals(1, map.get("Method").size());
+ assertEquals("manual curation", map.get("Method").get(0)); // trimmed
+
+ assertEquals(3, map.get("Notes").size());
+ assertEquals("F2 S", map.get("Notes").get(0));
+ assertEquals("Metal", map.get("Notes").get(1));
+ assertEquals("Shiny", map.get("Notes").get(2));
+ }
+
+ /**
+ * Test for the method that tries to trim mappings to equivalent lengths
+ * <p>
+ * As used here, trimMapping is given a 'from' range and a 'to' range (each a
+ * two element [start, end] array, where start > end denotes a reversed
+ * range) followed by two word lengths - presumably the from : to ratio (1:1,
+ * 3:1 for cdna to peptide, 1:3 for peptide to cdna). The assertions show
+ * that:
+ * <ul>
+ * <li>the arrays are adjusted in place, and true is returned, when the
+ * ranges match or the longer one can be shortened to match</li>
+ * <li>trimming is always applied to the end (second element) of the
+ * overlong range, whether it is forward or reversed</li>
+ * <li>false is returned, and both arrays are left untouched, in the final
+ * case, where the cdna range is not a whole number of codons</li>
+ * </ul>
+ * Note that the 'from' and 'to' arrays are deliberately carried over from
+ * one case to the next, so each case starts from the values left by the
+ * previous one unless it reassigns them.
+ */
+ @Test(groups = "Functional")
+ public void testTrimMapping()
+ {
+ int[] from = { 1, 12 };
+ int[] to = { 20, 31 };
+ assertTrue(GffHelperBase.trimMapping(from, to, 1, 1));
+ assertEquals("[1, 12]", Arrays.toString(from)); // unchanged
+ assertEquals("[20, 31]", Arrays.toString(to)); // unchanged
+
+ // from too long:
+ from = new int[] { 1, 13 };
+ assertTrue(GffHelperBase.trimMapping(from, to, 1, 1));
+ assertEquals("[1, 12]", Arrays.toString(from)); // trimmed
+ assertEquals("[20, 31]", Arrays.toString(to)); // unchanged
+
+ // to too long:
+ to = new int[] { 20, 33 };
+ assertTrue(GffHelperBase.trimMapping(from, to, 1, 1));
+ assertEquals("[1, 12]", Arrays.toString(from)); // unchanged
+ assertEquals("[20, 31]", Arrays.toString(to)); // trimmed
+
+ // from reversed:
+ from = new int[] { 12, 1 };
+ assertTrue(GffHelperBase.trimMapping(from, to, 1, 1));
+ assertEquals("[12, 1]", Arrays.toString(from)); // unchanged
+ assertEquals("[20, 31]", Arrays.toString(to)); // unchanged
+
+ // to reversed:
+ to = new int[] { 31, 20 };
+ assertTrue(GffHelperBase.trimMapping(from, to, 1, 1));
+ assertEquals("[12, 1]", Arrays.toString(from)); // unchanged
+ assertEquals("[31, 20]", Arrays.toString(to)); // unchanged
+
+ // from reversed and too long:
+ from = new int[] { 14, 1 };
+ assertTrue(GffHelperBase.trimMapping(from, to, 1, 1));
+ assertEquals("[14, 3]", Arrays.toString(from)); // end trimmed
+ assertEquals("[31, 20]", Arrays.toString(to)); // unchanged
+
+ // to reversed and too long:
+ to = new int[] { 31, 10 };
+ assertTrue(GffHelperBase.trimMapping(from, to, 1, 1));
+ assertEquals("[14, 3]", Arrays.toString(from)); // unchanged
+ assertEquals("[31, 20]", Arrays.toString(to)); // end trimmed
+
+ // cdna to peptide (matching): 18 bases to 6 residues at ratio 3:1
+ from = new int[] { 1, 18 };
+ to = new int[] { 4, 9 };
+ assertTrue(GffHelperBase.trimMapping(from, to, 3, 1));
+ assertEquals("[1, 18]", Arrays.toString(from)); // unchanged
+ assertEquals("[4, 9]", Arrays.toString(to)); // unchanged
+
+ // overlong cdna to peptide
+ from = new int[] { 1, 20 };
+ assertTrue(GffHelperBase.trimMapping(from, to, 3, 1));
+ assertEquals("[1, 18]", Arrays.toString(from)); // end trimmed
+ assertEquals("[4, 9]", Arrays.toString(to)); // unchanged
+
+ // overlong cdna (reversed) to peptide
+ from = new int[] { 20, 1 };
+ assertTrue(GffHelperBase.trimMapping(from, to, 3, 1));
+ assertEquals("[20, 3]", Arrays.toString(from)); // end trimmed
+ assertEquals("[4, 9]", Arrays.toString(to)); // unchanged
+
+ // overlong cdna (reversed) to peptide (reversed)
+ from = new int[] { 20, 1 };
+ to = new int[] { 9, 4 };
+ assertTrue(GffHelperBase.trimMapping(from, to, 3, 1));
+ assertEquals("[20, 3]", Arrays.toString(from)); // end trimmed
+ assertEquals("[9, 4]", Arrays.toString(to)); // unchanged
+
+ // peptide to cdna (matching): 6 residues to 18 bases at ratio 1:3
+ from = new int[] { 4, 9 };
+ to = new int[] { 1, 18 };
+ assertTrue(GffHelperBase.trimMapping(from, to, 1, 3));
+ assertEquals("[4, 9]", Arrays.toString(from)); // unchanged
+ assertEquals("[1, 18]", Arrays.toString(to)); // unchanged
+
+ // peptide to overlong cdna
+ to = new int[] { 1, 20 };
+ assertTrue(GffHelperBase.trimMapping(from, to, 1, 3));
+ assertEquals("[4, 9]", Arrays.toString(from)); // unchanged
+ assertEquals("[1, 18]", Arrays.toString(to)); // end trimmed
+
+ // peptide to overlong cdna (reversed)
+ to = new int[] { 20, 1 };
+ assertTrue(GffHelperBase.trimMapping(from, to, 1, 3));
+ assertEquals("[4, 9]", Arrays.toString(from)); // unchanged
+ assertEquals("[20, 3]", Arrays.toString(to)); // end trimmed
+
+ // peptide (reversed) to overlong cdna (reversed)
+ from = new int[] { 9, 4 };
+ to = new int[] { 20, 1 };
+ assertTrue(GffHelperBase.trimMapping(from, to, 1, 3));
+ assertEquals("[9, 4]", Arrays.toString(from)); // unchanged
+ assertEquals("[20, 3]", Arrays.toString(to)); // end trimmed
+
+ // overlong peptide to word-length cdna
+ from = new int[] { 4, 10 };
+ to = new int[] { 1, 18 };
+ assertTrue(GffHelperBase.trimMapping(from, to, 1, 3));
+ assertEquals("[4, 9]", Arrays.toString(from)); // end trimmed
+ assertEquals("[1, 18]", Arrays.toString(to)); // unchanged
+
+ // overlong peptide to non-word-length cdna: 19 bases is not a multiple of 3, so the mapping is rejected
+ from = new int[] { 4, 10 };
+ to = new int[] { 1, 19 };
+ assertFalse(GffHelperBase.trimMapping(from, to, 1, 3));
+ assertEquals("[4, 10]", Arrays.toString(from)); // unchanged
+ assertEquals("[1, 19]", Arrays.toString(to)); // unchanged
+
+ }
+}
--- /dev/null
+package jalview.io.gff;
+
+import static org.testng.AssertJUnit.assertNull;
+import static org.testng.AssertJUnit.assertSame;
+import static org.testng.AssertJUnit.assertTrue;
+
+import org.testng.annotations.Test;
+
+public class GffHelperFactoryTest
+{
+ /**
+ * Checks which GFF helper class the factory selects for a tab-split GFF
+ * line: null for null input; ExonerateHelper for a 'similarity' feature
+ * (column 3, any case) with a handled exonerate model in column 2;
+ * InterProScanHelper for a 'protein_match' feature; otherwise the generic
+ * Gff3Helper when column 9 holds Name=Value data, or Gff2Helper when it
+ * holds 'Name Value' data or is absent.
+ * <p>
+ * NOTE(review): the assertSame calls below pass the actual class first and
+ * the expected class second, whereas AssertJUnit.assertSame takes (expected,
+ * actual); the outcome is unaffected but a failure message would be
+ * inverted - consider swapping the arguments.
+ */
+ @Test(groups = "Functional")
+ public void testGetHelper()
+ {
+ assertNull(GffHelperFactory.getHelper(null));
+
+ String tabRegex = "\\t";
+
+ /*
+ * column 3 = 'similarity' indicates exonerate GFF alignment data
+ */
+ String gff = "submitted\taffine:local\tsimilarity\t20\t30\t99\t+\t.\t";
+ // no attributes (column 9 data) - String.split drops the trailing empty field, leaving 8 columns:
+ assertTrue(GffHelperFactory.getHelper(gff.split(tabRegex)) instanceof Gff2Helper);
+
+ // attributes set but unhandled featureGroup - get generic handler
+ gff = "submitted\taffine:local\tsimilarity\t20\t30\t99\t+\t.\tID=$1";
+ assertSame(GffHelperFactory.getHelper(gff.split(tabRegex)).getClass(),
+ Gff3Helper.class);
+
+ // handled featureGroup (exonerate model) values
+ gff = "submitted\texonerate:protein2dna:local\tsimilarity\t20\t30\t99\t+\t.\tID=$1";
+ assertTrue(GffHelperFactory.getHelper(gff.split(tabRegex)) instanceof ExonerateHelper);
+
+ gff = "submitted\tprotein2genome\tsimilarity\t20\t30\t99\t+\t.\tID=$1";
+ assertTrue(GffHelperFactory.getHelper(gff.split(tabRegex)) instanceof ExonerateHelper);
+
+ gff = "submitted\tcoding2coding\tsimilarity\t20\t30\t99\t+\t.\tID=$1";
+ assertTrue(GffHelperFactory.getHelper(gff.split(tabRegex)) instanceof ExonerateHelper);
+
+ gff = "submitted\tcoding2genome\tsimilarity\t20\t30\t99\t+\t.\tID=$1";
+ assertTrue(GffHelperFactory.getHelper(gff.split(tabRegex)) instanceof ExonerateHelper);
+
+ gff = "submitted\tcdna2genome\tsimilarity\t20\t30\t99\t+\t.\tID=$1";
+ assertTrue(GffHelperFactory.getHelper(gff.split(tabRegex)) instanceof ExonerateHelper);
+
+ gff = "submitted\tgenome2genome\tsimilarity\t20\t30\t99\t+\t.\tID=$1";
+ assertTrue(GffHelperFactory.getHelper(gff.split(tabRegex)) instanceof ExonerateHelper);
+
+ // not case-sensitive (column 3 in upper case is still recognised):
+ gff = "submitted\tgenome2genome\tSIMILARITY\t20\t30\t99\t+\t.\tID=$1";
+ assertTrue(GffHelperFactory.getHelper(gff.split(tabRegex)) instanceof ExonerateHelper);
+
+ /*
+ * InterProScan has 'protein_match' in column 3
+ */
+ gff = "Submitted\tPANTHER\tprotein_match\t1\t1174\t0.0\t+\t.\tName=PTHR32154";
+ assertTrue(GffHelperFactory.getHelper(gff.split(tabRegex)) instanceof InterProScanHelper);
+
+ /*
+ * nothing specific - return the generic GFF3 class if Name=Value is present in col9
+ * (getClass() is compared, rather than instanceof, so that a subclass
+ * of the generic helper would not satisfy the check)
+ */
+ gff = "nothing\tinteresting\there\t20\t30\t99\t+\t.\tID=1";
+ GffHelperI helper = GffHelperFactory.getHelper(gff.split(tabRegex));
+ assertSame(helper.getClass(), Gff3Helper.class);
+
+ // return the generic GFF2 class if "Name Value" is present in col9
+ gff = "nothing\tinteresting\there\t20\t30\t99\t+\t.\tID 1";
+ helper = GffHelperFactory.getHelper(gff.split(tabRegex));
+ assertSame(helper.getClass(), Gff2Helper.class);
+ }
+}
-package jalview.io;
+package jalview.io.gff;
import static org.testng.AssertJUnit.assertEquals;
import static org.testng.AssertJUnit.assertSame;
import jalview.datamodel.SequenceDummy;
import jalview.datamodel.SequenceI;
import jalview.gui.AlignFrame;
+import jalview.io.FileLoader;
+import jalview.io.FormatAdapter;
-import java.util.Set;
+import java.util.List;
import org.testng.annotations.Test;
/**
- * Tests of use cases that include parsing exonerate GFF 'similarity' features.
- * These describe mappings between protein and cDNA
- *
- * @author gmcarstairs
- *
+ * Tests of use cases that include parsing GFF (version 2 or 3) features that
+ * describe mappings between protein and cDNA. The format of the GFF varies
+ * depending on which tool generated it.
*/
-public class ExonerateGffTest
+public class GffTests
{
-
/**
* Test the case where we load a protein ('query') sequence, then exonerateGff
* describing its mapping to cDNA, and then a DNA sequence including the
* mapped region
*/
@Test(groups = "Functional")
- public void testLoadProteinGffCdna()
+ public void testResolveExonerateGff()
{
String proteinSeq = ">prot1/10-16\nYCWRSGA";
AlignFrame af = new FileLoader(false).LoadFileWaitTillLoaded(
assertEquals(1, dataset.getSequences().size());
assertEquals("prot1", dataset.getSequenceAt(0).getName());
assertEquals("YCWRSGA", dataset.getSequenceAt(0).getSequenceAsString());
- Set<AlignedCodonFrame> mappings = dataset.getCodonFrames();
+ List<AlignedCodonFrame> mappings = dataset.getCodonFrames();
assertEquals(1, mappings.size());
AlignedCodonFrame mapping = mappings.iterator().next();
SequenceI mappedDna = mapping.getDnaForAaSeq(dataset.getSequenceAt(0));
/*
* Now 'realise' the virtual mapping to the real DNA sequence;
* interactively this could be by a drag or fetch of the sequence data
+ * on to the alignment
*/
mapping.realiseWith(dna1);
// verify the mapping is now from the real, not the dummy sequence
--- /dev/null
+package jalview.io.gff;
+
+import static org.testng.AssertJUnit.assertEquals;
+import static org.testng.AssertJUnit.assertSame;
+import static org.testng.AssertJUnit.assertTrue;
+import static org.testng.internal.junit.ArrayAsserts.assertArrayEquals;
+
+import jalview.datamodel.AlignedCodonFrame;
+import jalview.datamodel.Alignment;
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceDummy;
+import jalview.datamodel.SequenceI;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.testng.annotations.Test;
+
+public class InterProScanHelperTest
+{
+
+ /**
+ * Test processing one InterProScan GFF line.
+ * <p>
+ * The line has nine tab-separated columns: a Pfam 'protein_match' at 5-30
+ * of 'Submitted', with Name, Target, signature_desc and ID attributes in
+ * column 9 (gff[8]). That column is parsed with
+ * Gff3Helper.parseNameValuePairs and the result is passed, together with a
+ * hand-built sequence and an alignment holding no sequences, to
+ * InterProScanHelper.processProteinMatch.
+ * <p>
+ * NOTE(review): the test sequence has only 19 residues although the mapped
+ * range ends at 30 - apparently the range is not checked against the
+ * sequence length; confirm this is intended.
+ *
+ * @throws IOException
+ * declared because of the helper calls made here; NOTE(review): confirm
+ * which of them can actually throw it
+ */
+ @Test(groups = "Functional")
+ public void testProcessProteinMatch() throws IOException
+ {
+ InterProScanHelper testee = new InterProScanHelper();
+ List<SequenceI> newseqs = new ArrayList<SequenceI>();
+ String[] gff = "Submitted\tPfam\tprotein_match\t5\t30\t0\t+\t.\tName=PF12838;Target=Submitted 5 30;signature_desc=4Fe-4S dicluster domain;ID=match$17_5_30"
+ .split("\\t");
+ SequenceI seq = new Sequence("Prot1", "PQRASTGKEEDVMIWCHQN");
+ seq.createDatasetSequence();
+ AlignmentI align = new Alignment(new SequenceI[] {});
+ Map<String, List<String>> set = Gff3Helper.parseNameValuePairs(gff[8]);
+
+ /*
+ * this should create a mapping from Prot1/5-30 to virtual sequence
+ * match$17_5_30 (added to newseqs) positions 1-26
+ *
+ * the assertions that follow check that exactly one SequenceDummy, whose
+ * name equals the value of the line's ID attribute, is added to newseqs,
+ * and that exactly one mapping is added to the alignment's codon frames
+ *
+ * NOTE(review): the meaning of the final 'false' argument is not visible
+ * in this test - confirm against InterProScanHelper.processProteinMatch
+ */
+ testee.processProteinMatch(set, seq, gff, align, newseqs, false);
+ assertEquals(1, newseqs.size());
+ assertTrue(newseqs.get(0) instanceof SequenceDummy);
+ assertEquals("match$17_5_30", newseqs.get(0).getName());
+ assertEquals(1, align.getCodonFrames().size());
+ AlignedCodonFrame mapping = align.getCodonFrames().iterator().next();
+
+ /*
+ * 'dnaseqs' (map from) is here [Prot1]
+ * 'aaseqs' (map to) is here [match$17_5_30]
+ *
+ * the 'from' side is checked against seq's dataset sequence (created
+ * above) and the 'to' side against the new dummy sequence; the single
+ * from/to ranges are expected to be 5-30 and 1-26 respectively (both
+ * spanning 26 positions)
+ */
+ // TODO use more suitable naming in AlignedCodonFrame
+ assertEquals(1, mapping.getAaSeqs().length);
+ assertSame(seq.getDatasetSequence(), mapping.getdnaSeqs()[0]);
+ assertEquals(1, mapping.getdnaSeqs().length);
+ assertSame(newseqs.get(0), mapping.getAaSeqs()[0]);
+ assertEquals(1, mapping.getdnaToProt().length);
+ assertEquals(1, mapping.getdnaToProt()[0].getFromRanges().size());
+ assertArrayEquals(new int[] { 5, 30 }, mapping.getdnaToProt()[0]
+ .getFromRanges().get(0));
+ assertEquals(1, mapping.getdnaToProt()[0].getToRanges().size());
+ assertArrayEquals(new int[] { 1, 26 }, mapping.getdnaToProt()[0]
+ .getToRanges().get(0));
+ }
+
+}
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
-import java.util.Map;
import org.testng.annotations.Test;
{ "a", "b*c", "cde" }, "*"));
}
- /**
- * Test the method that parses lines like <br>
- * ID=2345;Name=Something;
- */
- @Test(groups = { "Functional" })
- public void testParseNameValuePairs()
- {
- char[] separators = new char[] { ' ' };
- assertTrue(StringUtils.parseNameValuePairs(null, ";", separators)
- .isEmpty());
- assertTrue(StringUtils.parseNameValuePairs("", ";", separators)
- .isEmpty());
- assertTrue(StringUtils.parseNameValuePairs("hello=world", ";",
- separators).isEmpty());
-
- Map<String, List<String>> map = StringUtils.parseNameValuePairs(
- "hello world", ";", separators);
- assertEquals(1, map.size());
- assertEquals(1, map.get("hello").size());
- assertEquals("world", map.get("hello").get(0));
-
- separators = new char[] { ' ', '=' };
- map = StringUtils
- .parseNameValuePairs(
- "Method= manual curation ;nothing; Notes F2=S ; Notes=Metal; Type=",
- ";", separators);
-
- // Type is ignored as no value was supplied
- assertEquals(2, map.size());
-
- // equals separator used ahead of space separator:
- assertEquals(1, map.get("Method").size());
- assertEquals("manual curation", map.get("Method").get(0)); // trimmed
-
- assertEquals(2, map.get("Notes").size());
- // space separator used ahead of equals separator
- assertEquals("F2=S", map.get("Notes").get(0));
- assertEquals("Metal", map.get("Notes").get(1));
- }
-
@Test(groups = { "Functional" })
public void testListToDelimitedString()
{