*/
public class FeaturesFile extends AlignFile
{
+ private static final String NOTE = "Note";
+
+ private static final String ALIGN = "Align";
+
+ private static final String QUERY = "Query";
+
+ private static final String TARGET = "Target";
+
+ private static final String SIMILARITY = "similarity";
+
protected static final String STRAND = "STRAND";
protected static final String FRAME = "FRAME";
*/
protected boolean parseJalviewFeature(String line, StringTokenizer st,
AlignmentI alignment, Map<String, Object> featureColours,
- boolean removeHTML, boolean relaxedIdmatching, String featureGroup)
+ boolean removeHTML, boolean relaxedIdMatching, String featureGroup)
{
/*
* Jalview: description seqid seqIndex start end type [score]
*/
- String desc = st.nextToken();
- String seqId = st.nextToken();
- SequenceI seq = findName(alignment, seqId, relaxedIdmatching, null);
- if (!st.hasMoreTokens())
+ if (st.countTokens() < 6)
{
- System.err
- .println("DEBUG: Run out of tokens when trying to identify the destination for the feature.. giving up.");
- // in all probability, this isn't a file we understand, so bail
- // quietly.
+ System.err.println("Ignoring feature line '" + line
+ + "' with unexpected number of columns (" + st.countTokens()
+ + ")");
return false;
}
+ String desc = st.nextToken();
+ String seqId = st.nextToken();
+ SequenceI seq = findName(alignment, null, relaxedIdMatching, seqId);
if (!seqId.equals("ID_NOT_SPECIFIED"))
{
- seq = findName(alignment, seqId, relaxedIdmatching, null);
+ seq = findName(alignment, null, relaxedIdMatching, seqId);
st.nextToken();
}
else
/**
* Returns a sequence matching the given id, as follows
* <ul>
- * <li>matching is on exact sequence name, or on a token within the sequence
- * name, or a dbxref, if relaxed matching is selected</li>
+ * <li>strict matching is on exact sequence name</li>
+ * <li>relaxed matching allows matching on a token within the sequence name,
+ * or a dbxref</li>
* <li>first tries to find a match in the alignment sequences</li>
- * <li>else tries to find a match in the new sequences already generated
+ * <li>else tries to find a match in the new sequences already generated while
* parsing the features file</li>
* <li>else creates a new placeholder sequence, adds it to the new sequences
* list, and returns it</li>
* </ul>
*
* @param align
- * @param seqId
- * @param relaxedIdMatching
* @param newseqs
+ * @param relaxedIdMatching
+ * @param seqId
* @return
*/
- protected SequenceI findName(AlignmentI align, String seqId,
- boolean relaxedIdMatching, List<SequenceI> newseqs)
+ protected SequenceI findName(AlignmentI align, List<SequenceI> newseqs,
+ boolean relaxedIdMatching, String seqId)
{
SequenceI match = null;
if (relaxedIdMatching)
}
/**
- * Helper method to make a mapping given a set of attributes for a GFF feature
+ * Returns a mapping given list of one or more Align descriptors (exonerate
+ * format)
*
- * @param set
- * @param attr
+ * @param alignedRegions
+ * a list of "Align fromStart toStart fromCount"
+ * @param mapIsFromCdna
+ * if true, 'from' is dna, else 'from' is protein
* @param strand
* either 1 (forward) or -1 (reverse)
* @return
- * @throws InvalidGFF3FieldException
+ * @throws IOException
*/
protected MapList constructCodonMappingFromAlign(
- Map<String, List<String>> set, String attr,
- int strand) throws InvalidGFF3FieldException
+ List<String> alignedRegions, boolean mapIsFromCdna, int strand)
+ throws IOException
{
if (strand == 0)
{
- throw new InvalidGFF3FieldException(attr, set,
+ throw new IOException(
"Invalid strand for a codon mapping (cannot be 0)");
}
- List<Integer> fromrange = new ArrayList<Integer>();
- List<Integer> torange = new ArrayList<Integer>();
- int lastppos = 0, lastpframe = 0;
- for (String range : set.get(attr))
- {
- List<Integer> ints = new ArrayList<Integer>();
- StringTokenizer st = new StringTokenizer(range, " ");
- while (st.hasMoreTokens())
- {
- String num = st.nextToken();
- try
- {
- ints.add(new Integer(num));
- } catch (NumberFormatException nfe)
- {
- throw new InvalidGFF3FieldException(attr, set,
- "Invalid number in field " + num);
- }
- }
+ int regions = alignedRegions.size();
+ // arrays to hold [start, end] for each aligned region
+ int[] fromRanges = new int[regions * 2]; // from dna
+ int[] toRanges = new int[regions * 2]; // to protein
+ int fromRangesIndex = 0;
+ int toRangesIndex = 0;
+
+ for (String range : alignedRegions)
+ {
/*
- * Align positionInRef positionInQuery LengthInRef
- * contig_1146 exonerate:p2g:local similarity 8534 11269 3652 - .
- * alignment_id 0 ; Query DDB_G0269124 Align 11270 143 120
+ * Align mapFromStart mapToStart mapFromCount
+ * e.g. if mapIsFromCdna
+ * Align 11270 143 120
* means:
- * 120 bases align at pos 143 in protein to 11270 on dna (-ve strand)
- * and so on for additional ' ; Align x y z' groups
+ * 120 bases from pos 11270 align to pos 143 in peptide
+ * if !mapIsFromCdna this would instead be
+ * Align 143 11270 40
*/
- if (ints.size() != 3)
+ String[] tokens = range.split(" ");
+ if (tokens.length != 3)
{
- throw new InvalidGFF3FieldException(attr, set,
- "Invalid number of fields for this attribute ("
- + ints.size() + ")");
+ throw new IOException("Wrong number of fields for Align");
}
- fromrange.add(ints.get(0));
- fromrange.add(ints.get(0) + strand * ints.get(2));
- // how are intron/exon boundaries that do not align in codons
- // represented
- if (ints.get(1).intValue() == lastppos && lastpframe > 0)
+ int fromStart = 0;
+ int toStart = 0;
+ int fromCount = 0;
+ try
{
- // extend existing to map
- lastppos += ints.get(2) / 3;
- lastpframe = ints.get(2) % 3;
- torange.set(torange.size() - 1, new Integer(lastppos));
- }
- else
+ fromStart = Integer.parseInt(tokens[0]);
+ toStart = Integer.parseInt(tokens[1]);
+ fromCount = Integer.parseInt(tokens[2]);
+ } catch (NumberFormatException nfe)
{
- // new to map range
- torange.add(ints.get(1));
- lastppos = ints.get(1) + ints.get(2) / 3;
- lastpframe = ints.get(2) % 3;
- torange.add(new Integer(lastppos));
+ throw new IOException("Invalid number in Align field: "
+ + nfe.getMessage());
}
- }
- // from and to ranges must end up being a series of start/end intervals
- if (fromrange.size() % 2 == 1)
- {
- throw new InvalidGFF3FieldException(attr, set,
- "Couldn't parse the DNA alignment range correctly");
- }
- if (torange.size() % 2 == 1)
- {
- throw new InvalidGFF3FieldException(attr, set,
- "Couldn't parse the protein alignment range correctly");
- }
- // finally, build the map
- int[] frommap = new int[fromrange.size()], tomap = new int[torange
- .size()];
- int p = 0;
- for (Integer ip : fromrange)
- {
- frommap[p++] = ip.intValue();
- }
- p = 0;
- for (Integer ip : torange)
- {
- tomap[p++] = ip.intValue();
- }
-
- return new MapList(frommap, tomap, 3, 1);
- }
- private List<SequenceI> findNames(AlignmentI align, List<SequenceI> newseqs, boolean relaxedIdMatching,
- List<String> list)
- {
- List<SequenceI> found = new ArrayList<SequenceI>();
- for (String seqId : list)
- {
- SequenceI seq = findName(align, seqId, relaxedIdMatching, newseqs);
- if (seq != null)
+ /*
+ * Jalview always models from dna to protein, so adjust values if the
+ * GFF mapping is from protein to dna
+ */
+ if (!mapIsFromCdna)
{
- found.add(seq);
+ fromCount *= 3;
+ int temp = fromStart;
+ fromStart = toStart;
+ toStart = temp;
}
+ fromRanges[fromRangesIndex++] = fromStart;
+ fromRanges[fromRangesIndex++] = fromStart + strand * (fromCount - 1);
+
+ /*
+ * If a codon has an intron gap, there will be contiguous 'toRanges';
+ * this is handled for us by the MapList constructor.
+ * (It is not clear that exonerate ever generates this case)
+ */
+ toRanges[toRangesIndex++] = toStart;
+ toRanges[toRangesIndex++] = toStart + (fromCount - 1) / 3;
}
- return found;
+
+ return new MapList(fromRanges, toRanges, 3, 1);
}
/**
*
* @param st
* @param alignment
- * @param relaxedIdmatching
+ * @param relaxedIdMatching
* @param newseqs
* @return
*/
- protected SequenceI parseGffFeature(StringTokenizer st, AlignmentI alignment, boolean relaxedIdmatching,
+ protected SequenceI parseGffFeature(StringTokenizer st,
+ AlignmentI alignment, boolean relaxedIdMatching,
List<SequenceI> newseqs)
{
SequenceI seq;
/*
* GFF: seqid source type start end score strand phase [attributes]
*/
+ if (st.countTokens() < 8)
+ {
+ System.err
+ .println("Ignoring GFF feature line with unexpected number of columns ("
+ + st.countTokens() + ")");
+ return null;
+ }
String seqId = st.nextToken();
/*
* locate referenced sequence in alignment _or_
* as a forward reference (SequenceDummy)
*/
- seq = findName(alignment, seqId, relaxedIdmatching, newseqs);
+ seq = findName(alignment, newseqs, relaxedIdMatching, seqId);
String desc = st.nextToken();
String group = null;
if (st.hasMoreTokens())
{
- String attributes = st.nextToken();
- sf.setValue(ATTRIBUTES, attributes);
-
- /*
- * parse semi-structured attributes in column 9 and add them to the
- * sequence feature's 'otherData' table; use Note as a best proxy for
- * description
- */
- Map<String, List<String>> nameValues = StringUtils.parseNameValuePairs(attributes, ";",
- new char[] { ' ', '=' });
- for (Entry<String, List<String>> attr : nameValues.entrySet())
- {
- String values = StringUtils.listToDelimitedString(attr.getValue(),
- "; ");
- sf.setValue(attr.getKey(), values);
- if ("Note".equals(attr.getKey()))
- {
- sf.setDescription(values);
- }
- }
+ processGffColumnNine(st.nextToken(), sf);
}
if (processOrAddSeqFeature(alignment, newseqs, seq, sf,
- relaxedIdmatching))
+ relaxedIdMatching))
{
// check whether we should add the sequence feature to any other
// sequences in the alignment with the same or similar
}
/**
+ * Process the 'column 9' data of the GFF file. This is less formally defined,
+ * and its interpretation will vary depending on the tool that has generated
+ * it.
+ *
+ * @param attributes
+ * @param sf
+ */
+ protected void processGffColumnNine(String attributes, SequenceFeature sf)
+ {
+ sf.setValue(ATTRIBUTES, attributes);
+
+ /*
+ * Parse attributes in column 9 and add them to the sequence feature's
+ * 'otherData' table; use Note as a best proxy for description
+ */
+ char[] nameValueSeparator = new char[] { gffVersion == 3 ? '=' : ' ' };
+ Map<String, List<String>> nameValues = StringUtils.parseNameValuePairs(attributes, ";",
+ nameValueSeparator);
+ for (Entry<String, List<String>> attr : nameValues.entrySet())
+ {
+ String values = StringUtils.listToDelimitedString(attr.getValue(),
+ "; ");
+ sf.setValue(attr.getKey(), values);
+ if (NOTE.equals(attr.getKey()))
+ {
+ sf.setDescription(values);
+ }
+ }
+ }
+
+ /**
* After encountering ##fasta in a GFF3 file, process the remainder of the
* file as FAST sequence data. Any placeholder sequences created during
* feature parsing are updated with the actual sequences.
}
/**
- * Processes the 'Query' and 'Align' properties associated with a GFF
- * similarity feature; these properties define the mapping of the annotated
- * feature to another from which it has transferred annotation
+ * Processes the 'Query' (or 'Target') and 'Align' properties associated with
+ * an exonerate GFF similarity feature; these properties define the mapping of
+ * the annotated feature (e.g. 'exon') to a related sequence.
*
* @param set
* @param seq
* @param sf
- * @return
+ * @param align
+ * @param newseqs
+ * @param relaxedIdMatching
+ * @throws IOException
*/
public void processGffSimilarity(Map<String, List<String>> set, SequenceI seq,
SequenceFeature sf, AlignmentI align, List<SequenceI> newseqs, boolean relaxedIdMatching)
- throws InvalidGFF3FieldException
+ throws IOException
{
+ if (!validateExonerateModel(sf))
+ {
+ return;
+ }
+
int strand = sf.getStrand();
- // exonerate cdna/protein map
- // look for fields
- List<SequenceI> querySeq = findNames(align, newseqs, relaxedIdMatching,
- set.get("Query"));
- if (querySeq == null || querySeq.size() != 1)
- {
- throw new InvalidGFF3FieldException("Query", set,
- "Expecting exactly one sequence in Query field (got "
- + set.get("Query") + ")");
+
+ /*
+ * exonerate (protein2dna or protein2genome) may be run with
+ * --showquerygff outputs
+ * Target <dnaseqid> ; Align proteinStartPos dnaStartPos peptideCount
+ * --showtargetgff outputs
+ * Query <proteinseqid> ; Align dnaStartPos proteinStartPos nucleotideCount
+ * where the Align spec may repeat
+ */
+ boolean mapIsFromCdna = true;
+ List<String> mapTo = set.get(QUERY);
+ if (mapTo == null)
+ {
+ mapTo = set.get(TARGET);
+ mapIsFromCdna = false;
}
- if (set.containsKey("Align"))
+ if (mapTo == null || mapTo.size() != 1)
{
- // process the align maps and create cdna/protein maps
- // ideally, the query sequences are in the alignment, but maybe not...
-
- AlignedCodonFrame alco = new AlignedCodonFrame();
- MapList codonmapping = constructCodonMappingFromAlign(set, "Align",
- strand);
-
- // add codon mapping, and hope!
- alco.addMap(seq, querySeq.get(0), codonmapping);
- align.addCodonFrame(alco);
+ throw new IOException(
+ "Expecting exactly one sequence in Query field (got " + mapTo
+ + ")");
}
+
+ /*
+ * locate the mapped sequence in the alignment or 'new' (GFF file) sequences;
+ */
+ SequenceI mappedSequence = findName(align, newseqs, relaxedIdMatching,
+ mapTo.get(0));
+ /*
+ * Process the Align maps and create cdna/protein maps;
+ * ideally, the query sequences are in the alignment, but maybe not...
+ */
+ AlignedCodonFrame alco = new AlignedCodonFrame();
+ MapList codonmapping = constructCodonMappingFromAlign(set.get(ALIGN),
+ mapIsFromCdna, strand);
+ /*
+ * Jalview always maps from dna to protein
+ */
+ if (mapIsFromCdna)
+ {
+ alco.addMap(seq, mappedSequence, codonmapping);
+ }
+ else
+ {
+ alco.addMap(mappedSequence, seq, codonmapping);
+ }
+ align.addCodonFrame(alco);
+ }
+
+ /**
+ * Returns true if the exonerate model (saved from column 2 of the GFF as the
+ * SequenceFeature's group) is one that we are willing to process, else false
+ *
+ * @param sf
+ */
+ protected boolean validateExonerateModel(SequenceFeature sf)
+ {
+ /*
+ * we don't handle protein-to-protein or dna-to-dna alignment here
+ */
+ String source = sf.getFeatureGroup();
+ if (source == null
+ || (!source.contains("protein2dna") && !source
+ .contains("protein2genome")))
+ {
+ System.err
+ .println("I only accept protein2dna or protein2genome but found "
+ + source);
+ return false;
+ }
+ return true;
}
/**
Map<String, List<String>> set = StringUtils.parseNameValuePairs(
attset, ";", new char[] { ' ', '-' });
- if ("similarity".equals(sf.getType()))
+ if (SIMILARITY.equals(sf.getType()))
{
try
{
+ addFeature = false;
processGffSimilarity(set, seq, sf, align, newseqs,
relaxedIdMatching);
- addFeature = false;
- } catch (InvalidGFF3FieldException ivfe)
+ } catch (IOException ivfe)
{
System.err.println(ivfe);
}
}
}
-
-class InvalidGFF3FieldException extends Exception
-{
- String field, value;
-
- public InvalidGFF3FieldException(String field,
- Map<String, List<String>> set, String message)
- {
- super(message + " (Field was " + field + " and value was "
- + set.get(field).toString());
- this.field = field;
- this.value = set.get(field).toString();
- }
-}
import static org.testng.AssertJUnit.assertFalse;
import static org.testng.AssertJUnit.assertNotNull;
import static org.testng.AssertJUnit.assertNull;
+import static org.testng.AssertJUnit.assertSame;
import static org.testng.AssertJUnit.assertTrue;
+import static org.testng.internal.junit.ArrayAsserts.assertArrayEquals;
+import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.Alignment;
import jalview.datamodel.AlignmentI;
+import jalview.datamodel.Mapping;
import jalview.datamodel.SequenceDummy;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import java.awt.Color;
import java.io.File;
import java.io.IOException;
+import java.util.Iterator;
import java.util.Map;
+import java.util.Set;
import org.testng.annotations.Test;
public class FeaturesFileTest
{
- private static String exonerateSeqs = "examples/testdata/exonerateseqs.fa",
- exonerateOutput = "examples/testdata/exonerateoutput.gff",
- simpleGffFile = "examples/testdata/simpleGff3.gff";
+ private static String simpleGffFile = "examples/testdata/simpleGff3.gff";
@Test(groups = { "Functional" })
public void testParse() throws Exception
AlignFrame af = new AlignFrame(al, 500, 500);
Map<String, Object> colours = af.getFeatureRenderer()
.getFeatureColours();
+ // GFF2 uses space as name/value separator in column 9
String gffData = "METAL\tcc9900\n" + "GFF\n"
- + "FER_CAPAA\tuniprot\tMETAL\t44\t45\t4.0\t.\t.\n"
+ + "FER_CAPAA\tuniprot\tMETAL\t44\t45\t4.0\t.\t.\tNote Iron-sulfur; Note 2Fe-2S\n"
+ "FER1_SOLLC\tuniprot\tPfam\t55\t130\t2.0\t.\t.";
FeaturesFile featuresFile = new FeaturesFile(gffData,
FormatAdapter.PASTE);
.getSequenceFeatures();
assertEquals(1, sfs.length);
SequenceFeature sf = sfs[0];
- assertEquals("uniprot", sf.description);
+ assertEquals("Iron-sulfur; 2Fe-2S", sf.description);
assertEquals(44, sf.begin);
assertEquals(45, sf.end);
assertEquals("uniprot", sf.featureGroup);
* @throws Exception
*/
@Test(groups = { "Functional" })
- public void testParse_pureGff() throws Exception
+ public void testParse_pureGff3() throws Exception
{
File f = new File("examples/uniref50.fa");
AlignmentI al = readAlignmentFile(f);
AlignFrame af = new AlignFrame(al, 500, 500);
Map<String, Object> colours = af.getFeatureRenderer()
.getFeatureColours();
- String gffData = "##gff-version 2\n"
+ // GFF3 uses '=' separator for name/value pairs in colum 9
+ String gffData = "##gff-version 3\n"
+ "FER_CAPAA\tuniprot\tMETAL\t39\t39\t0.0\t.\t.\t"
+ "Note=Iron-sulfur (2Fe-2S);Note=another note;evidence=ECO:0000255|PROSITE-ProRule:PRU00465\n"
+ "FER1_SOLLC\tuniprot\tPfam\t55\t130\t3.0\t.\t.";
}
@Test(groups = { "Functional" })
- public void simpleGff3FileIdentify()
- {
- assertEquals("Didn't recognise file correctly.",
- IdentifyFile.FeaturesFile,
- new IdentifyFile().identify(simpleGffFile, FormatAdapter.FILE));
- }
-
- @Test(groups = { "Functional" })
public void simpleGff3FileLoader() throws IOException
{
AlignFrame af = new FileLoader(false).LoadFileWaitTillLoaded(
checkDatasetfromSimpleGff3(dataset);
}
+ /**
+ * Tests loading exonerate GFF2 output, including 'similarity' alignment
+ * feature, on to sequences
+ */
@Test(groups = { "Functional" })
public void testExonerateImport()
{
- // exonerate does not tag sequences after features, so we have a more
- // conventional annotation import test here
-
FileLoader loader = new FileLoader(false);
-
- AlignFrame af = loader.LoadFileWaitTillLoaded(exonerateSeqs,
+ AlignFrame af = loader.LoadFileWaitTillLoaded(
+ "examples/testdata/exonerateseqs.fa",
FormatAdapter.FILE);
- assertEquals("Unexpected number of DNA protein associations", 0, af
- .getViewport().getAlignment().getCodonFrames().size());
+ af.loadJalviewDataFile("examples/testdata/exonerateoutput.gff",
+ FormatAdapter.FILE, null, null);
- af.loadJalviewDataFile(exonerateOutput, FormatAdapter.FILE, null, null);
-
- assertTrue("Expected at least one DNA protein association", 0 != af
- .getViewport().getAlignment().getDataset().getCodonFrames()
- .size());
+ /*
+ * verify one mapping to a dummy sequence, one to a real one
+ */
+ Set<AlignedCodonFrame> mappings = af
+ .getViewport().getAlignment().getDataset().getCodonFrames();
+ assertEquals(2, mappings.size());
+ Iterator<AlignedCodonFrame> iter = mappings.iterator();
+
+ // first mapping is to dummy sequence
+ AlignedCodonFrame mapping = iter.next();
+ Mapping[] mapList = mapping.getProtMappings();
+ assertEquals(1, mapList.length);
+ assertTrue(mapList[0].getTo() instanceof SequenceDummy);
+ assertEquals("DDB_G0269124", mapList[0].getTo().getName());
+
+ // second mapping is to a sequence in the alignment
+ mapping = iter.next();
+ mapList = mapping.getProtMappings();
+ assertEquals(1, mapList.length);
+ SequenceI proteinSeq = af.getViewport().getAlignment()
+ .findName("DDB_G0280897");
+ assertSame(proteinSeq.getDatasetSequence(), mapList[0].getTo());
+ assertEquals(1, mapping.getdnaToProt().length);
+
+ // 143 in protein should map to codon [11270, 11269, 11268] in dna
+ int[] mappedRegion = mapList[0].getMap().locateInFrom(143, 143);
+ assertArrayEquals(new int[] { 11270, 11268 }, mappedRegion);
+
+ // 182 in protein should map to codon [11153, 11152, 11151] in dna
+ mappedRegion = mapList[0].getMap().locateInFrom(182, 182);
+ assertArrayEquals(new int[] { 11153, 11151 }, mappedRegion);
+
+ // and the reverse mapping:
+ mappedRegion = mapList[0].getMap().locateInTo(11151, 11153);
+ assertArrayEquals(new int[] { 182, 182 }, mappedRegion);
+
+ // 11150 in dna should _not_ map to protein
+ mappedRegion = mapList[0].getMap().locateInTo(11150, 11150);
+ assertNull(mappedRegion);
+
+ // similarly 183 in protein should _not_ map to dna
+ mappedRegion = mapList[0].getMap().locateInFrom(183, 183);
+ assertNull(mappedRegion);
}
}