From 0900cfda915f917ce29ced5401b7118ff2a5372a Mon Sep 17 00:00:00 2001 From: gmungoc Date: Mon, 30 Nov 2015 09:14:34 +0000 Subject: [PATCH] JAL-652 further refactoring / tests --- src/jalview/io/FeaturesFile.java | 386 ++++++++++++++++++--------------- test/jalview/io/FeaturesFileTest.java | 91 +++++--- 2 files changed, 278 insertions(+), 199 deletions(-) diff --git a/src/jalview/io/FeaturesFile.java b/src/jalview/io/FeaturesFile.java index ee6ba11..bd7127f 100755 --- a/src/jalview/io/FeaturesFile.java +++ b/src/jalview/io/FeaturesFile.java @@ -65,6 +65,16 @@ import java.util.StringTokenizer; */ public class FeaturesFile extends AlignFile { + private static final String NOTE = "Note"; + + private static final String ALIGN = "Align"; + + private static final String QUERY = "Query"; + + private static final String TARGET = "Target"; + + private static final String SIMILARITY = "similarity"; + protected static final String STRAND = "STRAND"; protected static final String FRAME = "FRAME"; @@ -284,26 +294,25 @@ public class FeaturesFile extends AlignFile */ protected boolean parseJalviewFeature(String line, StringTokenizer st, AlignmentI alignment, Map featureColours, - boolean removeHTML, boolean relaxedIdmatching, String featureGroup) + boolean removeHTML, boolean relaxedIdMatching, String featureGroup) { /* * Jalview: description seqid seqIndex start end type [score] */ - String desc = st.nextToken(); - String seqId = st.nextToken(); - SequenceI seq = findName(alignment, seqId, relaxedIdmatching, null); - if (!st.hasMoreTokens()) + if (st.countTokens() < 6) { - System.err - .println("DEBUG: Run out of tokens when trying to identify the destination for the feature.. giving up."); - // in all probability, this isn't a file we understand, so bail - // quietly. + System.err.println("Ignoring feature line '" + line + + "' with unexpected number of columns (" + st.countTokens() + + ")"); return false; } + String desc = st.nextToken(); + String seqId = st.nextToken(); + SequenceI seq = findName(alignment, null, relaxedIdMatching, seqId); if (!seqId.equals("ID_NOT_SPECIFIED")) { - seq = findName(alignment, seqId, relaxedIdmatching, null); + seq = findName(alignment, null, relaxedIdMatching, seqId); st.nextToken(); } else @@ -599,23 +608,24 @@ public class FeaturesFile extends AlignFile /** * Returns a sequence matching the given id, as follows * * * @param align - * @param seqId - * @param relaxedIdMatching * @param newseqs + * @param relaxedIdMatching + * @param seqId * @return */ - protected SequenceI findName(AlignmentI align, String seqId, - boolean relaxedIdMatching, List newseqs) + protected SequenceI findName(AlignmentI align, List newseqs, + boolean relaxedIdMatching, String seqId) { SequenceI match = null; if (relaxedIdMatching) @@ -1075,118 +1085,88 @@ public class FeaturesFile extends AlignFile } /** - * Helper method to make a mapping given a set of attributes for a GFF feature + * Returns a mapping given list of one or more Align descriptors (exonerate + * format) * - * @param set - * @param attr + * @param alignedRegions + * a list of "Align fromStart toStart fromCount" + * @param mapIsFromCdna + * if true, 'from' is dna, else 'from' is protein * @param strand * either 1 (forward) or -1 (reverse) * @return - * @throws InvalidGFF3FieldException + * @throws IOException */ protected MapList constructCodonMappingFromAlign( - Map> set, String attr, - int strand) throws InvalidGFF3FieldException + List alignedRegions, boolean mapIsFromCdna, int strand) + throws IOException { if (strand == 0) { - throw new InvalidGFF3FieldException(attr, set, + throw new IOException( "Invalid strand for a codon mapping (cannot be 0)"); } - List fromrange = new ArrayList(); - List torange = new ArrayList(); - int lastppos = 0, lastpframe = 0; - for (String range : set.get(attr)) - { - List ints = new ArrayList(); - StringTokenizer st = new StringTokenizer(range, " "); - while (st.hasMoreTokens()) - { - String num = st.nextToken(); - try - { - ints.add(new Integer(num)); - } catch (NumberFormatException nfe) - { - throw new InvalidGFF3FieldException(attr, set, - "Invalid number in field " + num); - } - } + int regions = alignedRegions.size(); + // arrays to hold [start, end] for each aligned region + int[] fromRanges = new int[regions * 2]; // from dna + int[] toRanges = new int[regions * 2]; // to protein + int fromRangesIndex = 0; + int toRangesIndex = 0; + + for (String range : alignedRegions) + { /* - * Align positionInRef positionInQuery LengthInRef - * contig_1146 exonerate:p2g:local similarity 8534 11269 3652 - . - * alignment_id 0 ; Query DDB_G0269124 Align 11270 143 120 + * Align mapFromStart mapToStart mapFromCount + * e.g. if mapIsFromCdna + * Align 11270 143 120 * means: - * 120 bases align at pos 143 in protein to 11270 on dna (-ve strand) - * and so on for additional ' ; Align x y z' groups + * 120 bases from pos 11270 align to pos 143 in peptide + * if !mapIsFromCdna this would instead be + * Align 143 11270 40 */ - if (ints.size() != 3) + String[] tokens = range.split(" "); + if (tokens.length != 3) { - throw new InvalidGFF3FieldException(attr, set, - "Invalid number of fields for this attribute (" - + ints.size() + ")"); + throw new IOException("Wrong number of fields for Align"); } - fromrange.add(ints.get(0)); - fromrange.add(ints.get(0) + strand * ints.get(2)); - // how are intron/exon boundaries that do not align in codons - // represented - if (ints.get(1).intValue() == lastppos && lastpframe > 0) + int fromStart = 0; + int toStart = 0; + int fromCount = 0; + try { - // extend existing to map - lastppos += ints.get(2) / 3; - lastpframe = ints.get(2) % 3; - torange.set(torange.size() - 1, new Integer(lastppos)); - } - else + fromStart = Integer.parseInt(tokens[0]); + toStart = Integer.parseInt(tokens[1]); + fromCount = Integer.parseInt(tokens[2]); + } catch (NumberFormatException nfe) { - // new to map range - torange.add(ints.get(1)); - lastppos = ints.get(1) + ints.get(2) / 3; - lastpframe = ints.get(2) % 3; - torange.add(new Integer(lastppos)); + throw new IOException("Invalid number in Align field: " + + nfe.getMessage()); } - } - // from and to ranges must end up being a series of start/end intervals - if (fromrange.size() % 2 == 1) - { - throw new InvalidGFF3FieldException(attr, set, - "Couldn't parse the DNA alignment range correctly"); - } - if (torange.size() % 2 == 1) - { - throw new InvalidGFF3FieldException(attr, set, - "Couldn't parse the protein alignment range correctly"); - } - // finally, build the map - int[] frommap = new int[fromrange.size()], tomap = new int[torange - .size()]; - int p = 0; - for (Integer ip : fromrange) - { - frommap[p++] = ip.intValue(); - } - p = 0; - for (Integer ip : torange) - { - tomap[p++] = ip.intValue(); - } - - return new MapList(frommap, tomap, 3, 1); - } - private List findNames(AlignmentI align, List newseqs, boolean relaxedIdMatching, - List list) - { - List found = new ArrayList(); - for (String seqId : list) - { - SequenceI seq = findName(align, seqId, relaxedIdMatching, newseqs); - if (seq != null) + /* + * Jalview always models from dna to protein, so adjust values if the + * GFF mapping is from protein to dna + */ + if (!mapIsFromCdna) { - found.add(seq); + fromCount *= 3; + int temp = fromStart; + fromStart = toStart; + toStart = temp; } + fromRanges[fromRangesIndex++] = fromStart; + fromRanges[fromRangesIndex++] = fromStart + strand * (fromCount - 1); + + /* + * If a codon has an intron gap, there will be contiguous 'toRanges'; + * this is handled for us by the MapList constructor. + * (It is not clear that exonerate ever generates this case) + */ + toRanges[toRangesIndex++] = toStart; + toRanges[toRangesIndex++] = toStart + (fromCount - 1) / 3; } - return found; + + return new MapList(fromRanges, toRanges, 3, 1); } /** @@ -1195,24 +1175,32 @@ public class FeaturesFile extends AlignFile * * @param st * @param alignment - * @param relaxedIdmatching + * @param relaxedIdMatching * @param newseqs * @return */ - protected SequenceI parseGffFeature(StringTokenizer st, AlignmentI alignment, boolean relaxedIdmatching, + protected SequenceI parseGffFeature(StringTokenizer st, + AlignmentI alignment, boolean relaxedIdMatching, List newseqs) { SequenceI seq; /* * GFF: seqid source type start end score strand phase [attributes] */ + if (st.countTokens() < 8) + { + System.err + .println("Ignoring GFF feature line with unexpected number of columns (" + + st.countTokens() + ")"); + return null; + } String seqId = st.nextToken(); /* * locate referenced sequence in alignment _or_ * as a forward reference (SequenceDummy) */ - seq = findName(alignment, seqId, relaxedIdmatching, newseqs); + seq = findName(alignment, newseqs, relaxedIdMatching, seqId); String desc = st.nextToken(); String group = null; @@ -1253,30 +1241,11 @@ public class FeaturesFile extends AlignFile if (st.hasMoreTokens()) { - String attributes = st.nextToken(); - sf.setValue(ATTRIBUTES, attributes); - - /* - * parse semi-structured attributes in column 9 and add them to the - * sequence feature's 'otherData' table; use Note as a best proxy for - * description - */ - Map> nameValues = StringUtils.parseNameValuePairs(attributes, ";", - new char[] { ' ', '=' }); - for (Entry> attr : nameValues.entrySet()) - { - String values = StringUtils.listToDelimitedString(attr.getValue(), - "; "); - sf.setValue(attr.getKey(), values); - if ("Note".equals(attr.getKey())) - { - sf.setDescription(values); - } - } + processGffColumnNine(st.nextToken(), sf); } if (processOrAddSeqFeature(alignment, newseqs, seq, sf, - relaxedIdmatching)) + relaxedIdMatching)) { // check whether we should add the sequence feature to any other // sequences in the alignment with the same or similar @@ -1289,6 +1258,37 @@ public class FeaturesFile extends AlignFile } /** + * Process the 'column 9' data of the GFF file. This is less formally defined, + * and its interpretation will vary depending on the tool that has generated + * it. + * + * @param attributes + * @param sf + */ + protected void processGffColumnNine(String attributes, SequenceFeature sf) + { + sf.setValue(ATTRIBUTES, attributes); + + /* + * Parse attributes in column 9 and add them to the sequence feature's + * 'otherData' table; use Note as a best proxy for description + */ + char[] nameValueSeparator = new char[] { gffVersion == 3 ? '=' : ' ' }; + Map> nameValues = StringUtils.parseNameValuePairs(attributes, ";", + nameValueSeparator); + for (Entry> attr : nameValues.entrySet()) + { + String values = StringUtils.listToDelimitedString(attr.getValue(), + "; "); + sf.setValue(attr.getKey(), values); + if (NOTE.equals(attr.getKey())) + { + sf.setDescription(values); + } + } + } + + /** * After encountering ##fasta in a GFF3 file, process the remainder of the * file as FAST sequence data. Any placeholder sequences created during * feature parsing are updated with the actual sequences. @@ -1415,44 +1415,100 @@ public class FeaturesFile extends AlignFile } /** - * Processes the 'Query' and 'Align' properties associated with a GFF - * similarity feature; these properties define the mapping of the annotated - * feature to another from which it has transferred annotation + * Processes the 'Query' (or 'Target') and 'Align' properties associated with + * an exonerate GFF similarity feature; these properties define the mapping of + * the annotated feature (e.g. 'exon') to a related sequence. * * @param set * @param seq * @param sf - * @return + * @param align + * @param newseqs + * @param relaxedIdMatching + * @throws IOException */ public void processGffSimilarity(Map> set, SequenceI seq, SequenceFeature sf, AlignmentI align, List newseqs, boolean relaxedIdMatching) - throws InvalidGFF3FieldException + throws IOException { + if (!validateExonerateModel(sf)) + { + return; + } + int strand = sf.getStrand(); - // exonerate cdna/protein map - // look for fields - List querySeq = findNames(align, newseqs, relaxedIdMatching, - set.get("Query")); - if (querySeq == null || querySeq.size() != 1) - { - throw new InvalidGFF3FieldException("Query", set, - "Expecting exactly one sequence in Query field (got " - + set.get("Query") + ")"); + + /* + * exonerate (protein2dna or protein2genome) may be run with + * --showquerygff outputs + * Target ; Align proteinStartPos dnaStartPos peptideCount + * --showtargetgff outputs + * Query ; Align dnaStartPos proteinStartPos nucleotideCount + * where the Align spec may repeat + */ + boolean mapIsFromCdna = true; + List mapTo = set.get(QUERY); + if (mapTo == null) + { + mapTo = set.get(TARGET); + mapIsFromCdna = false; } - if (set.containsKey("Align")) + if (mapTo == null || mapTo.size() != 1) { - // process the align maps and create cdna/protein maps - // ideally, the query sequences are in the alignment, but maybe not... - - AlignedCodonFrame alco = new AlignedCodonFrame(); - MapList codonmapping = constructCodonMappingFromAlign(set, "Align", - strand); - - // add codon mapping, and hope! - alco.addMap(seq, querySeq.get(0), codonmapping); - align.addCodonFrame(alco); + throw new IOException( + "Expecting exactly one sequence in Query field (got " + mapTo + + ")"); } + + /* + * locate the mapped sequence in the alignment or 'new' (GFF file) sequences; + */ + SequenceI mappedSequence = findName(align, newseqs, relaxedIdMatching, + mapTo.get(0)); + /* + * Process the Align maps and create cdna/protein maps; + * ideally, the query sequences are in the alignment, but maybe not... + */ + AlignedCodonFrame alco = new AlignedCodonFrame(); + MapList codonmapping = constructCodonMappingFromAlign(set.get(ALIGN), + mapIsFromCdna, strand); + /* + * Jalview always maps from dna to protein + */ + if (mapIsFromCdna) + { + alco.addMap(seq, mappedSequence, codonmapping); + } + else + { + alco.addMap(mappedSequence, seq, codonmapping); + } + align.addCodonFrame(alco); + } + + /** + * Returns true if the exonerate model (saved from column 2 of the GFF as the + * SequenceFeature's group) is one that we are willing to process, else false + * + * @param sf + */ + protected boolean validateExonerateModel(SequenceFeature sf) + { + /* + * we don't handle protein-to-protein or dna-to-dna alignment here + */ + String source = sf.getFeatureGroup(); + if (source == null + || (!source.contains("protein2dna") && !source + .contains("protein2genome"))) + { + System.err + .println("I only accept protein2dna or protein2genome but found " + + source); + return false; + } + return true; } /** @@ -1482,14 +1538,14 @@ public class FeaturesFile extends AlignFile Map> set = StringUtils.parseNameValuePairs( attset, ";", new char[] { ' ', '-' }); - if ("similarity".equals(sf.getType())) + if (SIMILARITY.equals(sf.getType())) { try { + addFeature = false; processGffSimilarity(set, seq, sf, align, newseqs, relaxedIdMatching); - addFeature = false; - } catch (InvalidGFF3FieldException ivfe) + } catch (IOException ivfe) { System.err.println(ivfe); } @@ -1504,17 +1560,3 @@ public class FeaturesFile extends AlignFile } } - -class InvalidGFF3FieldException extends Exception -{ - String field, value; - - public InvalidGFF3FieldException(String field, - Map> set, String message) - { - super(message + " (Field was " + field + " and value was " - + set.get(field).toString()); - this.field = field; - this.value = set.get(field).toString(); - } -} diff --git a/test/jalview/io/FeaturesFileTest.java b/test/jalview/io/FeaturesFileTest.java index ed388a5..506ee91 100644 --- a/test/jalview/io/FeaturesFileTest.java +++ b/test/jalview/io/FeaturesFileTest.java @@ -24,10 +24,14 @@ import static org.testng.AssertJUnit.assertEquals; import static org.testng.AssertJUnit.assertFalse; import static org.testng.AssertJUnit.assertNotNull; import static org.testng.AssertJUnit.assertNull; +import static org.testng.AssertJUnit.assertSame; import static org.testng.AssertJUnit.assertTrue; +import static org.testng.internal.junit.ArrayAsserts.assertArrayEquals; +import jalview.datamodel.AlignedCodonFrame; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; +import jalview.datamodel.Mapping; import jalview.datamodel.SequenceDummy; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; @@ -38,16 +42,16 @@ import jalview.schemes.GraduatedColor; import java.awt.Color; import java.io.File; import java.io.IOException; +import java.util.Iterator; import java.util.Map; +import java.util.Set; import org.testng.annotations.Test; public class FeaturesFileTest { - private static String exonerateSeqs = "examples/testdata/exonerateseqs.fa", - exonerateOutput = "examples/testdata/exonerateoutput.gff", - simpleGffFile = "examples/testdata/simpleGff3.gff"; + private static String simpleGffFile = "examples/testdata/simpleGff3.gff"; @Test(groups = { "Functional" }) public void testParse() throws Exception @@ -144,8 +148,9 @@ public class FeaturesFileTest AlignFrame af = new AlignFrame(al, 500, 500); Map colours = af.getFeatureRenderer() .getFeatureColours(); + // GFF2 uses space as name/value separator in column 9 String gffData = "METAL\tcc9900\n" + "GFF\n" - + "FER_CAPAA\tuniprot\tMETAL\t44\t45\t4.0\t.\t.\n" + + "FER_CAPAA\tuniprot\tMETAL\t44\t45\t4.0\t.\t.\tNote Iron-sulfur; Note 2Fe-2S\n" + "FER1_SOLLC\tuniprot\tPfam\t55\t130\t2.0\t.\t."; FeaturesFile featuresFile = new FeaturesFile(gffData, FormatAdapter.PASTE); @@ -162,7 +167,7 @@ public class FeaturesFileTest .getSequenceFeatures(); assertEquals(1, sfs.length); SequenceFeature sf = sfs[0]; - assertEquals("uniprot", sf.description); + assertEquals("Iron-sulfur; 2Fe-2S", sf.description); assertEquals(44, sf.begin); assertEquals(45, sf.end); assertEquals("uniprot", sf.featureGroup); @@ -240,14 +245,15 @@ public class FeaturesFileTest * @throws Exception */ @Test(groups = { "Functional" }) - public void testParse_pureGff() throws Exception + public void testParse_pureGff3() throws Exception { File f = new File("examples/uniref50.fa"); AlignmentI al = readAlignmentFile(f); AlignFrame af = new AlignFrame(al, 500, 500); Map colours = af.getFeatureRenderer() .getFeatureColours(); - String gffData = "##gff-version 2\n" + // GFF3 uses '=' separator for name/value pairs in colum 9 + String gffData = "##gff-version 3\n" + "FER_CAPAA\tuniprot\tMETAL\t39\t39\t0.0\t.\t.\t" + "Note=Iron-sulfur (2Fe-2S);Note=another note;evidence=ECO:0000255|PROSITE-ProRule:PRU00465\n" + "FER1_SOLLC\tuniprot\tPfam\t55\t130\t3.0\t.\t."; @@ -389,14 +395,6 @@ public class FeaturesFileTest } @Test(groups = { "Functional" }) - public void simpleGff3FileIdentify() - { - assertEquals("Didn't recognise file correctly.", - IdentifyFile.FeaturesFile, - new IdentifyFile().identify(simpleGffFile, FormatAdapter.FILE)); - } - - @Test(groups = { "Functional" }) public void simpleGff3FileLoader() throws IOException { AlignFrame af = new FileLoader(false).LoadFileWaitTillLoaded( @@ -421,24 +419,63 @@ public class FeaturesFileTest checkDatasetfromSimpleGff3(dataset); } + /** + * Tests loading exonerate GFF2 output, including 'similarity' alignment + * feature, on to sequences + */ @Test(groups = { "Functional" }) public void testExonerateImport() { - // exonerate does not tag sequences after features, so we have a more - // conventional annotation import test here - FileLoader loader = new FileLoader(false); - - AlignFrame af = loader.LoadFileWaitTillLoaded(exonerateSeqs, + AlignFrame af = loader.LoadFileWaitTillLoaded( + "examples/testdata/exonerateseqs.fa", FormatAdapter.FILE); - assertEquals("Unexpected number of DNA protein associations", 0, af - .getViewport().getAlignment().getCodonFrames().size()); + af.loadJalviewDataFile("examples/testdata/exonerateoutput.gff", + FormatAdapter.FILE, null, null); - af.loadJalviewDataFile(exonerateOutput, FormatAdapter.FILE, null, null); - - assertTrue("Expected at least one DNA protein association", 0 != af - .getViewport().getAlignment().getDataset().getCodonFrames() - .size()); + /* + * verify one mapping to a dummy sequence, one to a real one + */ + Set mappings = af + .getViewport().getAlignment().getDataset().getCodonFrames(); + assertEquals(2, mappings.size()); + Iterator iter = mappings.iterator(); + + // first mapping is to dummy sequence + AlignedCodonFrame mapping = iter.next(); + Mapping[] mapList = mapping.getProtMappings(); + assertEquals(1, mapList.length); + assertTrue(mapList[0].getTo() instanceof SequenceDummy); + assertEquals("DDB_G0269124", mapList[0].getTo().getName()); + + // second mapping is to a sequence in the alignment + mapping = iter.next(); + mapList = mapping.getProtMappings(); + assertEquals(1, mapList.length); + SequenceI proteinSeq = af.getViewport().getAlignment() + .findName("DDB_G0280897"); + assertSame(proteinSeq.getDatasetSequence(), mapList[0].getTo()); + assertEquals(1, mapping.getdnaToProt().length); + + // 143 in protein should map to codon [11270, 11269, 11268] in dna + int[] mappedRegion = mapList[0].getMap().locateInFrom(143, 143); + assertArrayEquals(new int[] { 11270, 11268 }, mappedRegion); + + // 182 in protein should map to codon [11153, 11152, 11151] in dna + mappedRegion = mapList[0].getMap().locateInFrom(182, 182); + assertArrayEquals(new int[] { 11153, 11151 }, mappedRegion); + + // and the reverse mapping: + mappedRegion = mapList[0].getMap().locateInTo(11151, 11153); + assertArrayEquals(new int[] { 182, 182 }, mappedRegion); + + // 11150 in dna should _not_ map to protein + mappedRegion = mapList[0].getMap().locateInTo(11150, 11150); + assertNull(mappedRegion); + + // similarly 183 in protein should _not_ map to dna + mappedRegion = mapList[0].getMap().locateInFrom(183, 183); + assertNull(mappedRegion); } } -- 1.7.10.2