From 8f920d337154e092f5f9056ffde3cdf2735eca43 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Tue, 22 Dec 2015 09:10:07 +0000 Subject: [PATCH] JAL-653 GFF new/refactored helper classes --- examples/testdata/exonerateoutput.gff | 1 + examples/testdata/simpleGff3.gff | 4 +- src/jalview/datamodel/Mapping.java | 1 + src/jalview/datamodel/MappingType.java | 63 +++ src/jalview/io/FeaturesFile.java | 483 +++++++------------- src/jalview/io/gff/ExonerateHelper.java | 348 ++++++++++++++ src/jalview/io/gff/Gff2Helper.java | 51 +++ src/jalview/io/gff/Gff3Helper.java | 361 +++++++++++++++ src/jalview/io/gff/GffHelperBase.java | 396 ++++++++++++++++ src/jalview/io/gff/GffHelperFactory.java | 70 +++ src/jalview/io/gff/GffHelperI.java | 44 ++ src/jalview/io/gff/InterProScanHelper.java | 117 +++++ src/jalview/util/StringUtils.java | 68 --- test/jalview/datamodel/MappingTypeTest.java | 43 ++ test/jalview/io/FeaturesFileTest.java | 69 +-- test/jalview/io/gff/ExonerateHelperTest.java | 295 ++++++++++++ test/jalview/io/gff/Gff3HelperTest.java | 206 +++++++++ test/jalview/io/gff/GffHelperBaseTest.java | 168 +++++++ test/jalview/io/gff/GffHelperFactoryTest.java | 72 +++ .../{ExonerateGffTest.java => gff/GffTests.java} | 22 +- test/jalview/io/gff/InterProScanHelperTest.java | 71 +++ test/jalview/util/StringUtilsTest.java | 41 -- 22 files changed, 2495 insertions(+), 499 deletions(-) create mode 100644 src/jalview/datamodel/MappingType.java create mode 100644 src/jalview/io/gff/ExonerateHelper.java create mode 100644 src/jalview/io/gff/Gff2Helper.java create mode 100644 src/jalview/io/gff/Gff3Helper.java create mode 100644 src/jalview/io/gff/GffHelperBase.java create mode 100644 src/jalview/io/gff/GffHelperFactory.java create mode 100644 src/jalview/io/gff/GffHelperI.java create mode 100644 src/jalview/io/gff/InterProScanHelper.java create mode 100644 test/jalview/datamodel/MappingTypeTest.java create mode 100644 test/jalview/io/gff/ExonerateHelperTest.java create mode 100644 
test/jalview/io/gff/Gff3HelperTest.java create mode 100644 test/jalview/io/gff/GffHelperBaseTest.java create mode 100644 test/jalview/io/gff/GffHelperFactoryTest.java rename test/jalview/io/{ExonerateGffTest.java => gff/GffTests.java} (86%) create mode 100644 test/jalview/io/gff/InterProScanHelperTest.java diff --git a/examples/testdata/exonerateoutput.gff b/examples/testdata/exonerateoutput.gff index bf3349f..d3b5f9b 100644 --- a/examples/testdata/exonerateoutput.gff +++ b/examples/testdata/exonerateoutput.gff @@ -13,6 +13,7 @@ contig_1146 exonerate:protein2genome:local gene 8534 11269 3652 - . gene_id 0 ; sequence DDB_G0269124 ; gene_orientation . contig_1146 exonerate:protein2genome:local cds 8534 11269 . - . contig_1146 exonerate:protein2genome:local exon 8534 11269 . - . insertions 3 ; deletions 6 +#TODO need to understand why GFF features is from 11269 but Align is from 11270 contig_1146 exonerate:protein2genome:local similarity 8534 11269 3652 - . alignment_id 0 ; Query DDB_G0269124 ; Align 11270 143 120 ; Align 11150 187 282 ; Align 10865 281 888 ; Align 9977 578 1068 ; Align 8909 935 375 # and a made-up alignment to a sequence in exonerateseqs.fa contig_1146 exonerate:protein2genome:local similarity 8534 11269 3652 - . alignment_id 0 ; Query DDB_G0280897 ; Align 11270 143 120 diff --git a/examples/testdata/simpleGff3.gff b/examples/testdata/simpleGff3.gff index 0d85293..d363bae 100644 --- a/examples/testdata/simpleGff3.gff +++ b/examples/testdata/simpleGff3.gff @@ -6,13 +6,15 @@ ##date 2015-01-16 ##type DNA # +# exonerate run with --showtargetgff generates 'features on the target' i.e. mappings to the query # tab-delimited # seqname source feature start end score strand frame attributes # seq1 exonerate:protein2genome:local gene 8 11 3652 - . gene_id 0 ; sequence seq2 ; gene_orientation . seq1 exonerate:protein2genome:local cds 9 11 . - . seq1 exonerate:protein2genome:local exon 9 11 . - . 
insertions 3 ; deletions 6 -seq1 exonerate:protein2genome:local similarity 8 11 3652 - . alignment_id 0 ; Query seq2 ; Align 11 1 3 +#seq1 exonerate:protein2genome:local similarity 8 11 3652 - . alignment_id 0 ; Query seq2 ; Align 11 1 3 +seq1 exonerate:protein2genome:local similarity 9 11 3652 - . alignment_id 0 ; Query seq2 ; Align 11 1 3 # # appending FASTA sequences is strictly a GFF3 format feature # but Jalview is able to handle this mixture of GFF2 / GFF3 :-) diff --git a/src/jalview/datamodel/Mapping.java b/src/jalview/datamodel/Mapping.java index 6c619ce..eb594be 100644 --- a/src/jalview/datamodel/Mapping.java +++ b/src/jalview/datamodel/Mapping.java @@ -693,6 +693,7 @@ public class Mapping * * @see java.lang.Object#finalize() */ + @Override protected void finalize() throws Throwable { map = null; diff --git a/src/jalview/datamodel/MappingType.java b/src/jalview/datamodel/MappingType.java new file mode 100644 index 0000000..c0c69aa --- /dev/null +++ b/src/jalview/datamodel/MappingType.java @@ -0,0 +1,63 @@ +package jalview.datamodel; + +/** + * An enumeration of the kinds of mapping (from nucleotide or peptide, to + * nucleotide or peptide), and the corresponding word lengths + */ +public enum MappingType +{ + NucleotideToPeptide(3, 1) + { + @Override + public MappingType getInverse() + { + return PeptideToNucleotide; + } + }, + PeptideToNucleotide(1, 3) + { + @Override + public MappingType getInverse() + { + return NucleotideToPeptide; + } + }, + NucleotideToNucleotide(1, 1) + { + @Override + public MappingType getInverse() + { + return NucleotideToNucleotide; + } + }, + PeptideToPeptide(1, 1) + { + @Override + public MappingType getInverse() + { + return PeptideToPeptide; + } + }; + + private int fromRatio; + + private int toRatio; + + private MappingType(int fromSize, int toSize) + { + fromRatio = fromSize; + toRatio = toSize; + } + + public abstract MappingType getInverse(); + + public int getFromRatio() + { + return fromRatio; + } + + public int 
getToRatio() + { + return toRatio; + } +} diff --git a/src/jalview/io/FeaturesFile.java b/src/jalview/io/FeaturesFile.java index bd7127f..22b0601 100755 --- a/src/jalview/io/FeaturesFile.java +++ b/src/jalview/io/FeaturesFile.java @@ -20,14 +20,19 @@ */ package jalview.io; +import jalview.analysis.AlignmentUtils; import jalview.analysis.SequenceIdMatcher; import jalview.api.AlignViewportI; +import jalview.api.FeaturesSourceI; import jalview.datamodel.AlignedCodonFrame; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; import jalview.datamodel.SequenceDummy; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; +import jalview.io.gff.GffHelperBase; +import jalview.io.gff.GffHelperFactory; +import jalview.io.gff.GffHelperI; import jalview.schemes.AnnotationColourGradient; import jalview.schemes.GraduatedColor; import jalview.schemes.UserColourScheme; @@ -63,24 +68,14 @@ import java.util.StringTokenizer; * @author jbprocter * @author gmcarstairs */ -public class FeaturesFile extends AlignFile +public class FeaturesFile extends AlignFile implements FeaturesSourceI { - private static final String NOTE = "Note"; - - private static final String ALIGN = "Align"; - - private static final String QUERY = "Query"; - - private static final String TARGET = "Target"; - - private static final String SIMILARITY = "similarity"; + private static final String ID_NOT_SPECIFIED = "ID_NOT_SPECIFIED"; - protected static final String STRAND = "STRAND"; + private static final String NOTE = "Note"; protected static final String FRAME = "FRAME"; - protected static final String ATTRIBUTES = "ATTRIBUTES"; - protected static final String TAB = "\t"; protected static final String GFF_VERSION = "##gff-version"; @@ -196,7 +191,7 @@ public class FeaturesFile extends AlignFile String line = null; try { - StringTokenizer st; + String[] gffColumns; String featureGroup = null; while ((line = nextLine()) != null) @@ -211,41 +206,40 @@ public class 
FeaturesFile extends AlignFile continue; } - st = new StringTokenizer(line, TAB); - if (st.countTokens() == 1) + gffColumns = line.split("\\t"); // tab as regex + if (gffColumns.length == 1) { if (line.trim().equalsIgnoreCase("GFF")) { /* - * Jalview features file with appendded GFF - * assume GFF2 (though it may declare gff-version 3) + * Jalview features file with appended GFF + * assume GFF2 (though it may declare ##gff-version 3) */ gffVersion = 2; continue; } } - if (st.countTokens() > 1 && st.countTokens() < 4) + if (gffColumns.length > 1 && gffColumns.length < 4) { /* * if 2 or 3 tokens, we anticipate either 'startgroup', 'endgroup' or - * a feature type colour specification; not GFF format + * a feature type colour specification */ - String ft = st.nextToken(); + String ft = gffColumns[0]; if (ft.equalsIgnoreCase("startgroup")) { - featureGroup = st.nextToken(); + featureGroup = gffColumns[1]; } else if (ft.equalsIgnoreCase("endgroup")) { // We should check whether this is the current group, // but at present theres no way of showing more than 1 group - st.nextToken(); featureGroup = null; } else { - parseFeatureColour(line, ft, st, colours); + parseFeatureColour(line, ft, gffColumns, colours); } continue; } @@ -257,12 +251,12 @@ public class FeaturesFile extends AlignFile */ if (gffVersion == 0) { - parseJalviewFeature(line, st, align, colours, removeHTML, + parseJalviewFeature(line, gffColumns, align, colours, removeHTML, relaxedIdmatching, featureGroup); } else { - parseGffFeature(st, align, relaxedIdmatching, newseqs); + parseGff(gffColumns, align, relaxedIdmatching, newseqs); } } resetMatcher(); @@ -281,51 +275,51 @@ public class FeaturesFile extends AlignFile } /** - * Try to parse a Jalview format feature specification. Returns true if - * successful or false if not. + * Try to parse a Jalview format feature specification and add it as a + * sequence feature to any matching sequences in the alignment. 
Returns true + * if successful (a feature was added), or false if not. * * @param line - * @param st + * @param gffColumns * @param alignment * @param featureColours * @param removeHTML * @param relaxedIdmatching * @param featureGroup */ - protected boolean parseJalviewFeature(String line, StringTokenizer st, + protected boolean parseJalviewFeature(String line, String[] gffColumns, AlignmentI alignment, Map featureColours, boolean removeHTML, boolean relaxedIdMatching, String featureGroup) { /* - * Jalview: description seqid seqIndex start end type [score] + * tokens: description seqid seqIndex start end type [score] */ - if (st.countTokens() < 6) + if (gffColumns.length < 6) { System.err.println("Ignoring feature line '" + line - + "' with unexpected number of columns (" + st.countTokens() - + ")"); + + "' with too few columns (" + gffColumns.length + ")"); return false; } - String desc = st.nextToken(); - String seqId = st.nextToken(); - SequenceI seq = findName(alignment, null, relaxedIdMatching, seqId); + String desc = gffColumns[0]; + String seqId = gffColumns[1]; + SequenceI seq = findSequence(seqId, alignment, null, relaxedIdMatching); - if (!seqId.equals("ID_NOT_SPECIFIED")) + if (!ID_NOT_SPECIFIED.equals(seqId)) { - seq = findName(alignment, null, relaxedIdMatching, seqId); - st.nextToken(); + seq = findSequence(seqId, alignment, null, relaxedIdMatching); } else { seqId = null; seq = null; + String seqIndex = gffColumns[2]; try { - int idx = Integer.parseInt(st.nextToken()); + int idx = Integer.parseInt(seqIndex); seq = alignment.getSequenceAt(idx); } catch (NumberFormatException ex) { - // continue + System.err.println("Invalid sequence index: " + seqIndex); } } @@ -335,10 +329,10 @@ public class FeaturesFile extends AlignFile return false; } - int startPos = Integer.parseInt(st.nextToken()); - int endPos = Integer.parseInt(st.nextToken()); + int startPos = Integer.parseInt(gffColumns[3]); + int endPos = Integer.parseInt(gffColumns[4]); - String ft = 
st.nextToken(); + String ft = gffColumns[5]; if (!featureColours.containsKey(ft)) { @@ -349,18 +343,18 @@ public class FeaturesFile extends AlignFile UserColourScheme ucs = new UserColourScheme(ft); featureColours.put(ft, ucs.findColour('A')); } - SequenceFeature sf = new SequenceFeature(ft, desc, "", - startPos, endPos, featureGroup); - if (st.hasMoreTokens()) + SequenceFeature sf = new SequenceFeature(ft, desc, "", startPos, + endPos, featureGroup); + if (gffColumns.length > 6) { - float score = 0f; + float score = Float.NaN; try { - score = new Float(st.nextToken()).floatValue(); + score = new Float(gffColumns[6]).floatValue(); // update colourgradient bounds if allowed to } catch (NumberFormatException ex) { - // leave as 0 + // leave as NaN } sf.setScore(score); } @@ -384,16 +378,16 @@ public class FeaturesFile extends AlignFile * the current input line (for error messages only) * @param featureType * the first token on the line - * @param st - * holds remaining tokens on the line + * @param gffColumns + * holds tokens on the line * @param colours * map to which to add derived colour specification */ protected void parseFeatureColour(String line, String featureType, - StringTokenizer st, Map colours) + String[] gffColumns, Map colours) { Object colour = null; - String colscheme = st.nextToken(); + String colscheme = gffColumns[1]; if (colscheme.indexOf("|") > -1 || colscheme.trim().equalsIgnoreCase("label")) { @@ -618,15 +612,18 @@ public class FeaturesFile extends AlignFile * list, and returns it * * + * @param seqId * @param align * @param newseqs * @param relaxedIdMatching - * @param seqId + * * @return */ - protected SequenceI findName(AlignmentI align, List newseqs, - boolean relaxedIdMatching, String seqId) + protected SequenceI findSequence(String seqId, AlignmentI align, + List newseqs, boolean relaxedIdMatching) { + // TODO encapsulate in SequenceIdMatcher, share the matcher + // with the GffHelper (removing code duplication) SequenceI match = null; 
if (relaxedIdMatching) { @@ -996,7 +993,8 @@ public class FeaturesFile extends AlignFile * a map whose keys are the type names of visible features * @return */ - public String printGffFormat(SequenceI[] sequences, Map visible) + public String printGffFormat(SequenceI[] sequences, + Map visible) { return printGffFormat(sequences, visible, true, true); } @@ -1012,7 +1010,8 @@ public class FeaturesFile extends AlignFile * @param includeNonPositionalFeatures * @return */ - public String printGffFormat(SequenceI[] sequences, Map visible, boolean outputVisibleOnly, + public String printGffFormat(SequenceI[] sequences, + Map visible, boolean outputVisibleOnly, boolean includeNonPositionalFeatures) { StringBuilder out = new StringBuilder(256); @@ -1044,13 +1043,13 @@ public class FeaturesFile extends AlignFile */ continue; } - + source = sf.featureGroup; if (source == null) { source = sf.getDescription(); } - + out.append(seq.getName()); out.append(TAB); out.append(source); @@ -1063,24 +1062,25 @@ public class FeaturesFile extends AlignFile out.append(TAB); out.append(sf.score); out.append(TAB); - - out.append(sf.getValue(STRAND, ".")); + + int strand = sf.getStrand(); + out.append(strand == 1 ? "+" : (strand == -1 ? "-" : ".")); out.append(TAB); - + out.append(sf.getValue(FRAME, ".")); - + // miscellaneous key-values (GFF column 9) - String attributes = (String) sf.getValue(ATTRIBUTES); + String attributes = sf.getAttributes(); if (attributes != null) { out.append(TAB).append(attributes); } - + out.append(newline); } } } - + return out.toString(); } @@ -1165,95 +1165,66 @@ public class FeaturesFile extends AlignFile toRanges[toRangesIndex++] = toStart; toRanges[toRangesIndex++] = toStart + (fromCount - 1) / 3; } - + return new MapList(fromRanges, toRanges, 3, 1); } /** - * Parse a GFF format feature. This may include creating a 'dummy' sequence - * for the feature or its mapped sequence + * Parse a GFF format feature. 
This may include creating a 'dummy' sequence to + * hold the feature, or for its mapped sequence, or both, to be resolved + * either later in the GFF file (##FASTA section), or when the user loads + * additional sequences. * - * @param st + * @param gffColumns * @param alignment * @param relaxedIdMatching * @param newseqs * @return */ - protected SequenceI parseGffFeature(StringTokenizer st, - AlignmentI alignment, boolean relaxedIdMatching, - List newseqs) + protected SequenceI parseGff(String[] gffColumns, AlignmentI alignment, + boolean relaxedIdMatching, List newseqs) { - SequenceI seq; /* * GFF: seqid source type start end score strand phase [attributes] */ - if (st.countTokens() < 8) + if (gffColumns.length < 5) { - System.err - .println("Ignoring GFF feature line with unexpected number of columns (" - + st.countTokens() + ")"); + System.err.println("Ignoring GFF feature line with too few columns (" + + gffColumns.length + ")"); return null; } - String seqId = st.nextToken(); - + /* * locate referenced sequence in alignment _or_ - * as a forward reference (SequenceDummy) + * as a forward or external reference (SequenceDummy) */ - seq = findName(alignment, newseqs, relaxedIdMatching, seqId); - - String desc = st.nextToken(); - String group = null; - if (desc.indexOf(' ') == -1) - { - // could also be a source term rather than description line - group = desc; - } - String ft = st.nextToken(); - int startPos = StringUtils.parseInt(st.nextToken()); - int endPos = StringUtils.parseInt(st.nextToken()); - // TODO: decide if non positional feature assertion for input data - // where end==0 is generally valid - if (endPos == 0) - { - // treat as non-positional feature, regardless. 
- startPos = 0; - } - float score = 0f; - try - { - score = new Float(st.nextToken()).floatValue(); - } catch (NumberFormatException ex) - { - // leave at 0 - } - - SequenceFeature sf = new SequenceFeature(ft, desc, startPos, - endPos, score, group); - if (st.hasMoreTokens()) - { - sf.setValue(STRAND, st.nextToken()); - } - if (st.hasMoreTokens()) - { - sf.setValue(FRAME, st.nextToken()); - } - - if (st.hasMoreTokens()) - { - processGffColumnNine(st.nextToken(), sf); - } - - if (processOrAddSeqFeature(alignment, newseqs, seq, sf, - relaxedIdMatching)) + String seqId = gffColumns[0]; + SequenceI seq = findSequence(seqId, alignment, newseqs, + relaxedIdMatching); + + SequenceFeature sf = null; + GffHelperI helper = GffHelperFactory.getHelper(gffColumns); + if (helper != null) { - // check whether we should add the sequence feature to any other - // sequences in the alignment with the same or similar - while ((seq = alignment.findName(seq, seqId, true)) != null) + try + { + sf = helper.processGff(seq, gffColumns, alignment, newseqs, + relaxedIdMatching); + if (sf != null) + { + seq.addSequenceFeature(sf); + while ((seq = alignment.findName(seq, seqId, true)) != null) + { + seq.addSequenceFeature(new SequenceFeature(sf)); + } + } + } catch (IOException e) { - seq.addSequenceFeature(new SequenceFeature(sf)); + System.err.println("GFF parsing failed with: " + e.getMessage()); + return null; } } + return seq; } @@ -1267,15 +1238,16 @@ public class FeaturesFile extends AlignFile */ protected void processGffColumnNine(String attributes, SequenceFeature sf) { - sf.setValue(ATTRIBUTES, attributes); - + sf.setAttributes(attributes); + /* * Parse attributes in column 9 and add them to the sequence feature's * 'otherData' table; use Note as a best proxy for description */ - char[] nameValueSeparator = new char[] { gffVersion == 3 ? 
'=' : ' ' }; - Map> nameValues = StringUtils.parseNameValuePairs(attributes, ";", - nameValueSeparator); + char nameValueSeparator = gffVersion == 3 ? '=' : ' '; + // TODO check we don't break GFF2 values which include commas here + Map> nameValues = GffHelperBase + .parseNameValuePairs(attributes, ";", nameValueSeparator, ","); for (Entry> attr : nameValues.entrySet()) { String values = StringUtils.listToDelimitedString(attr.getValue(), @@ -1308,42 +1280,68 @@ public class FeaturesFile extends AlignFile } FastaFile parser = new FastaFile(this); List includedseqs = parser.getSeqs(); + SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs); - // iterate over includedseqs, and replacing matching ones with newseqs - // sequences. Generic iterator not used here because we modify includedseqs - // as we go + + /* + * iterate over includedseqs, and replacing matching ones with newseqs + * sequences. Generic iterator not used here because we modify + * includedseqs as we go + */ for (int p = 0, pSize = includedseqs.size(); p < pSize; p++) { // search for any dummy seqs that this sequence can be used to update - SequenceI dummyseq = smatcher.findIdMatch(includedseqs.get(p)); - if (dummyseq != null) + SequenceI includedSeq = includedseqs.get(p); + SequenceI dummyseq = smatcher.findIdMatch(includedSeq); + if (dummyseq != null && dummyseq instanceof SequenceDummy) { - // dummyseq was created so it could be annotated and referred to in - // alignments/codon mappings - - SequenceI mseq = includedseqs.get(p); - // mseq is the 'template' imported from the FASTA file which we'll use - // to coomplete dummyseq - if (dummyseq instanceof SequenceDummy) + // probably have the pattern wrong + // idea is that a flyweight proxy for a sequence ID can be created for + // 1. stable reference creation + // 2. addition of annotation + // 3. future replacement by a real sequence + // current pattern is to create SequenceDummy objects - a convenience + // constructor for a Sequence. 
+ // problem is that when promoted to a real sequence, all references + // need to be updated somehow. We avoid that by keeping the same object. + ((SequenceDummy) dummyseq).become(includedSeq); + dummyseq.createDatasetSequence(); + + /* + * Update mappings so they are now to the dataset sequence + */ + for (AlignedCodonFrame mapping : align.getCodonFrames()) { - // probably have the pattern wrong - // idea is that a flyweight proxy for a sequence ID can be created for - // 1. stable reference creation - // 2. addition of annotation - // 3. future replacement by a real sequence - // current pattern is to create SequenceDummy objects - a convenience - // constructor for a Sequence. - // problem is that when promoted to a real sequence, all references - // need - // to be updated somehow. - ((SequenceDummy) dummyseq).become(mseq); - includedseqs.set(p, dummyseq); // template is no longer needed + mapping.updateToDataset(dummyseq); } + + /* + * replace parsed sequence with the realised forward reference + */ + includedseqs.set(p, dummyseq); } } - // finally add sequences to the dataset + + /* + * finally add sequences to the dataset + */ for (SequenceI seq : includedseqs) { + // experimental: mapping-based 'alignment' to query sequence + AlignmentUtils.alignSequenceAs(seq, align, + String.valueOf(align.getGapCharacter()), false, true); + + // rename sequences if GFF handler requested this + // TODO a more elegant way e.g. gffHelper.postProcess(newseqs) ? 
+ SequenceFeature[] sfs = seq.getSequenceFeatures(); + if (sfs != null) + { + String newName = (String) sfs[0].getValue(GffHelperI.RENAME_TOKEN); + if (newName != null) + { + seq.setName(newName); + } + } align.addSequence(seq); } } @@ -1357,7 +1355,8 @@ public class FeaturesFile extends AlignFile * @param newseqs * @throws IOException */ - protected void processGffPragma(String line, Map gffProps, AlignmentI align, + protected void processGffPragma(String line, + Map gffProps, AlignmentI align, List newseqs) throws IOException { line = line.trim(); @@ -1366,11 +1365,11 @@ public class FeaturesFile extends AlignFile // close off any open 'forward references' return; } - + String[] tokens = line.substring(2).split(" "); String pragma = tokens[0]; String value = tokens.length == 1 ? null : tokens[1]; - + if ("gff-version".equalsIgnoreCase(pragma)) { if (value != null) @@ -1385,6 +1384,10 @@ public class FeaturesFile extends AlignFile } } } + else if ("sequence-region".equalsIgnoreCase(pragma)) + { + // could capture if wanted here + } else if ("feature-ontology".equalsIgnoreCase(pragma)) { // should resolve against the specified feature ontology URI @@ -1413,150 +1416,4 @@ public class FeaturesFile extends AlignFile System.err.println("Ignoring unknown pragma: " + line); } } - - /** - * Processes the 'Query' (or 'Target') and 'Align' properties associated with - * an exonerate GFF similarity feature; these properties define the mapping of - * the annotated feature (e.g. 'exon') to a related sequence. 
- * - * @param set - * @param seq - * @param sf - * @param align - * @param newseqs - * @param relaxedIdMatching - * @throws IOException - */ - public void processGffSimilarity(Map> set, SequenceI seq, - SequenceFeature sf, AlignmentI align, List newseqs, boolean relaxedIdMatching) - throws IOException - { - if (!validateExonerateModel(sf)) - { - return; - } - - int strand = sf.getStrand(); - - /* - * exonerate (protein2dna or protein2genome) may be run with - * --showquerygff outputs - * Target ; Align proteinStartPos dnaStartPos peptideCount - * --showtargetgff outputs - * Query ; Align dnaStartPos proteinStartPos nucleotideCount - * where the Align spec may repeat - */ - boolean mapIsFromCdna = true; - List mapTo = set.get(QUERY); - if (mapTo == null) - { - mapTo = set.get(TARGET); - mapIsFromCdna = false; - } - if (mapTo == null || mapTo.size() != 1) - { - throw new IOException( - "Expecting exactly one sequence in Query field (got " + mapTo - + ")"); - } - - /* - * locate the mapped sequence in the alignment or 'new' (GFF file) sequences; - */ - SequenceI mappedSequence = findName(align, newseqs, relaxedIdMatching, - mapTo.get(0)); - /* - * Process the Align maps and create cdna/protein maps; - * ideally, the query sequences are in the alignment, but maybe not... 
- */ - AlignedCodonFrame alco = new AlignedCodonFrame(); - MapList codonmapping = constructCodonMappingFromAlign(set.get(ALIGN), - mapIsFromCdna, strand); - - /* - * Jalview always maps from dna to protein - */ - if (mapIsFromCdna) - { - alco.addMap(seq, mappedSequence, codonmapping); - } - else - { - alco.addMap(mappedSequence, seq, codonmapping); - } - align.addCodonFrame(alco); - } - - /** - * Returns true if the exonerate model (saved from column 2 of the GFF as the - * SequenceFeature's group) is one that we are willing to process, else false - * - * @param sf - */ - protected boolean validateExonerateModel(SequenceFeature sf) - { - /* - * we don't handle protein-to-protein or dna-to-dna alignment here - */ - String source = sf.getFeatureGroup(); - if (source == null - || (!source.contains("protein2dna") && !source - .contains("protein2genome"))) - { - System.err - .println("I only accept protein2dna or protein2genome but found " - + source); - return false; - } - return true; - } - - /** - * take a sequence feature and examine its attributes to decide how it should - * be added to a sequence - * - * @param seq - * - the destination sequence constructed or discovered in the - * current context - * @param sf - * - the base feature with ATTRIBUTES property containing any - * additional attributes - * @param gFFFile - * - true if we are processing a GFF annotation file - * @return true if sf was actually added to the sequence, false if it was - * processed in another way - */ - public boolean processOrAddSeqFeature(AlignmentI align, List newseqs, - SequenceI seq, SequenceFeature sf, boolean relaxedIdMatching) - { - String attr = (String) sf.getValue(ATTRIBUTES); - boolean addFeature = true; - if (attr != null) - { - for (String attset : attr.split(TAB)) - { - Map> set = StringUtils.parseNameValuePairs( - attset, ";", new char[] { ' ', '-' }); - - if (SIMILARITY.equals(sf.getType())) - { - try - { - addFeature = false; - processGffSimilarity(set, seq, sf, align, 
newseqs, - relaxedIdMatching); - } catch (IOException ivfe) - { - System.err.println(ivfe); - } - } - } - } - if (addFeature) - { - seq.addSequenceFeature(sf); - } - return addFeature; - } - } diff --git a/src/jalview/io/gff/ExonerateHelper.java b/src/jalview/io/gff/ExonerateHelper.java new file mode 100644 index 0000000..e373861 --- /dev/null +++ b/src/jalview/io/gff/ExonerateHelper.java @@ -0,0 +1,348 @@ +package jalview.io.gff; + +import jalview.datamodel.AlignedCodonFrame; +import jalview.datamodel.AlignmentI; +import jalview.datamodel.MappingType; +import jalview.datamodel.SequenceFeature; +import jalview.datamodel.SequenceI; +import jalview.util.MapList; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +/** + * A handler to parse GFF in the format generated by the exonerate tool + */ +public class ExonerateHelper extends Gff2Helper +{ + private static final String SIMILARITY = "similarity"; + + private static final String GENOME2GENOME = "genome2genome"; + + private static final String CDNA2GENOME = "cdna2genome"; + + private static final String CODING2GENOME = "coding2genome"; + + private static final String CODING2CODING = "coding2coding"; + + private static final String PROTEIN2GENOME = "protein2genome"; + + private static final String PROTEIN2DNA = "protein2dna"; + + private static final String ALIGN = "Align"; + + private static final String QUERY = "Query"; + + private static final String TARGET = "Target"; + + /** + * Process one GFF feature line (as modelled by SequenceFeature) + * + * @param seq + * the sequence with which this feature is associated + * @param sf + * the sequence feature with ATTRIBUTES property containing any + * additional attributes + * @param align + * the alignment we are adding GFF to + * @param newseqs + * any new sequences referenced by the GFF + * @param relaxedIdMatching + * if true, match word tokens in sequence names + * @return true if the sequence feature should be added to the sequence, 
else + * false (i.e. it has been processed in another way e.g. to generate a + * mapping) + */ + @Override + public SequenceFeature processGff(SequenceI seq, String[] gffColumns, + AlignmentI align, List newseqs, + boolean relaxedIdMatching) + { + String attr = gffColumns[ATTRIBUTES_COL]; + Map> set = parseNameValuePairs(attr); + + try + { + processGffSimilarity(set, seq, gffColumns, + align, newseqs, relaxedIdMatching); + } catch (IOException ivfe) + { + System.err.println(ivfe); + } + + /* + * return null to indicate we don't want to add a sequence feature for + * similarity (only process it to create mappings) + */ + return null; + } + + /** + * Processes the 'Query' (or 'Target') and 'Align' properties associated with + * an exonerate GFF similarity feature; these properties define the mapping of + * the annotated range to a related sequence. + * + * @param set + * parsed GFF column 9 key/value(s) + * @param seq + * the sequence the GFF feature is on + * @param gff + * the GFF column data + * @param align + * the alignment the sequence belongs to, where any new mappings + * should be added + * @param newseqs + * a list of new 'virtual sequences' generated while parsing GFF + * @param relaxedIdMatching + * if true allow fuzzy search for a matching target sequence + * @throws IOException + */ + protected void processGffSimilarity( + Map> set, + SequenceI seq, String[] gff, AlignmentI align, + List newseqs, boolean relaxedIdMatching) + throws IOException + { + /* + * exonerate may be run with + * --showquerygff - outputs 'features on the query' e.g. (protein2genome) + * Target ; Align proteinStartPos dnaStartPos proteinCount + * --showtargetgff - outputs 'features on the target' e.g. 
(protein2genome) + * Query ; Align dnaStartPos proteinStartPos nucleotideCount + * where the Align spec may repeat + */ + // TODO handle coding2coding and similar as well + boolean featureIsOnTarget = true; + List mapTo = set.get(QUERY); + if (mapTo == null) + { + mapTo = set.get(TARGET); + featureIsOnTarget = false; + } + MappingType type = getMappingType(gff[SOURCE_COL]); + + if (type == null) + { + throw new IOException("Sorry, I don't handle " + gff[SOURCE_COL]); + } + + if (mapTo == null || mapTo.size() != 1) + { + throw new IOException( + "Expecting exactly one sequence in Query or Target field (got " + + mapTo + ")"); + } + + /* + * locate the mapped sequence in the alignment or 'new' (GFF file) sequences; + */ + SequenceI mappedSequence = findSequence(mapTo.get(0), align, newseqs, + relaxedIdMatching); + + /* + * If mapping is from protein to dna, we store it as dna to protein instead + */ + SequenceI mapFromSequence = seq; + SequenceI mapToSequence = mappedSequence; + if ((type == MappingType.NucleotideToPeptide && featureIsOnTarget) + || (type == MappingType.PeptideToNucleotide && !featureIsOnTarget)) + { + mapFromSequence = mappedSequence; + mapToSequence = seq; + } + + /* + * Process the Align maps and create mappings. + * These may be cdna-genome, cdna-protein, genome-protein. + * The mapped sequences may or may not be in the alignment + * (they may be included later in the GFF file). 
+ */ + + /* + * get any existing mapping for these sequences (or start one), + * and add this mapped range + */ + AlignedCodonFrame acf = getMapping(align, mapFromSequence, + mapToSequence); + + /* + * exonerate GFF has the strand of the target in column 7 rather + * (differs from GFF3 which has it in the Target descriptor) + */ + String strand = gff[STRAND_COL]; + boolean forwardStrand = true; + if ("-".equals(strand)) + { + forwardStrand = false; + } + else if (!"+".equals(strand)) + { + System.err.println("Strand must be specified for alignment"); + return; + } + + List alignedRegions = set.get(ALIGN); + for (String region : alignedRegions) + { + MapList mapping = buildMapping(region, type, forwardStrand, + featureIsOnTarget, gff); + + if (mapping == null) + { + continue; + } + + acf.addMap(mapFromSequence, mapToSequence, mapping); + } + align.addCodonFrame(acf); + } + + /** + * Construct the mapping + * + * @param region + * @param type + * @param forwardStrand + * @param featureIsOnTarget + * @param gff + * @return + */ + protected MapList buildMapping(String region, MappingType type, + boolean forwardStrand, boolean featureIsOnTarget, String[] gff) + { + /* + * process one "fromStart toStart fromCount" descriptor + */ + String[] tokens = region.split(" "); + if (tokens.length != 3) + { + System.err.println("Malformed Align descriptor: " + region); + return null; + } + + /* + * get start/end of from/to mappings + * if feature is on the target sequence we have to invert the sense + */ + int alignFromStart; + int alignToStart; + int alignCount; + try { + alignFromStart = Integer.parseInt(tokens[0]); + alignToStart = Integer.parseInt(tokens[1]); + alignCount = Integer.parseInt(tokens[2]); + } catch (NumberFormatException nfe) { + System.err.println(nfe.toString()); + return null; + } + + int fromStart; + int fromEnd; + int toStart; + int toEnd; + + if (featureIsOnTarget) + { + fromStart = alignToStart; + toStart = alignFromStart; + toEnd = forwardStrand ? 
toStart + alignCount - 1 : toStart + - (alignCount - 1); + int toLength = Math.abs(toEnd - toStart) + 1; + int fromLength = toLength * type.getFromRatio() / type.getToRatio(); + fromEnd = fromStart + fromLength - 1; + } + else + { + // we use the 'Align' values here not the feature start/end + // not clear why they may differ but it seems they can + fromStart = alignFromStart; + fromEnd = alignFromStart + alignCount - 1; + int fromLength = fromEnd - fromStart + 1; + int toLength = fromLength * type.getToRatio() / type.getFromRatio(); + toStart = alignToStart; + if (forwardStrand) + { + toEnd = toStart + toLength - 1; + } + else + { + toEnd = toStart - (toLength - 1); + } + } + + MapList codonmapping = constructMappingFromAlign(fromStart, fromEnd, + toStart, toEnd, type); + return codonmapping; + } + + /** + * Returns a MappingType depending on the exonerate 'model' value. + * + * @param model + * @return + */ + protected static MappingType getMappingType(String model) + { + MappingType result = null; + + if (model.contains(PROTEIN2DNA) || model.contains(PROTEIN2GENOME)) + { + result = MappingType.PeptideToNucleotide; + } + else if (model.contains(CODING2CODING) + || model.contains(CODING2GENOME) + || model.contains(CDNA2GENOME) + || model.contains(GENOME2GENOME)) + { + result = MappingType.NucleotideToNucleotide; + } + return result; + } + + /** + * Tests whether the GFF data looks like it was generated by exonerate, and is + * a format we are willing to handle + * + * @param sf + * @return + */ + public static boolean recognises(String[] columns) + { + if (!SIMILARITY.equalsIgnoreCase(columns[TYPE_COL])) + { + return false; + } + + /* + * inspect alignment model + */ + String model = columns[SOURCE_COL]; + // e.g. 
exonerate:protein2genome:local + if (model != null) + { + String mdl = model.toLowerCase(); + if (mdl.contains(PROTEIN2DNA) || mdl.contains(PROTEIN2GENOME) + || mdl.contains(CODING2CODING) + || mdl.contains(CODING2GENOME) + || mdl.contains(CDNA2GENOME) + || mdl.contains(GENOME2GENOME)) + { + return true; + } + } + System.err.println("Sorry, I don't handle exonerate model " + model); + return false; + } + + @Override + protected SequenceFeature buildSequenceFeature(String[] gff, + Map> set) + { + SequenceFeature sf = super.buildSequenceFeature(gff, set); + sf.setFeatureGroup("exonerate"); + + return sf; + } + +} diff --git a/src/jalview/io/gff/Gff2Helper.java b/src/jalview/io/gff/Gff2Helper.java new file mode 100644 index 0000000..31303b1 --- /dev/null +++ b/src/jalview/io/gff/Gff2Helper.java @@ -0,0 +1,51 @@ +package jalview.io.gff; + +import jalview.datamodel.AlignmentI; +import jalview.datamodel.SequenceFeature; +import jalview.datamodel.SequenceI; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +public class Gff2Helper extends GffHelperBase +{ + /** + * GFF2 uses space character to delimit name/value pairs on column 9 + * + * @param text + * @return + */ + public static Map> parseNameValuePairs(String text) + { + // TODO: can a value include a comma? if so it will be broken by this + return parseNameValuePairs(text, ";", ' ', ","); + } + + /** + * Return ' ' as the name-value separator used in column 9 attributes. 
+ */ + @Override + protected char getNameValueSeparator() + { + return ' '; + } + + /** + * Default processing if not overridden is just to construct a sequence + * feature + */ + @Override + public SequenceFeature processGff(SequenceI seq, String[] gff, + AlignmentI align, List newseqs, + boolean relaxedIdMatching) throws IOException + { + Map> attributes = null; + if (gff.length > ATTRIBUTES_COL) + { + attributes = parseNameValuePairs(gff[ATTRIBUTES_COL]); + } + return buildSequenceFeature(gff, attributes); + } + +} diff --git a/src/jalview/io/gff/Gff3Helper.java b/src/jalview/io/gff/Gff3Helper.java new file mode 100644 index 0000000..4c67caa --- /dev/null +++ b/src/jalview/io/gff/Gff3Helper.java @@ -0,0 +1,361 @@ +package jalview.io.gff; + +import jalview.datamodel.AlignedCodonFrame; +import jalview.datamodel.AlignmentI; +import jalview.datamodel.MappingType; +import jalview.datamodel.SequenceFeature; +import jalview.datamodel.SequenceI; +import jalview.util.MapList; +import jalview.util.StringUtils; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +/** + * Base class with generic / common functionality for processing GFF3 data. + * Override this as required for any specialisations resulting from + * peculiarities of GFF3 generated by particular tools. 
+ */ +public class Gff3Helper extends GffHelperBase +{ + protected static final String TARGET = "Target"; + + protected static final String ID = "ID"; + + private static final String NAME = "Name"; + + /** + * GFF3 uses '=' to delimit name/value pairs in column 9, and comma to + * separate multiple values for a name + * + * @param text + * @return + */ + public static Map> parseNameValuePairs(String text) + { + return parseNameValuePairs(text, ";", '=', ","); + } + + /** + * Process one GFF feature line (as modelled by SequenceFeature) + * + * @param seq + * the sequence with which this feature is associated + * @param sf + * the sequence feature with ATTRIBUTES property containing any + * additional attributes + * @param align + * the alignment we are adding GFF to + * @param newseqs + * any new sequences referenced by the GFF + * @param relaxedIdMatching + * if true, match word tokens in sequence names + * @return true if the sequence feature should be added to the sequence, else + * false (i.e. it has been processed in another way e.g. 
to generate a + * mapping) + * @throws IOException + */ + @Override + public SequenceFeature processGff(SequenceI seq, String[] gff, + AlignmentI align, List newseqs, + boolean relaxedIdMatching) throws IOException + { + /* + * (For now) we don't process mappings from reverse complement ; to do + * this would require (a) creating a virtual sequence placeholder for + * the reverse complement (b) resolving the sequence by its id from some + * source (GFF ##FASTA or other) (c) creating the reverse complement + * sequence (d) updating the mapping to be to the reverse complement + */ + if ("-".equals(gff[STRAND_COL])) + { + System.err + .println("Skipping mapping from reverse complement as not yet supported"); + return null; + } + SequenceFeature sf = null; + + if (gff.length == 9) + { + String soTerm = gff[TYPE_COL]; + String atts = gff[ATTRIBUTES_COL]; + Map> attributes = parseNameValuePairs(atts); + + if (SequenceOntology.getInstance().isProteinMatch(soTerm)) + { + sf = processProteinMatch(attributes, seq, gff, align, + newseqs, relaxedIdMatching); + } + else if (SequenceOntology.getInstance().isNucleotideMatch(soTerm)) + { + sf = processNucleotideMatch(attributes, seq, gff, align, + newseqs, relaxedIdMatching); + } + else + { + sf = buildSequenceFeature(gff, attributes); + } + } + else + { + /* + * fall back on generating a sequence feature with no special processing + */ + sf = buildSequenceFeature(gff, null); + } + + return sf; + } + + /** + * Processes one GFF3 nucleotide (e.g. cDNA to genome) match. 
+ * + * @param attributes + * parsed GFF column 9 key/value(s) + * @param seq + * the sequence the GFF feature is on + * @param gffColumns + * the GFF column data + * @param align + * the alignment the sequence belongs to, where any new mappings + * should be added + * @param newseqs + * a list of new 'virtual sequences' generated while parsing GFF + * @param relaxedIdMatching + * if true allow fuzzy search for a matching target sequence + * @return a sequence feature, if one should be added to the sequence, else + * null + * @throws IOException + */ + protected SequenceFeature processNucleotideMatch( + Map> attributes, SequenceI seq, + String[] gffColumns, AlignmentI align, List newseqs, + boolean relaxedIdMatching) + throws IOException + { + String strand = gffColumns[STRAND_COL]; + if ("-1".equals(strand)) + { + System.err + .println("Currently ignoring mappings from reverse complement"); + return null; + } + + List targets = attributes.get(TARGET); + if (targets == null) + { + System.err.println("'Target' missing in GFF"); + return null; + } + + /* + * Typically we only expect one Target per GFF line, but this can handle + * multiple matches, to the same or different sequences (e.g. 
dna variants) + */ + for (String target : targets) + { + /* + * Process "seqid start end [strand]" + */ + String[] tokens = target.split(" "); + if (tokens.length < 3) + { + System.err.println("Incomplete Target: " + target); + continue; + } + + /* + * Locate the mapped sequence in the alignment, or as a + * (new or existing) virtual sequence in the newseqs list + */ + String targetId = findTargetId(tokens[0], attributes); + SequenceI mappedSequence1 = findSequence(targetId, align, + newseqs, relaxedIdMatching); + SequenceI mappedSequence = mappedSequence1; + if (mappedSequence == null) + { + continue; + } + + /* + * get any existing mapping for these sequences (or start one), + * and add this mapped range + */ + AlignedCodonFrame acf = getMapping(align, seq, mappedSequence); + + try + { + int toStart = Integer.parseInt(tokens[1]); + int toEnd = Integer.parseInt(tokens[2]); + if (tokens.length > 3 && "-".equals(tokens[3])) + { + // mapping to reverse strand - swap start/end + int temp = toStart; + toStart = toEnd; + toEnd = temp; + } + + int fromStart = Integer.parseInt(gffColumns[START_COL]); + int fromEnd = Integer.parseInt(gffColumns[END_COL]); + MapList mapping = constructMappingFromAlign(fromStart, fromEnd, + toStart, toEnd, + MappingType.NucleotideToNucleotide); + + if (mapping != null) + { + acf.addMap(seq, mappedSequence, mapping); + align.addCodonFrame(acf); + } + } catch (NumberFormatException nfe) + { + System.err.println("Invalid start or end in Target " + target); + } + } + + SequenceFeature sf = buildSequenceFeature(gffColumns, attributes); + return sf; + } + + /** + * Returns the target sequence id extracted from the GFF name/value pairs. + * Default (standard behaviour) is the first token for "Target". This may be + * overridden where tools report this in a non-standard way. 
+ * + * @param target + * first token of a "Target" value from GFF column 9, typically + * "seqid start end" + * @param set + * a map with all parsed column 9 attributes + * @return + */ + @SuppressWarnings("unused") + protected String findTargetId(String target, Map> set) + { + return target; + } + + /** + * Processes one GFF 'protein_match'; fields of interest are + *
<ul>
+ * <li>feature group - the database reporting a match e.g. Pfam</li>
+ * <li>Name - the matched entry's accession id in the database</li>
+ * <li>ID - a sequence identifier for the matched region (which may be
+ * appended as FASTA in the GFF file)</li>
+ * </ul>
+ * + * @param set + * parsed GFF column 9 key/value(s) + * @param seq + * the sequence the GFF feature is on + * @param gffColumns + * the sequence feature holding GFF data + * @param align + * the alignment the sequence belongs to, where any new mappings + * should be added + * @param newseqs + * a list of new 'virtual sequences' generated while parsing GFF + * @param relaxedIdMatching + * if true allow fuzzy search for a matching target sequence + * @return the (real or virtual) sequence(s) mapped to by this match + * @throws IOException + */ + protected SequenceFeature processProteinMatch( + Map> set, SequenceI seq, + String[] gffColumns, AlignmentI align, List newseqs, + boolean relaxedIdMatching) + { + // This is currently tailored to InterProScan GFF output: + // ID holds the ID of the matched sequence, Target references the + // query sequence; this looks wrong, as ID should just be the GFF internal + // ID of the GFF feature, while Target would normally reference the matched + // sequence. 
+ // TODO refactor as needed if other protein-protein GFF varies + + SequenceFeature sf = buildSequenceFeature(gffColumns, set); + + /* + * locate the mapped sequence in the alignment, or as a + * (new or existing) virtual sequence in the newseqs list + */ + List targets = set.get(TARGET); + if (targets != null) + { + for (String target : targets) + { + + SequenceI mappedSequence1 = findSequence(findTargetId(target, set), align, + newseqs, relaxedIdMatching); + SequenceI mappedSequence = mappedSequence1; + if (mappedSequence == null) + { + continue; + } + + /* + * give the mapped sequence a copy of the sequence feature, with + * start/end range adjusted + */ + SequenceFeature sf2 = new SequenceFeature(sf); + sf2.setBegin(1); + int sequenceFeatureLength = 1 + sf.getEnd() - sf.getBegin(); + sf2.setEnd(sequenceFeatureLength); + mappedSequence.addSequenceFeature(sf2); + + /* + * add a property to the mapped sequence so that it can eventually be + * renamed with its qualified accession id; renaming has to wait until + * all sequence reference resolution is complete + */ + String accessionId = StringUtils.listToDelimitedString( + set.get(NAME), ","); + if (accessionId.length() > 0) + { + String database = sf.getType(); // TODO InterProScan only?? + String qualifiedAccId = database + "|" + accessionId; + sf2.setValue(RENAME_TOKEN, qualifiedAccId); + } + + /* + * get any existing mapping for these sequences (or start one), + * and add this mapped range + */ + AlignedCodonFrame alco = getMapping(align, seq, mappedSequence); + int[] from = new int[] { sf.getBegin(), sf.getEnd() }; + int[] to = new int[] { 1, sequenceFeatureLength }; + MapList mapping = new MapList(from, to, 1, 1); + + alco.addMap(seq, mappedSequence, mapping); + align.addCodonFrame(alco); + } + } + + return sf; + } + + /** + * Return '=' as the name-value separator used in column 9 attributes. 
+ */ + @Override + protected char getNameValueSeparator() + { + return '='; + } + + /** + * Modifies the default SequenceFeature in order to set the Target sequence id + * as the description + */ + @Override + protected SequenceFeature buildSequenceFeature(String[] gff, + Map> attributes) + { + SequenceFeature sf = super.buildSequenceFeature(gff, attributes); + String target = (String) sf.getValue(TARGET); + if (target != null) + { + sf.setDescription(target.split(" ")[0]); + } + return sf; + } +} diff --git a/src/jalview/io/gff/GffHelperBase.java b/src/jalview/io/gff/GffHelperBase.java new file mode 100644 index 0000000..fbde9d9 --- /dev/null +++ b/src/jalview/io/gff/GffHelperBase.java @@ -0,0 +1,396 @@ +package jalview.io.gff; + +import jalview.analysis.SequenceIdMatcher; +import jalview.datamodel.AlignedCodonFrame; +import jalview.datamodel.AlignmentI; +import jalview.datamodel.MappingType; +import jalview.datamodel.SequenceDummy; +import jalview.datamodel.SequenceFeature; +import jalview.datamodel.SequenceI; +import jalview.util.MapList; +import jalview.util.StringUtils; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +/** + * Base class with common functionality for flavours of GFF handler (GFF2 or + * GFF3) + */ +public abstract class GffHelperBase implements GffHelperI +{ + private static final String NOTE = "Note"; + + /* + * GFF columns 1-9 (zero-indexed): + */ + protected static final int SEQID_COL = 0; + + protected static final int SOURCE_COL = 1; + + protected static final int TYPE_COL = 2; + + protected static final int START_COL = 3; + + protected static final int END_COL = 4; + + protected static final int SCORE_COL = 5; + + protected static final int STRAND_COL = 6; + + protected static final int PHASE_COL = 7; + + protected static final int ATTRIBUTES_COL = 8; + + private AlignmentI lastmatchedAl = null; + + private SequenceIdMatcher 
matcher = null; + + /** + * Constructs and returns a mapping, or null if data appear invalid + * + * @param fromStart + * @param fromEnd + * @param toStart + * @param toEnd + * @param mappingType + * type of mapping (e.g. protein to nucleotide) + * @return + */ + protected MapList constructMappingFromAlign(int fromStart, int fromEnd, + int toStart, int toEnd, MappingType mappingType) + { + int[] from = new int[] { fromStart, fromEnd }; + int[] to = new int[] { toStart, toEnd }; + + /* + * Jalview always models from dna to protein, so switch values if the + * GFF mapping is from protein to dna + */ + if (mappingType == MappingType.PeptideToNucleotide) + { + int[] temp = from; + from = to; + to = temp; + mappingType = mappingType.getInverse(); + } + + int fromRatio = mappingType.getFromRatio(); + int toRatio = mappingType.getToRatio(); + + /* + * sanity check that mapped residue counts match + * TODO understand why PASA generates such cases... + */ + if (!trimMapping(from, to, fromRatio, toRatio)) + { + System.err.println("Ignoring mapping from " + Arrays.toString(from) + + " to " + Arrays.toString(to) + " as counts don't match!"); + return null; + } + + /* + * If a codon has an intron gap, there will be contiguous 'toRanges'; + * this is handled for us by the MapList constructor. + * (It is not clear that exonerate ever generates this case) + */ + + return new MapList(from, to, fromRatio, toRatio); + } + + /** + * Checks that the 'from' and 'to' ranges have equivalent lengths. If not, + * tries to trim the end of the longer so they do. Returns true if the + * mappings could be made equivalent, else false. Note the range array values + * may be modified by this method. 
+ * + * @param from + * @param to + * @param fromRatio + * @param toRatio + * @return + */ + protected static boolean trimMapping(int[] from, int[] to, int fromRatio, + int toRatio) + { + int fromLength = Math.abs(from[1] - from[0]) + 1; + int toLength = Math.abs(to[1] - to[0]) + 1; + int fromOverlap = fromLength * toRatio - toLength * fromRatio; + if (fromOverlap == 0) + { + return true; + } + if (fromOverlap > 0 && fromOverlap % toRatio == 0) + { + /* + * restrict from range to make them match up + * it's kind of arbitrary which end we truncate - here it is the end + */ + System.err.print("Truncating mapping from " + Arrays.toString(from) + + " to "); + if (from[1] > from[0]) + { + from[1] -= fromOverlap / toRatio; + } + else + { + from[1] += fromOverlap / toRatio; + } + System.err.println(Arrays.toString(from)); + return true; + } + else if (fromOverlap < 0 && fromOverlap % fromRatio == 0) + { + fromOverlap = -fromOverlap; // > 0 + /* + * restrict to range to make them match up + */ + System.err.print("Truncating mapping to " + Arrays.toString(to) + + " to "); + if (to[1] > to[0]) + { + to[1] -= fromOverlap / fromRatio; + } + else + { + to[1] += fromOverlap / fromRatio; + } + System.err.println(Arrays.toString(to)); + return true; + } + + /* + * Couldn't truncate to an exact match.. + */ + return false; + } + + /** + * Returns a sequence matching the given id, as follows + *
<ul>
+ * <li>strict matching is on exact sequence name</li>
+ * <li>relaxed matching allows matching on a token within the sequence name,
+ * or a dbxref</li>
+ * <li>first tries to find a match in the alignment sequences</li>
+ * <li>else tries to find a match in the new sequences already generated while
+ * parsing the features file</li>
+ * <li>else creates a new placeholder sequence, adds it to the new sequences
+ * list, and returns it</li>
+ * </ul>
+ * + * @param seqId + * @param align + * @param newseqs + * @param relaxedIdMatching + * + * @return + */ + protected SequenceI findSequence(String seqId, AlignmentI align, + List newseqs, boolean relaxedIdMatching) + { + if (seqId == null) + { + return null; + } + SequenceI match = null; + if (relaxedIdMatching) + { + if (lastmatchedAl != align) + { + lastmatchedAl = align; + matcher = new SequenceIdMatcher(align.getSequencesArray()); + if (newseqs != null) + { + matcher.addAll(newseqs); + } + } + match = matcher.findIdMatch(seqId); + } + else + { + match = align.findName(seqId, true); + if (match == null && newseqs != null) + { + for (SequenceI m : newseqs) + { + if (seqId.equals(m.getName())) + { + return m; + } + } + } + + } + if (match == null && newseqs != null) + { + match = new SequenceDummy(seqId); + if (relaxedIdMatching) + { + matcher.addAll(Arrays.asList(new SequenceI[] { match })); + } + // add dummy sequence to the newseqs list + newseqs.add(match); + } + return match; + } + + /** + * Parses the input line to a map of name / value(s) pairs. For example the + * line
+ * Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal
+ * if parsed with namesDelimiter=";", nameValueSeparator='=' and valuesDelimiter=","
+ * would return a map with { Notes={Fe-S, Metal}, Method={manual curation, + * prediction}, source={Pfam}}
+ * + * This method supports parsing of either GFF2 format (which uses space ' ' as + * the name/value delimiter, and allows multiple occurrences of the same + * name), or GFF3 format (which uses '=' as the name/value delimiter, and + * strictly does not allow repeat occurrences of the same name - but does + * allow a comma-separated list of values). + * + * @param text + * @param namesDelimiter + * the major delimiter between name-value pairs + * @param nameValueSeparator + * one or more separators used between name and value + * @param valuesDelimiter + * delimits a list of more than one value + * @return the name-values map (which may be empty but never null) + */ + public static Map> parseNameValuePairs(String text, + String namesDelimiter, char nameValueSeparator, + String valuesDelimiter) + { + Map> map = new HashMap>(); + if (text == null || text.trim().length() == 0) + { + return map; + } + + for (String pair : text.trim().split(namesDelimiter)) + { + pair = pair.trim(); + if (pair.length() == 0) + { + continue; + } + + int sepPos = pair.indexOf(nameValueSeparator); + if (sepPos == -1) + { + // no name=value present + continue; + } + + String key = pair.substring(0, sepPos).trim(); + String values = pair.substring(sepPos + 1).trim(); + if (values.length() > 0) + { + List vals = map.get(key); + if (vals == null) + { + vals = new ArrayList(); + map.put(key, vals); + } + for (String val : values.split(valuesDelimiter)) + { + vals.add(val); + } + } + } + return map; + } + + /** + * Constructs a SequenceFeature from the GFF column data. Subclasses may wish + * to call this method then adjust the SequenceFeature depending on the + * particular usage of different tools that generate GFF. 
+ * + * @param gff + * @param attributes + * @return + */ + protected SequenceFeature buildSequenceFeature(String[] gff, + Map> attributes) + { + try + { + int start = Integer.parseInt(gff[START_COL]); + int end = Integer.parseInt(gff[END_COL]); + float score = Float.NaN; + try + { + score = Float.parseFloat(gff[SCORE_COL]); + } catch (NumberFormatException nfe) + { + // e.g. '.' - leave as NaN to indicate no score + } + + SequenceFeature sf = new SequenceFeature(gff[TYPE_COL], + gff[SOURCE_COL], start, end, score, gff[SOURCE_COL]); + + if (attributes != null) + { + /* + * save 'raw' column 9 to allow roundtrip output as input + */ + sf.setAttributes(gff[ATTRIBUTES_COL]); + + /* + * Add attributes in column 9 to the sequence feature's + * 'otherData' table; use Note as a best proxy for description + */ + for (Entry> attr : attributes.entrySet()) + { + String values = StringUtils.listToDelimitedString( + attr.getValue(), "; "); + sf.setValue(attr.getKey(), values); + if (NOTE.equals(attr.getKey())) + { + sf.setDescription(values); + } + } + } + + return sf; + } catch (NumberFormatException nfe) + { + System.err.println("Invalid number in gff: " + nfe.getMessage()); + return null; + } + } + + /** + * Returns the character used to separate attributes names from values in GFF + * column 9. This is space for GFF2, '=' for GFF3. + * + * @return + */ + protected abstract char getNameValueSeparator(); + + /** + * Returns any existing mapping held on the alignment between the given + * dataset sequences, or a new one if none found. This is a convenience method + * to facilitate processing multiple GFF lines that make up a single 'spliced' + * mapping, by extending the first mapping as the others are read. 
+ * + * @param align + * @param fromSeq + * @param toSeq + * @return + */ + protected AlignedCodonFrame getMapping(AlignmentI align, SequenceI fromSeq, SequenceI toSeq) + { + AlignedCodonFrame acf = align.getMapping(fromSeq, toSeq); + if (acf == null) + { + acf = new AlignedCodonFrame(); + } + return acf; + } + +} diff --git a/src/jalview/io/gff/GffHelperFactory.java b/src/jalview/io/gff/GffHelperFactory.java new file mode 100644 index 0000000..8bd5115 --- /dev/null +++ b/src/jalview/io/gff/GffHelperFactory.java @@ -0,0 +1,70 @@ +package jalview.io.gff; + + +/** + * A factory to serve instances of GFF helper classes + */ +public class GffHelperFactory +{ + + /** + * Returns a class to process the GFF line based on inspecting its column + * data. This may return a general-purpose GFF2 or GFF3 helper, or a + * specialisation for a flavour of GFF generated by a particular tool. + * + * @param gff + * @return + */ + public static GffHelperI getHelper(String[] gff) + { + if (gff == null || gff.length < 6) + { + return null; + } + + GffHelperI result = null; + if (ExonerateHelper.recognises(gff)) + { + result = new ExonerateHelper(); + } + else if (InterProScanHelper.recognises(gff)) + { + result = new InterProScanHelper(); + } + else if (looksLikeGff3(gff)) + { + result = new Gff3Helper(); + } + else + { + result = new Gff2Helper(); + } + + return result; + } + + /** + * Heuristic rule: if column 9 seems to have Name=Value entries, assume this + * is GFF3. GFF3 uses '=' as name-value separator, GFF2 uses space ' '. 
+ * + * @param gff + * @return + */ + protected static boolean looksLikeGff3(String[] gff) + { + if (gff.length >= 9) + { + String attributes = gff[8].trim(); + int pos1 = attributes.indexOf(';'); + int pos2 = attributes.indexOf('='); + if (pos2 != -1 && (pos1 == -1 || pos2 < pos1)) + { + // there is an '=' before the first ';' (if any) + // not foolproof as theoretically GFF2 could be like "Name Value=123;" + return true; + } + } + return false; + } + +} diff --git a/src/jalview/io/gff/GffHelperI.java b/src/jalview/io/gff/GffHelperI.java new file mode 100644 index 0000000..3d9dc6f --- /dev/null +++ b/src/jalview/io/gff/GffHelperI.java @@ -0,0 +1,44 @@ +package jalview.io.gff; + +import jalview.datamodel.AlignmentI; +import jalview.datamodel.SequenceFeature; +import jalview.datamodel.SequenceI; + +import java.io.IOException; +import java.util.List; + +/** + * An interface to described common functionality of different flavours of GFF + * + * @author gmcarstairs + * + */ +public interface GffHelperI +{ + + final String RENAME_TOKEN = "$RENAME_TO$"; + + /** + * Process one GFF feature line + * + * @param seq + * the sequence with which this feature is associated + * @param gffColumns + * the GFF column data + * @param align + * the alignment we are adding GFF to + * @param newseqs + * any new sequences referenced by the GFF + * @param relaxedIdMatching + * if true, match word tokens in sequence names + * @return a SequenceFeature if one should be created, else null + * @throws IOException + */ + SequenceFeature processGff(SequenceI seq, String[] gffColumns, + AlignmentI align, + List newseqs, boolean relaxedIdMatching) + throws IOException; + + // java 8 will allow static methods in interfaces: + // static boolean recognises(String [] columns); +} diff --git a/src/jalview/io/gff/InterProScanHelper.java b/src/jalview/io/gff/InterProScanHelper.java new file mode 100644 index 0000000..3323e27 --- /dev/null +++ b/src/jalview/io/gff/InterProScanHelper.java @@ -0,0 +1,117 
@@
+package jalview.io.gff;
+
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
+import jalview.util.StringUtils;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * A handler to parse GFF in the format generated by InterProScan
+ */
+public class InterProScanHelper extends Gff3Helper
+{
+  private static final String INTER_PRO_SCAN = "InterProScan";
+
+  private static final String SIGNATURE_DESC = "signature_desc";
+
+  /**
+   * Process one GFF feature line (as modelled by SequenceFeature)
+   *
+   * @param seq
+   *          the sequence with which this feature is associated
+   * @param gff
+   *          the gff column data
+   * @param align
+   *          the alignment we are adding GFF to
+   * @param newseqs
+   *          any new sequences referenced by the GFF
+   * @param relaxedIdMatching
+   *          if true, match word tokens in sequence names
+   * @return a sequence feature if one should be added to the sequence, else
+   *         null (i.e. it has been processed in another way e.g. to generate a
+   *         mapping)
+   * @throws IOException
+   */
+  @Override
+  public SequenceFeature processGff(SequenceI seq, String[] gff,
+          AlignmentI align, List<SequenceI> newseqs,
+          boolean relaxedIdMatching) throws IOException
+  {
+    /*
+     * ignore the 'polypeptide' match of the whole sequence
+     */
+    if (".".equals(gff[SOURCE_COL]))
+    {
+      return null;
+    }
+
+    return super.processGff(seq, gff, align, newseqs, relaxedIdMatching);
+  }
+
+  /**
+   * Builds the GFF3 feature, then overrides description, type and group
+   */
+  @Override
+  protected SequenceFeature buildSequenceFeature(String[] gff,
+          Map<String, List<String>> attributes)
+  {
+    SequenceFeature sf = super.buildSequenceFeature(gff, attributes);
+
+    /*
+     * signature_desc is a more informative source of description
+     */
+    List<String> desc = attributes.get(SIGNATURE_DESC);
+    String description = StringUtils.listToDelimitedString(desc, ", ");
+    if (description.length() > 0)
+    {
+      sf.setDescription(description);
+    }
+
+    /*
+     * Set sequence feature group as 'InterProScan', and type as the source
+     * database for this match (e.g. 'Pfam')
+     */
+    sf.setType(gff[SOURCE_COL]);
+    sf.setFeatureGroup(INTER_PRO_SCAN);
+
+    return sf;
+  }
+
+  /**
+   * Tests whether the GFF data looks like it was generated by InterProScan
+   *
+   * @param columns the GFF column data
+   * @return true for a protein match, or a polypeptide whose source is '.'
+   */
+  public static boolean recognises(String[] columns)
+  {
+    SequenceOntology so = SequenceOntology.getInstance();
+    String type = columns[TYPE_COL];
+    if (so.isProteinMatch(type)
+            || (".".equals(columns[SOURCE_COL]) && so.isPolypeptide(type)))
+    {
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * Overridden method, because InterProScan GFF has the target sequence id in
+   * GFF field 'ID' rather than the usual 'Target' :-O
+   */
+  @Override
+  protected String findTargetId(String target, Map<String, List<String>> set)
+  {
+    List<String> ids = set.get(ID);
+    if (ids == null || ids.size() != 1)
+    {
+      return null;
+    }
+    return ids.get(0);
+  }
+
+}
diff --git a/src/jalview/util/StringUtils.java b/src/jalview/util/StringUtils.java
index ad1c0f7..6044655 100644
--- a/src/jalview/util/StringUtils.java
+++
b/src/jalview/util/StringUtils.java @@ -21,9 +21,7 @@ package jalview.util; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; -import java.util.Map; import java.util.regex.Pattern; public class StringUtils @@ -252,72 +250,6 @@ public class StringUtils } /** - * Parses the input line to a map of name / value(s) pairs. For example the - * line
- * Notes=Fe-S;Method=manual curation; source = Pfam; Notes = Metal
- * if parsed with delimiter=";" and separators {' ', '='}
- * would return a map with { Notes={Fe=S, Metal}, Method={manual curation}, - * source={Pfam}}
- * Note the name/value strings are trimmed of leading / trailing spaces; the - * first separator encountered is used - * - * @param line - * @param delimiter - * the major delimiter between name-value pairs - * @param separators - * one or more separators used between name and value - * @return the name-values map (which may be empty but never null) - */ - public static Map> parseNameValuePairs(String line, - String delimiter, char[] separators) - { - Map> map = new HashMap>(); - if (line == null || line.trim().length() == 0) - { - return map; - } - - for (String pair : line.trim().split(delimiter)) - { - pair = pair.trim(); - if (pair.length() == 0) - { - continue; - } - - int sepPos = -1; - for (char sep : separators) - { - int pos = pair.indexOf(sep); - if (pos > -1 && (sepPos == -1 || pos < sepPos)) - { - sepPos = pos; - } - } - - if (sepPos == -1) - { - // no name=value detected - continue; - } - - String key = pair.substring(0, sepPos).trim(); - String value = pair.substring(sepPos + 1).trim(); - if (value.length() > 0) - { - List vals = map.get(key); - if (vals == null) - { - vals = new ArrayList(); - map.put(key, vals); - } - vals.add(value); - } - } - return map; - } - - /** * Converts a list to a string with a delimiter before each term except the * first. Returns an empty string given a null or zero-length argument. This * can be replaced with StringJoiner in Java 8. 
diff --git a/test/jalview/datamodel/MappingTypeTest.java b/test/jalview/datamodel/MappingTypeTest.java new file mode 100644 index 0000000..64dc793 --- /dev/null +++ b/test/jalview/datamodel/MappingTypeTest.java @@ -0,0 +1,43 @@ +package jalview.datamodel; + +import static org.testng.AssertJUnit.assertEquals; +import static org.testng.AssertJUnit.assertSame; + +import jalview.datamodel.MappingType; + +import org.testng.annotations.Test; + +public class MappingTypeTest +{ + + @Test(groups = "Functional") + public void testGetInverse() + { + assertSame(MappingType.PeptideToNucleotide, + MappingType.NucleotideToPeptide.getInverse()); + assertSame(MappingType.NucleotideToPeptide, + MappingType.PeptideToNucleotide.getInverse()); + assertSame(MappingType.NucleotideToNucleotide, + MappingType.NucleotideToNucleotide.getInverse()); + assertSame(MappingType.PeptideToPeptide, + MappingType.PeptideToPeptide.getInverse()); + } + + @Test(groups = "Functional") + public void testGetFromRatio() + { + assertEquals(1, MappingType.NucleotideToNucleotide.getFromRatio()); + assertEquals(1, MappingType.PeptideToNucleotide.getFromRatio()); + assertEquals(1, MappingType.PeptideToPeptide.getFromRatio()); + assertEquals(3, MappingType.NucleotideToPeptide.getFromRatio()); + } + + @Test(groups = "Functional") + public void testGetToRatio() + { + assertEquals(1, MappingType.NucleotideToNucleotide.getToRatio()); + assertEquals(3, MappingType.PeptideToNucleotide.getToRatio()); + assertEquals(1, MappingType.PeptideToPeptide.getToRatio()); + assertEquals(1, MappingType.NucleotideToPeptide.getToRatio()); + } +} diff --git a/test/jalview/io/FeaturesFileTest.java b/test/jalview/io/FeaturesFileTest.java index 506ee91..7112c77 100644 --- a/test/jalview/io/FeaturesFileTest.java +++ b/test/jalview/io/FeaturesFileTest.java @@ -256,7 +256,7 @@ public class FeaturesFileTest String gffData = "##gff-version 3\n" + "FER_CAPAA\tuniprot\tMETAL\t39\t39\t0.0\t.\t.\t" + "Note=Iron-sulfur (2Fe-2S);Note=another 
note;evidence=ECO:0000255|PROSITE-ProRule:PRU00465\n" - + "FER1_SOLLC\tuniprot\tPfam\t55\t130\t3.0\t.\t."; + + "FER1_SOLLC\tuniprot\tPfam\t55\t130\t3.0\t.\t.\tID=$23"; FeaturesFile featuresFile = new FeaturesFile(gffData, FormatAdapter.PASTE); assertTrue("Failed to parse features file", @@ -339,8 +339,8 @@ public class FeaturesFileTest assertEquals("no sequences extracted from GFF3 file", 2, dataset.getHeight()); - SequenceI seq1 = dataset.findName("seq1"), seq2 = dataset - .findName("seq2"); + SequenceI seq1 = dataset.findName("seq1"); + SequenceI seq2 = dataset.findName("seq2"); assertNotNull(seq1); assertNotNull(seq2); assertFalse( @@ -402,8 +402,7 @@ public class FeaturesFileTest assertTrue( "Didn't read the alignment into an alignframe from Gff3 File", af != null); - // FIXME codon mappings are on the alignment but not on the dataset - checkDatasetfromSimpleGff3(af.getViewport().getAlignment()/* .getDataset() */); + checkDatasetfromSimpleGff3(af.getViewport().getAlignment()); } @Test(groups = { "Functional" }) @@ -418,64 +417,4 @@ public class FeaturesFileTest parseResult); checkDatasetfromSimpleGff3(dataset); } - - /** - * Tests loading exonerate GFF2 output, including 'similarity' alignment - * feature, on to sequences - */ - @Test(groups = { "Functional" }) - public void testExonerateImport() - { - FileLoader loader = new FileLoader(false); - AlignFrame af = loader.LoadFileWaitTillLoaded( - "examples/testdata/exonerateseqs.fa", - FormatAdapter.FILE); - - af.loadJalviewDataFile("examples/testdata/exonerateoutput.gff", - FormatAdapter.FILE, null, null); - - /* - * verify one mapping to a dummy sequence, one to a real one - */ - Set mappings = af - .getViewport().getAlignment().getDataset().getCodonFrames(); - assertEquals(2, mappings.size()); - Iterator iter = mappings.iterator(); - - // first mapping is to dummy sequence - AlignedCodonFrame mapping = iter.next(); - Mapping[] mapList = mapping.getProtMappings(); - assertEquals(1, mapList.length); - 
assertTrue(mapList[0].getTo() instanceof SequenceDummy); - assertEquals("DDB_G0269124", mapList[0].getTo().getName()); - - // second mapping is to a sequence in the alignment - mapping = iter.next(); - mapList = mapping.getProtMappings(); - assertEquals(1, mapList.length); - SequenceI proteinSeq = af.getViewport().getAlignment() - .findName("DDB_G0280897"); - assertSame(proteinSeq.getDatasetSequence(), mapList[0].getTo()); - assertEquals(1, mapping.getdnaToProt().length); - - // 143 in protein should map to codon [11270, 11269, 11268] in dna - int[] mappedRegion = mapList[0].getMap().locateInFrom(143, 143); - assertArrayEquals(new int[] { 11270, 11268 }, mappedRegion); - - // 182 in protein should map to codon [11153, 11152, 11151] in dna - mappedRegion = mapList[0].getMap().locateInFrom(182, 182); - assertArrayEquals(new int[] { 11153, 11151 }, mappedRegion); - - // and the reverse mapping: - mappedRegion = mapList[0].getMap().locateInTo(11151, 11153); - assertArrayEquals(new int[] { 182, 182 }, mappedRegion); - - // 11150 in dna should _not_ map to protein - mappedRegion = mapList[0].getMap().locateInTo(11150, 11150); - assertNull(mappedRegion); - - // similarly 183 in protein should _not_ map to dna - mappedRegion = mapList[0].getMap().locateInFrom(183, 183); - assertNull(mappedRegion); - } } diff --git a/test/jalview/io/gff/ExonerateHelperTest.java b/test/jalview/io/gff/ExonerateHelperTest.java new file mode 100644 index 0000000..54d6eb2 --- /dev/null +++ b/test/jalview/io/gff/ExonerateHelperTest.java @@ -0,0 +1,295 @@ +package jalview.io.gff; + +import static org.testng.AssertJUnit.assertEquals; +import static org.testng.AssertJUnit.assertNull; +import static org.testng.AssertJUnit.assertSame; +import static org.testng.AssertJUnit.assertTrue; +import static org.testng.internal.junit.ArrayAsserts.assertArrayEquals; + +import jalview.datamodel.AlignedCodonFrame; +import jalview.datamodel.Alignment; +import jalview.datamodel.AlignmentI; +import 
jalview.datamodel.Mapping; +import jalview.datamodel.MappingType; +import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceDummy; +import jalview.datamodel.SequenceI; +import jalview.gui.AlignFrame; +import jalview.io.FileLoader; +import jalview.io.FormatAdapter; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import org.testng.annotations.Test; + +public class ExonerateHelperTest +{ + @Test(groups = "Functional") + public void testGetMappingType() + { + // protein-to-dna: + assertSame(MappingType.PeptideToNucleotide, + ExonerateHelper + .getMappingType("exonerate:protein2genome:local")); + assertSame(MappingType.PeptideToNucleotide, + ExonerateHelper.getMappingType("exonerate:protein2dna:local")); + + // dna-to-dna: + assertSame(MappingType.NucleotideToNucleotide, + ExonerateHelper.getMappingType("coding2coding")); + assertSame(MappingType.NucleotideToNucleotide, + ExonerateHelper.getMappingType("coding2genome")); + assertSame(MappingType.NucleotideToNucleotide, + ExonerateHelper.getMappingType("cdna2genome")); + assertSame(MappingType.NucleotideToNucleotide, + ExonerateHelper.getMappingType("genome2genome")); + assertNull(ExonerateHelper.getMappingType("affine:local")); + } + + /** + * Test processing one exonerate GFF line for the case where the mapping is + * protein2dna, similarity feature is on the query (the protein), match to the + * forward strand, target sequence is in neither the alignment nor the 'new + * sequences' + * + * @throws IOException + */ + @Test(groups = "Functional") + public void testProcessGffSimilarity_protein2dna_forward_querygff() + throws IOException + { + ExonerateHelper testee = new ExonerateHelper(); + List newseqs = new ArrayList(); + String[] gff = "Seq\texonerate:protein2dna:local\tsimilarity\t3\t10\t.\t+\t.\talignment_id 0 ; Target dna1 ; Align 3 400 8" + .split("\\t"); + SequenceI seq = new Sequence("Seq", 
"PQRASTGKEEDVMIWCHQN"); + seq.createDatasetSequence(); + AlignmentI align = new Alignment(new SequenceI[] {}); + Map> set = Gff2Helper.parseNameValuePairs(gff[8]); + + /* + * this should create a mapping from Seq2/3-10 to virtual sequence + * dna1 (added to newseqs) positions 400-423 + */ + testee.processGffSimilarity(set, seq, gff, align, newseqs, false); + assertEquals(1, newseqs.size()); + assertTrue(newseqs.get(0) instanceof SequenceDummy); + assertEquals("dna1", newseqs.get(0).getName()); + assertEquals(1, align.getCodonFrames().size()); + AlignedCodonFrame mapping = align.getCodonFrames().iterator().next(); + assertEquals(1, mapping.getAaSeqs().length); + assertSame(seq.getDatasetSequence(), mapping.getAaSeqs()[0]); + assertEquals(1, mapping.getdnaSeqs().length); + assertSame(newseqs.get(0), mapping.getdnaSeqs()[0]); + assertEquals(1, mapping.getdnaToProt().length); + assertEquals(1, mapping.getdnaToProt()[0].getFromRanges().size()); + assertArrayEquals(new int[] { 400, 423 }, mapping.getdnaToProt()[0] + .getFromRanges().get(0)); + assertEquals(1, mapping.getdnaToProt()[0].getToRanges().size()); + assertArrayEquals(new int[] { 3, 10 }, mapping.getdnaToProt()[0] + .getToRanges().get(0)); + } + + /** + * Test processing one exonerate GFF line for the case where the mapping is + * protein2dna, similarity feature is on the query (the protein), match to the + * reverse strand + * + * @throws IOException + */ + @Test(groups = "Functional") + public void testProcessGffSimilarity_protein2dna_reverse_querygff() + throws IOException + { + ExonerateHelper testee = new ExonerateHelper(); + List newseqs = new ArrayList(); + String[] gff = "Seq\texonerate:protein2dna:local\tsimilarity\t3\t10\t0\t-\t.\talignment_id 0 ; Target dna1 ; Align 3 400 8" + .split("\\t"); + SequenceI seq = new Sequence("Seq", "PQRASTGKEEDVMIWCHQN"); + seq.createDatasetSequence(); + AlignmentI align = new Alignment(new SequenceI[] {}); + Map> set = Gff2Helper.parseNameValuePairs(gff[8]); + + /* + * 
this should create a mapping from Seq2/3-10 to virtual sequence + * dna1 (added to newseqs) positions 400-377 (reverse) + */ + testee.processGffSimilarity(set, seq, gff, align, newseqs, false); + assertEquals(1, newseqs.size()); + assertTrue(newseqs.get(0) instanceof SequenceDummy); + assertEquals("dna1", newseqs.get(0).getName()); + assertEquals(1, align.getCodonFrames().size()); + AlignedCodonFrame mapping = align.getCodonFrames().iterator().next(); + assertEquals(1, mapping.getAaSeqs().length); + assertSame(seq.getDatasetSequence(), mapping.getAaSeqs()[0]); + assertEquals(1, mapping.getdnaSeqs().length); + assertSame(newseqs.get(0), mapping.getdnaSeqs()[0]); + assertEquals(1, mapping.getdnaToProt().length); + assertEquals(1, mapping.getdnaToProt()[0].getFromRanges().size()); + assertArrayEquals(new int[] { 400, 377 }, mapping.getdnaToProt()[0] + .getFromRanges().get(0)); + assertEquals(1, mapping.getdnaToProt()[0].getToRanges().size()); + assertArrayEquals(new int[] { 3, 10 }, mapping.getdnaToProt()[0] + .getToRanges().get(0)); + } + + /** + * Test processing one exonerate GFF line for the case where the mapping is + * protein2dna, similarity feature is on the target (the dna), match to the + * forward strand + * + * @throws IOException + */ + @Test(groups = "Functional") + public void testProcessGffSimilarity_protein2dna_forward_targetgff() + throws IOException + { + ExonerateHelper testee = new ExonerateHelper(); + List newseqs = new ArrayList(); + String[] gff = "dna1\texonerate:protein2dna:local\tsimilarity\t400\t423\t0\t+\t.\talignment_id 0 ; Query Prot1 ; Align 400 3 24" + .split("\\t"); + SequenceI seq = new Sequence("dna1/391-430", + "CGATCCGATCCGATCCGATCCGATCCGATCCGATCCGATC"); + seq.createDatasetSequence(); + AlignmentI align = new Alignment(new SequenceI[] { seq }); + // GFF feature on the target describes mapping from base 400 for + // count 24 to position 3 + Map> set = Gff2Helper.parseNameValuePairs(gff[8]); + + /* + * this should create a mapping 
from virtual sequence dna1 (added to + * newseqs) positions 400-423 to Prot1/3-10 + */ + testee.processGffSimilarity(set, seq, gff, align, newseqs, false); + assertEquals(1, newseqs.size()); + assertTrue(newseqs.get(0) instanceof SequenceDummy); + assertEquals("Prot1", newseqs.get(0).getName()); + assertEquals(1, align.getCodonFrames().size()); + AlignedCodonFrame mapping = align.getCodonFrames().iterator().next(); + assertEquals(1, mapping.getAaSeqs().length); + assertSame(newseqs.get(0), mapping.getAaSeqs()[0]); + assertSame(seq.getDatasetSequence(), mapping.getdnaSeqs()[0]); + assertEquals(1, mapping.getdnaSeqs().length); + assertEquals(1, mapping.getdnaToProt().length); + assertEquals(1, mapping.getdnaToProt()[0].getFromRanges().size()); + assertArrayEquals(new int[] { 400, 423 }, mapping.getdnaToProt()[0] + .getFromRanges().get(0)); + assertEquals(1, mapping.getdnaToProt()[0].getToRanges().size()); + assertArrayEquals(new int[] { 3, 10 }, mapping.getdnaToProt()[0] + .getToRanges().get(0)); + } + + /** + * Test processing one exonerate GFF line for the case where the mapping is + * protein2dna, similarity feature is on the target (the dna), match to the + * reverse strand + * + * @throws IOException + */ + @Test(groups = "Functional") + public void testProcessGffSimilarity_protein2dna_reverse_targetgff() + throws IOException + { + ExonerateHelper testee = new ExonerateHelper(); + List newseqs = new ArrayList(); + String[] gff = "dna1\texonerate:protein2dna:local\tsimilarity\t377\t400\t0\t-\t.\talignment_id 0 ; Query Prot1 ; Align 400 3 24" + .split("\\t"); + SequenceI seq = new Sequence("dna1/371-410", + "CGATCCGATCCGATCCGATCCGATCCGATCCGATCCGATC"); + seq.createDatasetSequence(); + AlignmentI align = new Alignment(new SequenceI[] { seq }); + // GFF feature on the target describes mapping from base 400 for + // count 24 to position 3 + Map> set = Gff2Helper.parseNameValuePairs(gff[8]); + + /* + * this should create a mapping from virtual sequence dna1 (added to + 
* newseqs) positions 400-377 (reverse) to Prot1/3-10 + */ + testee.processGffSimilarity(set, seq, gff, align, newseqs, false); + assertEquals(1, newseqs.size()); + assertTrue(newseqs.get(0) instanceof SequenceDummy); + assertEquals("Prot1", newseqs.get(0).getName()); + assertEquals(1, align.getCodonFrames().size()); + AlignedCodonFrame mapping = align.getCodonFrames().iterator().next(); + assertEquals(1, mapping.getAaSeqs().length); + assertSame(newseqs.get(0), mapping.getAaSeqs()[0]); + assertSame(seq.getDatasetSequence(), mapping.getdnaSeqs()[0]); + assertEquals(1, mapping.getdnaSeqs().length); + assertEquals(1, mapping.getdnaToProt().length); + assertEquals(1, mapping.getdnaToProt()[0].getFromRanges().size()); + assertArrayEquals(new int[] { 400, 377 }, mapping.getdnaToProt()[0] + .getFromRanges().get(0)); + assertEquals(1, mapping.getdnaToProt()[0].getToRanges().size()); + assertArrayEquals(new int[] { 3, 10 }, mapping.getdnaToProt()[0] + .getToRanges().get(0)); + } + + /** + * Tests loading exonerate GFF2 output, including 'similarity' alignment + * feature, on to sequences + */ + @Test(groups = { "Functional" }) + public void testAddExonerateGffToAlignment() + { + FileLoader loader = new FileLoader(false); + AlignFrame af = loader.LoadFileWaitTillLoaded( + "examples/testdata/exonerateseqs.fa", + FormatAdapter.FILE); + + af.loadJalviewDataFile("examples/testdata/exonerateoutput.gff", + FormatAdapter.FILE, null, null); + + /* + * verify one mapping to a dummy sequence, one to a real one + */ + List mappings = af + .getViewport().getAlignment().getDataset().getCodonFrames(); + assertEquals(2, mappings.size()); + Iterator iter = mappings.iterator(); + + // first mapping is to dummy sequence + AlignedCodonFrame mapping = iter.next(); + Mapping[] mapList = mapping.getProtMappings(); + assertEquals(1, mapList.length); + assertTrue(mapList[0].getTo() instanceof SequenceDummy); + assertEquals("DDB_G0269124", mapList[0].getTo().getName()); + + // 143 in protein should 
map to codon [11270, 11269, 11268] in dna + int[] mappedRegion = mapList[0].getMap().locateInFrom(143, 143); + assertArrayEquals(new int[] { 11270, 11268 }, mappedRegion); + + // second mapping is to a sequence in the alignment + mapping = iter.next(); + mapList = mapping.getProtMappings(); + assertEquals(1, mapList.length); + SequenceI proteinSeq = af.getViewport().getAlignment() + .findName("DDB_G0280897"); + assertSame(proteinSeq.getDatasetSequence(), mapList[0].getTo()); + assertEquals(1, mapping.getdnaToProt().length); + + // 143 in protein should map to codon [11270, 11269, 11268] in dna + mappedRegion = mapList[0].getMap().locateInFrom(143, 143); + assertArrayEquals(new int[] { 11270, 11268 }, mappedRegion); + + // 182 in protein should map to codon [11153, 11152, 11151] in dna + mappedRegion = mapList[0].getMap().locateInFrom(182, 182); + assertArrayEquals(new int[] { 11153, 11151 }, mappedRegion); + + // and the reverse mapping: + mappedRegion = mapList[0].getMap().locateInTo(11151, 11153); + assertArrayEquals(new int[] { 182, 182 }, mappedRegion); + + // 11150 in dna should _not_ map to protein + mappedRegion = mapList[0].getMap().locateInTo(11150, 11150); + assertNull(mappedRegion); + + // similarly 183 in protein should _not_ map to dna + mappedRegion = mapList[0].getMap().locateInFrom(183, 183); + assertNull(mappedRegion); + } +} diff --git a/test/jalview/io/gff/Gff3HelperTest.java b/test/jalview/io/gff/Gff3HelperTest.java new file mode 100644 index 0000000..420b032 --- /dev/null +++ b/test/jalview/io/gff/Gff3HelperTest.java @@ -0,0 +1,206 @@ +package jalview.io.gff; + +import static org.testng.AssertJUnit.assertEquals; +import static org.testng.AssertJUnit.assertNull; +import static org.testng.AssertJUnit.assertSame; +import static org.testng.AssertJUnit.assertTrue; +import static org.testng.internal.junit.ArrayAsserts.assertArrayEquals; + +import jalview.datamodel.AlignedCodonFrame; +import jalview.datamodel.Alignment; +import 
jalview.datamodel.AlignmentI; +import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceDummy; +import jalview.datamodel.SequenceFeature; +import jalview.datamodel.SequenceI; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.testng.annotations.Test; + +public class Gff3HelperTest +{ + + /** + * Test processing one PASA GFF line giving a match from forward strand to + * forward strand + * + * @throws IOException + */ + @Test(groups = "Functional") + public void testProcessCdnaMatch_forwardToForward() throws IOException + { + GffHelperBase testee = new Gff3Helper(); + List newseqs = new ArrayList(); + String[] gff = "gi|68711\tblat-pasa\tcDNA_match\t12923\t13060\t98.55\t+\t.\tID=align_68;Target=gi|N37351 1 138 +" + .split("\\t"); + SequenceI seq = new Sequence("gi|68711", + "GAATTCGTTCATGTAGGTTGATTTTTATT"); + seq.createDatasetSequence(); + AlignmentI align = new Alignment(new SequenceI[] {}); + + /* + * this should create a mapping from gi|68711/12923-13060 + * to virtual sequence gi|N37351 (added to newseqs) positions 1-138 + */ + testee.processGff(seq, gff, align, newseqs, false); + assertEquals(1, newseqs.size()); + assertTrue(newseqs.get(0) instanceof SequenceDummy); + assertEquals("gi|N37351", newseqs.get(0).getName()); + assertEquals(1, align.getCodonFrames().size()); + AlignedCodonFrame mapping = align.getCodonFrames().iterator().next(); + + /* + * 'dnaseqs' (map from) is here [gi|68711] + * 'aaseqs' (map to) is here [gi|N37351] + */ + // TODO use more suitable naming in AlignedCodonFrame + assertEquals(1, mapping.getAaSeqs().length); + assertSame(seq.getDatasetSequence(), mapping.getdnaSeqs()[0]); + assertEquals(1, mapping.getdnaSeqs().length); + assertSame(newseqs.get(0), mapping.getAaSeqs()[0]); + assertEquals(1, mapping.getdnaToProt().length); + assertEquals(1, mapping.getdnaToProt()[0].getFromRanges().size()); + assertArrayEquals(new int[] { 12923, 13060 }, mapping.getdnaToProt()[0] + 
.getFromRanges().get(0)); + assertEquals(1, mapping.getdnaToProt()[0].getToRanges().size()); + assertArrayEquals(new int[] { 1, 138 }, mapping.getdnaToProt()[0] + .getToRanges().get(0)); + } + + /** + * Test processing one PASA GFF line giving a match from forward strand to + * reverse strand + * + * @throws IOException + */ + @Test(groups = "Functional") + public void testProcessCdnaMatch_forwardToReverse() throws IOException + { + GffHelperBase testee = new Gff3Helper(); + List newseqs = new ArrayList(); + String[] gff = "gi|68711\tblat-pasa\tcDNA_match\t12923\t13060\t98.55\t+\t.\tID=align_68;Target=gi|N37351 1 138 -" + .split("\\t"); + SequenceI seq = new Sequence("gi|68711", + "GAATTCGTTCATGTAGGTTGATTTTTATT"); + seq.createDatasetSequence(); + AlignmentI align = new Alignment(new SequenceI[] {}); + + /* + * this should create a mapping from gi|68711/12923-13060 + * to virtual sequence gi|N37351 (added to newseqs) positions 138-1 + */ + testee.processGff(seq, gff, align, newseqs, false); + assertEquals(1, newseqs.size()); + assertTrue(newseqs.get(0) instanceof SequenceDummy); + assertEquals("gi|N37351", newseqs.get(0).getName()); + assertEquals(1, align.getCodonFrames().size()); + AlignedCodonFrame mapping = align.getCodonFrames().iterator().next(); + + /* + * 'dnaseqs' (map from) is here [gi|68711] + * 'aaseqs' (map to) is here [gi|N37351] + */ + // TODO use more suitable naming in AlignedCodonFrame + assertEquals(1, mapping.getAaSeqs().length); + assertSame(seq.getDatasetSequence(), mapping.getdnaSeqs()[0]); + assertEquals(1, mapping.getdnaSeqs().length); + assertSame(newseqs.get(0), mapping.getAaSeqs()[0]); + assertEquals(1, mapping.getdnaToProt().length); + assertEquals(1, mapping.getdnaToProt()[0].getFromRanges().size()); + assertArrayEquals(new int[] { 12923, 13060 }, mapping.getdnaToProt()[0] + .getFromRanges().get(0)); + assertEquals(1, mapping.getdnaToProt()[0].getToRanges().size()); + assertArrayEquals(new int[] { 138, 1 }, mapping.getdnaToProt()[0] + 
.getToRanges().get(0)); + } + + /** + * Test processing one PASA GFF line giving a match from reverse complement + * strand to forward strand + * + * @throws IOException + */ + @Test(groups = "Functional") + public void testProcessCdnaMatch_reverseToForward() throws IOException + { + GffHelperBase testee = new Gff3Helper(); + List newseqs = new ArrayList(); + String[] gff = "gi|68711\tblat-pasa\tcDNA_match\t12923\t13060\t98.55\t-\t.\tID=align_68;Target=gi|N37351 1 138 +" + .split("\\t"); + SequenceI seq = new Sequence("gi|68711", + "GAATTCGTTCATGTAGGTTGATTTTTATT"); + seq.createDatasetSequence(); + AlignmentI align = new Alignment(new SequenceI[] {}); + + /* + * (For now) we don't process reverse complement mappings; to do this + * would require (a) creating a virtual sequence placeholder for the + * reverse complement (b) resolving the sequence by its id from some + * source (GFF ##FASTA or other) (c) creating the reverse complement + * sequence (d) updating the mapping to be to the reverse complement + */ + SequenceFeature sf = testee.processGff(seq, gff, align, newseqs, false); + assertNull(sf); + assertTrue(newseqs.isEmpty()); + } + + /** + * Test processing two PASA GFF lines representing a spliced mapping + * + * @throws IOException + */ + @Test(groups = "Functional") + public void testProcessCdnaMatch_spliced() throws IOException + { + GffHelperBase testee = new Gff3Helper(); + List newseqs = new ArrayList(); + SequenceI seq = new Sequence("gi|68711", + "GAATTCGTTCATGTAGGTTGATTTTTATT"); + seq.createDatasetSequence(); + AlignmentI align = new Alignment(new SequenceI[] {}); + + // mapping from gi|68711 12923-13060 to gi|N37351 1-138 + String[] gff = "gi|68711\tblat-pasa\tcDNA_match\t12923\t13060\t98.55\t+\t.\tID=align_68;Target=gi|N37351 1 138 +" + .split("\\t"); + testee.processGff(seq, gff, align, newseqs, false); + // mapping from gi|68711 13411-13550 to gi|N37351 139-278 + gff = 
"gi|68711\tblat-pasa\tcDNA_match\t13411\t13550\t98.55\t+\t.\tID=align_68;Target=gi|N37351 139 278 +" + .split("\\t"); + testee.processGff(seq, gff, align, newseqs, false); + + assertEquals(1, newseqs.size()); + assertTrue(newseqs.get(0) instanceof SequenceDummy); + assertEquals("gi|N37351", newseqs.get(0).getName()); + + // only 1 AlignedCodonFrame added to the alignment with both mappings! + // (this is important for 'align cdna to genome' to work correctly) + assertEquals(1, align.getCodonFrames().size()); + AlignedCodonFrame mapping = align.getCodonFrames().get(0); + + /* + * 'dnaseqs' (map from) is here [gi|68711] + * 'aaseqs' (map to) is here [gi|N37351] + */ + // TODO use more suitable naming in AlignedCodonFrame + assertEquals(1, mapping.getAaSeqs().length); + assertSame(seq.getDatasetSequence(), mapping.getdnaSeqs()[0]); + assertEquals(1, mapping.getdnaSeqs().length); + assertSame(newseqs.get(0), mapping.getAaSeqs()[0]); + assertEquals(1, mapping.getdnaToProt().length); + assertEquals(2, mapping.getdnaToProt()[0].getFromRanges().size()); + // the two spliced dna ranges are combined in one MapList + assertArrayEquals(new int[] { 12923, 13060 }, + mapping.getdnaToProt()[0] + .getFromRanges().get(0)); + assertArrayEquals(new int[] { 13411, 13550 }, mapping.getdnaToProt()[0] + .getFromRanges().get(1)); + assertEquals(1, mapping.getdnaToProt()[0].getToRanges().size()); + // the two cdna ranges are merged into one contiguous region + assertArrayEquals(new int[] { 1, 278 }, mapping.getdnaToProt()[0] + .getToRanges().get(0)); + } + +} diff --git a/test/jalview/io/gff/GffHelperBaseTest.java b/test/jalview/io/gff/GffHelperBaseTest.java new file mode 100644 index 0000000..fe8f88e --- /dev/null +++ b/test/jalview/io/gff/GffHelperBaseTest.java @@ -0,0 +1,168 @@ +package jalview.io.gff; + +import static org.testng.AssertJUnit.assertEquals; +import static org.testng.AssertJUnit.assertFalse; +import static org.testng.AssertJUnit.assertTrue; + +import java.util.Arrays; 
+import java.util.List; +import java.util.Map; + +import org.testng.annotations.Test; + +public class GffHelperBaseTest +{ + + /** + * Test the method that parses lines like
+ * ID=2345;Name=Something,Another thing;Notes=Hello;Notes=World + */ + @Test(groups = { "Functional" }) + public void testParseNameValuePairs() + { + assertTrue(GffHelperBase.parseNameValuePairs(null, ";", ' ', ",") + .isEmpty()); + assertTrue(GffHelperBase.parseNameValuePairs("", ";", ' ', ",") + .isEmpty()); + assertTrue(GffHelperBase.parseNameValuePairs("hello=world", ";", ' ', + ",").isEmpty()); + + Map> map = GffHelperBase.parseNameValuePairs( + "hello world", ";", ' ', ", "); + assertEquals(1, map.size()); + assertEquals(1, map.get("hello").size()); + assertEquals("world", map.get("hello").get(0)); + + map = GffHelperBase + .parseNameValuePairs( + "Method= manual curation ;nothing; Notes=F2 S ; Notes=Metal,Shiny; Type=", + ";", '=', ","); + + // Type is ignored as no value was supplied + assertEquals(2, map.size()); + + assertEquals(1, map.get("Method").size()); + assertEquals("manual curation", map.get("Method").get(0)); // trimmed + + assertEquals(3, map.get("Notes").size()); + assertEquals("F2 S", map.get("Notes").get(0)); + assertEquals("Metal", map.get("Notes").get(1)); + assertEquals("Shiny", map.get("Notes").get(2)); + } + + /** + * Test for the method that tries to trim mappings to equivalent lengths + */ + @Test(groups = "Functional") + public void testTrimMapping() + { + int[] from = { 1, 12 }; + int[] to = { 20, 31 }; + assertTrue(GffHelperBase.trimMapping(from, to, 1, 1)); + assertEquals("[1, 12]", Arrays.toString(from)); // unchanged + assertEquals("[20, 31]", Arrays.toString(to)); // unchanged + + // from too long: + from = new int[] { 1, 13 }; + assertTrue(GffHelperBase.trimMapping(from, to, 1, 1)); + assertEquals("[1, 12]", Arrays.toString(from)); // trimmed + assertEquals("[20, 31]", Arrays.toString(to)); // unchanged + + // to too long: + to = new int[] { 20, 33 }; + assertTrue(GffHelperBase.trimMapping(from, to, 1, 1)); + assertEquals("[1, 12]", Arrays.toString(from)); // unchanged + assertEquals("[20, 31]", Arrays.toString(to)); // 
trimmed + + // from reversed: + from = new int[] { 12, 1 }; + assertTrue(GffHelperBase.trimMapping(from, to, 1, 1)); + assertEquals("[12, 1]", Arrays.toString(from)); // unchanged + assertEquals("[20, 31]", Arrays.toString(to)); // unchanged + + // to reversed: + to = new int[] { 31, 20 }; + assertTrue(GffHelperBase.trimMapping(from, to, 1, 1)); + assertEquals("[12, 1]", Arrays.toString(from)); // unchanged + assertEquals("[31, 20]", Arrays.toString(to)); // unchanged + + // from reversed and too long: + from = new int[] { 14, 1 }; + assertTrue(GffHelperBase.trimMapping(from, to, 1, 1)); + assertEquals("[14, 3]", Arrays.toString(from)); // end trimmed + assertEquals("[31, 20]", Arrays.toString(to)); // unchanged + + // to reversed and too long: + to = new int[] { 31, 10 }; + assertTrue(GffHelperBase.trimMapping(from, to, 1, 1)); + assertEquals("[14, 3]", Arrays.toString(from)); // unchanged + assertEquals("[31, 20]", Arrays.toString(to)); // end trimmed + + // cdna to peptide (matching) + from = new int[] { 1, 18 }; + to = new int[] { 4, 9 }; + assertTrue(GffHelperBase.trimMapping(from, to, 3, 1)); + assertEquals("[1, 18]", Arrays.toString(from)); // unchanged + assertEquals("[4, 9]", Arrays.toString(to)); // unchanged + + // overlong cdna to peptide + from = new int[] { 1, 20 }; + assertTrue(GffHelperBase.trimMapping(from, to, 3, 1)); + assertEquals("[1, 18]", Arrays.toString(from)); // end trimmed + assertEquals("[4, 9]", Arrays.toString(to)); // unchanged + + // overlong cdna (reversed) to peptide + from = new int[] { 20, 1 }; + assertTrue(GffHelperBase.trimMapping(from, to, 3, 1)); + assertEquals("[20, 3]", Arrays.toString(from)); // end trimmed + assertEquals("[4, 9]", Arrays.toString(to)); // unchanged + + // overlong cdna (reversed) to peptide (reversed) + from = new int[] { 20, 1 }; + to = new int[] { 9, 4 }; + assertTrue(GffHelperBase.trimMapping(from, to, 3, 1)); + assertEquals("[20, 3]", Arrays.toString(from)); // end trimmed + assertEquals("[9, 4]", 
Arrays.toString(to)); // unchanged + + // peptide to cdna (matching) + from = new int[] { 4, 9 }; + to = new int[] { 1, 18 }; + assertTrue(GffHelperBase.trimMapping(from, to, 1, 3)); + assertEquals("[4, 9]", Arrays.toString(from)); // unchanged + assertEquals("[1, 18]", Arrays.toString(to)); // unchanged + + // peptide to overlong cdna + to = new int[] { 1, 20 }; + assertTrue(GffHelperBase.trimMapping(from, to, 1, 3)); + assertEquals("[4, 9]", Arrays.toString(from)); // unchanged + assertEquals("[1, 18]", Arrays.toString(to)); // end trimmed + + // peptide to overlong cdna (reversed) + to = new int[] { 20, 1 }; + assertTrue(GffHelperBase.trimMapping(from, to, 1, 3)); + assertEquals("[4, 9]", Arrays.toString(from)); // unchanged + assertEquals("[20, 3]", Arrays.toString(to)); // end trimmed + + // peptide (reversed) to overlong cdna (reversed) + from = new int[] { 9, 4 }; + to = new int[] { 20, 1 }; + assertTrue(GffHelperBase.trimMapping(from, to, 1, 3)); + assertEquals("[9, 4]", Arrays.toString(from)); // unchanged + assertEquals("[20, 3]", Arrays.toString(to)); // end trimmed + + // overlong peptide to word-length cdna + from = new int[] { 4, 10 }; + to = new int[] { 1, 18 }; + assertTrue(GffHelperBase.trimMapping(from, to, 1, 3)); + assertEquals("[4, 9]", Arrays.toString(from)); // end trimmed + assertEquals("[1, 18]", Arrays.toString(to)); // unchanged + + // overlong peptide to non-word-length cdna + from = new int[] { 4, 10 }; + to = new int[] { 1, 19 }; + assertFalse(GffHelperBase.trimMapping(from, to, 1, 3)); + assertEquals("[4, 10]", Arrays.toString(from)); // unchanged + assertEquals("[1, 19]", Arrays.toString(to)); // unchanged + + } +} diff --git a/test/jalview/io/gff/GffHelperFactoryTest.java b/test/jalview/io/gff/GffHelperFactoryTest.java new file mode 100644 index 0000000..657b5bd --- /dev/null +++ b/test/jalview/io/gff/GffHelperFactoryTest.java @@ -0,0 +1,72 @@ +package jalview.io.gff; + +import static org.testng.AssertJUnit.assertNull; +import 
static org.testng.AssertJUnit.assertSame; +import static org.testng.AssertJUnit.assertTrue; + +import org.testng.annotations.Test; + +public class GffHelperFactoryTest +{ + + @Test(groups = "Functional") + public void testGetHelper() + { + assertNull(GffHelperFactory.getHelper(null)); + + String tabRegex = "\\t"; + + /* + * column 3 = 'similarity' indicates exonerate GFF alignment data + */ + String gff = "submitted\taffine:local\tsimilarity\t20\t30\t99\t+\t.\t"; + // no attributes (column 9 data): + assertTrue(GffHelperFactory.getHelper(gff.split(tabRegex)) instanceof Gff2Helper); + + // attributes set but unhandled featureGroup - get generic handler + gff = "submitted\taffine:local\tsimilarity\t20\t30\t99\t+\t.\tID=$1"; + assertSame(GffHelperFactory.getHelper(gff.split(tabRegex)).getClass(), + Gff3Helper.class); + + // handled featureGroup (exonerate model) values + gff = "submitted\texonerate:protein2dna:local\tsimilarity\t20\t30\t99\t+\t.\tID=$1"; + assertTrue(GffHelperFactory.getHelper(gff.split(tabRegex)) instanceof ExonerateHelper); + + gff = "submitted\tprotein2genome\tsimilarity\t20\t30\t99\t+\t.\tID=$1"; + assertTrue(GffHelperFactory.getHelper(gff.split(tabRegex)) instanceof ExonerateHelper); + + gff = "submitted\tcoding2coding\tsimilarity\t20\t30\t99\t+\t.\tID=$1"; + assertTrue(GffHelperFactory.getHelper(gff.split(tabRegex)) instanceof ExonerateHelper); + + gff = "submitted\tcoding2genome\tsimilarity\t20\t30\t99\t+\t.\tID=$1"; + assertTrue(GffHelperFactory.getHelper(gff.split(tabRegex)) instanceof ExonerateHelper); + + gff = "submitted\tcdna2genome\tsimilarity\t20\t30\t99\t+\t.\tID=$1"; + assertTrue(GffHelperFactory.getHelper(gff.split(tabRegex)) instanceof ExonerateHelper); + + gff = "submitted\tgenome2genome\tsimilarity\t20\t30\t99\t+\t.\tID=$1"; + assertTrue(GffHelperFactory.getHelper(gff.split(tabRegex)) instanceof ExonerateHelper); + + // not case-sensitive: + gff = "submitted\tgenome2genome\tSIMILARITY\t20\t30\t99\t+\t.\tID=$1"; + 
assertTrue(GffHelperFactory.getHelper(gff.split(tabRegex)) instanceof ExonerateHelper); + + /* + * InterProScan has 'protein_match' in column 3 + */ + gff = "Submitted\tPANTHER\tprotein_match\t1\t1174\t0.0\t+\t.\tName=PTHR32154"; + assertTrue(GffHelperFactory.getHelper(gff.split(tabRegex)) instanceof InterProScanHelper); + + /* + * nothing specific - return the generic GFF3 class if Name=Value is present in col9 + */ + gff = "nothing\tinteresting\there\t20\t30\t99\t+\t.\tID=1"; + GffHelperI helper = GffHelperFactory.getHelper(gff.split(tabRegex)); + assertSame(helper.getClass(), Gff3Helper.class); + + // return the generic GFF2 class if "Name Value" is present in col9 + gff = "nothing\tinteresting\there\t20\t30\t99\t+\t.\tID 1"; + helper = GffHelperFactory.getHelper(gff.split(tabRegex)); + assertSame(helper.getClass(), Gff2Helper.class); + } +} diff --git a/test/jalview/io/ExonerateGffTest.java b/test/jalview/io/gff/GffTests.java similarity index 86% rename from test/jalview/io/ExonerateGffTest.java rename to test/jalview/io/gff/GffTests.java index 70c0ec2..77da8fa 100644 --- a/test/jalview/io/ExonerateGffTest.java +++ b/test/jalview/io/gff/GffTests.java @@ -1,4 +1,4 @@ -package jalview.io; +package jalview.io.gff; import static org.testng.AssertJUnit.assertEquals; import static org.testng.AssertJUnit.assertSame; @@ -13,28 +13,27 @@ import jalview.datamodel.Sequence; import jalview.datamodel.SequenceDummy; import jalview.datamodel.SequenceI; import jalview.gui.AlignFrame; +import jalview.io.FileLoader; +import jalview.io.FormatAdapter; -import java.util.Set; +import java.util.List; import org.testng.annotations.Test; /** - * Tests of use cases that include parsing exonerate GFF 'similarity' features. - * These describe mappings between protein and cDNA - * - * @author gmcarstairs - * + * Tests of use cases that include parsing GFF (version 2 or 3) features that + * describe mappings between protein and cDNA. 
The format of the GFF varies + * depending on which tool generated it. */ -public class ExonerateGffTest +public class GffTests { - /** * Test the case where we load a protein ('query') sequence, then exonerateGff * describing its mapping to cDNA, and then a DNA sequence including the * mapped region */ @Test(groups = "Functional") - public void testLoadProteinGffCdna() + public void testResolveExonerateGff() { String proteinSeq = ">prot1/10-16\nYCWRSGA"; AlignFrame af = new FileLoader(false).LoadFileWaitTillLoaded( @@ -55,7 +54,7 @@ public class ExonerateGffTest assertEquals(1, dataset.getSequences().size()); assertEquals("prot1", dataset.getSequenceAt(0).getName()); assertEquals("YCWRSGA", dataset.getSequenceAt(0).getSequenceAsString()); - Set mappings = dataset.getCodonFrames(); + List mappings = dataset.getCodonFrames(); assertEquals(1, mappings.size()); AlignedCodonFrame mapping = mappings.iterator().next(); SequenceI mappedDna = mapping.getDnaForAaSeq(dataset.getSequenceAt(0)); @@ -79,6 +78,7 @@ public class ExonerateGffTest /* * Now 'realise' the virtual mapping to the real DNA sequence; * interactively this could be by a drag or fetch of the sequence data + * on to the alignment */ mapping.realiseWith(dna1); // verify the mapping is now from the real, not the dummy sequence diff --git a/test/jalview/io/gff/InterProScanHelperTest.java b/test/jalview/io/gff/InterProScanHelperTest.java new file mode 100644 index 0000000..2ef4c99 --- /dev/null +++ b/test/jalview/io/gff/InterProScanHelperTest.java @@ -0,0 +1,71 @@ +package jalview.io.gff; + +import static org.testng.AssertJUnit.assertEquals; +import static org.testng.AssertJUnit.assertSame; +import static org.testng.AssertJUnit.assertTrue; +import static org.testng.internal.junit.ArrayAsserts.assertArrayEquals; + +import jalview.datamodel.AlignedCodonFrame; +import jalview.datamodel.Alignment; +import jalview.datamodel.AlignmentI; +import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceDummy; 
+import jalview.datamodel.SequenceI; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import org.testng.annotations.Test; + +public class InterProScanHelperTest +{ + + /** + * Test processing one InterProScan GFF line + * + * @throws IOException + */ + @Test(groups = "Functional") + public void testProcessProteinMatch() throws IOException + { + InterProScanHelper testee = new InterProScanHelper(); + List newseqs = new ArrayList(); + String[] gff = "Submitted\tPfam\tprotein_match\t5\t30\t0\t+\t.\tName=PF12838;Target=Submitted 5 30;signature_desc=4Fe-4S dicluster domain;ID=match$17_5_30" + .split("\\t"); + SequenceI seq = new Sequence("Prot1", "PQRASTGKEEDVMIWCHQN"); + seq.createDatasetSequence(); + AlignmentI align = new Alignment(new SequenceI[] {}); + Map> set = Gff3Helper.parseNameValuePairs(gff[8]); + + /* + * this should create a mapping from Prot1/5-30 to virtual sequence + * match$17_5_30 (added to newseqs) positions 1-26 + */ + testee.processProteinMatch(set, seq, gff, align, newseqs, false); + assertEquals(1, newseqs.size()); + assertTrue(newseqs.get(0) instanceof SequenceDummy); + assertEquals("match$17_5_30", newseqs.get(0).getName()); + assertEquals(1, align.getCodonFrames().size()); + AlignedCodonFrame mapping = align.getCodonFrames().iterator().next(); + + /* + * 'dnaseqs' (map from) is here [Prot1] + * 'aaseqs' (map to) is here [match$17_5_30] + */ + // TODO use more suitable naming in AlignedCodonFrame + assertEquals(1, mapping.getAaSeqs().length); + assertSame(seq.getDatasetSequence(), mapping.getdnaSeqs()[0]); + assertEquals(1, mapping.getdnaSeqs().length); + assertSame(newseqs.get(0), mapping.getAaSeqs()[0]); + assertEquals(1, mapping.getdnaToProt().length); + assertEquals(1, mapping.getdnaToProt()[0].getFromRanges().size()); + assertArrayEquals(new int[] { 5, 30 }, mapping.getdnaToProt()[0] + .getFromRanges().get(0)); + assertEquals(1, mapping.getdnaToProt()[0].getToRanges().size()); + 
assertArrayEquals(new int[] { 1, 26 }, mapping.getdnaToProt()[0] + .getToRanges().get(0)); + } + +} diff --git a/test/jalview/util/StringUtilsTest.java b/test/jalview/util/StringUtilsTest.java index 0b776d1..dc2555b 100644 --- a/test/jalview/util/StringUtilsTest.java +++ b/test/jalview/util/StringUtilsTest.java @@ -27,7 +27,6 @@ import static org.testng.AssertJUnit.assertTrue; import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import java.util.Map; import org.testng.annotations.Test; @@ -132,46 +131,6 @@ public class StringUtilsTest { "a", "b*c", "cde" }, "*")); } - /** - * Test the method that parses lines like
- * ID=2345;Name=Something; - */ - @Test(groups = { "Functional" }) - public void testParseNameValuePairs() - { - char[] separators = new char[] { ' ' }; - assertTrue(StringUtils.parseNameValuePairs(null, ";", separators) - .isEmpty()); - assertTrue(StringUtils.parseNameValuePairs("", ";", separators) - .isEmpty()); - assertTrue(StringUtils.parseNameValuePairs("hello=world", ";", - separators).isEmpty()); - - Map> map = StringUtils.parseNameValuePairs( - "hello world", ";", separators); - assertEquals(1, map.size()); - assertEquals(1, map.get("hello").size()); - assertEquals("world", map.get("hello").get(0)); - - separators = new char[] { ' ', '=' }; - map = StringUtils - .parseNameValuePairs( - "Method= manual curation ;nothing; Notes F2=S ; Notes=Metal; Type=", - ";", separators); - - // Type is ignored as no value was supplied - assertEquals(2, map.size()); - - // equals separator used ahead of space separator: - assertEquals(1, map.get("Method").size()); - assertEquals("manual curation", map.get("Method").get(0)); // trimmed - - assertEquals(2, map.get("Notes").size()); - // space separator used ahead of equals separator - assertEquals("F2=S", map.get("Notes").get(0)); - assertEquals("Metal", map.get("Notes").get(1)); - } - @Test(groups = { "Functional" }) public void testListToDelimitedString() { -- 1.7.10.2