From: Jim Procter Date: Thu, 19 Feb 2015 15:20:14 +0000 (+0000) Subject: JAL-653 GFF attributes parsing mechanism, with first attempt at processing exonerate... X-Git-Tag: Release_2_10_0~296^2~163 X-Git-Url: http://source.jalview.org/gitweb/?p=jalview.git;a=commitdiff_plain;h=b005d2429ec2b467aaf18318336842dc4c292b76 JAL-653 GFF attributes parsing mechanism, with first attempt at processing exonerate's 'similarity' features as cDNA/protein mappings --- diff --git a/src/jalview/io/FeaturesFile.java b/src/jalview/io/FeaturesFile.java index a9ad665..baf9b82 100755 --- a/src/jalview/io/FeaturesFile.java +++ b/src/jalview/io/FeaturesFile.java @@ -527,10 +527,15 @@ public class FeaturesFile extends AlignFile sf.setValue("ATTRIBUTES", attributes.toString()); } - seq.addSequenceFeature(sf); - while ((seq = align.findName(seq, seqId, true)) != null) + if (processOrAddSeqFeature(align, newseqs, seq, sf, GFFFile, + relaxedIdmatching)) { - seq.addSequenceFeature(new SequenceFeature(sf)); + // check whether we should add the sequence feature to any other + // sequences in the alignment with the same or similar + while ((seq = align.findName(seq, seqId, true)) != null) + { + seq.addSequenceFeature(new SequenceFeature(sf)); + } } break; } @@ -639,6 +644,280 @@ public class FeaturesFile extends AlignFile return true; } + + /** + * take a sequence feature and examine its attributes to decide how it should + * be added to a sequence + * + * @param seq + * - the destination sequence constructed or discovered in the + * current context + * @param sf + * - the base feature with ATTRIBUTES property containing any + * additional attributes + * @param gFFFile + * - true if we are processing a GFF annotation file + * @return true if sf was actually added to the sequence, false if it was + * processed in another way + */ + public boolean processOrAddSeqFeature(AlignmentI align, List newseqs, SequenceI seq, SequenceFeature sf, + boolean gFFFile, boolean relaxedIdMatching) + { + String attr = (String) sf.getValue("ATTRIBUTES"); + boolean add = true; + if (gFFFile && attr != null) + { + int nattr=8; + + for (String attset : attr.split("\t")) + { + if (attset==null || attset.trim().length()==0) + { + continue; + } + nattr++; + Map> set = new HashMap>(); + // normally, only expect one column - 9 - in this field + // the attributes (Gff3) or groups (gff2) field + for (String pair : attset.trim().split(";")) + { + pair = pair.trim(); + if (pair.length() == 0) + { + continue; + } + + // expect either space seperated (gff2) or '=' separated (gff3) + // key/value pairs here + + int eqpos = pair.indexOf('='),sppos = pair.indexOf(' '); + String key = null, value = null; + + if (sppos > -1 && (eqpos == -1 || sppos < eqpos)) + { + key = pair.substring(0, sppos); + value = pair.substring(sppos + 1); + } else { + if (eqpos > -1 && (sppos == -1 || eqpos < sppos)) + { + key = pair.substring(0, eqpos); + value = pair.substring(eqpos + 1); + } else + { + key = pair; + } + } + if (key != null) + { + List vals = set.get(key); + if (vals == null) + { + vals = new ArrayList(); + set.put(key, vals); + } + if (value != null) + { + vals.add(value.trim()); + } + } + } + try + { + add &= processGffKey(set, nattr, seq, sf, align, newseqs, + relaxedIdMatching); // process decides if + // feature is actually + // added + } catch (InvalidGFF3FieldException ivfe) + { + System.err.println(ivfe); + } + } + } + if (add) + { + seq.addSequenceFeature(sf); + } + return add; + } + + public class InvalidGFF3FieldException extends Exception + { + String field, value; + + public InvalidGFF3FieldException(String field, + Map> set, String message) + { + super(message + " (Field was " + field + " and value was " + + set.get(field).toString()); + this.field = field; + this.value = set.get(field).toString(); + } + + } + + /** + * take a set of keys for a feature and interpret them + * + * @param set + * @param nattr + * @param seq + * @param sf + * @return + */ + public boolean processGffKey(Map> set, int nattr, + SequenceI seq, SequenceFeature sf, AlignmentI align, + List newseqs, boolean relaxedIdMatching) + throws InvalidGFF3FieldException + { + String attr; + // decide how to interpret according to type + if (sf.getType().equals("similarity")) + { + int strand = sf.getStrand(); + // exonerate cdna/protein map + // look for fields + List querySeq = findNames(align, newseqs, + relaxedIdMatching, set.get(attr="Query")); + if (querySeq==null || querySeq.size()!=1) + { + throw new InvalidGFF3FieldException( attr, set, + "Expecting exactly one sequence in Query field (got " + + set.get(attr) + ")"); + } + if (set.containsKey(attr="Align")) + { + // process the align maps and create cdna/protein maps + // ideally, the query sequences are in the alignment, but maybe not... + + AlignedCodonFrame alco = new AlignedCodonFrame(); + MapList codonmapping = constructCodonMappingFromAlign(set, attr, + strand); + + // add codon mapping, and hope! + alco.addMap(seq, querySeq.get(0), codonmapping); + align.addCodonFrame(alco); + // everything that's needed to be done is done + // no features to create here ! + return false; + } + + } + return true; + } + + private MapList constructCodonMappingFromAlign( + Map> set, + String attr, int strand) throws InvalidGFF3FieldException + { + if (strand == 0) + { + throw new InvalidGFF3FieldException(attr, set, + "Invalid strand for a codon mapping (cannot be 0)"); + } + List fromrange = new ArrayList(), torange = new ArrayList(); + int lastppos = 0, lastpframe = 0; + for (String range : set.get(attr)) + { + List ints = new ArrayList(); + StringTokenizer st = new StringTokenizer(range, " "); + while (st.hasMoreTokens()) + { + String num = st.nextToken(); + try + { + ints.add(new Integer(num)); + } catch (NumberFormatException nfe) + { + throw new InvalidGFF3FieldException(attr, set, + "Invalid number in field " + num); + } + } + // Align positionInRef positionInQuery LengthInRef + // contig_1146 exonerate:protein2genome:local similarity 8534 11269 + // 3652 - . alignment_id 0 ; + // Query DDB_G0269124 + // Align 11270 143 120 + // corresponds to : 120 bases align at pos 143 in protein to 11270 on + // dna in strand direction + // Align 11150 187 282 + // corresponds to : 282 bases align at pos 187 in protein to 11150 on + // dna in strand direction + // + // Align 10865 281 888 + // Align 9977 578 1068 + // Align 8909 935 375 + // + if (ints.size() != 3) + { + throw new InvalidGFF3FieldException(attr, set, + "Invalid number of fields for this attribute (" + + ints.size() + ")"); + } + fromrange.add(new Integer(ints.get(0).intValue())); + fromrange.add(new Integer(ints.get(0).intValue() + strand + * ints.get(2).intValue())); + // how are intron/exon boundaries that do not align in codons + // represented + if (ints.get(1).equals(lastppos) && lastpframe > 0) + { + // extend existing to map + lastppos += ints.get(2) / 3; + lastpframe = ints.get(2) % 3; + torange.set(torange.size() - 1, new Integer(lastppos)); + } + else + { + // new to map range + torange.add(ints.get(1)); + lastppos = ints.get(1) + ints.get(2) / 3; + lastpframe = ints.get(2) % 3; + torange.add(new Integer(lastppos)); + } + } + // from and to ranges must end up being a series of start/end intervals + if (fromrange.size() % 2 == 1) + { + throw new InvalidGFF3FieldException(attr, set, + "Couldn't parse the DNA alignment range correctly"); + } + if (torange.size() % 2 == 1) + { + throw new InvalidGFF3FieldException(attr, set, + "Couldn't parse the protein alignment range correctly"); + } + // finally, build the map + int[] frommap = new int[fromrange.size()], tomap = new int[torange + .size()]; + int p = 0; + for (Integer ip : fromrange) + { + frommap[p++] = ip.intValue(); + } + p = 0; + for (Integer ip : torange) + { + tomap[p++] = ip.intValue(); + } + + return new MapList(frommap, tomap, 3, 1); + } + + private List findNames(AlignmentI align, + List newseqs, boolean relaxedIdMatching, + List list) + { + List found = new ArrayList(); + for (String seqId : list) + { + SequenceI seq = findName(align, seqId, relaxedIdMatching, newseqs); + if (seq != null) + { + found.add(seq); + } + } + return found; + } + private AlignmentI lastmatchedAl = null; private SequenceIdMatcher matcher = null;