sf.setValue("ATTRIBUTES", attributes.toString());
}
- seq.addSequenceFeature(sf);
- while ((seq = align.findName(seq, seqId, true)) != null)
+ if (processOrAddSeqFeature(align, newseqs, seq, sf, GFFFile,
+ relaxedIdmatching))
{
- seq.addSequenceFeature(new SequenceFeature(sf));
+ // check whether we should add the sequence feature to any other
+ // sequences in the alignment with the same or similar
+ while ((seq = align.findName(seq, seqId, true)) != null)
+ {
+ seq.addSequenceFeature(new SequenceFeature(sf));
+ }
}
break;
}
return true;
}
+
+ /**
+ * take a sequence feature and examine its attributes to decide how it should
+ * be added to a sequence
+ *
+ * @param seq
+ * - the destination sequence constructed or discovered in the
+ * current context
+ * @param sf
+ * - the base feature with ATTRIBUTES property containing any
+ * additional attributes
+ * @param gFFFile
+ * - true if we are processing a GFF annotation file
+ * @return true if sf was actually added to the sequence, false if it was
+ * processed in another way
+ */
+ public boolean processOrAddSeqFeature(AlignmentI align, List<SequenceI> newseqs, SequenceI seq, SequenceFeature sf,
+ boolean gFFFile, boolean relaxedIdMatching)
+ {
+ String attr = (String) sf.getValue("ATTRIBUTES");
+ boolean add = true;
+ if (gFFFile && attr != null)
+ {
+ int nattr=8;
+
+ for (String attset : attr.split("\t"))
+ {
+ if (attset==null || attset.trim().length()==0)
+ {
+ continue;
+ }
+ nattr++;
+ Map<String, List<String>> set = new HashMap<String, List<String>>();
+ // normally, only expect one column - 9 - in this field
+ // the attributes (Gff3) or groups (gff2) field
+ for (String pair : attset.trim().split(";"))
+ {
+ pair = pair.trim();
+ if (pair.length() == 0)
+ {
+ continue;
+ }
+
+ // expect either space seperated (gff2) or '=' separated (gff3)
+ // key/value pairs here
+
+ int eqpos = pair.indexOf('='),sppos = pair.indexOf(' ');
+ String key = null, value = null;
+
+ if (sppos > -1 && (eqpos == -1 || sppos < eqpos))
+ {
+ key = pair.substring(0, sppos);
+ value = pair.substring(sppos + 1);
+ } else {
+ if (eqpos > -1 && (sppos == -1 || eqpos < sppos))
+ {
+ key = pair.substring(0, eqpos);
+ value = pair.substring(eqpos + 1);
+ } else
+ {
+ key = pair;
+ }
+ }
+ if (key != null)
+ {
+ List<String> vals = set.get(key);
+ if (vals == null)
+ {
+ vals = new ArrayList<String>();
+ set.put(key, vals);
+ }
+ if (value != null)
+ {
+ vals.add(value.trim());
+ }
+ }
+ }
+ try
+ {
+ add &= processGffKey(set, nattr, seq, sf, align, newseqs,
+ relaxedIdMatching); // process decides if
+ // feature is actually
+ // added
+ } catch (InvalidGFF3FieldException ivfe)
+ {
+ System.err.println(ivfe);
+ }
+ }
+ }
+ if (add)
+ {
+ seq.addSequenceFeature(sf);
+ }
+ return add;
+ }
+
+ public class InvalidGFF3FieldException extends Exception
+ {
+ String field, value;
+
+ public InvalidGFF3FieldException(String field,
+ Map<String, List<String>> set, String message)
+ {
+ super(message + " (Field was " + field + " and value was "
+ + set.get(field).toString());
+ this.field = field;
+ this.value = set.get(field).toString();
+ }
+
+ }
+
+ /**
+ * take a set of keys for a feature and interpret them
+ *
+ * @param set
+ * @param nattr
+ * @param seq
+ * @param sf
+ * @return
+ */
+ public boolean processGffKey(Map<String, List<String>> set, int nattr,
+ SequenceI seq, SequenceFeature sf, AlignmentI align,
+ List<SequenceI> newseqs, boolean relaxedIdMatching)
+ throws InvalidGFF3FieldException
+ {
+ String attr;
+ // decide how to interpret according to type
+ if (sf.getType().equals("similarity"))
+ {
+ int strand = sf.getStrand();
+ // exonerate cdna/protein map
+ // look for fields
+ List<SequenceI> querySeq = findNames(align, newseqs,
+ relaxedIdMatching, set.get(attr="Query"));
+ if (querySeq==null || querySeq.size()!=1)
+ {
+ throw new InvalidGFF3FieldException( attr, set,
+ "Expecting exactly one sequence in Query field (got "
+ + set.get(attr) + ")");
+ }
+ if (set.containsKey(attr="Align"))
+ {
+ // process the align maps and create cdna/protein maps
+ // ideally, the query sequences are in the alignment, but maybe not...
+
+ AlignedCodonFrame alco = new AlignedCodonFrame();
+ MapList codonmapping = constructCodonMappingFromAlign(set, attr,
+ strand);
+
+ // add codon mapping, and hope!
+ alco.addMap(seq, querySeq.get(0), codonmapping);
+ align.addCodonFrame(alco);
+ // everything that's needed to be done is done
+ // no features to create here !
+ return false;
+ }
+
+ }
+ return true;
+ }
+
+ private MapList constructCodonMappingFromAlign(
+ Map<String, List<String>> set,
+ String attr, int strand) throws InvalidGFF3FieldException
+ {
+ if (strand == 0)
+ {
+ throw new InvalidGFF3FieldException(attr, set,
+ "Invalid strand for a codon mapping (cannot be 0)");
+ }
+ List<Integer> fromrange = new ArrayList<Integer>(), torange = new ArrayList<Integer>();
+ int lastppos = 0, lastpframe = 0;
+ for (String range : set.get(attr))
+ {
+ List<Integer> ints = new ArrayList<Integer>();
+ StringTokenizer st = new StringTokenizer(range, " ");
+ while (st.hasMoreTokens())
+ {
+ String num = st.nextToken();
+ try
+ {
+ ints.add(new Integer(num));
+ } catch (NumberFormatException nfe)
+ {
+ throw new InvalidGFF3FieldException(attr, set,
+ "Invalid number in field " + num);
+ }
+ }
+ // Align positionInRef positionInQuery LengthInRef
+ // contig_1146 exonerate:protein2genome:local similarity 8534 11269
+ // 3652 - . alignment_id 0 ;
+ // Query DDB_G0269124
+ // Align 11270 143 120
+ // corresponds to : 120 bases align at pos 143 in protein to 11270 on
+ // dna in strand direction
+ // Align 11150 187 282
+ // corresponds to : 282 bases align at pos 187 in protein to 11150 on
+ // dna in strand direction
+ //
+ // Align 10865 281 888
+ // Align 9977 578 1068
+ // Align 8909 935 375
+ //
+ if (ints.size() != 3)
+ {
+ throw new InvalidGFF3FieldException(attr, set,
+ "Invalid number of fields for this attribute ("
+ + ints.size() + ")");
+ }
+ fromrange.add(new Integer(ints.get(0).intValue()));
+ fromrange.add(new Integer(ints.get(0).intValue() + strand
+ * ints.get(2).intValue()));
+ // how are intron/exon boundaries that do not align in codons
+ // represented
+ if (ints.get(1).equals(lastppos) && lastpframe > 0)
+ {
+ // extend existing to map
+ lastppos += ints.get(2) / 3;
+ lastpframe = ints.get(2) % 3;
+ torange.set(torange.size() - 1, new Integer(lastppos));
+ }
+ else
+ {
+ // new to map range
+ torange.add(ints.get(1));
+ lastppos = ints.get(1) + ints.get(2) / 3;
+ lastpframe = ints.get(2) % 3;
+ torange.add(new Integer(lastppos));
+ }
+ }
+ // from and to ranges must end up being a series of start/end intervals
+ if (fromrange.size() % 2 == 1)
+ {
+ throw new InvalidGFF3FieldException(attr, set,
+ "Couldn't parse the DNA alignment range correctly");
+ }
+ if (torange.size() % 2 == 1)
+ {
+ throw new InvalidGFF3FieldException(attr, set,
+ "Couldn't parse the protein alignment range correctly");
+ }
+ // finally, build the map
+ int[] frommap = new int[fromrange.size()], tomap = new int[torange
+ .size()];
+ int p = 0;
+ for (Integer ip : fromrange)
+ {
+ frommap[p++] = ip.intValue();
+ }
+ p = 0;
+ for (Integer ip : torange)
+ {
+ tomap[p++] = ip.intValue();
+ }
+
+ return new MapList(frommap, tomap, 3, 1);
+ }
+
+ private List<SequenceI> findNames(AlignmentI align,
+ List<SequenceI> newseqs, boolean relaxedIdMatching,
+ List<String> list)
+ {
+ List<SequenceI> found = new ArrayList<SequenceI>();
+ for (String seqId : list)
+ {
+ SequenceI seq = findName(align, seqId, relaxedIdMatching, newseqs);
+ if (seq != null)
+ {
+ found.add(seq);
+ }
+ }
+ return found;
+ }
+
private AlignmentI lastmatchedAl = null;
private SequenceIdMatcher matcher = null;