+ /*
+ * If a codon has an intron gap, there will be contiguous 'toRanges';
+ * this is handled for us by the MapList constructor.
+ * (It is not clear that exonerate ever generates this case)
+ */
+ toRanges[toRangesIndex++] = toStart;
+ toRanges[toRangesIndex++] = toStart + (fromCount - 1) / 3;
+ }
+
+ return new MapList(fromRanges, toRanges, 3, 1);
+ }
+
+ /**
+  * Parse a GFF format feature line, build a SequenceFeature from its columns
+  * and pass it to processOrAddSeqFeature. This may include creating a 'dummy'
+  * sequence for the feature or its mapped sequence.
+  * <p>
+  * GFF columns: seqid source type start end score strand phase [attributes]
+  *
+  * @param st
+  *          tokenizer positioned at the first column of the GFF line; a line
+  *          with fewer than 8 remaining tokens is reported on stderr and
+  *          ignored
+  * @param alignment
+  *          alignment searched for the sequence named in column 1, and for
+  *          further sequences matching the same id
+  * @param relaxedIdMatching
+  *          passed through to findName and processOrAddSeqFeature
+  * @param newseqs
+  *          passed through to findName and processOrAddSeqFeature
+  * @return null if the line has too few columns, and also null whenever
+  *         processOrAddSeqFeature returned true (the search for further
+  *         matching sequences runs until it yields null); otherwise the
+  *         sequence returned by findName
+  */
+ protected SequenceI parseGffFeature(StringTokenizer st,
+         AlignmentI alignment, boolean relaxedIdMatching,
+         List<SequenceI> newseqs)
+ {
+   /*
+    * GFF: seqid source type start end score strand phase [attributes]
+    */
+   if (st.countTokens() < 8)
+   {
+     System.err
+             .println("Ignoring GFF feature line with unexpected number of columns ("
+                     + st.countTokens() + ")");
+     return null;
+   }
+   String seqId = st.nextToken();
+
+   /*
+    * locate referenced sequence in alignment _or_
+    * as a forward reference (SequenceDummy)
+    */
+   SequenceI seq = findName(alignment, newseqs, relaxedIdMatching, seqId);
+
+   String desc = st.nextToken();
+   // a value without spaces could be a source term rather than a
+   // description line, so it is also used as the feature group
+   String group = desc.indexOf(' ') == -1 ? desc : null;
+   String ft = st.nextToken();
+   int startPos = StringUtils.parseInt(st.nextToken());
+   int endPos = StringUtils.parseInt(st.nextToken());
+   // TODO: decide if non positional feature assertion for input data
+   // where end==0 is generally valid
+   if (endPos == 0)
+   {
+     // treat as non-positional feature, regardless.
+     startPos = 0;
+   }
+   float score = 0f;
+   try
+   {
+     // parseFloat avoids the deprecated boxing constructor new Float(String)
+     score = Float.parseFloat(st.nextToken());
+   } catch (NumberFormatException ex)
+   {
+     // score column is not a number: leave at 0
+   }
+
+   SequenceFeature sf = new SequenceFeature(ft, desc, startPos, endPos,
+           score, group);
+   // strand, phase and attributes are each optional once 8 tokens were seen
+   if (st.hasMoreTokens())
+   {
+     sf.setValue(STRAND, st.nextToken());
+   }
+   if (st.hasMoreTokens())
+   {
+     sf.setValue(FRAME, st.nextToken());
+   }
+   if (st.hasMoreTokens())
+   {
+     processGffColumnNine(st.nextToken(), sf);
+   }
+
+   if (processOrAddSeqFeature(alignment, newseqs, seq, sf,
+           relaxedIdMatching))
+   {
+     // check whether we should add the sequence feature to any other
+     // sequences in the alignment with the same or similar id; each one
+     // receives its own copy of the feature
+     while ((seq = alignment.findName(seq, seqId, true)) != null)
+     {
+       seq.addSequenceFeature(new SequenceFeature(sf));
+     }
+     // NOTE(review): the loop above only exits once seq is null, so this
+     // branch always returns null - confirm that no caller expects the
+     // sequence the feature was added to
+   }
+   return seq;
+ }
+
+ /**
+  * Handle the 'column 9' (attributes) data of a GFF line. This column is less
+  * formally defined than the others, and how it should be read depends on the
+  * tool that generated the file.
+  * <p>
+  * The unparsed text is stored on the feature under ATTRIBUTES. Each parsed
+  * name/value pair is then stored under its own name, and the NOTE attribute,
+  * if present, is used as the best proxy for the feature description.
+  *
+  * @param attributes
+  *          the unparsed text of column 9
+  * @param sf
+  *          the feature that receives the attribute values
+  */
+ protected void processGffColumnNine(String attributes, SequenceFeature sf)
+ {
+   // keep the raw column text available on the feature
+   sf.setValue(ATTRIBUTES, attributes);
+
+   // names and values are separated by '=' for GFF version 3, otherwise by
+   // a space; pairs are always separated by ';'
+   char separator = ' ';
+   if (gffVersion == 3)
+   {
+     separator = '=';
+   }
+   char[] separators = { separator };
+
+   Map<String, List<String>> parsed = StringUtils.parseNameValuePairs(
+           attributes, ";", separators);
+   for (Entry<String, List<String>> pair : parsed.entrySet())
+   {
+     String name = pair.getKey();
+     // all values recorded for one name are joined into a single string
+     String joined = StringUtils.listToDelimitedString(pair.getValue(),
+             "; ");
+     sf.setValue(name, joined);
+     if (NOTE.equals(name))
+     {
+       sf.setDescription(joined);
+     }
+   }
+ }
+
+ /**
+  * After encountering ##fasta in a GFF3 file, process the remainder of the
+  * file as FASTA sequence data. Any placeholder sequences created during
+  * feature parsing are updated with the actual sequences.
+  *
+  * @param align
+  *          alignment to which every resulting sequence is added
+  * @param newseqs
+  *          sequences matched by id against the FASTA entries; a match that
+  *          is a SequenceDummy is completed from the FASTA entry and takes
+  *          its place
+  * @throws IOException
+  *           presumably from constructing the FastaFile parser (confirm); a
+  *           failure of mark() is reported on stderr and not propagated
+  */
+ protected void processAsFasta(AlignmentI align, List<SequenceI> newseqs)
+         throws IOException
+ {
+   try
+   {
+     mark();
+   } catch (IOException ex)
+   {
+     // best effort: parsing continues without the mark, but the failure is
+     // reported rather than silently discarded
+     System.err.println("Unable to mark input before reading FASTA data: "
+             + ex.getMessage());
+   }
+   FastaFile parser = new FastaFile(this);
+   List<SequenceI> includedseqs = parser.getSeqs();
+   SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs);
+   // iterate over includedseqs, replacing matching ones with newseqs
+   // sequences. Indexed loop used because entries are replaced in place as
+   // we go
+   for (int p = 0, pSize = includedseqs.size(); p < pSize; p++)
+   {
+     // mseq is the 'template' imported from the FASTA file which we'll use
+     // to complete a matching dummy sequence
+     SequenceI mseq = includedseqs.get(p);
+     // search for any dummy seq that this sequence can be used to update;
+     // dummy seqs were created so they could be annotated and referred to
+     // in alignments/codon mappings
+     SequenceI dummyseq = smatcher.findIdMatch(mseq);
+     // instanceof is false for null, so no separate null check is needed
+     if (dummyseq instanceof SequenceDummy)
+     {
+       // probably have the pattern wrong
+       // idea is that a flyweight proxy for a sequence ID can be created for
+       // 1. stable reference creation
+       // 2. addition of annotation
+       // 3. future replacement by a real sequence
+       // current pattern is to create SequenceDummy objects - a convenience
+       // constructor for a Sequence.
+       // problem is that when promoted to a real sequence, all references
+       // need to be updated somehow.
+       ((SequenceDummy) dummyseq).become(mseq);
+       includedseqs.set(p, dummyseq); // template is no longer needed
+     }
+   }
+   // finally add sequences to the dataset
+   for (SequenceI seq : includedseqs)
+   {
+     align.addSequence(seq);
+   }
+ }