map = sf.otherDetails;
+ formatAttributes(out, map);
+ }
+ }
+
+ /**
+ * A helper method that appends a tab, followed by the attributes stored in
+ * the map as semicolon-delimited {@code key=value} pairs, e.g.
+ *
+ * <pre>
+ * AC_Male=0;AF_NFE=0.00000e+00;Hom_FIN=0;GQ_MEDIAN=9
+ * </pre>
+ *
+ * A map-valued attribute is formatted as a comma-delimited list within braces,
+ * for example
+ *
+ * <pre>
+ * jvmap_CSQ={ALLELE_NUM=1,UNIPARC=UPI0002841053,Feature=ENST00000585561}
+ * </pre>
+ *
+ * The {@code jvmap_} prefix designates a values map and is removed if the value
+ * is parsed when read in. (The GFF3 specification allows 'semi-structured data'
+ * to be represented provided the attribute name begins with a lower case
+ * letter.)
+ * <p>
+ * NOTE(review): formatMapAttribute, which writes map values, currently emits
+ * {@code key=key1=value1,key2=value2} with neither the {@code jvmap_} prefix
+ * nor braces - confirm which of the two formats is intended.
+ * <p>
+ * The STRAND and PHASE entries are skipped, as are null values and empty maps
+ * (which would otherwise leave a dangling ';' in the output). Other non-map
+ * values are URL-encoded using GffHelperI.GFF_ENCODABLE.
+ *
+ * @param sb
+ *          the buffer to append to
+ * @param map
+ *          attribute values keyed by attribute name (must not be null)
+ * @see <a href="http://gmod.org/wiki/GFF3#GFF3_Format">GFF3 format</a>
+ */
+ void formatAttributes(StringBuilder sb, Map<String, Object> map)
+ {
+ sb.append(TAB);
+ boolean first = true;
+ for (Entry<String, Object> attribute : map.entrySet())
+ {
+ String key = attribute.getKey();
+ if (SequenceFeature.STRAND.equals(key)
+ || SequenceFeature.PHASE.equals(key))
{
- next = seqs[i].getSequenceFeatures();
- for (int j = 0; j < next.length; j++)
+ /*
+ * values stashed in map but output to their own columns
+ */
+ continue;
+ }
+ Object value = attribute.getValue();
+ boolean emptyMap = value instanceof Map<?, ?>
+ && ((Map<?, ?>) value).isEmpty();
+ if (value == null || emptyMap)
{
- isnonpos = next[j].begin == 0 && next[j].end == 0;
- if ((!nonpos && isnonpos)
- || (!isnonpos && visOnly && !visible
- .containsKey(next[j].type)))
- {
- continue;
- }
+ /*
+ * nothing would be written for this attribute, so skip it
+ * before a separator is emitted on its behalf
+ */
+ continue;
+ }
+ if (!first)
+ {
+ sb.append(";");
+ }
+ first = false;
+ if (value instanceof Map<?, ?>)
+ {
+ formatMapAttribute(sb, key, (Map<?, ?>) value);
+ }
+ else
+ {
+ String formatted = StringUtils.urlEncode(value.toString(),
+ GffHelperI.GFF_ENCODABLE);
+ sb.append(key).append(EQUALS).append(formatted);
+ }
+ }
+ }
- source = next[j].featureGroup;
- if (source == null)
- {
- source = next[j].getDescription();
- }
+ /**
+ * Formats the map entries as
+ *
+ * <pre>
+ * key=key1=value1,key2=value2,...
+ * </pre>
+ *
+ * and appends this to the string buffer. Nothing is appended if the map is
+ * null or empty. Entry values are URL-encoded using
+ * GffHelperI.GFF_ENCODABLE; entry keys are written as they are. A null entry
+ * value is written as an empty string rather than causing a
+ * NullPointerException.
+ *
+ * @param sb
+ *          the buffer to append to
+ * @param key
+ *          the name of the map-valued attribute
+ * @param map
+ *          the attribute's value
+ */
+ private void formatMapAttribute(StringBuilder sb, String key,
+ Map<?, ?> map)
+ {
+ if (map == null || map.isEmpty())
+ {
+ return;
+ }
- out.append(seqs[i].getName());
- out.append("\t");
- out.append(source);
- out.append("\t");
- out.append(next[j].type);
- out.append("\t");
- out.append(next[j].begin);
- out.append("\t");
- out.append(next[j].end);
- out.append("\t");
- out.append(next[j].score);
- out.append("\t");
-
- if (next[j].getValue("STRAND") != null)
- {
- out.append(next[j].getValue("STRAND"));
- out.append("\t");
- }
- else
- {
- out.append(".\t");
- }
+ /*
+ * AbstractMap.toString would be a shortcut here, but more reliable
+ * to code the required format in case toString changes in future
+ */
+ sb.append(key).append(EQUALS);
+ boolean first = true;
+ for (Entry<?, ?> entry : map.entrySet())
+ {
+ if (!first)
+ {
+ sb.append(",");
+ }
+ first = false;
+ // append(Object) is null-safe, unlike calling toString() on the key
+ sb.append(entry.getKey()).append(EQUALS);
+ Object entryValue = entry.getValue();
+ String text = entryValue == null ? "" : entryValue.toString();
+ String formatted = StringUtils.urlEncode(text,
+ GffHelperI.GFF_ENCODABLE);
+ sb.append(formatted);
+ }
+ }
- if (next[j].getValue("FRAME") != null)
- {
- out.append(next[j].getValue("FRAME"));
- }
- else
- {
- out.append(".");
- }
- // TODO: verify/check GFF - should there be a /t here before attribute
- // output ?
+ /**
+ * Returns a mapping given list of one or more Align descriptors (exonerate
+ * format)
+ *
+ * @param alignedRegions
+ *          a list of "fromStart toStart fromCount" strings, i.e. the three
+ *          integer fields of an Align descriptor separated by single
+ *          spaces; any other number of fields is rejected
+ * @param mapIsFromCdna
+ *          if true, 'from' is dna, else 'from' is protein
+ * @param strand
+ *          either 1 (forward) or -1 (reverse)
+ * @return a MapList built from the [start, end] pairs of each aligned region
+ *         (dna ranges as 'from', protein ranges as 'to'), constructed with
+ *         the arguments 3 and 1
+ * @throws IOException
+ *           if strand is 0, a descriptor does not have exactly three fields,
+ *           or a field is not a valid integer
+ */
+ protected MapList constructCodonMappingFromAlign(
+ List<String> alignedRegions, boolean mapIsFromCdna, int strand)
+ throws IOException
+ {
+ if (strand == 0)
+ {
+ throw new IOException(
+ "Invalid strand for a codon mapping (cannot be 0)");
+ }
+ int regions = alignedRegions.size();
+ // arrays to hold [start, end] for each aligned region
+ int[] fromRanges = new int[regions * 2]; // from dna
+ int[] toRanges = new int[regions * 2]; // to protein
+ int fromRangesIndex = 0;
+ int toRangesIndex = 0;
+
+ for (String range : alignedRegions)
+ {
+ /*
+ * Align mapFromStart mapToStart mapFromCount
+ * e.g. if mapIsFromCdna
+ * Align 11270 143 120
+ * means:
+ * 120 bases from pos 11270 align to pos 143 in peptide
+ * if !mapIsFromCdna this would instead be
+ * Align 143 11270 40
+ */
+ String[] tokens = range.split(" ");
+ if (tokens.length != 3)
+ {
+ throw new IOException("Wrong number of fields for Align");
+ }
+ int fromStart = 0;
+ int toStart = 0;
+ int fromCount = 0;
+ try
+ {
+ fromStart = Integer.parseInt(tokens[0]);
+ toStart = Integer.parseInt(tokens[1]);
+ fromCount = Integer.parseInt(tokens[2]);
+ } catch (NumberFormatException nfe)
+ {
+ // keep the original exception as the cause so the stack trace survives
+ throw new IOException(
+ "Invalid number in Align field: " + nfe.getMessage(), nfe);
+ }
- if (next[j].getValue("ATTRIBUTES") != null)
- {
- out.append(next[j].getValue("ATTRIBUTES"));
- }
+ /*
+ * Jalview always models from dna to protein, so adjust values if the
+ * GFF mapping is from protein to dna
+ */
+ if (!mapIsFromCdna)
+ {
+ fromCount *= 3;
+ int temp = fromStart;
+ fromStart = toStart;
+ toStart = temp;
+ }
+ fromRanges[fromRangesIndex++] = fromStart;
+ fromRanges[fromRangesIndex++] = fromStart + strand * (fromCount - 1);
- out.append(newline);
+ /*
+ * If a codon has an intron gap, there will be contiguous 'toRanges';
+ * this is handled for us by the MapList constructor.
+ * (It is not clear that exonerate ever generates this case)
+ */
+ toRanges[toRangesIndex++] = toStart;
+ toRanges[toRangesIndex++] = toStart + (fromCount - 1) / 3;
+ }
+
+ return new MapList(fromRanges, toRanges, 3, 1);
+ }
+ /**
+ * Parse a GFF format feature. This may include creating a 'dummy' sequence to
+ * hold the feature, or for its mapped sequence, or both, to be resolved
+ * either later in the GFF file (##FASTA section), or when the user loads
+ * additional sequences.
+ * <p>
+ * If the helper produces a feature, it is added to the located sequence, and
+ * a copy of it is added to every further sequence that
+ * {@code alignment.findName} returns for the same id.
+ *
+ * @param gffColumns
+ *          the columns of one GFF line; at least 5 are required
+ * @param alignment
+ *          the alignment searched for the sequence named in column 1
+ * @param relaxedIdMatching
+ *          passed through to the sequence lookup and to the GFF helper
+ * @param newseqs
+ *          passed through to the sequence lookup and to the GFF helper
+ *          (presumably collects placeholder sequences - confirm)
+ * @return the sequence located for column 1, or null if the line has too few
+ *         columns or the helper failed with an IOException
+ */
+ protected SequenceI parseGff(String[] gffColumns, AlignmentI alignment,
+ boolean relaxedIdMatching, List<SequenceI> newseqs)
+ {
+ /*
+ * GFF: seqid source type start end score strand phase [attributes]
+ */
+ if (gffColumns.length < 5)
+ {
+ System.err.println("Ignoring GFF feature line with too few columns ("
+ + gffColumns.length + ")");
+ return null;
+ }
+
+ /*
+ * locate referenced sequence in alignment _or_
+ * as a forward or external reference (SequenceDummy)
+ */
+ String seqId = gffColumns[0];
+ SequenceI seq = findSequence(seqId, alignment, newseqs,
+ relaxedIdMatching);
+
+ SequenceFeature sf = null;
+ GffHelperI helper = GffHelperFactory.getHelper(gffColumns);
+ if (helper != null)
+ {
+ try
+ {
+ sf = helper.processGff(seq, gffColumns, alignment, newseqs,
+ relaxedIdMatching);
+ if (sf != null)
+ {
+ seq.addSequenceFeature(sf);
+ /*
+ * walk any further matches with a separate variable, so that 'seq'
+ * still refers to the located sequence when it is returned (the
+ * loop always ends with its variable set to null)
+ */
+ SequenceI match = seq;
+ while ((match = alignment.findName(match, seqId, true)) != null)
+ {
+ match.addSequenceFeature(new SequenceFeature(sf));
+ }
}
+ } catch (IOException e)
+ {
+ System.err.println("GFF parsing failed with: " + e.getMessage());
+ return null;
}
}
- return out.toString();
+ return seq;
}
/**
- * this is only for the benefit of object polymorphism - method does nothing.
+ * After encountering ##fasta in a GFF3 file, process the remainder of the
+ * file as FASTA sequence data. Any placeholder sequences created during
+ * feature parsing are updated with the actual sequences.
+ * <p>
+ * Each parsed sequence that matches a SequenceDummy in newseqs is replaced
+ * by that (now realised) placeholder, which is also removed from newseqs.
+ * All resulting sequences are then added to the alignment.
+ *
+ * @param align
+ *          the alignment that receives the sequences and whose codon frames
+ *          are updated
+ * @param newseqs
+ *          sequences to match parsed sequences against; realised
+ *          placeholders are removed from this list
+ * @throws IOException
*/
- public void parse()
+ protected void processAsFasta(AlignmentI align, List<SequenceI> newseqs)
+ throws IOException
{
- // IGNORED
+ try
+ {
+ mark();
+ } catch (IOException ignored)
+ {
+ // a failure to mark the input is deliberately not reported;
+ // parsing continues regardless
+ }
+ FastaFile parser = new FastaFile(this);
+ List<SequenceI> includedseqs = parser.getSeqs();
+
+ SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs);
+
+ /*
+ * iterate over includedseqs, and replacing matching ones with newseqs
+ * sequences. Generic iterator not used here because we modify
+ * includedseqs as we go
+ */
+ for (int p = 0, pSize = includedseqs.size(); p < pSize; p++)
+ {
+ // search for any dummy seqs that this sequence can be used to update
+ SequenceI includedSeq = includedseqs.get(p);
+ SequenceI dummyseq = smatcher.findIdMatch(includedSeq);
+ // instanceof is already false for null, so no separate null check
+ if (dummyseq instanceof SequenceDummy)
+ {
+ // probably have the pattern wrong
+ // idea is that a flyweight proxy for a sequence ID can be created for
+ // 1. stable reference creation
+ // 2. addition of annotation
+ // 3. future replacement by a real sequence
+ // current pattern is to create SequenceDummy objects - a convenience
+ // constructor for a Sequence.
+ // problem is that when promoted to a real sequence, all references
+ // need to be updated somehow. We avoid that by keeping the same object.
+ ((SequenceDummy) dummyseq).become(includedSeq);
+ dummyseq.createDatasetSequence();
+
+ /*
+ * Update mappings so they are now to the dataset sequence
+ */
+ for (AlignedCodonFrame mapping : align.getCodonFrames())
+ {
+ mapping.updateToDataset(dummyseq);
+ }
+
+ /*
+ * replace parsed sequence with the realised forward reference
+ */
+ includedseqs.set(p, dummyseq);
+
+ /*
+ * and remove from the newseqs list
+ */
+ newseqs.remove(dummyseq);
+ }
+ }
+
+ /*
+ * finally add sequences to the dataset
+ */
+ for (SequenceI seq : includedseqs)
+ {
+ // experimental: mapping-based 'alignment' to query sequence
+ AlignmentUtils.alignSequenceAs(seq, align,
+ String.valueOf(align.getGapCharacter()), false, true);
+
+ // rename sequences if GFF handler requested this
+ // TODO a more elegant way e.g. gffHelper.postProcess(newseqs) ?
+ List<SequenceFeature> sfs = seq.getFeatures().getPositionalFeatures();
+ if (!sfs.isEmpty())
+ {
+ String newName = (String) sfs.get(0).getValue(
+ GffHelperI.RENAME_TOKEN);
+ if (newName != null)
+ {
+ seq.setName(newName);
+ }
+ }
+ align.addSequence(seq);
+ }
}
/**
- * this is only for the benefit of object polymorphism - method does nothing.
+ * Process a ## directive. The directive name and its first value are taken
+ * from the text after the leading two characters, split on runs of
+ * whitespace. Handling by directive name (case-insensitive):
+ * <ul>
+ * <li>gff-version: sets gffVersion to the integer before the first '.' of
+ * the value; a non-numeric value is ignored</li>
+ * <li>species-build: stores the value in gffProps under "species-build"</li>
+ * <li>fasta: processes the rest of the file as FASTA sequence data</li>
+ * <li>sequence-region, feature-ontology, attribute-ontology,
+ * source-ontology: recognised but currently ignored</li>
+ * <li>anything else: reported on stderr and ignored</li>
+ * </ul>
+ * A line consisting only of ### is currently a no-op.
*
- * @return error message
+ * @param line
+ *          the directive line, expected to start with ##
+ * @param gffProps
+ *          receives properties captured from directives
+ * @param align
+ *          passed to the FASTA processing for a fasta directive
+ * @param newseqs
+ *          passed to the FASTA processing for a fasta directive
+ * @throws IOException
*/
- public String print()
+ protected void processGffPragma(String line, Map<String, String> gffProps,
+ AlignmentI align, List<SequenceI> newseqs) throws IOException
{
- return "USE printGFFFormat() or printJalviewFormat()";
- }
+ line = line.trim();
+ if ("###".equals(line))
+ {
+ // close off any open 'forward references'
+ return;
+ }
+
+ /*
+ * split on any run of whitespace, so that a directive written with
+ * several spaces or a tab (e.g. "##gff-version   3") still yields its value
+ */
+ String[] tokens = line.substring(2).trim().split("\\s+");
+ String pragma = tokens[0];
+ String value = tokens.length == 1 ? null : tokens[1];
+ if ("gff-version".equalsIgnoreCase(pragma))
+ {
+ if (value != null)
+ {
+ try
+ {
+ // value may be e.g. "3.1.2"
+ gffVersion = Integer.parseInt(value.split("\\.")[0]);
+ } catch (NumberFormatException e)
+ {
+ // ignore
+ }
+ }
+ }
+ else if ("sequence-region".equalsIgnoreCase(pragma))
+ {
+ // could capture if wanted here
+ }
+ else if ("feature-ontology".equalsIgnoreCase(pragma))
+ {
+ // should resolve against the specified feature ontology URI
+ }
+ else if ("attribute-ontology".equalsIgnoreCase(pragma))
+ {
+ // URI of attribute ontology - not currently used in GFF3
+ }
+ else if ("source-ontology".equalsIgnoreCase(pragma))
+ {
+ // URI of source ontology - not currently used in GFF3
+ }
+ else if ("species-build".equalsIgnoreCase(pragma))
+ {
+ // save URI of specific NCBI taxon version of annotations
+ gffProps.put("species-build", value);
+ }
+ else if ("fasta".equalsIgnoreCase(pragma))
+ {
+ // process the rest of the file as a fasta file and replace any dummy
+ // sequence IDs
+ processAsFasta(align, newseqs);
+ }
+ else
+ {
+ System.err.println("Ignoring unknown pragma: " + line);
+ }
+ }
}