map = sf.otherDetails;
formatAttributes(out, map);
}
}
/**
* A helper method that outputs attributes stored in the map as
* semicolon-delimited values e.g.
*
*
* AC_Male=0;AF_NFE=0.00000e 00;Hom_FIN=0;GQ_MEDIAN=9
*
*
* A map-valued attribute is formatted as a comma-delimited list within braces,
* for example
*
*
* jvmap_CSQ={ALLELE_NUM=1,UNIPARC=UPI0002841053,Feature=ENST00000585561}
*
*
* The {@code jvmap_} prefix designates a values map and is removed if the value
* is parsed when read in. (The GFF3 specification allows 'semi-structured data'
* to be represented provided the attribute name begins with a lower case
* letter.)
*
* @param sb
* @param map
* @see http://gmod.org/wiki/GFF3#GFF3_Format
*/
void formatAttributes(StringBuilder sb, Map map)
{
sb.append(TAB);
boolean first = true;
for (String key : map.keySet())
{
if (SequenceFeature.STRAND.equals(key)
|| SequenceFeature.PHASE.equals(key))
{
/*
* values stashed in map but output to their own columns
*/
continue;
}
{
if (!first)
{
sb.append(";");
}
}
first = false;
Object value = map.get(key);
if (value instanceof Map, ?>)
{
formatMapAttribute(sb, key, (Map, ?>) value);
}
else
{
String formatted = StringUtils.urlEncode(value.toString(),
GffHelperI.GFF_ENCODABLE);
sb.append(key).append(EQUALS).append(formatted);
}
}
}
/**
* Formats the map entries as
*
*
* key=key1=value1,key2=value2,...
*
*
* and appends this to the string buffer
*
* @param sb
* @param key
* @param map
*/
private void formatMapAttribute(StringBuilder sb, String key,
Map, ?> map)
{
if (map == null || map.isEmpty())
{
return;
}
/*
* AbstractMap.toString would be a shortcut here, but more reliable
* to code the required format in case toString changes in future
*/
sb.append(key).append(EQUALS);
boolean first = true;
for (Entry, ?> entry : map.entrySet())
{
if (!first)
{
sb.append(",");
}
first = false;
sb.append(entry.getKey().toString()).append(EQUALS);
String formatted = StringUtils.urlEncode(entry.getValue().toString(),
GffHelperI.GFF_ENCODABLE);
sb.append(formatted);
}
}
/**
 * Returns a mapping given a list of one or more Align descriptors (exonerate
 * format). The returned MapList always maps from dna to protein with a 3:1
 * ratio, whichever direction the descriptors are expressed in.
 *
 * @param alignedRegions
 *          a list of "fromStart toStart fromCount" descriptors, the three
 *          integer fields separated by single spaces
 * @param mapIsFromCdna
 *          if true, 'from' is dna, else 'from' is protein
 * @param strand
 *          either 1 (forward) or -1 (reverse); applied to the dna ranges only
 * @return a MapList built from the dna ranges and protein ranges, holding one
 *         [start, end] pair per descriptor in each
 * @throws IOException
 *           if strand is 0, or a descriptor does not consist of exactly three
 *           integer fields
 */
protected MapList constructCodonMappingFromAlign(
        List<String> alignedRegions, boolean mapIsFromCdna, int strand)
        throws IOException
{
  if (strand == 0)
  {
    throw new IOException(
            "Invalid strand for a codon mapping (cannot be 0)");
  }
  int regions = alignedRegions.size();
  // arrays to hold [start, end] for each aligned region
  int[] fromRanges = new int[regions * 2]; // from dna
  int[] toRanges = new int[regions * 2]; // to protein
  int fromRangesIndex = 0;
  int toRangesIndex = 0;

  for (String range : alignedRegions)
  {
    /*
     * Align mapFromStart mapToStart mapFromCount
     * e.g. if mapIsFromCdna
     *     Align 11270 143 120
     * means:
     *     120 bases from pos 11270 align to pos 143 in peptide
     * if !mapIsFromCdna this would instead be
     *     Align 143 11270 40
     */
    String[] tokens = range.split(" ");
    if (tokens.length != 3)
    {
      throw new IOException("Wrong number of fields for Align");
    }
    int fromStart;
    int toStart;
    int fromCount;
    try
    {
      fromStart = Integer.parseInt(tokens[0]);
      toStart = Integer.parseInt(tokens[1]);
      fromCount = Integer.parseInt(tokens[2]);
    } catch (NumberFormatException nfe)
    {
      // keep the original exception as the cause to aid diagnosis
      throw new IOException(
              "Invalid number in Align field: " + nfe.getMessage(), nfe);
    }

    /*
     * Jalview always models from dna to protein, so adjust values if the
     * GFF mapping is from protein to dna: swap the start positions and
     * convert the residue count to a base count
     */
    if (!mapIsFromCdna)
    {
      fromCount *= 3;
      int temp = fromStart;
      fromStart = toStart;
      toStart = temp;
    }
    // dna range end lies (fromCount - 1) bases along in the strand direction
    fromRanges[fromRangesIndex++] = fromStart;
    fromRanges[fromRangesIndex++] = fromStart + strand * (fromCount - 1);

    /*
     * If a codon has an intron gap, there will be contiguous 'toRanges';
     * this is handled for us by the MapList constructor.
     * (It is not clear that exonerate ever generates this case)
     */
    toRanges[toRangesIndex++] = toStart;
    toRanges[toRangesIndex++] = toStart + (fromCount - 1) / 3;
  }

  return new MapList(fromRanges, toRanges, 3, 1);
}
/**
 * Parse a GFF format feature. This may include creating a 'dummy' sequence to
 * hold the feature, or for its mapped sequence, or both, to be resolved
 * either later in the GFF file (##FASTA section), or when the user loads
 * additional sequences.
 * <p>
 * If a helper is available for the line and it produces a feature, the
 * feature is added to the located sequence, and a copy of it to each further
 * sequence returned by repeated calls to
 * {@code alignment.findName(previous, seqId, true)}.
 *
 * @param gffColumns
 *          the columns of the GFF line; the first is the sequence id
 * @param alignment
 *          the alignment searched for the referenced sequence
 * @param relaxedIdMatching
 *          passed through to the sequence lookup and the GFF helper
 * @param newseqs
 *          passed through to the sequence lookup and the GFF helper
 * @return the sequence located for the feature's sequence id, or null if the
 *         line has fewer than 5 columns or the helper failed with an
 *         IOException
 */
protected SequenceI parseGff(String[] gffColumns, AlignmentI alignment,
        boolean relaxedIdMatching, List<SequenceI> newseqs)
{
  /*
   * GFF: seqid source type start end score strand phase [attributes]
   */
  if (gffColumns.length < 5)
  {
    System.err.println("Ignoring GFF feature line with too few columns ("
            + gffColumns.length + ")");
    return null;
  }

  /*
   * locate referenced sequence in alignment _or_
   * as a forward or external reference (SequenceDummy)
   */
  String seqId = gffColumns[0];
  SequenceI seq = findSequence(seqId, alignment, newseqs,
          relaxedIdMatching);

  GffHelperI helper = GffHelperFactory.getHelper(gffColumns);
  if (helper == null)
  {
    return seq;
  }

  try
  {
    SequenceFeature sf = helper.processGff(seq, gffColumns, alignment,
            newseqs, relaxedIdMatching);
    if (sf != null)
    {
      seq.addSequenceFeature(sf);

      /*
       * add a copy of the feature to any further sequences found for this
       * id; a separate cursor is used so that 'seq' (the return value) is
       * not overwritten with the null that terminates the search
       */
      SequenceI match = seq;
      while ((match = alignment.findName(match, seqId, true)) != null)
      {
        match.addSequenceFeature(new SequenceFeature(sf));
      }
    }
  } catch (IOException e)
  {
    System.err.println("GFF parsing failed with: " + e.getMessage());
    return null;
  }

  return seq;
}
/**
 * After encountering ##fasta in a GFF3 file, process the remainder of the
 * file as FASTA sequence data. Any placeholder sequences created during
 * feature parsing are updated with the actual sequences, and all sequences
 * read are then added to the alignment.
 *
 * @param align
 *          the alignment to add the sequences to; its codon frame mappings
 *          are updated for each placeholder that is realised
 * @param newseqs
 *          sequences matched by id against the FASTA sequences; any matched
 *          SequenceDummy is realised and removed from this list
 * @throws IOException
 */
protected void processAsFasta(AlignmentI align, List<SequenceI> newseqs)
        throws IOException
{
  try
  {
    mark();
  } catch (IOException ignored)
  {
    // a failure of mark() is ignored; FASTA parsing is attempted regardless
  }
  FastaFile parser = new FastaFile(this);
  List<SequenceI> includedseqs = parser.getSeqs();

  SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs);

  /*
   * iterate over includedseqs, replacing matching ones with newseqs
   * sequences. Generic iterator not used here because we modify
   * includedseqs as we go
   */
  for (int p = 0, pSize = includedseqs.size(); p < pSize; p++)
  {
    // search for any dummy seqs that this sequence can be used to update
    SequenceI includedSeq = includedseqs.get(p);
    SequenceI dummyseq = smatcher.findIdMatch(includedSeq);
    if (dummyseq instanceof SequenceDummy) // false when no match (null)
    {
      /*
       * Design note: probably have the pattern wrong. The idea is that a
       * flyweight proxy for a sequence ID can be created for
       *  1. stable reference creation
       *  2. addition of annotation
       *  3. future replacement by a real sequence
       * The current pattern is to create SequenceDummy objects - a
       * convenience constructor for a Sequence. The problem is that when
       * promoted to a real sequence, all references need to be updated
       * somehow. We avoid that by keeping the same object.
       */
      ((SequenceDummy) dummyseq).become(includedSeq);
      dummyseq.createDatasetSequence();

      /*
       * Update mappings so they are now to the dataset sequence
       */
      for (AlignedCodonFrame mapping : align.getCodonFrames())
      {
        mapping.updateToDataset(dummyseq);
      }

      /*
       * replace parsed sequence with the realised forward reference
       */
      includedseqs.set(p, dummyseq);

      /*
       * and remove from the newseqs list
       */
      newseqs.remove(dummyseq);
    }
  }

  /*
   * finally add sequences to the dataset
   */
  for (SequenceI seq : includedseqs)
  {
    // experimental: mapping-based 'alignment' to query sequence
    AlignmentUtils.alignSequenceAs(seq, align,
            String.valueOf(align.getGapCharacter()), false, true);

    // rename sequences if GFF handler requested this: the new name, if any,
    // is read from the first positional feature only
    // TODO a more elegant way e.g. gffHelper.postProcess(newseqs) ?
    List<SequenceFeature> sfs = seq.getFeatures().getPositionalFeatures();
    if (!sfs.isEmpty())
    {
      String newName = (String) sfs.get(0).getValue(
              GffHelperI.RENAME_TOKEN);
      if (newName != null)
      {
        seq.setName(newName);
      }
    }
    align.addSequence(seq);
  }
}
/**
 * Process a ## directive. The text after the leading two characters is split
 * on whitespace; the first token is the pragma name (matched ignoring case)
 * and the second, if present, is its value.
 * <ul>
 * <li>{@code ###} is accepted and does nothing</li>
 * <li>{@code gff-version} sets the GFF version from the leading integer of
 * its value (e.g. 3 from "3.1.2"); a missing or non-numeric value leaves the
 * version unchanged</li>
 * <li>{@code species-build} stores its value in gffProps under the key
 * "species-build"</li>
 * <li>{@code fasta} processes the rest of the file as FASTA</li>
 * <li>{@code sequence-region}, {@code feature-ontology},
 * {@code attribute-ontology} and {@code source-ontology} are recognised but
 * ignored</li>
 * <li>anything else is reported on stderr and ignored</li>
 * </ul>
 *
 * @param line
 *          the directive line, expected to start with ##
 * @param gffProps
 *          properties map updated by the species-build directive
 * @param align
 *          alignment passed on to FASTA processing
 * @param newseqs
 *          sequence list passed on to FASTA processing
 * @throws IOException
 *           if thrown while processing FASTA data
 */
protected void processGffPragma(String line, Map<String, String> gffProps,
        AlignmentI align, List<SequenceI> newseqs) throws IOException
{
  line = line.trim();
  if ("###".equals(line))
  {
    // close off any open 'forward references'
    return;
  }

  /*
   * drop the leading "##" (guarding against a shorter line) and split on
   * runs of whitespace, so that tabs or repeated spaces do not produce
   * empty tokens
   */
  String directive = line.length() > 2 ? line.substring(2).trim() : "";
  String[] tokens = directive.split("\\s+");
  String pragma = tokens[0];
  String value = tokens.length == 1 ? null : tokens[1];

  if ("gff-version".equalsIgnoreCase(pragma))
  {
    if (value != null)
    {
      // value may be e.g. "3.1.2"; take the text before the first '.'
      // (indexOf is used as split("\\.") returns an empty array for ".")
      int dot = value.indexOf('.');
      String major = dot < 0 ? value : value.substring(0, dot);
      try
      {
        gffVersion = Integer.parseInt(major);
      } catch (NumberFormatException ignored)
      {
        // not a number: leave the version unchanged
      }
    }
  }
  else if ("sequence-region".equalsIgnoreCase(pragma))
  {
    // could capture if wanted here
  }
  else if ("feature-ontology".equalsIgnoreCase(pragma))
  {
    // should resolve against the specified feature ontology URI
  }
  else if ("attribute-ontology".equalsIgnoreCase(pragma))
  {
    // URI of attribute ontology - not currently used in GFF3
  }
  else if ("source-ontology".equalsIgnoreCase(pragma))
  {
    // URI of source ontology - not currently used in GFF3
  }
  else if ("species-build".equalsIgnoreCase(pragma))
  {
    // save URI of specific NCBI taxon version of annotations
    gffProps.put("species-build", value);
  }
  else if ("fasta".equalsIgnoreCase(pragma))
  {
    // process the rest of the file as a fasta file and replace any dummy
    // sequence IDs
    processAsFasta(align, newseqs);
  }
  else
  {
    System.err.println("Ignoring unknown pragma: " + line);
  }
}
}