package jalview.io;
import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import jalview.bin.Cache;
import jalview.datamodel.DBRefEntry;
import jalview.datamodel.FeatureProperties;
import jalview.datamodel.Mapping;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.util.DBRefUtils;
import jalview.util.DnaUtils;
import jalview.util.MappingUtils;
/**
* A class that provides selective parsing of the EMBL flatfile format.
*
* The initial implementation is limited to extracting fields used by Jalview
* after fetching an EMBL or EMBLCDS entry:
*
*
* accession, version, sequence, xref
* and (for CDS feature) location, protein_id, product, codon_start, translation
*
*
* For a complete parser, it may be best to adopt that provided in
* https://github.com/enasequence/sequencetools/tree/master/src/main/java/uk/ac/ebi/embl/flatfile
* (but note this has a dependency on the Apache Commons library)
*
* @author gmcarstairs
* @see ftp://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/doc/usrman.txt
* @see ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html
*/
public class EmblFlatFile extends AlignFile // FileParse
{
/**
* A data bean class to hold values parsed from one CDS Feature (FT)
*/
class CdsData
{
String translation; // from CDS feature /translation
String cdsLocation; // CDS /location raw value
int codonStart = 1; // from CDS /codon_start
String proteinName; // from CDS /product; used for protein description
String proteinId; // from CDS /protein_id
Map cdsProps = new Hashtable<>(); // CDS other qualifiers
}
private static final String WHITESPACE = "\\s+";
private String sourceDb;
/*
* values parsed from the EMBL flatfile record
*/
private String accession; // from ID (first token)
private String version; // from ID (second token)
private String description; // from (first) DE line
private int length = 128; // from ID (7th token), with usable default
private List dbrefs; // from DR and also CDS /db_xref qualifiers
private String sequenceString; // from SQ lines
private List cds;
/**
* Constructor
*
* @param fp
* @param sourceId
* @throws IOException
*/
public EmblFlatFile(FileParse fp, String sourceId) throws IOException
{
super(false, fp); // don't parse immediately
this.sourceDb = sourceId;
dbrefs = new ArrayList<>();
cds = new ArrayList<>();
}
/**
* Parses the flatfile, and if successful, saves as an annotated sequence
* which may be retrieved by calling {@code getSequence()}
*
* @throws IOException
*/
public void parse() throws IOException
{
String line = nextLine();
while (line != null)
{
if (line.startsWith("ID"))
{
line = parseID(line);
}
else if (line.startsWith("DE"))
{
line = parseDE(line);
}
else if (line.startsWith("DR"))
{
line = parseDR(line);
}
else if (line.startsWith("SQ"))
{
line = parseSQ();
}
else if (line.startsWith("FT"))
{
line = parseFT(line);
}
else
{
line = nextLine();
}
}
assembleSequence();
}
/**
* Extracts and saves the primary accession and version (SV value) from an ID
* line, or null if not found. Returns the next line after the one processed.
*
* @param line
* @throws IOException
*/
String parseID(String line) throws IOException
{
String[] tokens = line.substring(2).split(";");
/*
* first is primary accession
*/
String token = tokens[0].trim();
if (!token.isEmpty())
{
this.accession = token;
}
/*
* second token is 'SV versionNo'
*/
if (tokens.length > 1)
{
token = tokens[1].trim();
if (token.startsWith("SV"))
{
String[] bits = token.trim().split(WHITESPACE);
this.version = bits[bits.length - 1];
}
}
/*
* seventh token is 'length BP'
*/
if (tokens.length > 6)
{
token = tokens[6].trim();
String[] bits = token.trim().split(WHITESPACE);
try
{
this.length = Integer.valueOf(bits[0]);
} catch (NumberFormatException e)
{
Cache.log.error("bad length read in flatfile, line: " + line);
}
}
return nextLine();
}
/**
* Reads sequence description from the first DE line found. Any trailing
* period is discarded. If there are multiple DE lines, only the first (short
* description) is read, the rest are ignored.
*
* @param line
* @return
* @throws IOException
*/
String parseDE(String line) throws IOException
{
String desc = line.substring(2).trim();
if (desc.endsWith("."))
{
desc = desc.substring(0, desc.length() - 1);
}
this.description = desc;
/*
* pass over any additional DE lines
*/
while ((line = nextLine()) != null)
{
if (!line.startsWith("DE"))
{
break;
}
}
return line;
}
/**
* Processes one DR line and saves as a DBRefEntry cross-reference. Returns
* the line following the line processed.
*
* @param line
* @throws IOException
*/
String parseDR(String line) throws IOException
{
String[] tokens = line.substring(2).split(";");
if (tokens.length > 1)
{
/*
* ensure UniProtKB/Swiss-Prot converted to UNIPROT
*/
String db = tokens[0].trim();
db = DBRefUtils.getCanonicalName(db);
String acc = tokens[1].trim();
if (acc.endsWith("."))
{
acc = acc.substring(0, acc.length() - 1);
}
String version = "0";
if (tokens.length > 2)
{
String secondaryId = tokens[2].trim();
if (!secondaryId.isEmpty())
{
// todo: is this right? secondary id is not a version number
// version = secondaryId;
}
}
this.dbrefs.add(new DBRefEntry(db, version, acc));
}
return nextLine();
}
/**
* Reads and saves the sequence, read from the lines following the SQ line.
* Whitespace and position counters are discarded. Returns the next line
* following the sequence data (the next line that doesn't start with
* whitespace).
*
* @throws IOException
*/
String parseSQ() throws IOException
{
StringBuilder sb = new StringBuilder(this.length);
String line = nextLine();
while (line != null && line.startsWith(" "))
{
line = line.trim();
String[] blocks = line.split(WHITESPACE);
/*
* omit the last block (position counter) on each line
*/
for (int i = 0; i < blocks.length - 1; i++)
{
sb.append(blocks[i]);
}
line = nextLine();
}
this.sequenceString = sb.toString();
return line;
}
/**
* Processes an FT line. If it declares a feature type of interest (currently,
* only CDS is processed), processes all of the associated lines (feature
* qualifiers), and returns the next line after that, otherwise simply returns
* the next line.
*
* @param line
* @return
* @throws IOException
*/
String parseFT(String line) throws IOException
{
String[] tokens = line.split(WHITESPACE);
if (tokens.length < 3 || !"CDS".equals(tokens[1]))
{
return nextLine();
}
CdsData data = new CdsData();
data.cdsLocation = tokens[2];
line = nextLine();
while (line != null)
{
if (!line.startsWith("FT ")) // 4 spaces
{
// e.g. start of next feature "FT source..."
break;
}
/*
* extract qualifier, e.g. FT /protein_id="CAA37824.1"
*/
int slashPos = line.indexOf('/');
if (slashPos == -1)
{
Cache.log.error("Unexpected EMBL line ignored: " + line);
continue;
}
int eqPos = line.indexOf('=', slashPos + 1);
if (eqPos == -1)
{
Cache.log.error("Unexpected EMBL line ignored: " + line);
continue;
}
String qualifier = line.substring(slashPos + 1, eqPos);
String value = line.substring(eqPos + 1);
if (value.startsWith("\"") && value.endsWith("\""))
{
value = value.substring(1, value.length() - 1);
}
if ("protein_id".equals(qualifier))
{
data.proteinId = value;
line = nextLine();
}
else if ("codon_start".equals(qualifier))
{
try
{
data.codonStart = Integer.parseInt(value.trim());
} catch (NumberFormatException e)
{
Cache.log.error("Invalid codon_start in XML for " + this.accession
+ ": " + e.getMessage());
}
line = nextLine();
}
else if ("db_xref".equals(qualifier))
{
String[] parts = value.split(":");
if (parts.length == 2)
{
String db = parts[0].trim();
db = DBRefUtils.getCanonicalName(db);
DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim());
this.dbrefs.add(dbref);
}
line = nextLine();
}
else if ("product".equals(qualifier))
{
// sometimes name is returned e.g. for V00488
data.proteinName = value;
line = nextLine();
}
else if ("translation".equals(qualifier))
{
line = readTranslation(value, data);
}
else if (!"".equals(value))
{
// throw anything else into the additional properties hash
data.cdsProps.put(qualifier, value);
line = nextLine();
}
}
this.cds.add(data);
return line;
}
/**
* Reads and returns the CDS translation from one or more lines of the file,
* and returns the next line after that
*
* @param value
* the first line of the translation (likely quoted)
* @param data
* @return
* @throws IOException
*/
String readTranslation(String value, CdsData data) throws IOException
{
StringBuilder sb = new StringBuilder(this.length / 3 + 1);
sb.append(value.replace("\"", ""));
String line;
while ((line = nextLine()) != null)
{
if (!line.startsWith("FT "))
{
break; // reached next feature or other input line
}
String[] tokens = line.split(WHITESPACE);
if (tokens.length < 2)
{
Cache.log.error("Ignoring bad EMBL line: " + line);
break;
}
if (tokens[1].startsWith("/"))
{
break; // next feature qualifier
}
sb.append(tokens[1].replace("\"", ""));
}
data.translation = sb.toString();
return line;
}
/**
* Constructs and saves the sequence from parsed components
*/
void assembleSequence()
{
String name = this.accession;
if (this.sourceDb != null)
{
name = this.sourceDb + "|" + name;
}
SequenceI seq = new Sequence(name, this.sequenceString);
seq.setDescription(this.description);
/*
* add a DBRef to itself
*/
DBRefEntry selfRef = new DBRefEntry(sourceDb, version, accession);
int[] startEnd = new int[] { 1, seq.getLength() };
selfRef.setMap(new Mapping(null, startEnd, startEnd, 1, 1));
seq.addDBRef(selfRef);
for (DBRefEntry dbref : this.dbrefs)
{
seq.addDBRef(dbref);
}
processAllCDS(seq);
seq.deriveSequence();
addSequence(seq);
}
/**
* Process the CDS features, including generation of cross-references and
* mappings to the protein products (translation)
*
* @param seq
*/
protected void processAllCDS(SequenceI seq)
{
/*
* record protein products found to avoid duplication i.e. >1 CDS with
* the same /protein_id [though not sure I can find an example of this]
*/
Map proteins = new HashMap<>();
for (CdsData data : cds)
{
processOneCDS(seq, data, proteins);
}
}
/**
* Processes the parsed CDS feature data to
*
* - add a CDS feature to the sequence for each CDS start-end range
* - create a protein product sequence for the translation
* - create a cross-reference to protein with mapping from dna
* - add any CDS dbrefs to the sequence and to the protein product
*
*
* @param SequenceI
* dna
* @param proteins
* map of protein products so far derived from CDS data
*/
void processOneCDS(SequenceI dna, CdsData data,
Map proteins)
{
/*
* parse location into a list of [start, end, start, end] positions
*/
int[] exons = getCdsRanges(this.accession, data.cdsLocation);
int exonNumber = 0;
for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
{
int exonStart = exons[xint];
int exonEnd = exons[xint + 1];
int begin = Math.min(exonStart, exonEnd);
int end = Math.max(exonStart, exonEnd);
exonNumber++;
String desc = String.format("Exon %d for protein EMBLCDS:%s",
exonNumber, data.proteinId);
SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end,
this.sourceDb);
for (Entry val : data.cdsProps.entrySet())
{
sf.setValue(val.getKey(), val.getValue());
}
sf.setEnaLocation(data.cdsLocation);
boolean forwardStrand = exonStart <= exonEnd;
sf.setStrand(forwardStrand ? "+" : "-");
sf.setPhase(String.valueOf(data.codonStart - 1));
sf.setValue(FeatureProperties.EXONPOS, exonNumber);
sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName);
dna.addSequenceFeature(sf);
linkProteinProduct(dna, data, proteins);
}
}
/**
* Constructs a sequence for the protein product for the CDS data (if there is
* one), and dbrefs with mappings from CDS to protein and the reverse
*
* @param dna
* @param data
* @param proteins
*/
void linkProteinProduct(SequenceI dna, CdsData data, Map proteins)
{
/*
* check we have some data to work with
*/
if (data.proteinId == null || data.translation == null)
{
return;
}
/*
* Construct the protein sequence (if not already seen)
*/
SequenceI protein = proteins.get(data.proteinId);
if (protein == null)
{
protein = new Sequence(data.proteinId, data.translation, 1,
data.translation.length());
protein.setDescription(data.proteinName != null ? data.proteinName
: "Protein Product from " + sourceDb);
proteins.put(data.proteinId, protein);
}
}
/**
* Returns the CDS location as a single array of [start, end, start, end...]
* positions. If on the reverse strand, these will be in descending order.
*
* @param accession
* @param location
* @return
*/
protected int[] getCdsRanges(String accession, String location)
{
if (location == null)
{
return new int[] {};
}
try
{
List ranges = DnaUtils.parseLocation(location);
return MappingUtils.listToArray(ranges);
} catch (ParseException e)
{
Cache.log.warn(
String.format("Not parsing inexact CDS location %s in ENA %s",
location, accession));
return new int[] {};
}
}
/**
* Output (print) is not implemented for EMBL flat file format
*/
@Override
public String print(SequenceI[] seqs, boolean jvsuffix)
{
return null;
}
}