package jalview.io;
import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import jalview.bin.Cache;
import jalview.datamodel.DBRefEntry;
import jalview.datamodel.DBRefSource;
import jalview.datamodel.FeatureProperties;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.util.DnaUtils;
import jalview.util.MappingUtils;
/**
* A class that provides selective parsing of the EMBL flatfile format.
*
* The initial implementation is limited to extracting fields used by Jalview
* after fetching an EMBL or EMBLCDS entry:
*
* <pre>
* accession, version, sequence, xref
* and (for CDS feature) location, protein_id, product, codon_start, translation
* </pre>
*
* For a complete parser, it may be best to adopt that provided in
* https://github.com/enasequence/sequencetools/tree/master/src/main/java/uk/ac/ebi/embl/flatfile
* (but note this has a dependency on the Apache Commons library)
*
* @author gmcarstairs
* @see ftp://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/doc/usrman.txt
* @see ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html
*/
public class EmblFlatFile extends AlignFile // FileParse
{
private static final String WHITESPACE = "\\s+";
private String sourceDb;
/*
* values parsed from the EMBL flatfile record
*/
private String accession; // from ID (first token)
private String version; // from ID (second token)
private int length = 128; // from ID (7th token), with usable default
private List dbrefs; // from DR and also CDS /db_xref qualifiers
private String sequenceString; // from SQ lines
private String translation; // from CDS feature /translation
private String cdsLocation; // CDS /location raw value
private int codonStart = 1; // from CDS /codon_start
private String proteinName; // from CDS /product
private String proteinId; // from CDS /protein_id
private Map cdsProps; // CDS other qualifiers e.g. 'note'
/**
* Constructor
* @param fp
* @param sourceId
* @throws IOException
*/
public EmblFlatFile(FileParse fp, String sourceId) throws IOException
{
super(false, fp); // don't parse immediately
this.sourceDb = sourceId;
dbrefs = new ArrayList<>();
cdsProps = new Hashtable<>();
}
/**
* Parses the flatfile, and if successful, saves as an annotated sequence
* which may be retrieved by calling {@code getSequence()}
*
* @throws IOException
*/
public void parse() throws IOException
{
String line = nextLine();
while (line != null)
{
if (line.startsWith("ID"))
{
line = processID(line);
}
else if (line.startsWith("DR"))
{
line = processDR(line);
}
else if (line.startsWith("SQ"))
{
line = processSQ();
}
else if (line.startsWith("FT"))
{
line = processFT(line);
}
else
{
line = nextLine();
}
}
assembleSequence();
}
/**
* Extracts and saves the primary accession and version (SV value) from an ID
* line, or null if not found. Returns the next line after the one processed.
*
* @param line
* @throws IOException
*/
String processID(String line) throws IOException
{
String[] tokens = line.substring(2).split(";");
/*
* first is primary accession
*/
String token = tokens[0].trim();
if (!token.isEmpty())
{
this.accession = token;
}
/*
* second token is 'SV versionNo'
*/
if (tokens.length > 1)
{
token = tokens[1].trim();
if (token.startsWith("SV"))
{
String[] bits = token.trim().split(WHITESPACE);
this.version = bits[bits.length - 1];
}
}
/*
* seventh token is 'length BP'
*/
if (tokens.length > 6)
{
token = tokens[6].trim();
String[] bits = token.trim().split(WHITESPACE);
try
{
this.length = Integer.valueOf(bits[0]);
} catch (NumberFormatException e)
{
Cache.log.error("bad length read in flatfile, line: " + line);
}
}
return nextLine();
}
/**
* Processes one DR line and saves as a DBRefEntry cross-reference. Returns
* the line following the line processed.
*
* @param line
* @throws IOException
*/
String processDR(String line) throws IOException
{
String[] tokens = line.substring(2).split(";");
if (tokens.length > 1)
{
String db = tokens[0].trim();
String acc = tokens[1].trim();
if (acc.endsWith("."))
{
acc = acc.substring(0, acc.length() - 1);
}
this.dbrefs.add(new DBRefEntry(db, "0", acc));
}
return nextLine();
}
/**
* Reads and saves the sequence, read from the lines following the SQ line.
* Whitespace and position counters are discarded. Returns the next line
* following the sequence data (the next line that doesn't start with
* whitespace).
*
* @throws IOException
*/
String processSQ() throws IOException
{
StringBuilder sb = new StringBuilder(this.length);
String line = nextLine();
while (line != null && line.startsWith(" "))
{
line = line.trim();
String[] blocks = line.split(WHITESPACE);
/*
* omit the last block (position counter) on each line
*/
for (int i = 0; i < blocks.length - 1; i++)
{
sb.append(blocks[i]);
}
line = nextLine();
}
this.sequenceString = sb.toString();
return line;
}
/**
* Processes an FT line. If it declares a feature type of interest (currently,
* only CDS is processed), processes all of the associated lines (feature
* qualifiers), and returns the next line after that, otherwise simply returns
* the next line.
*
* @param line
* @return
* @throws IOException
*/
String processFT(String line) throws IOException
{
String[] tokens = line.split(WHITESPACE);
if (tokens.length < 3 || !"CDS".equals(tokens[1]))
{
return nextLine();
}
this.cdsLocation = tokens[2];
while ((line = nextLine()) != null)
{
if (!line.startsWith("FT ")) // 4 spaces
{
// e.g. start of next feature "FT source..."
break;
}
/*
* extract qualifier, e.g. FT /protein_id="CAA37824.1"
*/
int slashPos = line.indexOf('/');
if (slashPos == -1)
{
Cache.log.error("Unexpected EMBL line ignored: " + line);
continue;
}
int eqPos = line.indexOf('=', slashPos + 1);
if (eqPos == -1)
{
Cache.log.error("Unexpected EMBL line ignored: " + line);
continue;
}
String qualifier = line.substring(slashPos + 1, eqPos);
String value = line.substring(eqPos + 1);
if (value.startsWith("\"") && value.endsWith("\""))
{
value = value.substring(1, value.length() - 1);
}
if ("protein_id".equals(qualifier))
{
proteinId = value;
}
else if ("codon_start".equals(qualifier))
{
try
{
codonStart = Integer.parseInt(value.trim());
} catch (NumberFormatException e)
{
Cache.log.error("Invalid codon_start in XML for " + this.accession
+ ": " + e.getMessage());
}
}
else if ("product".equals(qualifier))
{
// sometimes name is returned e.g. for V00488
proteinName = value;
}
else if ("translation".equals(qualifier))
{
line = readTranslation(value);
}
else if (!"".equals(value))
{
// throw anything else into the additional properties hash
cdsProps.put(qualifier, value);
}
}
return line;
}
/**
* Reads and saves the CDS translation from one or more lines of the file, and
* returns the next line after that
*
* @param value
* the first line of the translation (likely quoted)
* @return
* @throws IOException
*/
String readTranslation(String value) throws IOException
{
StringBuilder sb = new StringBuilder(this.length / 3 + 1);
sb.append(value.replace("\"", ""));
String line;
while ((line = nextLine()) != null)
{
if (!line.startsWith("FT "))
{
break; // reached next feature or other input line
}
String[] tokens = line.split(WHITESPACE);
if (tokens.length < 2)
{
Cache.log.error("Ignoring bad EMBL line: " + line);
break;
}
if (tokens[1].startsWith("/"))
{
break; // next feature qualifier
}
sb.append(tokens[1].replace("\"", ""));
}
return sb.toString();
}
/**
* Processes the parsed CDS feature data to
*
* - add a CDS feature to the sequence for each CDS start-end range
* - create a protein product sequence for the translation
* - create a cross-reference to protein with mapping from dna
* - add any CDS dbrefs to the sequence and to the protein product
*
* @param SequenceI dna
*/
void processCDS(SequenceI dna)
{
/*
* parse location into a list of [start, end, start, end] positions
*/
int[] exons = getCdsRanges(this.accession, this.cdsLocation);
int exonNumber = 0;
for (int xint = 0; exons != null
&& xint < exons.length - 1; xint += 2)
{
int exonStart = exons[xint];
int exonEnd = exons[xint + 1];
int begin = Math.min(exonStart, exonEnd);
int end = Math.max(exonStart, exonEnd);
exonNumber++;
String desc = String.format("Exon %d for protein EMBLCDS:%s",
exonNumber, proteinId);
SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end, this.sourceDb);
if (!cdsProps.isEmpty())
{
for (Entry val : cdsProps.entrySet())
{
sf.setValue(val.getKey(), val.getValue());
}
}
sf.setEnaLocation(this.cdsLocation);
boolean forwardStrand = exonStart <= exonEnd;
sf.setStrand(forwardStrand ? "+" : "-");
sf.setPhase(String.valueOf(codonStart - 1));
sf.setValue(FeatureProperties.EXONPOS, exonNumber);
sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
dna.addSequenceFeature(sf);
}
}
/**
* Constructs and saves the sequence from parsed components
*/
void assembleSequence()
{
String name = this.accession;
if (this.sourceDb != null)
{
name = this.sourceDb + "|" + name;
}
SequenceI seq = new Sequence(name, this.sequenceString);
for (DBRefEntry dbref : this.dbrefs)
{
seq.addDBRef(dbref);
}
processCDS(seq);
seq.deriveSequence();
addSequence(seq);
}
/**
* Output (print) is not implemented for EMBL flat file format
*/
@Override
public String print(SequenceI[] seqs, boolean jvsuffix)
{
return null;
}
/**
* Returns the CDS location as a single array of [start, end, start, end...]
* positions. If on the reverse strand, these will be in descending order.
*
* @param accession
* @param location
* @return
*/
protected int[] getCdsRanges(String accession, String location)
{
if (location == null)
{
return new int[] {};
}
try
{
List ranges = DnaUtils.parseLocation(location);
return MappingUtils.listToArray(ranges);
} catch (ParseException e)
{
Cache.log.warn(
String.format("Not parsing inexact CDS location %s in ENA %s",
location, accession));
return new int[] {};
}
}
}