import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
+import jalview.util.DBRefUtils;
import jalview.util.DnaUtils;
import jalview.util.MappingUtils;
*/
public class EmblFlatFile extends AlignFile // FileParse
{
+ /**
+ * A data bean holding the values parsed from one CDS feature (FT lines) of the record
+ */
+ class CdsData
+ {
+ String translation; // protein translation text, from the CDS /translation qualifier
+
+ String cdsLocation; // raw location text taken from the 'FT CDS' line, not yet parsed into ranges
+
+ int codonStart = 1; // from CDS /codon_start; stays 1 if absent or not a valid integer
+
+ String proteinName; // from CDS /product; TODO: use for protein description
+
+ String proteinId; // from CDS /protein_id
+
+ Map<String, String> cdsProps = new Hashtable<>(); // any other CDS qualifiers with a non-empty value, keyed by qualifier name
+ }
+
private static final String WHITESPACE = "\\s+";
private String sourceDb;
-
+
/*
* values parsed from the EMBL flatfile record
*/
private String sequenceString; // from SQ lines
- private String translation; // from CDS feature /translation
-
- private String cdsLocation; // CDS /location raw value
-
- private int codonStart = 1; // from CDS /codon_start
-
- private String proteinName; // from CDS /product
-
- private String proteinId; // from CDS /protein_id
-
- private Map<String, String> cdsProps; // CDS other qualifiers e.g. 'note'
-
+ private List<CdsData> cds;
+
/**
* Constructor
+ *
* @param fp
* @param sourceId
* @throws IOException
super(false, fp); // don't parse immediately
this.sourceDb = sourceId;
dbrefs = new ArrayList<>();
- cdsProps = new Hashtable<>();
+ cds = new ArrayList<>();
}
/**
{
if (line.startsWith("ID"))
{
- line = processID(line);
+ line = parseID(line);
}
else if (line.startsWith("DR"))
{
- line = processDR(line);
+ line = parseDR(line);
}
else if (line.startsWith("SQ"))
{
- line = processSQ();
+ line = parseSQ();
}
else if (line.startsWith("FT"))
{
- line = processFT(line);
+ line = parseFT(line);
}
else
{
* @param line
* @throws IOException
*/
- String processID(String line) throws IOException
+ String parseID(String line) throws IOException
{
String[] tokens = line.substring(2).split(";");
* @param line
* @throws IOException
*/
- String processDR(String line) throws IOException
+ String parseDR(String line) throws IOException
{
String[] tokens = line.substring(2).split(";");
if (tokens.length > 1)
{
+ /*
+ * ensure UniProtKB/Swiss-Prot converted to UNIPROT
+ */
String db = tokens[0].trim();
+ db = DBRefUtils.getCanonicalName(db);
String acc = tokens[1].trim();
if (acc.endsWith("."))
{
acc = acc.substring(0, acc.length() - 1);
}
- this.dbrefs.add(new DBRefEntry(db, "0", acc));
+ String version = "0";
+ if (tokens.length > 2)
+ {
+ String secondaryId = tokens[2].trim();
+ if (!secondaryId.isEmpty())
+ {
+ // todo: is this right? secondary id is not a version number
+ // version = secondaryId;
+ }
+ }
+ this.dbrefs.add(new DBRefEntry(db, version, acc));
}
return nextLine();
*
* @throws IOException
*/
- String processSQ() throws IOException
+ String parseSQ() throws IOException
{
StringBuilder sb = new StringBuilder(this.length);
String line = nextLine();
* @return
* @throws IOException
*/
- String processFT(String line) throws IOException
+ String parseFT(String line) throws IOException
{
String[] tokens = line.split(WHITESPACE);
if (tokens.length < 3 || !"CDS".equals(tokens[1]))
return nextLine();
}
- this.cdsLocation = tokens[2];
+ CdsData data = new CdsData();
+ data.cdsLocation = tokens[2];
- while ((line = nextLine()) != null)
+ line = nextLine();
+ while (line != null)
{
if (!line.startsWith("FT ")) // 4 spaces
{
if ("protein_id".equals(qualifier))
{
- proteinId = value;
+ data.proteinId = value;
+ line = nextLine();
}
else if ("codon_start".equals(qualifier))
{
try
{
- codonStart = Integer.parseInt(value.trim());
+ data.codonStart = Integer.parseInt(value.trim());
} catch (NumberFormatException e)
{
Cache.log.error("Invalid codon_start in XML for " + this.accession
+ ": " + e.getMessage());
}
+ line = nextLine();
+ }
+ else if ("db_xref".equals(qualifier))
+ {
+ String[] parts = value.split(":");
+ if (parts.length == 2)
+ {
+ String db = parts[0].trim();
+ db = DBRefUtils.getCanonicalName(db);
+ DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim());
+ this.dbrefs.add(dbref);
+ }
+ line = nextLine();
}
else if ("product".equals(qualifier))
{
// sometimes name is returned e.g. for V00488
- proteinName = value;
+ data.proteinName = value;
+ line = nextLine();
}
else if ("translation".equals(qualifier))
{
- line = readTranslation(value);
+ line = readTranslation(value, data);
}
else if (!"".equals(value))
{
// throw anything else into the additional properties hash
- cdsProps.put(qualifier, value);
+ data.cdsProps.put(qualifier, value);
+ line = nextLine();
}
}
+
+ this.cds.add(data);
return line;
}
/**
- * Reads and saves the CDS translation from one or more lines of the file, and
- * returns the next line after that
+ * Reads the CDS translation from one or more lines of the file, saves it in
+ * the supplied data bean, and returns the next line after that
*
* @param value
* the first line of the translation (likely quoted)
+ * @param data
+ *          the CDS data bean in which the translation is saved
* @return
* @throws IOException
*/
- String readTranslation(String value) throws IOException
+ String readTranslation(String value, CdsData data) throws IOException
{
StringBuilder sb = new StringBuilder(this.length / 3 + 1);
sb.append(value.replace("\"", ""));
sb.append(tokens[1].replace("\"", ""));
}
- return sb.toString();
+ data.translation = sb.toString();
+
+ return line;
}
/**
* <li>add a CDS feature to the sequence for each CDS start-end range</li>
* <li>create a protein product sequence for the translation</li>
* <li>create a cross-reference to protein with mapping from dna</li>
- * <li>add any CDS dbrefs to the sequence and to the protein product</li>
+ * <li>add any CDS dbrefs to the sequence and to the protein product</li>
* </ul>
- * @param SequenceI dna
+ *
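+ * @param dna
+ *          the sequence to which the CDS features are added
+ * @param data
+ *          the values parsed from one CDS feature of the file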
*/
- void processCDS(SequenceI dna)
+ void processCDS(SequenceI dna, CdsData data)
{
/*
* parse location into a list of [start, end, start, end] positions
*/
- int[] exons = getCdsRanges(this.accession, this.cdsLocation);
+ int[] exons = getCdsRanges(this.accession, data.cdsLocation);
int exonNumber = 0;
-
- for (int xint = 0; exons != null
- && xint < exons.length - 1; xint += 2)
+
+ for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
{
int exonStart = exons[xint];
int exonEnd = exons[xint + 1];
int end = Math.max(exonStart, exonEnd);
exonNumber++;
String desc = String.format("Exon %d for protein EMBLCDS:%s",
- exonNumber, proteinId);
+ exonNumber, data.proteinId);
- SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end, this.sourceDb);
- if (!cdsProps.isEmpty())
+ SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end,
+ this.sourceDb);
+ for (Entry<String, String> val : data.cdsProps.entrySet())
{
- for (Entry<String, String> val : cdsProps.entrySet())
- {
- sf.setValue(val.getKey(), val.getValue());
- }
+ sf.setValue(val.getKey(), val.getValue());
}
- sf.setEnaLocation(this.cdsLocation);
+ sf.setEnaLocation(data.cdsLocation);
boolean forwardStrand = exonStart <= exonEnd;
sf.setStrand(forwardStrand ? "+" : "-");
- sf.setPhase(String.valueOf(codonStart - 1));
+ sf.setPhase(String.valueOf(data.codonStart - 1));
sf.setValue(FeatureProperties.EXONPOS, exonNumber);
- sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
+ sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName);
dna.addSequenceFeature(sf);
}
}
/**
+ * Placeholder for constructing a sequence for the protein product (if there is
+ * one), and dbrefs with mappings from dna to protein and the reverse
+ */
+ void processTranslation()
+ {
+ // TODO not yet implemented: this method currently does nothing
+
+ }
+
+ /**
* Constructs and saves the sequence from parsed components
*/
void assembleSequence()
{
seq.addDBRef(dbref);
}
-
- processCDS(seq);
+
+ // add CDS features to the sequence for each CDS feature parsed from the file
+ for (CdsData data : cds)
+ {
+ processCDS(seq, data);
+ }
+
+ processTranslation();
+
seq.deriveSequence();
-
+
addSequence(seq);
}