package jalview.datamodel.xdb.embl;
import jalview.analysis.SequenceIdMatcher;
+import jalview.bin.Cache;
import jalview.datamodel.DBRefEntry;
import jalview.datamodel.DBRefSource;
import jalview.datamodel.FeatureProperties;
import jalview.util.MappingUtils;
import jalview.util.StringUtils;
+import java.text.ParseException;
import java.util.Arrays;
import java.util.Hashtable;
import java.util.List;
// dbref
retrievedref.setMap(new Mapping(null, new int[] { 1, dna.getLength() },
new int[] { 1, dna.getLength() }, 1, 1));
- // TODO: transform EMBL Database refs to canonical form
+
+
+ /*
+ * transform EMBL Database refs to canonical form
+ */
if (dbRefs != null)
{
for (DBRefEntry dbref : dbRefs)
{
+ dbref.setSource(DBRefUtils.getCanonicalName(dbref.getSource()));
dna.addDBRef(dbref);
}
}
+ SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
try
{
for (EmblFeature feature : features)
{
- if (feature.dbRefs != null)
- {
- for (DBRefEntry dbref : feature.dbRefs)
- {
- /*
- * convert UniProtKB/Swiss-Prot to UNIPROT
- */
- dbref.setSource(DBRefUtils.getCanonicalName(dbref.getSource()));
- dna.addDBRef(dbref);
- }
- }
if (FeatureProperties.isCodingFeature(sourceDb, feature.getName()))
{
- parseCodingFeature(feature, sourceDb, dna, peptides);
+ parseCodingFeature(feature, sourceDb, dna, peptides, matcher);
}
}
} catch (Exception e)
* parent dna sequence for this record
* @param peptides
* list of protein product sequences for Embl entry
+ * @param matcher
+ * helper to match xrefs in already retrieved sequences
*/
void parseCodingFeature(EmblFeature feature, String sourceDb,
- SequenceI dna, List<SequenceI> peptides)
+ SequenceI dna, List<SequenceI> peptides, SequenceIdMatcher matcher)
{
boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
String prname = "";
String prid = null;
Map<String, String> vals = new Hashtable<String, String>();
- SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
/*
* codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
}
else if (qname.equals("protein_id"))
{
- prid = q.getValues()[0];
+ prid = q.getValues()[0].trim();
}
else if (qname.equals("codon_start"))
{
try
{
- codonStart = Integer.parseInt(q.getValues()[0]);
+ codonStart = Integer.parseInt(q.getValues()[0].trim());
} catch (NumberFormatException e)
{
System.err.println("Invalid codon_start in XML for "
else if (qname.equals("product"))
{
// sometimes name is returned e.g. for V00488
- prname = q.getValues()[0];
+ prname = q.getValues()[0].trim();
}
else
{
}
/*
- * add mappings for Uniprot xrefs
+ * add dbRefs to sequence, and mappings for Uniprot xrefs
*/
if (feature.dbRefs != null)
{
boolean mappingUsed = false;
for (DBRefEntry ref : feature.dbRefs)
{
+ /*
+ * ensure UniProtKB/Swiss-Prot converted to UNIPROT
+ */
+ ref.setSource(DBRefUtils.getCanonicalName(ref.getSource()));
if (ref.getSource().equals(DBRefSource.UNIPROT))
{
String proteinSeqName = DBRefSource.UNIPROT + "|"
}
}
}
+ dna.addDBRef(ref);
}
if (noProteinDbref && product != null)
{
}
/**
- * Returns the CDS positions as a list of [start, end, start, end...]
+ * Returns the CDS positions as a single array of [start, end, start, end...]
* positions. If on the reverse strand, these will be in descending order.
*
* @param feature
{
return new int[] {};
}
- List<int[]> ranges = DnaUtils.parseLocation(feature.location);
- return ranges == null ? new int[] {} : listToArray(ranges);
+
+ try
+ {
+ List<int[]> ranges = DnaUtils.parseLocation(feature.location);
+ return listToArray(ranges);
+ } catch (ParseException e)
+ {
+ Cache.log.warn(String.format(
+ "Not parsing inexact CDS location %s in ENA %s",
+ feature.location, this.accession));
+ return new int[] {};
+ }
}
/**