import jalview.datamodel.Alignment;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.DBRefEntry;
+import jalview.datamodel.DBRefSource;
import jalview.datamodel.Mapping;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
findXrefSourcesForSequence(seq, dna, sources);
}
}
+ sources.remove(DBRefSource.EMBL); // hack to prevent EMBL xrefs resulting in
+ // redundant datasets
+ if (dna)
+ {
+ sources.remove(DBRefSource.ENSEMBL); // hack to prevent Ensembl and
+ // EnsemblGenomes xref option shown
+ // from cdna panel
+ sources.remove(DBRefSource.ENSEMBLGENOMES);
+ }
+ // redundant datasets
return sources;
}
if (!rseqs.contains(matchInDataset))
{
rseqs.add(matchInDataset);
- // need to try harder to only add unique mappings
- if (xref.getMap().getMap().isTripletMap()
- && dataset.getMapping(seq, matchInDataset) == null
- && cf.getMappingBetween(seq, matchInDataset) == null)
+ }
+ // even if rseqs contained matchInDataset - check mappings between
+ // these seqs are added
+ // need to try harder to only add unique mappings
+ if (xref.getMap().getMap().isTripletMap()
+ && dataset.getMapping(seq, matchInDataset) == null
+ && cf.getMappingBetween(seq, matchInDataset) == null)
+ {
+ // materialise a mapping for highlighting between these
+ // sequences
+ if (fromDna)
{
- // materialise a mapping for highlighting between these sequences
- if (fromDna)
- {
- cf.addMap(dss, matchInDataset, xref.getMap().getMap(), xref.getMap().getMappedFromId());
- } else {
- cf.addMap(matchInDataset, dss, xref.getMap().getMap().getInverse(), xref.getMap().getMappedFromId());
- }
+ cf.addMap(dss, matchInDataset, xref.getMap().getMap(),
+ xref.getMap().getMappedFromId());
+ }
+ else
+ {
+ cf.addMap(matchInDataset, dss, xref.getMap().getMap()
+ .getInverse(), xref.getMap().getMappedFromId());
}
}
+
refIterator.remove();
continue;
}
// first filter in case we are retrieving crossrefs that have already been
// retrieved. this happens for cases where a database record doesn't yield
// protein products for CDS
- DBRefEntry[] dbrSourceSet = sourceRefs.toArray(new DBRefEntry[0]);
- for (SequenceI sq : dataset.getSequences())
- {
- boolean dupeFound = false;
- // !fromDna means we are looking only for nucleotide sequences, not
- // protein
- if (sq.isProtein() == fromDna)
- {
- for (DBRefEntry dbr : sq.getPrimaryDBRefs())
- {
- for (DBRefEntry found : DBRefUtils.searchRefs(dbrSourceSet, dbr))
- {
- sourceRefs.remove(found);
- dupeFound = true;
- }
- }
- }
- if (dupeFound)
- {
- dbrSourceSet = sourceRefs.toArray(new DBRefEntry[0]);
- }
- }
+ removeAlreadyRetrievedSeqs(sourceRefs, fromDna);
if (sourceRefs.size() == 0)
{
// no more work to do! We already had all requested sequence records in
if (retrieved != null)
{
- updateDbrefMappings(seq, xrfs, retrieved, cf, fromDna);
+ boolean addedXref = false;
+ List<SequenceI> newDsSeqs = new ArrayList<SequenceI>(), doNotAdd = new ArrayList<SequenceI>();
+
for (SequenceI retrievedSequence : retrieved)
{
// dataset gets contaminated ccwith non-ds sequences. why ??!
// try: Ensembl -> Nuc->Ensembl, Nuc->Uniprot-->Protein->EMBL->
SequenceI retrievedDss = retrievedSequence.getDatasetSequence() == null ? retrievedSequence
: retrievedSequence.getDatasetSequence();
- DBRefEntry[] dbr = retrievedSequence.getDBRefs();
- if (dbr != null)
+ addedXref |= importCrossRefSeq(cf, newDsSeqs, doNotAdd, dss,
+ retrievedDss);
+ }
+ if (!addedXref)
+ {
+ // try again, after looking for matching IDs
+ // shouldn't need to do this unless the dbref mechanism has broken.
+ updateDbrefMappings(seq, xrfs, retrieved, cf, fromDna);
+ for (SequenceI retrievedSequence : retrieved)
+ {
+ // dataset gets contaminated with non-ds sequences. why ??!
+ // try: Ensembl -> Nuc->Ensembl, Nuc->Uniprot-->Protein->EMBL->
+ SequenceI retrievedDss = retrievedSequence.getDatasetSequence() == null ? retrievedSequence
+ : retrievedSequence.getDatasetSequence();
+ addedXref |= importCrossRefSeq(cf, newDsSeqs, doNotAdd, dss,
+ retrievedDss);
+ }
+ }
+ for (SequenceI newToSeq : newDsSeqs)
+ {
+ if (!doNotAdd.contains(newToSeq)
+ && dataset.findIndex(newToSeq) == -1)
{
- for (DBRefEntry dbref : dbr)
+ dataset.addSequence(newToSeq);
+ matcher.add(newToSeq);
+ }
+ }
+ }
+ }
+
+ /**
+ * Search dataset for sequences with a primary reference contained in
+ * sourceRefs.
+ *
+ * @param sourceRefs
+ * - list of references to filter.
+ * @param fromDna
+ * - type of sequence to search for matching primary reference.
+ */
+ private void removeAlreadyRetrievedSeqs(List<DBRefEntry> sourceRefs,
+ boolean fromDna)
+ {
+ DBRefEntry[] dbrSourceSet = sourceRefs.toArray(new DBRefEntry[0]);
+ for (SequenceI sq : dataset.getSequences())
+ {
+ boolean dupeFound = false;
+ // !fromDna means we are looking only for nucleotide sequences, not
+ // protein
+ if (sq.isProtein() == fromDna)
+ {
+ for (DBRefEntry dbr : sq.getPrimaryDBRefs())
+ {
+ for (DBRefEntry found : DBRefUtils.searchRefs(dbrSourceSet, dbr))
{
- // find any entry where we should put in the sequence being
- // cross-referenced into the map
- Mapping map = dbref.getMap();
- if (map != null)
+ sourceRefs.remove(found);
+ dupeFound = true;
+ }
+ }
+ }
+ if (dupeFound)
+ {
+ // rebuild the search array from the filtered sourceRefs list
+ dbrSourceSet = sourceRefs.toArray(new DBRefEntry[0]);
+ }
+ }
+ }
+
+ /**
+ * process sequence retrieved via a dbref on source sequence to resolve and
+ * transfer data
+ *
+ * @param cf
+ * @param newDsSeqs
+ *          collects sequences that are new to the dataset
+ * @param doNotAdd
+ *          collects sequences that should not be added to the dataset
+ * @param sourceSequence
+ * @param retrievedSequence
+ * @return true if retrievedSequence was imported
+ */
+ private boolean importCrossRefSeq(AlignedCodonFrame cf,
+ List<SequenceI> newDsSeqs, List<SequenceI> doNotAdd,
+ SequenceI sourceSequence, SequenceI retrievedSequence)
+ {
+ /**
+ * set when retrievedSequence has been verified as a crossreference for
+ * sourceSequence
+ */
+ boolean imported = false;
+ DBRefEntry[] dbr = retrievedSequence.getDBRefs();
+ if (dbr != null)
+ {
+ for (DBRefEntry dbref : dbr)
+ {
+ SequenceI matched = findInDataset(dbref);
+ if (matched == sourceSequence)
+ {
+ // verified retrieved and source sequence cross-reference each other
+ imported = true;
+ }
+ // find any entry where we should put in the sequence being
+ // cross-referenced into the map
+ Mapping map = dbref.getMap();
+ if (map != null)
+ {
+ if (map.getTo() != null && map.getMap() != null)
+ {
+ if (map.getTo() == sourceSequence)
+ {
+ // already called to import once, and most likely this sequence
+ // already imported !
+ continue;
+ }
+ if (matched == null)
{
- if (map.getTo() != null && map.getMap() != null)
+ /*
+ * sequence is new to dataset, so save a reference so it can be added.
+ */
+ newDsSeqs.add(map.getTo());
+ continue;
+ }
+
+ /*
+ * there was a matching sequence in dataset, so now, check to see if we can update the map.getTo() sequence to the existing one.
+ */
+
+ try
+ {
+ // compare ms with dss and replace with dss in mapping
+ // if map is congruent
+ SequenceI ms = map.getTo();
+ // TODO findInDataset requires exact sequence match but
+ // 'congruent' test is only for the mapped part
+ // maybe not a problem in practice since only ENA provide a
+ // mapping and it is to the full protein translation of CDS
+ // matcher.findIdMatch(map.getTo());
+ // TODO addendum: if matched is shorter than getTo, this will fail
+ // - when it should really succeed.
+ int sf = map.getMap().getToLowest();
+ int st = map.getMap().getToHighest();
+ SequenceI mappedrg = ms.getSubSequence(sf, st);
+ if (mappedrg.getLength() > 0
+ && ms.getSequenceAsString().equals(
+ matched.getSequenceAsString()))
{
- // TODO findInDataset requires exact sequence match but
- // 'congruent' test is only for the mapped part
- // maybe not a problem in practice since only ENA provide a
- // mapping and it is to the full protein translation of CDS
- SequenceI matched = findInDataset(dbref);
- // matcher.findIdMatch(map.getTo());
- if (matched != null)
+ /*
+ * sequences match - update the mapping to point to the
+ * matched dataset sequence
+ */
+ String msg = "Mapping updated from " + ms.getName()
+ + " to retrieved crossreference "
+ + matched.getName();
+ System.out.println(msg);
+
+ DBRefEntry[] toRefs = map.getTo().getDBRefs();
+ if (toRefs != null)
{
/*
- * already got an xref to this sequence; update this
- * map to point to the same sequence, and add
- * any new dbrefs to it
+ * transfer database refs
*/
- DBRefEntry[] toRefs = map.getTo().getDBRefs();
- if (toRefs != null)
+ for (DBRefEntry ref : toRefs)
{
- for (DBRefEntry ref : toRefs)
+ if (dbref.getSrcAccString().equals(
+ ref.getSrcAccString()))
{
- matched.addDBRef(ref); // add or update mapping
+ continue; // avoid overwriting the ref on source sequence
}
- }
- map.setTo(matched);
- }
- else
- {
- if (dataset.findIndex(map.getTo()) == -1)
- {
- dataset.addSequence(map.getTo());
- matcher.add(map.getTo());
+ matched.addDBRef(ref); // add or update mapping
}
}
- try
- {
- // compare ms with dss and replace with dss in mapping
- // if map is congruent
- SequenceI ms = map.getTo();
- int sf = map.getMap().getToLowest();
- int st = map.getMap().getToHighest();
- SequenceI mappedrg = ms.getSubSequence(sf, st);
- // SequenceI loc = dss.getSubSequence(sf, st);
- if (mappedrg.getLength() > 0
- && ms.getSequenceAsString().equals(
- dss.getSequenceAsString()))
- // && mappedrg.getSequenceAsString().equals(
- // loc.getSequenceAsString()))
- {
- String msg = "Mapping updated from " + ms.getName()
- + " to retrieved crossreference "
- + dss.getName();
- System.out.println(msg);
- map.setTo(dss);
+ doNotAdd.add(map.getTo());
+ map.setTo(matched);
- /*
- * give the reverse reference the inverse mapping
- * (if it doesn't have one already)
- */
- setReverseMapping(dss, dbref, cf);
+ /*
+ * give the reverse reference the inverse mapping
+ * (if it doesn't have one already)
+ */
+ setReverseMapping(matched, dbref, cf);
+ /*
+ * copy sequence features as well, avoiding
+ * duplication (e.g. same variation from two
+ * transcripts)
+ */
+ SequenceFeature[] sfs = ms.getSequenceFeatures();
+ if (sfs != null)
+ {
+ for (SequenceFeature feat : sfs)
+ {
/*
- * copy sequence features as well, avoiding
- * duplication (e.g. same variation from two
- * transcripts)
+ * make a flyweight feature object which ignores Parent
+ * attribute in equality test; this avoids creating many
+ * otherwise duplicate exon features on genomic sequence
*/
- SequenceFeature[] sfs = ms.getSequenceFeatures();
- if (sfs != null)
+ SequenceFeature newFeature = new SequenceFeature(
+ feat)
{
- for (SequenceFeature feat : sfs)
+ @Override
+ public boolean equals(Object o)
{
- /*
- * make a flyweight feature object which ignores Parent
- * attribute in equality test; this avoids creating many
- * otherwise duplicate exon features on genomic sequence
- */
- SequenceFeature newFeature = new SequenceFeature(
- feat)
- {
- @Override
- public boolean equals(Object o)
- {
- return super.equals(o, true);
- }
- };
- dss.addSequenceFeature(newFeature);
+ return super.equals(o, true);
}
- }
+ };
+ matched.addSequenceFeature(newFeature);
}
- cf.addMap(retrievedDss, map.getTo(), map.getMap());
- } catch (Exception e)
- {
- System.err
- .println("Exception when consolidating Mapped sequence set...");
- e.printStackTrace(System.err);
}
+
}
+ cf.addMap(retrievedSequence, map.getTo(), map.getMap());
+ } catch (Exception e)
+ {
+ System.err
+ .println("Exception when consolidating Mapped sequence set...");
+ e.printStackTrace(System.err);
}
}
}
- retrievedSequence.updatePDBIds();
- rseqs.add(retrievedDss);
- if (dataset.findIndex(retrievedDss) == -1)
- {
- dataset.addSequence(retrievedDss);
- matcher.add(retrievedDss);
- }
}
}
+ if (imported)
+ {
+ retrievedSequence.updatePDBIds();
+ rseqs.add(retrievedSequence);
+ if (dataset.findIndex(retrievedSequence) == -1)
+ {
+ dataset.addSequence(retrievedSequence);
+ matcher.add(retrievedSequence);
+ }
+ }
+ return imported;
}
/**
* Sets the inverse sequence mapping in the corresponding dbref of the mapped
}
/**
- * Returns the first identical sequence in the dataset if any, else null
+ * Returns null or the first sequence in the dataset which is identical to
+ * xref.mapTo, and has a) a primary dbref matching xref, or if none found, the
+ * first one with an ID source|xrefacc
*
* @param xref
+ * with map and mapped-to sequence
* @return
*/
SequenceI findInDataset(DBRefEntry xref)
String name2 = xref.getSource() + "|" + name;
SequenceI dss = mapsTo.getDatasetSequence() == null ? mapsTo : mapsTo
.getDatasetSequence();
+ // first check ds if ds is directly referenced
+ if (dataset.findIndex(dss) > -1)
+ {
+ return dss;
+ }
+ DBRefEntry template = new DBRefEntry(xref.getSource(), null,
+ xref.getAccessionId());
+ /**
+ * remember the first ID match - in case we don't find a match to template
+ */
+ SequenceI firstIdMatch = null;
for (SequenceI seq : dataset.getSequences())
{
+ // first check primary refs.
+ List<DBRefEntry> match = DBRefUtils.searchRefs(seq.getPrimaryDBRefs()
+ .toArray(new DBRefEntry[0]), template);
+ if (match != null && match.size() == 1 && sameSequence(seq, dss))
+ {
+ return seq;
+ }
/*
* clumsy alternative to using SequenceIdMatcher which currently
* returns sequences with a dbref to the matched accession id
* which we don't want
*/
- if (name.equals(seq.getName()) || seq.getName().startsWith(name2))
+ if (firstIdMatch == null
+ && (name.equals(seq.getName()) || seq.getName().startsWith(
+ name2)))
{
if (sameSequence(seq, dss))
{
- return seq;
+ firstIdMatch = seq;
}
}
}
- return null;
+ return firstIdMatch;
}
/**
import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping;
import jalview.io.FastaFile;
import jalview.util.Comparison;
+import jalview.util.LinkedIdentityHashSet;
import jalview.util.MessageManager;
import java.util.ArrayList;
private void resolveAndAddDatasetSeq(SequenceI currentSeq,
Set<SequenceI> seqs, boolean createDatasetSequence)
{
+ SequenceI alignedSeq = currentSeq;
if (currentSeq.getDatasetSequence() != null)
{
currentSeq = currentSeq.getDatasetSequence();
{
if (dbr.getMap() != null && dbr.getMap().getTo() != null)
{
+ if (dbr.getMap().getTo() == alignedSeq)
+ {
+ /*
+ * update mapping to be to the newly created dataset sequence
+ */
+ dbr.getMap().setTo(currentSeq);
+ }
if (dbr.getMap().getTo().getDatasetSequence() != null)
{
- throw new Error("Implementation error: Map.getTo() for dbref"
- + dbr + " is not a dataset sequence.");
- // TODO: if this happens, could also rewrite the reference to
- // point to new dataset sequence
+ throw new Error(
+ "Implementation error: Map.getTo() for dbref " + dbr
+ + " from " + curDs.getName()
+ + " is not a dataset sequence.");
}
// we recurse to add all forward references to dataset sequences via
// DBRefs/etc
return;
}
// try to avoid using SequenceI.equals at this stage, it will be expensive
- Set<SequenceI> seqs = new jalview.util.LinkedIdentityHashSet<SequenceI>();
+ Set<SequenceI> seqs = new LinkedIdentityHashSet<SequenceI>();
for (int i = 0; i < getHeight(); i++)
{
import jalview.analysis.AlignSeq;
import jalview.api.DBRefEntryI;
+import jalview.util.Comparison;
import jalview.util.DBRefUtils;
import jalview.util.MapList;
import jalview.util.StringUtils;
@Override
public void setDBRefs(DBRefEntry[] dbref)
{
+ if (dbrefs == null && datasetSequence != null
+ && this != datasetSequence)
+ {
+ datasetSequence.setDBRefs(dbref);
+ return;
+ }
dbrefs = dbref;
+ if (dbrefs != null)
+ {
+ DBRefUtils.ensurePrimaries(this);
+ }
}
@Override
@Override
public void addDBRef(DBRefEntry entry)
{
- // TODO add to dataset sequence instead if there is one?
+ if (datasetSequence != null)
+ {
+ datasetSequence.addDBRef(entry);
+ return;
+ }
+
if (dbrefs == null)
{
dbrefs = new DBRefEntry[0];
temp[temp.length - 1] = entry;
dbrefs = temp;
+
+ DBRefUtils.ensurePrimaries(this);
}
@Override
public void setDatasetSequence(SequenceI seq)
{
- // TODO check for circular reference before setting?
+ if (seq == this)
+ {
+ throw new IllegalArgumentException(
+ "Implementation Error: self reference passed to SequenceI.setDatasetSequence");
+ }
+ if (seq != null && seq.getDatasetSequence() != null)
+ {
+ throw new IllegalArgumentException(
+ "Implementation error: cascading dataset sequences are not allowed.");
+ }
datasetSequence = seq;
}
private long _seqhash = 0;
+ /**
+ * Answers false if the sequence is more than 85% nucleotide (ACGTU), else
+ * true
+ */
@Override
public boolean isProtein()
{
if (_seqhash != sequence.hashCode())
{
_seqhash = sequence.hashCode();
- _isNa=jalview.util.Comparison.isNucleotide(new SequenceI[] { this });
+ _isNa = Comparison.isNucleotide(this);
}
return !_isNa;
};
public int[] findPositionMap();
/**
+ * Answers true if the sequence is composed of amino acid characters. Note
+ * that implementations may use heuristic methods which are not guaranteed to
+ * give the biologically 'right' answer.
*
- * @return true if sequence is composed of amino acid characters
+ * @return
*/
public boolean isProtein();
public void setVamsasId(String id);
+ /**
+ * set the array of Database references for the sequence.
+ *
+ * @param dbs
+ * @deprecated - use is discouraged since side-effects may occur if the
+ *             DBRefEntry set is not normalised.
+ */
+ @Deprecated
public void setDBRefs(DBRefEntry[] dbs);
public DBRefEntry[] getDBRefs();
URL url = getUrl(ids);
BufferedReader reader = getHttpResponse(url, ids);
+ if (reader == null)
+ {
+ // request failed
+ return null;
+ }
FileParse fp = new FileParse(reader, url.toString(), "HTTP_POST");
return fp;
}
writePostBody(connection, ids);
}
- InputStream response = connection.getInputStream();
int responseCode = connection.getResponseCode();
if (responseCode != 200)
* note: a GET request for an invalid id returns an error code e.g. 415
* but POST request returns 200 and an empty Fasta response
*/
- throw new IOException(
- "Response code was not 200. Detected response was "
- + responseCode);
+ System.err.println("Response code " + responseCode + " for " + url);
+ return null;
}
+ // get content
+ InputStream response = connection.getInputStream();
+
// System.out.println(getClass().getName() + " took "
// + (System.currentTimeMillis() - now) + "ms to fetch");
import jalview.analysis.AlignmentUtils;
import jalview.analysis.Dna;
+import jalview.bin.Cache;
import jalview.datamodel.Alignment;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.DBRefEntry;
+import jalview.datamodel.DBRefSource;
import jalview.datamodel.Mapping;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
+ " chunks. Unexpected problem (" + r.getLocalizedMessage()
+ ")";
System.err.println(msg);
+ r.printStackTrace();
break;
}
}
DBRefEntry dbr = new DBRefEntry(getDbSource(),
getEnsemblDataVersion(), proteinSeq.getName(), map);
querySeq.getDatasetSequence().addDBRef(dbr);
+ DBRefEntry[] uprots = DBRefUtils.selectRefs(ds.getDBRefs(),
+ new String[] { DBRefSource.UNIPROT });
+ DBRefEntry[] upxrefs = DBRefUtils.selectRefs(querySeq.getDBRefs(),
+ new String[] { DBRefSource.UNIPROT });
+ if (uprots != null)
+ {
+ for (DBRefEntry up : uprots)
+ {
+ // locate local uniprot ref and map
+ List<DBRefEntry> upx = DBRefUtils.searchRefs(upxrefs, up.getAccessionId());
+ DBRefEntry upxref;
+ if (upx.size() != 0)
+ {
+ upxref = upx.get(0);
+
+ if (upx.size() > 1)
+ {
+ Cache.log
+ .warn("Implementation issue - multiple uniprot acc on product sequence.");
+ }
+ }
+ else
+ {
+ upxref = new DBRefEntry(DBRefSource.UNIPROT,
+ getEnsemblDataVersion(), up.getAccessionId());
+ }
+
+ Mapping newMap = new Mapping(ds, mapList);
+ upxref.setVersion(getEnsemblDataVersion());
+ upxref.setMap(newMap);
+ if (upx.size() == 0)
+ {
+ // add the new uniprot ref
+ querySeq.getDatasetSequence().addDBRef(upxref);
+ }
+
+ }
+ }
/*
* copy exon features to protein, compute peptide variants from dna
throw new JalviewException("ENSEMBL Rest API not available.");
}
FileParse fp = getSequenceReader(ids);
+ if (fp == null)
+ {
+ return alignment;
+ }
+
FastaFile fr = new FastaFile(fp);
if (fr.hasWarningMessage())
{
case 502:
case 504:
case 505:
- message = MessageManager.getString("exception.fts_server_error");
+ message = MessageManager.formatMessage("exception.fts_server_error",
+ service);
break;
case 503:
message = MessageManager.getString("exception.service_not_available");
import jalview.datamodel.SequenceI;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
@Override
public boolean matches(DBRefEntry refa, DBRefEntry refb)
{
- if (refa.getSource() != null && refb.getSource() != null
+ if (refa.getSource() != null
+ && refb.getSource() != null
&& DBRefUtils.getCanonicalName(refb.getSource()).equals(
DBRefUtils.getCanonicalName(refa.getSource())))
{
@Override
public boolean matches(DBRefEntry refa, DBRefEntry refb)
{
- if (refa.getSource() != null && refb.getSource() != null
+ if (refa.getSource() != null
+ && refb.getSource() != null
&& DBRefUtils.getCanonicalName(refb.getSource()).equals(
DBRefUtils.getCanonicalName(refa.getSource())))
{
@Override
public boolean matches(DBRefEntry refa, DBRefEntry refb)
{
- if (refa.getSource() != null && refb.getSource() != null
+ if (refa.getSource() != null
+ && refb.getSource() != null
&& DBRefUtils.getCanonicalName(refb.getSource()).equals(
DBRefUtils.getCanonicalName(refa.getSource())))
{
@Override
public boolean matches(DBRefEntry refa, DBRefEntry refb)
{
- if (refa.getSource() != null && refb.getSource() != null
+ if (refa.getSource() != null
+ && refb.getSource() != null
&& DBRefUtils.getCanonicalName(refb.getSource()).equals(
DBRefUtils.getCanonicalName(refa.getSource())))
{
return matches;
}
+ /**
+ * promote direct database references to primary for nucleotide or protein
+ * sequences if they have an appropriate primary ref
+ * <table>
+ * <tr>
+ * <th>Seq Type</th>
+ * <th>Primary DB</th>
+ * <th>Direct which will be promoted</th>
+ * </tr>
+ * <tr align=center>
+ * <td>peptides</td>
+ * <td>Uniprot</td>
+ * <td>Ensembl</td>
+ * </tr>
+ * <tr align=center>
+ * <td>peptides</td>
+ * <td>Ensembl</td>
+ * <td>Uniprot</td>
+ * </tr>
+ * <tr align=center>
+ * <td>dna</td>
+ * <td>Ensembl</td>
+ * <td>ENA</td>
+ * </tr>
+ * </table>
+ *
+ * @param sequence
+ */
+ public static void ensurePrimaries(SequenceI sequence)
+ {
+ List<DBRefEntry> pr = sequence.getPrimaryDBRefs();
+ if (pr.size() == 0)
+ {
+ // nothing to do
+ return;
+ }
+ List<DBRefEntry> selfs = new ArrayList<DBRefEntry>();
+ {
+ DBRefEntry[] selfArray = selectDbRefs(!sequence.isProtein(),
+ sequence.getDBRefs());
+ if (selfArray == null || selfArray.length == 0)
+ {
+ // nothing to do
+ return;
+ }
+ selfs.addAll(Arrays.asList(selfArray));
+ }
+
+ // filter non-primary refs
+ for (DBRefEntry p : pr)
+ {
+ while (selfs.contains(p))
+ {
+ selfs.remove(p);
+ }
+ }
+ List<DBRefEntry> toPromote = new ArrayList<DBRefEntry>();
+
+ for (DBRefEntry p : pr)
+ {
+ List<String> promType = new ArrayList<String>();
+ if (sequence.isProtein())
+ {
+ switch (getCanonicalName(p.getSource()))
+ {
+ case DBRefSource.UNIPROT:
+ // case DBRefSource.UNIPROTKB:
+ // case DBRefSource.UP_NAME:
+ // search for and promote ensembl
+ promType.add(DBRefSource.ENSEMBL);
+ break;
+ case DBRefSource.ENSEMBL:
+ // search for and promote Uniprot
+ promType.add(DBRefSource.UNIPROT);
+ break;
+ }
+ }
+ else
+ {
+ // TODO: promote transcript refs
+ }
+
+ // collate candidates and promote them
+ DBRefEntry[] candidates = selectRefs(
+ selfs.toArray(new DBRefEntry[0]),
+ promType.toArray(new String[0]));
+ if (candidates != null)
+ {
+ for (DBRefEntry cand : candidates)
+ {
+ if (cand.hasMap())
+ {
+ if (cand.getMap().getTo() != null
+ && cand.getMap().getTo() != sequence)
+ {
+ // can't promote refs with mappings to other sequences
+ continue;
+ }
+ if (cand.getMap().getMap().getFromLowest() != sequence
+ .getStart()
+ && cand.getMap().getMap().getFromHighest() != sequence
+ .getEnd())
+ {
+ // can't promote refs with mappings from a region of this sequence
+ // - eg CDS
+ continue;
+ }
+ }
+ // and promote
+ cand.setVersion(p.getVersion() + " (promoted)");
+ selfs.remove(cand);
+ toPromote.add(cand);
+ if (!cand.isPrimaryCandidate())
+ {
+ System.out.println("Warning: Couldn't promote dbref "
+ + cand.toString() + " for sequence "
+ + sequence.toString());
+ }
+ }
+ }
+ }
+ }
+
}
{
onlyPdbEntries.addElement(pdb);
}
+ if ("EMBL".equals(pdb.getType()))
+ {
+ // look for a CDS reference and add it, too.
+ String cdsId = (String) pdb.getProperty()
+ .get("protein sequence ID");
+ if (cdsId != null && cdsId.trim().length() > 0)
+ {
+ // remove version
+ String[] vrs = cdsId.split("\\.");
+ dbr = new DBRefEntry(DBRefSource.EMBLCDS, vrs.length > 1 ? vrs[1]
+ : DBRefSource.UNIPROT + ":" + dbVersion, vrs[0]);
+ dbRefs.add(dbr);
+ }
+ }
+ if ("Ensembl".equals(pdb.getType()))
+ {
+ /* UniprotXML
+ * <dbReference type="Ensembl" id="ENST00000321556">
+ * <molecule id="Q9BXM7-1"/>
+ * <property type="protein sequence ID" value="ENSP00000364204"/>
+ * <property type="gene ID" value="ENSG00000158828"/>
+ * </dbReference>
+ */
+ String cdsId = (String) pdb.getProperty()
+ .get("protein sequence ID");
+ if (cdsId != null && cdsId.trim().length() > 0)
+ {
+ dbr = new DBRefEntry(DBRefSource.ENSEMBL, DBRefSource.UNIPROT
+ + ":" + dbVersion, cdsId.trim());
+ dbRefs.add(dbr);
+
+ }
+ }
+
}
sequence.setPDBId(onlyPdbEntries);
sequence.addSequenceFeature(sf);
}
}
- sequence.setDBRefs(dbRefs.toArray(new DBRefEntry[0]));
+ for (DBRefEntry dbr : dbRefs)
+ {
+ sequence.addDBRef(dbr);
+ }
return sequence;
}
seq.addDBRef(new DBRefEntry("ENSEMBLGENOMES", "0", "E2350"));
sources = new CrossRef(new SequenceI[] { seq }, al)
.findXrefSourcesForSequences(false);
- assertEquals(4, sources.size());
- assertEquals("[EMBL, EMBLCDS, GENEDB, ENSEMBL]", sources.toString());
+ // method is patched to remove EMBL from the sources to match
+ assertEquals(3, sources.size());
+ assertEquals("[EMBLCDS, GENEDB, ENSEMBL]", sources.toString());
/*
* add a sequence to the alignment which has a dbref to UNIPROT|A1234
al.addSequence(seq2);
sources = new CrossRef(new SequenceI[] { seq, seq2 }, al)
.findXrefSourcesForSequences(false);
- assertEquals(3, sources.size());
- assertEquals("[EMBLCDS, EMBL, GENEDB]", sources.toString());
+ // method removed EMBL from sources to match
+ assertEquals(2, sources.size());
+ assertEquals("[EMBLCDS, GENEDB]", sources.toString());
}
/**
public void testFindXrefSequences_withFetch()
{
SequenceI dna1 = new Sequence("AF039662", "GGGGCAGCACAAGAAC");
- dna1.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
- dna1.addDBRef(new DBRefEntry("UNIPROT", "0", "P30419"));
- dna1.addDBRef(new DBRefEntry("UNIPROT", "0", "P00314"));
+ dna1.addDBRef(new DBRefEntry("UNIPROT", "ENA:0", "Q9ZTS2"));
+ dna1.addDBRef(new DBRefEntry("UNIPROT", "ENA:0", "P30419"));
+ dna1.addDBRef(new DBRefEntry("UNIPROT", "ENA:0", "P00314"));
final SequenceI pep1 = new Sequence("Q9ZTS2", "MYQLIRSSW");
+ pep1.addDBRef(new DBRefEntry("UNIPROT", "0", "Q9ZTS2"));
+
final SequenceI pep2 = new Sequence("P00314", "MRKLLAASG");
+ pep2.addDBRef(new DBRefEntry("UNIPROT", "0", "P00314"));
/*
* argument false suppresses adding DAS sources
* 'spliced transcript' with CDS ranges
*/
SequenceI braf002 = new Sequence("ENST00000497784", "gCAGGCtaTCTGTTCaa");
- braf002.addDBRef(new DBRefEntry("UNIPROT", "0", "H7C5K3"));
+ braf002.addDBRef(new DBRefEntry("UNIPROT", "ENSEMBL|0", "H7C5K3"));
braf002.addSequenceFeature(new SequenceFeature("CDS", "", 2, 6, 0f,
null));
braf002.addSequenceFeature(new SequenceFeature("CDS", "", 9, 15, 0f,
* which happens to be true for Uniprot,PDB,EMBL but not Pfam,Rfam,Ensembl
*/
final SequenceI pep1 = new Sequence("UNIPROT|P15056", "MAAL");
+ pep1.addDBRef(new DBRefEntry("UNIPROT", "0", "P15056"));
final SequenceI pep2 = new Sequence("UNIPROT|H7C5K3", "QALF");
-
+ pep2.addDBRef(new DBRefEntry("UNIPROT", "0", "H7C5K3"));
/*
* argument false suppresses adding DAS sources
* todo: define an interface type SequenceFetcherI and mock that
*/
final SequenceI x07547 = new Sequence("EMBL|X07547", "cccAAACCCTTTGGG");
DBRefEntry dbref7 = new DBRefEntry("UNIPROT", "0", "P0CE20");
- dbref7.setMap(new Mapping(new Sequence("UNIPROT|P0CE19", "KPFG"),
+ dbref7.setMap(new Mapping(new Sequence("UNIPROT|P0CE20", "PFGK"),
new MapList(map2)));
x07547.addDBRef(dbref7);
DBRefEntry dbref8 = new DBRefEntry("UNIPROT", "0", "B0BCM4");
public void testReplace()
{
// seem to need a dataset sequence on the edited sequence here
- seqs[1].setDatasetSequence(seqs[1]);
+ seqs[1].createDatasetSequence();
new EditCommand("", Action.REPLACE, "ZXY", new SequenceI[] { seqs[1] },
4, 8, al);
assertEquals("abcdefghjk", seqs[0].getSequenceAsString());
assertEquals(23, startEnd[1]);
}
+ /**
+ * Tests that dbrefs with mappings to sequence get updated if the sequence
+ * acquires a dataset sequence
+ */
+ @Test(groups = "Functional")
+ public void testCreateDataset_updateDbrefMappings()
+ {
+ SequenceI pep = new Sequence("pep", "ASD");
+ SequenceI dna = new Sequence("dna", "aaaGCCTCGGATggg");
+ SequenceI cds = new Sequence("cds", "GCCTCGGAT");
+
+ // add dbref from dna to peptide
+ DBRefEntry dbr = new DBRefEntry("UNIPROT", "", "pep");
+ dbr.setMap(new Mapping(pep, new MapList(new int[] { 4, 15 }, new int[] {
+ 1, 4 }, 3, 1)));
+ dna.addDBRef(dbr);
+
+ // add dbref from dna to peptide
+ DBRefEntry dbr2 = new DBRefEntry("UNIPROT", "", "pep");
+ dbr2.setMap(new Mapping(pep, new MapList(new int[] { 1, 12 }, new int[]
+ { 1, 4 }, 3, 1)));
+ cds.addDBRef(dbr2);
+
+ // add dbref from peptide to dna
+ DBRefEntry dbr3 = new DBRefEntry("EMBL", "", "dna");
+ dbr3.setMap(new Mapping(dna, new MapList(new int[] { 1, 4 }, new int[] {
+ 4, 15 }, 1, 3)));
+ pep.addDBRef(dbr3);
+
+ // add dbref from peptide to cds
+ DBRefEntry dbr4 = new DBRefEntry("EMBLCDS", "", "cds");
+ dbr4.setMap(new Mapping(cds, new MapList(new int[] { 1, 4 }, new int[] {
+ 1, 12 }, 1, 3)));
+ pep.addDBRef(dbr4);
+
+ AlignmentI protein = new Alignment(new SequenceI[] { pep });
+
+ /*
+ * create the alignment dataset
+ */
+ ((Alignment) protein).createDatasetAlignment();
+
+ AlignmentI ds = protein.getDataset();
+
+ // should be 3 sequences in dataset
+ assertEquals(3, ds.getHeight());
+ assertTrue(ds.getSequences().contains(pep.getDatasetSequence()));
+ assertTrue(ds.getSequences().contains(dna));
+ assertTrue(ds.getSequences().contains(cds));
+
+ /*
+ * verify that the dbref mappings on dna and cds now target the
+ * peptide dataset sequence
+ */
+ DBRefEntry[] dbRefs = pep.getDBRefs();
+ assertEquals(2, dbRefs.length);
+ assertSame(dna, dbRefs[0].map.to);
+ assertSame(cds, dbRefs[1].map.to);
+ assertEquals(1, dna.getDBRefs().length);
+ assertSame(pep.getDatasetSequence(), dna.getDBRefs()[0].map.to);
+ assertEquals(1, cds.getDBRefs().length);
+ assertSame(pep.getDatasetSequence(), cds.getDBRefs()[0].map.to);
+ }
+
}
* is there a usecase for this ? setDatasetSequence should throw an error if
* this actually occurs.
*/
- sq.getDatasetSequence().setDatasetSequence(sq); // loop!
+ try
+ {
+ sq.getDatasetSequence().setDatasetSequence(sq); // loop!
+ Assert.fail("Expected Error to be raised when calling setDatasetSequence with self reference");
+ } catch (IllegalArgumentException e)
+ {
+ // TODO Jalview error/exception class for raising implementation errors
+ assertTrue(e.getMessage().toLowerCase()
+ .contains("implementation error"));
+ }
assertNull(sq.getSequenceFeatures());
}
sq.addPDBId(new PDBEntry("2PDB", "A", Type.MMCIF, "filePath/test2"));
sq.addPDBId(new PDBEntry("2PDB", "B", Type.MMCIF, "filePath/test2"));
+ // these are the same as ones already added
DBRefEntry pdb1pdb = new DBRefEntry("PDB", "version1", "1PDB");
- DBRefEntry pdb2pdb = new DBRefEntry("PDB", "version1", "2PDB");
+ DBRefEntry pdb2pdb = new DBRefEntry("PDB", "version2", "2PDB");
List<DBRefEntry> primRefs = Arrays.asList(new DBRefEntry[] { pdb1pdb,
pdb2pdb });
- sq.getDatasetSequence().addDBRef(pdb1pdb);
- sq.getDatasetSequence().addDBRef(pdb2pdb);
+ sq.getDatasetSequence().addDBRef(pdb1pdb); // should do nothing
+ sq.getDatasetSequence().addDBRef(pdb2pdb); // should do nothing
sq.getDatasetSequence().addDBRef(
- new DBRefEntry("PDB", "version3", "3PDB"));
+ new DBRefEntry("PDB", "version3", "3PDB")); // should do nothing
sq.getDatasetSequence().addDBRef(
- new DBRefEntry("PDB", "version4", "4PDB"));
+ new DBRefEntry("PDB", "version4", "4PDB")); // should do nothing
PDBEntry pdbe1a=new PDBEntry("1PDB", "A", Type.PDB, "filePath/test1");
PDBEntry pdbe1b = new PDBEntry("1PDB", "B", Type.PDB, "filePath/test1");
new AlignmentAnnotation("Test annot", "Test annot description",
annots));
Assert.assertEquals(sq.getDescription(), "Test sequence description..");
- Assert.assertEquals(sq.getDBRefs().length, 5);
+ Assert.assertEquals(sq.getDBRefs().length, 5); // DBRefs are on dataset
+ // sequence
Assert.assertEquals(sq.getAllPDBEntries().size(), 4);
Assert.assertNotNull(sq.getAnnotation());
Assert.assertEquals(sq.getAnnotation()[0].annotations.length, 2);
- Assert.assertEquals(sq.getDatasetSequence().getDBRefs().length, 4);
+ Assert.assertEquals(sq.getDatasetSequence().getDBRefs().length, 5); // same
+ // as
+ // sq.getDBRefs()
Assert.assertEquals(sq.getDatasetSequence().getAllPDBEntries().size(),
4);
Assert.assertNotNull(sq.getDatasetSequence().getAnnotation());
Assert.assertEquals(derived.getDescription(),
"Test sequence description..");
- Assert.assertEquals(derived.getDBRefs().length, 4); // come from dataset
+ Assert.assertEquals(derived.getDBRefs().length, 5); // come from dataset
Assert.assertEquals(derived.getAllPDBEntries().size(), 4);
Assert.assertNotNull(derived.getAnnotation());
Assert.assertEquals(derived.getAnnotation()[0].annotations.length, 2);
- Assert.assertEquals(derived.getDatasetSequence().getDBRefs().length, 4);
+ Assert.assertEquals(derived.getDatasetSequence().getDBRefs().length, 5);
Assert.assertEquals(derived.getDatasetSequence().getAllPDBEntries()
.size(), 4);
Assert.assertNotNull(derived.getDatasetSequence().getAnnotation());
assertEquals(4, seq.getAllPDBEntries().size());
assertSame(pdbe5, seq.getAllPDBEntries().get(3));
}
+
+ @Test(
+ groups = { "Functional" },
+ expectedExceptions = { IllegalArgumentException.class })
+ public void testSetDatasetSequence_toSelf()
+ {
+ seq.setDatasetSequence(seq);
+ }
+
+ @Test(
+ groups = { "Functional" },
+ expectedExceptions = { IllegalArgumentException.class })
+ public void testSetDatasetSequence_cascading()
+ {
+ SequenceI seq2 = new Sequence("Seq2", "xyz");
+ seq2.createDatasetSequence();
+ seq.setDatasetSequence(seq2);
+ }
}
{
seq = new Sequence("PDB|4kqy|4KQY|A", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", 1,
26);
- seq.setDatasetSequence(seq);
+ seq.createDatasetSequence();
for (int x = 1; x < 5; x++)
{
DBRefEntry dbRef = new DBRefEntry();
// make sure dataset is initialised ? not sure about this
for (int i = 0; i < al.getSequencesArray().length; ++i)
{
- al.getSequenceAt(i).setDatasetSequence(
- al.getSequenceAt(i).createDatasetSequence());
+ al.getSequenceAt(i).createDatasetSequence();
}
assertNotNull("Couldn't read supplied alignment data.", al);
return al;
for (Sequence seq : seqs)
{
- seq.setDatasetSequence(seq);
+ seq.createDatasetSequence();
expectedSeqs.put(seq.getName(), seq);
}
// make sure dataset is initialised ? not sure about this
for (int i = 0; i < al.getSequencesArray().length; ++i)
{
- al.getSequenceAt(i).setDatasetSequence(al.getSequenceAt(i));
+ al.getSequenceAt(i).createDatasetSequence();
}
String outputfile = rf.formatSequences(ioformat, al, true);
System.out.println("Output file in '" + ioformat + "':\n"
package jalview.ws.dbsources;
import static org.testng.AssertJUnit.assertEquals;
+import static org.testng.AssertJUnit.assertNotNull;
import static org.testng.AssertJUnit.assertNull;
import jalview.datamodel.PDBEntry;
import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
import jalview.datamodel.UniprotEntry;
import java.io.Reader;
+ "<protein><recommendedName><fullName>Mitogen-activated protein kinase 13</fullName><fullName>Henry</fullName></recommendedName></protein>"
+ "<dbReference type=\"PDB\" id=\"2FSQ\"><property type=\"method\" value=\"X-ray\"/><property type=\"resolution\" value=\"1.40\"/></dbReference>"
+ "<dbReference type=\"PDBsum\" id=\"2FSR\"/>"
+ + "<dbReference type=\"EMBL\" id=\"AE007869\"><property type=\"protein sequence ID\" value=\"AAK85932.1\"/><property type=\"molecule type\" value=\"Genomic_DNA\"/></dbReference>"
+ "<feature type=\"signal peptide\" evidence=\"7\"><location><begin position=\"1\"/><end position=\"18\"/></location></feature>"
+ "<feature type=\"propeptide\" description=\"Activation peptide\" id=\"PRO_0000027399\" evidence=\"9 16 17 18\"><location><begin position=\"19\"/><end position=\"20\"/></location></feature>"
+ "<feature type=\"chain\" description=\"Granzyme B\" id=\"PRO_0000027400\"><location><begin position=\"21\"/><end position=\"247\"/></location></feature>"
* Check cross-references
*/
Vector<PDBEntry> xrefs = entry.getDbReference();
- assertEquals(2, xrefs.size());
+ assertEquals(3, xrefs.size());
PDBEntry xref = xrefs.get(0);
assertEquals("2FSQ", xref.getId());
assertEquals("2FSR", xref.getId());
assertEquals("PDBsum", xref.getType());
assertNull(xref.getProperty());
+
+ xref = xrefs.get(2);
+ assertEquals("AE007869", xref.getId());
+ assertEquals("EMBL", xref.getType());
+ assertNotNull(xref.getProperty());
+ assertEquals("AAK85932.1",
+ (String) xref.getProperty().get("protein sequence ID"));
+ assertEquals("Genomic_DNA",
+ (String) xref.getProperty().get("molecule type"));
+ assertEquals(2, xref.getProperty().size());
+
}
+ @Test(groups = { "Functional" })
+ public void testGetUniprotSequence()
+ {
+ UniprotEntry entry = new Uniprot().getUniprotEntries(
+ new StringReader(UNIPROT_XML)).get(0);
+ SequenceI seq = new Uniprot().uniprotEntryToSequenceI(entry);
+ assertNotNull(seq);
+ assertEquals(6, seq.getDBRefs().length); // 2*Uniprot, PDB, PDBsum, 2*EMBL
+
+ }
/**
* Test the method that formats the sequence id
*/
sfs[0].getType()));
assertEquals(embl.getDbSource(), sfs[0].getFeatureGroup());
DBRefEntry[] dr = DBRefUtils.selectRefs(seq.getDBRefs(),
- new String[] { DBRefSource.UNIPROT, DBRefSource.UNIPROTKB,
- DBRefSource.EMBLCDSProduct, DBRefSource.ENSEMBL });
+ new String[] { DBRefSource.UNIPROT });
assertNotNull(dr);
assertEquals("Expected a single Uniprot cross reference", 1, dr.length);
assertEquals("Expected cross reference map to be one amino acid", dr[0]
package jalview.ws.sifts;
import jalview.api.DBRefEntryI;
+import jalview.bin.Cache;
import jalview.datamodel.DBRefEntry;
import jalview.datamodel.DBRefSource;
import jalview.datamodel.Sequence;
@BeforeTest(alwaysRun = true)
public void setUpSiftsClient() throws SiftsException
{
+ // read test props before manipulating config
+ Cache.loadProperties("test/jalview/io/testProps.jvprops");
// SIFTs entries are updated weekly - so use saved SIFTs file to enforce
// test reproducibility
new SiftsSettings();