/*
* first find seq's xrefs (dna-to-peptide or peptide-to-dna)
*/
- DBRefEntry[] rfs = DBRefUtils.selectDbRefs(!fromDna, seq.getDBRefs());
+ List<DBRefEntry> rfs = DBRefUtils.selectDbRefs(!fromDna,
+ seq.getDBRefs());
addXrefsToSources(rfs, sources);
if (dataset != null)
{
/*
* find sequence's direct (dna-to-dna, peptide-to-peptide) xrefs
*/
- DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(fromDna, seq.getDBRefs());
+ List<DBRefEntry> lrfs = DBRefUtils.selectDbRefs(fromDna,
+ seq.getDBRefs());
List<SequenceI> foundSeqs = new ArrayList<>();
/*
*/
for (SequenceI rs : foundSeqs)
{
- DBRefEntry[] xrs = DBRefUtils.selectDbRefs(!fromDna,
+ List<DBRefEntry> xrs = DBRefUtils.selectDbRefs(!fromDna,
rs.getDBRefs());
addXrefsToSources(xrs, sources);
}
* @param xrefs
* @param sources
*/
- void addXrefsToSources(DBRefEntry[] xrefs, List<String> sources)
+ void addXrefsToSources(List<DBRefEntry> xrefs, List<String> sources)
{
if (xrefs != null)
{
dss = dss.getDatasetSequence();
}
boolean found = false;
- DBRefEntry[] xrfs = DBRefUtils.selectDbRefs(!fromDna,
+ List<DBRefEntry> xrfs = DBRefUtils.selectDbRefs(!fromDna,
dss.getDBRefs());
// ENST & ENSP comes in to both Protein and nucleotide, so we need to
// filter them
// out later.
- if ((xrfs == null || xrfs.length == 0) && dataset != null)
+ if ((xrfs == null || xrfs.size() == 0) && dataset != null)
{
/*
* found no suitable dbrefs on sequence - look for sequences in the
* alignment which share a dbref with this one
*/
- DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(fromDna,
+ List<DBRefEntry> lrfs = DBRefUtils.selectDbRefs(fromDna,
seq.getDBRefs());
/*
{
// do a bit more work - search for sequences with references matching
// xrefs on this sequence.
- found = searchDataset(fromDna, dss, xref, rseqs, cf, false);
+ found = searchDataset(fromDna, dss, xref, rseqs, cf, false,
+ DBRefUtils.SEARCH_MODE_FULL);
}
if (found)
{
}
private void retrieveCrossRef(List<DBRefEntry> sourceRefs, SequenceI seq,
- DBRefEntry[] xrfs, boolean fromDna, AlignedCodonFrame cf)
+ List<DBRefEntry> xrfs, boolean fromDna, AlignedCodonFrame cf)
{
ASequenceFetcher sftch = SequenceFetcherFactory.getSequenceFetcher();
SequenceI[] retrieved = null;
addedXref |= importCrossRefSeq(cf, newDsSeqs, doNotAdd, dss,
retrievedDss);
}
+ // JBPNote: What assumptions are made for dbref structures on
+ // retrieved sequences ?
+ // addedXref being true means that importCrossRefSeq found
+ // sequences with dbrefs with mappings to sequences congruent with dss
+
if (!addedXref)
{
// try again, after looking for matching IDs
private void removeAlreadyRetrievedSeqs(List<DBRefEntry> sourceRefs,
boolean fromDna)
{
- DBRefEntry[] dbrSourceSet = sourceRefs.toArray(new DBRefEntry[0]);
- for (SequenceI sq : dataset.getSequences())
+ List<DBRefEntry> dbrSourceSet = new ArrayList<>(sourceRefs);
+ List<SequenceI> dsSeqs = dataset.getSequences();
+ for (int ids = 0, nds = dsSeqs.size(); ids < nds; ids++)
{
+ SequenceI sq = dsSeqs.get(ids);
boolean dupeFound = false;
// !fromDna means we are looking only for nucleotide sequences, not
// protein
if (sq.isProtein() == fromDna)
{
- for (DBRefEntry dbr : sq.getPrimaryDBRefs())
+ List<DBRefEntry> sqdbrefs = sq.getPrimaryDBRefs();
+ for (int idb = 0, ndb = sqdbrefs.size(); idb < ndb; idb++)
{
- for (DBRefEntry found : DBRefUtils.searchRefs(dbrSourceSet, dbr))
+ DBRefEntry dbr = sqdbrefs.get(idb);
+ List<DBRefEntry> searchrefs = DBRefUtils.searchRefs(dbrSourceSet,
+ dbr, DBRefUtils.SEARCH_MODE_FULL);
+ for (int isr = 0, nsr = searchrefs.size(); isr < nsr; isr++)
{
- sourceRefs.remove(found);
+ sourceRefs.remove(searchrefs.get(isr));
dupeFound = true;
}
}
if (dupeFound)
{
// rebuild the search array from the filtered sourceRefs list
- dbrSourceSet = sourceRefs.toArray(new DBRefEntry[0]);
+ dbrSourceSet.clear();
+ dbrSourceSet.addAll(sourceRefs);
}
}
}
/**
* process sequence retrieved via a dbref on source sequence to resolve and
- * transfer data
+ * transfer data. JBPNote: as of 2022-02-03 - this assumes retrievedSequence
+ * has dbRefs with Mapping references to a sequence congruent with
+ * sourceSequence
*
* @param cf
* @param sourceSequence
* sourceSequence
*/
boolean imported = false;
- DBRefEntry[] dbr = retrievedSequence.getDBRefs();
+ List<DBRefEntry> dbr = retrievedSequence.getDBRefs();
if (dbr != null)
{
- for (DBRefEntry dbref : dbr)
+ for (int ib = 0, nb = dbr.size(); ib < nb; ib++)
{
+
+ DBRefEntry dbref = dbr.get(ib);
+ // matched will be null if the dbref has no map
SequenceI matched = findInDataset(dbref);
if (matched == sourceSequence)
{
Mapping map = dbref.getMap();
if (map != null)
{
- if (map.getTo() != null && map.getMap() != null)
+ SequenceI ms = map.getTo();
+ if (ms != null && map.getMap() != null)
{
- if (map.getTo() == sourceSequence)
+ if (ms == sourceSequence)
{
// already called to import once, and most likely this sequence
// already imported !
/*
* sequence is new to dataset, so save a reference so it can be added.
*/
- newDsSeqs.add(map.getTo());
+ newDsSeqs.add(ms);
continue;
}
{
// compare ms with dss and replace with dss in mapping
// if map is congruent
- SequenceI ms = map.getTo();
// TODO findInDataset requires exact sequence match but
// 'congruent' test is only for the mapped part
// maybe not a problem in practice since only ENA provide a
+ matched.getName();
System.out.println(msg);
- DBRefEntry[] toRefs = map.getTo().getDBRefs();
+ List<DBRefEntry> toRefs = map.getTo().getDBRefs();
if (toRefs != null)
{
/*
{
return;
}
- DBRefEntry[] dbrefs = mapTo.getDBRefs();
+ List<DBRefEntry> dbrefs = mapTo.getDBRefs();
if (dbrefs == null)
{
return;
/**
* Returns null or the first sequence in the dataset which is identical to
* xref.mapTo, and has a) a primary dbref matching xref, or if none found, the
- * first one with an ID source|xrefacc
+ * first one with an ID source|xrefacc. JBPNote: Could refactor this to
+ * AlignmentI/DatasetI
*
* @param xref
* with map and mapped-to sequence
for (SequenceI seq : dataset.getSequences())
{
// first check primary refs.
- List<DBRefEntry> match = DBRefUtils.searchRefs(
- seq.getPrimaryDBRefs().toArray(new DBRefEntry[0]), template);
+ List<DBRefEntry> match = DBRefUtils.searchRefs(seq.getPrimaryDBRefs(),
+ template, DBRefUtils.SEARCH_MODE_FULL);
if (match != null && match.size() == 1 && sameSequence(seq, dss))
{
return seq;
/**
* Updates any empty mappings in the cross-references with one to a compatible
* retrieved sequence if found, and adds any new mappings to the
- * AlignedCodonFrame
+ * AlignedCodonFrame. JBPNote: TODO: this relies on sequence IDs like
+ * UNIPROT|ACCESSION - which do not always happen.
*
* @param mapFrom
* @param xrefs
* @param retrieved
* @param acf
*/
- void updateDbrefMappings(SequenceI mapFrom, DBRefEntry[] xrefs,
+ void updateDbrefMappings(SequenceI mapFrom, List<DBRefEntry> xrefs,
SequenceI[] retrieved, AlignedCodonFrame acf, boolean fromDna)
{
SequenceIdMatcher idMatcher = new SequenceIdMatcher(retrieved);
* @return true if matches were found.
*/
private boolean searchDatasetXrefs(boolean fromDna, SequenceI sequenceI,
- DBRefEntry[] lrfs, List<SequenceI> foundSeqs,
+ List<DBRefEntry> lrfs, List<SequenceI> foundSeqs,
AlignedCodonFrame cf)
{
boolean found = false;
{
return false;
}
- for (int i = 0; i < lrfs.length; i++)
+ for (int i = 0, n = lrfs.size(); i < n; i++)
{
- DBRefEntry xref = new DBRefEntry(lrfs[i]);
- // add in wildcards
- xref.setVersion(null);
- xref.setMap(null);
- found |= searchDataset(fromDna, sequenceI, xref, foundSeqs, cf,
- false);
+ // DBRefEntry xref = new DBRefEntry(lrfs.get(i));
+ // // add in wildcards
+ // xref.setVersion(null);
+ // xref.setMap(null);
+ found |= searchDataset(fromDna, sequenceI, lrfs.get(i), foundSeqs, cf,
+ false, DBRefUtils.SEARCH_MODE_NO_MAP_NO_VERSION);
}
return found;
}
* sequenceI or all the returned sequences (eg a genomic reference
* associated with a locus and one or more transcripts)</li>
* </ul>
+ * @param mode
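+ * search mode passed through to DBRefUtils.searchRefs: SEARCH_MODE_FULL
+ * to match on the whole xrf, or SEARCH_MODE_NO_MAP_NO_VERSION to treat
+ * the xrf's map and version as wildcards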
* @return true if relationship found and sequence added.
*/
boolean searchDataset(boolean fromDna, SequenceI fromSeq, DBRefEntry xrf,
List<SequenceI> foundSeqs, AlignedCodonFrame mappings,
- boolean direct)
+ boolean direct, int mode)
{
boolean found = false;
if (dataset == null)
System.err.println("Empty dataset sequence set - NO VECTOR");
return false;
}
- List<SequenceI> ds;
- synchronized (ds = dataset.getSequences())
+ List<SequenceI> ds = dataset.getSequences();
+ synchronized (ds)
{
for (SequenceI nxt : ds)
{
}
// look for direct or indirect references in common
- DBRefEntry[] poss = nxt.getDBRefs();
+ List<DBRefEntry> poss = nxt.getDBRefs();
List<DBRefEntry> cands = null;
// todo: indirect specifies we select either direct references to nxt
// that match xrf which is indirect to sequenceI, or indirect
// references to nxt that match xrf which is direct to sequenceI
- cands = DBRefUtils.searchRefs(poss, xrf);
+ cands = DBRefUtils.searchRefs(poss, xrf, mode);
// else
// {
// poss = DBRefUtils.selectDbRefs(nxt.isProtein()!fromDna, poss);