package jalview.analysis;
import jalview.analysis.CrossRef.MySequenceFeature;
import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.Alignment;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.DBRefEntry;
import jalview.datamodel.Mapping;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.util.Comparison;
import jalview.util.DBRefUtils;
import jalview.util.MapList;
import jalview.ws.SequenceFetcherFactory;
import jalview.ws.seqfetcher.ASequenceFetcher;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
public class CrossRefs
{
/*
 * Wrapper sub-class of SequenceFeature used only for equality tests. Its
 * equals() hands the comparison to the wrapped feature's two-argument
 * equals, passing 'true' as the second argument; the intent (per the
 * original author) is to ignore the Parent attribute, so that CDS features
 * differing only in their parent Transcript ids are not treated as distinct.
 *
 * This is a non-static inner class, so it can only be instantiated through
 * an enclosing CrossRefs instance. hashCode() is not overridden, so
 * instances are only suitable for equals-based lookups (e.g. List.contains),
 * not for hash-based collections.
 */
class MySequenceFeature extends SequenceFeature
{
  private SequenceFeature wrapped;

  MySequenceFeature(SequenceFeature sf)
  {
    wrapped = sf;
  }

  @Override
  public boolean equals(Object other)
  {
    return wrapped.equals(other, true);
  }
}
/**
 * Finds cross-references for sequences from a specified source database.
 * These may be found in four ways:
 * <ul>
 * <li>as a DBRefEntry on the known sequence, which has a mapped-to sequence</li>
 * <li>a sequence of complementary type in the alignment dataset, which has a
 * DBRefEntry to one of the known sequence's 'direct' DBRefs</li>
 * <li>a sequence of complementary type in the alignment, which has a
 * DBRefEntry to one of the known sequence's 'cross-ref' DBRefs</li>
 * <li>by fetching the accession from the remote database</li>
 * </ul>
 * Only those members of {@code seqs} whose molecule type (as reported by
 * {@code Comparison.isNucleotide}) agrees with {@code dna} are searched; the
 * filtered list is then passed to the List-based overload of this method.
 *
 * @param seqs
 *          the sequences whose cross-references we are searching for (must
 *          not be null)
 * @param dna
 *          true if the sequences are from a nucleotide alignment, else false
 * @param source
 *          the database source we want cross-references to
 * @param dataset
 *          the alignment dataset the sequences belong to
 * @return an alignment containing cross-reference sequences, or null if none
 *         found
 */
public static AlignmentI findXrefSequences(SequenceI[] seqs, boolean dna,
        String source, AlignmentI dataset)
{
  /*
   * filter to only those sequences of the right type (nucleotide/protein);
   * a fresh list is built so the caller's array is never modified
   */
  List<SequenceI> fromSeqs = new ArrayList<SequenceI>();
  for (SequenceI seq : seqs)
  {
    if (Comparison.isNucleotide(seq) == dna)
    {
      fromSeqs.add(seq);
    }
  }
  return findXrefSequences(fromSeqs, dna, source, dataset);
}
/**
 * Finds cross-references for sequences from a specified source database.
 * These may be found in four ways:
 * <ul>
 * <li>as a DBRefEntry on the known sequence, which has a mapped-to sequence</li>
 * <li>a sequence of complementary type in the alignment dataset, which has a
 * DBRefEntry to one of the known sequence's 'direct' DBRefs</li>
 * <li>a sequence of complementary type in the alignment, which has a
 * DBRefEntry to one of the known sequence's 'cross-ref' DBRefs</li>
 * <li>by fetching the accession from the remote database</li>
 * </ul>
 * The three search stages run in a fixed order (mapped dbrefs, dataset
 * search, remote fetch), sharing the same result list, unresolved-reference
 * list and mapping set.
 *
 * @param fromSeqs
 *          the sequences whose cross-references we are searching for,
 *          filtered to only those which are of the type denoted by 'dna'.
 *          This list is modified: sequences resolved via a mapped dbref are
 *          removed from it, so pass a copy if the original must be preserved
 * @param dna
 *          true if the sequences are from a nucleotide alignment, else false
 * @param source
 *          the database source we want cross-references to
 * @param dataset
 *          the alignment dataset the sequences belong to; should not be null,
 *          since fetchCrossReferences dereferences it once sequences have
 *          been retrieved
 * @return an alignment containing cross-reference sequences, or null if none
 *         found
 */
static AlignmentI findXrefSequences(List<SequenceI> fromSeqs,
        boolean dna, String source, AlignmentI dataset)
{
  List<SequenceI> foundSeqs = new ArrayList<SequenceI>();
  AlignedCodonFrame mappings = new AlignedCodonFrame();
  List<DBRefEntry> unresolvedRefs = new ArrayList<DBRefEntry>();

  /*
   * first extract any mapped sequences from sourceRefs
   * if successful, sequence is removed from fromSeqs
   * if unsuccessful, dbrefs are added to unresolvedRefs
   */
  findMappedDbrefs(fromSeqs, source, foundSeqs, unresolvedRefs, mappings);

  /*
   * then search the alignment dataset for dbref resolutions
   */
  findIndirectCrossReferences(fromSeqs, source, dataset, foundSeqs,
          unresolvedRefs, mappings);

  /*
   * fetch any remaining sourceRefs from the source database
   */
  fetchCrossReferences(fromSeqs, unresolvedRefs, foundSeqs, mappings, dna,
          dataset);

  if (foundSeqs.isEmpty())
  {
    return null;
  }

  /*
   * the typed list lets toArray produce the SequenceI[] that the Alignment
   * constructor requires
   */
  SequenceI[] found = foundSeqs.toArray(new SequenceI[foundSeqs.size()]);
  AlignmentI crossRefs = new Alignment(found);
  crossRefs.addCodonFrame(mappings);
  return crossRefs;
}
/**
 * Looks for DBRefEntrys to 'source' which have a mapping to a sequence. If
 * found, adds a copy of the mapped-to sequence to foundSeqs and removes any
 * matching entry from the unresolved list. DBRefs with no mapping are added
 * to the 'unresolvedRefs' list (setting version number to 0 i.e. use source
 * and accession only).
 * <p>
 * A sequence is removed from fromSeqs as soon as any one of its dbrefs to
 * 'source' carries a mapping, including the case where that mapping has no
 * mapped-to sequence (in which case nothing is added to foundSeqs).
 *
 * @param fromSeqs
 *          the sequences we are searching from; resolved sequences are
 *          removed from this list
 * @param source
 *          the database source we are searching dbrefs for (must not be
 *          null)
 * @param foundSeqs
 *          a list of found sequences to add to
 * @param unresolvedRefs
 *          a list of unresolved cross-references to add to
 * @param mappings
 *          a set of sequence mappings to add to
 */
static void findMappedDbrefs(List<SequenceI> fromSeqs, String source,
        List<SequenceI> foundSeqs, List<DBRefEntry> unresolvedRefs,
        AlignedCodonFrame mappings)
{
  Iterator<SequenceI> it = fromSeqs.iterator();
  while (it.hasNext())
  {
    SequenceI seq = it.next();

    /*
     * mappings are recorded against the dataset sequence where there is
     * one, otherwise against the sequence itself
     */
    SequenceI dss = seq.getDatasetSequence();
    dss = dss == null ? seq : dss;

    DBRefEntry[] dbRefs = seq.getDBRefs();
    if (dbRefs == null)
    {
      continue;
    }

    boolean resolved = false;
    for (DBRefEntry dbref : dbRefs)
    {
      if (!source.equals(dbref.getSource()))
      {
        continue;
      }

      /*
       * version-less key (source + accession) used in unresolvedRefs
       */
      DBRefEntry todo = new DBRefEntry(dbref.getSource(), "0",
              dbref.getAccessionId());
      Mapping map = dbref.getMap();
      if (map == null)
      {
        /*
         * no mapping to resolve dbref - add source+accession to list to resolve
         */
        if (!unresolvedRefs.contains(todo))
        {
          unresolvedRefs.add(todo);
        }
        continue;
      }

      unresolvedRefs.remove(todo);
      resolved = true;
      SequenceI mappedTo = map.getTo();
      if (mappedTo == null)
      {
        continue;
      }
      foundSeqs.add(new Sequence(mappedTo));

      /*
       * check mapping is not 'direct' (it shouldn't be if we reach here)
       * and add mapping (dna-to-peptide or vice versa) to the set; a
       * from-ratio of 3 is taken to mean the map runs from dss (dna) to
       * the mapped sequence, otherwise the inverse map is stored
       */
      MapList mapList = map.getMap();
      int fromRatio = mapList.getFromRatio();
      int toRatio = mapList.getToRatio();
      if (fromRatio != toRatio)
      {
        if (fromRatio == 3)
        {
          mappings.addMap(dss, mappedTo, mapList);
        }
        else
        {
          mappings.addMap(mappedTo, dss, mapList.getInverse());
        }
      }
    }

    if (resolved)
    {
      // Iterator.remove avoids a ConcurrentModificationException here
      it.remove();
    }
  }
}
/**
* Fetches the given cross-references from their source database and adds
* every retrieved sequence to foundSeqs. Along the way it tries to
* consolidate the mappings held on the retrieved sequences' DBRefEntrys with
* sequences already known (from the dataset or from earlier retrieved
* mappings), and records mappings in the 'mappings' set.
*
* Steps, in order:
* - obtains a fetcher from SequenceFetcherFactory and calls
* getSequences(sourceRefs, !dna); if that throws, the problem is reported on
* stderr and the method returns; it also returns if the result is null
* - calls updateDbrefMappings with the retrieved sequences
* - for each DBRefEntry on each retrieved sequence whose map has both a
* mapped-to sequence and a MapList: re-points the map at an id-matching
* known sequence if there is one (else registers the mapped-to sequence
* with the matcher), then attempts the 'congruence' update described
* inline, and adds the map to 'mappings'
* - calls updatePDBIds() on each retrieved sequence and adds it to foundSeqs,
* whether or not any of its dbrefs had a map
*
* @param fromSeqs
* the sequences we are searching from. NOTE(review): declared as a
* List, but the body calls getSequenceAsString(), getName() and
* addSequenceFeature() on it and passes it to map.setTo() as if it
* were a single sequence - looks like a leftover from a
* single-sequence version of this method; confirm the intended type
* @param sourceRefs
* the cross-references to fetch
* @param foundSeqs
* result list to which each retrieved sequence is added
* @param mappings
* a set of sequence mappings to add to
* @param dna
* true if the sequences we are searching from are nucleotide; its
* negation is passed as the second argument of getSequences
* @param dataset
* the alignment dataset, used to seed the id matcher.
* NOTE(review): dereferenced without a null check, whereas
* searchDatasetForCrossReference tolerates a null dataset
*/
static void fetchCrossReferences(List fromSeqs,
List sourceRefs, List foundSeqs,
AlignedCodonFrame mappings, boolean dna, AlignmentI dataset)
{
ASequenceFetcher sftch = SequenceFetcherFactory.getSequenceFetcher();
SequenceI[] retrieved;
try
{
retrieved = sftch.getSequences(sourceRefs, !dna);
} catch (Exception e)
{
// best-effort: a failed fetch is reported and nothing is added
System.err.println("Problem whilst retrieving cross references: "
+ e.getMessage());
e.printStackTrace();
return;
}
if (retrieved == null)
{
return;
}
updateDbrefMappings(dna, fromSeqs, sourceRefs, retrieved, mappings);
// matcher starts with the dataset sequences and grows as new mapped-to
// sequences are encountered below
SequenceIdMatcher matcher = new SequenceIdMatcher(
dataset.getSequences());
// features already copied during this call (shared across all retrieved
// sequences)
List copiedFeatures = new ArrayList();
// MySequenceFeature is a non-static inner class, so an enclosing instance
// is needed to construct one
CrossRefs me = new CrossRefs();
for (int rs = 0; rs < retrieved.length; rs++)
{
// TODO: examine each sequence for 'redundancy'
DBRefEntry[] dbr = retrieved[rs].getDBRefs();
if (dbr != null && dbr.length > 0)
{
for (int di = 0; di < dbr.length; di++)
{
// find any entry where we should put in the sequence being
// cross-referenced into the map
Mapping map = dbr[di].getMap();
if (map != null)
{
if (map.getTo() != null && map.getMap() != null)
{
SequenceI matched = matcher.findIdMatch(map.getTo());
if (matched != null)
{
/*
* already got an xref to this sequence; update this
* map to point to the same sequence, and add
* any new dbrefs to it
*
* NOTE(review): getDBRefs() is iterated here without a null
* check, although it is treated as possibly null elsewhere in
* this class - confirm it cannot be null for a mapped-to
* sequence
*/
for (DBRefEntry ref : map.getTo().getDBRefs())
{
matched.addDBRef(ref); // add or update mapping
}
map.setTo(matched);
}
else
{
matcher.add(map.getTo());
}
try
{
// compare ms with dss and replace with dss in mapping
// if map is congruent
SequenceI ms = map.getTo();
// bounds of the mapped-to range, as given by the MapList
int sf = map.getMap().getToLowest();
int st = map.getMap().getToHighest();
SequenceI mappedrg = ms.getSubSequence(sf, st);
// SequenceI loc = dss.getSubSequence(sf, st);
// NOTE(review): the comparison below is of the whole sequence
// strings, not of the mapped sub-range (that stricter check is
// commented out); fromSeqs is used here as a single sequence
if (mappedrg.getLength() > 0
&& ms.getSequenceAsString().equals(
fromSeqs.getSequenceAsString()))
// && mappedrg.getSequenceAsString().equals(
// loc.getSequenceAsString()))
{
String msg = "Mapping updated from " + ms.getName()
+ " to retrieved crossreference "
+ fromSeqs.getName();
System.out.println(msg);
// method to update all refs of existing To on
// retrieved sequence with dss and merge any props
// on To onto dss.
map.setTo(fromSeqs);
/*
* copy sequence features as well, avoiding
* duplication (e.g. same variation from 2
* transcripts)
*/
SequenceFeature[] sfs = ms.getSequenceFeatures();
if (sfs != null)
{
for (SequenceFeature feat : sfs)
{
/*
* we override SequenceFeature.equals here (but
* not elsewhere) to ignore Parent attribute
* TODO not quite working yet!
*
* The wrapper is the argument of contains(), so it is the
* wrapper's equals() that is applied to each stored
* feature; the list itself holds the unwrapped features.
*/
if (!copiedFeatures
.contains(me.new MySequenceFeature(feat)))
{
fromSeqs.addSequenceFeature(feat);
copiedFeatures.add(feat);
}
}
}
}
// recorded whether or not the map was re-pointed above
mappings.addMap(retrieved[rs].getDatasetSequence(),
map.getTo(), map.getMap());
} catch (Exception e)
{
// any failure while consolidating is reported and processing
// carries on with the next dbref
System.err
.println("Exception when consolidating Mapped sequence set...");
e.printStackTrace(System.err);
}
}
}
}
}
retrieved[rs].updatePDBIds();
foundSeqs.add(retrieved[rs]);
}
}
/**
 * Intended to search the alignment dataset for sequences of complementary
 * type to those in fromSeqs which share a DBRefEntry with them, adding any
 * such sequence to foundSeqs and removing the resolved reference from
 * unresolvedRefs.
 * <p>
 * NOTE: the dataset search is not currently wired in. The method walks the
 * unresolved references but never marks one as found, so at present it
 * leaves foundSeqs, unresolvedRefs and mappings unchanged.
 *
 * @param fromSeqs
 *          the sequences we are searching from (currently unused)
 * @param source
 *          the database source we want cross-references to (currently
 *          unused)
 * @param dataset
 *          the alignment dataset to search (currently unused)
 * @param foundSeqs
 *          result list to add to (currently unused)
 * @param unresolvedRefs
 *          the cross-references still to be resolved (must not be null)
 * @param mappings
 *          a set of sequence mappings to add to (currently unused)
 */
static void findIndirectCrossReferences(List<SequenceI> fromSeqs,
        String source, AlignmentI dataset, List<SequenceI> foundSeqs,
        List<DBRefEntry> unresolvedRefs, AlignedCodonFrame mappings)
{
  Iterator<DBRefEntry> refs = unresolvedRefs.iterator();
  while (refs.hasNext())
  {
    DBRefEntry dbref = refs.next();

    /*
     * TODO search the dataset for a resolution of dbref, e.g. by calling
     * searchDatasetForCrossReference for each of fromSeqs; until that is
     * done nothing is ever found and nothing is removed
     */
    boolean found = false;
    if (found)
    {
      refs.remove();
    }
  }
}
/**
 * Searches the dataset for sequences of opposite type to 'excluding' which
 * have a cross-reference matching dbref. Each such sequence that is not
 * already in foundSeqs is added to it, and any suitable mapping held on its
 * matching dbrefs is added to 'mappings'. The search does not stop at the
 * first match. The caller's list of unresolved references is not touched
 * here; removing a resolved reference is left to the caller.
 *
 * @param excluding
 *          a sequence to ignore (start point of search); neither it nor its
 *          dataset sequence is considered as a candidate
 * @param dbref
 *          a cross-reference to try to match
 * @param dataset
 *          sequences to search in; if null, or if it has no sequence list,
 *          nothing is searched
 * @param foundSeqs
 *          result list to add to
 * @param mappings
 *          a set of sequence mappings to add to; may be null, in which case
 *          sequences are still collected but no mappings are recorded
 * @return true if relationship found and sequence added
 */
static boolean searchDatasetForCrossReference(SequenceI excluding,
        DBRefEntry dbref, AlignmentI dataset, List<SequenceI> foundSeqs,
        AlignedCodonFrame mappings)
{
  boolean fromNucleotide = Comparison.isNucleotide(excluding);
  if (dataset == null)
  {
    return false;
  }

  /*
   * read the sequence list once, so the list that is null-checked is the
   * same one that is locked and iterated
   */
  List<SequenceI> ds = dataset.getSequences();
  if (ds == null)
  {
    return false;
  }

  boolean found = false;
  synchronized (ds)
  {
    for (SequenceI nxt : ds)
    {
      if (nxt == null)
      {
        continue;
      }
      if (nxt.getDatasetSequence() != null)
      {
        // only a warning - the sequence is still examined below
        System.err
                .println("Implementation warning: getProducts passed a dataset alignment without dataset sequences in it!");
      }
      if (nxt == excluding || nxt == excluding.getDatasetSequence())
      {
        continue;
      }
      if (foundSeqs.contains(nxt))
      {
        /*
         * already added this sequence to cross-refs
         */
        continue;
      }
      if (Comparison.isNucleotide(nxt) == fromNucleotide)
      {
        /*
         * skip this sequence - wrong molecule type
         */
        continue;
      }

      /*
       * check if this sequence has any dbref matching source and accession
       * (version and mapping may differ)
       */
      List<DBRefEntry> candidates = DBRefUtils.searchRefs(nxt.getDBRefs(),
              dbref);
      if (candidates.isEmpty())
      {
        continue;
      }
      found = true;
      foundSeqs.add(nxt);

      if (mappings == null)
      {
        // don't look for mappings if we aren't given a codon map object
        continue;
      }
      for (DBRefEntry candidate : candidates)
      {
        if (!candidate.hasMap())
        {
          continue;
        }
        Mapping mapping = candidate.getMap();
        MapList map = mapping.getMap();

        /*
         * only record mappings that have a target sequence and differing
         * from/to ratios (i.e. dna-to-protein or vice versa); a mapping
         * with no MapList is skipped rather than dereferenced
         */
        if (mapping.getTo() == null || map == null
                || map.getFromRatio() == map.getToRatio())
        {
          continue;
        }
        if (fromNucleotide)
        {
          // map is from dna seq to a protein product
          mappings.addMap(excluding, nxt, map);
        }
        else
        {
          // map is from protein seq to its coding dna
          mappings.addMap(nxt, excluding, map.getInverse());
        }
      }
    }
  }
  return found;
}
/**
* Updates any empty mappings in the cross-references with one to a compatible
* retrieved sequence if found, and adds any new mappings to the
* AlignedCodonFrame.
*
* For each cross-reference that has no map, the retrieved sequences are
* searched for ids matching "source|accession". For each match,
* AlignmentUtils.mapCdnaToProtein is asked for a mapping; if one is returned
* it is set on the cross-reference (as a Mapping to the matched sequence) and
* added to 'mappings'.
*
* NOTE(review): if a cross-reference has no id match at all, the method
* returns at that point, so any cross-references later in the list are not
* processed - confirm whether skipping to the next cross-reference was
* intended instead.
*
* @param dna
* true if the sequences we are searching from are nucleotide; decides
* the argument order for mapCdnaToProtein and addMap
* @param fromSeqs
* the sequences we are searching from. NOTE(review): declared as a
* List, but passed to mapCdnaToProtein, computeProteinFeatures and
* addMap alongside single sequences as if it were one sequence -
* confirm the intended type
* @param xrefs
* the cross-references whose mappings may be filled in
* @param retrieved
* the sequences fetched for the cross-references
* @param mappings
* a set of sequence mappings to add to
*/
static void updateDbrefMappings(boolean dna, List fromSeqs,
List xrefs, SequenceI[] retrieved,
AlignedCodonFrame mappings)
{
SequenceIdMatcher matcher = new SequenceIdMatcher(retrieved);
for (DBRefEntry xref : xrefs)
{
if (!xref.hasMap())
{
// id to look up among the retrieved sequences: source|accession
String targetSeqName = xref.getSource() + "|"
+ xref.getAccessionId();
SequenceI[] matches = matcher.findAllIdMatches(targetSeqName);
if (matches == null)
{
// NOTE(review): leaves the method, not just this cross-reference
return;
}
for (SequenceI seq : matches)
{
MapList mapping = null;
if (dna)
{
// presumably (protein, cdna) argument order - verify against
// AlignmentUtils
mapping = AlignmentUtils.mapCdnaToProtein(seq, fromSeqs);
}
else
{
mapping = AlignmentUtils.mapCdnaToProtein(fromSeqs, seq);
if (mapping != null)
{
// inverted so that the map stored on xref runs from the searched
// sequence to the retrieved one
mapping = mapping.getInverse();
}
}
if (mapping != null)
{
xref.setMap(new Mapping(seq, mapping));
if (dna)
{
AlignmentUtils.computeProteinFeatures(fromSeqs, seq, mapping);
}
// NOTE(review): second test of the same flag; could be merged with the
// one above
if (dna)
{
mappings.addMap(fromSeqs, seq, mapping);
}
else
{
// mapping was inverted above, so this inverts it back before adding
mappings.addMap(seq, fromSeqs, mapping.getInverse());
}
// NOTE(review): last statement of the loop body, so this 'continue'
// has no effect; remaining matches are still processed - confirm
// whether stopping at the first successful mapping was intended
continue;
}
}
}
}
}
}