package jalview.analysis; import jalview.analysis.CrossRef.MySequenceFeature; import jalview.datamodel.AlignedCodonFrame; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; import jalview.datamodel.Mapping; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.util.Comparison; import jalview.util.DBRefUtils; import jalview.util.MapList; import jalview.ws.SequenceFetcherFactory; import jalview.ws.seqfetcher.ASequenceFetcher; import java.util.ArrayList; import java.util.Iterator; import java.util.List; public class CrossRefs { /* * A sub-class that ignores Parent attribute when comparing sequence * features. This avoids 'duplicate' CDS features that only * differ in their parent Transcript ids. */ class MySequenceFeature extends SequenceFeature { private SequenceFeature feat; MySequenceFeature(SequenceFeature sf) { this.feat = sf; } @Override public boolean equals(Object o) { return feat.equals(o, true); } } /** * Finds cross-references for sequences from a specified source database. * These may be found in four ways: *

as a DBRefEntry on the known sequence, which has a mapped-to sequence
a sequence of complementary type in the alignment dataset, which has a * DBRefEntry to one of the known sequence's 'direct' DBRefs
a sequence of complementary type in the alignment, which has a * DBRefEntry to one of the known sequence's 'cross-ref' DBRefs
by fetching the accession from the remote database

* * @param seqs * the sequences whose cross-references we are searching for * @param dna * true if the sequences are from a nucleotide alignment, else false * @param source * the database source we want cross-references to * @param dataset * the alignment dataset the sequences belong to * @return an alignment containing cross-reference sequences, or null if none * found */ public static AlignmentI findXrefSequences(SequenceI[] seqs, boolean dna, String source, AlignmentI dataset) { /* * filter to only those sequences of the right type (nucleotide/protein) */ List fromSeqs = new ArrayList(); for (SequenceI seq : seqs) { if (dna == Comparison.isNucleotide(seq)) { fromSeqs.add(seq); } } return findXrefSequences(fromSeqs, dna, source, dataset); } /** * Finds cross-references for sequences from a specified source database. * These may be found in four ways: *

as a DBRefEntry on the known sequence, which has a mapped-to sequence
a sequence of complementary type in the alignment dataset, which has a * DBRefEntry to one of the known sequence's 'direct' DBRefs
a sequence of complementary type in the alignment, which has a * DBRefEntry to one of the known sequence's 'cross-ref' DBRefs
by fetching the accession from the remote database

* * @param seqs * the sequences whose cross-references we are searching for, * filtered to only those which are of the type denoted by 'dna' * @param dna * true if the sequences are from a nucleotide alignment, else false * @param source * the database source we want cross-references to * @param dataset * the alignment dataset the sequences belong to * @return an alignment containing cross-reference sequences, or null if none * found */ static AlignmentI findXrefSequences(List fromSeqs, boolean dna, String source, AlignmentI dataset) { List foundSeqs = new ArrayList(); AlignedCodonFrame mappings = new AlignedCodonFrame(); List unresolvedRefs = new ArrayList(); /* * first extract any mapped sequences from sourceRefs * if successful, sequence is removed from fromSeqs * if unsuccessful, dbrefs are added to unresolvedRefs */ findMappedDbrefs(fromSeqs, source, foundSeqs, unresolvedRefs, mappings); /* * then search the alignment dataset for dbref resolutions */ findIndirectCrossReferences(fromSeqs, source, dataset, foundSeqs, unresolvedRefs, mappings); /* * fetch any remaining sourceRefs from the source database */ fetchCrossReferences(fromSeqs, unresolvedRefs, foundSeqs, mappings, dna, dataset); if (foundSeqs.isEmpty()) { return null; } AlignmentI crossRefs = new Alignment( foundSeqs.toArray(new SequenceI[foundSeqs.size()])); crossRefs.addCodonFrame(mappings); return crossRefs; } /** * Looks for DBRefEntrys to 'source' which have a mapping to a sequence. If * found, adds the sequence to foundSeqs and removes the dbref from the list. * DBRefs with no mapping are added to the 'unresolvedRefs' list (setting * version number to 0 i.e. use source and accession only). * * @param fromSeqs * the dataset sequences we are searching from * @param source * the database source we are searching dbrefs for * @param foundSeqs * a list of found sequences to add to * @param unresolvedRefs * a list of unresolved cross-references to add to * @param mappings * a set of sequence mappings to add to * @return */ static void findMappedDbrefs(List fromSeqs, String source, List foundSeqs, List unresolvedRefs, AlignedCodonFrame mappings) { Iterator it = fromSeqs.iterator(); while (it.hasNext()) { SequenceI seq = it.next(); SequenceI dss = seq.getDatasetSequence(); dss = dss == null ? seq : dss; DBRefEntry[] dbRefs = seq.getDBRefs(); if (dbRefs == null) { continue; } boolean resolved = false; for (DBRefEntry dbref : dbRefs) { if (!source.equals(dbref.getSource())) { continue; } DBRefEntry todo = new DBRefEntry(dbref.getSource(), "0", dbref.getAccessionId()); Mapping map = dbref.getMap(); if (map != null) { unresolvedRefs.remove(todo); resolved = true; SequenceI mappedTo = map.getTo(); if (mappedTo != null) { foundSeqs.add(new Sequence(mappedTo)); /* * check mapping is not 'direct' (it shouldn't be if we reach here) * and add mapping (dna-to-peptide or vice versa) to the set */ MapList mapList = map.getMap(); int fromRatio = mapList.getFromRatio(); int toRatio = mapList.getToRatio(); if (fromRatio != toRatio) { if (fromRatio == 3) { mappings.addMap(dss, mappedTo, mapList); } else { mappings.addMap(mappedTo, dss, mapList.getInverse()); } } } } else { /* * no mapping to resolve dbref - add source+accession to list to resolve */ if (!unresolvedRefs.contains(todo)) { unresolvedRefs.add(todo); } } } if (resolved) { it.remove(); } } } /** * Tries to fetch seq's database references to 'source' database, and add them * to the foundSeqs list. If found, tries to make a mapping between seq and * the retrieved sequence and insert it into the database reference. * * @param fromSeqs * @param sourceRefs * @param foundSeqs * @param mappings * @param dna */ static void fetchCrossReferences(List fromSeqs, List sourceRefs, List foundSeqs, AlignedCodonFrame mappings, boolean dna, AlignmentI dataset) { ASequenceFetcher sftch = SequenceFetcherFactory.getSequenceFetcher(); SequenceI[] retrieved; try { retrieved = sftch.getSequences(sourceRefs, !dna); } catch (Exception e) { System.err.println("Problem whilst retrieving cross references: " + e.getMessage()); e.printStackTrace(); return; } if (retrieved == null) { return; } updateDbrefMappings(dna, fromSeqs, sourceRefs, retrieved, mappings); SequenceIdMatcher matcher = new SequenceIdMatcher( dataset.getSequences()); List copiedFeatures = new ArrayList(); CrossRefs me = new CrossRefs(); for (int rs = 0; rs < retrieved.length; rs++) { // TODO: examine each sequence for 'redundancy' DBRefEntry[] dbr = retrieved[rs].getDBRefs(); if (dbr != null && dbr.length > 0) { for (int di = 0; di < dbr.length; di++) { // find any entry where we should put in the sequence being // cross-referenced into the map Mapping map = dbr[di].getMap(); if (map != null) { if (map.getTo() != null && map.getMap() != null) { SequenceI matched = matcher.findIdMatch(map.getTo()); if (matched != null) { /* * already got an xref to this sequence; update this * map to point to the same sequence, and add * any new dbrefs to it */ for (DBRefEntry ref : map.getTo().getDBRefs()) { matched.addDBRef(ref); // add or update mapping } map.setTo(matched); } else { matcher.add(map.getTo()); } try { // compare ms with dss and replace with dss in mapping // if map is congruent SequenceI ms = map.getTo(); int sf = map.getMap().getToLowest(); int st = map.getMap().getToHighest(); SequenceI mappedrg = ms.getSubSequence(sf, st); // SequenceI loc = dss.getSubSequence(sf, st); if (mappedrg.getLength() > 0 && ms.getSequenceAsString().equals( fromSeqs.getSequenceAsString())) // && mappedrg.getSequenceAsString().equals( // loc.getSequenceAsString())) { String msg = "Mapping updated from " + ms.getName() + " to retrieved crossreference " + fromSeqs.getName(); System.out.println(msg); // method to update all refs of existing To on // retrieved sequence with dss and merge any props // on To onto dss. map.setTo(fromSeqs); /* * copy sequence features as well, avoiding * duplication (e.g. same variation from 2 * transcripts) */ SequenceFeature[] sfs = ms.getSequenceFeatures(); if (sfs != null) { for (SequenceFeature feat : sfs) { /* * we override SequenceFeature.equals here (but * not elsewhere) to ignore Parent attribute * TODO not quite working yet! */ if (!copiedFeatures .contains(me.new MySequenceFeature(feat))) { fromSeqs.addSequenceFeature(feat); copiedFeatures.add(feat); } } } } mappings.addMap(retrieved[rs].getDatasetSequence(), map.getTo(), map.getMap()); } catch (Exception e) { System.err .println("Exception when consolidating Mapped sequence set..."); e.printStackTrace(System.err); } } } } } retrieved[rs].updatePDBIds(); foundSeqs.add(retrieved[rs]); } } /** * Searches the alignment for a sequence of complementary type to 'seq' which * shares a DBRefEntry with it. If found, adds the sequence to foundSeqs and * removes the resolved sourceRef from the search list. * * @param fromSeqs * @param source * @param unresolvedRefs * @param foundSeqs * @param unresolvedRefs * @param mappings * @return */ static void findIndirectCrossReferences(List fromSeqs, String source, AlignmentI dataset, List foundSeqs, List unresolvedRefs, AlignedCodonFrame mappings) { Iterator refs = unresolvedRefs.iterator(); while (refs.hasNext()) { DBRefEntry dbref = refs.next(); boolean found = false; // boolean found = searchDatasetForCrossReference(fromSeqs, dbref, // foundSeqs, // unresolvedRefs, mappings); if (found) { refs.remove(); } } } /** * Searches the dataset for a sequence of opposite type to 'excluding', which * has a cross-reference matching dbref. If found, adds the sequence to * foundSeqs and removes dbref from the search list. * * @param excluding * a sequence to ignore (start point of search) * @param dbref * a cross-reference to try to match * @param dataset * sequences to search in * @param foundSeqs * result list to add to * @param mappings * a set of sequence mappings to add to * @return true if relationship found and sequence added */ static boolean searchDatasetForCrossReference(SequenceI excluding, DBRefEntry dbref, AlignmentI dataset, List foundSeqs, AlignedCodonFrame mappings) { boolean fromNucleotide = Comparison.isNucleotide(excluding); boolean found = false; if (dataset == null) { return false; } if (dataset.getSequences() == null) { return false; } List ds; synchronized (ds = dataset.getSequences()) { for (SequenceI nxt : ds) { if (nxt != null) { if (nxt.getDatasetSequence() != null) { System.err .println("Implementation warning: getProducts passed a dataset alignment without dataset sequences in it!"); } if (nxt == excluding || nxt == excluding.getDatasetSequence()) { continue; } if (foundSeqs.contains(nxt)) { /* * already added this sequence to cross-refs */ continue; } boolean isDna = Comparison.isNucleotide(nxt); if (isDna == fromNucleotide) { /* * skip this sequence - wrong molecule type */ continue; } /* * check if this sequence has any dbref matching source and accession * (version and mapping may differ) */ List candidates = DBRefUtils.searchRefs( nxt.getDBRefs(), dbref); if (candidates.isEmpty()) { continue; } found = true; foundSeqs.add(nxt); if (mappings != null) { // don't search if we aren't given a codon map object for (DBRefEntry candidate : candidates) { if (candidate.hasMap()) { Mapping mapping = candidate.getMap(); MapList map = mapping.getMap(); if (mapping.getTo() != null && map.getFromRatio() != map.getToRatio()) { if (fromNucleotide) { // map is from dna seq to a protein product mappings.addMap(excluding, nxt, map); } else { // map is from protein seq to its coding dna mappings.addMap(nxt, excluding, map.getInverse()); } } } } } } } } return found; } /** * Updates any empty mappings in the cross-references with one to a compatible * retrieved sequence if found, and adds any new mappings to the * AlignedCodonFrame * * @param dna * @param fromSeqs * @param xrefs * @param retrieved * @param mappings */ static void updateDbrefMappings(boolean dna, List fromSeqs, List xrefs, SequenceI[] retrieved, AlignedCodonFrame mappings) { SequenceIdMatcher matcher = new SequenceIdMatcher(retrieved); for (DBRefEntry xref : xrefs) { if (!xref.hasMap()) { String targetSeqName = xref.getSource() + "|" + xref.getAccessionId(); SequenceI[] matches = matcher.findAllIdMatches(targetSeqName); if (matches == null) { return; } for (SequenceI seq : matches) { MapList mapping = null; if (dna) { mapping = AlignmentUtils.mapCdnaToProtein(seq, fromSeqs); } else { mapping = AlignmentUtils.mapCdnaToProtein(fromSeqs, seq); if (mapping != null) { mapping = mapping.getInverse(); } } if (mapping != null) { xref.setMap(new Mapping(seq, mapping)); if (dna) { AlignmentUtils.computeProteinFeatures(fromSeqs, seq, mapping); } if (dna) { mappings.addMap(fromSeqs, seq, mapping); } else { mappings.addMap(seq, fromSeqs, mapping.getInverse()); } continue; } } } } } }