package jalview.analysis; import jalview.datamodel.AlignedCodonFrame; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; import jalview.datamodel.Mapping; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.util.Comparison; import jalview.util.DBRefUtils; import jalview.util.MapList; import jalview.ws.SequenceFetcherFactory; import jalview.ws.seqfetcher.ASequenceFetcher; import java.util.ArrayList; import java.util.Iterator; import java.util.List; public class CrossRefs { /** * Finds cross-references for sequences from a specified source database. * These may be found in four ways: *

as a DBRefEntry on the known sequence, which has a mapped-to sequence
a sequence of complementary type in the alignment dataset, which has a * DBRefEntry to one of the known sequence's 'direct' DBRefs
a sequence of complementary type in the alignment, which has a * DBRefEntry to one of the known sequence's 'cross-ref' DBRefs
by fetching the accession from the remote database

* * @param seqs * the sequences whose cross-references we are searching for * @param dna * true if the sequences are from a nucleotide alignment, else false * @param source * the database source we want cross-references to * @param dataset * the alignment dataset the sequences belong to * @return an alignment containing cross-reference sequences, or null if none * found */ public static AlignmentI findXrefSequences(SequenceI[] seqs, boolean dna, String source, AlignmentI dataset) { List foundSeqs = new ArrayList(); AlignedCodonFrame mappings = new AlignedCodonFrame(); List sourceRefs = new ArrayList(); for (SequenceI seq : seqs) { if (dna != Comparison.isNucleotide(seq)) { /* * mixed alignment, and this sequence is of the wrong type */ continue; } /* * get this sequence's dbrefs to source database (if any) */ List seqSourceRefs = DBRefUtils.searchRefsForSource( seq.getDBRefs(), source); /* * first extract any mapped sequences from sourceRefs */ findMappedDbrefs(seq, seqSourceRefs, foundSeqs, mappings); /* * for remaining sourceRefs, try to match a * complementary sequence in the dataset */ findIndirectCrossReferences(seq, source, seqSourceRefs, dataset, foundSeqs, mappings); } /* * fetch any remaining sourceRefs from the source database */ fetchCrossReferences(sourceRefs, foundSeqs, mappings, dna, dataset); if (foundSeqs.isEmpty()) { return null; } AlignmentI crossRefs = new Alignment( foundSeqs.toArray(new SequenceI[foundSeqs.size()])); crossRefs.addCodonFrame(mappings); return crossRefs; } /** * Looks for DBRefEntrys to 'source' which have a mapping to a sequence. If * found, adds the sequence to foundSeqs and removes the dbref from the list. * * @param seq * the dataset sequence we are searching from * @param sourceRefs * the sequence's dbrefs to 'source' * @param foundSeqs * a list of cross-references to add to * @param mappings * a set of sequence mappings to add to * @return */ static void findMappedDbrefs(SequenceI seq, List sourceRefs, List foundSeqs, AlignedCodonFrame mappings) { Iterator refs = sourceRefs.iterator(); while (refs.hasNext()) { DBRefEntry dbref = refs.next(); Mapping map = dbref.getMap(); if (map != null) { SequenceI mappedTo = map.getTo(); if (mappedTo != null) { foundSeqs.add(new Sequence(mappedTo)); refs.remove(); /* * check mapping is not 'direct' (it shouldn't be if we reach here) * and add mapping (dna-to-peptide or vice versa) to the set */ MapList mapList = map.getMap(); int fromRatio = mapList.getFromRatio(); int toRatio = mapList.getToRatio(); if (fromRatio != toRatio) { if (fromRatio == 3) { mappings.addMap(seq, mappedTo, mapList); } else { mappings.addMap(mappedTo, seq, mapList.getInverse()); } } } } } } /** * Tries to fetch seq's database references to 'source' database, and add them * to the foundSeqs list. If found, tries to make a mapping between seq and * the retrieved sequence and insert it into the database reference. * * @param seq * @param sourceRefs * @param foundSeqs * @param mappings * @param dna */ static void fetchCrossReferences(SequenceI seq, List sourceRefs, List foundSeqs, AlignedCodonFrame mappings, boolean dna, AlignmentI dataset) { ASequenceFetcher sftch = SequenceFetcherFactory.getSequenceFetcher(); SequenceI[] retrieved; try { retrieved = sftch.getSequences(sourceRefs, !dna); } catch (Exception e) { System.err .println("Problem whilst retrieving cross references for Sequence : " + seq.getName()); e.printStackTrace(); return; } if (retrieved != null) { updateDbrefMappings(dna, seq, sourceRefs, retrieved, mappings); SequenceIdMatcher matcher = new SequenceIdMatcher( dataset.getSequences()); List copiedFeatures = new ArrayList(); CrossRef me = new CrossRef(); for (int rs = 0; rs < retrieved.length; rs++) { // TODO: examine each sequence for 'redundancy' DBRefEntry[] dbr = retrieved[rs].getDBRefs(); if (dbr != null && dbr.length > 0) { for (int di = 0; di < dbr.length; di++) { // find any entry where we should put in the sequence being // cross-referenced into the map Mapping map = dbr[di].getMap(); if (map != null) { if (map.getTo() != null && map.getMap() != null) { SequenceI matched = matcher.findIdMatch(map.getTo()); if (matched != null) { /* * already got an xref to this sequence; update this * map to point to the same sequence, and add * any new dbrefs to it */ for (DBRefEntry ref : map.getTo().getDBRefs()) { matched.addDBRef(ref); // add or update mapping } map.setTo(matched); } else { matcher.add(map.getTo()); } try { // compare ms with dss and replace with dss in mapping // if map is congruent SequenceI ms = map.getTo(); int sf = map.getMap().getToLowest(); int st = map.getMap().getToHighest(); SequenceI mappedrg = ms.getSubSequence(sf, st); // SequenceI loc = dss.getSubSequence(sf, st); if (mappedrg.getLength() > 0 && ms.getSequenceAsString().equals( seq.getSequenceAsString())) // && mappedrg.getSequenceAsString().equals( // loc.getSequenceAsString())) { String msg = "Mapping updated from " + ms.getName() + " to retrieved crossreference " + seq.getName(); System.out.println(msg); // method to update all refs of existing To on // retrieved sequence with dss and merge any props // on To onto dss. map.setTo(seq); /* * copy sequence features as well, avoiding * duplication (e.g. same variation from 2 * transcripts) */ SequenceFeature[] sfs = ms.getSequenceFeatures(); if (sfs != null) { for (SequenceFeature feat : sfs) { /* * we override SequenceFeature.equals here (but * not elsewhere) to ignore Parent attribute * TODO not quite working yet! */ if (!copiedFeatures .contains(me.new MySequenceFeature(feat))) { seq.addSequenceFeature(feat); copiedFeatures.add(feat); } } } } mappings.addMap(retrieved[rs].getDatasetSequence(), map.getTo(), map.getMap()); } catch (Exception e) { System.err .println("Exception when consolidating Mapped sequence set..."); e.printStackTrace(System.err); } } } } } retrieved[rs].updatePDBIds(); foundSeqs.add(retrieved[rs]); } } } /** * Searches the alignment for a sequence of complementary type to 'seq' which * shares a DBRefEntry with it. If found, adds the sequence to foundSeqs and * removes the resolved sourceRef from the search list. * * @param seq * @param source * @param sourceRefs * @param dataset * @param foundSeqs * @param mappings * @return */ static void findIndirectCrossReferences(SequenceI seq, String source, List sourceRefs, AlignmentI dataset, List foundSeqs, AlignedCodonFrame mappings) { Iterator refs = sourceRefs.iterator(); while (refs.hasNext()) { DBRefEntry dbref = refs.next(); boolean found = searchDatasetForCrossReference(seq, dbref, dataset, foundSeqs, mappings); if (found) { refs.remove(); } } } /** * Searches the dataset for a sequence of opposite type to 'excluding', which * has a cross-reference matching dbref. If found, adds the sequence to * foundSeqs and removes dbref from the search list. * * @param excluding * a sequence to ignore (start point of search) * @param dbref * a cross-reference to try to match * @param dataset * sequences to search in * @param foundSeqs * result list to add to * @param mappings * a set of sequence mappings to add to * @return true if relationship found and sequence added */ static boolean searchDatasetForCrossReference(SequenceI excluding, DBRefEntry dbref, AlignmentI dataset, List foundSeqs, AlignedCodonFrame mappings) { boolean fromNucleotide = Comparison.isNucleotide(excluding); boolean found = false; if (dataset == null) { return false; } if (dataset.getSequences() == null) { return false; } List ds; synchronized (ds = dataset.getSequences()) { for (SequenceI nxt : ds) { if (nxt != null) { if (nxt.getDatasetSequence() != null) { System.err .println("Implementation warning: getProducts passed a dataset alignment without dataset sequences in it!"); } if (nxt == excluding || nxt == excluding.getDatasetSequence()) { continue; } if (foundSeqs.contains(nxt)) { /* * already added this sequence to cross-refs */ continue; } boolean isDna = Comparison.isNucleotide(nxt); if (isDna == fromNucleotide) { /* * skip this sequence - wrong molecule type */ continue; } /* * check if this sequence has any dbref matching source and accession * (version and mapping may differ) */ List candidates = DBRefUtils.searchRefs( nxt.getDBRefs(), dbref); if (candidates.isEmpty()) { continue; } found = true; foundSeqs.add(nxt); if (mappings != null) { // don't search if we aren't given a codon map object for (DBRefEntry candidate : candidates) { if (candidate.hasMap()) { Mapping mapping = candidate.getMap(); MapList map = mapping.getMap(); if (mapping.getTo() != null && map.getFromRatio() != map.getToRatio()) { if (fromNucleotide) { // map is from dna seq to a protein product mappings.addMap(excluding, nxt, map); } else { // map is from protein seq to its coding dna mappings.addMap(nxt, excluding, map.getInverse()); } } } } } } } } return found; } /** * Updates any empty mappings in the cross-references with one to a compatible * retrieved sequence if found, and adds any new mappings to the * AlignedCodonFrame * * @param dna * @param mapFrom * @param xrefs * @param retrieved * @param mappings */ static void updateDbrefMappings(boolean dna, SequenceI mapFrom, List xrefs, SequenceI[] retrieved, AlignedCodonFrame mappings) { SequenceIdMatcher matcher = new SequenceIdMatcher(retrieved); for (DBRefEntry xref : xrefs) { if (!xref.hasMap()) { String targetSeqName = xref.getSource() + "|" + xref.getAccessionId(); SequenceI[] matches = matcher.findAllIdMatches(targetSeqName); if (matches == null) { return; } for (SequenceI seq : matches) { MapList mapping = null; if (dna) { mapping = AlignmentUtils.mapCdnaToProtein(seq, mapFrom); } else { mapping = AlignmentUtils.mapCdnaToProtein(mapFrom, seq); if (mapping != null) { mapping = mapping.getInverse(); } } if (mapping != null) { xref.setMap(new Mapping(seq, mapping)); if (dna) { AlignmentUtils.computeProteinFeatures(mapFrom, seq, mapping); } if (dna) { mappings.addMap(mapFrom, seq, mapping); } else { mappings.addMap(seq, mapFrom, mapping.getInverse()); } continue; } } } } } }