1 package jalview.analysis;
3 import jalview.analysis.CrossRef.MySequenceFeature;
4 import jalview.datamodel.AlignedCodonFrame;
5 import jalview.datamodel.Alignment;
6 import jalview.datamodel.AlignmentI;
7 import jalview.datamodel.DBRefEntry;
8 import jalview.datamodel.Mapping;
9 import jalview.datamodel.Sequence;
10 import jalview.datamodel.SequenceFeature;
11 import jalview.datamodel.SequenceI;
12 import jalview.util.Comparison;
13 import jalview.util.DBRefUtils;
14 import jalview.util.MapList;
15 import jalview.ws.SequenceFetcherFactory;
16 import jalview.ws.seqfetcher.ASequenceFetcher;
18 import java.util.ArrayList;
19 import java.util.Iterator;
20 import java.util.List;
22 public class CrossRefs
25 * A sub-class that ignores Parent attribute when comparing sequence
26 * features. This avoids 'duplicate' CDS features that only
27 * differ in their parent Transcript ids.
// Wrapper type around a SequenceFeature: equality checks are forwarded to the
// wrapped feature's two-argument equals(Object, boolean) with the flag set to true.
// NOTE(review): presumably the 'true' flag is what makes the comparison ignore
// the Parent attribute - confirm against SequenceFeature.equals(Object, boolean).
// Declared as a non-static inner class: it is instantiated elsewhere in this file
// through an enclosing CrossRefs instance (me.new MySequenceFeature(...)).
29 class MySequenceFeature extends SequenceFeature
// the wrapped feature that equals() delegates to
31 private SequenceFeature feat;
33 MySequenceFeature(SequenceFeature sf)
// NOTE(review): equals is overridden here; verify hashCode is overridden
// consistently if instances are ever placed in hash-based collections.
39 public boolean equals(Object o)
41 return feat.equals(o, true);
46 * Finds cross-references for sequences from a specified source database.
47 * These may be found in four ways:
49 * <li>as a DBRefEntry on the known sequence, which has a mapped-to sequence</li>
50 * <li>a sequence of complementary type in the alignment dataset, which has a
51 * DBRefEntry to one of the known sequence's 'direct' DBRefs</li>
52 * <li>a sequence of complementary type in the alignment, which has a
53 * DBRefEntry to one of the known sequence's 'cross-ref' DBRefs</li>
54 * <li>by fetching the accession from the remote database</li>
58 * the sequences whose cross-references we are searching for
60 * true if the sequences are from a nucleotide alignment, else false
62 * the database source we want cross-references to
64 * the alignment dataset the sequences belong to
65 * @return an alignment containing cross-reference sequences, or null if none
// Array-based entry point. Builds a new list 'fromSeqs', tests each input
// sequence with Comparison.isNucleotide against the 'dna' flag, and then
// returns whatever the List-based overload
// findXrefSequences(List, boolean, String, AlignmentI) returns for that list.
// NOTE(review): 'seqs' is iterated directly, so a null array would raise a
// NullPointerException here - confirm callers never pass null.
68 public static AlignmentI findXrefSequences(SequenceI[] seqs, boolean dna,
69 String source, AlignmentI dataset)
72 * filter to only those sequences of the right type (nucleotide/protein)
74 List<SequenceI> fromSeqs = new ArrayList<SequenceI>();
75 for (SequenceI seq : seqs)
// type test: does the sequence's nucleotide-ness agree with the 'dna' flag?
77 if (dna == Comparison.isNucleotide(seq))
82 return findXrefSequences(fromSeqs, dna, source, dataset);
86 * Finds cross-references for sequences from a specified source database.
87 * These may be found in four ways:
89 * <li>as a DBRefEntry on the known sequence, which has a mapped-to sequence</li>
90 * <li>a sequence of complementary type in the alignment dataset, which has a
91 * DBRefEntry to one of the known sequence's 'direct' DBRefs</li>
92 * <li>a sequence of complementary type in the alignment, which has a
93 * DBRefEntry to one of the known sequence's 'cross-ref' DBRefs</li>
94 * <li>by fetching the accession from the remote database</li>
98 * the sequences whose cross-references we are searching for,
99 * filtered to only those which are of the type denoted by 'dna'
101 * true if the sequences are from a nucleotide alignment, else false
103 * the database source we want cross-references to
105 * the alignment dataset the sequences belong to
106 * @return an alignment containing cross-reference sequences, or null if none
// List-based worker (package-private). Runs three lookup stages in order -
// findMappedDbrefs, findIndirectCrossReferences, fetchCrossReferences - which
// all share the same accumulators: 'foundSeqs' (result sequences), 'mappings'
// (codon-frame mappings) and 'unresolvedRefs' (dbrefs still to be resolved).
// The found sequences are then wrapped in a new Alignment and the collected
// mappings are attached to it with addCodonFrame.
// NOTE(review): per the inline comment below, resolved sequences are removed
// from 'fromSeqs', so the list passed in by the caller may be modified - confirm.
109 static AlignmentI findXrefSequences(List<SequenceI> fromSeqs,
110 boolean dna, String source, AlignmentI dataset)
112 List<SequenceI> foundSeqs = new ArrayList<SequenceI>();
113 AlignedCodonFrame mappings = new AlignedCodonFrame();
115 List<DBRefEntry> unresolvedRefs = new ArrayList<DBRefEntry>();
118 * first extract any mapped sequences from sourceRefs
119 * if successful, sequence is removed from fromSeqs
120 * if unsuccessful, dbrefs are added to unresolvedRefs
122 findMappedDbrefs(fromSeqs, source, foundSeqs,
123 unresolvedRefs, mappings);
126 * then search the alignment dataset for dbref resolutions
128 findIndirectCrossReferences(fromSeqs, source, dataset, foundSeqs,
129 unresolvedRefs, mappings);
132 * fetch any remaining sourceRefs from the source database
134 fetchCrossReferences(fromSeqs, unresolvedRefs, foundSeqs, mappings,
// nothing found by any of the three stages
137 if (foundSeqs.isEmpty())
// copy the result list into a SequenceI[] sized to match, as the Alignment
// constructor is given an array
141 AlignmentI crossRefs = new Alignment(
142 foundSeqs.toArray(new SequenceI[foundSeqs.size()]));
143 crossRefs.addCodonFrame(mappings);
148 * Looks for DBRefEntrys to 'source' which have a mapping to a sequence. If
149 * found, adds the sequence to foundSeqs and removes the dbref from the list.
150 * DBRefs with no mapping are added to the 'unresolvedRefs' list (setting
151 * version number to 0 i.e. use source and accession only).
154 * the dataset sequences we are searching from
156 * the database source we are searching dbrefs for
158 * a list of found sequences to add to
159 * @param unresolvedRefs
160 * a list of unresolved cross-references to add to
162 * a set of sequence mappings to add to
// Walks 'fromSeqs' with an explicit Iterator and inspects each sequence's
// dbrefs against 'source'. For a dbref whose mapping has a mapped-to sequence,
// a copy of that sequence is added to 'foundSeqs' and, when the map's from/to
// ratios differ, a mapping is recorded in 'mappings'. A source+accession
// placeholder entry ('todo') is removed from, or added to, 'unresolvedRefs'.
165 static void findMappedDbrefs(List<SequenceI> fromSeqs, String source,
166 List<SequenceI> foundSeqs, List<DBRefEntry> unresolvedRefs,
167 AlignedCodonFrame mappings)
169 Iterator<SequenceI> it = fromSeqs.iterator();
172 SequenceI seq = it.next();
173 SequenceI dss = seq.getDatasetSequence();
// fall back to the sequence itself when it has no dataset sequence
174 dss = dss == null ? seq : dss;
// NOTE(review): dbrefs are read from 'seq', not from 'dss' - confirm intended
176 DBRefEntry[] dbRefs = seq.getDBRefs();
181 boolean resolved = false;
182 for (DBRefEntry dbref : dbRefs)
// test for a dbref whose source is not the one being searched for
184 if (!source.equals(dbref.getSource()))
// placeholder keyed on source and accession only, with the version fixed at "0"
188 DBRefEntry todo = new DBRefEntry(dbref.getSource(), "0",
189 dbref.getAccessionId());
190 Mapping map = dbref.getMap();
// List.remove(Object) matches via DBRefEntry.equals - verify how that is defined
193 unresolvedRefs.remove(todo);
195 SequenceI mappedTo = map.getTo();
196 if (mappedTo != null)
// NOTE(review): a new Sequence built from mappedTo goes into foundSeqs, while
// the mappings added below reference mappedTo itself - confirm intended
198 foundSeqs.add(new Sequence(mappedTo));
201 * check mapping is not 'direct' (it shouldn't be if we reach here)
202 * and add mapping (dna-to-peptide or vice versa) to the set
204 MapList mapList = map.getMap();
205 int fromRatio = mapList.getFromRatio();
206 int toRatio = mapList.getToRatio();
207 if (fromRatio != toRatio)
// two alternative orientations: dss -> mappedTo using the map as given ...
211 mappings.addMap(dss, mappedTo, mapList);
// ... or mappedTo -> dss using the inverted map
215 mappings.addMap(mappedTo, dss, mapList.getInverse());
223 * no mapping to resolve dbref - add source+accession to list to resolve
// avoid queueing the same placeholder twice (List.contains uses equals)
225 if (!unresolvedRefs.contains(todo))
227 unresolvedRefs.add(todo);
239 * Tries to fetch seq's database references to 'source' database, and add them
240 * to the foundSeqs list. If found, tries to make a mapping between seq and
241 * the retrieved sequence and insert it into the database reference.
// Retrieves sequences for 'sourceRefs' through the fetcher obtained from
// SequenceFetcherFactory, passes them to updateDbrefMappings, then walks each
// retrieved sequence's dbrefs to consolidate mapped-to sequences (via a
// SequenceIdMatcher seeded from the dataset), copy sequence features and record
// mappings. Each retrieved sequence has updatePDBIds() called on it and is
// appended to 'foundSeqs'.
// NOTE(review): 'fromSeqs' is declared as List<SequenceI>, yet further down it
// is used as if it were a single SequenceI (getSequenceAsString(), getName(),
// addSequenceFeature(...)) - java.util.List has no such methods; confirm how
// this is meant to iterate over the list.
249 static void fetchCrossReferences(List<SequenceI> fromSeqs,
250 List<DBRefEntry> sourceRefs, List<SequenceI> foundSeqs,
251 AlignedCodonFrame mappings, boolean dna, AlignmentI dataset)
253 ASequenceFetcher sftch = SequenceFetcherFactory.getSequenceFetcher();
254 SequenceI[] retrieved;
// presumably '!dna' asks the fetcher for the opposite molecule type - confirm
// against ASequenceFetcher.getSequences
257 retrieved = sftch.getSequences(sourceRefs, !dna);
// any failure while fetching is reported on System.err
258 } catch (Exception e)
260 System.err.println("Problem whilst retrieving cross references: "
266 if (retrieved == null)
270 updateDbrefMappings(dna, fromSeqs, sourceRefs, retrieved, mappings);
// NOTE(review): 'dataset' is dereferenced here without a null check - confirm
// callers always supply one
272 SequenceIdMatcher matcher = new SequenceIdMatcher(
273 dataset.getSequences());
// features already copied, consulted to avoid adding duplicates
274 List<SequenceFeature> copiedFeatures = new ArrayList<SequenceFeature>();
// enclosing instance, needed only to create the inner MySequenceFeature below
275 CrossRefs me = new CrossRefs();
276 for (int rs = 0; rs < retrieved.length; rs++)
278 // TODO: examine each sequence for 'redundancy'
279 DBRefEntry[] dbr = retrieved[rs].getDBRefs();
// getDBRefs() is treated as nullable here
280 if (dbr != null && dbr.length > 0)
282 for (int di = 0; di < dbr.length; di++)
284 // find any entry where we should put in the sequence being
285 // cross-referenced into the map
286 Mapping map = dbr[di].getMap();
// only maps with both a target sequence and a MapList are processed
289 if (map.getTo() != null && map.getMap() != null)
291 SequenceI matched = matcher.findIdMatch(map.getTo());
295 * already got an xref to this sequence; update this
296 * map to point to the same sequence, and add
297 * any new dbrefs to it
// NOTE(review): getDBRefs() is null-checked above for retrieved[rs] but its
// result is iterated directly here - a null array would throw; confirm
299 for (DBRefEntry ref : map.getTo().getDBRefs())
301 matched.addDBRef(ref); // add or update mapping
// register the mapped-to sequence with the matcher
307 matcher.add(map.getTo());
311 // compare ms with dss and replace with dss in mapping
312 // if map is congruent
313 SequenceI ms = map.getTo();
// lowest and highest positions of the map's 'to' range
314 int sf = map.getMap().getToLowest();
315 int st = map.getMap().getToHighest();
316 SequenceI mappedrg = ms.getSubSequence(sf, st);
317 // SequenceI loc = dss.getSubSequence(sf, st);
// NOTE(review): the comparison below calls getSequenceAsString() on the
// List-typed 'fromSeqs' (see note at the top of this method)
318 if (mappedrg.getLength() > 0
319 && ms.getSequenceAsString().equals(
320 fromSeqs.getSequenceAsString()))
321 // && mappedrg.getSequenceAsString().equals(
322 // loc.getSequenceAsString()))
324 String msg = "Mapping updated from " + ms.getName()
325 + " to retrieved crossreference "
326 + fromSeqs.getName();
327 System.out.println(msg);
328 // method to update all refs of existing To on
329 // retrieved sequence with dss and merge any props
333 * copy sequence features as well, avoiding
334 * duplication (e.g. same variation from 2
337 SequenceFeature[] sfs = ms.getSequenceFeatures();
340 for (SequenceFeature feat : sfs)
343 * we override SequenceFeature.equals here (but
344 * not elsewhere) to ignore Parent attribute
345 * TODO not quite working yet!
348 .contains(me.new MySequenceFeature(feat)))
350 fromSeqs.addSequenceFeature(feat);
351 copiedFeatures.add(feat);
// record a mapping from the retrieved sequence's dataset sequence to the
// mapped-to sequence
356 mappings.addMap(retrieved[rs].getDatasetSequence(),
357 map.getTo(), map.getMap());
// failures during consolidation are reported on System.err with a stack trace
358 } catch (Exception e)
361 .println("Exception when consolidating Mapped sequence set...");
362 e.printStackTrace(System.err);
368 retrieved[rs].updatePDBIds();
369 foundSeqs.add(retrieved[rs]);
374 * Searches the alignment for a sequence of complementary type to 'seq' which
375 * shares a DBRefEntry with it. If found, adds the sequence to foundSeqs and
376 * removes the resolved sourceRef from the search list.
380 * @param unresolvedRefs
382 * @param unresolvedRefs
// Iterates over 'unresolvedRefs' with an explicit Iterator, taking one dbref
// at a time.
// NOTE(review): the call to searchDatasetForCrossReference is commented out
// and 'found' is hard-wired to false in its place, so the dataset search is
// disabled as written - confirm whether that is deliberate.
386 static void findIndirectCrossReferences(List<SequenceI> fromSeqs,
387 String source, AlignmentI dataset,
388 List<SequenceI> foundSeqs, List<DBRefEntry> unresolvedRefs,
389 AlignedCodonFrame mappings)
391 Iterator<DBRefEntry> refs = unresolvedRefs.iterator();
392 while (refs.hasNext())
394 DBRefEntry dbref = refs.next();
395 boolean found = false;
396 // boolean found = searchDatasetForCrossReference(fromSeqs, dbref,
398 // unresolvedRefs, mappings);
407 * Searches the dataset for a sequence of opposite type to 'excluding', which
408 * has a cross-reference matching dbref. If found, adds the sequence to
409 * foundSeqs and removes dbref from the search list.
412 * a sequence to ignore (start point of search)
414 * a cross-reference to try to match
416 * sequences to search in
418 * result list to add to
420 * a set of sequence mappings to add to
421 * @return true if relationship found and sequence added
// Scans the dataset's sequences for ones of the other molecule type than
// 'excluding' that carry a dbref matching 'dbref' (as judged by
// DBRefUtils.searchRefs). When 'mappings' is non-null, candidate dbrefs that
// have a map with a target sequence and unequal from/to ratios contribute a
// mapping between 'excluding' and the matching sequence.
423 static boolean searchDatasetForCrossReference(SequenceI excluding,
424 DBRefEntry dbref, AlignmentI dataset, List<SequenceI> foundSeqs,
425 AlignedCodonFrame mappings)
// molecule type of the starting sequence; candidates must be of the other type
427 boolean fromNucleotide = Comparison.isNucleotide(excluding);
428 boolean found = false;
// guard against a dataset that has no sequence list
433 if (dataset.getSequences() == null)
// the sequence list is assigned to 'ds' and used as the lock object in one expression
438 synchronized (ds = dataset.getSequences())
440 for (SequenceI nxt : ds)
// a member of a dataset alignment is not expected to have its own dataset
// sequence; a warning is printed when it does
// NOTE(review): the message names "getProducts", which is not this method's
// name - confirm whether the text is stale
444 if (nxt.getDatasetSequence() != null)
447 .println("Implementation warning: getProducts passed a dataset alignment without dataset sequences in it!");
// identity comparison against the starting sequence and its dataset sequence
449 if (nxt == excluding || nxt == excluding.getDatasetSequence())
453 if (foundSeqs.contains(nxt))
456 * already added this sequence to cross-refs
460 boolean isDna = Comparison.isNucleotide(nxt);
461 if (isDna == fromNucleotide)
464 * skip this sequence - wrong molecule type
470 * check if this sequence has any dbref matching source and accession
471 * (version and mapping may differ)
473 List<DBRefEntry> candidates = DBRefUtils.searchRefs(
474 nxt.getDBRefs(), dbref);
476 if (candidates.isEmpty())
482 if (mappings != null)
484 // don't search if we aren't given a codon map object
485 for (DBRefEntry candidate : candidates)
487 if (candidate.hasMap())
489 Mapping mapping = candidate.getMap();
490 MapList map = mapping.getMap();
// require a target sequence and unequal from/to ratios
491 if (mapping.getTo() != null
492 && map.getFromRatio() != map.getToRatio())
496 // map is from dna seq to a protein product
497 mappings.addMap(excluding, nxt, map);
501 // map is from protein seq to its coding dna
502 mappings.addMap(nxt, excluding, map.getInverse());
515 * Updates any empty mappings in the cross-references with one to a compatible
516 * retrieved sequence if found, and adds any new mappings to the
// For each cross-reference in 'xrefs', looks up retrieved sequences whose id
// matches "<source>|<accession>" and, for each match, computes a MapList with
// AlignmentUtils.mapCdnaToProtein, stores it on the xref as a new Mapping to
// the matched sequence, and records it in 'mappings'.
// NOTE(review): 'fromSeqs' is declared as List<SequenceI> but is passed to
// mapCdnaToProtein, computeProteinFeatures and addMap in the position where
// other call sites in this file pass a single SequenceI (e.g.
// addMap(dss, mappedTo, mapList)) - confirm the intended per-sequence handling.
525 static void updateDbrefMappings(boolean dna, List<SequenceI> fromSeqs,
526 List<DBRefEntry> xrefs, SequenceI[] retrieved,
527 AlignedCodonFrame mappings)
// index the retrieved sequences for lookup by id
529 SequenceIdMatcher matcher = new SequenceIdMatcher(retrieved);
530 for (DBRefEntry xref : xrefs)
// lookup key has the form source|accession
534 String targetSeqName = xref.getSource() + "|"
535 + xref.getAccessionId();
536 SequenceI[] matches = matcher.findAllIdMatches(targetSeqName);
541 for (SequenceI seq : matches)
543 MapList mapping = null;
// two alternative argument orders for mapCdnaToProtein; the second result is inverted
546 mapping = AlignmentUtils.mapCdnaToProtein(seq, fromSeqs);
550 mapping = AlignmentUtils.mapCdnaToProtein(fromSeqs, seq);
553 mapping = mapping.getInverse();
// attach the computed map, targeting the matched sequence, to the cross-reference
558 xref.setMap(new Mapping(seq, mapping));
561 AlignmentUtils.computeProteinFeatures(fromSeqs, seq, mapping);
// two alternative orientations for the recorded mapping
565 mappings.addMap(fromSeqs, seq, mapping);
569 mappings.addMap(seq, fromSeqs, mapping.getInverse());