-package jalview.analysis;
-
-import jalview.analysis.CrossRef.MySequenceFeature;
-import jalview.datamodel.AlignedCodonFrame;
-import jalview.datamodel.Alignment;
-import jalview.datamodel.AlignmentI;
-import jalview.datamodel.DBRefEntry;
-import jalview.datamodel.Mapping;
-import jalview.datamodel.Sequence;
-import jalview.datamodel.SequenceFeature;
-import jalview.datamodel.SequenceI;
-import jalview.util.Comparison;
-import jalview.util.DBRefUtils;
-import jalview.util.MapList;
-import jalview.ws.SequenceFetcherFactory;
-import jalview.ws.seqfetcher.ASequenceFetcher;
-
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-public class CrossRefs
-{
- /*
- * A sub-class that ignores Parent attribute when comparing sequence
- * features. This avoids 'duplicate' CDS features that only
- * differ in their parent Transcript ids.
- */
- class MySequenceFeature extends SequenceFeature
- {
- private SequenceFeature feat;
-
- MySequenceFeature(SequenceFeature sf)
- {
- this.feat = sf;
- }
-
- @Override
- public boolean equals(Object o)
- {
- return feat.equals(o, true);
- }
- }
-
- /**
- * Finds cross-references for sequences from a specified source database.
- * These may be found in four ways:
- * <ul>
- * <li>as a DBRefEntry on the known sequence, which has a mapped-to sequence</li>
- * <li>a sequence of complementary type in the alignment dataset, which has a
- * DBRefEntry to one of the known sequence's 'direct' DBRefs</li>
- * <li>a sequence of complementary type in the alignment, which has a
- * DBRefEntry to one of the known sequence's 'cross-ref' DBRefs</li>
- * <li>by fetching the accession from the remote database</li>
- * </ul>
- *
- * @param seqs
- * the sequences whose cross-references we are searching for
- * @param dna
- * true if the sequences are from a nucleotide alignment, else false
- * @param source
- * the database source we want cross-references to
- * @param dataset
- * the alignment dataset the sequences belong to
- * @return an alignment containing cross-reference sequences, or null if none
- * found
- */
- public static AlignmentI findXrefSequences(SequenceI[] seqs, boolean dna,
- String source, AlignmentI dataset)
- {
- /*
- * filter to only those sequences of the right type (nucleotide/protein)
- */
- List<SequenceI> fromSeqs = new ArrayList<SequenceI>();
- for (SequenceI seq : seqs)
- {
- if (dna == Comparison.isNucleotide(seq))
- {
- fromSeqs.add(seq);
- }
- }
- return findXrefSequences(fromSeqs, dna, source, dataset);
- }
-
- /**
- * Finds cross-references for sequences from a specified source database.
- * These may be found in four ways:
- * <ul>
- * <li>as a DBRefEntry on the known sequence, which has a mapped-to sequence</li>
- * <li>a sequence of complementary type in the alignment dataset, which has a
- * DBRefEntry to one of the known sequence's 'direct' DBRefs</li>
- * <li>a sequence of complementary type in the alignment, which has a
- * DBRefEntry to one of the known sequence's 'cross-ref' DBRefs</li>
- * <li>by fetching the accession from the remote database</li>
- * </ul>
- *
- * @param seqs
- * the sequences whose cross-references we are searching for,
- * filtered to only those which are of the type denoted by 'dna'
- * @param dna
- * true if the sequences are from a nucleotide alignment, else false
- * @param source
- * the database source we want cross-references to
- * @param dataset
- * the alignment dataset the sequences belong to
- * @return an alignment containing cross-reference sequences, or null if none
- * found
- */
- static AlignmentI findXrefSequences(List<SequenceI> fromSeqs,
- boolean dna, String source, AlignmentI dataset)
- {
- List<SequenceI> foundSeqs = new ArrayList<SequenceI>();
- AlignedCodonFrame mappings = new AlignedCodonFrame();
-
- List<DBRefEntry> unresolvedRefs = new ArrayList<DBRefEntry>();
-
- /*
- * first extract any mapped sequences from sourceRefs
- * if successful, sequence is removed from fromSeqs
- * if unsuccessful, dbrefs are added to unresolvedRefs
- */
- findMappedDbrefs(fromSeqs, source, foundSeqs,
- unresolvedRefs, mappings);
-
- /*
- * then search the alignment dataset for dbref resolutions
- */
- findIndirectCrossReferences(fromSeqs, source, dataset, foundSeqs,
- unresolvedRefs, mappings);
-
- /*
- * fetch any remaining sourceRefs from the source database
- */
- fetchCrossReferences(fromSeqs, unresolvedRefs, foundSeqs, mappings,
- dna, dataset);
-
- if (foundSeqs.isEmpty())
- {
- return null;
- }
- AlignmentI crossRefs = new Alignment(
- foundSeqs.toArray(new SequenceI[foundSeqs.size()]));
- crossRefs.addCodonFrame(mappings);
- return crossRefs;
- }
-
- /**
- * Looks for DBRefEntrys to 'source' which have a mapping to a sequence. If
- * found, adds the sequence to foundSeqs and removes the dbref from the list.
- * DBRefs with no mapping are added to the 'unresolvedRefs' list (setting
- * version number to 0 i.e. use source and accession only).
- *
- * @param fromSeqs
- * the dataset sequences we are searching from
- * @param source
- * the database source we are searching dbrefs for
- * @param foundSeqs
- * a list of found sequences to add to
- * @param unresolvedRefs
- * a list of unresolved cross-references to add to
- * @param mappings
- * a set of sequence mappings to add to
- * @return
- */
- static void findMappedDbrefs(List<SequenceI> fromSeqs, String source,
- List<SequenceI> foundSeqs, List<DBRefEntry> unresolvedRefs,
- AlignedCodonFrame mappings)
- {
- Iterator<SequenceI> it = fromSeqs.iterator();
- while (it.hasNext())
- {
- SequenceI seq = it.next();
- SequenceI dss = seq.getDatasetSequence();
- dss = dss == null ? seq : dss;
-
- DBRefEntry[] dbRefs = seq.getDBRefs();
- if (dbRefs == null)
- {
- continue;
- }
- boolean resolved = false;
- for (DBRefEntry dbref : dbRefs)
- {
- if (!source.equals(dbref.getSource()))
- {
- continue;
- }
- DBRefEntry todo = new DBRefEntry(dbref.getSource(), "0",
- dbref.getAccessionId());
- Mapping map = dbref.getMap();
- if (map != null)
- {
- unresolvedRefs.remove(todo);
- resolved = true;
- SequenceI mappedTo = map.getTo();
- if (mappedTo != null)
- {
- foundSeqs.add(new Sequence(mappedTo));
-
- /*
- * check mapping is not 'direct' (it shouldn't be if we reach here)
- * and add mapping (dna-to-peptide or vice versa) to the set
- */
- MapList mapList = map.getMap();
- int fromRatio = mapList.getFromRatio();
- int toRatio = mapList.getToRatio();
- if (fromRatio != toRatio)
- {
- if (fromRatio == 3)
- {
- mappings.addMap(dss, mappedTo, mapList);
- }
- else
- {
- mappings.addMap(mappedTo, dss, mapList.getInverse());
- }
- }
- }
- }
- else
- {
- /*
- * no mapping to resolve dbref - add source+accession to list to resolve
- */
- if (!unresolvedRefs.contains(todo))
- {
- unresolvedRefs.add(todo);
- }
- }
- }
- if (resolved)
- {
- it.remove();
- }
- }
- }
-
- /**
- * Tries to fetch seq's database references to 'source' database, and add them
- * to the foundSeqs list. If found, tries to make a mapping between seq and
- * the retrieved sequence and insert it into the database reference.
- *
- * @param fromSeqs
- * @param sourceRefs
- * @param foundSeqs
- * @param mappings
- * @param dna
- */
- static void fetchCrossReferences(List<SequenceI> fromSeqs,
- List<DBRefEntry> sourceRefs, List<SequenceI> foundSeqs,
- AlignedCodonFrame mappings, boolean dna, AlignmentI dataset)
- {
- ASequenceFetcher sftch = SequenceFetcherFactory.getSequenceFetcher();
- SequenceI[] retrieved;
- try
- {
- retrieved = sftch.getSequences(sourceRefs, !dna);
- } catch (Exception e)
- {
- System.err.println("Problem whilst retrieving cross references: "
- + e.getMessage());
- e.printStackTrace();
- return;
- }
-
- if (retrieved == null)
- {
- return;
- }
- updateDbrefMappings(dna, fromSeqs, sourceRefs, retrieved, mappings);
-
- SequenceIdMatcher matcher = new SequenceIdMatcher(
- dataset.getSequences());
- List<SequenceFeature> copiedFeatures = new ArrayList<SequenceFeature>();
- CrossRefs me = new CrossRefs();
- for (int rs = 0; rs < retrieved.length; rs++)
- {
- // TODO: examine each sequence for 'redundancy'
- DBRefEntry[] dbr = retrieved[rs].getDBRefs();
- if (dbr != null && dbr.length > 0)
- {
- for (int di = 0; di < dbr.length; di++)
- {
- // find any entry where we should put in the sequence being
- // cross-referenced into the map
- Mapping map = dbr[di].getMap();
- if (map != null)
- {
- if (map.getTo() != null && map.getMap() != null)
- {
- SequenceI matched = matcher.findIdMatch(map.getTo());
- if (matched != null)
- {
- /*
- * already got an xref to this sequence; update this
- * map to point to the same sequence, and add
- * any new dbrefs to it
- */
- for (DBRefEntry ref : map.getTo().getDBRefs())
- {
- matched.addDBRef(ref); // add or update mapping
- }
- map.setTo(matched);
- }
- else
- {
- matcher.add(map.getTo());
- }
- try
- {
- // compare ms with dss and replace with dss in mapping
- // if map is congruent
- SequenceI ms = map.getTo();
- int sf = map.getMap().getToLowest();
- int st = map.getMap().getToHighest();
- SequenceI mappedrg = ms.getSubSequence(sf, st);
- // SequenceI loc = dss.getSubSequence(sf, st);
- if (mappedrg.getLength() > 0
- && ms.getSequenceAsString().equals(
- fromSeqs.getSequenceAsString()))
- // && mappedrg.getSequenceAsString().equals(
- // loc.getSequenceAsString()))
- {
- String msg = "Mapping updated from " + ms.getName()
- + " to retrieved crossreference "
- + fromSeqs.getName();
- System.out.println(msg);
- // method to update all refs of existing To on
- // retrieved sequence with dss and merge any props
- // on To onto dss.
- map.setTo(fromSeqs);
- /*
- * copy sequence features as well, avoiding
- * duplication (e.g. same variation from 2
- * transcripts)
- */
- SequenceFeature[] sfs = ms.getSequenceFeatures();
- if (sfs != null)
- {
- for (SequenceFeature feat : sfs)
- {
- /*
- * we override SequenceFeature.equals here (but
- * not elsewhere) to ignore Parent attribute
- * TODO not quite working yet!
- */
- if (!copiedFeatures
- .contains(me.new MySequenceFeature(feat)))
- {
- fromSeqs.addSequenceFeature(feat);
- copiedFeatures.add(feat);
- }
- }
- }
- }
- mappings.addMap(retrieved[rs].getDatasetSequence(),
- map.getTo(), map.getMap());
- } catch (Exception e)
- {
- System.err
- .println("Exception when consolidating Mapped sequence set...");
- e.printStackTrace(System.err);
- }
- }
- }
- }
- }
- retrieved[rs].updatePDBIds();
- foundSeqs.add(retrieved[rs]);
- }
- }
-
- /**
- * Searches the alignment for a sequence of complementary type to 'seq' which
- * shares a DBRefEntry with it. If found, adds the sequence to foundSeqs and
- * removes the resolved sourceRef from the search list.
- *
- * @param fromSeqs
- * @param source
- * @param unresolvedRefs
- * @param foundSeqs
- * @param unresolvedRefs
- * @param mappings
- * @return
- */
- static void findIndirectCrossReferences(List<SequenceI> fromSeqs,
- String source, AlignmentI dataset,
- List<SequenceI> foundSeqs, List<DBRefEntry> unresolvedRefs,
- AlignedCodonFrame mappings)
- {
- Iterator<DBRefEntry> refs = unresolvedRefs.iterator();
- while (refs.hasNext())
- {
- DBRefEntry dbref = refs.next();
- boolean found = false;
- // boolean found = searchDatasetForCrossReference(fromSeqs, dbref,
- // foundSeqs,
- // unresolvedRefs, mappings);
- if (found)
- {
- refs.remove();
- }
- }
- }
-
- /**
- * Searches the dataset for a sequence of opposite type to 'excluding', which
- * has a cross-reference matching dbref. If found, adds the sequence to
- * foundSeqs and removes dbref from the search list.
- *
- * @param excluding
- * a sequence to ignore (start point of search)
- * @param dbref
- * a cross-reference to try to match
- * @param dataset
- * sequences to search in
- * @param foundSeqs
- * result list to add to
- * @param mappings
- * a set of sequence mappings to add to
- * @return true if relationship found and sequence added
- */
- static boolean searchDatasetForCrossReference(SequenceI excluding,
- DBRefEntry dbref, AlignmentI dataset, List<SequenceI> foundSeqs,
- AlignedCodonFrame mappings)
- {
- boolean fromNucleotide = Comparison.isNucleotide(excluding);
- boolean found = false;
- if (dataset == null)
- {
- return false;
- }
- if (dataset.getSequences() == null)
- {
- return false;
- }
- List<SequenceI> ds;
- synchronized (ds = dataset.getSequences())
- {
- for (SequenceI nxt : ds)
- {
- if (nxt != null)
- {
- if (nxt.getDatasetSequence() != null)
- {
- System.err
- .println("Implementation warning: getProducts passed a dataset alignment without dataset sequences in it!");
- }
- if (nxt == excluding || nxt == excluding.getDatasetSequence())
- {
- continue;
- }
- if (foundSeqs.contains(nxt))
- {
- /*
- * already added this sequence to cross-refs
- */
- continue;
- }
- boolean isDna = Comparison.isNucleotide(nxt);
- if (isDna == fromNucleotide)
- {
- /*
- * skip this sequence - wrong molecule type
- */
- continue;
- }
-
- /*
- * check if this sequence has any dbref matching source and accession
- * (version and mapping may differ)
- */
- List<DBRefEntry> candidates = DBRefUtils.searchRefs(
- nxt.getDBRefs(), dbref);
-
- if (candidates.isEmpty())
- {
- continue;
- }
- found = true;
- foundSeqs.add(nxt);
- if (mappings != null)
- {
- // don't search if we aren't given a codon map object
- for (DBRefEntry candidate : candidates)
- {
- if (candidate.hasMap())
- {
- Mapping mapping = candidate.getMap();
- MapList map = mapping.getMap();
- if (mapping.getTo() != null
- && map.getFromRatio() != map.getToRatio())
- {
- if (fromNucleotide)
- {
- // map is from dna seq to a protein product
- mappings.addMap(excluding, nxt, map);
- }
- else
- {
- // map is from protein seq to its coding dna
- mappings.addMap(nxt, excluding, map.getInverse());
- }
- }
- }
- }
- }
- }
- }
- }
- return found;
- }
-
- /**
- * Updates any empty mappings in the cross-references with one to a compatible
- * retrieved sequence if found, and adds any new mappings to the
- * AlignedCodonFrame
- *
- * @param dna
- * @param fromSeqs
- * @param xrefs
- * @param retrieved
- * @param mappings
- */
- static void updateDbrefMappings(boolean dna, List<SequenceI> fromSeqs,
- List<DBRefEntry> xrefs, SequenceI[] retrieved,
- AlignedCodonFrame mappings)
- {
- SequenceIdMatcher matcher = new SequenceIdMatcher(retrieved);
- for (DBRefEntry xref : xrefs)
- {
- if (!xref.hasMap())
- {
- String targetSeqName = xref.getSource() + "|"
- + xref.getAccessionId();
- SequenceI[] matches = matcher.findAllIdMatches(targetSeqName);
- if (matches == null)
- {
- return;
- }
- for (SequenceI seq : matches)
- {
- MapList mapping = null;
- if (dna)
- {
- mapping = AlignmentUtils.mapCdnaToProtein(seq, fromSeqs);
- }
- else
- {
- mapping = AlignmentUtils.mapCdnaToProtein(fromSeqs, seq);
- if (mapping != null)
- {
- mapping = mapping.getInverse();
- }
- }
- if (mapping != null)
- {
- xref.setMap(new Mapping(seq, mapping));
- if (dna)
- {
- AlignmentUtils.computeProteinFeatures(fromSeqs, seq, mapping);
- }
- if (dna)
- {
- mappings.addMap(fromSeqs, seq, mapping);
- }
- else
- {
- mappings.addMap(seq, fromSeqs, mapping.getInverse());
- }
- continue;
- }
- }
- }
- }
- }
-}