X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=src%2Fjalview%2Fanalysis%2FCrossRef.java;h=d96ab58e34d87b6e7c2bdce4f42e67fac82c2b04;hb=b10330c2e330e409d2fd3812f1001f9542dc1b36;hp=4e8f070ac7d44ba08482716ef51030c61d924262;hpb=9c39e96af6b84257604da448101505361dced686;p=jalview.git diff --git a/src/jalview/analysis/CrossRef.java b/src/jalview/analysis/CrossRef.java index 4e8f070..d96ab58 100644 --- a/src/jalview/analysis/CrossRef.java +++ b/src/jalview/analysis/CrossRef.java @@ -54,12 +54,6 @@ public class CrossRef private AlignmentI dataset; /* - * true if we are searching for cross-references from nucleotide, - * i.e. for protein sequences, false if the reverse - */ - private boolean fromDna; - - /* * the sequences for which we are seeking cross-references */ private SequenceI[] fromSeqs; @@ -76,7 +70,6 @@ public class CrossRef public CrossRef(SequenceI[] seqs, AlignmentI ds) { fromSeqs = seqs; - fromDna = ds.isNucleotide(); dataset = ds.getDataset() == null ? ds : ds.getDataset(); } @@ -88,16 +81,20 @@ public class CrossRef * reference from another sequence in the dataset which has a cross-reference * to a direct DBRefEntry on the given sequence * + * + * @param dna + * - when true, cross-references *from* dna returned. When false, + * cross-references *from* protein are returned * @return */ - public List findXrefSourcesForSequences() + public List findXrefSourcesForSequences(boolean dna) { List sources = new ArrayList(); for (SequenceI seq : fromSeqs) { if (seq != null) { - findXrefSourcesForSequence(seq, sources); + findXrefSourcesForSequence(seq, dna, sources); } } return sources; @@ -117,7 +114,8 @@ public class CrossRef * @param sources * a list of sources to add matches to */ - void findXrefSourcesForSequence(SequenceI seq, List sources) + void findXrefSourcesForSequence(SequenceI seq, boolean fromDna, + List sources) { /* * first find seq's xrefs (dna-to-peptide or peptide-to-dna) @@ -136,7 +134,7 @@ public class CrossRef * find sequences in the alignment which xref one of these DBRefs * i.e. is xref-ed to a common sequence identifier */ - searchDatasetXrefs(seq, lrfs, rseqs, null); + searchDatasetXrefs(fromDna, seq, lrfs, rseqs, null); /* * add those sequences' (dna-to-peptide or peptide-to-dna) dbref sources @@ -163,7 +161,10 @@ public class CrossRef { for (DBRefEntry ref : xrefs) { - String source = ref.getSource(); + /* + * avoid duplication e.g. ENSEMBL and Ensembl + */ + String source = DBRefUtils.getCanonicalName(ref.getSource()); if (!sources.contains(source)) { sources.add(source); @@ -173,19 +174,27 @@ public class CrossRef } /** + * Attempts to find cross-references from the sequences provided in the + * constructor to the given source database. Cross-references may be found + *
    + *
  • in dbrefs on the sequence which hold a mapping to a sequence + *
      + *
    • provided with a fetched sequence (e.g. ENA translation), or
    • + *
    • populated previously after getting cross-references
    • + *
    + *
  • as other sequences in the alignment which share a dbref identifier with + * the sequence
  • + *
  • by fetching from the remote database
  • + *
+ * The cross-referenced sequences, and mappings to them, are added to the + * alignment dataset. * - * @param seqs - * sequences whose xrefs are being retrieved - * @param dna - * true if sequences are nucleotide * @param source - * @param al - * alignment to search for cross-referenced sequences (and possibly - * add to) - * @return products (as dataset sequences) + * @return cross-referenced sequences (as dataset sequences) */ - public Alignment findXrefSequences(String source) + public Alignment findXrefSequences(String source, boolean fromDna) { + List rseqs = new ArrayList(); AlignedCodonFrame cf = new AlignedCodonFrame(); SequenceIdMatcher matcher = new SequenceIdMatcher( @@ -215,7 +224,7 @@ public class CrossRef * which have a dbref to an accession id for this sequence, * and add them to the results */ - found = searchDatasetXrefs(dss, lrfs, rseqs, cf); + found = searchDatasetXrefs(fromDna, dss, lrfs, rseqs, cf); } if (xrfs == null && !found) { @@ -244,13 +253,20 @@ public class CrossRef * for example: UNIPROT {P0CE19, P0CE20} -> EMBL {J03321, X06707} */ found = true; - SequenceI matchInDataset = findInDataset(mappedTo);// matcher.findIdMatch(mappedTo); + /* + * problem: matcher.findIdMatch() is lenient - returns a sequence + * with a dbref to the search arg e.g. ENST for ENSP - wrong + * but findInDataset() matches ENSP when looking for Uniprot... + */ + SequenceI matchInDataset = findInDataset(xref); + /*matcher.findIdMatch(mappedTo);*/ if (matchInDataset != null) { if (!rseqs.contains(matchInDataset)) { rseqs.add(matchInDataset); } + refIterator.remove(); continue; } SequenceI rsq = new Sequence(mappedTo); @@ -279,7 +295,7 @@ public class CrossRef + xref.getAccessionId()); if (matchedSeq != null) { - if (constructMapping(seq, matchedSeq, xref, cf)) + if (constructMapping(seq, matchedSeq, xref, cf, fromDna)) { found = true; } @@ -290,7 +306,7 @@ public class CrossRef { // do a bit more work - search for sequences with references matching // xrefs on this sequence. - found = searchDataset(dss, xref, rseqs, cf, false); + found = searchDataset(fromDna, dss, xref, rseqs, cf, false); } if (found) { @@ -319,7 +335,7 @@ public class CrossRef if (retrieved != null) { - updateDbrefMappings(seq, xrfs, retrieved, cf); + updateDbrefMappings(seq, xrfs, retrieved, cf, fromDna); for (SequenceI retrievedSequence : retrieved) { SequenceI retrievedDss = retrievedSequence.getDatasetSequence() == null ? retrievedSequence @@ -337,8 +353,11 @@ public class CrossRef if (map.getTo() != null && map.getMap() != null) { // TODO findInDataset requires exact sequence match but - // 'congruent' test only for the mapped part - SequenceI matched = findInDataset(map.getTo());// matcher.findIdMatch(map.getTo()); + // 'congruent' test is only for the mapped part + // maybe not a problem in practice since only ENA provide a + // mapping and it is to the full protein translation of CDS + SequenceI matched = findInDataset(dbref); + // matcher.findIdMatch(map.getTo()); if (matched != null) { /* @@ -379,15 +398,17 @@ public class CrossRef + " to retrieved crossreference " + dss.getName(); System.out.println(msg); - // method to update all refs of existing To on - // retrieved sequence with dss and merge any props - // on To onto dss. - // TODO don't we have to change the mapped to ranges - // if not to the whole sequence? map.setTo(dss); + + /* + * give the reverse reference the inverse mapping + * (if it doesn't have one already) + */ + setReverseMapping(dss, dbref, cf); + /* * copy sequence features as well, avoiding - * duplication (e.g. same variation from 2 + * duplication (e.g. same variation from two * transcripts) */ SequenceFeature[] sfs = ms.getSequenceFeatures(); @@ -397,7 +418,7 @@ public class CrossRef { /* * make a flyweight feature object which ignores Parent - * attribute in equality test, to avoid creating many + * attribute in equality test; this avoids creating many * otherwise duplicate exon features on genomic sequence */ SequenceFeature newFeature = new SequenceFeature( @@ -425,9 +446,9 @@ public class CrossRef } } retrievedSequence.updatePDBIds(); - rseqs.add(retrievedSequence); + rseqs.add(retrievedDss); dataset.addSequence(retrievedDss); - matcher.add(retrievedSequence); + matcher.add(retrievedDss); } } } @@ -437,33 +458,85 @@ public class CrossRef if (rseqs.size() > 0) { ral = new Alignment(rseqs.toArray(new SequenceI[rseqs.size()])); - if (cf != null && !cf.isEmpty()) + if (!cf.isEmpty()) { - ral.addCodonFrame(cf); + dataset.addCodonFrame(cf); } } return ral; } /** + * Sets the inverse sequence mapping in the corresponding dbref of the mapped + * to sequence (if any). This is used after fetching a cross-referenced + * sequence, if the fetched sequence has a mapping to the original sequence, + * to set the mapping in the original sequence's dbref. + * + * @param mapFrom + * the sequence mapped from + * @param dbref + * @param mappings + */ + void setReverseMapping(SequenceI mapFrom, DBRefEntry dbref, + AlignedCodonFrame mappings) + { + SequenceI mapTo = dbref.getMap().getTo(); + if (mapTo == null) + { + return; + } + DBRefEntry[] dbrefs = mapTo.getDBRefs(); + if (dbrefs == null) + { + return; + } + for (DBRefEntry toRef : dbrefs) + { + if (toRef.hasMap() && mapFrom == toRef.getMap().getTo()) + { + /* + * found the reverse dbref; update its mapping if null + */ + if (toRef.getMap().getMap() == null) + { + MapList inverse = dbref.getMap().getMap().getInverse(); + toRef.getMap().setMap(inverse); + mappings.addMap(mapTo, mapFrom, inverse); + } + } + } + } + + /** * Returns the first identical sequence in the dataset if any, else null * - * @param mappedTo + * @param xref * @return */ - SequenceI findInDataset(SequenceI mappedTo) + SequenceI findInDataset(DBRefEntry xref) { - if (mappedTo == null) + if (xref == null || !xref.hasMap() || xref.getMap().getTo() == null) { return null; } - SequenceI dss = mappedTo.getDatasetSequence() == null ? mappedTo - : mappedTo.getDatasetSequence(); + SequenceI mapsTo = xref.getMap().getTo(); + String name = xref.getAccessionId(); + String name2 = xref.getSource() + "|" + name; + SequenceI dss = mapsTo.getDatasetSequence() == null ? mapsTo : mapsTo + .getDatasetSequence(); for (SequenceI seq : dataset.getSequences()) { - if (sameSequence(seq, dss)) + /* + * clumsy alternative to using SequenceIdMatcher which currently + * returns sequences with a dbref to the matched accession id + * which we don't want + */ + if (name.equals(seq.getName()) || seq.getName().startsWith(name2)) { - return seq; + if (sameSequence(seq, dss)) + { + return seq; + } } } return null; @@ -520,8 +593,8 @@ public class CrossRef * @param retrieved * @param acf */ - void updateDbrefMappings(SequenceI mapFrom, - DBRefEntry[] xrefs, SequenceI[] retrieved, AlignedCodonFrame acf) + void updateDbrefMappings(SequenceI mapFrom, DBRefEntry[] xrefs, + SequenceI[] retrieved, AlignedCodonFrame acf, boolean fromDna) { SequenceIdMatcher matcher = new SequenceIdMatcher(retrieved); for (DBRefEntry xref : xrefs) @@ -537,16 +610,25 @@ public class CrossRef } for (SequenceI seq : matches) { - constructMapping(mapFrom, seq, xref, acf); + constructMapping(mapFrom, seq, xref, acf, fromDna); } } } } /** - * Tries to make a mapping from dna to protein. If successful, adds the - * mapping to the dbref and the mappings collection and answers true, - * otherwise answers false. + * Tries to make a mapping between sequences. If successful, adds the mapping + * to the dbref and the mappings collection and answers true, otherwise + * answers false. The following methods of making are mapping are tried in + * turn: + *
    + *
  • if 'mapTo' holds a mapping to 'mapFrom', take the inverse; this is, for + * example, the case after fetching EMBL cross-references for a Uniprot + * sequence
  • + *
  • else check if the dna translates exactly to the protein (give or take + * start and stop codons>
  • + *
  • else try to map based on CDS features on the dna sequence
  • + *
* * @param mapFrom * @param mapTo @@ -555,9 +637,32 @@ public class CrossRef * @return */ boolean constructMapping(SequenceI mapFrom, SequenceI mapTo, - DBRefEntry xref, AlignedCodonFrame mappings) + DBRefEntry xref, AlignedCodonFrame mappings, boolean fromDna) { MapList mapping = null; + + /* + * look for a reverse mapping, if found make its inverse + */ + if (mapTo.getDBRefs() != null) + { + for (DBRefEntry dbref : mapTo.getDBRefs()) + { + String name = dbref.getSource() + "|" + dbref.getAccessionId(); + if (dbref.hasMap() && mapFrom.getName().startsWith(name)) + { + /* + * looks like we've found a map from 'mapTo' to 'mapFrom' + * - invert it to make the mapping the other way + */ + MapList reverse = dbref.getMap().getMap().getInverse(); + xref.setMap(new Mapping(mapTo, reverse)); + mappings.addMap(mapFrom, mapTo, reverse); + return true; + } + } + } + if (fromDna) { mapping = AlignmentUtils.mapCdnaToProtein(mapTo, mapFrom); @@ -593,12 +698,15 @@ public class CrossRef * dataset (that is not equal to sequenceI) Identifies matching DBRefEntry * based on source and accession string only - Map and Version are nulled. * + * @param fromDna + * - true if context was searching from Dna sequences, false if + * context was searching from Protein sequences * @param sequenceI * @param lrfs * @param rseqs * @return true if matches were found. */ - private boolean searchDatasetXrefs(SequenceI sequenceI, + private boolean searchDatasetXrefs(boolean fromDna, SequenceI sequenceI, DBRefEntry[] lrfs, List rseqs, AlignedCodonFrame cf) { boolean found = false; @@ -612,7 +720,7 @@ public class CrossRef // add in wildcards xref.setVersion(null); xref.setMap(null); - found |= searchDataset(sequenceI, xref, rseqs, cf, false); + found |= searchDataset(fromDna, sequenceI, xref, rseqs, cf, false); } return found; } @@ -621,6 +729,9 @@ public class CrossRef * Searches dataset for DBRefEntrys matching the given one (xrf) and adds the * associated sequence to rseqs * + * @param fromDna + * true if context was searching for refs *from* dna sequence, false + * if context was searching for refs *from* protein sequence * @param sequenceI * a sequence to ignore (start point of search) * @param xrf @@ -633,8 +744,9 @@ public class CrossRef * - search all references or only subset * @return true if relationship found and sequence added. */ - boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf, - List rseqs, AlignedCodonFrame cf, boolean direct) + boolean searchDataset(boolean fromDna, SequenceI sequenceI, + DBRefEntry xrf, List rseqs, AlignedCodonFrame cf, + boolean direct) { boolean found = false; if (dataset == null)