src/jalview/analysis/CrossRef.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis;
  22
  23 import jalview.datamodel.AlignedCodonFrame;
  24 import jalview.datamodel.Alignment;
  25 import jalview.datamodel.AlignmentI;
  26 import jalview.datamodel.DBRefEntry;
  27 import jalview.datamodel.Mapping;
  28 import jalview.datamodel.Sequence;
  29 import jalview.datamodel.SequenceFeature;
  30 import jalview.datamodel.SequenceI;
  31 import jalview.util.DBRefUtils;
  32 import jalview.util.MapList;
  33 import jalview.ws.SequenceFetcherFactory;
  34 import jalview.ws.seqfetcher.ASequenceFetcher;
  35
  36 import java.util.ArrayList;
  37 import java.util.Iterator;
  38 import java.util.List;
  39
  40 /**
  41  * Functions for cross-referencing sequence databases.
  42  *
  43  * @author JimP
  44  *
  45  */
  46 public class CrossRef
  47 {
  /*
   * the dataset of the alignment for which we are searching for
   * cross-references; in some cases we may resolve xrefs by
   * searching in the dataset (set once in the constructor)
   */
  private AlignmentI dataset;

  /*
   * the sequences for which we are seeking cross-references
   * (entries may be null - see findXrefSourcesForSequences)
   */
  private SequenceI[] fromSeqs;

  /**
   * id matcher built from the dataset's sequences; re-created on each call to
   * findXrefSequences and extended as sequences are retrieved
   */
  SequenceIdMatcher matcher;

  /**
   * sequences found by cross-ref searches to fromSeqs; re-created on each call
   * to findXrefSequences
   */
  List<SequenceI> rseqs;

  /**
   * mappings constructed during the current findXrefSequences call
   */
  AlignedCodonFrame cf;
  74
  75   /**
  76    * Constructor
  77    *
  78    * @param seqs
  79    *          the sequences for which we are seeking cross-references
  80    * @param ds
  81    *          the containing alignment dataset (may be searched to resolve
  82    *          cross-references)
  83    */
  84   public CrossRef(SequenceI[] seqs, AlignmentI ds)
  85   {
  86     fromSeqs = seqs;
  87     dataset = ds.getDataset() == null ? ds : ds.getDataset();
  88   }
  89
  90   /**
  91    * Returns a list of distinct database sources for which sequences have either
  92    * <ul>
  93    * <li>a (dna-to-protein or protein-to-dna) cross-reference</li>
  94    * <li>an indirect cross-reference - a (dna-to-protein or protein-to-dna)
  95    * reference from another sequence in the dataset which has a cross-reference
  96    * to a direct DBRefEntry on the given sequence</li>
  97    * </ul>
  98    *
  99    * @param dna
 100    *          - when true, cross-references *from* dna returned. When false,
 101    *          cross-references *from* protein are returned
 102    * @return
 103    */
 104   public List<String> findXrefSourcesForSequences(boolean dna)
 105   {
 106     List<String> sources = new ArrayList<String>();
 107     for (SequenceI seq : fromSeqs)
 108     {
 109       if (seq != null)
 110       {
 111         findXrefSourcesForSequence(seq, dna, sources);
 112       }
 113     }
 114     return sources;
 115   }
 116
 117   /**
 118    * Returns a list of distinct database sources for which a sequence has either
 119    * <ul>
 120    * <li>a (dna-to-protein or protein-to-dna) cross-reference</li>
 121    * <li>an indirect cross-reference - a (dna-to-protein or protein-to-dna)
 122    * reference from another sequence in the dataset which has a cross-reference
 123    * to a direct DBRefEntry on the given sequence</li>
 124    * </ul>
 125    *
 126    * @param seq
 127    *          the sequence whose dbrefs we are searching against
 128    * @param fromDna
 129    *          when true, context is DNA - so sources identifying protein
 130    *          products will be returned.
 131    * @param sources
 132    *          a list of sources to add matches to
 133    */
 134   void findXrefSourcesForSequence(SequenceI seq, boolean fromDna,
 135           List<String> sources)
 136   {
 137     /*
 138      * first find seq's xrefs (dna-to-peptide or peptide-to-dna)
 139      */
 140     DBRefEntry[] rfs = DBRefUtils.selectDbRefs(!fromDna, seq.getDBRefs());
 141     addXrefsToSources(rfs, sources);
 142     if (dataset != null)
 143     {
 144       /*
 145        * find sequence's direct (dna-to-dna, peptide-to-peptide) xrefs
 146        */
 147       DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(fromDna, seq.getDBRefs());
 148       List<SequenceI> foundSeqs = new ArrayList<SequenceI>();
 149
 150       /*
 151        * find sequences in the alignment which xref one of these DBRefs
 152        * i.e. is xref-ed to a common sequence identifier
 153        */
 154       searchDatasetXrefs(fromDna, seq, lrfs, foundSeqs, null);
 155
 156       /*
 157        * add those sequences' (dna-to-peptide or peptide-to-dna) dbref sources
 158        */
 159       for (SequenceI rs : foundSeqs)
 160       {
 161         DBRefEntry[] xrs = DBRefUtils
 162                 .selectDbRefs(!fromDna, rs.getDBRefs());
 163         addXrefsToSources(xrs, sources);
 164       }
 165     }
 166   }
 167
 168   /**
 169    * Helper method that adds the source identifiers of some cross-references to
 170    * a (non-redundant) list of database sources
 171    *
 172    * @param xrefs
 173    * @param sources
 174    */
 175   void addXrefsToSources(DBRefEntry[] xrefs, List<String> sources)
 176   {
 177     if (xrefs != null)
 178     {
 179       for (DBRefEntry ref : xrefs)
 180       {
 181         /*
 182          * avoid duplication e.g. ENSEMBL and Ensembl
 183          */
 184         String source = DBRefUtils.getCanonicalName(ref.getSource());
 185         if (!sources.contains(source))
 186         {
 187           sources.add(source);
 188         }
 189       }
 190     }
 191   }
 192
 193   /**
 194    * Attempts to find cross-references from the sequences provided in the
 195    * constructor to the given source database. Cross-references may be found
 196    * <ul>
 197    * <li>in dbrefs on the sequence which hold a mapping to a sequence
 198    * <ul>
 199    * <li>provided with a fetched sequence (e.g. ENA translation), or</li>
 200    * <li>populated previously after getting cross-references</li>
 201    * </ul>
 202    * <li>as other sequences in the alignment which share a dbref identifier with
 203    * the sequence</li>
 204    * <li>by fetching from the remote database</li>
 205    * </ul>
 206    * The cross-referenced sequences, and mappings to them, are added to the
 207    * alignment dataset.
 208    *
 209    * @param source
 210    * @return cross-referenced sequences (as dataset sequences)
 211    */
 212   public Alignment findXrefSequences(String source, boolean fromDna)
 213   {
 214
 215     rseqs = new ArrayList<SequenceI>();
 216     cf = new AlignedCodonFrame();
 217     matcher = new SequenceIdMatcher(
 218             dataset.getSequences());
 219
 220     for (SequenceI seq : fromSeqs)
 221     {
 222       SequenceI dss = seq;
 223       while (dss.getDatasetSequence() != null)
 224       {
 225         dss = dss.getDatasetSequence();
 226       }
 227       boolean found = false;
 228       DBRefEntry[] xrfs = DBRefUtils
 229               .selectDbRefs(!fromDna, dss.getDBRefs());
 230       if ((xrfs == null || xrfs.length == 0) && dataset != null)
 231       {
 232         /*
 233          * found no suitable dbrefs on sequence - look for sequences in the
 234          * alignment which share a dbref with this one
 235          */
 236         DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(fromDna,
 237                 seq.getDBRefs());
 238
 239         /*
 240          * find sequences (except this one!), of complementary type,
 241          *  which have a dbref to an accession id for this sequence,
 242          *  and add them to the results
 243          */
 244         found = searchDatasetXrefs(fromDna, dss, lrfs, rseqs, cf);
 245       }
 246       if (xrfs == null && !found)
 247       {
 248         /*
 249          * no dbref to source on this sequence or matched
 250          * complementary sequence in the dataset
 251          */
 252         continue;
 253       }
 254       List<DBRefEntry> sourceRefs = DBRefUtils.searchRefsForSource(xrfs,
 255               source);
 256       Iterator<DBRefEntry> refIterator = sourceRefs.iterator();
 257       while (refIterator.hasNext())
 258       {
 259         DBRefEntry xref = refIterator.next();
 260         found = false;
 261         if (xref.hasMap())
 262         {
 263           SequenceI mappedTo = xref.getMap().getTo();
 264           if (mappedTo != null)
 265           {
 266             /*
 267              * dbref contains the sequence it maps to; add it to the
 268              * results unless we have done so already (could happen if
 269              * fetching xrefs for sequences which have xrefs in common)
 270              * for example: UNIPROT {P0CE19, P0CE20} -> EMBL {J03321, X06707}
 271              */
 272             found = true;
 273             /*
 274              * problem: matcher.findIdMatch() is lenient - returns a sequence
 275              * with a dbref to the search arg e.g. ENST for ENSP - wrong
 276              * but findInDataset() matches ENSP when looking for Uniprot...
 277              */
 278             SequenceI matchInDataset = findInDataset(xref);
 279             /*matcher.findIdMatch(mappedTo);*/
 280             if (matchInDataset != null)
 281             {
 282               if (!rseqs.contains(matchInDataset))
 283               {
 284                 rseqs.add(matchInDataset);
 285               }
 286               refIterator.remove();
 287               continue;
 288             }
 289             SequenceI rsq = new Sequence(mappedTo);
 290             rseqs.add(rsq);
 291             if (xref.getMap().getMap().getFromRatio() != xref.getMap()
 292                     .getMap().getToRatio())
 293             {
 294               // get sense of map correct for adding to product alignment.
 295               if (fromDna)
 296               {
 297                 // map is from dna seq to a protein product
 298                 cf.addMap(dss, rsq, xref.getMap().getMap(), xref.getMap()
 299                         .getMappedFromId());
 300               }
 301               else
 302               {
 303                 // map should be from protein seq to its coding dna
 304                 cf.addMap(rsq, dss, xref.getMap().getMap().getInverse(),
 305                         xref.getMap().getMappedFromId());
 306               }
 307             }
 308           }
 309         }
 310
 311         if (!found)
 312         {
 313           SequenceI matchedSeq = matcher.findIdMatch(xref.getSource() + "|"
 314                   + xref.getAccessionId());
 315           if (matchedSeq != null)
 316           {
 317             if (constructMapping(seq, matchedSeq, xref, cf, fromDna))
 318             {
 319               found = true;
 320             }
 321           }
 322         }
 323
 324         if (!found)
 325         {
 326           // do a bit more work - search for sequences with references matching
 327           // xrefs on this sequence.
 328           found = searchDataset(fromDna, dss, xref, rseqs, cf, false);
 329         }
 330         if (found)
 331         {
 332           refIterator.remove();
 333         }
 334       }
 335
 336       /*
 337        * fetch from source database any dbrefs we haven't resolved up to here
 338        */
 339       if (!sourceRefs.isEmpty())
 340       {
 341         retrieveCrossRef(sourceRefs, seq, xrfs, fromDna);
 342       }
 343     }
 344
 345     Alignment ral = null;
 346     if (rseqs.size() > 0)
 347     {
 348       ral = new Alignment(rseqs.toArray(new SequenceI[rseqs.size()]));
 349       if (!cf.isEmpty())
 350       {
 351         dataset.addCodonFrame(cf);
 352       }
 353     }
 354     return ral;
 355   }
 356
  /**
   * Fetches the sequences for the given (unresolved) cross-references from
   * their source database, and merges them into the results of the current
   * search. For each retrieved sequence, any of its dbrefs that carries a
   * mapping to a sequence is reconciled with the dataset (re-pointing the
   * mapping at a matching dataset sequence, or at the searched-from sequence
   * when the mapped sequence's string equals it), and the mapping is added to
   * the codon frame 'cf'. The retrieved sequence's dataset sequence is then
   * added to the results list 'rseqs', to the dataset, and to the id matcher.
   * A failure to fetch is reported on stderr and otherwise ignored.
   * 
   * @param sourceRefs
   *          the cross-references to fetch
   * @param seq
   *          the sequence whose cross-references are being resolved
   * @param xrfs
   *          the complementary-type dbrefs of seq, whose empty mappings may be
   *          filled in from the retrieved sequences
   * @param fromDna
   *          true if seq is dna (so products are fetched), false if protein
   */
  private void retrieveCrossRef(List<DBRefEntry> sourceRefs, SequenceI seq,
          DBRefEntry[] xrfs, boolean fromDna)
  {
    ASequenceFetcher sftch = SequenceFetcherFactory.getSequenceFetcher();
    SequenceI[] retrieved = null;
    // the dataset sequence of seq (one level up), or seq itself if it has none
    SequenceI dss = seq.getDatasetSequence() == null ? seq : seq
            .getDatasetSequence();
    try
    {
      retrieved = sftch.getSequences(sourceRefs, !fromDna);
    } catch (Exception e)
    {
      // best effort: report and carry on with nothing retrieved
      System.err
              .println("Problem whilst retrieving cross references for Sequence : "
                      + seq.getName());
      e.printStackTrace();
    }

    if (retrieved != null)
    {
      // fill in any dbrefs on seq that lack a mapping to a retrieved sequence
      updateDbrefMappings(seq, xrfs, retrieved, cf, fromDna);
      for (SequenceI retrievedSequence : retrieved)
      {
        // dataset gets contaminated with non-ds sequences. why ??!
        // try: Ensembl -> Nuc->Ensembl, Nuc->Uniprot-->Protein->EMBL->
        SequenceI retrievedDss = retrievedSequence.getDatasetSequence() == null ? retrievedSequence
                : retrievedSequence.getDatasetSequence();
        DBRefEntry[] dbr = retrievedSequence.getDBRefs();
        if (dbr != null)
        {
          for (DBRefEntry dbref : dbr)
          {
            // find any entry where we should put in the sequence being
            // cross-referenced into the map
            Mapping map = dbref.getMap();
            if (map != null)
            {
              if (map.getTo() != null && map.getMap() != null)
              {
                // TODO findInDataset requires exact sequence match but
                // 'congruent' test is only for the mapped part
                // maybe not a problem in practice since only ENA provide a
                // mapping and it is to the full protein translation of CDS
                SequenceI matched = findInDataset(dbref);
                // matcher.findIdMatch(map.getTo());
                if (matched != null)
                {
                  /*
                   * already got an xref to this sequence; update this
                   * map to point to the same sequence, and add
                   * any new dbrefs to it
                   */
                  DBRefEntry[] toRefs = map.getTo().getDBRefs();
                  if (toRefs != null)
                  {
                    for (DBRefEntry ref : toRefs)
                    {
                      matched.addDBRef(ref); // add or update mapping
                    }
                  }
                  map.setTo(matched);
                }
                else
                {
                  // not in the dataset yet: make it findable by id from now on
                  matcher.add(map.getTo());
                }
                try
                {
                  // compare ms with dss and replace with dss in mapping
                  // if map is congruent
                  SequenceI ms = map.getTo();
                  int sf = map.getMap().getToLowest();
                  int st = map.getMap().getToHighest();
                  SequenceI mappedrg = ms.getSubSequence(sf, st);
                  // SequenceI loc = dss.getSubSequence(sf, st);
                  // note: the whole sequence strings are compared here, not
                  // just the mapped range
                  if (mappedrg.getLength() > 0
                          && ms.getSequenceAsString().equals(
                                  dss.getSequenceAsString()))
                  // && mappedrg.getSequenceAsString().equals(
                  // loc.getSequenceAsString()))
                  {
                    String msg = "Mapping updated from " + ms.getName()
                            + " to retrieved crossreference "
                            + dss.getName();
                    System.out.println(msg);
                    map.setTo(dss);

                    /*
                     * give the reverse reference the inverse mapping
                     * (if it doesn't have one already)
                     */
                    setReverseMapping(dss, dbref, cf);

                    /*
                     * copy sequence features as well, avoiding
                     * duplication (e.g. same variation from two
                     * transcripts)
                     */
                    SequenceFeature[] sfs = ms.getSequenceFeatures();
                    if (sfs != null)
                    {
                      for (SequenceFeature feat : sfs)
                      {
                        /*
                         * make a flyweight feature object which ignores Parent
                         * attribute in equality test; this avoids creating many
                         * otherwise duplicate exon features on genomic sequence
                         */
                        SequenceFeature newFeature = new SequenceFeature(
                                feat)
                        {
                          @Override
                          public boolean equals(Object o)
                          {
                            return super.equals(o, true);
                          }
                        };
                        dss.addSequenceFeature(newFeature);
                      }
                    }
                  }
                  // record the mapping from the retrieved sequence to
                  // whichever sequence the map now points at
                  cf.addMap(retrievedDss, map.getTo(), map.getMap());
                } catch (Exception e)
                {
                  // a failure on one dbref must not abort the whole merge
                  System.err
                          .println("Exception when consolidating Mapped sequence set...");
                  e.printStackTrace(System.err);
                }
              }
            }
          }
        }
        retrievedSequence.updatePDBIds();
        // publish the retrieved dataset sequence to results, dataset and
        // matcher
        rseqs.add(retrievedDss);
        dataset.addSequence(retrievedDss);
        matcher.add(retrievedDss);
      }
    }
  }
 496   /**
 497    * Sets the inverse sequence mapping in the corresponding dbref of the mapped
 498    * to sequence (if any). This is used after fetching a cross-referenced
 499    * sequence, if the fetched sequence has a mapping to the original sequence,
 500    * to set the mapping in the original sequence's dbref.
 501    *
 502    * @param mapFrom
 503    *          the sequence mapped from
 504    * @param dbref
 505    * @param mappings
 506    */
 507   void setReverseMapping(SequenceI mapFrom, DBRefEntry dbref,
 508           AlignedCodonFrame mappings)
 509   {
 510     SequenceI mapTo = dbref.getMap().getTo();
 511     if (mapTo == null)
 512     {
 513       return;
 514     }
 515     DBRefEntry[] dbrefs = mapTo.getDBRefs();
 516     if (dbrefs == null)
 517     {
 518       return;
 519     }
 520     for (DBRefEntry toRef : dbrefs)
 521     {
 522       if (toRef.hasMap() && mapFrom == toRef.getMap().getTo())
 523       {
 524         /*
 525          * found the reverse dbref; update its mapping if null
 526          */
 527         if (toRef.getMap().getMap() == null)
 528         {
 529           MapList inverse = dbref.getMap().getMap().getInverse();
 530           toRef.getMap().setMap(inverse);
 531           mappings.addMap(mapTo, mapFrom, inverse);
 532         }
 533       }
 534     }
 535   }
 536
 537   /**
 538    * Returns the first identical sequence in the dataset if any, else null
 539    *
 540    * @param xref
 541    * @return
 542    */
 543   SequenceI findInDataset(DBRefEntry xref)
 544   {
 545     if (xref == null || !xref.hasMap() || xref.getMap().getTo() == null)
 546     {
 547       return null;
 548     }
 549     SequenceI mapsTo = xref.getMap().getTo();
 550     String name = xref.getAccessionId();
 551     String name2 = xref.getSource() + "|" + name;
 552     SequenceI dss = mapsTo.getDatasetSequence() == null ? mapsTo : mapsTo
 553             .getDatasetSequence();
 554     for (SequenceI seq : dataset.getSequences())
 555     {
 556       /*
 557        * clumsy alternative to using SequenceIdMatcher which currently
 558        * returns sequences with a dbref to the matched accession id
 559        * which we don't want
 560        */
 561       if (name.equals(seq.getName()) || seq.getName().startsWith(name2))
 562       {
 563         if (sameSequence(seq, dss))
 564         {
 565           return seq;
 566         }
 567       }
 568     }
 569     return null;
 570   }
 571
 572   /**
 573    * Answers true if seq1 and seq2 contain exactly the same characters (ignoring
 574    * case), else false. This method compares the lengths, then each character in
 575    * turn, in order to 'fail fast'. For case-sensitive comparison, it would be
 576    * possible to use Arrays.equals(seq1.getSequence(), seq2.getSequence()).
 577    *
 578    * @param seq1
 579    * @param seq2
 580    * @return
 581    */
 582   // TODO move to Sequence / SequenceI
 583   static boolean sameSequence(SequenceI seq1, SequenceI seq2)
 584   {
 585     if (seq1 == seq2)
 586     {
 587       return true;
 588     }
 589     if (seq1 == null || seq2 == null)
 590     {
 591       return false;
 592     }
 593     char[] c1 = seq1.getSequence();
 594     char[] c2 = seq2.getSequence();
 595     if (c1.length != c2.length)
 596     {
 597       return false;
 598     }
 599     for (int i = 0; i < c1.length; i++)
 600     {
 601       int diff = c1[i] - c2[i];
 602       /*
 603        * same char or differ in case only ('a'-'A' == 32)
 604        */
 605       if (diff != 0 && diff != 32 && diff != -32)
 606       {
 607         return false;
 608       }
 609     }
 610     return true;
 611   }
 612
 613   /**
 614    * Updates any empty mappings in the cross-references with one to a compatible
 615    * retrieved sequence if found, and adds any new mappings to the
 616    * AlignedCodonFrame
 617    *
 618    * @param mapFrom
 619    * @param xrefs
 620    * @param retrieved
 621    * @param acf
 622    */
 623   void updateDbrefMappings(SequenceI mapFrom, DBRefEntry[] xrefs,
 624           SequenceI[] retrieved, AlignedCodonFrame acf, boolean fromDna)
 625   {
 626     SequenceIdMatcher idMatcher = new SequenceIdMatcher(retrieved);
 627     for (DBRefEntry xref : xrefs)
 628     {
 629       if (!xref.hasMap())
 630       {
 631         String targetSeqName = xref.getSource() + "|"
 632                 + xref.getAccessionId();
 633         SequenceI[] matches = idMatcher.findAllIdMatches(targetSeqName);
 634         if (matches == null)
 635         {
 636           return;
 637         }
 638         for (SequenceI seq : matches)
 639         {
 640           constructMapping(mapFrom, seq, xref, acf, fromDna);
 641         }
 642       }
 643     }
 644   }
 645
 646   /**
 647    * Tries to make a mapping between sequences. If successful, adds the mapping
 648    * to the dbref and the mappings collection and answers true, otherwise
 649    * answers false. The following methods of making are mapping are tried in
 650    * turn:
 651    * <ul>
 652    * <li>if 'mapTo' holds a mapping to 'mapFrom', take the inverse; this is, for
 653    * example, the case after fetching EMBL cross-references for a Uniprot
 654    * sequence</li>
 655    * <li>else check if the dna translates exactly to the protein (give or take
 656    * start and stop codons></li>
 657    * <li>else try to map based on CDS features on the dna sequence</li>
 658    * </ul>
 659    *
 660    * @param mapFrom
 661    * @param mapTo
 662    * @param xref
 663    * @param mappings
 664    * @return
 665    */
 666   boolean constructMapping(SequenceI mapFrom, SequenceI mapTo,
 667           DBRefEntry xref, AlignedCodonFrame mappings, boolean fromDna)
 668   {
 669     MapList mapping = null;
 670
 671     /*
 672      * look for a reverse mapping, if found make its inverse
 673      */
 674     if (mapTo.getDBRefs() != null)
 675     {
 676       for (DBRefEntry dbref : mapTo.getDBRefs())
 677       {
 678         String name = dbref.getSource() + "|" + dbref.getAccessionId();
 679         if (dbref.hasMap() && mapFrom.getName().startsWith(name))
 680         {
 681           /*
 682            * looks like we've found a map from 'mapTo' to 'mapFrom'
 683            * - invert it to make the mapping the other way
 684            */
 685           MapList reverse = dbref.getMap().getMap().getInverse();
 686           xref.setMap(new Mapping(mapTo, reverse));
 687           mappings.addMap(mapFrom, mapTo, reverse);
 688           return true;
 689         }
 690       }
 691     }
 692
 693     if (fromDna)
 694     {
 695       mapping = AlignmentUtils.mapCdnaToProtein(mapTo, mapFrom);
 696     }
 697     else
 698     {
 699       mapping = AlignmentUtils.mapCdnaToProtein(mapFrom, mapTo);
 700       if (mapping != null)
 701       {
 702         mapping = mapping.getInverse();
 703       }
 704     }
 705     if (mapping == null)
 706     {
 707       return false;
 708     }
 709     xref.setMap(new Mapping(mapTo, mapping));
 710
 711     /*
 712      * and add a reverse DbRef with the inverse mapping
 713      */
 714     if (mapFrom.getDatasetSequence() != null
 715             && mapFrom.getDatasetSequence().getSourceDBRef() != null)
 716     {
 717       DBRefEntry dbref = new DBRefEntry(mapFrom.getDatasetSequence()
 718               .getSourceDBRef());
 719       dbref.setMap(new Mapping(mapFrom.getDatasetSequence(), mapping
 720               .getInverse()));
 721       mapTo.addDBRef(dbref);
 722     }
 723
 724     if (fromDna)
 725     {
 726       AlignmentUtils.computeProteinFeatures(mapFrom, mapTo, mapping);
 727       mappings.addMap(mapFrom, mapTo, mapping);
 728     }
 729     else
 730     {
 731       mappings.addMap(mapTo, mapFrom, mapping.getInverse());
 732     }
 733
 734     return true;
 735   }
 736
 737   /**
 738    * find references to lrfs in the cross-reference set of each sequence in
 739    * dataset (that is not equal to sequenceI) Identifies matching DBRefEntry
 740    * based on source and accession string only - Map and Version are nulled.
 741    *
 742    * @param fromDna
 743    *          - true if context was searching from Dna sequences, false if
 744    *          context was searching from Protein sequences
 745    * @param sequenceI
 746    * @param lrfs
 747    * @param foundSeqs
 748    * @return true if matches were found.
 749    */
 750   private boolean searchDatasetXrefs(boolean fromDna, SequenceI sequenceI,
 751           DBRefEntry[] lrfs, List<SequenceI> foundSeqs, AlignedCodonFrame cf)
 752   {
 753     boolean found = false;
 754     if (lrfs == null)
 755     {
 756       return false;
 757     }
 758     for (int i = 0; i < lrfs.length; i++)
 759     {
 760       DBRefEntry xref = new DBRefEntry(lrfs[i]);
 761       // add in wildcards
 762       xref.setVersion(null);
 763       xref.setMap(null);
 764       found |= searchDataset(fromDna, sequenceI, xref, foundSeqs, cf, false);
 765     }
 766     return found;
 767   }
 768
 769   /**
 770    * Searches dataset for DBRefEntrys matching the given one (xrf) and adds the
 771    * associated sequence to rseqs
 772    *
 773    * @param fromDna
 774    *          true if context was searching for refs *from* dna sequence, false
 775    *          if context was searching for refs *from* protein sequence
 776    * @param fromSeq
 777    *          a sequence to ignore (start point of search)
 778    * @param xrf
 779    *          a cross-reference to try to match
 780    * @param foundSeqs
 781    *          result list to add to
 782    * @param mappings
 783    *          a set of sequence mappings to add to
 784    * @param direct
 785    *          - indicates the type of relationship between returned sequences,
 786    *          xrf, and sequenceI that is required.
 787    *          <ul>
 788    *          <li>direct implies xrf is a primary reference for sequenceI AND
 789    *          the sequences to be located (eg a uniprot ID for a protein
 790    *          sequence, and a uniprot ref on a transcript sequence).</li>
 791    *          <li>indirect means xrf is a cross reference with respect to
 792    *          sequenceI or all the returned sequences (eg a genomic reference
 793    *          associated with a locus and one or more transcripts)</li>
 794    *          </ul>
 795    * @return true if relationship found and sequence added.
 796    */
 797   boolean searchDataset(boolean fromDna, SequenceI fromSeq,
 798           DBRefEntry xrf, List<SequenceI> foundSeqs, AlignedCodonFrame mappings,
 799           boolean direct)
 800   {
 801     boolean found = false;
 802     if (dataset == null)
 803     {
 804       return false;
 805     }
 806     if (dataset.getSequences() == null)
 807     {
 808       System.err.println("Empty dataset sequence set - NO VECTOR");
 809       return false;
 810     }
 811     List<SequenceI> ds;
 812     synchronized (ds = dataset.getSequences())
 813     {
 814       for (SequenceI nxt : ds)
 815       {
 816         if (nxt != null)
 817         {
 818           if (nxt.getDatasetSequence() != null)
 819           {
 820             System.err
 821                     .println("Implementation warning: CrossRef initialised with a dataset alignment with non-dataset sequences in it! ("
 822                             + nxt.getDisplayId(true)
 823                             + " has ds reference "
 824                             + nxt.getDatasetSequence().getDisplayId(true)
 825                             + ")");
 826           }
 827           if (nxt == fromSeq || nxt == fromSeq.getDatasetSequence())
 828           {
 829             continue;
 830           }
 831           /*
 832            * only look at same molecule type if 'direct', or
 833            * complementary type if !direct
 834            */
 835           {
 836             boolean isDna = !nxt.isProtein();
 837             if (direct ? (isDna != fromDna) : (isDna == fromDna))
 838             {
 839               // skip this sequence because it is wrong molecule type
 840               continue;
 841             }
 842           }
 843
 844           // look for direct or indirect references in common
 845           DBRefEntry[] poss = nxt.getDBRefs();
 846           List<DBRefEntry> cands = null;
 847
 848           // todo: indirect specifies we select either direct references to nxt
 849           // that match xrf which is indirect to sequenceI, or indirect
 850           // references to nxt that match xrf which is direct to sequenceI
 851           cands = DBRefUtils.searchRefs(poss, xrf);
 852           // else
 853           // {
 854           // poss = DBRefUtils.selectDbRefs(nxt.isProtein()!fromDna, poss);
 855           // cands = DBRefUtils.searchRefs(poss, xrf);
 856           // }
 857           if (!cands.isEmpty())
 858           {
 859             if (!foundSeqs.contains(nxt))
 860             {
 861               found = true;
 862               foundSeqs.add(nxt);
 863               if (mappings != null && !direct)
 864               {
 865                 /*
 866                  * if the matched sequence has mapped dbrefs to
 867                  * protein product / cdna, add equivalent mappings to
 868                  * our source sequence
 869                  */
 870                 for (DBRefEntry candidate : cands)
 871                 {
 872                   Mapping mapping = candidate.getMap();
 873                   if (mapping != null)
 874                   {
 875                     MapList map = mapping.getMap();
 876                     if (mapping.getTo() != null
 877                             && map.getFromRatio() != map.getToRatio())
 878                     {
 879                       /*
 880                        * add a mapping, as from dna to peptide sequence
 881                        */
 882                       if (map.getFromRatio() == 3)
 883                       {
 884                         mappings.addMap(nxt, fromSeq, map);
 885                       }
 886                       else
 887                       {
 888                         mappings.addMap(nxt, fromSeq, map.getInverse());
 889                       }
 890                     }
 891                   }
 892                 }
 893               }
 894             }
 895           }
 896         }
 897       }
 898     }
 899     return found;
 900   }
 901 }