src/jalview/analysis/CrossRef.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis;
  22
  23 import jalview.datamodel.AlignedCodonFrame;
  24 import jalview.datamodel.Alignment;
  25 import jalview.datamodel.AlignmentI;
  26 import jalview.datamodel.DBRefEntry;
  27 import jalview.datamodel.Mapping;
  28 import jalview.datamodel.Sequence;
  29 import jalview.datamodel.SequenceFeature;
  30 import jalview.datamodel.SequenceI;
  31 import jalview.util.Comparison;
  32 import jalview.util.DBRefUtils;
  33 import jalview.util.MapList;
  34 import jalview.ws.SequenceFetcherFactory;
  35 import jalview.ws.seqfetcher.ASequenceFetcher;
  36
  37 import java.util.ArrayList;
  38 import java.util.Iterator;
  39 import java.util.List;
  40
  41 /**
  42  * Functions for cross-referencing sequence databases.
  43  *
  44  * @author JimP
  45  *
  46  */
  47 public class CrossRef
  48 {
  49   /*
  50    * the dataset of the alignment for which we are searching for
  51    * cross-references; in some cases we may resolve xrefs by
  52    * searching in the dataset
  53    */
  54   private AlignmentI dataset;
  55
  56   /*
  57    * true if we are searching for cross-references from nucleotide,
  58    * i.e. for protein sequences, false if the reverse
  59    */
  60   private boolean fromDna;
  61
  62   /*
  63    * the sequences for which we are seeking cross-references
  64    */
  65   private SequenceI[] fromSeqs;
  66
  67   /**
  68    * Constructor
  69    *
  70    * @param seqs
  71    *          the sequences for which we are seeking cross-references
  72    * @param ds
  73    *          the containing alignment dataset (may be searched to resolve
  74    *          cross-references)
  75    */
  76   public CrossRef(SequenceI[] seqs, AlignmentI ds)
  77   {
  78     fromSeqs = seqs;
  79     fromDna = ds.isNucleotide();
  80     dataset = ds.getDataset() == null ? ds : ds.getDataset();
  81   }
  82
  83   /**
  84    * Returns a list of distinct database sources for which sequences have either
  85    * <ul>
  86    * <li>a (dna-to-protein or protein-to-dna) cross-reference</li>
  87    * <li>an indirect cross-reference - a (dna-to-protein or protein-to-dna)
  88    * reference from another sequence in the dataset which has a cross-reference
  89    * to a direct DBRefEntry on the given sequence</li>
  90    * </ul>
  91    * @return
  92    */
  93   public List<String> findXrefSourcesForSequences()
  94   {
  95     List<String> sources = new ArrayList<String>();
  96     for (SequenceI seq : fromSeqs)
  97     {
  98       if (seq != null)
  99       {
 100         findXrefSourcesForSequence(seq, sources);
 101       }
 102     }
 103     return sources;
 104   }
 105
 106   /**
 107    * Returns a list of distinct database sources for which a sequence has either
 108    * <ul>
 109    * <li>a (dna-to-protein or protein-to-dna) cross-reference</li>
 110    * <li>an indirect cross-reference - a (dna-to-protein or protein-to-dna)
 111    * reference from another sequence in the dataset which has a cross-reference
 112    * to a direct DBRefEntry on the given sequence</li>
 113    * </ul>
 114    *
 115    * @param seq
 116    *          the sequence whose dbrefs we are searching against
 117    * @param sources
 118    *          a list of sources to add matches to
 119    */
 120   void findXrefSourcesForSequence(SequenceI seq, List<String> sources)
 121   {
 122     /*
 123      * first find seq's xrefs (dna-to-peptide or peptide-to-dna)
 124      */
 125     DBRefEntry[] rfs = DBRefUtils.selectDbRefs(!fromDna, seq.getDBRefs());
 126     addXrefsToSources(rfs, sources);
 127     if (dataset != null)
 128     {
 129       /*
 130        * find sequence's direct (dna-to-dna, peptide-to-peptide) xrefs
 131        */
 132       DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(fromDna, seq.getDBRefs());
 133       List<SequenceI> rseqs = new ArrayList<SequenceI>();
 134
 135       /*
 136        * find sequences in the alignment which xref one of these DBRefs
 137        * i.e. is xref-ed to a common sequence identifier
 138        */
 139       searchDatasetXrefs(seq, lrfs, rseqs, null);
 140
 141       /*
 142        * add those sequences' (dna-to-peptide or peptide-to-dna) dbref sources
 143        */
 144       for (SequenceI rs : rseqs)
 145       {
 146         DBRefEntry[] xrs = DBRefUtils
 147                 .selectDbRefs(!fromDna, rs.getDBRefs());
 148         addXrefsToSources(xrs, sources);
 149       }
 150     }
 151   }
 152
 153   /**
 154    * Helper method that adds the source identifiers of some cross-references to
 155    * a (non-redundant) list of database sources
 156    *
 157    * @param xrefs
 158    * @param sources
 159    */
 160   void addXrefsToSources(DBRefEntry[] xrefs, List<String> sources)
 161   {
 162     if (xrefs != null)
 163     {
 164       for (DBRefEntry ref : xrefs)
 165       {
 166         /*
 167          * avoid duplication e.g. ENSEMBL and Ensembl
 168          */
 169         String source = DBRefUtils.getCanonicalName(ref.getSource());
 170         if (!sources.contains(source))
 171         {
 172           sources.add(source);
 173         }
 174       }
 175     }
 176   }
 177
 178   /**
 179    * Attempts to find cross-references from the sequences provided in the
 180    * constructor to the given source database. Cross-references may be found
 181    * <ul>
 182    * <li>in dbrefs on the sequence which hold a mapping to a sequence
 183    * <ul>
 184    * <li>provided with a fetched sequence (e.g. ENA translation), or</li>
 185    * <li>populated previously after getting cross-references</li>
 186    * </ul>
 187    * <li>as other sequences in the alignment which share a dbref identifier with
 188    * the sequence</li>
 189    * <li>by fetching from the remote database</li>
 190    * </ul>
 191    * The cross-referenced sequences, and mappings to them, are added to the
 192    * alignment dataset.
 193    *
 194    * @param source
 195    * @return cross-referenced sequences (as dataset sequences)
 196    */
 197   public Alignment findXrefSequences(String source)
 198   {
 199
 200     List<SequenceI> rseqs = new ArrayList<SequenceI>();
 201     AlignedCodonFrame cf = new AlignedCodonFrame();
 202     SequenceIdMatcher matcher = new SequenceIdMatcher(
 203             dataset.getSequences());
 204
 205     for (SequenceI seq : fromSeqs)
 206     {
 207       SequenceI dss = seq;
 208       while (dss.getDatasetSequence() != null)
 209       {
 210         dss = dss.getDatasetSequence();
 211       }
 212       boolean found = false;
 213       DBRefEntry[] xrfs = DBRefUtils
 214               .selectDbRefs(!fromDna, dss.getDBRefs());
 215       if ((xrfs == null || xrfs.length == 0) && dataset != null)
 216       {
 217         /*
 218          * found no suitable dbrefs on sequence - look for sequences in the
 219          * alignment which share a dbref with this one
 220          */
 221         DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(fromDna,
 222                 seq.getDBRefs());
 223
 224         /*
 225          * find sequences (except this one!), of complementary type,
 226          *  which have a dbref to an accession id for this sequence,
 227          *  and add them to the results
 228          */
 229         found = searchDatasetXrefs(dss, lrfs, rseqs, cf);
 230       }
 231       if (xrfs == null && !found)
 232       {
 233         /*
 234          * no dbref to source on this sequence or matched
 235          * complementary sequence in the dataset
 236          */
 237         continue;
 238       }
 239       List<DBRefEntry> sourceRefs = DBRefUtils.searchRefsForSource(xrfs,
 240               source);
 241       Iterator<DBRefEntry> refIterator = sourceRefs.iterator();
 242       while (refIterator.hasNext())
 243       {
 244         DBRefEntry xref = refIterator.next();
 245         found = false;
 246         if (xref.hasMap())
 247         {
 248           SequenceI mappedTo = xref.getMap().getTo();
 249           if (mappedTo != null)
 250           {
 251             /*
 252              * dbref contains the sequence it maps to; add it to the
 253              * results unless we have done so already (could happen if
 254              * fetching xrefs for sequences which have xrefs in common)
 255              * for example: UNIPROT {P0CE19, P0CE20} -> EMBL {J03321, X06707}
 256              */
 257             found = true;
 258             /*
 259              * problem: matcher.findIdMatch() is lenient - returns a sequence
 260              * with a dbref to the search arg e.g. ENST for ENSP - wrong
 261              * but findInDataset() matches ENSP when looking for Uniprot...
 262              */
 263             SequenceI matchInDataset = findInDataset(xref);
 264             /*matcher.findIdMatch(mappedTo);*/
 265             if (matchInDataset != null)
 266             {
 267               if (!rseqs.contains(matchInDataset))
 268               {
 269                 rseqs.add(matchInDataset);
 270               }
 271               refIterator.remove();
 272               continue;
 273             }
 274             SequenceI rsq = new Sequence(mappedTo);
 275             rseqs.add(rsq);
 276             if (xref.getMap().getMap().getFromRatio() != xref.getMap()
 277                     .getMap().getToRatio())
 278             {
 279               // get sense of map correct for adding to product alignment.
 280               if (fromDna)
 281               {
 282                 // map is from dna seq to a protein product
 283                 cf.addMap(dss, rsq, xref.getMap().getMap());
 284               }
 285               else
 286               {
 287                 // map should be from protein seq to its coding dna
 288                 cf.addMap(rsq, dss, xref.getMap().getMap().getInverse());
 289               }
 290             }
 291           }
 292         }
 293
 294         if (!found)
 295         {
 296           SequenceI matchedSeq = matcher.findIdMatch(xref.getSource() + "|"
 297                   + xref.getAccessionId());
 298           if (matchedSeq != null)
 299           {
 300             if (constructMapping(seq, matchedSeq, xref, cf))
 301             {
 302               found = true;
 303             }
 304           }
 305         }
 306
 307         if (!found)
 308         {
 309           // do a bit more work - search for sequences with references matching
 310           // xrefs on this sequence.
 311           found = searchDataset(dss, xref, rseqs, cf, false);
 312         }
 313         if (found)
 314         {
 315           refIterator.remove();
 316         }
 317       }
 318
 319       /*
 320        * fetch from source database any dbrefs we haven't resolved up to here
 321        */
 322       if (!sourceRefs.isEmpty())
 323       {
 324         ASequenceFetcher sftch = SequenceFetcherFactory
 325                 .getSequenceFetcher();
 326         SequenceI[] retrieved = null;
 327         try
 328         {
 329           retrieved = sftch.getSequences(sourceRefs, !fromDna);
 330         } catch (Exception e)
 331         {
 332           System.err
 333                   .println("Problem whilst retrieving cross references for Sequence : "
 334                           + seq.getName());
 335           e.printStackTrace();
 336         }
 337
 338         if (retrieved != null)
 339         {
 340           updateDbrefMappings(seq, xrfs, retrieved, cf);
 341           for (SequenceI retrievedSequence : retrieved)
 342           {
 343             SequenceI retrievedDss = retrievedSequence.getDatasetSequence() == null ? retrievedSequence
 344                     : retrievedSequence.getDatasetSequence();
 345             DBRefEntry[] dbr = retrievedSequence.getDBRefs();
 346             if (dbr != null)
 347             {
 348               for (DBRefEntry dbref : dbr)
 349               {
 350                 // find any entry where we should put in the sequence being
 351                 // cross-referenced into the map
 352                 Mapping map = dbref.getMap();
 353                 if (map != null)
 354                 {
 355                   if (map.getTo() != null && map.getMap() != null)
 356                   {
 357                     // TODO findInDataset requires exact sequence match but
 358                     // 'congruent' test is only for the mapped part
 359                     // maybe not a problem in practice since only ENA provide a
 360                     // mapping and it is to the full protein translation of CDS
 361                     SequenceI matched = findInDataset(dbref);
 362                     // matcher.findIdMatch(map.getTo());
 363                     if (matched != null)
 364                     {
 365                       /*
 366                        * already got an xref to this sequence; update this
 367                        * map to point to the same sequence, and add
 368                        * any new dbrefs to it
 369                        */
 370                       DBRefEntry[] toRefs = map.getTo().getDBRefs();
 371                       if (toRefs != null)
 372                       {
 373                         for (DBRefEntry ref : toRefs)
 374                         {
 375                           matched.addDBRef(ref); // add or update mapping
 376                         }
 377                       }
 378                       map.setTo(matched);
 379                     }
 380                     else
 381                     {
 382                       matcher.add(map.getTo());
 383                     }
 384                     try
 385                     {
 386                       // compare ms with dss and replace with dss in mapping
 387                       // if map is congruent
 388                       SequenceI ms = map.getTo();
 389                       int sf = map.getMap().getToLowest();
 390                       int st = map.getMap().getToHighest();
 391                       SequenceI mappedrg = ms.getSubSequence(sf, st);
 392                       // SequenceI loc = dss.getSubSequence(sf, st);
 393                       if (mappedrg.getLength() > 0
 394                               && ms.getSequenceAsString().equals(
 395                                       dss.getSequenceAsString()))
 396                       // && mappedrg.getSequenceAsString().equals(
 397                       // loc.getSequenceAsString()))
 398                       {
 399                         String msg = "Mapping updated from " + ms.getName()
 400                                 + " to retrieved crossreference "
 401                                 + dss.getName();
 402                         System.out.println(msg);
 403                         map.setTo(dss);
 404
 405                         /*
 406                          * give the reverse reference the inverse mapping
 407                          * (if it doesn't have one already)
 408                          */
 409                         setReverseMapping(dss, dbref, cf);
 410
 411                         /*
 412                          * copy sequence features as well, avoiding
 413                          * duplication (e.g. same variation from two
 414                          * transcripts)
 415                          */
 416                         SequenceFeature[] sfs = ms.getSequenceFeatures();
 417                         if (sfs != null)
 418                         {
 419                           for (SequenceFeature feat : sfs)
 420                           {
 421                             /*
 422                              * make a flyweight feature object which ignores Parent
 423                              * attribute in equality test; this avoids creating many
 424                              * otherwise duplicate exon features on genomic sequence
 425                              */
 426                             SequenceFeature newFeature = new SequenceFeature(
 427                                     feat)
 428                             {
 429                               @Override
 430                               public boolean equals(Object o)
 431                               {
 432                                 return super.equals(o, true);
 433                               }
 434                             };
 435                             dss.addSequenceFeature(newFeature);
 436                           }
 437                         }
 438                       }
 439                       cf.addMap(retrievedDss, map.getTo(), map.getMap());
 440                     } catch (Exception e)
 441                     {
 442                       System.err
 443                               .println("Exception when consolidating Mapped sequence set...");
 444                       e.printStackTrace(System.err);
 445                     }
 446                   }
 447                 }
 448               }
 449             }
 450             retrievedSequence.updatePDBIds();
 451             rseqs.add(retrievedDss);
 452             dataset.addSequence(retrievedDss);
 453             matcher.add(retrievedDss);
 454           }
 455         }
 456       }
 457     }
 458
 459     Alignment ral = null;
 460     if (rseqs.size() > 0)
 461     {
 462       ral = new Alignment(rseqs.toArray(new SequenceI[rseqs.size()]));
 463       if (!cf.isEmpty())
 464       {
 465         dataset.addCodonFrame(cf);
 466       }
 467     }
 468     return ral;
 469   }
 470
 471   /**
 472    * Sets the inverse sequence mapping in the corresponding dbref of the mapped
 473    * to sequence (if any). This is used after fetching a cross-referenced
 474    * sequence, if the fetched sequence has a mapping to the original sequence,
 475    * to set the mapping in the original sequence's dbref.
 476    *
 477    * @param mapFrom
 478    *          the sequence mapped from
 479    * @param dbref
 480    * @param mappings
 481    */
 482   void setReverseMapping(SequenceI mapFrom, DBRefEntry dbref,
 483           AlignedCodonFrame mappings)
 484   {
 485     SequenceI mapTo = dbref.getMap().getTo();
 486     if (mapTo == null)
 487     {
 488       return;
 489     }
 490     DBRefEntry[] dbrefs = mapTo.getDBRefs();
 491     if (dbrefs == null)
 492     {
 493       return;
 494     }
 495     for (DBRefEntry toRef : dbrefs)
 496     {
 497       if (toRef.hasMap() && mapFrom == toRef.getMap().getTo())
 498       {
 499         /*
 500          * found the reverse dbref; update its mapping if null
 501          */
 502         if (toRef.getMap().getMap() == null)
 503         {
 504           MapList inverse = dbref.getMap().getMap().getInverse();
 505           toRef.getMap().setMap(inverse);
 506           mappings.addMap(mapTo, mapFrom, inverse);
 507         }
 508       }
 509     }
 510   }
 511
 512   /**
 513    * Returns the first identical sequence in the dataset if any, else null
 514    *
 515    * @param xref
 516    * @return
 517    */
 518   SequenceI findInDataset(DBRefEntry xref)
 519   {
 520     if (xref == null || !xref.hasMap() || xref.getMap().getTo() == null)
 521     {
 522       return null;
 523     }
 524     SequenceI mapsTo = xref.getMap().getTo();
 525     String name = xref.getAccessionId();
 526     String name2 = xref.getSource() + "|" + name;
 527     SequenceI dss = mapsTo.getDatasetSequence() == null ? mapsTo : mapsTo
 528             .getDatasetSequence();
 529     for (SequenceI seq : dataset.getSequences())
 530     {
 531       /*
 532        * clumsy alternative to using SequenceIdMatcher which currently
 533        * returns sequences with a dbref to the matched accession id
 534        * which we don't want
 535        */
 536       if (name.equals(seq.getName()) || seq.getName().startsWith(name2))
 537       {
 538         if (sameSequence(seq, dss))
 539         {
 540           return seq;
 541         }
 542       }
 543     }
 544     return null;
 545   }
 546
 547   /**
 548    * Answers true if seq1 and seq2 contain exactly the same characters (ignoring
 549    * case), else false. This method compares the lengths, then each character in
 550    * turn, in order to 'fail fast'. For case-sensitive comparison, it would be
 551    * possible to use Arrays.equals(seq1.getSequence(), seq2.getSequence()).
 552    *
 553    * @param seq1
 554    * @param seq2
 555    * @return
 556    */
 557   // TODO move to Sequence / SequenceI
 558   static boolean sameSequence(SequenceI seq1, SequenceI seq2)
 559   {
 560     if (seq1 == seq2)
 561     {
 562       return true;
 563     }
 564     if (seq1 == null || seq2 == null)
 565     {
 566       return false;
 567     }
 568     char[] c1 = seq1.getSequence();
 569     char[] c2 = seq2.getSequence();
 570     if (c1.length != c2.length)
 571     {
 572       return false;
 573     }
 574     for (int i = 0; i < c1.length; i++)
 575     {
 576       int diff = c1[i] - c2[i];
 577       /*
 578        * same char or differ in case only ('a'-'A' == 32)
 579        */
 580       if (diff != 0 && diff != 32 && diff != -32)
 581       {
 582         return false;
 583       }
 584     }
 585     return true;
 586   }
 587
 588   /**
 589    * Updates any empty mappings in the cross-references with one to a compatible
 590    * retrieved sequence if found, and adds any new mappings to the
 591    * AlignedCodonFrame
 592    *
 593    * @param mapFrom
 594    * @param xrefs
 595    * @param retrieved
 596    * @param acf
 597    */
 598   void updateDbrefMappings(SequenceI mapFrom,
 599           DBRefEntry[] xrefs, SequenceI[] retrieved, AlignedCodonFrame acf)
 600   {
 601     SequenceIdMatcher matcher = new SequenceIdMatcher(retrieved);
 602     for (DBRefEntry xref : xrefs)
 603     {
 604       if (!xref.hasMap())
 605       {
 606         String targetSeqName = xref.getSource() + "|"
 607                 + xref.getAccessionId();
 608         SequenceI[] matches = matcher.findAllIdMatches(targetSeqName);
 609         if (matches == null)
 610         {
 611           return;
 612         }
 613         for (SequenceI seq : matches)
 614         {
 615           constructMapping(mapFrom, seq, xref, acf);
 616         }
 617       }
 618     }
 619   }
 620
 621   /**
 622    * Tries to make a mapping between sequences. If successful, adds the mapping
 623    * to the dbref and the mappings collection and answers true, otherwise
 624    * answers false. The following methods of making are mapping are tried in
 625    * turn:
 626    * <ul>
 627    * <li>if 'mapTo' holds a mapping to 'mapFrom', take the inverse; this is, for
 628    * example, the case after fetching EMBL cross-references for a Uniprot
 629    * sequence</li>
 630    * <li>else check if the dna translates exactly to the protein (give or take
 631    * start and stop codons></li>
 632    * <li>else try to map based on CDS features on the dna sequence</li>
 633    * </ul>
 634    *
 635    * @param mapFrom
 636    * @param mapTo
 637    * @param xref
 638    * @param mappings
 639    * @return
 640    */
 641   boolean constructMapping(SequenceI mapFrom, SequenceI mapTo,
 642           DBRefEntry xref, AlignedCodonFrame mappings)
 643   {
 644     MapList mapping = null;
 645
 646     /*
 647      * look for a reverse mapping, if found make its inverse
 648      */
 649     if (mapTo.getDBRefs() != null)
 650     {
 651       for (DBRefEntry dbref : mapTo.getDBRefs())
 652       {
 653         String name = dbref.getSource() + "|" + dbref.getAccessionId();
 654         if (dbref.hasMap() && mapFrom.getName().startsWith(name))
 655         {
 656           /*
 657            * looks like we've found a map from 'mapTo' to 'mapFrom'
 658            * - invert it to make the mapping the other way
 659            */
 660           MapList reverse = dbref.getMap().getMap().getInverse();
 661           xref.setMap(new Mapping(mapTo, reverse));
 662           mappings.addMap(mapFrom, mapTo, reverse);
 663           return true;
 664         }
 665       }
 666     }
 667
 668     if (fromDna)
 669     {
 670       mapping = AlignmentUtils.mapCdnaToProtein(mapTo, mapFrom);
 671     }
 672     else
 673     {
 674       mapping = AlignmentUtils.mapCdnaToProtein(mapFrom, mapTo);
 675       if (mapping != null)
 676       {
 677         mapping = mapping.getInverse();
 678       }
 679     }
 680     if (mapping == null)
 681     {
 682       return false;
 683     }
 684     xref.setMap(new Mapping(mapTo, mapping));
 685     if (fromDna)
 686     {
 687       AlignmentUtils.computeProteinFeatures(mapFrom, mapTo, mapping);
 688       mappings.addMap(mapFrom, mapTo, mapping);
 689     }
 690     else
 691     {
 692       mappings.addMap(mapTo, mapFrom, mapping.getInverse());
 693     }
 694
 695     return true;
 696   }
 697
 698   /**
 699    * find references to lrfs in the cross-reference set of each sequence in
 700    * dataset (that is not equal to sequenceI) Identifies matching DBRefEntry
 701    * based on source and accession string only - Map and Version are nulled.
 702    *
 703    * @param sequenceI
 704    * @param lrfs
 705    * @param rseqs
 706    * @return true if matches were found.
 707    */
 708   private boolean searchDatasetXrefs(SequenceI sequenceI,
 709           DBRefEntry[] lrfs, List<SequenceI> rseqs, AlignedCodonFrame cf)
 710   {
 711     boolean found = false;
 712     if (lrfs == null)
 713     {
 714       return false;
 715     }
 716     for (int i = 0; i < lrfs.length; i++)
 717     {
 718       DBRefEntry xref = new DBRefEntry(lrfs[i]);
 719       // add in wildcards
 720       xref.setVersion(null);
 721       xref.setMap(null);
 722       found |= searchDataset(sequenceI, xref, rseqs, cf, false);
 723     }
 724     return found;
 725   }
 726
 727   /**
 728    * Searches dataset for DBRefEntrys matching the given one (xrf) and adds the
 729    * associated sequence to rseqs
 730    *
 731    * @param sequenceI
 732    *          a sequence to ignore (start point of search)
 733    * @param xrf
 734    *          a cross-reference to try to match
 735    * @param rseqs
 736    *          result list to add to
 737    * @param cf
 738    *          a set of sequence mappings to add to
 739    * @param direct
 740    *          - search all references or only subset
 741    * @return true if relationship found and sequence added.
 742    */
 743   boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
 744           List<SequenceI> rseqs, AlignedCodonFrame cf, boolean direct)
 745   {
 746     boolean found = false;
 747     if (dataset == null)
 748     {
 749       return false;
 750     }
 751     if (dataset.getSequences() == null)
 752     {
 753       System.err.println("Empty dataset sequence set - NO VECTOR");
 754       return false;
 755     }
 756     List<SequenceI> ds;
 757     synchronized (ds = dataset.getSequences())
 758     {
 759       for (SequenceI nxt : ds)
 760       {
 761         if (nxt != null)
 762         {
 763           if (nxt.getDatasetSequence() != null)
 764           {
 765             System.err
 766                     .println("Implementation warning: getProducts passed a dataset alignment without dataset sequences in it!");
 767           }
 768           if (nxt == sequenceI || nxt == sequenceI.getDatasetSequence())
 769           {
 770             continue;
 771           }
 772           /*
 773            * only look at same molecule type if 'direct', or
 774            * complementary type if !direct
 775            */
 776           {
 777             boolean isDna = Comparison
 778                     .isNucleotide(new SequenceI[] { nxt });
 779             if (direct ? (isDna != fromDna) : (isDna == fromDna))
 780             {
 781               // skip this sequence because it is wrong molecule type
 782               continue;
 783             }
 784           }
 785
 786           // look for direct or indirect references in common
 787           DBRefEntry[] poss = nxt.getDBRefs();
 788           List<DBRefEntry> cands = null;
 789           /*
 790            * TODO does this make any sense?
 791            * if 'direct', search the dbrefs for xrf
 792            * else, filter the dbrefs by type and then search for xrf
 793            * - the result is the same isn't it?
 794            */
 795           if (direct)
 796           {
 797             cands = DBRefUtils.searchRefs(poss, xrf);
 798           }
 799           else
 800           {
 801             poss = DBRefUtils.selectDbRefs(!fromDna, poss);
 802             cands = DBRefUtils.searchRefs(poss, xrf);
 803           }
 804           if (!cands.isEmpty())
 805           {
 806             if (!rseqs.contains(nxt))
 807             {
 808               found = true;
 809               rseqs.add(nxt);
 810               if (cf != null)
 811               {
 812                 // don't search if we aren't given a codon map object
 813                 for (DBRefEntry candidate : cands)
 814                 {
 815                   Mapping mapping = candidate.getMap();
 816                   if (mapping != null)
 817                   {
 818                     MapList map = mapping.getMap();
 819                     if (mapping.getTo() != null
 820                             && map.getFromRatio() != map.getToRatio())
 821                     {
 822                       // get sense of map correct for adding to product
 823                       // alignment.
 824                       if (fromDna)
 825                       {
 826                         // map is from dna seq to a protein product
 827                         cf.addMap(sequenceI, nxt, map);
 828                       }
 829                       else
 830                       {
 831                         // map should be from protein seq to its coding dna
 832                         cf.addMap(nxt, sequenceI, map.getInverse());
 833                       }
 834                     }
 835                   }
 836                 }
 837               }
 838               // TODO: add mapping between sequences if necessary
 839             }
 840           }
 841         }
 842       }
 843     }
 844     return found;
 845   }
 846 }