src/jalview/analysis/CrossRef.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis;
  22
  23 import jalview.datamodel.AlignedCodonFrame;
  24 import jalview.datamodel.Alignment;
  25 import jalview.datamodel.AlignmentI;
  26 import jalview.datamodel.DBRefEntry;
  27 import jalview.datamodel.Mapping;
  28 import jalview.datamodel.Sequence;
  29 import jalview.datamodel.SequenceFeature;
  30 import jalview.datamodel.SequenceI;
  31 import jalview.util.Comparison;
  32 import jalview.util.DBRefUtils;
  33 import jalview.util.MapList;
  34 import jalview.ws.SequenceFetcherFactory;
  35 import jalview.ws.seqfetcher.ASequenceFetcher;
  36
  37 import java.util.ArrayList;
  38 import java.util.Iterator;
  39 import java.util.List;
  40
  41 /**
  42  * Functions for cross-referencing sequence databases.
  43  *
  44  * @author JimP
  45  *
  46  */
  47 public class CrossRef
  48 {
  /*
   * the dataset of the alignment for which we are searching for
   * cross-references; in some cases we may resolve xrefs by
   * searching in the dataset. Assigned in the constructor, which
   * stores the supplied alignment's own dataset when it has one,
   * otherwise the supplied alignment itself.
   */
  private AlignmentI dataset;

  /*
   * the sequences for which we are seeking cross-references
   * (the array passed to the constructor, held by reference)
   */
  private SequenceI[] fromSeqs;
  60
  /**
   * Creates a cross-reference finder for a set of sequences.
   * 
   * @param seqs
   *          the sequences for which we are seeking cross-references
   * @param ds
   *          the containing alignment or alignment dataset (may be searched
   *          to resolve cross-references); if it reports a dataset of its
   *          own, that dataset is the one held, otherwise ds itself is held
   */
  public CrossRef(SequenceI[] seqs, AlignmentI ds)
  {
    this.fromSeqs = seqs;
    if (ds.getDataset() == null)
    {
      this.dataset = ds;
    }
    else
    {
      this.dataset = ds.getDataset();
    }
  }
  75
  76   /**
  77    * Returns a list of distinct database sources for which sequences have either
  78    * <ul>
  79    * <li>a (dna-to-protein or protein-to-dna) cross-reference</li>
  80    * <li>an indirect cross-reference - a (dna-to-protein or protein-to-dna)
  81    * reference from another sequence in the dataset which has a cross-reference
  82    * to a direct DBRefEntry on the given sequence</li>
  83    * </ul>
  84    *
  85    * @param dna
  86    *          - when true, cross-references *from* dna returned. When false,
  87    *          cross-references *from* protein are returned
  88    * @return
  89    */
  90   public List<String> findXrefSourcesForSequences(boolean dna)
  91   {
  92     List<String> sources = new ArrayList<String>();
  93     for (SequenceI seq : fromSeqs)
  94     {
  95       if (seq != null)
  96       {
  97         findXrefSourcesForSequence(seq, dna, sources);
  98       }
  99     }
 100     return sources;
 101   }
 102
 103   /**
 104    * Returns a list of distinct database sources for which a sequence has either
 105    * <ul>
 106    * <li>a (dna-to-protein or protein-to-dna) cross-reference</li>
 107    * <li>an indirect cross-reference - a (dna-to-protein or protein-to-dna)
 108    * reference from another sequence in the dataset which has a cross-reference
 109    * to a direct DBRefEntry on the given sequence</li>
 110    * </ul>
 111    *
 112    * @param seq
 113    *          the sequence whose dbrefs we are searching against
 114    * @param sources
 115    *          a list of sources to add matches to
 116    */
 117   void findXrefSourcesForSequence(SequenceI seq, boolean fromDna,
 118           List<String> sources)
 119   {
 120     /*
 121      * first find seq's xrefs (dna-to-peptide or peptide-to-dna)
 122      */
 123     DBRefEntry[] rfs = DBRefUtils.selectDbRefs(!fromDna, seq.getDBRefs());
 124     addXrefsToSources(rfs, sources);
 125     if (dataset != null)
 126     {
 127       /*
 128        * find sequence's direct (dna-to-dna, peptide-to-peptide) xrefs
 129        */
 130       DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(fromDna, seq.getDBRefs());
 131       List<SequenceI> rseqs = new ArrayList<SequenceI>();
 132
 133       /*
 134        * find sequences in the alignment which xref one of these DBRefs
 135        * i.e. is xref-ed to a common sequence identifier
 136        */
 137       searchDatasetXrefs(fromDna, seq, lrfs, rseqs, null);
 138
 139       /*
 140        * add those sequences' (dna-to-peptide or peptide-to-dna) dbref sources
 141        */
 142       for (SequenceI rs : rseqs)
 143       {
 144         DBRefEntry[] xrs = DBRefUtils
 145                 .selectDbRefs(!fromDna, rs.getDBRefs());
 146         addXrefsToSources(xrs, sources);
 147       }
 148     }
 149   }
 150
 151   /**
 152    * Helper method that adds the source identifiers of some cross-references to
 153    * a (non-redundant) list of database sources
 154    *
 155    * @param xrefs
 156    * @param sources
 157    */
 158   void addXrefsToSources(DBRefEntry[] xrefs, List<String> sources)
 159   {
 160     if (xrefs != null)
 161     {
 162       for (DBRefEntry ref : xrefs)
 163       {
 164         /*
 165          * avoid duplication e.g. ENSEMBL and Ensembl
 166          */
 167         String source = DBRefUtils.getCanonicalName(ref.getSource());
 168         if (!sources.contains(source))
 169         {
 170           sources.add(source);
 171         }
 172       }
 173     }
 174   }
 175
 176   /**
 177    * Attempts to find cross-references from the sequences provided in the
 178    * constructor to the given source database. Cross-references may be found
 179    * <ul>
 180    * <li>in dbrefs on the sequence which hold a mapping to a sequence
 181    * <ul>
 182    * <li>provided with a fetched sequence (e.g. ENA translation), or</li>
 183    * <li>populated previously after getting cross-references</li>
 184    * </ul>
 185    * <li>as other sequences in the alignment which share a dbref identifier with
 186    * the sequence</li>
 187    * <li>by fetching from the remote database</li>
 188    * </ul>
 189    * The cross-referenced sequences, and mappings to them, are added to the
 190    * alignment dataset.
 191    *
 192    * @param source
 193    * @return cross-referenced sequences (as dataset sequences)
 194    */
 195   public Alignment findXrefSequences(String source, boolean fromDna)
 196   {
 197
 198     List<SequenceI> rseqs = new ArrayList<SequenceI>();
 199     AlignedCodonFrame cf = new AlignedCodonFrame();
 200     SequenceIdMatcher matcher = new SequenceIdMatcher(
 201             dataset.getSequences());
 202
 203     for (SequenceI seq : fromSeqs)
 204     {
 205       SequenceI dss = seq;
 206       while (dss.getDatasetSequence() != null)
 207       {
 208         dss = dss.getDatasetSequence();
 209       }
 210       boolean found = false;
 211       DBRefEntry[] xrfs = DBRefUtils
 212               .selectDbRefs(!fromDna, dss.getDBRefs());
 213       if ((xrfs == null || xrfs.length == 0) && dataset != null)
 214       {
 215         /*
 216          * found no suitable dbrefs on sequence - look for sequences in the
 217          * alignment which share a dbref with this one
 218          */
 219         DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(fromDna,
 220                 seq.getDBRefs());
 221
 222         /*
 223          * find sequences (except this one!), of complementary type,
 224          *  which have a dbref to an accession id for this sequence,
 225          *  and add them to the results
 226          */
 227         found = searchDatasetXrefs(fromDna, dss, lrfs, rseqs, cf);
 228       }
 229       if (xrfs == null && !found)
 230       {
 231         /*
 232          * no dbref to source on this sequence or matched
 233          * complementary sequence in the dataset
 234          */
 235         continue;
 236       }
 237       List<DBRefEntry> sourceRefs = DBRefUtils.searchRefsForSource(xrfs,
 238               source);
 239       Iterator<DBRefEntry> refIterator = sourceRefs.iterator();
 240       while (refIterator.hasNext())
 241       {
 242         DBRefEntry xref = refIterator.next();
 243         found = false;
 244         if (xref.hasMap())
 245         {
 246           SequenceI mappedTo = xref.getMap().getTo();
 247           if (mappedTo != null)
 248           {
 249             /*
 250              * dbref contains the sequence it maps to; add it to the
 251              * results unless we have done so already (could happen if
 252              * fetching xrefs for sequences which have xrefs in common)
 253              * for example: UNIPROT {P0CE19, P0CE20} -> EMBL {J03321, X06707}
 254              */
 255             found = true;
 256             /*
 257              * problem: matcher.findIdMatch() is lenient - returns a sequence
 258              * with a dbref to the search arg e.g. ENST for ENSP - wrong
 259              * but findInDataset() matches ENSP when looking for Uniprot...
 260              */
 261             SequenceI matchInDataset = findInDataset(xref);
 262             /*matcher.findIdMatch(mappedTo);*/
 263             if (matchInDataset != null)
 264             {
 265               if (!rseqs.contains(matchInDataset))
 266               {
 267                 rseqs.add(matchInDataset);
 268               }
 269               refIterator.remove();
 270               continue;
 271             }
 272             SequenceI rsq = new Sequence(mappedTo);
 273             rseqs.add(rsq);
 274             if (xref.getMap().getMap().getFromRatio() != xref.getMap()
 275                     .getMap().getToRatio())
 276             {
 277               // get sense of map correct for adding to product alignment.
 278               if (fromDna)
 279               {
 280                 // map is from dna seq to a protein product
 281                 cf.addMap(dss, rsq, xref.getMap().getMap());
 282               }
 283               else
 284               {
 285                 // map should be from protein seq to its coding dna
 286                 cf.addMap(rsq, dss, xref.getMap().getMap().getInverse());
 287               }
 288             }
 289           }
 290         }
 291
 292         if (!found)
 293         {
 294           SequenceI matchedSeq = matcher.findIdMatch(xref.getSource() + "|"
 295                   + xref.getAccessionId());
 296           if (matchedSeq != null)
 297           {
 298             if (constructMapping(seq, matchedSeq, xref, cf, fromDna))
 299             {
 300               found = true;
 301             }
 302           }
 303         }
 304
 305         if (!found)
 306         {
 307           // do a bit more work - search for sequences with references matching
 308           // xrefs on this sequence.
 309           found = searchDataset(fromDna, dss, xref, rseqs, cf, false);
 310         }
 311         if (found)
 312         {
 313           refIterator.remove();
 314         }
 315       }
 316
 317       /*
 318        * fetch from source database any dbrefs we haven't resolved up to here
 319        */
 320       if (!sourceRefs.isEmpty())
 321       {
 322         ASequenceFetcher sftch = SequenceFetcherFactory
 323                 .getSequenceFetcher();
 324         SequenceI[] retrieved = null;
 325         try
 326         {
 327           retrieved = sftch.getSequences(sourceRefs, !fromDna);
 328         } catch (Exception e)
 329         {
 330           System.err
 331                   .println("Problem whilst retrieving cross references for Sequence : "
 332                           + seq.getName());
 333           e.printStackTrace();
 334         }
 335
 336         if (retrieved != null)
 337         {
 338           updateDbrefMappings(seq, xrfs, retrieved, cf, fromDna);
 339           for (SequenceI retrievedSequence : retrieved)
 340           {
 341             SequenceI retrievedDss = retrievedSequence.getDatasetSequence() == null ? retrievedSequence
 342                     : retrievedSequence.getDatasetSequence();
 343             DBRefEntry[] dbr = retrievedSequence.getDBRefs();
 344             if (dbr != null)
 345             {
 346               for (DBRefEntry dbref : dbr)
 347               {
 348                 // find any entry where we should put in the sequence being
 349                 // cross-referenced into the map
 350                 Mapping map = dbref.getMap();
 351                 if (map != null)
 352                 {
 353                   if (map.getTo() != null && map.getMap() != null)
 354                   {
 355                     // TODO findInDataset requires exact sequence match but
 356                     // 'congruent' test is only for the mapped part
 357                     // maybe not a problem in practice since only ENA provide a
 358                     // mapping and it is to the full protein translation of CDS
 359                     SequenceI matched = findInDataset(dbref);
 360                     // matcher.findIdMatch(map.getTo());
 361                     if (matched != null)
 362                     {
 363                       /*
 364                        * already got an xref to this sequence; update this
 365                        * map to point to the same sequence, and add
 366                        * any new dbrefs to it
 367                        */
 368                       DBRefEntry[] toRefs = map.getTo().getDBRefs();
 369                       if (toRefs != null)
 370                       {
 371                         for (DBRefEntry ref : toRefs)
 372                         {
 373                           matched.addDBRef(ref); // add or update mapping
 374                         }
 375                       }
 376                       map.setTo(matched);
 377                     }
 378                     else
 379                     {
 380                       matcher.add(map.getTo());
 381                     }
 382                     try
 383                     {
 384                       // compare ms with dss and replace with dss in mapping
 385                       // if map is congruent
 386                       SequenceI ms = map.getTo();
 387                       int sf = map.getMap().getToLowest();
 388                       int st = map.getMap().getToHighest();
 389                       SequenceI mappedrg = ms.getSubSequence(sf, st);
 390                       // SequenceI loc = dss.getSubSequence(sf, st);
 391                       if (mappedrg.getLength() > 0
 392                               && ms.getSequenceAsString().equals(
 393                                       dss.getSequenceAsString()))
 394                       // && mappedrg.getSequenceAsString().equals(
 395                       // loc.getSequenceAsString()))
 396                       {
 397                         String msg = "Mapping updated from " + ms.getName()
 398                                 + " to retrieved crossreference "
 399                                 + dss.getName();
 400                         System.out.println(msg);
 401                         map.setTo(dss);
 402
 403                         /*
 404                          * give the reverse reference the inverse mapping
 405                          * (if it doesn't have one already)
 406                          */
 407                         setReverseMapping(dss, dbref, cf);
 408
 409                         /*
 410                          * copy sequence features as well, avoiding
 411                          * duplication (e.g. same variation from two
 412                          * transcripts)
 413                          */
 414                         SequenceFeature[] sfs = ms.getSequenceFeatures();
 415                         if (sfs != null)
 416                         {
 417                           for (SequenceFeature feat : sfs)
 418                           {
 419                             /*
 420                              * make a flyweight feature object which ignores Parent
 421                              * attribute in equality test; this avoids creating many
 422                              * otherwise duplicate exon features on genomic sequence
 423                              */
 424                             SequenceFeature newFeature = new SequenceFeature(
 425                                     feat)
 426                             {
 427                               @Override
 428                               public boolean equals(Object o)
 429                               {
 430                                 return super.equals(o, true);
 431                               }
 432                             };
 433                             dss.addSequenceFeature(newFeature);
 434                           }
 435                         }
 436                       }
 437                       cf.addMap(retrievedDss, map.getTo(), map.getMap());
 438                     } catch (Exception e)
 439                     {
 440                       System.err
 441                               .println("Exception when consolidating Mapped sequence set...");
 442                       e.printStackTrace(System.err);
 443                     }
 444                   }
 445                 }
 446               }
 447             }
 448             retrievedSequence.updatePDBIds();
 449             rseqs.add(retrievedDss);
 450             dataset.addSequence(retrievedDss);
 451             matcher.add(retrievedDss);
 452           }
 453         }
 454       }
 455     }
 456
 457     Alignment ral = null;
 458     if (rseqs.size() > 0)
 459     {
 460       ral = new Alignment(rseqs.toArray(new SequenceI[rseqs.size()]));
 461       if (!cf.isEmpty())
 462       {
 463         dataset.addCodonFrame(cf);
 464       }
 465     }
 466     return ral;
 467   }
 468
 469   /**
 470    * Sets the inverse sequence mapping in the corresponding dbref of the mapped
 471    * to sequence (if any). This is used after fetching a cross-referenced
 472    * sequence, if the fetched sequence has a mapping to the original sequence,
 473    * to set the mapping in the original sequence's dbref.
 474    *
 475    * @param mapFrom
 476    *          the sequence mapped from
 477    * @param dbref
 478    * @param mappings
 479    */
 480   void setReverseMapping(SequenceI mapFrom, DBRefEntry dbref,
 481           AlignedCodonFrame mappings)
 482   {
 483     SequenceI mapTo = dbref.getMap().getTo();
 484     if (mapTo == null)
 485     {
 486       return;
 487     }
 488     DBRefEntry[] dbrefs = mapTo.getDBRefs();
 489     if (dbrefs == null)
 490     {
 491       return;
 492     }
 493     for (DBRefEntry toRef : dbrefs)
 494     {
 495       if (toRef.hasMap() && mapFrom == toRef.getMap().getTo())
 496       {
 497         /*
 498          * found the reverse dbref; update its mapping if null
 499          */
 500         if (toRef.getMap().getMap() == null)
 501         {
 502           MapList inverse = dbref.getMap().getMap().getInverse();
 503           toRef.getMap().setMap(inverse);
 504           mappings.addMap(mapTo, mapFrom, inverse);
 505         }
 506       }
 507     }
 508   }
 509
 510   /**
 511    * Returns the first identical sequence in the dataset if any, else null
 512    *
 513    * @param xref
 514    * @return
 515    */
 516   SequenceI findInDataset(DBRefEntry xref)
 517   {
 518     if (xref == null || !xref.hasMap() || xref.getMap().getTo() == null)
 519     {
 520       return null;
 521     }
 522     SequenceI mapsTo = xref.getMap().getTo();
 523     String name = xref.getAccessionId();
 524     String name2 = xref.getSource() + "|" + name;
 525     SequenceI dss = mapsTo.getDatasetSequence() == null ? mapsTo : mapsTo
 526             .getDatasetSequence();
 527     for (SequenceI seq : dataset.getSequences())
 528     {
 529       /*
 530        * clumsy alternative to using SequenceIdMatcher which currently
 531        * returns sequences with a dbref to the matched accession id
 532        * which we don't want
 533        */
 534       if (name.equals(seq.getName()) || seq.getName().startsWith(name2))
 535       {
 536         if (sameSequence(seq, dss))
 537         {
 538           return seq;
 539         }
 540       }
 541     }
 542     return null;
 543   }
 544
 545   /**
 546    * Answers true if seq1 and seq2 contain exactly the same characters (ignoring
 547    * case), else false. This method compares the lengths, then each character in
 548    * turn, in order to 'fail fast'. For case-sensitive comparison, it would be
 549    * possible to use Arrays.equals(seq1.getSequence(), seq2.getSequence()).
 550    *
 551    * @param seq1
 552    * @param seq2
 553    * @return
 554    */
 555   // TODO move to Sequence / SequenceI
 556   static boolean sameSequence(SequenceI seq1, SequenceI seq2)
 557   {
 558     if (seq1 == seq2)
 559     {
 560       return true;
 561     }
 562     if (seq1 == null || seq2 == null)
 563     {
 564       return false;
 565     }
 566     char[] c1 = seq1.getSequence();
 567     char[] c2 = seq2.getSequence();
 568     if (c1.length != c2.length)
 569     {
 570       return false;
 571     }
 572     for (int i = 0; i < c1.length; i++)
 573     {
 574       int diff = c1[i] - c2[i];
 575       /*
 576        * same char or differ in case only ('a'-'A' == 32)
 577        */
 578       if (diff != 0 && diff != 32 && diff != -32)
 579       {
 580         return false;
 581       }
 582     }
 583     return true;
 584   }
 585
 586   /**
 587    * Updates any empty mappings in the cross-references with one to a compatible
 588    * retrieved sequence if found, and adds any new mappings to the
 589    * AlignedCodonFrame
 590    *
 591    * @param mapFrom
 592    * @param xrefs
 593    * @param retrieved
 594    * @param acf
 595    */
 596   void updateDbrefMappings(SequenceI mapFrom, DBRefEntry[] xrefs,
 597           SequenceI[] retrieved, AlignedCodonFrame acf, boolean fromDna)
 598   {
 599     SequenceIdMatcher matcher = new SequenceIdMatcher(retrieved);
 600     for (DBRefEntry xref : xrefs)
 601     {
 602       if (!xref.hasMap())
 603       {
 604         String targetSeqName = xref.getSource() + "|"
 605                 + xref.getAccessionId();
 606         SequenceI[] matches = matcher.findAllIdMatches(targetSeqName);
 607         if (matches == null)
 608         {
 609           return;
 610         }
 611         for (SequenceI seq : matches)
 612         {
 613           constructMapping(mapFrom, seq, xref, acf, fromDna);
 614         }
 615       }
 616     }
 617   }
 618
 619   /**
 620    * Tries to make a mapping between sequences. If successful, adds the mapping
 621    * to the dbref and the mappings collection and answers true, otherwise
 622    * answers false. The following methods of making are mapping are tried in
 623    * turn:
 624    * <ul>
 625    * <li>if 'mapTo' holds a mapping to 'mapFrom', take the inverse; this is, for
 626    * example, the case after fetching EMBL cross-references for a Uniprot
 627    * sequence</li>
 628    * <li>else check if the dna translates exactly to the protein (give or take
 629    * start and stop codons></li>
 630    * <li>else try to map based on CDS features on the dna sequence</li>
 631    * </ul>
 632    *
 633    * @param mapFrom
 634    * @param mapTo
 635    * @param xref
 636    * @param mappings
 637    * @return
 638    */
 639   boolean constructMapping(SequenceI mapFrom, SequenceI mapTo,
 640           DBRefEntry xref, AlignedCodonFrame mappings, boolean fromDna)
 641   {
 642     MapList mapping = null;
 643
 644     /*
 645      * look for a reverse mapping, if found make its inverse
 646      */
 647     if (mapTo.getDBRefs() != null)
 648     {
 649       for (DBRefEntry dbref : mapTo.getDBRefs())
 650       {
 651         String name = dbref.getSource() + "|" + dbref.getAccessionId();
 652         if (dbref.hasMap() && mapFrom.getName().startsWith(name))
 653         {
 654           /*
 655            * looks like we've found a map from 'mapTo' to 'mapFrom'
 656            * - invert it to make the mapping the other way
 657            */
 658           MapList reverse = dbref.getMap().getMap().getInverse();
 659           xref.setMap(new Mapping(mapTo, reverse));
 660           mappings.addMap(mapFrom, mapTo, reverse);
 661           return true;
 662         }
 663       }
 664     }
 665
 666     if (fromDna)
 667     {
 668       mapping = AlignmentUtils.mapCdnaToProtein(mapTo, mapFrom);
 669     }
 670     else
 671     {
 672       mapping = AlignmentUtils.mapCdnaToProtein(mapFrom, mapTo);
 673       if (mapping != null)
 674       {
 675         mapping = mapping.getInverse();
 676       }
 677     }
 678     if (mapping == null)
 679     {
 680       return false;
 681     }
 682     xref.setMap(new Mapping(mapTo, mapping));
 683     if (fromDna)
 684     {
 685       AlignmentUtils.computeProteinFeatures(mapFrom, mapTo, mapping);
 686       mappings.addMap(mapFrom, mapTo, mapping);
 687     }
 688     else
 689     {
 690       mappings.addMap(mapTo, mapFrom, mapping.getInverse());
 691     }
 692
 693     return true;
 694   }
 695
 696   /**
 697    * find references to lrfs in the cross-reference set of each sequence in
 698    * dataset (that is not equal to sequenceI) Identifies matching DBRefEntry
 699    * based on source and accession string only - Map and Version are nulled.
 700    *
 701    * @param fromDna
 702    *          - true if context was searching from Dna sequences, false if
 703    *          context was searching from Protein sequences
 704    * @param sequenceI
 705    * @param lrfs
 706    * @param rseqs
 707    * @return true if matches were found.
 708    */
 709   private boolean searchDatasetXrefs(boolean fromDna, SequenceI sequenceI,
 710           DBRefEntry[] lrfs, List<SequenceI> rseqs, AlignedCodonFrame cf)
 711   {
 712     boolean found = false;
 713     if (lrfs == null)
 714     {
 715       return false;
 716     }
 717     for (int i = 0; i < lrfs.length; i++)
 718     {
 719       DBRefEntry xref = new DBRefEntry(lrfs[i]);
 720       // add in wildcards
 721       xref.setVersion(null);
 722       xref.setMap(null);
 723       found |= searchDataset(fromDna, sequenceI, xref, rseqs, cf, false);
 724     }
 725     return found;
 726   }
 727
 728   /**
 729    * Searches dataset for DBRefEntrys matching the given one (xrf) and adds the
 730    * associated sequence to rseqs
 731    *
 732    * @param fromDna
 733    *          true if context was searching for refs *from* dna sequence, false
 734    *          if context was searching for refs *from* protein sequence
 735    * @param sequenceI
 736    *          a sequence to ignore (start point of search)
 737    * @param xrf
 738    *          a cross-reference to try to match
 739    * @param rseqs
 740    *          result list to add to
 741    * @param cf
 742    *          a set of sequence mappings to add to
 743    * @param direct
 744    *          - search all references or only subset
 745    * @return true if relationship found and sequence added.
 746    */
 747   boolean searchDataset(boolean fromDna, SequenceI sequenceI,
 748           DBRefEntry xrf, List<SequenceI> rseqs, AlignedCodonFrame cf,
 749           boolean direct)
 750   {
 751     boolean found = false;
 752     if (dataset == null)
 753     {
 754       return false;
 755     }
 756     if (dataset.getSequences() == null)
 757     {
 758       System.err.println("Empty dataset sequence set - NO VECTOR");
 759       return false;
 760     }
 761     List<SequenceI> ds;
 762     synchronized (ds = dataset.getSequences())
 763     {
 764       for (SequenceI nxt : ds)
 765       {
 766         if (nxt != null)
 767         {
 768           if (nxt.getDatasetSequence() != null)
 769           {
 770             System.err
 771                     .println("Implementation warning: getProducts passed a dataset alignment without dataset sequences in it!");
 772           }
 773           if (nxt == sequenceI || nxt == sequenceI.getDatasetSequence())
 774           {
 775             continue;
 776           }
 777           /*
 778            * only look at same molecule type if 'direct', or
 779            * complementary type if !direct
 780            */
 781           {
 782             boolean isDna = !nxt.isProtein();
 783             if (direct ? (isDna != fromDna) : (isDna == fromDna))
 784             {
 785               // skip this sequence because it is wrong molecule type
 786               continue;
 787             }
 788           }
 789
 790           // look for direct or indirect references in common
 791           DBRefEntry[] poss = nxt.getDBRefs();
 792           List<DBRefEntry> cands = null;
 793           /*
 794            * TODO does this make any sense?
 795            * if 'direct', search the dbrefs for xrf
 796            * else, filter the dbrefs by type and then search for xrf
 797            * - the result is the same isn't it?
 798            */
 799           if (direct)
 800           {
 801             cands = DBRefUtils.searchRefs(poss, xrf);
 802           }
 803           else
 804           {
 805             poss = DBRefUtils.selectDbRefs(!fromDna, poss);
 806             cands = DBRefUtils.searchRefs(poss, xrf);
 807           }
 808           if (!cands.isEmpty())
 809           {
 810             if (!rseqs.contains(nxt))
 811             {
 812               found = true;
 813               rseqs.add(nxt);
 814               if (cf != null)
 815               {
 816                 // don't search if we aren't given a codon map object
 817                 for (DBRefEntry candidate : cands)
 818                 {
 819                   Mapping mapping = candidate.getMap();
 820                   if (mapping != null)
 821                   {
 822                     MapList map = mapping.getMap();
 823                     if (mapping.getTo() != null
 824                             && map.getFromRatio() != map.getToRatio())
 825                     {
 826                       // get sense of map correct for adding to product
 827                       // alignment.
 828                       if (fromDna)
 829                       {
 830                         // map is from dna seq to a protein product
 831                         cf.addMap(sequenceI, nxt, map);
 832                       }
 833                       else
 834                       {
 835                         // map should be from protein seq to its coding dna
 836                         cf.addMap(nxt, sequenceI, map.getInverse());
 837                       }
 838                     }
 839                   }
 840                 }
 841               }
 842               // TODO: add mapping between sequences if necessary
 843             }
 844           }
 845         }
 846       }
 847     }
 848     return found;
 849   }
 850 }