src/jalview/analysis/CrossRef.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis;
  22
  23 import jalview.datamodel.AlignedCodonFrame;
  24 import jalview.datamodel.Alignment;
  25 import jalview.datamodel.AlignmentI;
  26 import jalview.datamodel.DBRefEntry;
  27 import jalview.datamodel.Mapping;
  28 import jalview.datamodel.Sequence;
  29 import jalview.datamodel.SequenceFeature;
  30 import jalview.datamodel.SequenceI;
  31 import jalview.util.Comparison;
  32 import jalview.util.DBRefUtils;
  33 import jalview.util.MapList;
  34 import jalview.ws.SequenceFetcherFactory;
  35 import jalview.ws.seqfetcher.ASequenceFetcher;
  36
  37 import java.util.ArrayList;
  38 import java.util.List;
  39
  40 /**
  41  * Functions for cross-referencing sequence databases. user must first specify
  42  * if cross-referencing from protein or dna (set dna==true)
  43  *
  44  * @author JimP
  45  *
  46  */
  47 public class CrossRef
  48 {
  49   /*
  50    * A sub-class that ignores Parent attribute when comparing sequence
  51    * features. This avoids 'duplicate' CDS features that only
  52    * differ in their parent Transcript ids.
  53    */
  54   class MySequenceFeature extends SequenceFeature
  55   {
  56     private SequenceFeature feat;
  57
  58     MySequenceFeature(SequenceFeature sf)
  59     {
  60       this.feat = sf;
  61     }
  62
  63     @Override
  64     public boolean equals(Object o)
  65     {
  66       return feat.equals(o, true);
  67     }
  68   }
  69
  70   /**
  71
  72    * Returns a list of distinct database sources for which sequences have either
  73    * <ul>
  74    * <li>a (dna-to-protein or protein-to-dna) cross-reference</li>
  75    * <li>an indirect cross-reference - a (dna-to-protein or protein-to-dna)
  76    * reference from another sequence in the dataset which has a cross-reference
  77    * to a direct DBRefEntry on the given sequence</li>
  78    * </ul>
  79    *
  80    * @param dna
  81    *          true if seqs are nucleotide
  82    * @param seqs
  83    *          sequences whose xrefs we are seeking
  84    * @param dataset
  85    *          an alignment to search for indirect references
  86    * @return
  87    */
  88   public static List<String> findXrefSourcesForSequences(boolean dna,
  89           SequenceI[] seqs, AlignmentI dataset)
  90   {
  91     List<String> sources = new ArrayList<String>();
  92     for (SequenceI seq : seqs)
  93     {
  94       if (seq != null)
  95       {
  96         findXrefSourcesForSequence(seq, dna, dataset, sources);
  97       }
  98     }
  99     return sources;
 100   }
 101
 102   /**
 103    * Returns a list of distinct database sources for which a sequence has either
 104    * <ul>
 105    * <li>a (dna-to-protein or protein-to-dna) cross-reference</li>
 106    * <li>an indirect cross-reference - a (dna-to-protein or protein-to-dna)
 107    * reference from another sequence in the dataset which has a cross-reference
 108    * to a direct DBRefEntry on the given sequence</li>
 109    * </ul>
 110    *
 111    * @param seq
 112    *          the sequence whose dbrefs we are searching against
 113    * @param dna
 114    *          true if the sequence is nucleotide
 115    * @param dataset
 116    *          an alignment to search for indirect references
 117    * @param sources
 118    *          a list of sources to add matches to
 119    */
 120   static void findXrefSourcesForSequence(SequenceI seq, boolean dna,
 121           AlignmentI dataset, List<String> sources)
 122   {
 123     /*
 124      * first find seq's xrefs (dna-to-peptide or peptide-to-dna)
 125      */
 126     DBRefEntry[] rfs = DBRefUtils.selectDbRefs(!dna, seq.getDBRefs());
 127     addXrefsToSources(rfs, sources);
 128     if (dataset != null)
 129     {
 130       /*
 131        * find sequence's direct (dna-to-dna, peptide-to-peptide) xrefs
 132        */
 133       DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(dna, seq.getDBRefs());
 134       List<SequenceI> rseqs = new ArrayList<SequenceI>();
 135
 136       /*
 137        * find sequences in the alignment which xref one of these DBRefs
 138        * i.e. is xref-ed to a common sequence identifier
 139        */
 140       CrossRef.searchDatasetXrefs(seq, !dna, lrfs, dataset, rseqs, null);
 141
 142       /*
 143        * add those sequences' (dna-to-peptide or peptide-to-dna) dbref sources
 144        */
 145       for (SequenceI rs : rseqs)
 146       {
 147         DBRefEntry[] xrs = DBRefUtils.selectDbRefs(!dna, rs.getDBRefs());
 148         addXrefsToSources(xrs, sources);
 149       }
 150     }
 151   }
 152
 153   /**
 154    * Helper method that adds the source identifiers of some cross-references to
 155    * a (non-redundant) list of database sources
 156    *
 157    * @param xrefs
 158    * @param sources
 159    */
 160   static void addXrefsToSources(DBRefEntry[] xrefs, List<String> sources)
 161   {
 162     if (xrefs != null)
 163     {
 164       for (DBRefEntry ref : xrefs)
 165       {
 166         String source = ref.getSource();
 167         if (!sources.contains(source))
 168         {
 169           sources.add(source);
 170         }
 171       }
 172     }
 173   }
 174
 175   /**
 176    *
 177    * @param seqs
 178    *          sequences whose xrefs are being retrieved
 179    * @param dna
 180    *          true if sequences are nucleotide
 181    * @param source
 182    * @param al
 183    *          alignment to search for cross-referenced sequences (and possibly
 184    *          add to)
 185    * @return products (as dataset sequences)
 186    */
 187   public static Alignment findXrefSequences(SequenceI[] seqs,
 188           final boolean dna, final String source, AlignmentI al)
 189   {
 190     AlignmentI dataset = al.getDataset() == null ? al : al.getDataset();
 191     List<SequenceI> rseqs = new ArrayList<SequenceI>();
 192     AlignedCodonFrame cf = new AlignedCodonFrame();
 193     for (SequenceI seq : seqs)
 194     {
 195       SequenceI dss = seq;
 196       while (dss.getDatasetSequence() != null)
 197       {
 198         dss = dss.getDatasetSequence();
 199       }
 200       boolean found = false;
 201       DBRefEntry[] xrfs = DBRefUtils.selectDbRefs(!dna, dss.getDBRefs());
 202       if ((xrfs == null || xrfs.length == 0) && dataset != null)
 203       {
 204         /*
 205          * found no suitable dbrefs on sequence - look for sequences in the
 206          * alignment which share a dbref with this one
 207          */
 208         DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(dna, seq.getDBRefs());
 209
 210         /*
 211          * find sequences (except this one!), of complementary type,
 212          *  which have a dbref to an accession id for this sequence,
 213          *  and add them to the results
 214          */
 215         found = CrossRef.searchDatasetXrefs(dss, !dna, lrfs, dataset,
 216                 rseqs, cf);
 217       }
 218       for (int r = 0; xrfs != null && r < xrfs.length; r++)
 219       {
 220         DBRefEntry xref = xrfs[r];
 221         if (source != null && !source.equals(xref.getSource()))
 222         {
 223           continue;
 224         }
 225         if (xref.hasMap())
 226         {
 227           if (xref.getMap().getTo() != null)
 228           {
 229             found = true;
 230             SequenceI rsq = new Sequence(xref.getMap().getTo());
 231             rseqs.add(rsq);
 232             if (xref.getMap().getMap().getFromRatio() != xref
 233                     .getMap().getMap().getToRatio())
 234             {
 235               // get sense of map correct for adding to product alignment.
 236               if (dna)
 237               {
 238                 // map is from dna seq to a protein product
 239                 cf.addMap(dss, rsq, xref.getMap().getMap());
 240               }
 241               else
 242               {
 243                 // map should be from protein seq to its coding dna
 244                 cf.addMap(rsq, dss, xref.getMap().getMap().getInverse());
 245               }
 246             }
 247           }
 248         }
 249         if (!found)
 250         {
 251           // do a bit more work - search for sequences with references matching
 252           // xrefs on this sequence.
 253           if (dataset != null)
 254           {
 255             found = searchDataset(dss, xref, dataset, rseqs, cf, false,/*true?*/
 256                     !dna);
 257             if (found)
 258             {
 259               xrfs[r] = null; // we've recovered seqs for this one.
 260             }
 261           }
 262         }
 263       }
 264       if (!found)
 265       {
 266         if (xrfs != null && xrfs.length > 0)
 267         {
 268           ASequenceFetcher sftch = SequenceFetcherFactory
 269                   .getSequenceFetcher();
 270           SequenceI[] retrieved = null;
 271           int l = xrfs.length;
 272           for (int r = 0; r < xrfs.length; r++)
 273           {
 274             // filter out any irrelevant or irretrievable references
 275             if (xrfs[r] == null
 276                     || ((source != null && !source.equals(xrfs[r]
 277                             .getSource())) || !sftch.isFetchable(xrfs[r]
 278                             .getSource())))
 279             {
 280               l--;
 281               xrfs[r] = null;
 282             }
 283           }
 284           if (l > 0)
 285           {
 286             // System.out
 287             // .println("Attempting to retrieve cross referenced sequences.");
 288             DBRefEntry[] t = new DBRefEntry[l];
 289             l = 0;
 290             for (int r = 0; r < xrfs.length; r++)
 291             {
 292               if (xrfs[r] != null)
 293               {
 294                 t[l++] = xrfs[r];
 295               }
 296             }
 297             xrfs = t;
 298             try
 299             {
 300               retrieved = sftch.getSequences(xrfs, !dna);
 301               // problem here is we don't know which of xrfs resulted in which
 302               // retrieved element
 303             } catch (Exception e)
 304             {
 305               System.err
 306                       .println("Problem whilst retrieving cross references for Sequence : "
 307                               + seq.getName());
 308               e.printStackTrace();
 309             }
 310
 311             if (retrieved != null)
 312             {
 313               updateDbrefMappings(dna, seq, xrfs, retrieved, cf);
 314
 315               SequenceIdMatcher matcher = new SequenceIdMatcher(
 316                       dataset.getSequences());
 317               List<SequenceFeature> copiedFeatures = new ArrayList<SequenceFeature>();
 318               CrossRef me = new CrossRef();
 319               for (int rs = 0; rs < retrieved.length; rs++)
 320               {
 321                 // TODO: examine each sequence for 'redundancy'
 322                 DBRefEntry[] dbr = retrieved[rs].getDBRefs();
 323                 if (dbr != null && dbr.length > 0)
 324                 {
 325                   for (int di = 0; di < dbr.length; di++)
 326                   {
 327                     // find any entry where we should put in the sequence being
 328                     // cross-referenced into the map
 329                     Mapping map = dbr[di].getMap();
 330                     if (map != null)
 331                     {
 332                       if (map.getTo() != null && map.getMap() != null)
 333                       {
 334                         SequenceI matched = matcher
 335                                 .findIdMatch(map.getTo());
 336                         if (matched != null)
 337                         {
 338                           /*
 339                            * already got an xref to this sequence; update this
 340                            * map to point to the same sequence, and add
 341                            * any new dbrefs to it
 342                            */
 343                           for (DBRefEntry ref : map.getTo().getDBRefs())
 344                           {
 345                             matched.addDBRef(ref); // add or update mapping
 346                           }
 347                           map.setTo(matched);
 348                         }
 349                         else
 350                         {
 351                           matcher.add(map.getTo());
 352                         }
 353                         try
 354                         {
 355                           // compare ms with dss and replace with dss in mapping
 356                           // if map is congruent
 357                           SequenceI ms = map.getTo();
 358                           int sf = map.getMap().getToLowest();
 359                           int st = map.getMap().getToHighest();
 360                           SequenceI mappedrg = ms.getSubSequence(sf, st);
 361                           // SequenceI loc = dss.getSubSequence(sf, st);
 362                           if (mappedrg.getLength() > 0
 363                                   && ms.getSequenceAsString().equals(
 364                                           dss.getSequenceAsString()))
 365                           // && mappedrg.getSequenceAsString().equals(
 366                           // loc.getSequenceAsString()))
 367                           {
 368                             String msg = "Mapping updated from "
 369                                     + ms.getName()
 370                                     + " to retrieved crossreference "
 371                                     + dss.getName();
 372                             System.out.println(msg);
 373                             // method to update all refs of existing To on
 374                             // retrieved sequence with dss and merge any props
 375                             // on To onto dss.
 376                             map.setTo(dss);
 377                             /*
 378                              * copy sequence features as well, avoiding
 379                              * duplication (e.g. same variation from 2
 380                              * transcripts)
 381                              */
 382                             SequenceFeature[] sfs = ms
 383                                     .getSequenceFeatures();
 384                             if (sfs != null)
 385                             {
 386                               for (SequenceFeature feat : sfs)
 387                               {
 388                                 /*
 389                                  * we override SequenceFeature.equals here (but
 390                                  * not elsewhere) to ignore Parent attribute
 391                                  * TODO not quite working yet!
 392                                  */
 393                                 if (!copiedFeatures
 394                                         .contains(me.new MySequenceFeature(
 395                                                 feat)))
 396                                 {
 397                                   dss.addSequenceFeature(feat);
 398                                   copiedFeatures.add(feat);
 399                                 }
 400                               }
 401                             }
 402                             cf.addMap(retrieved[rs].getDatasetSequence(),
 403                                     dss, map.getMap());
 404                           }
 405                           // TODO remove this 'else' and the cf.addMap above?
 406                           else
 407                           {
 408                             cf.addMap(retrieved[rs].getDatasetSequence(),
 409                                     map.getTo(), map.getMap());
 410                           }
 411                         } catch (Exception e)
 412                         {
 413                           System.err
 414                                   .println("Exception when consolidating Mapped sequence set...");
 415                           e.printStackTrace(System.err);
 416                         }
 417                       }
 418                     }
 419                   }
 420                 }
 421                 retrieved[rs].updatePDBIds();
 422                 rseqs.add(retrieved[rs]);
 423               }
 424             }
 425           }
 426         }
 427       }
 428     }
 429
 430     Alignment ral = null;
 431     if (rseqs.size() > 0)
 432     {
 433       ral = new Alignment(rseqs.toArray(new SequenceI[rseqs.size()]));
 434       if (cf != null && !cf.isEmpty())
 435       {
 436         ral.addCodonFrame(cf);
 437       }
 438     }
 439     return ral;
 440   }
 441
 442   /**
 443    * Updates any empty mappings in the cross-references with one to a compatible
 444    * retrieved sequence if found, and adds any new mappings to the
 445    * AlignedCodonFrame
 446    *
 447    * @param dna
 448    * @param mapFrom
 449    * @param xrefs
 450    * @param retrieved
 451    * @param acf
 452    */
 453   static void updateDbrefMappings(boolean dna, SequenceI mapFrom,
 454           DBRefEntry[] xrefs, SequenceI[] retrieved, AlignedCodonFrame acf)
 455   {
 456     SequenceIdMatcher matcher = new SequenceIdMatcher(retrieved);
 457     for (DBRefEntry xref : xrefs)
 458     {
 459       if (!xref.hasMap())
 460       {
 461         String targetSeqName = xref.getSource() + "|"
 462                 + xref.getAccessionId();
 463         SequenceI[] matches = matcher.findAllIdMatches(targetSeqName);
 464         if (matches == null)
 465         {
 466           return;
 467         }
 468         for (SequenceI seq : matches)
 469         {
 470           MapList mapping = null;
 471           if (dna)
 472           {
 473             mapping = AlignmentUtils.mapCdnaToProtein(seq, mapFrom);
 474           }
 475           else
 476           {
 477             mapping = AlignmentUtils.mapCdnaToProtein(mapFrom, seq);
 478             if (mapping != null)
 479             {
 480               mapping = mapping.getInverse();
 481             }
 482           }
 483           if (mapping != null)
 484           {
 485             xref.setMap(new Mapping(seq, mapping));
 486             if (dna)
 487             {
 488               AlignmentUtils.computeProteinFeatures(mapFrom, seq, mapping);
 489             }
 490             if (dna)
 491             {
 492               acf.addMap(mapFrom, seq, mapping);
 493             }
 494             else
 495             {
 496               acf.addMap(seq, mapFrom, mapping.getInverse());
 497             }
 498             continue;
 499           }
 500         }
 501       }
 502     }
 503   }
 504
 505   /**
 506    * find references to lrfs in the cross-reference set of each sequence in
 507    * dataset (that is not equal to sequenceI) Identifies matching DBRefEntry
 508    * based on source and accession string only - Map and Version are nulled.
 509    *
 510    * @param sequenceI
 511    * @param lrfs
 512    * @param dataset
 513    * @param rseqs
 514    * @return true if matches were found.
 515    */
 516   private static boolean searchDatasetXrefs(SequenceI sequenceI,
 517           boolean dna, DBRefEntry[] lrfs, AlignmentI dataset,
 518           List<SequenceI> rseqs, AlignedCodonFrame cf)
 519   {
 520     boolean found = false;
 521     if (lrfs == null)
 522     {
 523       return false;
 524     }
 525     for (int i = 0; i < lrfs.length; i++)
 526     {
 527       DBRefEntry xref = new DBRefEntry(lrfs[i]);
 528       // add in wildcards
 529       xref.setVersion(null);
 530       xref.setMap(null);
 531       found |= searchDataset(sequenceI, xref, dataset, rseqs, cf, false,
 532               dna);
 533     }
 534     return found;
 535   }
 536
 537   /**
 538    * Searches dataset for DBRefEntrys matching the given one (xrf) and adds the
 539    * associated sequence to rseqs
 540    *
 541    * @param sequenceI
 542    *          a sequence to ignore (start point of search)
 543    * @param xrf
 544    *          a cross-reference to try to match
 545    * @param dataset
 546    *          sequences to search in
 547    * @param rseqs
 548    *          result list to add to
 549    * @param cf
 550    *          a set of sequence mappings to add to
 551    * @param direct
 552    *          - search all references or only subset
 553    * @param dna
 554    *          search dna or protein xrefs (if direct=false)
 555    * @return true if relationship found and sequence added.
 556    */
 557   public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
 558           AlignmentI dataset, List<SequenceI> rseqs, AlignedCodonFrame cf,
 559           boolean direct, boolean dna)
 560   {
 561     boolean found = false;
 562     if (dataset == null)
 563     {
 564       return false;
 565     }
 566     if (dataset.getSequences() == null)
 567     {
 568       System.err.println("Empty dataset sequence set - NO VECTOR");
 569       return false;
 570     }
 571     List<SequenceI> ds;
 572     synchronized (ds = dataset.getSequences())
 573     {
 574       for (SequenceI nxt : ds)
 575       {
 576         if (nxt != null)
 577         {
 578           if (nxt.getDatasetSequence() != null)
 579           {
 580             System.err
 581                     .println("Implementation warning: getProducts passed a dataset alignment without dataset sequences in it!");
 582           }
 583           if (nxt == sequenceI || nxt == sequenceI.getDatasetSequence())
 584           {
 585             continue;
 586           }
 587           // check if this is the correct sequence type
 588           {
 589             // TODO 'direct' is always set to false - remove?
 590             // or should it be 'true' from findXrefSequences?
 591             // also its Javadoc conflicts with its use:
 592             // test below implies 'direct' means find complementary sequences,
 593             // !direct means select same molecule type
 594             boolean isDna = Comparison
 595                     .isNucleotide(new SequenceI[] { nxt });
 596             if ((direct && isDna == dna) || (!direct && isDna != dna))
 597             {
 598               // skip this sequence because it is wrong molecule type
 599               continue;
 600             }
 601           }
 602
 603           // look for direct or indirect references in common
 604           DBRefEntry[] poss = nxt.getDBRefs();
 605           DBRefEntry[] cands = null;
 606           /*
 607            * TODO does this make any sense?
 608            * if 'direct', search the dbrefs for xrf
 609            * else, filter the dbrefs by type and then search for xrf
 610            * - the result is the same isn't it?
 611            */
 612           if (direct)
 613           {
 614             cands = DBRefUtils.searchRefs(poss, xrf);
 615           }
 616           else
 617           {
 618             poss = DBRefUtils.selectDbRefs(!dna, poss);
 619             cands = DBRefUtils.searchRefs(poss, xrf);
 620           }
 621           if (cands != null)
 622           {
 623             if (!rseqs.contains(nxt))
 624             {
 625               found = true;
 626               rseqs.add(nxt);
 627               boolean foundmap = cf != null;
 628               // don't search if we aren't given a codon map object
 629               for (int r = 0; foundmap && r < cands.length; r++)
 630               {
 631                 if (cands[r].hasMap())
 632                 {
 633                   Mapping mapping = cands[r].getMap();
 634                   MapList map = mapping.getMap();
 635                   if (mapping.getTo() != null
 636                           && map.getFromRatio() != map.getToRatio())
 637                   {
 638                     // get sense of map correct for adding to product
 639                     // alignment.
 640                     if (dna)
 641                     {
 642                       // map is from dna seq to a protein product
 643                       cf.addMap(sequenceI, nxt, map);
 644                     }
 645                     else
 646                     {
 647                       // map should be from protein seq to its coding dna
 648                       cf.addMap(nxt, sequenceI, map.getInverse());
 649                     }
 650                   }
 651                 }
 652               }
 653               // TODO: add mapping between sequences if necessary
 654             }
 655           }
 656         }
 657       }
 658     }
 659     return found;
 660   }
 661 }