src/jalview/analysis/CrossRef.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis;
  22
  23 import jalview.datamodel.AlignedCodonFrame;
  24 import jalview.datamodel.Alignment;
  25 import jalview.datamodel.AlignmentI;
  26 import jalview.datamodel.DBRefEntry;
  27 import jalview.datamodel.Mapping;
  28 import jalview.datamodel.Sequence;
  29 import jalview.datamodel.SequenceFeature;
  30 import jalview.datamodel.SequenceI;
  31 import jalview.util.Comparison;
  32 import jalview.util.DBRefUtils;
  33 import jalview.util.MapList;
  34 import jalview.ws.SequenceFetcherFactory;
  35 import jalview.ws.seqfetcher.ASequenceFetcher;
  36
  37 import java.util.ArrayList;
  38 import java.util.Arrays;
  39 import java.util.List;
  40
  41 /**
  42  * Functions for cross-referencing sequence databases. user must first specify
  43  * if cross-referencing from protein or dna (set dna==true)
  44  *
  45  * @author JimP
  46  *
  47  */
  48 public class CrossRef
  49 {
  50   /*
  51    * A sub-class that ignores Parent attribute when comparing sequence
  52    * features. This avoids 'duplicate' CDS features that only
  53    * differ in their parent Transcript ids.
  54    */
  55   class MySequenceFeature extends SequenceFeature
  56   {
  57     private SequenceFeature feat;
  58
  59     MySequenceFeature(SequenceFeature sf)
  60     {
  61       this.feat = sf;
  62     }
  63
  64     @Override
  65     public boolean equals(Object o)
  66     {
  67       return feat.equals(o, true);
  68     }
  69   }
  70
  71   /**
  72
  73    * Returns a list of distinct database sources for which sequences have either
  74    * <ul>
  75    * <li>a (dna-to-protein or protein-to-dna) cross-reference</li>
  76    * <li>an indirect cross-reference - a (dna-to-protein or protein-to-dna)
  77    * reference from another sequence in the dataset which has a cross-reference
  78    * to a direct DBRefEntry on the given sequence</li>
  79    * </ul>
  80    *
  81    * @param dna
  82    *          true if seqs are nucleotide
  83    * @param seqs
  84    *          sequences whose xrefs we are seeking
  85    * @param dataset
  86    *          an alignment to search for indirect references
  87    * @return
  88    */
  89   public static List<String> findXrefSourcesForSequences(boolean dna,
  90           SequenceI[] seqs, AlignmentI dataset)
  91   {
  92     List<String> sources = new ArrayList<String>();
  93     for (SequenceI seq : seqs)
  94     {
  95       if (seq != null)
  96       {
  97         findXrefSourcesForSequence(seq, dna, dataset, sources);
  98       }
  99     }
 100     return sources;
 101   }
 102
 103   /**
 104    * Returns a list of distinct database sources for which a sequence has either
 105    * <ul>
 106    * <li>a (dna-to-protein or protein-to-dna) cross-reference</li>
 107    * <li>an indirect cross-reference - a (dna-to-protein or protein-to-dna)
 108    * reference from another sequence in the dataset which has a cross-reference
 109    * to a direct DBRefEntry on the given sequence</li>
 110    * </ul>
 111    *
 112    * @param seq
 113    *          the sequence whose dbrefs we are searching against
 114    * @param dna
 115    *          true if the sequence is nucleotide
 116    * @param dataset
 117    *          an alignment to search for indirect references
 118    * @param sources
 119    *          a list of sources to add matches to
 120    */
 121   static void findXrefSourcesForSequence(SequenceI seq, boolean dna,
 122           AlignmentI dataset, List<String> sources)
 123   {
 124     /*
 125      * first find seq's xrefs (dna-to-peptide or peptide-to-dna)
 126      */
 127     DBRefEntry[] rfs = DBRefUtils.selectDbRefs(!dna, seq.getDBRefs());
 128     addXrefsToSources(rfs, sources);
 129     if (dataset != null)
 130     {
 131       /*
 132        * find sequence's direct (dna-to-dna, peptide-to-peptide) xrefs
 133        */
 134       DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(dna, seq.getDBRefs());
 135       List<SequenceI> rseqs = new ArrayList<SequenceI>();
 136
 137       /*
 138        * find sequences in the alignment which xref one of these DBRefs
 139        * i.e. is xref-ed to a common sequence identifier
 140        */
 141       CrossRef.searchDatasetXrefs(seq, !dna, lrfs, dataset, rseqs, null);
 142
 143       /*
 144        * add those sequences' (dna-to-peptide or peptide-to-dna) dbref sources
 145        */
 146       for (SequenceI rs : rseqs)
 147       {
 148         DBRefEntry[] xrs = DBRefUtils.selectDbRefs(!dna, rs.getDBRefs());
 149         addXrefsToSources(xrs, sources);
 150       }
 151     }
 152   }
 153
 154   /**
 155    * Helper method that adds the source identifiers of some cross-references to
 156    * a (non-redundant) list of database sources
 157    *
 158    * @param xrefs
 159    * @param sources
 160    */
 161   static void addXrefsToSources(DBRefEntry[] xrefs, List<String> sources)
 162   {
 163     if (xrefs != null)
 164     {
 165       for (DBRefEntry ref : xrefs)
 166       {
 167         String source = ref.getSource();
 168         if (!sources.contains(source))
 169         {
 170           sources.add(source);
 171         }
 172       }
 173     }
 174   }
 175
 176   /**
 177    *
 178    * @param seqs
 179    *          sequences whose xrefs are being retrieved
 180    * @param dna
 181    *          true if sequences are nucleotide
 182    * @param source
 183    * @param al
 184    *          alignment to search for cross-referenced sequences (and possibly
 185    *          add to)
 186    * @return products (as dataset sequences)
 187    */
 188   public static Alignment findXrefSequences(SequenceI[] seqs,
 189           final boolean dna, final String source, AlignmentI al)
 190   {
 191     AlignmentI dataset = al.getDataset() == null ? al : al.getDataset();
 192     List<SequenceI> rseqs = new ArrayList<SequenceI>();
 193     AlignedCodonFrame cf = new AlignedCodonFrame();
 194     for (SequenceI seq : seqs)
 195     {
 196       SequenceI dss = seq;
 197       while (dss.getDatasetSequence() != null)
 198       {
 199         dss = dss.getDatasetSequence();
 200       }
 201       boolean found = false;
 202       DBRefEntry[] xrfs = DBRefUtils.selectDbRefs(!dna, dss.getDBRefs());
 203       if ((xrfs == null || xrfs.length == 0) && dataset != null)
 204       {
 205         /*
 206          * found no suitable dbrefs on sequence - look for sequences in the
 207          * alignment which share a dbref with this one
 208          */
 209         DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(dna, seq.getDBRefs());
 210
 211         /*
 212          * find sequences (except this one!), of complementary type,
 213          *  which have a dbref to an accession id for this sequence,
 214          *  and add them to the results
 215          */
 216         found = CrossRef.searchDatasetXrefs(dss, !dna, lrfs, dataset,
 217                 rseqs, cf);
 218       }
 219       for (int r = 0; xrfs != null && r < xrfs.length; r++)
 220       {
 221         DBRefEntry xref = xrfs[r];
 222         if (source != null && !source.equals(xref.getSource()))
 223         {
 224           continue;
 225         }
 226         if (xref.hasMap())
 227         {
 228           if (xref.getMap().getTo() != null)
 229           {
 230             found = true;
 231             SequenceI rsq = new Sequence(xref.getMap().getTo());
 232             rseqs.add(rsq);
 233             if (xref.getMap().getMap().getFromRatio() != xref
 234                     .getMap().getMap().getToRatio())
 235             {
 236               // get sense of map correct for adding to product alignment.
 237               if (dna)
 238               {
 239                 // map is from dna seq to a protein product
 240                 cf.addMap(dss, rsq, xref.getMap().getMap());
 241               }
 242               else
 243               {
 244                 // map should be from protein seq to its coding dna
 245                 cf.addMap(rsq, dss, xref.getMap().getMap().getInverse());
 246               }
 247             }
 248           }
 249         }
 250         if (!found)
 251         {
 252           // do a bit more work - search for sequences with references matching
 253           // xrefs on this sequence.
 254           if (dataset != null)
 255           {
 256             found = searchDataset(dss, xref, dataset, rseqs, cf, false,/*true?*/
 257                     !dna);
 258             if (found)
 259             {
 260               xrfs[r] = null; // we've recovered seqs for this one.
 261             }
 262           }
 263         }
 264       }
 265       if (!found)
 266       {
 267         if (xrfs != null && xrfs.length > 0)
 268         {
 269           ASequenceFetcher sftch = SequenceFetcherFactory
 270                   .getSequenceFetcher();
 271           SequenceI[] retrieved = null;
 272           int l = xrfs.length;
 273           for (int r = 0; r < xrfs.length; r++)
 274           {
 275             // filter out any irrelevant or irretrievable references
 276             if (xrfs[r] == null
 277                     || ((source != null && !source.equals(xrfs[r]
 278                             .getSource())) || !sftch.isFetchable(xrfs[r]
 279                             .getSource())))
 280             {
 281               l--;
 282               xrfs[r] = null;
 283             }
 284           }
 285           if (l > 0)
 286           {
 287             // System.out
 288             // .println("Attempting to retrieve cross referenced sequences.");
 289             DBRefEntry[] t = new DBRefEntry[l];
 290             l = 0;
 291             for (int r = 0; r < xrfs.length; r++)
 292             {
 293               if (xrfs[r] != null)
 294               {
 295                 t[l++] = xrfs[r];
 296               }
 297             }
 298             xrfs = t;
 299             try
 300             {
 301               retrieved = sftch.getSequences(Arrays.asList(xrfs), !dna);
 302               // problem here is we don't know which of xrfs resulted in which
 303               // retrieved element
 304             } catch (Exception e)
 305             {
 306               System.err
 307                       .println("Problem whilst retrieving cross references for Sequence : "
 308                               + seq.getName());
 309               e.printStackTrace();
 310             }
 311
 312             if (retrieved != null)
 313             {
 314               updateDbrefMappings(dna, seq, xrfs, retrieved, cf);
 315
 316               SequenceIdMatcher matcher = new SequenceIdMatcher(
 317                       dataset.getSequences());
 318               List<SequenceFeature> copiedFeatures = new ArrayList<SequenceFeature>();
 319               CrossRef me = new CrossRef();
 320               for (int rs = 0; rs < retrieved.length; rs++)
 321               {
 322                 // TODO: examine each sequence for 'redundancy'
 323                 DBRefEntry[] dbr = retrieved[rs].getDBRefs();
 324                 if (dbr != null && dbr.length > 0)
 325                 {
 326                   for (int di = 0; di < dbr.length; di++)
 327                   {
 328                     // find any entry where we should put in the sequence being
 329                     // cross-referenced into the map
 330                     Mapping map = dbr[di].getMap();
 331                     if (map != null)
 332                     {
 333                       if (map.getTo() != null && map.getMap() != null)
 334                       {
 335                         SequenceI matched = matcher
 336                                 .findIdMatch(map.getTo());
 337                         if (matched != null)
 338                         {
 339                           /*
 340                            * already got an xref to this sequence; update this
 341                            * map to point to the same sequence, and add
 342                            * any new dbrefs to it
 343                            */
 344                           for (DBRefEntry ref : map.getTo().getDBRefs())
 345                           {
 346                             matched.addDBRef(ref); // add or update mapping
 347                           }
 348                           map.setTo(matched);
 349                         }
 350                         else
 351                         {
 352                           matcher.add(map.getTo());
 353                         }
 354                         try
 355                         {
 356                           // compare ms with dss and replace with dss in mapping
 357                           // if map is congruent
 358                           SequenceI ms = map.getTo();
 359                           int sf = map.getMap().getToLowest();
 360                           int st = map.getMap().getToHighest();
 361                           SequenceI mappedrg = ms.getSubSequence(sf, st);
 362                           // SequenceI loc = dss.getSubSequence(sf, st);
 363                           if (mappedrg.getLength() > 0
 364                                   && ms.getSequenceAsString().equals(
 365                                           dss.getSequenceAsString()))
 366                           // && mappedrg.getSequenceAsString().equals(
 367                           // loc.getSequenceAsString()))
 368                           {
 369                             String msg = "Mapping updated from "
 370                                     + ms.getName()
 371                                     + " to retrieved crossreference "
 372                                     + dss.getName();
 373                             System.out.println(msg);
 374                             // method to update all refs of existing To on
 375                             // retrieved sequence with dss and merge any props
 376                             // on To onto dss.
 377                             map.setTo(dss);
 378                             /*
 379                              * copy sequence features as well, avoiding
 380                              * duplication (e.g. same variation from 2
 381                              * transcripts)
 382                              */
 383                             SequenceFeature[] sfs = ms
 384                                     .getSequenceFeatures();
 385                             if (sfs != null)
 386                             {
 387                               for (SequenceFeature feat : sfs)
 388                               {
 389                                 /*
 390                                  * we override SequenceFeature.equals here (but
 391                                  * not elsewhere) to ignore Parent attribute
 392                                  * TODO not quite working yet!
 393                                  */
 394                                 if (!copiedFeatures
 395                                         .contains(me.new MySequenceFeature(
 396                                                 feat)))
 397                                 {
 398                                   dss.addSequenceFeature(feat);
 399                                   copiedFeatures.add(feat);
 400                                 }
 401                               }
 402                             }
 403                             cf.addMap(retrieved[rs].getDatasetSequence(),
 404                                     dss, map.getMap());
 405                           }
 406                           // TODO remove this 'else' and the cf.addMap above?
 407                           else
 408                           {
 409                             cf.addMap(retrieved[rs].getDatasetSequence(),
 410                                     map.getTo(), map.getMap());
 411                           }
 412                         } catch (Exception e)
 413                         {
 414                           System.err
 415                                   .println("Exception when consolidating Mapped sequence set...");
 416                           e.printStackTrace(System.err);
 417                         }
 418                       }
 419                     }
 420                   }
 421                 }
 422                 retrieved[rs].updatePDBIds();
 423                 rseqs.add(retrieved[rs]);
 424               }
 425             }
 426           }
 427         }
 428       }
 429     }
 430
 431     Alignment ral = null;
 432     if (rseqs.size() > 0)
 433     {
 434       ral = new Alignment(rseqs.toArray(new SequenceI[rseqs.size()]));
 435       if (cf != null && !cf.isEmpty())
 436       {
 437         ral.addCodonFrame(cf);
 438       }
 439     }
 440     return ral;
 441   }
 442
 443   /**
 444    * Updates any empty mappings in the cross-references with one to a compatible
 445    * retrieved sequence if found, and adds any new mappings to the
 446    * AlignedCodonFrame
 447    *
 448    * @param dna
 449    * @param mapFrom
 450    * @param xrefs
 451    * @param retrieved
 452    * @param acf
 453    */
 454   static void updateDbrefMappings(boolean dna, SequenceI mapFrom,
 455           DBRefEntry[] xrefs, SequenceI[] retrieved, AlignedCodonFrame acf)
 456   {
 457     SequenceIdMatcher matcher = new SequenceIdMatcher(retrieved);
 458     for (DBRefEntry xref : xrefs)
 459     {
 460       if (!xref.hasMap())
 461       {
 462         String targetSeqName = xref.getSource() + "|"
 463                 + xref.getAccessionId();
 464         SequenceI[] matches = matcher.findAllIdMatches(targetSeqName);
 465         if (matches == null)
 466         {
 467           return;
 468         }
 469         for (SequenceI seq : matches)
 470         {
 471           MapList mapping = null;
 472           if (dna)
 473           {
 474             mapping = AlignmentUtils.mapCdnaToProtein(seq, mapFrom);
 475           }
 476           else
 477           {
 478             mapping = AlignmentUtils.mapCdnaToProtein(mapFrom, seq);
 479             if (mapping != null)
 480             {
 481               mapping = mapping.getInverse();
 482             }
 483           }
 484           if (mapping != null)
 485           {
 486             xref.setMap(new Mapping(seq, mapping));
 487             if (dna)
 488             {
 489               AlignmentUtils.computeProteinFeatures(mapFrom, seq, mapping);
 490             }
 491             if (dna)
 492             {
 493               acf.addMap(mapFrom, seq, mapping);
 494             }
 495             else
 496             {
 497               acf.addMap(seq, mapFrom, mapping.getInverse());
 498             }
 499             continue;
 500           }
 501         }
 502       }
 503     }
 504   }
 505
 506   /**
 507    * find references to lrfs in the cross-reference set of each sequence in
 508    * dataset (that is not equal to sequenceI) Identifies matching DBRefEntry
 509    * based on source and accession string only - Map and Version are nulled.
 510    *
 511    * @param sequenceI
 512    * @param lrfs
 513    * @param dataset
 514    * @param rseqs
 515    * @return true if matches were found.
 516    */
 517   private static boolean searchDatasetXrefs(SequenceI sequenceI,
 518           boolean dna, DBRefEntry[] lrfs, AlignmentI dataset,
 519           List<SequenceI> rseqs, AlignedCodonFrame cf)
 520   {
 521     boolean found = false;
 522     if (lrfs == null)
 523     {
 524       return false;
 525     }
 526     for (int i = 0; i < lrfs.length; i++)
 527     {
 528       DBRefEntry xref = new DBRefEntry(lrfs[i]);
 529       // add in wildcards
 530       xref.setVersion(null);
 531       xref.setMap(null);
 532       found |= searchDataset(sequenceI, xref, dataset, rseqs, cf, false,
 533               dna);
 534     }
 535     return found;
 536   }
 537
 538   /**
 539    * Searches dataset for DBRefEntrys matching the given one (xrf) and adds the
 540    * associated sequence to rseqs
 541    *
 542    * @param sequenceI
 543    *          a sequence to ignore (start point of search)
 544    * @param xrf
 545    *          a cross-reference to try to match
 546    * @param dataset
 547    *          sequences to search in
 548    * @param rseqs
 549    *          result list to add to
 550    * @param cf
 551    *          a set of sequence mappings to add to
 552    * @param direct
 553    *          - search all references or only subset
 554    * @param dna
 555    *          search dna or protein xrefs (if direct=false)
 556    * @return true if relationship found and sequence added.
 557    */
 558   public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
 559           AlignmentI dataset, List<SequenceI> rseqs, AlignedCodonFrame cf,
 560           boolean direct, boolean dna)
 561   {
 562     boolean found = false;
 563     if (dataset == null)
 564     {
 565       return false;
 566     }
 567     if (dataset.getSequences() == null)
 568     {
 569       System.err.println("Empty dataset sequence set - NO VECTOR");
 570       return false;
 571     }
 572     List<SequenceI> ds;
 573     synchronized (ds = dataset.getSequences())
 574     {
 575       for (SequenceI nxt : ds)
 576       {
 577         if (nxt != null)
 578         {
 579           if (nxt.getDatasetSequence() != null)
 580           {
 581             System.err
 582                     .println("Implementation warning: getProducts passed a dataset alignment without dataset sequences in it!");
 583           }
 584           if (nxt == sequenceI || nxt == sequenceI.getDatasetSequence())
 585           {
 586             continue;
 587           }
 588           // check if this is the correct sequence type
 589           {
 590             // TODO 'direct' is always set to false - remove?
 591             // or should it be 'true' from findXrefSequences?
 592             // also its Javadoc conflicts with its use:
 593             // test below implies 'direct' means find complementary sequences,
 594             // !direct means select same molecule type
 595             boolean isDna = Comparison
 596                     .isNucleotide(new SequenceI[] { nxt });
 597             if ((direct && isDna == dna) || (!direct && isDna != dna))
 598             {
 599               // skip this sequence because it is wrong molecule type
 600               continue;
 601             }
 602           }
 603
 604           // look for direct or indirect references in common
 605           DBRefEntry[] poss = nxt.getDBRefs();
 606           List<DBRefEntry> cands = null;
 607           /*
 608            * TODO does this make any sense?
 609            * if 'direct', search the dbrefs for xrf
 610            * else, filter the dbrefs by type and then search for xrf
 611            * - the result is the same isn't it?
 612            */
 613           if (direct)
 614           {
 615             cands = DBRefUtils.searchRefs(poss, xrf);
 616           }
 617           else
 618           {
 619             poss = DBRefUtils.selectDbRefs(!dna, poss);
 620             cands = DBRefUtils.searchRefs(poss, xrf);
 621           }
 622           if (!cands.isEmpty())
 623           {
 624             if (!rseqs.contains(nxt))
 625             {
 626               found = true;
 627               rseqs.add(nxt);
 628               if (cf != null)
 629               {
 630                 // don't search if we aren't given a codon map object
 631                 for (DBRefEntry candidate : cands)
 632                 {
 633                   Mapping mapping = candidate.getMap();
 634                   if (mapping != null)
 635                   {
 636                     MapList map = mapping.getMap();
 637                     if (mapping.getTo() != null
 638                             && map.getFromRatio() != map.getToRatio())
 639                     {
 640                       // get sense of map correct for adding to product
 641                       // alignment.
 642                       if (dna)
 643                       {
 644                         // map is from dna seq to a protein product
 645                         cf.addMap(sequenceI, nxt, map);
 646                       }
 647                       else
 648                       {
 649                         // map should be from protein seq to its coding dna
 650                         cf.addMap(nxt, sequenceI, map.getInverse());
 651                       }
 652                     }
 653                   }
 654                 }
 655               }
 656               // TODO: add mapping between sequences if necessary
 657             }
 658           }
 659         }
 660       }
 661     }
 662     return found;
 663   }
 664 }