src/jalview/analysis/CrossRef.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis;
  22
  23 import jalview.datamodel.AlignedCodonFrame;
  24 import jalview.datamodel.Alignment;
  25 import jalview.datamodel.AlignmentI;
  26 import jalview.datamodel.DBRefEntry;
  27 import jalview.datamodel.DBRefSource;
  28 import jalview.datamodel.Mapping;
  29 import jalview.datamodel.Sequence;
  30 import jalview.datamodel.SequenceFeature;
  31 import jalview.datamodel.SequenceI;
  32 import jalview.util.DBRefUtils;
  33 import jalview.ws.SequenceFetcher;
  34 import jalview.ws.seqfetcher.ASequenceFetcher;
  35
  36 import java.util.ArrayList;
  37 import java.util.List;
  38 import java.util.Vector;
  39
  40 /**
  41  * Functions for cross-referencing sequence databases. user must first specify
  42  * if cross-referencing from protein or dna (set dna==true)
  43  *
  44  * @author JimP
  45  *
  46  */
  47 public class CrossRef
  48 {
  49   /*
  50    * A sub-class that ignores Parent attribute when comparing sequence
  51    * features. This avoids 'duplicate' CDS features that only
  52    * differ in their parent Transcript ids.
  53    */
  54   class MySequenceFeature extends SequenceFeature
  55   {
  56     private SequenceFeature feat;
  57
  58     MySequenceFeature(SequenceFeature sf)
  59     {
  60       this.feat = sf;
  61     }
  62
  63     @Override
  64     public boolean equals(Object o)
  65     {
  66       return feat.equals(o, true);
  67     }
  68   }
  69
  70   /**
  71    * Select just the DNA or protein references for a protein or dna sequence
  72    *
  73    * @param fromDna
  74    *          if true, select references from DNA (i.e. Protein databases), else
  75    *          DNA database references
  76    * @param refs
  77    *          a set of references to select from
  78    * @return
  79    */
  80   public static DBRefEntry[] findXDbRefs(boolean fromDna, DBRefEntry[] refs)
  81   {
  82     return DBRefUtils.selectRefs(refs, fromDna ? DBRefSource.PROTEINDBS
  83             : DBRefSource.DNACODINGDBS);
  84     // could attempt to find other cross
  85     // refs here - ie PDB xrefs
  86     // (not dna, not protein seq)
  87   }
  88
  89   /**
  90    * @param dna
  91    *          true if seqs are DNA seqs
  92    * @param seqs
  93    * @return a list of sequence database cross reference source types
  94    */
  95   public static String[] findSequenceXrefTypes(boolean dna, SequenceI[] seqs)
  96   {
  97     return findSequenceXrefTypes(dna, seqs, null);
  98   }
  99
 100   /**
 101    * Indirect references are references from other sequences from the dataset to
 102    * any of the direct DBRefEntrys on the given sequences.
 103    *
 104    * @param dna
 105    *          true if seqs are DNA seqs
 106    * @param seqs
 107    * @return a list of sequence database cross reference source types
 108    */
 109   public static String[] findSequenceXrefTypes(boolean dna,
 110           SequenceI[] seqs, AlignmentI dataset)
 111   {
 112     String[] dbrefs = null;
 113     List<String> refs = new ArrayList<String>();
 114     for (SequenceI seq : seqs)
 115     {
 116       if (seq != null)
 117       {
 118         SequenceI dss = seq;
 119         while (dss.getDatasetSequence() != null)
 120         {
 121           dss = dss.getDatasetSequence();
 122         }
 123         DBRefEntry[] rfs = findXDbRefs(dna, dss.getDBRefs());
 124         if (rfs != null)
 125         {
 126           for (DBRefEntry ref : rfs)
 127           {
 128             if (!refs.contains(ref.getSource()))
 129             {
 130               refs.add(ref.getSource());
 131             }
 132           }
 133         }
 134         if (dataset != null)
 135         {
 136           // search for references to this sequence's direct references.
 137           DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seq.getDBRefs());
 138           List<SequenceI> rseqs = new ArrayList<SequenceI>();
 139           CrossRef.searchDatasetXrefs(seq, !dna, lrfs, dataset, rseqs,
 140                   null); // don't need to specify codon frame for mapping here
 141           for (SequenceI rs : rseqs)
 142           {
 143             DBRefEntry[] xrs = findXDbRefs(dna, rs.getDBRefs());
 144             if (xrs != null)
 145             {
 146               for (DBRefEntry ref : xrs)
 147               {
 148                 if (!refs.contains(ref.getSource()))
 149                 {
 150                   refs.add(ref.getSource());
 151                 }
 152               }
 153             }
 154             // looks like copy and paste - change rfs to xrs?
 155             // for (int r = 0; rfs != null && r < rfs.length; r++)
 156             // {
 157             // if (!refs.contains(rfs[r].getSource()))
 158             // {
 159             // refs.add(rfs[r].getSource());
 160             // }
 161             // }
 162           }
 163         }
 164       }
 165     }
 166     if (refs.size() > 0)
 167     {
 168       dbrefs = new String[refs.size()];
 169       refs.toArray(dbrefs);
 170     }
 171     return dbrefs;
 172   }
 173
 174   public static boolean hasCdnaMap(SequenceI[] seqs)
 175   {
 176     // TODO unused - remove?
 177     String[] reftypes = findSequenceXrefTypes(false, seqs);
 178     for (int s = 0; s < reftypes.length; s++)
 179     {
 180       if (reftypes.equals(DBRefSource.EMBLCDS))
 181       {
 182         return true;
 183         // no map
 184       }
 185     }
 186     return false;
 187   }
 188
 189   public static SequenceI[] getCdnaMap(SequenceI[] seqs)
 190   {
 191     // TODO unused - remove?
 192     Vector cseqs = new Vector();
 193     for (int s = 0; s < seqs.length; s++)
 194     {
 195       DBRefEntry[] cdna = findXDbRefs(true, seqs[s].getDBRefs());
 196       for (int c = 0; c < cdna.length; c++)
 197       {
 198         if (cdna[c].getSource().equals(DBRefSource.EMBLCDS))
 199         {
 200           System.err
 201                   .println("TODO: unimplemented sequence retrieval for coding region sequence.");
 202           // TODO: retrieve CDS dataset sequences
 203           // need global dataset sequence retriever/resolver to reuse refs
 204           // and construct Mapping entry.
 205           // insert gaps in CDS according to peptide gaps.
 206           // add gapped sequence to cseqs
 207         }
 208       }
 209     }
 210     if (cseqs.size() > 0)
 211     {
 212       SequenceI[] rsqs = new SequenceI[cseqs.size()];
 213       cseqs.copyInto(rsqs);
 214       return rsqs;
 215     }
 216     return null;
 217
 218   }
 219
 220   /**
 221    *
 222    * @param dna
 223    * @param seqs
 224    * @return
 225    */
 226   public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
 227           String source)
 228   {
 229     return findXrefSequences(seqs, dna, source, null);
 230   }
 231
 232   /**
 233    *
 234    * @param seqs
 235    *          sequences whose xrefs are being retrieved
 236    * @param dna
 237    *          true if sequences are nucleotide
 238    * @param source
 239    * @param dataset
 240    *          alignment to search for product sequences.
 241    * @return products (as dataset sequences)
 242    */
 243   public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
 244           String source, AlignmentI dataset)
 245   {
 246     List<SequenceI> rseqs = new ArrayList<SequenceI>();
 247     AlignedCodonFrame cf = new AlignedCodonFrame();
 248     for (SequenceI seq : seqs)
 249     {
 250       SequenceI dss = seq;
 251       while (dss.getDatasetSequence() != null)
 252       {
 253         dss = dss.getDatasetSequence();
 254       }
 255       boolean found = false;
 256       DBRefEntry[] xrfs = CrossRef.findXDbRefs(dna, dss.getDBRefs());
 257       if ((xrfs == null || xrfs.length == 0) && dataset != null)
 258       {
 259         System.out.println("Attempting to find ds Xrefs refs.");
 260         // FIXME should be dss not seq here?
 261         DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seq.getDBRefs());
 262         // less ambiguous would be a 'find primary dbRefEntry' method.
 263         // filter for desired source xref here
 264         found = CrossRef.searchDatasetXrefs(dss, !dna, lrfs, dataset,
 265                 rseqs, cf);
 266       }
 267       for (int r = 0; xrfs != null && r < xrfs.length; r++)
 268       {
 269         DBRefEntry xref = xrfs[r];
 270         if (source != null && !source.equals(xref.getSource()))
 271         {
 272           continue;
 273         }
 274         if (xref.hasMap())
 275         {
 276           if (xref.getMap().getTo() != null)
 277           {
 278             SequenceI rsq = new Sequence(xref.getMap().getTo());
 279             rseqs.add(rsq);
 280             if (xref.getMap().getMap().getFromRatio() != xref
 281                     .getMap().getMap().getToRatio())
 282             {
 283               // get sense of map correct for adding to product alignment.
 284               if (dna)
 285               {
 286                 // map is from dna seq to a protein product
 287                 cf.addMap(dss, rsq, xref.getMap().getMap());
 288               }
 289               else
 290               {
 291                 // map should be from protein seq to its coding dna
 292                 cf.addMap(rsq, dss, xref.getMap().getMap().getInverse());
 293               }
 294             }
 295             found = true;
 296           }
 297         }
 298         if (!found)
 299         {
 300           // do a bit more work - search for sequences with references matching
 301           // xrefs on this sequence.
 302           if (dataset != null)
 303           {
 304             found |= searchDataset(dss, xref, dataset, rseqs, cf); // ,false,!dna);
 305             if (found)
 306             {
 307               xrfs[r] = null; // we've recovered seqs for this one.
 308             }
 309           }
 310         }
 311       }
 312       if (!found)
 313       {
 314         if (xrfs != null && xrfs.length > 0)
 315         {
 316           // Try and get the sequence reference...
 317           /*
 318            * Ideal world - we ask for a sequence fetcher implementation here if
 319            * (jalview.io.RunTimeEnvironment.getSequenceFetcher()) (
 320            */
 321           ASequenceFetcher sftch = new SequenceFetcher();
 322           SequenceI[] retrieved = null;
 323           int l = xrfs.length;
 324           for (int r = 0; r < xrfs.length; r++)
 325           {
 326             // filter out any irrelevant or irretrievable references
 327             if (xrfs[r] == null
 328                     || ((source != null && !source.equals(xrfs[r]
 329                             .getSource())) || !sftch.isFetchable(xrfs[r]
 330                             .getSource())))
 331             {
 332               l--;
 333               xrfs[r] = null;
 334             }
 335           }
 336           if (l > 0)
 337           {
 338             System.out
 339                     .println("Attempting to retrieve cross referenced sequences.");
 340             DBRefEntry[] t = new DBRefEntry[l];
 341             l = 0;
 342             for (int r = 0; r < xrfs.length; r++)
 343             {
 344               if (xrfs[r] != null)
 345               {
 346                 t[l++] = xrfs[r];
 347               }
 348             }
 349             xrfs = t;
 350             try
 351             {
 352               retrieved = sftch.getSequences(xrfs, !dna);
 353               // problem here is we don't know which of xrfs resulted in which
 354               // retrieved element
 355             } catch (Exception e)
 356             {
 357               System.err
 358                       .println("Problem whilst retrieving cross references for Sequence : "
 359                               + seq.getName());
 360               e.printStackTrace();
 361             }
 362
 363             if (retrieved != null)
 364             {
 365               List<SequenceFeature> copiedFeatures = new ArrayList<SequenceFeature>();
 366               CrossRef me = new CrossRef();
 367               for (int rs = 0; rs < retrieved.length; rs++)
 368               {
 369                 // TODO: examine each sequence for 'redundancy'
 370                 DBRefEntry[] dbr = retrieved[rs].getDBRefs();
 371                 if (dbr != null && dbr.length > 0)
 372                 {
 373                   for (int di = 0; di < dbr.length; di++)
 374                   {
 375                     // find any entry where we should put in the sequence being
 376                     // cross-referenced into the map
 377                     Mapping map = dbr[di].getMap();
 378                     if (map != null)
 379                     {
 380                       if (map.getTo() != null && map.getMap() != null)
 381                       {
 382                         // should search the local dataset to find any existing
 383                         // candidates for To !
 384                         try
 385                         {
 386                           // compare ms with dss and replace with dss in mapping
 387                           // if map is congruent
 388                           SequenceI ms = map.getTo();
 389                           int sf = map.getMap().getToLowest();
 390                           int st = map.getMap().getToHighest();
 391                           SequenceI mappedrg = ms.getSubSequence(sf, st);
 392                           SequenceI loc = dss.getSubSequence(sf, st);
 393                           if (mappedrg.getLength() > 0
 394                                   && mappedrg.getSequenceAsString().equals(
 395                                           loc.getSequenceAsString()))
 396                           {
 397                             String msg = "Mapping updated from "
 398                                     + ms.getName()
 399                                     + " to retrieved crossreference "
 400                                     + dss.getName();
 401                             System.out.println(msg);
 402                             // method to update all refs of existing To on
 403                             // retrieved sequence with dss and merge any props
 404                             // on To onto dss.
 405                             map.setTo(dss);
 406                             /*
 407                              * copy sequence features as well, avoiding
 408                              * duplication (e.g. from 2 transcripts)
 409                              */
 410                             SequenceFeature[] sfs = ms
 411                                     .getSequenceFeatures();
 412                             if (sfs != null)
 413                             {
 414                               for (SequenceFeature feat : sfs)
 415                               {
 416                                 /*
 417                                  * we override the equality test here (but not
 418                                  * elsewhere) to ignore Parent attribute
 419                                  * TODO not quite working yet!
 420                                  */
 421                                 if (!copiedFeatures
 422                                         .contains(me.new MySequenceFeature(
 423                                                 feat)))
 424                                 {
 425                                   dss.addSequenceFeature(feat);
 426                                   copiedFeatures.add(feat);
 427                                 }
 428                               }
 429                             }
 430                             cf.addMap(retrieved[rs].getDatasetSequence(),
 431                                     dss, map.getMap());
 432                           }
 433                         } catch (Exception e)
 434                         {
 435                           System.err
 436                                   .println("Exception when consolidating Mapped sequence set...");
 437                           e.printStackTrace(System.err);
 438                         }
 439                       }
 440                     }
 441                   }
 442                 }
 443                 retrieved[rs].updatePDBIds();
 444                 rseqs.add(retrieved[rs]);
 445               }
 446             }
 447           }
 448         }
 449       }
 450     }
 451
 452     Alignment ral = null;
 453     if (rseqs.size() > 0)
 454     {
 455       SequenceI[] rsqs = new SequenceI[rseqs.size()];
 456       rseqs.toArray(rsqs);
 457       ral = new Alignment(rsqs);
 458       if (cf != null && !cf.isEmpty())
 459       {
 460         ral.addCodonFrame(cf);
 461       }
 462     }
 463     return ral;
 464   }
 465
 466   /**
 467    * find references to lrfs in the cross-reference set of each sequence in
 468    * dataset (that is not equal to sequenceI) Identifies matching DBRefEntry
 469    * based on source and accession string only - Map and Version are nulled.
 470    *
 471    * @param sequenceI
 472    * @param lrfs
 473    * @param dataset
 474    * @param rseqs
 475    * @return true if matches were found.
 476    */
 477   private static boolean searchDatasetXrefs(SequenceI sequenceI,
 478           boolean dna, DBRefEntry[] lrfs, AlignmentI dataset,
 479           List<SequenceI> rseqs, AlignedCodonFrame cf)
 480   {
 481     boolean found = false;
 482     if (lrfs == null)
 483     {
 484       return false;
 485     }
 486     for (int i = 0; i < lrfs.length; i++)
 487     {
 488       DBRefEntry xref = new DBRefEntry(lrfs[i]);
 489       // add in wildcards
 490       xref.setVersion(null);
 491       xref.setMap(null);
 492       found = searchDataset(sequenceI, xref, dataset, rseqs, cf, false, dna);
 493     }
 494     return found;
 495   }
 496
 497   /**
 498    * search a given sequence dataset for references matching cross-references to
 499    * the given sequence
 500    *
 501    * @param sequenceI
 502    * @param xrf
 503    * @param dataset
 504    * @param rseqs
 505    *          set of unique sequences
 506    * @param cf
 507    * @return true if one or more unique sequences were found and added
 508    */
 509   public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
 510           AlignmentI dataset, List<SequenceI> rseqs, AlignedCodonFrame cf)
 511   {
 512     return searchDataset(sequenceI, xrf, dataset, rseqs, cf, true, false);
 513   }
 514
 515   /**
 516    * TODO: generalise to different protein classifications Search dataset for
 517    * DBRefEntrys matching the given one (xrf) and add the associated sequence to
 518    * rseq.
 519    *
 520    * @param sequenceI
 521    * @param xrf
 522    * @param dataset
 523    * @param rseqs
 524    * @param direct
 525    *          - search all references or only subset
 526    * @param dna
 527    *          search dna or protein xrefs (if direct=false)
 528    * @return true if relationship found and sequence added.
 529    */
 530   public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
 531           AlignmentI dataset, List<SequenceI> rseqs, AlignedCodonFrame cf,
 532           boolean direct, boolean dna)
 533   {
 534     boolean found = false;
 535     SequenceI[] typer = new SequenceI[1];
 536     if (dataset == null)
 537     {
 538       return false;
 539     }
 540     if (dataset.getSequences() == null)
 541     {
 542       System.err.println("Empty dataset sequence set - NO VECTOR");
 543       return false;
 544     }
 545     List<SequenceI> ds;
 546     synchronized (ds = dataset.getSequences())
 547     {
 548       for (SequenceI nxt : ds)
 549       {
 550         if (nxt != null)
 551         {
 552           if (nxt.getDatasetSequence() != null)
 553           {
 554             System.err
 555                     .println("Implementation warning: getProducts passed a dataset alignment without dataset sequences in it!");
 556           }
 557           if (nxt != sequenceI && nxt != sequenceI.getDatasetSequence())
 558           {
 559             // check if this is the correct sequence type
 560             {
 561               typer[0] = nxt;
 562               boolean isDna = jalview.util.Comparison.isNucleotide(typer);
 563               if ((direct && isDna == dna) || (!direct && isDna != dna))
 564               {
 565                 // skip this sequence because it is same molecule type
 566                 continue;
 567               }
 568             }
 569
 570             // look for direct or indirect references in common
 571             DBRefEntry[] poss = nxt.getDBRefs(), cands = null;
 572             if (direct)
 573             {
 574               cands = jalview.util.DBRefUtils.searchRefs(poss, xrf);
 575             }
 576             else
 577             {
 578               poss = CrossRef.findXDbRefs(dna, poss); //
 579               cands = jalview.util.DBRefUtils.searchRefs(poss, xrf);
 580             }
 581             if (cands != null)
 582             {
 583               if (!rseqs.contains(nxt))
 584               {
 585                 rseqs.add(nxt);
 586                 boolean foundmap = cf != null;
 587                 // don't search if we aren't given a codon map object
 588                 for (int r = 0; foundmap && r < cands.length; r++)
 589                 {
 590                   if (cands[r].hasMap())
 591                   {
 592                     if (cands[r].getMap().getTo() != null
 593                             && cands[r].getMap().getMap().getFromRatio() != cands[r]
 594                                     .getMap().getMap().getToRatio())
 595                     {
 596                       foundmap = true;
 597                       // get sense of map correct for adding to product
 598                       // alignment.
 599                       if (dna)
 600                       {
 601                         // map is from dna seq to a protein product
 602                         cf.addMap(sequenceI, nxt, cands[r].getMap()
 603                                 .getMap());
 604                       }
 605                       else
 606                       {
 607                         // map should be from protein seq to its coding dna
 608                         cf.addMap(nxt, sequenceI, cands[r].getMap()
 609                                 .getMap().getInverse());
 610                       }
 611                     }
 612                   }
 613                 }
 614                 // TODO: add mapping between sequences if necessary
 615                 found = true;
 616               }
 617             }
 618
 619           }
 620         }
 621       }
 622     }
 623     return found;
 624   }
 625
 626   /**
 627    * precalculate different products that can be found for seqs in dataset and
 628    * return them.
 629    *
 630    * @param dna
 631    * @param seqs
 632    * @param dataset
 633    * @param fake
 634    *          - don't actually build lists - just get types
 635    * @return public static Object[] buildXProductsList(boolean dna, SequenceI[]
 636    *         seqs, AlignmentI dataset, boolean fake) { String types[] =
 637    *         jalview.analysis.CrossRef.findSequenceXrefTypes( dna, seqs,
 638    *         dataset); if (types != null) { System.out.println("Xref Types for:
 639    *         "+(dna ? "dna" : "prot")); for (int t = 0; t < types.length; t++) {
 640    *         System.out.println("Type: " + types[t]); SequenceI[] prod =
 641    *         jalview.analysis.CrossRef.findXrefSequences(seqs, dna, types[t]);
 642    *         System.out.println("Found " + ((prod == null) ? "no" : "" +
 643    *         prod.length) + " products"); if (prod!=null) { for (int p=0;
 644    *         p<prod.length; p++) { System.out.println("Prod "+p+":
 645    *         "+prod[p].getDisplayId(true)); } } } } else {
 646    *         System.out.println("Trying getProducts for
 647    *         "+al.getSequenceAt(0).getDisplayId(true));
 648    *         System.out.println("Search DS Xref for: "+(dna ? "dna" : "prot"));
 649    *         // have a bash at finding the products amongst all the retrieved
 650    *         sequences. SequenceI[] prod =
 651    *         jalview.analysis.CrossRef.findXrefSequences(al
 652    *         .getSequencesArray(), dna, null, ds); System.out.println("Found " +
 653    *         ((prod == null) ? "no" : "" + prod.length) + " products"); if
 654    *         (prod!=null) { // select non-equivalent sequences from dataset list
 655    *         for (int p=0; p<prod.length; p++) { System.out.println("Prod "+p+":
 656    *         "+prod[p].getDisplayId(true)); } } } }
 657    */
 658 }