src/jalview/analysis/CrossRefs.java

   1 package jalview.analysis;
   2
   3 import jalview.datamodel.AlignedCodonFrame;
   4 import jalview.datamodel.Alignment;
   5 import jalview.datamodel.AlignmentI;
   6 import jalview.datamodel.DBRefEntry;
   7 import jalview.datamodel.Mapping;
   8 import jalview.datamodel.Sequence;
   9 import jalview.datamodel.SequenceFeature;
  10 import jalview.datamodel.SequenceI;
  11 import jalview.util.Comparison;
  12 import jalview.util.DBRefUtils;
  13 import jalview.util.MapList;
  14 import jalview.ws.SequenceFetcherFactory;
  15 import jalview.ws.seqfetcher.ASequenceFetcher;
  16
  17 import java.util.ArrayList;
  18 import java.util.Iterator;
  19 import java.util.List;
  20
  21 public class CrossRefs
  22 {
  23   /**
  24    * Finds cross-references for sequences from a specified source database.
  25    * These may be found in four ways:
  26    * <ul>
  27    * <li>as a DBRefEntry on the known sequence, which has a mapped-to sequence</li>
  28    * <li>a sequence of complementary type in the alignment dataset, which has a
  29    * DBRefEntry to one of the known sequence's 'direct' DBRefs</li>
  30    * <li>a sequence of complementary type in the alignment, which has a
  31    * DBRefEntry to one of the known sequence's 'cross-ref' DBRefs</li>
  32    * <li>by fetching the accession from the remote database</li>
  33    * </ul>
  34    *
  35    * @param seqs
  36    *          the sequences whose cross-references we are searching for
  37    * @param dna
  38    *          true if the sequences are from a nucleotide alignment, else false
  39    * @param source
  40    *          the database source we want cross-references to
  41    * @param dataset
  42    *          the alignment dataset the sequences belong to
  43    * @return an alignment containing cross-reference sequences, or null if none
  44    *         found
  45    */
  46   public static AlignmentI findXrefSequences(SequenceI[] seqs, boolean dna,
  47           String source, AlignmentI dataset)
  48   {
  49     List<SequenceI> foundSeqs = new ArrayList<SequenceI>();
  50     AlignedCodonFrame mappings = new AlignedCodonFrame();
  51
  52     List<DBRefEntry> sourceRefs = new ArrayList<DBRefEntry>();
  53
  54     for (SequenceI seq : seqs)
  55     {
  56       if (dna != Comparison.isNucleotide(seq))
  57       {
  58         /*
  59          * mixed alignment, and this sequence is of the wrong type
  60          */
  61         continue;
  62       }
  63
  64       /*
  65        * get this sequence's dbrefs to source database (if any)
  66        */
  67       List<DBRefEntry> seqSourceRefs = DBRefUtils.searchRefsForSource(
  68               seq.getDBRefs(), source);
  69
  70       /*
  71        * first extract any mapped sequences from sourceRefs
  72        */
  73       findMappedDbrefs(seq, seqSourceRefs, foundSeqs, mappings);
  74
  75       /*
  76        * for remaining sourceRefs, try to match a
  77        * complementary sequence in the dataset
  78        */
  79       findIndirectCrossReferences(seq, source, seqSourceRefs, dataset,
  80               foundSeqs, mappings);
  81     }
  82
  83     /*
  84      * fetch any remaining sourceRefs from the source database
  85      */
  86     fetchCrossReferences(sourceRefs, foundSeqs, mappings, dna, dataset);
  87
  88     if (foundSeqs.isEmpty())
  89     {
  90       return null;
  91     }
  92     AlignmentI crossRefs = new Alignment(
  93             foundSeqs.toArray(new SequenceI[foundSeqs.size()]));
  94     crossRefs.addCodonFrame(mappings);
  95     return crossRefs;
  96   }
  97
  98   /**
  99    * Looks for DBRefEntrys to 'source' which have a mapping to a sequence. If
 100    * found, adds the sequence to foundSeqs and removes the dbref from the list.
 101    *
 102    * @param seq
 103    *          the dataset sequence we are searching from
 104    * @param sourceRefs
 105    *          the sequence's dbrefs to 'source'
 106    * @param foundSeqs
 107    *          a list of cross-references to add to
 108    * @param mappings
 109    *          a set of sequence mappings to add to
 110    * @return
 111    */
 112   static void findMappedDbrefs(SequenceI seq, List<DBRefEntry> sourceRefs,
 113           List<SequenceI> foundSeqs, AlignedCodonFrame mappings)
 114   {
 115     Iterator<DBRefEntry> refs = sourceRefs.iterator();
 116     while (refs.hasNext())
 117     {
 118       DBRefEntry dbref = refs.next();
 119       Mapping map = dbref.getMap();
 120       if (map != null)
 121       {
 122         SequenceI mappedTo = map.getTo();
 123         if (mappedTo != null)
 124         {
 125           foundSeqs.add(new Sequence(mappedTo));
 126           refs.remove();
 127
 128           /*
 129            * check mapping is not 'direct' (it shouldn't be if we reach here)
 130            * and add mapping (dna-to-peptide or vice versa) to the set
 131            */
 132           MapList mapList = map.getMap();
 133           int fromRatio = mapList.getFromRatio();
 134           int toRatio = mapList.getToRatio();
 135           if (fromRatio != toRatio)
 136           {
 137             if (fromRatio == 3)
 138             {
 139               mappings.addMap(seq, mappedTo, mapList);
 140             }
 141             else
 142             {
 143               mappings.addMap(mappedTo, seq, mapList.getInverse());
 144             }
 145           }
 146         }
 147       }
 148     }
 149   }
 150
 151   /**
 152    * Tries to fetch seq's database references to 'source' database, and add them
 153    * to the foundSeqs list. If found, tries to make a mapping between seq and
 154    * the retrieved sequence and insert it into the database reference.
 155    *
 156    * @param seq
 157    * @param sourceRefs
 158    * @param foundSeqs
 159    * @param mappings
 160    * @param dna
 161    */
 162   static void fetchCrossReferences(SequenceI seq,
 163           List<DBRefEntry> sourceRefs, List<SequenceI> foundSeqs,
 164           AlignedCodonFrame mappings, boolean dna, AlignmentI dataset)
 165   {
 166     ASequenceFetcher sftch = SequenceFetcherFactory.getSequenceFetcher();
 167     SequenceI[] retrieved;
 168     try
 169     {
 170       retrieved = sftch.getSequences(sourceRefs, !dna);
 171     } catch (Exception e)
 172     {
 173       System.err
 174               .println("Problem whilst retrieving cross references for Sequence : "
 175                       + seq.getName());
 176       e.printStackTrace();
 177       return;
 178     }
 179
 180     if (retrieved != null)
 181     {
 182       updateDbrefMappings(dna, seq, sourceRefs, retrieved, mappings);
 183
 184       SequenceIdMatcher matcher = new SequenceIdMatcher(
 185               dataset.getSequences());
 186       List<SequenceFeature> copiedFeatures = new ArrayList<SequenceFeature>();
 187       CrossRef me = new CrossRef();
 188       for (int rs = 0; rs < retrieved.length; rs++)
 189       {
 190         // TODO: examine each sequence for 'redundancy'
 191         DBRefEntry[] dbr = retrieved[rs].getDBRefs();
 192         if (dbr != null && dbr.length > 0)
 193         {
 194           for (int di = 0; di < dbr.length; di++)
 195           {
 196             // find any entry where we should put in the sequence being
 197             // cross-referenced into the map
 198             Mapping map = dbr[di].getMap();
 199             if (map != null)
 200             {
 201               if (map.getTo() != null && map.getMap() != null)
 202               {
 203                 SequenceI matched = matcher.findIdMatch(map.getTo());
 204                 if (matched != null)
 205                 {
 206                   /*
 207                    * already got an xref to this sequence; update this
 208                    * map to point to the same sequence, and add
 209                    * any new dbrefs to it
 210                    */
 211                   for (DBRefEntry ref : map.getTo().getDBRefs())
 212                   {
 213                     matched.addDBRef(ref); // add or update mapping
 214                   }
 215                   map.setTo(matched);
 216                 }
 217                 else
 218                 {
 219                   matcher.add(map.getTo());
 220                 }
 221                 try
 222                 {
 223                   // compare ms with dss and replace with dss in mapping
 224                   // if map is congruent
 225                   SequenceI ms = map.getTo();
 226                   int sf = map.getMap().getToLowest();
 227                   int st = map.getMap().getToHighest();
 228                   SequenceI mappedrg = ms.getSubSequence(sf, st);
 229                   // SequenceI loc = dss.getSubSequence(sf, st);
 230                   if (mappedrg.getLength() > 0
 231                           && ms.getSequenceAsString().equals(
 232                                   seq.getSequenceAsString()))
 233                   // && mappedrg.getSequenceAsString().equals(
 234                   // loc.getSequenceAsString()))
 235                   {
 236                     String msg = "Mapping updated from " + ms.getName()
 237                             + " to retrieved crossreference "
 238                             + seq.getName();
 239                     System.out.println(msg);
 240                     // method to update all refs of existing To on
 241                     // retrieved sequence with dss and merge any props
 242                     // on To onto dss.
 243                     map.setTo(seq);
 244                     /*
 245                      * copy sequence features as well, avoiding
 246                      * duplication (e.g. same variation from 2
 247                      * transcripts)
 248                      */
 249                     SequenceFeature[] sfs = ms.getSequenceFeatures();
 250                     if (sfs != null)
 251                     {
 252                       for (SequenceFeature feat : sfs)
 253                       {
 254                         /*
 255                          * we override SequenceFeature.equals here (but
 256                          * not elsewhere) to ignore Parent attribute
 257                          * TODO not quite working yet!
 258                          */
 259                         if (!copiedFeatures
 260                                 .contains(me.new MySequenceFeature(feat)))
 261                         {
 262                           seq.addSequenceFeature(feat);
 263                           copiedFeatures.add(feat);
 264                         }
 265                       }
 266                     }
 267                   }
 268                   mappings.addMap(retrieved[rs].getDatasetSequence(),
 269                           map.getTo(), map.getMap());
 270                 } catch (Exception e)
 271                 {
 272                   System.err
 273                           .println("Exception when consolidating Mapped sequence set...");
 274                   e.printStackTrace(System.err);
 275                 }
 276               }
 277             }
 278           }
 279         }
 280         retrieved[rs].updatePDBIds();
 281         foundSeqs.add(retrieved[rs]);
 282       }
 283     }
 284   }
 285
 286   /**
 287    * Searches the alignment for a sequence of complementary type to 'seq' which
 288    * shares a DBRefEntry with it. If found, adds the sequence to foundSeqs and
 289    * removes the resolved sourceRef from the search list.
 290    *
 291    * @param seq
 292    * @param source
 293    * @param sourceRefs
 294    * @param dataset
 295    * @param foundSeqs
 296    * @param mappings
 297    * @return
 298    */
 299   static void findIndirectCrossReferences(SequenceI seq, String source,
 300           List<DBRefEntry> sourceRefs, AlignmentI dataset,
 301           List<SequenceI> foundSeqs, AlignedCodonFrame mappings)
 302   {
 303     Iterator<DBRefEntry> refs = sourceRefs.iterator();
 304     while (refs.hasNext())
 305     {
 306       DBRefEntry dbref = refs.next();
 307       boolean found = searchDatasetForCrossReference(seq, dbref, dataset,
 308               foundSeqs, mappings);
 309       if (found)
 310       {
 311         refs.remove();
 312       }
 313     }
 314   }
 315
 316   /**
 317    * Searches the dataset for a sequence of opposite type to 'excluding', which
 318    * has a cross-reference matching dbref. If found, adds the sequence to
 319    * foundSeqs and removes dbref from the search list.
 320    *
 321    * @param excluding
 322    *          a sequence to ignore (start point of search)
 323    * @param dbref
 324    *          a cross-reference to try to match
 325    * @param dataset
 326    *          sequences to search in
 327    * @param foundSeqs
 328    *          result list to add to
 329    * @param mappings
 330    *          a set of sequence mappings to add to
 331    * @return true if relationship found and sequence added
 332    */
 333   static boolean searchDatasetForCrossReference(SequenceI excluding,
 334           DBRefEntry dbref, AlignmentI dataset, List<SequenceI> foundSeqs,
 335           AlignedCodonFrame mappings)
 336   {
 337     boolean fromNucleotide = Comparison.isNucleotide(excluding);
 338     boolean found = false;
 339     if (dataset == null)
 340     {
 341       return false;
 342     }
 343     if (dataset.getSequences() == null)
 344     {
 345       return false;
 346     }
 347     List<SequenceI> ds;
 348     synchronized (ds = dataset.getSequences())
 349     {
 350       for (SequenceI nxt : ds)
 351       {
 352         if (nxt != null)
 353         {
 354           if (nxt.getDatasetSequence() != null)
 355           {
 356             System.err
 357                     .println("Implementation warning: getProducts passed a dataset alignment without dataset sequences in it!");
 358           }
 359           if (nxt == excluding || nxt == excluding.getDatasetSequence())
 360           {
 361             continue;
 362           }
 363           if (foundSeqs.contains(nxt))
 364           {
 365             /*
 366              * already added this sequence to cross-refs
 367              */
 368             continue;
 369           }
 370           boolean isDna = Comparison.isNucleotide(nxt);
 371           if (isDna == fromNucleotide)
 372           {
 373             /*
 374              * skip this sequence - wrong molecule type
 375              */
 376             continue;
 377           }
 378
 379           /*
 380            * check if this sequence has any dbref matching source and accession
 381            * (version and mapping may differ)
 382            */
 383           List<DBRefEntry> candidates = DBRefUtils.searchRefs(
 384                   nxt.getDBRefs(), dbref);
 385
 386           if (candidates.isEmpty())
 387           {
 388             continue;
 389           }
 390           found = true;
 391           foundSeqs.add(nxt);
 392           if (mappings != null)
 393           {
 394             // don't search if we aren't given a codon map object
 395             for (DBRefEntry candidate : candidates)
 396             {
 397               if (candidate.hasMap())
 398               {
 399                 Mapping mapping = candidate.getMap();
 400                 MapList map = mapping.getMap();
 401                 if (mapping.getTo() != null
 402                         && map.getFromRatio() != map.getToRatio())
 403                 {
 404                   if (fromNucleotide)
 405                   {
 406                     // map is from dna seq to a protein product
 407                     mappings.addMap(excluding, nxt, map);
 408                   }
 409                   else
 410                   {
 411                     // map is from protein seq to its coding dna
 412                     mappings.addMap(nxt, excluding, map.getInverse());
 413                   }
 414                 }
 415               }
 416             }
 417           }
 418         }
 419       }
 420     }
 421     return found;
 422   }
 423
 424   /**
 425    * Updates any empty mappings in the cross-references with one to a compatible
 426    * retrieved sequence if found, and adds any new mappings to the
 427    * AlignedCodonFrame
 428    *
 429    * @param dna
 430    * @param mapFrom
 431    * @param xrefs
 432    * @param retrieved
 433    * @param mappings
 434    */
 435   static void updateDbrefMappings(boolean dna, SequenceI mapFrom,
 436           List<DBRefEntry> xrefs, SequenceI[] retrieved,
 437           AlignedCodonFrame mappings)
 438   {
 439     SequenceIdMatcher matcher = new SequenceIdMatcher(retrieved);
 440     for (DBRefEntry xref : xrefs)
 441     {
 442       if (!xref.hasMap())
 443       {
 444         String targetSeqName = xref.getSource() + "|"
 445                 + xref.getAccessionId();
 446         SequenceI[] matches = matcher.findAllIdMatches(targetSeqName);
 447         if (matches == null)
 448         {
 449           return;
 450         }
 451         for (SequenceI seq : matches)
 452         {
 453           MapList mapping = null;
 454           if (dna)
 455           {
 456             mapping = AlignmentUtils.mapCdnaToProtein(seq, mapFrom);
 457           }
 458           else
 459           {
 460             mapping = AlignmentUtils.mapCdnaToProtein(mapFrom, seq);
 461             if (mapping != null)
 462             {
 463               mapping = mapping.getInverse();
 464             }
 465           }
 466           if (mapping != null)
 467           {
 468             xref.setMap(new Mapping(seq, mapping));
 469             if (dna)
 470             {
 471               AlignmentUtils.computeProteinFeatures(mapFrom, seq, mapping);
 472             }
 473             if (dna)
 474             {
 475               mappings.addMap(mapFrom, seq, mapping);
 476             }
 477             else
 478             {
 479               mappings.addMap(seq, mapFrom, mapping.getInverse());
 480             }
 481             continue;
 482           }
 483         }
 484       }
 485     }
 486   }
 487 }