src/jalview/analysis/CrossRefs.java

   1 package jalview.analysis;
   2
   3 import jalview.datamodel.AlignedCodonFrame;
   4 import jalview.datamodel.Alignment;
   5 import jalview.datamodel.AlignmentI;
   6 import jalview.datamodel.DBRefEntry;
   7 import jalview.datamodel.Mapping;
   8 import jalview.datamodel.Sequence;
   9 import jalview.datamodel.SequenceFeature;
  10 import jalview.datamodel.SequenceI;
  11 import jalview.util.Comparison;
  12 import jalview.util.DBRefUtils;
  13 import jalview.util.MapList;
  14 import jalview.ws.SequenceFetcherFactory;
  15 import jalview.ws.seqfetcher.ASequenceFetcher;
  16
  17 import java.util.ArrayList;
  18 import java.util.Iterator;
  19 import java.util.List;
  20
  21 public class CrossRefs
  22 {
  23   /**
  24    * Finds cross-references for sequences from a specified source database.
  25    * These may be found in four ways:
  26    * <ul>
  27    * <li>as a DBRefEntry on the known sequence, which has a mapped-to sequence</li>
  28    * <li>a sequence of complementary type in the alignment dataset, which has a
  29    * DBRefEntry to one of the known sequence's 'direct' DBRefs</li>
  30    * <li>a sequence of complementary type in the alignment, which has a
  31    * DBRefEntry to one of the known sequence's 'cross-ref' DBRefs</li>
  32    * <li>by fetching the accession from the remote database</li>
  33    * </ul>
  34    *
  35    * @param seqs
  36    *          the sequences whose cross-references we are searching for
  37    * @param dna
  38    *          true if the sequences are from a nucleotide alignment, else false
  39    * @param source
  40    *          the database source we want cross-references to
  41    * @param dataset
  42    *          the alignment dataset the sequences belong to
  43    * @return an alignment containing cross-reference sequences, or null if none
  44    *         found
  45    */
  46   public static AlignmentI findXrefSequences(SequenceI[] seqs, boolean dna,
  47           String source, AlignmentI dataset)
  48   {
  49     List<SequenceI> foundSeqs = new ArrayList<SequenceI>();
  50     AlignedCodonFrame mappings = new AlignedCodonFrame();
  51
  52     for (SequenceI seq : seqs)
  53     {
  54       if (dna != Comparison.isNucleotide(seq))
  55       {
  56         /*
  57          * mixed alignment, and this sequence is of the wrong type
  58          */
  59         continue;
  60       }
  61
  62       /*
  63        * get this sequence's dbrefs to source database (if any)
  64        */
  65       List<DBRefEntry> sourceRefs = DBRefUtils.searchRefsForSource(
  66               seq.getDBRefs(), source);
  67
  68       /*
  69        * first extract any mapped sequences from sourceRefs
  70        */
  71       findMappedDbrefs(seq, sourceRefs, foundSeqs, mappings);
  72
  73       /*
  74        * for remaining sourceRefs, try to match a
  75        * complementary sequence in the dataset
  76        */
  77       findIndirectCrossReferences(seq, source, sourceRefs, dataset,
  78               foundSeqs, mappings);
  79
  80       /*
  81        * fetch any remaining sourceRefs from the source database
  82        */
  83       fetchCrossReferences(seq, sourceRefs, foundSeqs, mappings, dna,
  84               dataset);
  85     }
  86
  87     if (foundSeqs.isEmpty())
  88     {
  89       return null;
  90     }
  91     AlignmentI crossRefs = new Alignment(
  92             foundSeqs.toArray(new SequenceI[foundSeqs.size()]));
  93     crossRefs.addCodonFrame(mappings);
  94     return crossRefs;
  95   }
  96
  97   /**
  98    * Looks for DBRefEntrys to 'source' which have a mapping to a sequence. If
  99    * found, adds the sequence to foundSeqs and removes the dbref from the list.
 100    *
 101    * @param seq
 102    *          the dataset sequence we are searching from
 103    * @param sourceRefs
 104    *          the sequence's dbrefs to 'source'
 105    * @param foundSeqs
 106    *          a list of cross-references to add to
 107    * @param mappings
 108    *          a set of sequence mappings to add to
 109    * @return
 110    */
 111   static void findMappedDbrefs(SequenceI seq, List<DBRefEntry> sourceRefs,
 112           List<SequenceI> foundSeqs, AlignedCodonFrame mappings)
 113   {
 114     Iterator<DBRefEntry> refs = sourceRefs.iterator();
 115     while (refs.hasNext())
 116     {
 117       DBRefEntry dbref = refs.next();
 118       Mapping map = dbref.getMap();
 119       if (map != null)
 120       {
 121         SequenceI mappedTo = map.getTo();
 122         if (mappedTo != null)
 123         {
 124           foundSeqs.add(new Sequence(mappedTo));
 125           refs.remove();
 126
 127           /*
 128            * check mapping is not 'direct' (it shouldn't be if we reach here)
 129            * and add mapping (dna-to-peptide or vice versa) to the set
 130            */
 131           MapList mapList = map.getMap();
 132           int fromRatio = mapList.getFromRatio();
 133           int toRatio = mapList.getToRatio();
 134           if (fromRatio != toRatio)
 135           {
 136             if (fromRatio == 3)
 137             {
 138               mappings.addMap(seq, mappedTo, mapList);
 139             }
 140             else
 141             {
 142               mappings.addMap(mappedTo, seq, mapList.getInverse());
 143             }
 144           }
 145         }
 146       }
 147     }
 148   }
 149
 150   /**
 151    * Tries to fetch seq's database references to 'source' database, and add them
 152    * to the foundSeqs list. If found, tries to make a mapping between seq and
 153    * the retrieved sequence and insert it into the database reference.
 154    *
 155    * @param seq
 156    * @param sourceRefs
 157    * @param foundSeqs
 158    * @param mappings
 159    * @param dna
 160    */
 161   static void fetchCrossReferences(SequenceI seq,
 162           List<DBRefEntry> sourceRefs, List<SequenceI> foundSeqs,
 163           AlignedCodonFrame mappings, boolean dna, AlignmentI dataset)
 164   {
 165     ASequenceFetcher sftch = SequenceFetcherFactory.getSequenceFetcher();
 166     SequenceI[] retrieved;
 167     try
 168     {
 169       retrieved = sftch.getSequences(sourceRefs, !dna);
 170     } catch (Exception e)
 171     {
 172       System.err
 173               .println("Problem whilst retrieving cross references for Sequence : "
 174                       + seq.getName());
 175       e.printStackTrace();
 176       return;
 177     }
 178
 179     if (retrieved != null)
 180     {
 181       updateDbrefMappings(dna, seq, sourceRefs, retrieved, mappings);
 182
 183       SequenceIdMatcher matcher = new SequenceIdMatcher(
 184               dataset.getSequences());
 185       List<SequenceFeature> copiedFeatures = new ArrayList<SequenceFeature>();
 186       CrossRef me = new CrossRef();
 187       for (int rs = 0; rs < retrieved.length; rs++)
 188       {
 189         // TODO: examine each sequence for 'redundancy'
 190         DBRefEntry[] dbr = retrieved[rs].getDBRefs();
 191         if (dbr != null && dbr.length > 0)
 192         {
 193           for (int di = 0; di < dbr.length; di++)
 194           {
 195             // find any entry where we should put in the sequence being
 196             // cross-referenced into the map
 197             Mapping map = dbr[di].getMap();
 198             if (map != null)
 199             {
 200               if (map.getTo() != null && map.getMap() != null)
 201               {
 202                 SequenceI matched = matcher.findIdMatch(map.getTo());
 203                 if (matched != null)
 204                 {
 205                   /*
 206                    * already got an xref to this sequence; update this
 207                    * map to point to the same sequence, and add
 208                    * any new dbrefs to it
 209                    */
 210                   for (DBRefEntry ref : map.getTo().getDBRefs())
 211                   {
 212                     matched.addDBRef(ref); // add or update mapping
 213                   }
 214                   map.setTo(matched);
 215                 }
 216                 else
 217                 {
 218                   matcher.add(map.getTo());
 219                 }
 220                 try
 221                 {
 222                   // compare ms with dss and replace with dss in mapping
 223                   // if map is congruent
 224                   SequenceI ms = map.getTo();
 225                   int sf = map.getMap().getToLowest();
 226                   int st = map.getMap().getToHighest();
 227                   SequenceI mappedrg = ms.getSubSequence(sf, st);
 228                   // SequenceI loc = dss.getSubSequence(sf, st);
 229                   if (mappedrg.getLength() > 0
 230                           && ms.getSequenceAsString().equals(
 231                                   seq.getSequenceAsString()))
 232                   // && mappedrg.getSequenceAsString().equals(
 233                   // loc.getSequenceAsString()))
 234                   {
 235                     String msg = "Mapping updated from " + ms.getName()
 236                             + " to retrieved crossreference "
 237                             + seq.getName();
 238                     System.out.println(msg);
 239                     // method to update all refs of existing To on
 240                     // retrieved sequence with dss and merge any props
 241                     // on To onto dss.
 242                     map.setTo(seq);
 243                     /*
 244                      * copy sequence features as well, avoiding
 245                      * duplication (e.g. same variation from 2
 246                      * transcripts)
 247                      */
 248                     SequenceFeature[] sfs = ms.getSequenceFeatures();
 249                     if (sfs != null)
 250                     {
 251                       for (SequenceFeature feat : sfs)
 252                       {
 253                         /*
 254                          * we override SequenceFeature.equals here (but
 255                          * not elsewhere) to ignore Parent attribute
 256                          * TODO not quite working yet!
 257                          */
 258                         if (!copiedFeatures
 259                                 .contains(me.new MySequenceFeature(feat)))
 260                         {
 261                           seq.addSequenceFeature(feat);
 262                           copiedFeatures.add(feat);
 263                         }
 264                       }
 265                     }
 266                   }
 267                   mappings.addMap(retrieved[rs].getDatasetSequence(),
 268                           map.getTo(), map.getMap());
 269                 } catch (Exception e)
 270                 {
 271                   System.err
 272                           .println("Exception when consolidating Mapped sequence set...");
 273                   e.printStackTrace(System.err);
 274                 }
 275               }
 276             }
 277           }
 278         }
 279         retrieved[rs].updatePDBIds();
 280         foundSeqs.add(retrieved[rs]);
 281       }
 282     }
 283   }
 284
 285   /**
 286    * Searches the alignment for a sequence of complementary type to 'seq' which
 287    * shares a DBRefEntry with it. If found, adds the sequence to foundSeqs and
 288    * removes the resolved sourceRef from the search list.
 289    *
 290    * @param seq
 291    * @param source
 292    * @param sourceRefs
 293    * @param dataset
 294    * @param foundSeqs
 295    * @param mappings
 296    * @return
 297    */
 298   static void findIndirectCrossReferences(SequenceI seq, String source,
 299           List<DBRefEntry> sourceRefs, AlignmentI dataset,
 300           List<SequenceI> foundSeqs, AlignedCodonFrame mappings)
 301   {
 302     Iterator<DBRefEntry> refs = sourceRefs.iterator();
 303     while (refs.hasNext())
 304     {
 305       DBRefEntry dbref = refs.next();
 306       boolean found = searchDatasetForCrossReference(seq, dbref, dataset,
 307               foundSeqs, mappings);
 308       if (found)
 309       {
 310         refs.remove();
 311       }
 312     }
 313   }
 314
 315   /**
 316    * Searches the dataset for a sequence of opposite type to 'excluding', which
 317    * has a cross-reference matching dbref. If found, adds the sequence to
 318    * foundSeqs and removes dbref from the search list.
 319    *
 320    * @param excluding
 321    *          a sequence to ignore (start point of search)
 322    * @param dbref
 323    *          a cross-reference to try to match
 324    * @param dataset
 325    *          sequences to search in
 326    * @param foundSeqs
 327    *          result list to add to
 328    * @param mappings
 329    *          a set of sequence mappings to add to
 330    * @return true if relationship found and sequence added
 331    */
 332   static boolean searchDatasetForCrossReference(SequenceI excluding,
 333           DBRefEntry dbref, AlignmentI dataset, List<SequenceI> foundSeqs,
 334           AlignedCodonFrame mappings)
 335   {
 336     boolean fromNucleotide = Comparison.isNucleotide(excluding);
 337     boolean found = false;
 338     if (dataset == null)
 339     {
 340       return false;
 341     }
 342     if (dataset.getSequences() == null)
 343     {
 344       return false;
 345     }
 346     List<SequenceI> ds;
 347     synchronized (ds = dataset.getSequences())
 348     {
 349       for (SequenceI nxt : ds)
 350       {
 351         if (nxt != null)
 352         {
 353           if (nxt.getDatasetSequence() != null)
 354           {
 355             System.err
 356                     .println("Implementation warning: getProducts passed a dataset alignment without dataset sequences in it!");
 357           }
 358           if (nxt == excluding || nxt == excluding.getDatasetSequence())
 359           {
 360             continue;
 361           }
 362           if (foundSeqs.contains(nxt))
 363           {
 364             /*
 365              * already added this sequence to cross-refs
 366              */
 367             continue;
 368           }
 369           boolean isDna = Comparison.isNucleotide(nxt);
 370           if (isDna == fromNucleotide)
 371           {
 372             /*
 373              * skip this sequence - wrong molecule type
 374              */
 375             continue;
 376           }
 377
 378           /*
 379            * check if this sequence has any dbref matching source and accession
 380            * (version and mapping may differ)
 381            */
 382           List<DBRefEntry> candidates = DBRefUtils.searchRefs(
 383                   nxt.getDBRefs(), dbref);
 384
 385           if (candidates.isEmpty())
 386           {
 387             continue;
 388           }
 389           found = true;
 390           foundSeqs.add(nxt);
 391           if (mappings != null)
 392           {
 393             // don't search if we aren't given a codon map object
 394             for (DBRefEntry candidate : candidates)
 395             {
 396               if (candidate.hasMap())
 397               {
 398                 Mapping mapping = candidate.getMap();
 399                 MapList map = mapping.getMap();
 400                 if (mapping.getTo() != null
 401                         && map.getFromRatio() != map.getToRatio())
 402                 {
 403                   if (fromNucleotide)
 404                   {
 405                     // map is from dna seq to a protein product
 406                     mappings.addMap(excluding, nxt, map);
 407                   }
 408                   else
 409                   {
 410                     // map is from protein seq to its coding dna
 411                     mappings.addMap(nxt, excluding, map.getInverse());
 412                   }
 413                 }
 414               }
 415             }
 416           }
 417         }
 418       }
 419     }
 420     return found;
 421   }
 422
 423   /**
 424    * Updates any empty mappings in the cross-references with one to a compatible
 425    * retrieved sequence if found, and adds any new mappings to the
 426    * AlignedCodonFrame
 427    *
 428    * @param dna
 429    * @param mapFrom
 430    * @param xrefs
 431    * @param retrieved
 432    * @param mappings
 433    */
 434   static void updateDbrefMappings(boolean dna, SequenceI mapFrom,
 435           List<DBRefEntry> xrefs, SequenceI[] retrieved,
 436           AlignedCodonFrame mappings)
 437   {
 438     SequenceIdMatcher matcher = new SequenceIdMatcher(retrieved);
 439     for (DBRefEntry xref : xrefs)
 440     {
 441       if (!xref.hasMap())
 442       {
 443         String targetSeqName = xref.getSource() + "|"
 444                 + xref.getAccessionId();
 445         SequenceI[] matches = matcher.findAllIdMatches(targetSeqName);
 446         if (matches == null)
 447         {
 448           return;
 449         }
 450         for (SequenceI seq : matches)
 451         {
 452           MapList mapping = null;
 453           if (dna)
 454           {
 455             mapping = AlignmentUtils.mapCdnaToProtein(seq, mapFrom);
 456           }
 457           else
 458           {
 459             mapping = AlignmentUtils.mapCdnaToProtein(mapFrom, seq);
 460             if (mapping != null)
 461             {
 462               mapping = mapping.getInverse();
 463             }
 464           }
 465           if (mapping != null)
 466           {
 467             xref.setMap(new Mapping(seq, mapping));
 468             if (dna)
 469             {
 470               AlignmentUtils.computeProteinFeatures(mapFrom, seq, mapping);
 471             }
 472             if (dna)
 473             {
 474               mappings.addMap(mapFrom, seq, mapping);
 475             }
 476             else
 477             {
 478               mappings.addMap(seq, mapFrom, mapping.getInverse());
 479             }
 480             continue;
 481           }
 482         }
 483       }
 484     }
 485   }
 486 }