src/jalview/analysis/AlignmentUtils.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.2)
   3  * Copyright (C) 2014 The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis;
  22
  23 import jalview.datamodel.AlignedCodonFrame;
  24 import jalview.datamodel.AlignmentAnnotation;
  25 import jalview.datamodel.AlignmentI;
  26 import jalview.datamodel.SequenceI;
  27 import jalview.schemes.ResidueProperties;
  28 import jalview.util.MapList;
  29
  30 import java.util.ArrayList;
  31 import java.util.LinkedHashMap;
  32 import java.util.List;
  33 import java.util.Map;
  34
  35 /**
  36  * grab bag of useful alignment manipulation operations Expect these to be
  37  * refactored elsewhere at some point.
  38  *
  39  * @author jimp
  40  *
  41  */
  42 public class AlignmentUtils
  43 {
  44
  45   /**
  46    * Represents the 3 possible results of trying to map one alignment to
  47    * another.
  48    */
  49   public enum MappingResult
  50   {
  51     Mapped, NotMapped, AlreadyMapped
  52   }
  53
  54   /**
  55    * given an existing alignment, create a new alignment including all, or up to
  56    * flankSize additional symbols from each sequence's dataset sequence
  57    *
  58    * @param core
  59    * @param flankSize
  60    * @return AlignmentI
  61    */
  62   public static AlignmentI expandContext(AlignmentI core, int flankSize)
  63   {
  64     List<SequenceI> sq = new ArrayList<SequenceI>();
  65     int maxoffset = 0;
  66     for (SequenceI s : core.getSequences())
  67     {
  68       SequenceI newSeq = s.deriveSequence();
  69       if (newSeq.getStart() > maxoffset
  70               && newSeq.getDatasetSequence().getStart() < s.getStart())
  71       {
  72         maxoffset = newSeq.getStart();
  73       }
  74       sq.add(newSeq);
  75     }
  76     if (flankSize > -1)
  77     {
  78       maxoffset = flankSize;
  79     }
  80     // now add offset to create a new expanded alignment
  81     for (SequenceI s : sq)
  82     {
  83       SequenceI ds = s;
  84       while (ds.getDatasetSequence() != null)
  85       {
  86         ds = ds.getDatasetSequence();
  87       }
  88       int s_end = s.findPosition(s.getStart() + s.getLength());
  89       // find available flanking residues for sequence
  90       int ustream_ds = s.getStart() - ds.getStart(), dstream_ds = ds
  91               .getEnd() - s_end;
  92
  93       // build new flanked sequence
  94
  95       // compute gap padding to start of flanking sequence
  96       int offset = maxoffset - ustream_ds;
  97
  98       // padding is gapChar x ( maxoffset - min(ustream_ds, flank)
  99       if (flankSize >= 0)
 100       {
 101         if (flankSize < ustream_ds)
 102         {
 103           // take up to flankSize residues
 104           offset = maxoffset - flankSize;
 105           ustream_ds = flankSize;
 106         }
 107         if (flankSize < dstream_ds)
 108         {
 109           dstream_ds = flankSize;
 110         }
 111       }
 112       char[] upstream = new String(ds.getSequence(s.getStart() - 1
 113               - ustream_ds, s.getStart() - 1)).toLowerCase().toCharArray();
 114       char[] downstream = new String(ds.getSequence(s_end - 1, s_end + 1
 115               + dstream_ds)).toLowerCase().toCharArray();
 116       char[] coreseq = s.getSequence();
 117       char[] nseq = new char[offset + upstream.length + downstream.length
 118               + coreseq.length];
 119       char c = core.getGapCharacter();
 120       // TODO could lowercase the flanking regions
 121       int p = 0;
 122       for (; p < offset; p++)
 123       {
 124         nseq[p] = c;
 125       }
 126       // s.setSequence(new String(upstream).toLowerCase()+new String(coreseq) +
 127       // new String(downstream).toLowerCase());
 128       System.arraycopy(upstream, 0, nseq, p, upstream.length);
 129       System.arraycopy(coreseq, 0, nseq, p + upstream.length,
 130               coreseq.length);
 131       System.arraycopy(downstream, 0, nseq, p + coreseq.length
 132               + upstream.length, downstream.length);
 133       s.setSequence(new String(nseq));
 134       s.setStart(s.getStart() - ustream_ds);
 135       s.setEnd(s_end + downstream.length);
 136     }
 137     AlignmentI newAl = new jalview.datamodel.Alignment(
 138             sq.toArray(new SequenceI[0]));
 139     for (SequenceI s : sq)
 140     {
 141       if (s.getAnnotation() != null)
 142       {
 143         for (AlignmentAnnotation aa : s.getAnnotation())
 144         {
 145           newAl.addAnnotation(aa);
 146         }
 147       }
 148     }
 149     newAl.setDataset(core.getDataset());
 150     return newAl;
 151   }
 152
 153   /**
 154    * Returns the index (zero-based position) of a sequence in an alignment, or
 155    * -1 if not found.
 156    *
 157    * @param al
 158    * @param seq
 159    * @return
 160    */
 161   public static int getSequenceIndex(AlignmentI al, SequenceI seq)
 162   {
 163     int result = -1;
 164     int pos = 0;
 165     for (SequenceI alSeq : al.getSequences())
 166     {
 167       if (alSeq == seq)
 168       {
 169         result = pos;
 170         break;
 171       }
 172       pos++;
 173     }
 174     return result;
 175   }
 176
 177   /**
 178    * Returns a map of lists of sequences in the alignment, keyed by sequence
 179    * name. For use in mapping between different alignment views of the same
 180    * sequences.
 181    *
 182    * @see jalview.datamodel.AlignmentI#getSequencesByName()
 183    */
 184   public static Map<String, List<SequenceI>> getSequencesByName(
 185           AlignmentI al)
 186   {
 187     Map<String, List<SequenceI>> theMap = new LinkedHashMap<String, List<SequenceI>>();
 188     for (SequenceI seq : al.getSequences())
 189     {
 190       String name = seq.getName();
 191       if (name != null)
 192       {
 193         List<SequenceI> seqs = theMap.get(name);
 194         if (seqs == null)
 195         {
 196           seqs = new ArrayList<SequenceI>();
 197           theMap.put(name, seqs);
 198         }
 199         seqs.add(seq);
 200       }
 201     }
 202     return theMap;
 203   }
 204
 205   /**
 206    * Build mapping of protein to cDNA alignment. Mappings are made between
 207    * sequences which have the same name and compatible lengths. Has a 3-valued
 208    * result: either Mapped (at least one sequence mapping was created),
 209    * AlreadyMapped (all possible sequence mappings already exist), or NotMapped
 210    * (no possible sequence mappings exist).
 211    *
 212    * @param proteinAlignment
 213    * @param cdnaAlignment
 214    * @return
 215    */
 216   public static MappingResult mapProteinToCdna(
 217           final AlignmentI proteinAlignment,
 218           final AlignmentI cdnaAlignment)
 219   {
 220     boolean mappingPossible = false;
 221     boolean mappingPerformed = false;
 222
 223     List<SequenceI> thisSeqs = proteinAlignment.getSequences();
 224
 225     /*
 226      * Build a look-up of cDNA sequences by name, for matching purposes.
 227      */
 228     Map<String, List<SequenceI>> cdnaSeqs = cdnaAlignment
 229             .getSequencesByName();
 230
 231     for (SequenceI aaSeq : thisSeqs)
 232     {
 233       AlignedCodonFrame acf = new AlignedCodonFrame();
 234       List<SequenceI> candidates = cdnaSeqs.get(aaSeq.getName());
 235       if (candidates == null)
 236       {
 237         /*
 238          * No cDNA sequence with matching name, so no mapping possible for this
 239          * protein sequence
 240          */
 241         continue;
 242       }
 243       mappingPossible = true;
 244       for (SequenceI cdnaSeq : candidates)
 245       {
 246         if (!mappingExists(proteinAlignment.getCodonFrames(),
 247                 aaSeq.getDatasetSequence(), cdnaSeq.getDatasetSequence()))
 248         {
 249           MapList map = mapProteinToCdna(aaSeq, cdnaSeq);
 250           if (map != null)
 251           {
 252             acf.addMap(cdnaSeq, aaSeq, map);
 253             mappingPerformed = true;
 254           }
 255         }
 256       }
 257       proteinAlignment.addCodonFrame(acf);
 258     }
 259
 260     /*
 261      * If at least one mapping was possible but none was done, then the
 262      * alignments are already as mapped as they can be.
 263      */
 264     if (mappingPossible && !mappingPerformed)
 265     {
 266       return MappingResult.AlreadyMapped;
 267     }
 268     else
 269     {
 270       return mappingPerformed ? MappingResult.Mapped
 271               : MappingResult.NotMapped;
 272     }
 273   }
 274
 275   /**
 276    * Answers true if the mappings include one between the given (dataset)
 277    * sequences.
 278    */
 279   public static boolean mappingExists(AlignedCodonFrame[] codonFrames,
 280           SequenceI aaSeq, SequenceI cdnaSeq)
 281   {
 282     if (codonFrames != null)
 283     {
 284       for (AlignedCodonFrame acf : codonFrames)
 285       {
 286         if (cdnaSeq == acf.getDnaForAaSeq(aaSeq))
 287         {
 288           return true;
 289         }
 290       }
 291     }
 292     return false;
 293   }
 294
 295   /**
 296    * Build a mapping (if possible) of a protein to a cDNA sequence. The cDNA
 297    * must be three times the length of the protein, possibly after ignoring
 298    * start and/or stop codons. Returns null if no mapping is determined.
 299    *
 300    * @param proteinSeqs
 301    * @param cdnaSeq
 302    * @return
 303    */
 304   public static MapList mapProteinToCdna(SequenceI proteinSeq,
 305           SequenceI cdnaSeq)
 306   {
 307     String aaSeqString = proteinSeq.getDatasetSequence()
 308             .getSequenceAsString();
 309     String cdnaSeqString = cdnaSeq.getDatasetSequence()
 310             .getSequenceAsString();
 311     if (aaSeqString == null || cdnaSeqString == null)
 312     {
 313       return null;
 314     }
 315
 316     final int mappedLength = 3 * aaSeqString.length();
 317     int cdnaLength = cdnaSeqString.length();
 318     int cdnaStart = 1;
 319     int cdnaEnd = cdnaLength;
 320     final int proteinStart = 1;
 321     final int proteinEnd = aaSeqString.length();
 322
 323     /*
 324      * If lengths don't match, try ignoring stop codon.
 325      */
 326     if (cdnaLength != mappedLength)
 327     {
 328       for (Object stop : ResidueProperties.STOP)
 329       {
 330         if (cdnaSeqString.toUpperCase().endsWith((String) stop))
 331         {
 332           cdnaEnd -= 3;
 333           cdnaLength -= 3;
 334           break;
 335         }
 336       }
 337     }
 338
 339     /*
 340      * If lengths still don't match, try ignoring start codon.
 341      */
 342     if (cdnaLength != mappedLength
 343             && cdnaSeqString.toUpperCase().startsWith(
 344                     ResidueProperties.START))
 345     {
 346       cdnaStart += 3;
 347       cdnaLength -= 3;
 348     }
 349
 350     if (cdnaLength == mappedLength)
 351     {
 352       MapList map = new MapList(new int[]
 353       { cdnaStart, cdnaEnd }, new int[]
 354       { proteinStart, proteinEnd }, 3, 1);
 355       return map;
 356     }
 357     else
 358     {
 359       return null;
 360     }
 361   }
 362 }