src/jalview/analysis/AlignmentUtils.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.2)
   3  * Copyright (C) 2014 The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis;
  22
  23 import jalview.datamodel.AlignedCodonFrame;
  24 import jalview.datamodel.AlignmentAnnotation;
  25 import jalview.datamodel.AlignmentI;
  26 import jalview.datamodel.SequenceI;
  27 import jalview.schemes.ResidueProperties;
  28 import jalview.util.MapList;
  29
  30 import java.util.ArrayList;
  31 import java.util.LinkedHashMap;
  32 import java.util.List;
  33 import java.util.Map;
  34
  35 /**
  36  * grab bag of useful alignment manipulation operations Expect these to be
  37  * refactored elsewhere at some point.
  38  *
  39  * @author jimp
  40  *
  41  */
  42 public class AlignmentUtils
  43 {
  44
  45   /**
  46    * given an existing alignment, create a new alignment including all, or up to
  47    * flankSize additional symbols from each sequence's dataset sequence
  48    *
  49    * @param core
  50    * @param flankSize
  51    * @return AlignmentI
  52    */
  53   public static AlignmentI expandContext(AlignmentI core, int flankSize)
  54   {
  55     List<SequenceI> sq = new ArrayList<SequenceI>();
  56     int maxoffset = 0;
  57     for (SequenceI s : core.getSequences())
  58     {
  59       SequenceI newSeq = s.deriveSequence();
  60       if (newSeq.getStart() > maxoffset
  61               && newSeq.getDatasetSequence().getStart() < s.getStart())
  62       {
  63         maxoffset = newSeq.getStart();
  64       }
  65       sq.add(newSeq);
  66     }
  67     if (flankSize > -1)
  68     {
  69       maxoffset = flankSize;
  70     }
  71     // now add offset to create a new expanded alignment
  72     for (SequenceI s : sq)
  73     {
  74       SequenceI ds = s;
  75       while (ds.getDatasetSequence() != null)
  76       {
  77         ds = ds.getDatasetSequence();
  78       }
  79       int s_end = s.findPosition(s.getStart() + s.getLength());
  80       // find available flanking residues for sequence
  81       int ustream_ds = s.getStart() - ds.getStart(), dstream_ds = ds
  82               .getEnd() - s_end;
  83
  84       // build new flanked sequence
  85
  86       // compute gap padding to start of flanking sequence
  87       int offset = maxoffset - ustream_ds;
  88
  89       // padding is gapChar x ( maxoffset - min(ustream_ds, flank)
  90       if (flankSize >= 0)
  91       {
  92         if (flankSize < ustream_ds)
  93         {
  94           // take up to flankSize residues
  95           offset = maxoffset - flankSize;
  96           ustream_ds = flankSize;
  97         }
  98         if (flankSize < dstream_ds)
  99         {
 100           dstream_ds = flankSize;
 101         }
 102       }
 103       char[] upstream = new String(ds.getSequence(s.getStart() - 1
 104               - ustream_ds, s.getStart() - 1)).toLowerCase().toCharArray();
 105       char[] downstream = new String(ds.getSequence(s_end - 1, s_end + 1
 106               + dstream_ds)).toLowerCase().toCharArray();
 107       char[] coreseq = s.getSequence();
 108       char[] nseq = new char[offset + upstream.length + downstream.length
 109               + coreseq.length];
 110       char c = core.getGapCharacter();
 111       // TODO could lowercase the flanking regions
 112       int p = 0;
 113       for (; p < offset; p++)
 114       {
 115         nseq[p] = c;
 116       }
 117       // s.setSequence(new String(upstream).toLowerCase()+new String(coreseq) +
 118       // new String(downstream).toLowerCase());
 119       System.arraycopy(upstream, 0, nseq, p, upstream.length);
 120       System.arraycopy(coreseq, 0, nseq, p + upstream.length,
 121               coreseq.length);
 122       System.arraycopy(downstream, 0, nseq, p + coreseq.length
 123               + upstream.length, downstream.length);
 124       s.setSequence(new String(nseq));
 125       s.setStart(s.getStart() - ustream_ds);
 126       s.setEnd(s_end + downstream.length);
 127     }
 128     AlignmentI newAl = new jalview.datamodel.Alignment(
 129             sq.toArray(new SequenceI[0]));
 130     for (SequenceI s : sq)
 131     {
 132       if (s.getAnnotation() != null)
 133       {
 134         for (AlignmentAnnotation aa : s.getAnnotation())
 135         {
 136           newAl.addAnnotation(aa);
 137         }
 138       }
 139     }
 140     newAl.setDataset(core.getDataset());
 141     return newAl;
 142   }
 143
 144   /**
 145    * Returns the index (zero-based position) of a sequence in an alignment, or
 146    * -1 if not found.
 147    *
 148    * @param al
 149    * @param seq
 150    * @return
 151    */
 152   public static int getSequenceIndex(AlignmentI al, SequenceI seq)
 153   {
 154     int result = -1;
 155     int pos = 0;
 156     for (SequenceI alSeq : al.getSequences())
 157     {
 158       if (alSeq == seq)
 159       {
 160         result = pos;
 161         break;
 162       }
 163       pos++;
 164     }
 165     return result;
 166   }
 167
 168   /**
 169    * Returns a map of lists of sequences in the alignment, keyed by sequence
 170    * name. For use in mapping between different alignment views of the same
 171    * sequences.
 172    *
 173    * @see jalview.datamodel.AlignmentI#getSequencesByName()
 174    */
 175   public static Map<String, List<SequenceI>> getSequencesByName(
 176           AlignmentI al)
 177   {
 178     Map<String, List<SequenceI>> theMap = new LinkedHashMap<String, List<SequenceI>>();
 179     for (SequenceI seq : al.getSequences())
 180     {
 181       String name = seq.getName();
 182       if (name != null)
 183       {
 184         List<SequenceI> seqs = theMap.get(name);
 185         if (seqs == null)
 186         {
 187           seqs = new ArrayList<SequenceI>();
 188           theMap.put(name, seqs);
 189         }
 190         seqs.add(seq);
 191       }
 192     }
 193     return theMap;
 194   }
 195
 196   /**
 197    * Build mapping of protein to cDNA alignment. Mappings are made between
 198    * sequences which have the same name and compatible lengths. Returns true if
 199    * at least one sequence mapping was made, else false.
 200    *
 201    * @param proteinAlignment
 202    * @param cdnaAlignment
 203    * @return
 204    */
 205   public static boolean mapProteinToCdna(final AlignmentI proteinAlignment,
 206           final AlignmentI cdnaAlignment)
 207   {
 208     boolean mapped = false;
 209     List<SequenceI> thisSeqs = proteinAlignment.getSequences();
 210
 211     /*
 212      * Build a look-up of cDNA sequences by name, for matching purposes.
 213      */
 214     Map<String, List<SequenceI>> cdnaSeqs = cdnaAlignment
 215             .getSequencesByName();
 216
 217     for (SequenceI aaSeq : thisSeqs)
 218     {
 219       AlignedCodonFrame acf = new AlignedCodonFrame(
 220               proteinAlignment.getWidth());
 221       List<SequenceI> candidates = cdnaSeqs.get(aaSeq.getName());
 222       if (candidates == null)
 223       {
 224         /*
 225          * No cDNA sequence with matching name, so no mapping for this protein
 226          * sequence
 227          */
 228         continue;
 229       }
 230       for (SequenceI cdnaSeq : candidates)
 231       {
 232         MapList map = mapProteinToCdna(aaSeq, cdnaSeq);
 233         if (map != null)
 234         {
 235           acf.addMap(cdnaSeq, aaSeq, map);
 236           mapped = true;
 237         }
 238       }
 239       proteinAlignment.addCodonFrame(acf);
 240     }
 241     return mapped;
 242   }
 243
 244   /**
 245    * Build a mapping (if possible) of a protein to a cDNA sequence. The cDNA
 246    * must be three times the length of the protein, possibly after ignoring
 247    * start and/or stop codons. Returns null if no mapping is determined.
 248    *
 249    * @param proteinSeqs
 250    * @param cdnaSeq
 251    * @return
 252    */
 253   public static MapList mapProteinToCdna(SequenceI proteinSeq,
 254           SequenceI cdnaSeq)
 255   {
 256     String aaSeqString = proteinSeq.getDatasetSequence()
 257             .getSequenceAsString();
 258     String cdnaSeqString = cdnaSeq.getDatasetSequence()
 259             .getSequenceAsString();
 260     if (aaSeqString == null || cdnaSeqString == null)
 261     {
 262       return null;
 263     }
 264
 265     final int mappedLength = 3 * aaSeqString.length();
 266     int cdnaLength = cdnaSeqString.length();
 267     int cdnaStart = 1;
 268     int cdnaEnd = cdnaLength;
 269     final int proteinStart = 1;
 270     final int proteinEnd = aaSeqString.length();
 271
 272     /*
 273      * If lengths don't match, try ignoring stop codon.
 274      */
 275     if (cdnaLength != mappedLength)
 276     {
 277       for (Object stop : ResidueProperties.STOP)
 278       {
 279         if (cdnaSeqString.toUpperCase().endsWith((String) stop))
 280         {
 281           cdnaEnd -= 3;
 282           cdnaLength -= 3;
 283           break;
 284         }
 285       }
 286     }
 287
 288     /*
 289      * If lengths still don't match, try ignoring start codon.
 290      */
 291     if (cdnaLength != mappedLength
 292             && cdnaSeqString.toUpperCase().startsWith(
 293                     ResidueProperties.START))
 294     {
 295       cdnaStart += 3;
 296       cdnaLength -= 3;
 297     }
 298
 299     if (cdnaLength == mappedLength)
 300     {
 301       MapList map = new MapList(new int[]
 302       { cdnaStart, cdnaEnd }, new int[]
 303       { proteinStart, proteinEnd }, 3, 1);
 304       return map;
 305     }
 306     else
 307     {
 308       return null;
 309     }
 310   }
 311 }