*/
package jalview.analysis;
+import jalview.datamodel.AlignedCodonFrame;
+import jalview.datamodel.AlignmentAnnotation;
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.SequenceI;
+import jalview.schemes.ResidueProperties;
+import jalview.util.MapList;
+
import java.util.ArrayList;
+import java.util.LinkedHashMap;
import java.util.List;
-
-import jalview.datamodel.SequenceI;
-import jalview.datamodel.AlignmentI;
+import java.util.Map;
/**
- * grab bag of useful alignment manipulation operations
- * Expect these to be refactored elsewhere at some point.
+ * grab bag of useful alignment manipulation operations Expect these to be
+ * refactored elsewhere at some point.
+ *
* @author jimp
- *
+ *
*/
public class AlignmentUtils
{
/**
- * given an existing alignment, create a new alignment including all, or up to flankSize additional symbols from each sequence's dataset sequence
+ * given an existing alignment, create a new alignment including all, or up to
+ * flankSize additional symbols from each sequence's dataset sequence
+ *
* @param core
* @param flankSize
* @return AlignmentI
{
List<SequenceI> sq = new ArrayList<SequenceI>();
int maxoffset = 0;
- for (SequenceI s:core.getSequences())
+ for (SequenceI s : core.getSequences())
{
SequenceI newSeq = s.deriveSequence();
- if (newSeq.getStart()>maxoffset && newSeq.getDatasetSequence().getStart()<s.getStart())
+ if (newSeq.getStart() > maxoffset
+ && newSeq.getDatasetSequence().getStart() < s.getStart())
{
maxoffset = newSeq.getStart();
}
sq.add(newSeq);
}
- if (flankSize>-1) {
+ if (flankSize > -1)
+ {
maxoffset = flankSize;
}
// now add offset to create a new expanded alignment
- for (SequenceI s:sq)
+ for (SequenceI s : sq)
{
SequenceI ds = s;
- while (ds.getDatasetSequence()!=null) {
- ds=ds.getDatasetSequence();
+ while (ds.getDatasetSequence() != null)
+ {
+ ds = ds.getDatasetSequence();
}
- int s_end = s.findPosition(s.getStart()+s.getLength());
+ int s_end = s.findPosition(s.getStart() + s.getLength());
// find available flanking residues for sequence
- int ustream_ds=s.getStart()-ds.getStart(),dstream_ds=ds.getEnd()-s_end;
-
+ int ustream_ds = s.getStart() - ds.getStart(), dstream_ds = ds
+ .getEnd() - s_end;
+
// build new flanked sequence
-
+
// compute gap padding to start of flanking sequence
- int offset=maxoffset - ustream_ds;
-
+ int offset = maxoffset - ustream_ds;
+
// padding is gapChar x ( maxoffset - min(ustream_ds, flank)
- if (flankSize>=0) {
- if (flankSize<ustream_ds)
+ if (flankSize >= 0)
+ {
+ if (flankSize < ustream_ds)
{
- // take up to flankSize residues
- offset = maxoffset-flankSize;
- ustream_ds = flankSize;
- }
- if (flankSize<dstream_ds)
+ // take up to flankSize residues
+ offset = maxoffset - flankSize;
+ ustream_ds = flankSize;
+ }
+ if (flankSize < dstream_ds)
{
- dstream_ds=flankSize;
+ dstream_ds = flankSize;
}
}
- char[] upstream = new String(ds.getSequence(s.getStart()-1-ustream_ds, s.getStart()-1)).toLowerCase().toCharArray();
- char[] downstream = new String(ds.getSequence(s_end-1,s_end+1+dstream_ds)).toLowerCase().toCharArray();
- char[] coreseq=s.getSequence();
- char[] nseq = new char[offset+upstream.length+downstream.length+coreseq.length];
+ char[] upstream = new String(ds.getSequence(s.getStart() - 1
+ - ustream_ds, s.getStart() - 1)).toLowerCase().toCharArray();
+ char[] downstream = new String(ds.getSequence(s_end - 1, s_end + 1
+ + dstream_ds)).toLowerCase().toCharArray();
+ char[] coreseq = s.getSequence();
+ char[] nseq = new char[offset + upstream.length + downstream.length
+ + coreseq.length];
char c = core.getGapCharacter();
// TODO could lowercase the flanking regions
- int p=0;
- for (; p<offset;p++)
+ int p = 0;
+ for (; p < offset; p++)
{
nseq[p] = c;
}
-// s.setSequence(new String(upstream).toLowerCase()+new String(coreseq) + new String(downstream).toLowerCase());
+ // s.setSequence(new String(upstream).toLowerCase()+new String(coreseq) +
+ // new String(downstream).toLowerCase());
System.arraycopy(upstream, 0, nseq, p, upstream.length);
- System.arraycopy(coreseq, 0, nseq, p+upstream.length, coreseq.length);
- System.arraycopy(downstream, 0, nseq, p+coreseq.length+upstream.length, downstream.length);
+ System.arraycopy(coreseq, 0, nseq, p + upstream.length,
+ coreseq.length);
+ System.arraycopy(downstream, 0, nseq, p + coreseq.length
+ + upstream.length, downstream.length);
s.setSequence(new String(nseq));
- s.setStart(s.getStart()-ustream_ds);
- s.setEnd(s_end+downstream.length);
+ s.setStart(s.getStart() - ustream_ds);
+ s.setEnd(s_end + downstream.length);
+ }
+ AlignmentI newAl = new jalview.datamodel.Alignment(
+ sq.toArray(new SequenceI[0]));
+ for (SequenceI s : sq)
+ {
+ if (s.getAnnotation() != null)
+ {
+ for (AlignmentAnnotation aa : s.getAnnotation())
+ {
+ newAl.addAnnotation(aa);
+ }
+ }
}
- AlignmentI newAl = new jalview.datamodel.Alignment(sq.toArray(new SequenceI[0]));
newAl.setDataset(core.getDataset());
return newAl;
}
+
+ /**
+ * Returns the index (zero-based position) of a sequence in an alignment, or
+ * -1 if not found.
+ *
+ * @param al
+ * @param seq
+ * @return
+ */
+ public static int getSequenceIndex(AlignmentI al, SequenceI seq)
+ {
+ int result = -1;
+ int pos = 0;
+ for (SequenceI alSeq : al.getSequences())
+ {
+ if (alSeq == seq)
+ {
+ result = pos;
+ break;
+ }
+ pos++;
+ }
+ return result;
+ }
+
+ /**
+ * Returns a map of lists of sequences in the alignment, keyed by sequence
+ * name. For use in mapping between different alignment views of the same
+ * sequences.
+ *
+ * @see jalview.datamodel.AlignmentI#getSequencesByName()
+ */
+ public static Map<String, List<SequenceI>> getSequencesByName(
+ AlignmentI al)
+ {
+ Map<String, List<SequenceI>> theMap = new LinkedHashMap<String, List<SequenceI>>();
+ for (SequenceI seq : al.getSequences())
+ {
+ String name = seq.getName();
+ if (name != null)
+ {
+ List<SequenceI> seqs = theMap.get(name);
+ if (seqs == null)
+ {
+ seqs = new ArrayList<SequenceI>();
+ theMap.put(name, seqs);
+ }
+ seqs.add(seq);
+ }
+ }
+ return theMap;
+ }
+
+ /**
+ * Build mapping of protein to cDNA alignment. Mappings are made between
+ * sequences which have the same name and compatible lengths. Returns true if
+ * at least one sequence mapping was made, else false.
+ *
+ * @param proteinAlignment
+ * @param cdnaAlignment
+ * @return
+ */
+ public static boolean mapProteinToCdna(final AlignmentI proteinAlignment,
+ final AlignmentI cdnaAlignment)
+ {
+ boolean mapped = false;
+ List<SequenceI> thisSeqs = proteinAlignment.getSequences();
+
+ /*
+ * Build a look-up of cDNA sequences by name, for matching purposes.
+ */
+ Map<String, List<SequenceI>> cdnaSeqs = cdnaAlignment
+ .getSequencesByName();
+
+ for (SequenceI aaSeq : thisSeqs)
+ {
+ AlignedCodonFrame acf = new AlignedCodonFrame(
+ proteinAlignment.getWidth());
+ List<SequenceI> candidates = cdnaSeqs.get(aaSeq.getName());
+ if (candidates == null)
+ {
+ /*
+ * No cDNA sequence with matching name, so no mapping for this protein
+ * sequence
+ */
+ continue;
+ }
+ for (SequenceI cdnaSeq : candidates)
+ {
+ MapList map = mapProteinToCdna(aaSeq, cdnaSeq);
+ if (map != null)
+ {
+ acf.addMap(cdnaSeq, aaSeq, map);
+ mapped = true;
+ }
+ }
+ proteinAlignment.addCodonFrame(acf);
+ }
+ return mapped;
+ }
+
+ /**
+ * Build a mapping (if possible) of a protein to a cDNA sequence. The cDNA
+ * must be three times the length of the protein, possibly after ignoring
+ * start and/or stop codons. Returns null if no mapping is determined.
+ *
+ * @param proteinSeqs
+ * @param cdnaSeq
+ * @return
+ */
+ public static MapList mapProteinToCdna(SequenceI proteinSeq,
+ SequenceI cdnaSeq)
+ {
+ String aaSeqString = proteinSeq.getDatasetSequence()
+ .getSequenceAsString();
+ String cdnaSeqString = cdnaSeq.getDatasetSequence()
+ .getSequenceAsString();
+ if (aaSeqString == null || cdnaSeqString == null)
+ {
+ return null;
+ }
+
+ final int mappedLength = 3 * aaSeqString.length();
+ int cdnaLength = cdnaSeqString.length();
+ int cdnaStart = 1;
+ int cdnaEnd = cdnaLength;
+ final int proteinStart = 1;
+ final int proteinEnd = aaSeqString.length();
+
+ /*
+ * If lengths don't match, try ignoring stop codon.
+ */
+ if (cdnaLength != mappedLength)
+ {
+ for (Object stop : ResidueProperties.STOP)
+ {
+ if (cdnaSeqString.toUpperCase().endsWith((String) stop))
+ {
+ cdnaEnd -= 3;
+ cdnaLength -= 3;
+ break;
+ }
+ }
+ }
+
+ /*
+ * If lengths still don't match, try ignoring start codon.
+ */
+ if (cdnaLength != mappedLength
+ && cdnaSeqString.toUpperCase().startsWith(
+ ResidueProperties.START))
+ {
+ cdnaStart += 3;
+ cdnaLength -= 3;
+ }
+
+ if (cdnaLength == mappedLength)
+ {
+ MapList map = new MapList(new int[]
+ { cdnaStart, cdnaEnd }, new int[]
+ { proteinStart, proteinEnd }, 3, 1);
+ return map;
+ }
+ else
+ {
+ return null;
+ }
+ }
}