From adf4e6df363330f72c49409a21a10a480e7b9f56 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Tue, 1 Mar 2016 12:10:25 +0000 Subject: [PATCH] JAL-1705 align CDS to transcripts --- src/jalview/analysis/AlignmentUtils.java | 235 +++++++++++++++++++++++++----- 1 file changed, 199 insertions(+), 36 deletions(-) diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index 34eaa60..bdcfc2a 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -28,6 +28,7 @@ import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; import jalview.datamodel.DBRefSource; import jalview.datamodel.FeatureProperties; +import jalview.datamodel.IncompleteCodonException; import jalview.datamodel.Mapping; import jalview.datamodel.SearchResults; import jalview.datamodel.Sequence; @@ -37,6 +38,7 @@ import jalview.datamodel.SequenceI; import jalview.io.gff.SequenceOntologyFactory; import jalview.io.gff.SequenceOntologyI; import jalview.schemes.ResidueProperties; +import jalview.util.Comparison; import jalview.util.DBRefUtils; import jalview.util.MapList; import jalview.util.MappingUtils; @@ -44,6 +46,8 @@ import jalview.util.MappingUtils; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; @@ -1024,14 +1028,20 @@ public class AlignmentUtils Iterator codons = seqMap.getCodonIterator(dna, gapChar); while (codons.hasNext()) { - AlignedCodon codon = codons.next(); - Map seqProduct = alignedCodons.get(codon); - if (seqProduct == null) + try { - seqProduct = new HashMap(); - alignedCodons.put(codon, seqProduct); + AlignedCodon codon = codons.next(); + Map seqProduct = alignedCodons.get(codon); + if (seqProduct == null) + { + seqProduct = new HashMap(); + alignedCodons.put(codon, seqProduct); + } + seqProduct.put(protein, codon.product); + } catch (IncompleteCodonException e) + { + // possible incomplete trailing codon - ignore } - seqProduct.put(protein, codon.product); } } @@ -1320,19 +1330,27 @@ public class AlignmentUtils } /** - * Constructs an alignment consisting of the mapped cds regions in the given - * nucleotide sequences, and updates mappings to match. + * Constructs an alignment consisting of the mapped (CDS) regions in the given + * nucleotide sequences, and updates mappings to match. The new sequences are + * aligned as per the original sequences (with gapped columns omitted). * * @param dna * aligned dna sequences * @param mappings * from dna to protein; these are replaced with new mappings + * @param gapChar * @return an alignment whose sequences are the cds-only parts of the dna - * sequences (or null if no cds are found) + * sequences (or null if no mappings are found) */ public static AlignmentI makeCdsAlignment(SequenceI[] dna, - List mappings) + List mappings, char gapChar) { + List cdsColumns = findCdsColumns(dna); + + /* + * create CDS sequences and new mappings + * (from cdna to cds, and cds to peptide) + */ List newMappings = new ArrayList(); List cdsSequences = new ArrayList(); @@ -1344,8 +1362,8 @@ public class AlignmentUtils for (AlignedCodonFrame acf : seqMappings) { AlignedCodonFrame newMapping = new AlignedCodonFrame(); - final List mappedCds = makeCdsSequences(ds, acf, - newMapping); + final List mappedCds = makeCdsSequences(dnaSeq, acf, + cdsColumns, newMapping, gapChar); if (!mappedCds.isEmpty()) { cdsSequences.addAll(mappedCds); @@ -1355,6 +1373,7 @@ public class AlignmentUtils } AlignmentI al = new Alignment( cdsSequences.toArray(new SequenceI[cdsSequences.size()])); + al.setGapCharacter(gapChar); al.setDataset(null); /* @@ -1367,6 +1386,128 @@ public class AlignmentUtils } /** + * Returns a consolidated list of column ranges where at least one sequence + * has a CDS feature. This assumes CDS features are on genomic sequence i.e. + * are for contiguous CDS ranges (no gaps). + * + * @param seqs + * @return + */ + public static List findCdsColumns(SequenceI[] seqs) + { + // TODO use refactored code from AlignViewController + // markColumnsContainingFeatures, not reinvent the wheel! + + List result = new ArrayList(); + for (SequenceI seq : seqs) + { + result.addAll(findCdsColumns(seq)); + } + + /* + * sort and compact the list into ascending, non-overlapping ranges + */ + Collections.sort(result, new Comparator() + { + @Override + public int compare(int[] o1, int[] o2) + { + return Integer.compare(o1[0], o2[0]); + } + }); + result = MapList.coalesceRanges(result); + + return result; + } + + public static List findCdsColumns(SequenceI seq) + { + List result = new ArrayList(); + SequenceOntologyI so = SequenceOntologyFactory.getInstance(); + SequenceFeature[] sfs = seq.getSequenceFeatures(); + if (sfs != null) + { + for (SequenceFeature sf : sfs) + { + if (so.isA(sf.getType(), SequenceOntologyI.CDS)) + { + int colStart = seq.findIndex(sf.getBegin()); + int colEnd = seq.findIndex(sf.getEnd()); + result.add(new int[] { colStart, colEnd }); + } + } + } + return result; + } + + /** + * Answers true if all sequences have a gap at (or do not extend to) the + * specified column position (base 1) + * + * @param seqs + * @param col + * @return + */ + public static boolean isGappedColumn(List seqs, int col) + { + if (seqs != null) + { + for (SequenceI seq : seqs) + { + if (!Comparison.isGap(seq.getCharAt(col - 1))) + { + return false; + } + } + } + return true; + } + + /** + * Returns the column ranges (base 1) of each aligned sequence that are + * involved in any mapping. This is a helper method for aligning protein + * products of aligned transcripts. + * + * @param mappedSequences + * (possibly gapped) dna sequences + * @param mappings + * @return + */ + protected static List> getMappedColumns( + List mappedSequences, + List mappings) + { + List> result = new ArrayList>(); + for (SequenceI seq : mappedSequences) + { + List columns = new ArrayList(); + List seqMappings = MappingUtils + .findMappingsForSequence(seq, mappings); + for (AlignedCodonFrame mapping : seqMappings) + { + List maps = mapping.getMappingsForSequence(seq); + for (Mapping map : maps) + { + /* + * Get the codon regions as { [2, 5], [7, 12], [14, 14] etc } + * Find and add the overall aligned column range for each + */ + for (int[] cdsRange : map.getMap().getFromRanges()) + { + int startPos = cdsRange[0]; + int endPos = cdsRange[1]; + int startCol = seq.findIndex(startPos); + int endCol = seq.findIndex(endPos); + columns.add(new int[] { startCol, endCol }); + } + } + } + result.add(columns); + } + return result; + } + + /** * Helper method to make cds-only sequences and populate their mappings to * protein products *

@@ -1380,35 +1521,38 @@ public class AlignmentUtils * (example EMBL KF591215). * * @param dnaSeq - * a dna dataset sequence + * a dna aligned sequence * @param mapping * containing one or more mappings of the sequence to protein + * @param ungappedCdsColumns * @param newMappings * the new mapping to populate, from the cds-only sequences to their * mapped protein sequences * @return */ protected static List makeCdsSequences(SequenceI dnaSeq, - AlignedCodonFrame mapping, AlignedCodonFrame newMappings) + AlignedCodonFrame mapping, List ungappedCdsColumns, + AlignedCodonFrame newMappings, char gapChar) { List cdsSequences = new ArrayList(); List seqMappings = mapping.getMappingsForSequence(dnaSeq); for (Mapping seqMapping : seqMappings) { - SequenceI cds = makeCdsSequence(dnaSeq, seqMapping); + SequenceI cds = makeCdsSequence(dnaSeq, seqMapping, + ungappedCdsColumns, gapChar); cdsSequences.add(cds); /* * add new mappings, from dna to cds, and from cds to peptide */ - MapList dnaToCds = addCdsMappings(dnaSeq, cds, seqMapping, - newMappings); + MapList dnaToCds = addCdsMappings(dnaSeq.getDatasetSequence(), cds, + seqMapping, newMappings); /* * transfer any features on dna that overlap the CDS */ - transferFeatures(dnaSeq, cds, dnaToCds, null, "CDS" /* SequenceOntology.CDS */); + transferFeatures(dnaSeq, cds, dnaToCds, null, SequenceOntologyI.CDS); } return cdsSequences; } @@ -1534,10 +1678,11 @@ public class AlignmentUtils * the peptide ranges taken unchanged from the dna mapping */ List cdsRanges = new ArrayList(); - cdsRanges.add(new int[] { 1, cdsSeq.getLength() }); + SequenceI cdsDataset = cdsSeq.getDatasetSequence(); + cdsRanges.add(new int[] { 1, cdsDataset.getLength() }); MapList cdsToPeptide = new MapList(cdsRanges, dnaMapping.getMap() .getToRanges(), 3, 1); - newMappings.addMap(cdsSeq.getDatasetSequence(), dnaMapping.getTo(), + newMappings.addMap(cdsDataset, dnaMapping.getTo(), cdsToPeptide); /* @@ -1545,7 +1690,7 @@ public class AlignmentUtils */ MapList dnaToCds = new MapList( dnaMapping.getMap().getFromRanges(), cdsRanges, 1, 1); - newMappings.addMap(dnaSeq, cdsSeq.getDatasetSequence(), dnaToCds); + newMappings.addMap(dnaSeq, cdsDataset, dnaToCds); return dnaToCds; } @@ -1557,34 +1702,52 @@ public class AlignmentUtils * nucleotide sequence * @param seqMapping * mappings from CDS regions of nucleotide + * @param ungappedCdsColumns * @return */ protected static SequenceI makeCdsSequence(SequenceI dnaSeq, - Mapping seqMapping) + Mapping seqMapping, List ungappedCdsColumns, char gapChar) { - StringBuilder newSequence = new StringBuilder(dnaSeq.getLength()); - final char[] dna = dnaSeq.getSequence(); - int offset = dnaSeq.getStart() - 1; + int cdsWidth = MappingUtils.getLength(ungappedCdsColumns); /* - * Get the codon regions as { [2, 5], [7, 12], [14, 14] etc } + * populate CDS columns with the aligned + * column character if that column is mapped (which may be a gap + * if an intron interrupts a codon), else with a gap */ - final List dnaCdsRanges = seqMapping.getMap().getFromRanges(); - for (int[] range : dnaCdsRanges) + List fromRanges = seqMapping.getMap().getFromRanges(); + char[] cdsChars = new char[cdsWidth]; + int pos = 0; + for (int[] columns : ungappedCdsColumns) { - // TODO handle reverse mapping as well (range[1] < range[0]) - for (int pos = range[0]; pos <= range[1]; pos++) + for (int i = columns[0]; i <= columns[1]; i++) { - newSequence.append(dna[pos - offset - 1]); + char dnaChar = dnaSeq.getCharAt(i - 1); + if (Comparison.isGap(dnaChar)) + { + cdsChars[pos] = gapChar; + } + else + { + int seqPos = dnaSeq.findPosition(i - 1); + if (MappingUtils.contains(fromRanges, seqPos)) + { + cdsChars[pos] = dnaChar; + } + else + { + cdsChars[pos] = gapChar; + } + } + pos++; } } + SequenceI cdsSequence = new Sequence(dnaSeq.getName(), + String.valueOf(cdsChars)); - SequenceI cds = new Sequence(dnaSeq.getName(), - newSequence.toString()); - - transferDbRefs(seqMapping.getTo(), cds); + transferDbRefs(seqMapping.getTo(), cdsSequence); - return cds; + return cdsSequence; } /** -- 1.7.10.2