X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fanalysis%2FAlignmentUtils.java;h=a811d846f52fd26b490d844107408471827d0164;hb=3ef44bef1f825d26977dedd1608469712a87fe15;hp=b78afebf6a6223857dc3ec0a62a2b52f07b789de;hpb=b6acaa87c22c1d31deb3f9b87a143def43ae34b7;p=jalview.git diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index b78afeb..a811d84 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -35,16 +35,22 @@ import java.util.TreeMap; import jalview.datamodel.AlignedCodon; import jalview.datamodel.AlignedCodonFrame; +import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentAnnotation; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; +import jalview.datamodel.DBRefSource; +import jalview.datamodel.FeatureProperties; import jalview.datamodel.Mapping; import jalview.datamodel.SearchResults; import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceGroup; import jalview.datamodel.SequenceI; import jalview.schemes.ResidueProperties; +import jalview.util.DBRefUtils; import jalview.util.MapList; +import jalview.util.MappingUtils; /** * grab bag of useful alignment manipulation operations Expect these to be @@ -1258,4 +1264,124 @@ public class AlignmentUtils } return false; } + + /** + * Constructs an alignment consisting of the mapped exon regions in the given + * nucleotide sequences, and updates mappings to match. + * + * @param dna + * aligned dna sequences + * @param mappings + * from dna to protein; these are replaced with new mappings + * @return an alignment whose sequences are the exon-only parts of the dna + * sequences (or null if no exons are found) + */ + public static AlignmentI makeExonAlignment(SequenceI[] dna, + Set mappings) + { + Set newMappings = new HashSet(); + List exonSequences = new ArrayList(); + + for (SequenceI dnaSeq : dna) + { + final SequenceI ds = dnaSeq.getDatasetSequence(); + List seqMappings = MappingUtils + .findMappingsForSequence(ds, mappings); + if (!seqMappings.isEmpty()) + { + /* + * We assume here that only one protein mapping is expected per dna + * sequence. Mapping to multiple protein sequences is conceivable but + * undefined. Splitting a mapping to one protein sequence across + * multiple mappings is possible but pathological. Need closer + * constraints on the contents of AlignedCodonFrame. + */ + AlignedCodonFrame newMapping = new AlignedCodonFrame(); + final SequenceI exonSequence = makeExonSequence(ds, + seqMappings.get(0), newMapping); + exonSequences.add(exonSequence); + newMappings.add(newMapping); + } + } + AlignmentI al = new Alignment( + exonSequences.toArray(new SequenceI[exonSequences.size()])); + al.setDataset(null); + + /* + * Replace the old mappings with the new ones + */ + mappings.clear(); + mappings.addAll(newMappings); + + return al; + } + + /** + * Helper method to make an exon-only sequence and populate its mapping to + * protein + *

+ * For example, if ggCCaTTcGAg has mappings [3, 4, 6, 7, 9, 10] to protein + * then generate a sequence CCTTGA with mapping [1, 6] to the same protein + * residues + * + * @param dnaSeq + * a dna dataset sequence + * @param mapping + * the current mapping of the sequence to protein + * @param newMapping + * the new mapping to populate, from the exon-only sequence + * @return + */ + protected static SequenceI makeExonSequence(SequenceI dnaSeq, + AlignedCodonFrame acf, AlignedCodonFrame newMapping) + { + Mapping mapping = acf.getMappingForSequence(dnaSeq); + final char[] dna = dnaSeq.getSequence(); + StringBuilder newSequence = new StringBuilder(dnaSeq.getLength()); + + /* + * Get the codon regions as { [2, 5], [7, 12], [14, 14] etc } + */ + List exonRanges = mapping.getMap().getFromRanges(); + for (int[] range : exonRanges) + { + for (int pos = range[0]; pos <= range[1]; pos++) + { + newSequence.append(dna[pos - 1]); + } + } + + SequenceI exon = new Sequence(dnaSeq.getName(), newSequence.toString()); + + /* + * Locate any xrefs to CDS database on the protein product and attach to the + * CDS sequence. Also add as a sub-token of the sequence name. + */ + // default to "CDS" if we can't locate an actual gene id + String cdsAccId = FeatureProperties.getCodingFeature(DBRefSource.EMBL); + DBRefEntry[] cdsRefs = DBRefUtils.selectRefs( + mapping.getTo().getDBRef(), DBRefSource.CODINGDBS); + if (cdsRefs != null) + { + for (DBRefEntry cdsRef : cdsRefs) + { + exon.addDBRef(new DBRefEntry(cdsRef)); + cdsAccId = cdsRef.getAccessionId(); + } + } + exon.setName(exon.getName() + "|" + cdsAccId); + exon.createDatasetSequence(); + + /* + * Build new mappings - from the same protein regions, but now to contiguous + * exons + */ + List exonRange = new ArrayList(); + exonRange.add(new int[] + { 1, newSequence.length() }); + MapList map = new MapList(exonRange, mapping.getMap().getToRanges(), 3, 1); + newMapping.addMap(exon.getDatasetSequence(), mapping.getTo(), map); + + return exon; + } }