X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fanalysis%2FAlignmentUtils.java;h=df302345d0af5fdd85bc649f8868ef2cd793bcdf;hb=d62b90cb6effb7b380e5f7d590691dd884b024cf;hp=b78afebf6a6223857dc3ec0a62a2b52f07b789de;hpb=07bb0da70ad46444f5f9bc0b2fa80e9ee3805394;p=jalview.git diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index b78afeb..df30234 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -27,6 +27,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -35,16 +36,21 @@ import java.util.TreeMap; import jalview.datamodel.AlignedCodon; import jalview.datamodel.AlignedCodonFrame; +import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentAnnotation; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; +import jalview.datamodel.DBRefSource; +import jalview.datamodel.FeatureProperties; import jalview.datamodel.Mapping; import jalview.datamodel.SearchResults; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceGroup; import jalview.datamodel.SequenceI; import jalview.schemes.ResidueProperties; +import jalview.util.DBRefUtils; import jalview.util.MapList; +import jalview.util.MappingUtils; /** * grab bag of useful alignment manipulation operations Expect these to be @@ -71,18 +77,22 @@ public class AlignmentUtils for (SequenceI s : core.getSequences()) { SequenceI newSeq = s.deriveSequence(); - if (newSeq.getStart() > maxoffset + final int newSeqStart = newSeq.getStart() - 1; + if (newSeqStart > maxoffset && newSeq.getDatasetSequence().getStart() < s.getStart()) { - maxoffset = newSeq.getStart(); + maxoffset = newSeqStart; } sq.add(newSeq); } if (flankSize > -1) { - maxoffset = flankSize; + maxoffset = Math.min(maxoffset, flankSize); } - // now add offset to create a new expanded alignment + + /* + * now add offset left and right to create an expanded alignment + */ for (SequenceI s : sq) { SequenceI ds = s; @@ -92,8 +102,8 @@ public class AlignmentUtils } int s_end = s.findPosition(s.getStart() + s.getLength()); // find available flanking residues for sequence - int ustream_ds = s.getStart() - ds.getStart(), dstream_ds = ds - .getEnd() - s_end; + int ustream_ds = s.getStart() - ds.getStart(); + int dstream_ds = ds.getEnd() - s_end; // build new flanked sequence @@ -109,27 +119,27 @@ public class AlignmentUtils offset = maxoffset - flankSize; ustream_ds = flankSize; } - if (flankSize < dstream_ds) + if (flankSize <= dstream_ds) { - dstream_ds = flankSize; + dstream_ds = flankSize - 1; } } + // TODO use Character.toLowerCase to avoid creating String objects? char[] upstream = new String(ds.getSequence(s.getStart() - 1 - ustream_ds, s.getStart() - 1)).toLowerCase().toCharArray(); - char[] downstream = new String(ds.getSequence(s_end - 1, s_end + 1 + char[] downstream = new String(ds.getSequence(s_end - 1, s_end + dstream_ds)).toLowerCase().toCharArray(); char[] coreseq = s.getSequence(); char[] nseq = new char[offset + upstream.length + downstream.length + coreseq.length]; char c = core.getGapCharacter(); - // TODO could lowercase the flanking regions + int p = 0; for (; p < offset; p++) { nseq[p] = c; } - // s.setSequence(new String(upstream).toLowerCase()+new String(coreseq) + - // new String(downstream).toLowerCase()); + System.arraycopy(upstream, 0, nseq, p, upstream.length); System.arraycopy(coreseq, 0, nseq, p + upstream.length, coreseq.length); @@ -147,6 +157,7 @@ public class AlignmentUtils { for (AlignmentAnnotation aa : s.getAnnotation()) { + aa.adjustForAlignment(); // JAL-1712 fix newAl.addAnnotation(aa); } } @@ -1258,4 +1269,137 @@ public class AlignmentUtils } return false; } + + /** + * Constructs an alignment consisting of the mapped exon regions in the given + * nucleotide sequences, and updates mappings to match. + * + * @param dna + * aligned dna sequences + * @param mappings + * from dna to protein; these are replaced with new mappings + * @return an alignment whose sequences are the exon-only parts of the dna + * sequences (or null if no exons are found) + */ + public static AlignmentI makeExonAlignment(SequenceI[] dna, + Set mappings) + { + Set newMappings = new LinkedHashSet(); + List exonSequences = new ArrayList(); + + for (SequenceI dnaSeq : dna) + { + final SequenceI ds = dnaSeq.getDatasetSequence(); + List seqMappings = MappingUtils + .findMappingsForSequence(ds, mappings); + for (AlignedCodonFrame acf : seqMappings) + { + AlignedCodonFrame newMapping = new AlignedCodonFrame(); + final List mappedExons = makeExonSequences(ds, acf, + newMapping); + if (!mappedExons.isEmpty()) + { + exonSequences.addAll(mappedExons); + newMappings.add(newMapping); + } + } + } + AlignmentI al = new Alignment( + exonSequences.toArray(new SequenceI[exonSequences.size()])); + al.setDataset(null); + + /* + * Replace the old mappings with the new ones + */ + mappings.clear(); + mappings.addAll(newMappings); + + return al; + } + + /** + * Helper method to make exon-only sequences and populate their mappings to + * protein products + *

+ * For example, if ggCCaTTcGAg has mappings [3, 4, 6, 7, 9, 10] to protein + * then generate a sequence CCTTGA with mapping [1, 6] to the same protein + * residues + *

+ * Typically eukaryotic dna will include exons encoding for a single peptide + * sequence i.e. return a single result. Bacterial dna may have overlapping + * exon mappings coding for multiple peptides so return multiple results + * (example EMBL KF591215). + * + * @param dnaSeq + * a dna dataset sequence + * @param mapping + * containing one or more mappings of the sequence to protein + * @param newMapping + * the new mapping to populate, from the exon-only sequences to their + * mapped protein sequences + * @return + */ + protected static List makeExonSequences(SequenceI dnaSeq, + AlignedCodonFrame mapping, AlignedCodonFrame newMapping) + { + List exonSequences = new ArrayList(); + List seqMappings = mapping.getMappingsForSequence(dnaSeq); + final char[] dna = dnaSeq.getSequence(); + for (Mapping seqMapping : seqMappings) + { + StringBuilder newSequence = new StringBuilder(dnaSeq.getLength()); + + /* + * Get the codon regions as { [2, 5], [7, 12], [14, 14] etc } + */ + final List dnaExonRanges = seqMapping.getMap().getFromRanges(); + for (int[] range : dnaExonRanges) + { + for (int pos = range[0]; pos <= range[1]; pos++) + { + newSequence.append(dna[pos - 1]); + } + } + + SequenceI exon = new Sequence(dnaSeq.getName(), + newSequence.toString()); + + /* + * Locate any xrefs to CDS database on the protein product and attach to + * the CDS sequence. Also add as a sub-token of the sequence name. + */ + // default to "CDS" if we can't locate an actual gene id + String cdsAccId = FeatureProperties + .getCodingFeature(DBRefSource.EMBL); + DBRefEntry[] cdsRefs = DBRefUtils.selectRefs(seqMapping.getTo() + .getDBRef(), DBRefSource.CODINGDBS); + if (cdsRefs != null) + { + for (DBRefEntry cdsRef : cdsRefs) + { + exon.addDBRef(new DBRefEntry(cdsRef)); + cdsAccId = cdsRef.getAccessionId(); + } + } + exon.setName(exon.getName() + "|" + cdsAccId); + exon.createDatasetSequence(); + + /* + * Build new mappings - from the same protein regions, but now to + * contiguous exons + */ + List exonRange = new ArrayList(); + exonRange.add(new int[] + { 1, newSequence.length() }); + MapList map = new MapList(exonRange, seqMapping.getMap() + .getToRanges(), + 3, 1); + newMapping.addMap(exon.getDatasetSequence(), seqMapping.getTo(), map); + MapList cdsToDnaMap = new MapList(dnaExonRanges, exonRange, 1, 1); + newMapping.addMap(dnaSeq, exon.getDatasetSequence(), cdsToDnaMap); + + exonSequences.add(exon); + } + return exonSequences; + } }