X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fanalysis%2FAlignmentUtils.java;h=df302345d0af5fdd85bc649f8868ef2cd793bcdf;hb=a4780f236ded1ae07824254bca59b2aa0ea539ba;hp=a811d846f52fd26b490d844107408471827d0164;hpb=3ef44bef1f825d26977dedd1608469712a87fe15;p=jalview.git diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index a811d84..df30234 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -27,6 +27,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -44,7 +45,6 @@ import jalview.datamodel.FeatureProperties; import jalview.datamodel.Mapping; import jalview.datamodel.SearchResults; import jalview.datamodel.Sequence; -import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceGroup; import jalview.datamodel.SequenceI; import jalview.schemes.ResidueProperties; @@ -77,18 +77,22 @@ public class AlignmentUtils for (SequenceI s : core.getSequences()) { SequenceI newSeq = s.deriveSequence(); - if (newSeq.getStart() > maxoffset + final int newSeqStart = newSeq.getStart() - 1; + if (newSeqStart > maxoffset && newSeq.getDatasetSequence().getStart() < s.getStart()) { - maxoffset = newSeq.getStart(); + maxoffset = newSeqStart; } sq.add(newSeq); } if (flankSize > -1) { - maxoffset = flankSize; + maxoffset = Math.min(maxoffset, flankSize); } - // now add offset to create a new expanded alignment + + /* + * now add offset left and right to create an expanded alignment + */ for (SequenceI s : sq) { SequenceI ds = s; @@ -98,8 +102,8 @@ public class AlignmentUtils } int s_end = s.findPosition(s.getStart() + s.getLength()); // find available flanking residues for sequence - int ustream_ds = s.getStart() - ds.getStart(), dstream_ds = ds - .getEnd() - s_end; + int ustream_ds = s.getStart() - ds.getStart(); + int dstream_ds = ds.getEnd() - s_end; // build new flanked sequence @@ -115,27 +119,27 @@ public class AlignmentUtils offset = maxoffset - flankSize; ustream_ds = flankSize; } - if (flankSize < dstream_ds) + if (flankSize <= dstream_ds) { - dstream_ds = flankSize; + dstream_ds = flankSize - 1; } } + // TODO use Character.toLowerCase to avoid creating String objects? char[] upstream = new String(ds.getSequence(s.getStart() - 1 - ustream_ds, s.getStart() - 1)).toLowerCase().toCharArray(); - char[] downstream = new String(ds.getSequence(s_end - 1, s_end + 1 + char[] downstream = new String(ds.getSequence(s_end - 1, s_end + dstream_ds)).toLowerCase().toCharArray(); char[] coreseq = s.getSequence(); char[] nseq = new char[offset + upstream.length + downstream.length + coreseq.length]; char c = core.getGapCharacter(); - // TODO could lowercase the flanking regions + int p = 0; for (; p < offset; p++) { nseq[p] = c; } - // s.setSequence(new String(upstream).toLowerCase()+new String(coreseq) + - // new String(downstream).toLowerCase()); + System.arraycopy(upstream, 0, nseq, p, upstream.length); System.arraycopy(coreseq, 0, nseq, p + upstream.length, coreseq.length); @@ -153,6 +157,7 @@ public class AlignmentUtils { for (AlignmentAnnotation aa : s.getAnnotation()) { + aa.adjustForAlignment(); // JAL-1712 fix newAl.addAnnotation(aa); } } @@ -1279,7 +1284,7 @@ public class AlignmentUtils public static AlignmentI makeExonAlignment(SequenceI[] dna, Set mappings) { - Set newMappings = new HashSet(); + Set newMappings = new LinkedHashSet(); List exonSequences = new ArrayList(); for (SequenceI dnaSeq : dna) @@ -1287,20 +1292,16 @@ public class AlignmentUtils final SequenceI ds = dnaSeq.getDatasetSequence(); List seqMappings = MappingUtils .findMappingsForSequence(ds, mappings); - if (!seqMappings.isEmpty()) + for (AlignedCodonFrame acf : seqMappings) { - /* - * We assume here that only one protein mapping is expected per dna - * sequence. Mapping to multiple protein sequences is conceivable but - * undefined. Splitting a mapping to one protein sequence across - * multiple mappings is possible but pathological. Need closer - * constraints on the contents of AlignedCodonFrame. - */ AlignedCodonFrame newMapping = new AlignedCodonFrame(); - final SequenceI exonSequence = makeExonSequence(ds, - seqMappings.get(0), newMapping); - exonSequences.add(exonSequence); - newMappings.add(newMapping); + final List mappedExons = makeExonSequences(ds, acf, + newMapping); + if (!mappedExons.isEmpty()) + { + exonSequences.addAll(mappedExons); + newMappings.add(newMapping); + } } } AlignmentI al = new Alignment( @@ -1317,71 +1318,88 @@ public class AlignmentUtils } /** - * Helper method to make an exon-only sequence and populate its mapping to - * protein + * Helper method to make exon-only sequences and populate their mappings to + * protein products *

* For example, if ggCCaTTcGAg has mappings [3, 4, 6, 7, 9, 10] to protein * then generate a sequence CCTTGA with mapping [1, 6] to the same protein * residues + *

+ * Typically eukaryotic dna will include exons encoding for a single peptide + * sequence i.e. return a single result. Bacterial dna may have overlapping + * exon mappings coding for multiple peptides so return multiple results + * (example EMBL KF591215). * * @param dnaSeq * a dna dataset sequence * @param mapping - * the current mapping of the sequence to protein + * containing one or more mappings of the sequence to protein * @param newMapping - * the new mapping to populate, from the exon-only sequence + * the new mapping to populate, from the exon-only sequences to their + * mapped protein sequences * @return */ - protected static SequenceI makeExonSequence(SequenceI dnaSeq, - AlignedCodonFrame acf, AlignedCodonFrame newMapping) + protected static List makeExonSequences(SequenceI dnaSeq, + AlignedCodonFrame mapping, AlignedCodonFrame newMapping) { - Mapping mapping = acf.getMappingForSequence(dnaSeq); + List exonSequences = new ArrayList(); + List seqMappings = mapping.getMappingsForSequence(dnaSeq); final char[] dna = dnaSeq.getSequence(); - StringBuilder newSequence = new StringBuilder(dnaSeq.getLength()); - - /* - * Get the codon regions as { [2, 5], [7, 12], [14, 14] etc } - */ - List exonRanges = mapping.getMap().getFromRanges(); - for (int[] range : exonRanges) + for (Mapping seqMapping : seqMappings) { - for (int pos = range[0]; pos <= range[1]; pos++) + StringBuilder newSequence = new StringBuilder(dnaSeq.getLength()); + + /* + * Get the codon regions as { [2, 5], [7, 12], [14, 14] etc } + */ + final List dnaExonRanges = seqMapping.getMap().getFromRanges(); + for (int[] range : dnaExonRanges) { - newSequence.append(dna[pos - 1]); + for (int pos = range[0]; pos <= range[1]; pos++) + { + newSequence.append(dna[pos - 1]); + } } - } - SequenceI exon = new Sequence(dnaSeq.getName(), newSequence.toString()); + SequenceI exon = new Sequence(dnaSeq.getName(), + newSequence.toString()); - /* - * Locate any xrefs to CDS database on the protein product and attach to the - * CDS sequence. Also add as a sub-token of the sequence name. - */ - // default to "CDS" if we can't locate an actual gene id - String cdsAccId = FeatureProperties.getCodingFeature(DBRefSource.EMBL); - DBRefEntry[] cdsRefs = DBRefUtils.selectRefs( - mapping.getTo().getDBRef(), DBRefSource.CODINGDBS); - if (cdsRefs != null) - { - for (DBRefEntry cdsRef : cdsRefs) + /* + * Locate any xrefs to CDS database on the protein product and attach to + * the CDS sequence. Also add as a sub-token of the sequence name. + */ + // default to "CDS" if we can't locate an actual gene id + String cdsAccId = FeatureProperties + .getCodingFeature(DBRefSource.EMBL); + DBRefEntry[] cdsRefs = DBRefUtils.selectRefs(seqMapping.getTo() + .getDBRef(), DBRefSource.CODINGDBS); + if (cdsRefs != null) { - exon.addDBRef(new DBRefEntry(cdsRef)); - cdsAccId = cdsRef.getAccessionId(); + for (DBRefEntry cdsRef : cdsRefs) + { + exon.addDBRef(new DBRefEntry(cdsRef)); + cdsAccId = cdsRef.getAccessionId(); + } } - } - exon.setName(exon.getName() + "|" + cdsAccId); - exon.createDatasetSequence(); + exon.setName(exon.getName() + "|" + cdsAccId); + exon.createDatasetSequence(); - /* - * Build new mappings - from the same protein regions, but now to contiguous - * exons - */ - List exonRange = new ArrayList(); - exonRange.add(new int[] - { 1, newSequence.length() }); - MapList map = new MapList(exonRange, mapping.getMap().getToRanges(), 3, 1); - newMapping.addMap(exon.getDatasetSequence(), mapping.getTo(), map); + /* + * Build new mappings - from the same protein regions, but now to + * contiguous exons + */ + List exonRange = new ArrayList(); + exonRange.add(new int[] + { 1, newSequence.length() }); + MapList map = new MapList(exonRange, seqMapping.getMap() + .getToRanges(), + 3, 1); + newMapping.addMap(exon.getDatasetSequence(), seqMapping.getTo(), map); + MapList cdsToDnaMap = new MapList(dnaExonRanges, exonRange, 1, 1); + newMapping.addMap(dnaSeq, exon.getDatasetSequence(), cdsToDnaMap); - return exon; + exonSequences.add(exon); + } + return exonSequences; } }