X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fanalysis%2FAlignmentUtils.java;h=926317ab2b900c8194ef8bc1c59a496900bf84f9;hb=17e77c3f2949a0729322b4a8d907f3f34b6a9914;hp=7dec4ec40a399d7f9e7f5320a2c6e6a96104474f;hpb=e29e164306d88242fe615973eeebc8c4a49d1bb0;p=jalview.git diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index 7dec4ec..926317a 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -1,6 +1,6 @@ /* - * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) - * Copyright (C) $$Year-Rel$$ The Jalview Authors + * Jalview - A Sequence Alignment Editor and Viewer (Version 2.9) + * Copyright (C) 2015 The Jalview Authors * * This file is part of Jalview. * @@ -20,19 +20,6 @@ */ package jalview.analysis; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; -import java.util.TreeMap; - import jalview.datamodel.AlignedCodon; import jalview.datamodel.AlignedCodonFrame; import jalview.datamodel.Alignment; @@ -51,6 +38,20 @@ import jalview.util.DBRefUtils; import jalview.util.MapList; import jalview.util.MappingUtils; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; +import java.util.TreeMap; + /** * grab bag of useful alignment manipulation operations Expect these to be * refactored elsewhere at some point. @@ -76,18 +77,22 @@ public class AlignmentUtils for (SequenceI s : core.getSequences()) { SequenceI newSeq = s.deriveSequence(); - if (newSeq.getStart() > maxoffset + final int newSeqStart = newSeq.getStart() - 1; + if (newSeqStart > maxoffset && newSeq.getDatasetSequence().getStart() < s.getStart()) { - maxoffset = newSeq.getStart(); + maxoffset = newSeqStart; } sq.add(newSeq); } if (flankSize > -1) { - maxoffset = flankSize; + maxoffset = Math.min(maxoffset, flankSize); } - // now add offset to create a new expanded alignment + + /* + * now add offset left and right to create an expanded alignment + */ for (SequenceI s : sq) { SequenceI ds = s; @@ -97,8 +102,8 @@ public class AlignmentUtils } int s_end = s.findPosition(s.getStart() + s.getLength()); // find available flanking residues for sequence - int ustream_ds = s.getStart() - ds.getStart(), dstream_ds = ds - .getEnd() - s_end; + int ustream_ds = s.getStart() - ds.getStart(); + int dstream_ds = ds.getEnd() - s_end; // build new flanked sequence @@ -114,27 +119,27 @@ public class AlignmentUtils offset = maxoffset - flankSize; ustream_ds = flankSize; } - if (flankSize < dstream_ds) + if (flankSize <= dstream_ds) { - dstream_ds = flankSize; + dstream_ds = flankSize - 1; } } + // TODO use Character.toLowerCase to avoid creating String objects? char[] upstream = new String(ds.getSequence(s.getStart() - 1 - ustream_ds, s.getStart() - 1)).toLowerCase().toCharArray(); - char[] downstream = new String(ds.getSequence(s_end - 1, s_end + 1 + char[] downstream = new String(ds.getSequence(s_end - 1, s_end + dstream_ds)).toLowerCase().toCharArray(); char[] coreseq = s.getSequence(); char[] nseq = new char[offset + upstream.length + downstream.length + coreseq.length]; char c = core.getGapCharacter(); - // TODO could lowercase the flanking regions + int p = 0; for (; p < offset; p++) { nseq[p] = c; } - // s.setSequence(new String(upstream).toLowerCase()+new String(coreseq) + - // new String(downstream).toLowerCase()); + System.arraycopy(upstream, 0, nseq, p, upstream.length); System.arraycopy(coreseq, 0, nseq, p + upstream.length, coreseq.length); @@ -152,6 +157,7 @@ public class AlignmentUtils { for (AlignmentAnnotation aa : s.getAnnotation()) { + aa.adjustForAlignment(); // JAL-1712 fix newAl.addAnnotation(aa); } } @@ -222,8 +228,7 @@ public class AlignmentUtils * @param cdnaAlignment * @return */ - public static boolean mapProteinToCdna( - final AlignmentI proteinAlignment, + public static boolean mapProteinToCdna(final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment) { if (proteinAlignment == null || cdnaAlignment == null) @@ -408,8 +413,7 @@ public class AlignmentUtils if (cdnaLength != mappedLength && cdnaLength > 2 && String.valueOf(cdnaSeqChars, 0, 3).toUpperCase() - .equals( - ResidueProperties.START)) + .equals(ResidueProperties.START)) { cdnaStart += 3; cdnaLength -= 3; @@ -423,9 +427,8 @@ public class AlignmentUtils { return null; } - MapList map = new MapList(new int[] - { cdnaStart, cdnaEnd }, new int[] - { proteinStart, proteinEnd }, 3, 1); + MapList map = new MapList(new int[] { cdnaStart, cdnaEnd }, new int[] { + proteinStart, proteinEnd }, 3, 1); return map; } @@ -442,23 +445,26 @@ public class AlignmentUtils protected static boolean translatesAs(char[] cdnaSeqChars, int cdnaStart, char[] aaSeqChars) { + if (cdnaSeqChars == null || aaSeqChars == null) + { + return false; + } + int aaResidue = 0; for (int i = cdnaStart; i < cdnaSeqChars.length - 2 && aaResidue < aaSeqChars.length; i += 3, aaResidue++) { String codon = String.valueOf(cdnaSeqChars, i, 3); - final String translated = ResidueProperties.codonTranslate( - codon); + final String translated = ResidueProperties.codonTranslate(codon); /* - * ? allow X in protein to match untranslatable in dna ? + * allow * in protein to match untranslatable in dna */ final char aaRes = aaSeqChars[aaResidue]; - if ((translated == null || "STOP".equals(translated)) && aaRes == 'X') + if ((translated == null || "STOP".equals(translated)) && aaRes == '*') { continue; } - if (translated == null - || !(aaRes == translated.charAt(0))) + if (translated == null || !(aaRes == translated.charAt(0))) { // debug // System.out.println(("Mismatch at " + i + "/" + aaResidue + ": " @@ -489,7 +495,8 @@ public class AlignmentUtils boolean preserveUnmappedGaps) { /* - * Get any mappings from the source alignment to the target (dataset) sequence. + * Get any mappings from the source alignment to the target (dataset) + * sequence. */ // TODO there may be one AlignedCodonFrame per dataset sequence, or one with // all mappings. Would it help to constrain this? @@ -498,7 +505,7 @@ public class AlignmentUtils { return false; } - + /* * Locate the aligned source sequence whose dataset sequence is mapped. We * just take the first match here (as we can't align cDNA like more than one @@ -515,7 +522,7 @@ public class AlignmentUtils break; } } - + if (alignFrom == null) { return false; @@ -540,15 +547,15 @@ public class AlignmentUtils * @param preserveMappedGaps */ public static void alignSequenceAs(SequenceI alignTo, - SequenceI alignFrom, - AlignedCodonFrame mapping, String myGap, char sourceGap, - boolean preserveMappedGaps, boolean preserveUnmappedGaps) + SequenceI alignFrom, AlignedCodonFrame mapping, String myGap, + char sourceGap, boolean preserveMappedGaps, + boolean preserveUnmappedGaps) { // TODO generalise to work for Protein-Protein, dna-dna, dna-protein final char[] thisSeq = alignTo.getSequence(); final char[] thatAligned = alignFrom.getSequence(); StringBuilder thisAligned = new StringBuilder(2 * thisSeq.length); - + // aligned and dataset sequence positions, all base zero int thisSeqPos = 0; int sourceDsPos = 0; @@ -685,8 +692,8 @@ public class AlignmentUtils */ protected static int calculateGapsToInsert(boolean preserveMappedGaps, boolean preserveUnmappedGaps, int sourceGapMappedLength, - boolean inExon, int trailingGapLength, - int intronLength, final boolean startOfCodon) + boolean inExon, int trailingGapLength, int intronLength, + final boolean startOfCodon) { int gapsToAdd = 0; if (startOfCodon) @@ -810,8 +817,8 @@ public class AlignmentUtils // mapping is from protein to nucleotide toDna = true; // should ideally get gap count ratio from mapping - gap = String.valueOf(new char[] - { gapCharacter, gapCharacter, gapCharacter }); + gap = String.valueOf(new char[] { gapCharacter, gapCharacter, + gapCharacter }); } else { @@ -856,7 +863,7 @@ public class AlignmentUtils { mapping.markMappedRegion(seq, pos, sr); } - newseq.append(sr.toString()); + newseq.append(sr.getCharacters()); if (first) { first = false; @@ -890,6 +897,9 @@ public class AlignmentUtils */ public static int alignProteinAsDna(AlignmentI protein, AlignmentI dna) { + List unmappedProtein = new ArrayList(); + unmappedProtein.addAll(protein.getSequences()); + Set mappings = protein.getCodonFrames(); /* @@ -910,10 +920,11 @@ public class AlignmentUtils { addCodonPositions(dnaSeq, prot, protein.getGapCharacter(), seqMap, alignedCodons); + unmappedProtein.remove(prot); } } } - return alignProteinAs(protein, alignedCodons); + return alignProteinAs(protein, alignedCodons, unmappedProtein); } /** @@ -924,10 +935,12 @@ public class AlignmentUtils * @param alignedCodons * an ordered map of codon positions (columns), with sequence/peptide * values present in each column + * @param unmappedProtein * @return */ protected static int alignProteinAs(AlignmentI protein, - Map> alignedCodons) + Map> alignedCodons, + List unmappedProtein) { /* * Prefill aligned sequences with gaps before inserting aligned protein @@ -939,15 +952,18 @@ public class AlignmentUtils String allGaps = String.valueOf(gaps); for (SequenceI seq : protein.getSequences()) { - seq.setSequence(allGaps); + if (!unmappedProtein.contains(seq)) + { + seq.setSequence(allGaps); + } } int column = 0; for (AlignedCodon codon : alignedCodons.keySet()) { - final Map columnResidues = alignedCodons.get(codon); - for (Entry entry : columnResidues - .entrySet()) + final Map columnResidues = alignedCodons + .get(codon); + for (Entry entry : columnResidues.entrySet()) { // place translated codon at its column position in sequence entry.getKey().getSequence()[column] = entry.getValue().charAt(0); @@ -974,8 +990,7 @@ public class AlignmentUtils * the map we are building up */ static void addCodonPositions(SequenceI dna, SequenceI protein, - char gapChar, - Mapping seqMap, + char gapChar, Mapping seqMap, Map> alignedCodons) { Iterator codons = seqMap.getCodonIterator(dna, gapChar); @@ -1010,6 +1025,11 @@ public class AlignmentUtils */ public static boolean isMappable(AlignmentI al1, AlignmentI al2) { + if (al1 == null || al2 == null) + { + return false; + } + /* * Require one nucleotide and one protein */ @@ -1042,18 +1062,26 @@ public class AlignmentUtils * @param mappings * @return */ - public static boolean isMappable(SequenceI dnaSeq, SequenceI proteinSeq, - Set mappings) + protected static boolean isMappable(SequenceI dnaSeq, + SequenceI proteinSeq, Set mappings) { - SequenceI dnaDs = dnaSeq.getDatasetSequence() == null ? dnaSeq : dnaSeq.getDatasetSequence(); + if (dnaSeq == null || proteinSeq == null) + { + return false; + } + + SequenceI dnaDs = dnaSeq.getDatasetSequence() == null ? dnaSeq : dnaSeq + .getDatasetSequence(); SequenceI proteinDs = proteinSeq.getDatasetSequence() == null ? proteinSeq : proteinSeq.getDatasetSequence(); - + /* * Already mapped? */ - for (AlignedCodonFrame mapping : mappings) { - if ( proteinDs == mapping.getAaForDnaSeq(dnaDs)) { + for (AlignedCodonFrame mapping : mappings) + { + if (proteinDs == mapping.getAaForDnaSeq(dnaDs)) + { return true; } } @@ -1081,7 +1109,8 @@ public class AlignmentUtils * the alignment to check for presence of annotations */ public static void findAddableReferenceAnnotations( - List sequenceScope, Map labelForCalcId, + List sequenceScope, + Map labelForCalcId, final Map> candidates, AlignmentI al) { @@ -1089,7 +1118,7 @@ public class AlignmentUtils { return; } - + /* * For each sequence in scope, make a list of any annotations on the * underlying dataset sequence which are not already on the alignment. @@ -1117,8 +1146,7 @@ public class AlignmentUtils * sequence. */ final Iterable matchedAlignmentAnnotations = al - .findAnnotations(seq, dsann.getCalcId(), - dsann.label); + .findAnnotations(seq, dsann.getCalcId(), dsann.label); if (!matchedAlignmentAnnotations.iterator().hasNext()) { result.add(dsann); @@ -1166,7 +1194,7 @@ public class AlignmentUtils endRes = selectionGroup.getEndRes(); } copyAnn.restrict(startRes, endRes); - + /* * Add to the sequence (sets copyAnn.datasetSequence), unless the * original annotation is already on the sequence. @@ -1204,8 +1232,7 @@ public class AlignmentUtils Collection types, List forSequences, boolean anyType, boolean doShow) { - for (AlignmentAnnotation aa : al - .getAlignmentAnnotation()) + for (AlignmentAnnotation aa : al.getAlignmentAnnotation()) { if (anyType || types.contains(aa.label)) { @@ -1235,7 +1262,7 @@ public class AlignmentUtils /** * Returns true if seq1 has a cross-reference to seq2. Currently this assumes - * that sequence name is structured as Source|AccessId. + * that sequence name is structured as Source|AccessionId. * * @param seq1 * @param seq2 @@ -1278,9 +1305,9 @@ public class AlignmentUtils public static AlignmentI makeExonAlignment(SequenceI[] dna, Set mappings) { - Set newMappings = new HashSet(); + Set newMappings = new LinkedHashSet(); List exonSequences = new ArrayList(); - + for (SequenceI dnaSeq : dna) { final SequenceI ds = dnaSeq.getDatasetSequence(); @@ -1346,8 +1373,8 @@ public class AlignmentUtils /* * Get the codon regions as { [2, 5], [7, 12], [14, 14] etc } */ - List exonRanges = seqMapping.getMap().getFromRanges(); - for (int[] range : exonRanges) + final List dnaExonRanges = seqMapping.getMap().getFromRanges(); + for (int[] range : dnaExonRanges) { for (int pos = range[0]; pos <= range[1]; pos++) { @@ -1383,12 +1410,12 @@ public class AlignmentUtils * contiguous exons */ List exonRange = new ArrayList(); - exonRange.add(new int[] - { 1, newSequence.length() }); + exonRange.add(new int[] { 1, newSequence.length() }); MapList map = new MapList(exonRange, seqMapping.getMap() - .getToRanges(), - 3, 1); + .getToRanges(), 3, 1); newMapping.addMap(exon.getDatasetSequence(), seqMapping.getTo(), map); + MapList cdsToDnaMap = new MapList(dnaExonRanges, exonRange, 1, 1); + newMapping.addMap(dnaSeq, exon.getDatasetSequence(), cdsToDnaMap); exonSequences.add(exon); }