X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fanalysis%2FAlignmentUtils.java;h=74066d778593d437ce44a31d947b716d0681b08d;hb=d2299844ae932a515a5007f30caf766a2c83ad97;hp=33a54e8e65530c21f6c3d6c556039a135ab56936;hpb=9f06569004895c680687699673abfe462b6a2086;p=jalview.git diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index 33a54e8..74066d7 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -24,6 +24,7 @@ import static jalview.io.gff.GffConstants.CLINICAL_SIGNIFICANCE; import jalview.datamodel.AlignedCodon; import jalview.datamodel.AlignedCodonFrame; +import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentAnnotation; import jalview.datamodel.AlignmentI; @@ -1404,31 +1405,103 @@ public class AlignmentUtils * * @param dna * aligned dna sequences - * @param mappings - * from dna to protein - * @param al + * @param dataset + * - throws error if not given a dataset * @return an alignment whose sequences are the cds-only parts of the dna * sequences (or null if no mappings are found) */ public static AlignmentI makeCdsAlignment(SequenceI[] dna, - List mappings, AlignmentI al) + AlignmentI dataset) { + if (dataset.getDataset() != null) + { + throw new Error( + "IMPLEMENTATION ERROR: dataset.getDataset() must be null!"); + } List cdsSeqs = new ArrayList(); + List mappings = dataset.getCodonFrames(); + + /* + * construct CDS sequences from the (cds-to-protein) mappings made earlier; + * this makes it possible to model multiple products from dna (e.g. EMBL); + * however it does mean we don't have the EMBL protein_id (a property on + * the CDS features) in order to make the CDS sequence name :-( + */ for (SequenceI seq : dna) { - AlignedCodonFrame cdsMappings = new AlignedCodonFrame(); + SequenceI seqDss = seq.getDatasetSequence() == null ? seq : seq + .getDatasetSequence(); List seqMappings = MappingUtils .findMappingsForSequence(seq, mappings); - List alignmentMappings = al.getCodonFrames(); for (AlignedCodonFrame mapping : seqMappings) { - for (Mapping aMapping : mapping.getMappingsFromSequence(seq)) + List mappingsFromSequence = mapping.getMappingsFromSequence(seq); + + for (Mapping aMapping : mappingsFromSequence) { - SequenceI cdsSeq = makeCdsSequence(seq.getDatasetSequence(), - aMapping); + if (aMapping.getMap().getFromRatio() == 1) + { + /* + * not a dna-to-protein mapping (likely dna-to-cds) + */ + continue; + } + + /* + * check for an existing CDS sequence i.e. a 3:1 mapping to + * the dna mapping's product + */ + SequenceI cdsSeq = null; + // TODO better mappings collection data model so we can do + // a table lookup instead of double loops to find mappings + SequenceI proteinProduct = aMapping.getTo(); + for (AlignedCodonFrame acf : MappingUtils + .findMappingsForSequence(proteinProduct, mappings)) + { + for (SequenceToSequenceMapping map : acf.getMappings()) + { + if (map.getMapping().getMap().getFromRatio() == 3 + && proteinProduct == map.getMapping().getTo() + && seqDss != map.getFromSeq()) + { + /* + * found a 3:1 mapping to the protein product which is not + * from the dna sequence...assume it is from the CDS sequence + * TODO mappings data model that brings together related + * dna-cds-protein mappings in one object + */ + cdsSeq = map.getFromSeq(); + } + } + } + if (cdsSeq != null) + { + /* + * mappings are always to dataset sequences so create an aligned + * sequence to own it; add the dataset sequence to the dataset + */ + SequenceI derivedSequence = cdsSeq.deriveSequence(); + cdsSeqs.add(derivedSequence); + if (!dataset.getSequences().contains(cdsSeq)) + { + dataset.addSequence(cdsSeq); + } + continue; + } + + /* + * didn't find mapped CDS sequence - construct it and add + * its dataset sequence to the dataset + */ + cdsSeq = makeCdsSequence(seq.getDatasetSequence(), aMapping); + SequenceI cdsSeqDss = cdsSeq.createDatasetSequence(); cdsSeqs.add(cdsSeq); - + if (!dataset.getSequences().contains(cdsSeqDss)) + { + dataset.addSequence(cdsSeqDss); + } + /* * add a mapping from CDS to the (unchanged) mapped to range */ @@ -1437,16 +1510,29 @@ public class AlignmentUtils MapList map = new MapList(cdsRange, aMapping.getMap() .getToRanges(), aMapping.getMap().getFromRatio(), aMapping.getMap().getToRatio()); - cdsMappings.addMap(cdsSeq, aMapping.getTo(), map); + AlignedCodonFrame cdsToProteinMapping = new AlignedCodonFrame(); + cdsToProteinMapping.addMap(cdsSeq, proteinProduct, map); + + /* + * guard against duplicating the mapping if repeating this action + */ + if (!mappings.contains(cdsToProteinMapping)) + { + mappings.add(cdsToProteinMapping); + } /* * add another mapping from original 'from' range to CDS */ + AlignedCodonFrame dnaToProteinMapping = new AlignedCodonFrame(); map = new MapList(aMapping.getMap().getFromRanges(), cdsRange, 1, 1); - cdsMappings.addMap(seq.getDatasetSequence(), cdsSeq, map); + dnaToProteinMapping.addMap(seq.getDatasetSequence(), cdsSeq, map); + if (!mappings.contains(dnaToProteinMapping)) + { + mappings.add(dnaToProteinMapping); + } - alignmentMappings.add(cdsMappings); /* * transfer any features on dna that overlap the CDS @@ -1456,20 +1542,9 @@ public class AlignmentUtils } } - /* - * add CDS seqs to shared dataset - */ - Alignment dataset = al.getDataset(); - for (SequenceI seq : cdsSeqs) - { - if (!dataset.getSequences().contains(seq.getDatasetSequence())) - { - dataset.addSequence(seq.getDatasetSequence()); - } - } AlignmentI cds = new Alignment(cdsSeqs.toArray(new SequenceI[cdsSeqs .size()])); - cds.setDataset(dataset); + cds.setDataset((Alignment) dataset); return cds; } @@ -1481,7 +1556,7 @@ public class AlignmentUtils * * @param seq * @param mapping - * @return + * @return CDS sequence (as a dataset sequence) */ static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping) { @@ -1513,7 +1588,6 @@ public class AlignmentUtils SequenceI newSeq = new Sequence(seq.getName() + "|" + mapping.getTo().getName(), newSeqChars, 1, newPos); - newSeq.createDatasetSequence(); return newSeq; } @@ -1799,17 +1873,20 @@ public class AlignmentUtils * sort to get sequence features in start position order * - would be better to store in Sequence as a TreeSet or NCList? */ - Arrays.sort(peptide.getSequenceFeatures(), - new Comparator() - { - @Override - public int compare(SequenceFeature o1, SequenceFeature o2) + if (peptide.getSequenceFeatures() != null) + { + Arrays.sort(peptide.getSequenceFeatures(), + new Comparator() { - int c = Integer.compare(o1.getBegin(), o2.getBegin()); - return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd()) - : c; - } - }); + @Override + public int compare(SequenceFeature o1, SequenceFeature o2) + { + int c = Integer.compare(o1.getBegin(), o2.getBegin()); + return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd()) + : c; + } + }); + } return count; }