int alignedCount = 0;
for (SequenceI dnaSeq : dna.getSequences())
{
- if (alignCdsSequenceAsProtein(dnaSeq, protein, mappings))
+ if (alignCdsSequenceAsProtein(dnaSeq, protein, mappings,
+ dna.getGapCharacter()))
{
alignedCount++;
}
}
/**
- * Helper method to align (if possible) the dna sequence (cds only) to match
- * the alignment of a mapped protein sequence
+ * Helper method to align (if possible) the dna sequence to match the
+ * alignment of a mapped protein sequence. This is currently limited to
+ * handling coding sequence only.
*
- * @param dnaSeq
+ * @param cdsSeq
* @param protein
* @param mappings
+ * @param gapChar
* @return
*/
- static boolean alignCdsSequenceAsProtein(SequenceI dnaSeq,
- AlignmentI protein, List<AlignedCodonFrame> mappings)
+ static boolean alignCdsSequenceAsProtein(SequenceI cdsSeq,
+ AlignmentI protein, List<AlignedCodonFrame> mappings, char gapChar)
{
+ SequenceI cdsDss = cdsSeq.getDatasetSequence();
+ if (cdsDss == null)
+ {
+ System.err
+ .println("alignCdsSequenceAsProtein needs aligned sequence!");
+ return false;
+ }
+
List<AlignedCodonFrame> dnaMappings = MappingUtils
- .findMappingsForSequence(dnaSeq, mappings);
+ .findMappingsForSequence(cdsSeq, mappings);
for (AlignedCodonFrame mapping : dnaMappings)
{
- SequenceI peptide = mapping.findAlignedSequence(dnaSeq, protein);
+ SequenceI peptide = mapping.findAlignedSequence(cdsSeq, protein);
+ int peptideLength = peptide.getLength();
if (peptide != null)
{
- Mapping map = mapping.getMappingBetween(dnaSeq, peptide);
+ Mapping map = mapping.getMappingBetween(cdsSeq, peptide);
if (map != null)
{
+ MapList mapList = map.getMap();
+ if (map.getTo() == peptide.getDatasetSequence())
+ {
+ mapList = mapList.getInverse();
+ }
+ int cdsLength = cdsDss.getLength();
+ int mappedFromLength = MappingUtils.getLength(mapList
+ .getFromRanges());
+ int mappedToLength = MappingUtils
+ .getLength(mapList.getToRanges());
+ boolean addStopCodon = (cdsLength == mappedFromLength * 3 + 3)
+ || (peptide.getDatasetSequence().getLength() == mappedFromLength - 1);
+ if (cdsLength != mappedToLength && !addStopCodon)
+ {
+ System.err
+ .println(String
+ .format("Can't align cds as protein (length mismatch %d/%d): %s",
+ cdsLength, mappedToLength,
+ cdsSeq.getName()));
+ }
+
/*
- * traverse peptide adding gaps or codons to new cds sequence
+ * pre-fill the aligned cds sequence with gaps
*/
- MapList mapList = map.getMap();
+ char[] alignedCds = new char[peptideLength * 3
+ + (addStopCodon ? 3 : 0)];
+ Arrays.fill(alignedCds, gapChar);
+
+ /*
+ * walk over the aligned peptide sequence and insert mapped
+ * codons for residues in the aligned cds sequence
+ */
+ char[] alignedPeptide = peptide.getSequence();
+ char[] nucleotides = cdsDss.getSequence();
+ int copiedBases = 0;
+ int cdsStart = cdsDss.getStart();
+ int proteinPos = peptide.getStart() - 1;
+ int cdsCol = 0;
+ for (char residue : alignedPeptide)
+ {
+ if (Comparison.isGap(residue))
+ {
+ cdsCol += 3;
+ }
+ else
+ {
+ proteinPos++;
+ int[] codon = mapList.locateInTo(proteinPos, proteinPos);
+ if (codon == null)
+ {
+ // e.g. incomplete start codon, X in peptide
+ cdsCol += 3;
+ }
+ else
+ {
+ for (int j = codon[0]; j <= codon[1]; j++)
+ {
+ char mappedBase = nucleotides[j - cdsStart];
+ alignedCds[cdsCol++] = mappedBase;
+ copiedBases++;
+ }
+ }
+ }
+ }
+
+ /*
+ * append stop codon if not mapped from protein,
+ * closing it up to the end of the mapped sequence
+ */
+ if (copiedBases == nucleotides.length - 3)
+ {
+ for (int i = alignedCds.length - 1; i >= 0; i--)
+ {
+ if (!Comparison.isGap(alignedCds[i]))
+ {
+ cdsCol = i + 1; // gap just after end of sequence
+ break;
+ }
+ }
+ for (int i = nucleotides.length - 3; i < nucleotides.length; i++)
+ {
+ alignedCds[cdsCol++] = nucleotides[i];
+ }
+ }
+ cdsSeq.setSequence(new String(alignedCds));
+ return true;
}
}
}
*/
public static int alignAs(AlignmentI unaligned, AlignmentI aligned)
{
+ /*
+ * easy case - aligning a copy of aligned sequences
+ */
+ if (alignAsSameSequences(unaligned, aligned))
+ {
+ return unaligned.getHeight();
+ }
+
+ /*
+ * fancy case - aligning via mappings between sequences
+ */
List<SequenceI> unmapped = new ArrayList<SequenceI>();
Map<Integer, Map<SequenceI, Character>> columnMap = buildMappedColumnsMap(
unaligned, aligned, unmapped);
if (!unmapped.contains(seq))
{
char[] newSeq = new char[width];
- Arrays.fill(newSeq, gap);
+ Arrays.fill(newSeq, gap); // JBPComment - doubt this is faster than the
+ // Integer iteration below
int newCol = 0;
int lastCol = 0;
System.arraycopy(newSeq, 0, tmp, 0, lastCol + 1);
newSeq = tmp;
}
+ // TODO: optimise SequenceI to avoid char[]->String->char[]
seq.setSequence(String.valueOf(newSeq));
realignedCount++;
}
}
/**
+ * If unaligned and aligned sequences share the same dataset sequences, then
+ * simply copies the aligned sequences to the unaligned sequences and returns
+ * true; else returns false
+ *
+ * @param unaligned
+ * @param aligned
+ * @return
+ */
+ static boolean alignAsSameSequences(AlignmentI unaligned,
+ AlignmentI aligned)
+ {
+ if (aligned.getDataset() == null || unaligned.getDataset() == null)
+ {
+ return false; // should only pass alignments with datasets here
+ }
+
+ Map<SequenceI, SequenceI> alignedDatasets = new HashMap<SequenceI, SequenceI>();
+ for (SequenceI seq : aligned.getSequences())
+ {
+ alignedDatasets.put(seq.getDatasetSequence(), seq);
+ }
+
+ /*
+ * first pass - check whether all sequences to be aligned share a dataset
+ * sequence with an aligned sequence
+ */
+ for (SequenceI seq : unaligned.getSequences())
+ {
+ if (!alignedDatasets.containsKey(seq.getDatasetSequence()))
+ {
+ return false;
+ }
+ }
+
+ /*
+ * second pass - copy aligned sequences
+ */
+ for (SequenceI seq : unaligned.getSequences())
+ {
+ SequenceI alignedSequence = alignedDatasets.get(seq
+ .getDatasetSequence());
+ // TODO: getSequenceAsString() will be deprecated in the future
+ // TODO: need to leave to SequenceI implementor to update gaps
+ seq.setSequence(alignedSequence.getSequenceAsString());
+ }
+
+ return true;
+ }
+
+ /**
* Returns a map whose key is alignment column number (base 1), and whose
* values are a map of sequence characters in that column.
*