2 * Jalview - A Sequence Alignment Editor and Viewer (Version 2.4)
3 * Copyright (C) 2008 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
19 package jalview.datamodel;
21 import java.util.Hashtable;
23 import jalview.analysis.*;
24 import jalview.util.*;
30 * start(inclusive) and end(exclusive) of subsequence on refseq
32 private int start, end;
33 private SequenceI refseq = null;
34 private Hashtable seqProps;
36 * Reference dataset sequence for the cigar string
39 public SequenceI getRefSeq()
46 * @return int start index of cigar ops on refSeq
55 * @return int end index (exclusive) of cigar ops on refSeq
63 * Returns sequence as a string with cigar operations applied to it
66 public String getSequenceString(char GapChar)
68 return (length == 0) ? "" :
69 (String) getSequenceAndDeletions(refseq.getSequenceAsString(start, end),
74 * recreates a gapped and edited version of RefSeq or null for an empty cigar string
77 public SequenceI getSeq(char GapChar)
80 if (refseq == null || length == 0)
84 Object[] edit_result = getSequenceAndDeletions(refseq.getSequenceAsString(
87 if (edit_result == null)
90 "Implementation Error - unexpected null from getSequenceAndDeletions");
92 int bounds[] = (int[]) edit_result[1];
93 seq = new Sequence(refseq.getName(), (String) edit_result[0],
94 refseq.getStart() + start + bounds[0],
95 refseq.getStart() + start +
96 ( (bounds[2] == 0) ? -1 : bounds[2]));
97 seq.setDescription(refseq.getDescription());
98 int sstart = seq.getStart(),
100 // seq.checkValidRange(); probably not needed
101 // recover local properties if present
104 // this recovers dataset sequence reference as well as local features, names, start/end settings.
105 SeqsetUtils.SeqCharacterUnhash(seq, seqProps);
107 // ensure dataset sequence is up to date from local reference
108 seq.setDatasetSequence(refseq);
109 seq.setStart(sstart);
115 We don't allow this - refseq is given at construction time only
116 public void setSeq(SequenceI seq) {
121 * internal constructor - sets seq to a gapless sequence derived from seq
122 * and prepends any 'D' operations needed to get to the first residue of seq.
123 * @param seq SequenceI
124 * @param initialDeletion true to mark initial dataset sequence residues as deleted in subsequence
125 * @param _s index of first position in seq
126 * @param _e index after last position in (possibly gapped) seq
127 * @return true if gaps are present in seq
129 private boolean _setSeq(SequenceI seq, boolean initialDeletion, int _s,
132 boolean hasgaps = false;
135 throw new Error("Implementation Error - _setSeq(null,...)");
139 throw new Error("Implementation Error: _s=" + _s);
141 String seq_string = seq.getSequenceAsString();
142 if (_e == 0 || _e < _s || _e > seq_string.length())
144 _e = seq_string.length();
146 // resolve start and end positions relative to ungapped reference sequence
147 start = seq.findPosition(_s) - seq.getStart();
148 end = seq.findPosition(_e) - seq.getStart();
149 int l_ungapped = end - start;
150 // Find correct sequence to reference and correct start and end - if necessary
151 SequenceI ds = seq.getDatasetSequence();
154 // make a new dataset sequence
155 String ungapped = AlignSeq.extractGaps(jalview.util.Comparison.GapChars,
156 new String(seq_string));
157 l_ungapped = ungapped.length();
158 // check that we haven't just duplicated an ungapped sequence.
159 if (l_ungapped == seq.getLength())
165 ds = new Sequence(seq.getName(), ungapped,
167 seq.getStart() + ungapped.length() - 1);
168 // JBPNote: this would be consistent but may not be useful
169 // seq.setDatasetSequence(ds);
172 // add in offset between seq and the dataset sequence
173 if (ds.getStart() < seq.getStart())
175 int offset = seq.getStart() - ds.getStart();
178 // absolute cigar string
179 addDeleted(_s + offset);
185 // normal behaviour - just mark start and end subsequence
193 // any gaps to process ?
194 if (l_ungapped != (_e - _s))
200 // copy over local properties for the sequence instance of the refseq
201 seqProps = SeqsetUtils.SeqCharacterHash(seq);
203 if (end > ds.getLength())
205 throw new Error("SeqCigar: Possible implementation error: sequence is longer than dataset sequence");
206 // end = ds.getLength();
213 * directly initialise a cigar object with a sequence of range, operation pairs and a sequence to apply it to.
214 * operation and range should be relative to the seq.getStart()'th residue of the dataset seq resolved from seq.
215 * @param seq SequenceI
216 * @param operation char[]
219 public SeqCigar(SequenceI seq, char operation[], int range[])
224 throw new Error("Implementation Bug. Null seq !");
226 if (operation.length != range.length)
228 throw new Error("Implementation Bug. Cigar Operation list!= range list");
231 if (operation != null)
233 this.operation = new char[operation.length + _inc_length];
234 this.range = new int[operation.length + _inc_length];
236 if (_setSeq(seq, false, 0, 0))
238 throw new Error("NOT YET Implemented: Constructing a Cigar object from a cigar string and a gapped sequence.");
240 for (int i = this.length, j = 0; j < operation.length; i++, j++)
242 char op = operation[j];
243 if (op != M && op != I && op != D)
246 "Implementation Bug. Cigar Operation '" + j + "' '" + op +
247 "' not one of '" + M + "', '" + I + "', or '" + D + "'.");
249 this.operation[i] = op;
250 this.range[i] = range[j];
252 this.length += operation.length;
256 this.operation = null;
259 if (_setSeq(seq, false, 0, 0))
261 throw new Error("NOT YET Implemented: Constructing a Cigar object from a cigar string and a gapped sequence.");
267 * add range matched residues to cigar string
270 public void addMatch(int range)
272 this.addOperation(M, range);
277 * insertion and match operations based on seq to the cigar up to
278 * the endpos column of seq.
280 * @param cigar CigarBase
281 * @param seq SequenceI
282 * @param startpos int
284 * @param initialDeletions if true then initial deletions will be added from start of seq to startpos
286 protected static void addSequenceOps(CigarBase cigar, SequenceI seq,
287 int startpos, int endpos,
288 boolean initialDeletions)
292 int p = 0, res = seq.getLength();
294 if (!initialDeletions)
301 boolean isGap = (p < res) ? jalview.util.Comparison.isGap(seq.getCharAt(p)) : true;
302 if ( (startpos <= p) && (p <= endpos))
306 if (range > 0 && op != I)
308 cigar.addOperation(op, range);
316 if (range > 0 && op != M)
318 cigar.addOperation(op, range);
329 if (range > 0 && op != D)
331 cigar.addOperation(op, range);
339 // do nothing - insertions are not made in flanking regions
346 cigar.addOperation(op, range);
351 * create a cigar string for given sequence
352 * @param seq SequenceI
354 public SeqCigar(SequenceI seq)
359 throw new Error("Implementation error for new Cigar(SequenceI)");
361 _setSeq(seq, false, 0, 0);
362 // there is still work to do
363 addSequenceOps(this, seq, 0, seq.getLength() - 1, false);
367 * Create Cigar from a range of gaps and residues on a sequence object
368 * @param seq SequenceI
369 * @param start int - first column in range
370 * @param end int - last column in range
372 public SeqCigar(SequenceI seq, int start, int end)
377 throw new Error("Implementation error for new Cigar(SequenceI)");
379 _setSeq(seq, false, start, end + 1);
380 // there is still work to do
381 addSequenceOps(this, seq, start, end, false);
385 * Create a cigar object from a cigar string like '[<I|D|M><range>]+'
386 * Will fail if the given seq already contains gaps (JBPNote: future implementation will fix)
387 * @param seq SequenceI object resolvable to a dataset sequence
388 * @param cigarString String
391 public static SeqCigar parseCigar(SequenceI seq, String cigarString)
394 Object[] opsandrange = parseCigarString(cigarString);
395 return new SeqCigar(seq, (char[]) opsandrange[0], (int[]) opsandrange[1]);
401 * @param alseqs SeqCigar[]
402 * @param gapCharacter char
403 * @return SequenceI[]
405 public static SequenceI[] createAlignmentSequences(SeqCigar[] alseqs,
406 char gapCharacter, ColumnSelection colsel, int[] segments)
408 SequenceI[] seqs = new SequenceI[alseqs.length];
409 StringBuffer[] g_seqs = new StringBuffer[alseqs.length];
410 String[] alseqs_string = new String[alseqs.length];
411 Object[] gs_regions = new Object[alseqs.length];
412 for (int i = 0; i < alseqs.length; i++)
414 alseqs_string[i] = alseqs[i].getRefSeq().
415 getSequenceAsString(alseqs[i].start, alseqs[i].end);
416 gs_regions[i] = alseqs[i].getSequenceAndDeletions(alseqs_string[i],
417 gapCharacter); // gapped sequence, {start, start col, end. endcol}, hidden regions {{start, end, col}})
418 if (gs_regions[i] == null)
420 throw new Error("Implementation error: " + i +
421 "'th sequence Cigar has no operations.");
423 g_seqs[i] = new StringBuffer( (String) ( (Object[]) gs_regions[i])[0]); // the visible gapped sequence
425 // Now account for insertions. (well - deletions)
426 // this is complicated because we must keep track of shifted positions in each sequence
427 ShiftList shifts = new ShiftList();
428 for (int i = 0; i < alseqs.length; i++)
430 Object[] gs_region = ( (Object[]) ( (Object[]) gs_regions[i])[2]);
431 if (gs_region != null)
434 for (int hr = 0; hr < gs_region.length; hr++)
436 int[] region = (int[]) gs_region[hr];
437 char[] insert = new char[region[1] - region[0] + 1];
438 for (int s = 0; s < insert.length; s++)
440 insert[s] = gapCharacter;
442 int inspos = shifts.shift(region[2]); // resolve insertion position in current alignment frame of reference
443 for (int s = 0; s < alseqs.length; s++)
447 if (g_seqs[s].length() <= inspos)
449 // prefix insertion with more gaps.
450 for (int l = inspos - g_seqs[s].length(); l > 0; l--)
452 g_seqs[s].append(gapCharacter); // to debug - use a diffferent gap character here
455 g_seqs[s].insert(inspos, insert);
459 g_seqs[s].insert(inspos,
460 alseqs_string[i].substring(region[0],
464 shifts.addShift(region[2], insert.length); // update shift in alignment frame of reference
465 if (segments == null)
467 // add a hidden column for this deletion
468 colsel.hideColumns(inspos, inspos + insert.length - 1);
473 for (int i = 0; i < alseqs.length; i++)
475 int[] bounds = ( (int[]) ( (Object[]) gs_regions[i])[1]);
476 SequenceI ref = alseqs[i].getRefSeq();
477 seqs[i] = new Sequence(ref.getName(), g_seqs[i].toString(),
478 ref.getStart() + alseqs[i].start + bounds[0],
479 ref.getStart() + alseqs[i].start +
480 (bounds[2] == 0 ? -1 : bounds[2]));
481 seqs[i].setDatasetSequence(ref);
482 seqs[i].setDescription(ref.getDescription());
484 if (segments != null)
486 for (int i = 0; i < segments.length; i += 3)
488 //int start=shifts.shift(segments[i]-1)+1;
489 //int end=shifts.shift(segments[i]+segments[i+1]-1)-1;
490 colsel.hideColumns(segments[i + 1],
491 segments[i + 1] + segments[i + 2] - 1);
498 * non rigorous testing
502 * @param seq Sequence
503 * @param ex_cs_gapped String
506 public static String testCigar_string(Sequence seq, String ex_cs_gapped)
508 SeqCigar c_sgapped = new SeqCigar(seq);
509 String cs_gapped = c_sgapped.getCigarstring();
510 if (!cs_gapped.equals(ex_cs_gapped))
512 System.err.println("Failed getCigarstring: incorect string '" + cs_gapped +
513 "' != " + ex_cs_gapped);
518 public static boolean testSeqRecovery(SeqCigar gen_sgapped,
521 // this is non-rigorous - start and end recovery is not tested.
522 SequenceI gen_sgapped_s = gen_sgapped.getSeq('-');
523 if (!gen_sgapped_s.getSequence().equals(s_gapped.getSequence()))
525 System.err.println("Couldn't reconstruct sequence.\n" +
526 gen_sgapped_s.getSequenceAsString() + "\n" +
527 s_gapped.getSequenceAsString());
533 public static void main(String argv[])
537 Sequence s = new Sequence("MySeq",
539 "asdfktryasdtqwrtsaslldddptyipqqwaslchvhttt",
542 Sequence s_gapped = new Sequence("MySeq",
544 "----asdf------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhttt",
546 String ex_cs_gapped = "4I4M6I6M3I11M4I12M4I9M";
547 s_gapped.setDatasetSequence(s);
549 Sequence s_subsequence_gapped = new Sequence("MySeq",
551 "------ktryas---dtqwrtsasll----dddptyipqqwa----slchvh",
554 s_subsequence_gapped.setDatasetSequence(s);
555 SeqCigar c_null = new SeqCigar(s);
556 String cs_null = c_null.getCigarstring();
557 if (!cs_null.equals("42M"))
560 "Failed to recover ungapped sequence cigar operations:" +
561 ( (cs_null == "") ? "empty string" : cs_null));
563 testCigar_string(s_gapped, ex_cs_gapped);
564 SeqCigar gen_sgapped = SeqCigar.parseCigar(s, ex_cs_gapped);
565 if (!gen_sgapped.getCigarstring().equals(ex_cs_gapped))
567 System.err.println("Failed parseCigar(" + ex_cs_gapped +
568 ")->getCigarString()->'" + gen_sgapped.getCigarstring() +
571 testSeqRecovery(gen_sgapped, s_gapped);
572 // Test dataset resolution
573 SeqCigar sub_gapped = new SeqCigar(s_subsequence_gapped);
574 if (!testSeqRecovery(sub_gapped, s_subsequence_gapped))
576 System.err.println("Failed recovery for subsequence of dataset sequence");
579 if (sub_gapped.getWidth() != sub_gapped_s.length())
581 System.err.println("Failed getWidth()");
584 sub_gapped.getFullWidth();
585 if (sub_gapped.hasDeletedRegions())
587 System.err.println("hasDeletedRegions is incorrect.");
589 // Test start-end region SeqCigar
590 SeqCigar sub_se_gp = new SeqCigar(s_subsequence_gapped, 8, 48);
591 if (sub_se_gp.getWidth() != 41)
594 "SeqCigar(seq, start, end) not properly clipped alignsequence.");
596 System.out.println("Original sequence align:\n" + sub_gapped_s +
597 "\nReconstructed window from 8 to 48\n"
598 + "XXXXXXXX" + sub_se_gp.getSequenceString('-') + "..."
599 + "\nCigar String:" + sub_se_gp.getCigarstring() + "\n"
601 SequenceI ssgp = sub_se_gp.getSeq('-');
602 System.out.println("\t " + ssgp.getSequenceAsString());
603 for (int r = 0; r < 10; r++)
605 sub_se_gp = new SeqCigar(s_subsequence_gapped, 8, 48);
606 int sl = sub_se_gp.getWidth();
608 for (int rs = 0; rs < 10; rs++)
611 sub_se_gp.deleteRange(st, e);
612 String ssgapedseq = sub_se_gp.getSeq('-').getSequenceAsString();
613 System.out.println(st + "," + e + "\t:" + ssgapedseq);
618 SeqCigar[] set = new SeqCigar[]
620 new SeqCigar(s), new SeqCigar(s_subsequence_gapped, 8, 48),
621 new SeqCigar(s_gapped)};
622 Alignment al = new Alignment(set);
623 for (int i = 0; i < al.getHeight(); i++)
625 System.out.println("" + al.getSequenceAt(i).getName() + "\t" +
626 al.getSequenceAt(i).getStart() + "\t" +
627 al.getSequenceAt(i).getEnd() + "\t" +
628 al.getSequenceAt(i).getSequenceAsString());
632 System.out.println("Gapped.");
633 SeqCigar[] set = new SeqCigar[]
635 new SeqCigar(s), new SeqCigar(s_subsequence_gapped, 8, 48),
636 new SeqCigar(s_gapped)};
637 set[0].deleteRange(20, 25);
638 Alignment al = new Alignment(set);
639 for (int i = 0; i < al.getHeight(); i++)
641 System.out.println("" + al.getSequenceAt(i).getName() + "\t" +
642 al.getSequenceAt(i).getStart() + "\t" +
643 al.getSequenceAt(i).getEnd() + "\t" +
644 al.getSequenceAt(i).getSequenceAsString());
647 // if (!ssgapedseq.equals("ryas---dtqqwa----slchvh"))
648 // System.err.println("Subseqgaped\n------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhryas---dtqwrtsasll--qwa----slchvh\n"+ssgapedseq+"\n"+sub_se_gp.getCigarstring());