2 * Jalview - A Sequence Alignment Editor and Viewer
3 * Copyright (C) 2006 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
19 package jalview.datamodel;
21 import jalview.analysis.*;
22 import jalview.util.ShiftList;
23 import java.util.Vector;
29 * start(inclusive) and end(exclusive) of subsequence on refseq
31 private int start, end;
32 private SequenceI refseq = null;
34 * Reference dataset sequence for the cigar string
37 public SequenceI getRefSeq()
43 * @return int start index of cigar ops on refSeq
45 public int getStart() {
50 * @return int end index (exclusive) of cigar ops on refSeq
56 * Returns sequence as a string with cigar operations applied to it
59 public String getSequenceString(char GapChar)
61 return (length == 0) ? "" :
62 (String) getSequenceAndDeletions(refseq.getSequence().substring(start, end), GapChar)[0];
66 * recreates a gapped and edited version of RefSeq or null for an empty cigar string
69 public SequenceI getSeq(char GapChar)
72 if (refseq == null || length == 0)
76 Object[] edit_result = getSequenceAndDeletions(refseq.getSequence().substring(start,end),
78 if (edit_result == null)
81 "Implementation Error - unexpected null from getSequenceAndDeletions");
83 int bounds[] = (int[]) edit_result[1];
84 seq = new Sequence(refseq.getName(), (String) edit_result[0],
85 refseq.getStart() + start+bounds[0],
86 refseq.getStart() + start+((bounds[2]==0) ? -1 : bounds[2]));
87 // seq.checkValidRange(); probably not needed
88 seq.setDatasetSequence(refseq);
93 We don't allow this - refseq is given at construction time only
94 public void setSeq(SequenceI seq) {
99 * internal constructor - sets seq to a gapless sequence derived from seq
100 * and prepends any 'D' operations needed to get to the first residue of seq.
101 * @param seq SequenceI
102 * @param initialDeletion true to mark initial dataset sequence residues as deleted in subsequence
103 * @param _s index of first position in seq
104 * @param _e index after last position in (possibly gapped) seq
105 * @return true if gaps are present in seq
107 private boolean _setSeq(SequenceI seq, boolean initialDeletion, int _s, int _e)
109 boolean hasgaps = false;
112 throw new Error("Implementation Error - _setSeq(null,...)");
115 throw new Error("Implementation Error: _s="+_s);
116 String seq_string = seq.getSequence();
117 if (_e==0 || _e<_s || _e>seq_string.length())
118 _e=seq_string.length();
119 // resolve start and end positions relative to ungapped reference sequence
120 start = seq.findPosition(_s)-seq.getStart();
121 end = seq.findPosition(_e)-seq.getStart();
122 int l_ungapped = end-start;
123 // Find correct sequence to reference and correct start and end - if necessary
124 SequenceI ds = seq.getDatasetSequence();
127 // make a new dataset sequence
128 String ungapped = AlignSeq.extractGaps(jalview.util.Comparison.GapChars,
129 new String(seq_string));
130 l_ungapped=ungapped.length();
131 // check that we haven't just duplicated an ungapped sequence.
132 if (l_ungapped == seq.getLength())
138 ds = new Sequence(seq.getName(), ungapped,
140 seq.getStart()+ungapped.length()-1);
141 // JBPNote: this would be consistent but may not be useful
142 // seq.setDatasetSequence(ds);
145 // add in offset between seq and the dataset sequence
146 if (ds.getStart() < seq.getStart())
148 int offset=seq.getStart()-ds.getStart();
149 if (initialDeletion) {
150 // absolute cigar string
151 addDeleted(_s+offset);
155 // normal behaviour - just mark start and end subsequence
163 // any gaps to process ?
164 if (l_ungapped!=(_e-_s))
170 if (end>ds.getLength()) {
171 throw new Error("SeqCigar: Possible implementation error: sequence is longer than dataset sequence");
172 // end = ds.getLength();
179 * directly initialise a cigar object with a sequence of range, operation pairs and a sequence to apply it to.
180 * operation and range should be relative to the seq.getStart()'th residue of the dataset seq resolved from seq.
181 * @param seq SequenceI
182 * @param operation char[]
185 public SeqCigar(SequenceI seq, char operation[], int range[])
190 throw new Error("Implementation Bug. Null seq !");
192 if (operation.length != range.length)
194 throw new Error("Implementation Bug. Cigar Operation list!= range list");
197 if (operation != null)
199 this.operation = new char[operation.length + _inc_length];
200 this.range = new int[operation.length + _inc_length];
202 if (_setSeq(seq, false, 0, 0))
204 throw new Error("NOT YET Implemented: Constructing a Cigar object from a cigar string and a gapped sequence.");
206 for (int i = this.length, j = 0; j < operation.length; i++, j++)
208 char op = operation[j];
209 if (op != M && op != I && op != D)
212 "Implementation Bug. Cigar Operation '" + j + "' '" + op +
213 "' not one of '" + M + "', '" + I + "', or '" + D + "'.");
215 this.operation[i] = op;
216 this.range[i] = range[j];
218 this.length += operation.length;
222 this.operation = null;
225 if (_setSeq(seq, false,0, 0))
227 throw new Error("NOT YET Implemented: Constructing a Cigar object from a cigar string and a gapped sequence.");
233 * add range matched residues to cigar string
236 public void addMatch(int range)
238 this.addOperation(M, range);
242 * insertion and match operations based on seq to the cigar up to
243 * the endpos column of seq.
245 * @param cigar CigarBase
246 * @param seq SequenceI
247 * @param startpos int
249 * @param initialDeletions if true then initial deletions will be added from start of seq to startpos
251 protected static void addSequenceOps(CigarBase cigar, SequenceI seq,
252 int startpos, int endpos, boolean initialDeletions)
256 int p = 0, res = seq.getLength();
258 if (!initialDeletions)
264 boolean isGap = (p < res) ? jalview.util.Comparison.isGap(seq.getCharAt(p)) : true;
265 if ( (startpos <= p) && (p <= endpos))
269 if (range > 0 && op != I)
271 cigar.addOperation(op, range);
279 if (range > 0 && op != M)
281 cigar.addOperation(op, range);
292 if (range > 0 && op != D)
294 cigar.addOperation(op, range);
302 // do nothing - insertions are not made in flanking regions
309 cigar.addOperation(op, range);
314 * create a cigar string for given sequence
315 * @param seq SequenceI
317 public SeqCigar(SequenceI seq)
322 throw new Error("Implementation error for new Cigar(SequenceI)");
324 _setSeq(seq, false, 0, 0);
325 // there is still work to do
326 addSequenceOps(this, seq, 0, seq.getLength()-1, false);
330 * Create Cigar from a range of gaps and residues on a sequence object
331 * @param seq SequenceI
332 * @param start int - first column in range
333 * @param end int - last column in range
335 public SeqCigar(SequenceI seq, int start, int end)
340 throw new Error("Implementation error for new Cigar(SequenceI)");
342 _setSeq(seq, false, start, end+1);
343 // there is still work to do
344 addSequenceOps(this, seq, start, end, false);
348 * Create a cigar object from a cigar string like '[<I|D|M><range>]+'
349 * Will fail if the given seq already contains gaps (JBPNote: future implementation will fix)
350 * @param seq SequenceI object resolvable to a dataset sequence
351 * @param cigarString String
354 public static SeqCigar parseCigar(SequenceI seq, String cigarString)
357 Object[] opsandrange = parseCigarString(cigarString);
358 return new SeqCigar(seq, (char[]) opsandrange[0], (int[]) opsandrange[1]);
364 * @param alseqs SeqCigar[]
365 * @param gapCharacter char
366 * @return SequenceI[]
368 public static SequenceI[] createAlignmentSequences(SeqCigar[] alseqs,
369 char gapCharacter, ColumnSelection colsel, int[] segments)
371 SequenceI[] seqs = new SequenceI[alseqs.length];
372 StringBuffer[] g_seqs = new StringBuffer[alseqs.length];
373 String[] alseqs_string=new String[alseqs.length];
374 Object[] gs_regions = new Object[alseqs.length];
375 for (int i = 0; i < alseqs.length; i++)
377 alseqs_string[i]=alseqs[i].getRefSeq().
378 getSequence().substring(alseqs[i].start,alseqs[i].end);
379 gs_regions[i] = alseqs[i].getSequenceAndDeletions(alseqs_string[i], gapCharacter); // gapped sequence, {start, start col, end. endcol}, hidden regions {{start, end, col}})
380 if (gs_regions[i] == null)
382 throw new Error("Implementation error: " + i +
383 "'th sequence Cigar has no operations.");
385 g_seqs[i] = new StringBuffer( (String) ( (Object[]) gs_regions[i])[0]); // the visible gapped sequence
387 // Now account for insertions. (well - deletions)
388 // this is complicated because we must keep track of shifted positions in each sequence
389 ShiftList shifts = new ShiftList();
390 for (int i = 0; i < alseqs.length; i++)
392 Object[] gs_region = ( (Object[]) ( (Object[]) gs_regions[i])[2]);
393 if (gs_region != null)
396 for (int hr = 0; hr < gs_region.length; hr++)
398 int[] region = (int[]) gs_region[hr];
399 char[] insert = new char[region[1] - region[0] + 1];
400 for (int s = 0; s < insert.length; s++)
402 insert[s] = gapCharacter;
404 int inspos = shifts.shift(region[2]); // resolve insertion position in current alignment frame of reference
405 for (int s = 0; s < alseqs.length; s++)
409 if (g_seqs[s].length() <= inspos)
411 // prefix insertion with more gaps.
412 for (int l = inspos - g_seqs[s].length(); l > 0; l--)
414 g_seqs[s].append(gapCharacter); // to debug - use a diffferent gap character here
417 g_seqs[s].insert(inspos, insert);
421 g_seqs[s].insert(inspos,
422 alseqs_string[i].substring(region[0], region[1] + 1));
425 shifts.addShift(region[2], insert.length); // update shift in alignment frame of reference
427 // add a hidden column for this deletion
428 colsel.hideColumns(inspos, inspos+insert.length-1);
432 for (int i = 0; i < alseqs.length; i++)
434 int[] bounds = ( (int[]) ( (Object[]) gs_regions[i])[1]);
435 SequenceI ref = alseqs[i].getRefSeq();
436 seqs[i] = new Sequence(ref.getName(), g_seqs[i].toString(),
437 ref.getStart() + alseqs[i].start+bounds[0],
438 ref.getStart() + alseqs[i].start+(bounds[2]==0 ? -1 : bounds[2]));
439 seqs[i].setDatasetSequence(ref);
440 seqs[i].setDescription(ref.getDescription());
442 if (segments!=null) {
443 for (int i=0; i<segments.length; i+=3) {
444 //int start=shifts.shift(segments[i]-1)+1;
445 //int end=shifts.shift(segments[i]+segments[i+1]-1)-1;
446 colsel.hideColumns(segments[i+1], segments[i+1]+segments[i+2]-1);
453 * non rigorous testing
457 * @param seq Sequence
458 * @param ex_cs_gapped String
461 public static String testCigar_string(Sequence seq, String ex_cs_gapped)
463 SeqCigar c_sgapped = new SeqCigar(seq);
464 String cs_gapped = c_sgapped.getCigarstring();
465 if (!cs_gapped.equals(ex_cs_gapped))
467 System.err.println("Failed getCigarstring: incorect string '" + cs_gapped +
468 "' != " + ex_cs_gapped);
473 public static boolean testSeqRecovery(SeqCigar gen_sgapped,
476 // this is non-rigorous - start and end recovery is not tested.
477 SequenceI gen_sgapped_s = gen_sgapped.getSeq('-');
478 if (!gen_sgapped_s.getSequence().equals(s_gapped.getSequence()))
480 System.err.println("Couldn't reconstruct sequence.\n" +
481 gen_sgapped_s.getSequence() + "\n" +
482 s_gapped.getSequence());
488 public static void main(String argv[])
492 Sequence s = new Sequence("MySeq",
493 o_seq = "asdfktryasdtqwrtsaslldddptyipqqwaslchvhttt",
496 Sequence s_gapped = new Sequence("MySeq",
497 orig_gapped = "----asdf------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhttt",
499 String ex_cs_gapped = "4I4M6I6M3I11M4I12M4I9M";
500 s_gapped.setDatasetSequence(s);
502 Sequence s_subsequence_gapped = new Sequence("MySeq",
503 sub_gapped_s = "------ktryas---dtqwrtsasll----dddptyipqqwa----slchvh",
506 s_subsequence_gapped.setDatasetSequence(s);
507 SeqCigar c_null = new SeqCigar(s);
508 String cs_null = c_null.getCigarstring();
509 if (!cs_null.equals("42M"))
512 "Failed to recover ungapped sequence cigar operations:" +
513 ( (cs_null == "") ? "empty string" : cs_null));
515 testCigar_string(s_gapped, ex_cs_gapped);
516 SeqCigar gen_sgapped = SeqCigar.parseCigar(s, ex_cs_gapped);
517 if (!gen_sgapped.getCigarstring().equals(ex_cs_gapped))
519 System.err.println("Failed parseCigar(" + ex_cs_gapped +
520 ")->getCigarString()->'" + gen_sgapped.getCigarstring() +
523 testSeqRecovery(gen_sgapped, s_gapped);
524 // Test dataset resolution
525 SeqCigar sub_gapped = new SeqCigar(s_subsequence_gapped);
526 if (!testSeqRecovery(sub_gapped, s_subsequence_gapped))
528 System.err.println("Failed recovery for subsequence of dataset sequence");
531 if (sub_gapped.getWidth() != sub_gapped_s.length())
533 System.err.println("Failed getWidth()");
536 sub_gapped.getFullWidth();
537 if (sub_gapped.hasDeletedRegions())
539 System.err.println("hasDeletedRegions is incorrect.");
541 // Test start-end region SeqCigar
542 SeqCigar sub_se_gp = new SeqCigar(s_subsequence_gapped, 8, 48);
543 if (sub_se_gp.getWidth() != 41)
546 "SeqCigar(seq, start, end) not properly clipped alignsequence.");
548 System.out.println("Original sequence align:\n" + sub_gapped_s +
549 "\nReconstructed window from 8 to 48\n"
550 + "XXXXXXXX" + sub_se_gp.getSequenceString('-') + "..."
551 + "\nCigar String:" + sub_se_gp.getCigarstring() + "\n"
553 SequenceI ssgp = sub_se_gp.getSeq('-');
554 System.out.println("\t " + ssgp.getSequence());
555 for (int r = 0; r < 10; r++)
557 sub_se_gp = new SeqCigar(s_subsequence_gapped, 8, 48);
558 int sl = sub_se_gp.getWidth();
560 for (int rs = 0; rs < 10; rs++)
563 sub_se_gp.deleteRange(st, e);
564 String ssgapedseq = sub_se_gp.getSeq('-').getSequence();
565 System.out.println(st + "," + e + "\t:" + ssgapedseq);
570 SeqCigar[] set = new SeqCigar[]
572 new SeqCigar(s), new SeqCigar(s_subsequence_gapped, 8, 48),
573 new SeqCigar(s_gapped)};
574 Alignment al = new Alignment(set);
575 for (int i = 0; i < al.getHeight(); i++)
577 System.out.println("" + al.getSequenceAt(i).getName() + "\t" +
578 al.getSequenceAt(i).getStart() + "\t" +
579 al.getSequenceAt(i).getEnd() + "\t" +
580 al.getSequenceAt(i).getSequence());
584 System.out.println("Gapped.");
585 SeqCigar[] set = new SeqCigar[]
587 new SeqCigar(s), new SeqCigar(s_subsequence_gapped, 8, 48),
588 new SeqCigar(s_gapped)};
589 set[0].deleteRange(20, 25);
590 Alignment al = new Alignment(set);
591 for (int i = 0; i < al.getHeight(); i++)
593 System.out.println("" + al.getSequenceAt(i).getName() + "\t" +
594 al.getSequenceAt(i).getStart() + "\t" +
595 al.getSequenceAt(i).getEnd() + "\t" +
596 al.getSequenceAt(i).getSequence());
599 // if (!ssgapedseq.equals("ryas---dtqqwa----slchvh"))
600 // System.err.println("Subseqgaped\n------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhryas---dtqwrtsasll--qwa----slchvh\n"+ssgapedseq+"\n"+sub_se_gp.getCigarstring());