1 package jalview.datamodel;
3 import jalview.analysis.*;
4 import jalview.util.ShiftList;
5 import java.util.Vector;
11 * start(inclusive) and end(exclusive) of subsequence on refseq
13 private int start, end;
14 private SequenceI refseq = null;
16 * Reference dataset sequence for the cigar string
19 public SequenceI getRefSeq()
25 * @return int start index of cigar ops on refSeq
27 public int getStart() {
32 * @return int end index (exclusive) of cigar ops on refSeq
38 * Returns sequence as a string with cigar operations applied to it
41 public String getSequenceString(char GapChar)
43 return (length == 0) ? "" :
44 (String) getSequenceAndDeletions(refseq.getSequence().substring(start, end), GapChar)[0];
48 * recreates a gapped and edited version of RefSeq or null for an empty cigar string
51 public SequenceI getSeq(char GapChar)
54 if (refseq == null || length == 0)
58 Object[] edit_result = getSequenceAndDeletions(refseq.getSequence().substring(start,end),
60 if (edit_result == null)
63 "Implementation Error - unexpected null from getSequenceAndDeletions");
66 seq = new Sequence(refseq.getName(), (String) edit_result[0],
67 refseq.getStart() + start+( (int[]) edit_result[1])[0],
68 refseq.getStart() + start+( (int[]) edit_result[1])[2]);
69 seq.setDatasetSequence(refseq);
74 We don't allow this - refseq is given at construction time only
75 public void setSeq(SequenceI seq) {
80 * internal constructor - sets seq to a gapless sequence derived from seq
81 * and prepends any 'D' operations needed to get to the first residue of seq.
82 * @param seq SequenceI
83 * @param initialDeletion true to mark initial dataset sequence residues as deleted in subsequence
84 * @param _s index of first position in seq
85 * @param _e index after last position in (possibly gapped) seq
86 * @return true if gaps are present in seq
88 private boolean _setSeq(SequenceI seq, boolean initialDeletion, int _s, int _e)
90 boolean hasgaps = false;
93 throw new Error("Implementation Error - _setSeq(null,...)");
96 throw new Error("Implementation Error: _s="+_s);
97 String seq_string = seq.getSequence();
98 if (_e==0 || _e<_s || _e>seq_string.length())
99 _e=seq_string.length();
100 // resolve start and end positions relative to ungapped reference sequence
101 start = seq.findPosition(_s)-seq.getStart();
102 end = seq.findPosition(_e)-seq.getStart();
103 int l_ungapped = end-start;
104 // Find correct sequence to reference and correct start and end - if necessary
105 SequenceI ds = seq.getDatasetSequence();
108 // make a new dataset sequence
109 String ungapped = AlignSeq.extractGaps(jalview.util.Comparison.GapChars,
110 new String(seq_string));
111 l_ungapped=ungapped.length();
112 // check that we haven't just duplicated an ungapped sequence.
113 if (l_ungapped == seq.getLength())
119 ds = new Sequence(seq.getName(), ungapped,
121 seq.getStart()+ungapped.length()-1);
122 // JBPNote: this would be consistent but may not be useful
123 // seq.setDatasetSequence(ds);
126 // add in offset between seq and the dataset sequence
127 if (ds.getStart() < seq.getStart())
129 int offset=seq.getStart()-ds.getStart();
130 if (initialDeletion) {
131 // absolute cigar string
132 addDeleted(_s+offset);
136 // normal behaviour - just mark start and end subsequence
144 // any gaps to process ?
145 if (l_ungapped!=(_e-_s))
151 if (end>ds.getLength()) {
152 throw new Error("SeqCigar: Possible implementation error: sequence is longer than dataset sequence");
153 // end = ds.getLength();
160 * directly initialise a cigar object with a sequence of range, operation pairs and a sequence to apply it to.
161 * operation and range should be relative to the seq.getStart()'th residue of the dataset seq resolved from seq.
162 * @param seq SequenceI
163 * @param operation char[]
166 public SeqCigar(SequenceI seq, char operation[], int range[])
171 throw new Error("Implementation Bug. Null seq !");
173 if (operation.length != range.length)
175 throw new Error("Implementation Bug. Cigar Operation list!= range list");
178 if (operation != null)
180 this.operation = new char[operation.length + _inc_length];
181 this.range = new int[operation.length + _inc_length];
183 if (_setSeq(seq, false, 0, 0))
185 throw new Error("NOT YET Implemented: Constructing a Cigar object from a cigar string and a gapped sequence.");
187 for (int i = this.length, j = 0; j < operation.length; i++, j++)
189 char op = operation[j];
190 if (op != M && op != I && op != D)
193 "Implementation Bug. Cigar Operation '" + j + "' '" + op +
194 "' not one of '" + M + "', '" + I + "', or '" + D + "'.");
196 this.operation[i] = op;
197 this.range[i] = range[j];
199 this.length += operation.length;
203 this.operation = null;
206 if (_setSeq(seq, false,0, 0))
208 throw new Error("NOT YET Implemented: Constructing a Cigar object from a cigar string and a gapped sequence.");
214 * add range matched residues to cigar string
217 public void addMatch(int range)
219 this.addOperation(M, range);
223 * insertion and match operations based on seq to the cigar up to
224 * the endpos column of seq.
226 * @param cigar CigarBase
227 * @param seq SequenceI
228 * @param startpos int
230 * @param initialDeletions if true then initial deletions will be added from start of seq to startpos
232 protected static void addSequenceOps(CigarBase cigar, SequenceI seq,
233 int startpos, int endpos, boolean initialDeletions)
237 int p = 0, res = seq.getLength();
239 if (!initialDeletions)
245 boolean isGap = (p < res) ? jalview.util.Comparison.isGap(seq.getCharAt(p)) : true;
246 if ( (startpos <= p) && (p <= endpos))
250 if (range > 0 && op != I)
252 cigar.addOperation(op, range);
260 if (range > 0 && op != M)
262 cigar.addOperation(op, range);
273 if (range > 0 && op != D)
275 cigar.addOperation(op, range);
283 // do nothing - insertions are not made in flanking regions
290 cigar.addOperation(op, range);
295 * create a cigar string for given sequence
296 * @param seq SequenceI
298 public SeqCigar(SequenceI seq)
303 throw new Error("Implementation error for new Cigar(SequenceI)");
305 _setSeq(seq, false, 0, 0);
306 // there is still work to do
307 addSequenceOps(this, seq, 0, seq.getLength()-1, false);
311 * Create Cigar from a range of gaps and residues on a sequence object
312 * @param seq SequenceI
313 * @param start int - first column in range
314 * @param end int - last column in range
316 public SeqCigar(SequenceI seq, int start, int end)
321 throw new Error("Implementation error for new Cigar(SequenceI)");
323 _setSeq(seq, false, start, end+1);
324 // there is still work to do
325 addSequenceOps(this, seq, start, end, false);
329 * Create a cigar object from a cigar string like '[<I|D|M><range>]+'
330 * Will fail if the given seq already contains gaps (JBPNote: future implementation will fix)
331 * @param seq SequenceI object resolvable to a dataset sequence
332 * @param cigarString String
335 public static SeqCigar parseCigar(SequenceI seq, String cigarString)
338 Object[] opsandrange = parseCigarString(cigarString);
339 return new SeqCigar(seq, (char[]) opsandrange[0], (int[]) opsandrange[1]);
345 * @param alseqs SeqCigar[]
346 * @param gapCharacter char
347 * @return SequenceI[]
349 public static SequenceI[] createAlignmentSequences(SeqCigar[] alseqs,
350 char gapCharacter, ColumnSelection colsel, int[] segments)
352 SequenceI[] seqs = new SequenceI[alseqs.length];
353 StringBuffer[] g_seqs = new StringBuffer[alseqs.length];
354 String[] alseqs_string=new String[alseqs.length];
355 Object[] gs_regions = new Object[alseqs.length];
356 for (int i = 0; i < alseqs.length; i++)
358 alseqs_string[i]=alseqs[i].getRefSeq().
359 getSequence().substring(alseqs[i].start,alseqs[i].end);
360 gs_regions[i] = alseqs[i].getSequenceAndDeletions(alseqs_string[i], gapCharacter); // gapped sequence, {start, start col, end. endcol}, hidden regions {{start, end, col}})
361 if (gs_regions[i] == null)
363 throw new Error("Implementation error: " + i +
364 "'th sequence Cigar has no operations.");
366 g_seqs[i] = new StringBuffer( (String) ( (Object[]) gs_regions[i])[0]); // the visible gapped sequence
368 // Now account for insertions. (well - deletions)
369 // this is complicated because we must keep track of shifted positions in each sequence
370 ShiftList shifts = new ShiftList();
371 for (int i = 0; i < alseqs.length; i++)
373 Object[] gs_region = ( (Object[]) ( (Object[]) gs_regions[i])[2]);
374 if (gs_region != null)
377 for (int hr = 0; hr < gs_region.length; hr++)
379 int[] region = (int[]) gs_region[hr];
380 char[] insert = new char[region[1] - region[0] + 1];
381 for (int s = 0; s < insert.length; s++)
383 insert[s] = gapCharacter;
385 int inspos = shifts.shift(region[2]); // resolve insertion position in current alignment frame of reference
386 for (int s = 0; s < alseqs.length; s++)
390 if (g_seqs[s].length() <= inspos)
392 // prefix insertion with more gaps.
393 for (int l = inspos - g_seqs[s].length(); l > 0; l--)
395 g_seqs[s].append(gapCharacter); // to debug - use a diffferent gap character here
398 g_seqs[s].insert(inspos, insert);
402 g_seqs[s].insert(inspos,
403 alseqs_string[i].substring(region[0], region[1] + 1));
406 shifts.addShift(region[2], insert.length); // update shift in alignment frame of reference
408 // add a hidden column for this deletion
409 colsel.hideColumns(inspos, inspos+insert.length-1);
413 for (int i = 0; i < alseqs.length; i++)
415 int[] bounds = ( (int[]) ( (Object[]) gs_regions[i])[1]);
416 SequenceI ref = alseqs[i].getRefSeq();
417 seqs[i] = new Sequence(ref.getName(), g_seqs[i].toString(),
418 ref.getStart() + alseqs[i].start+bounds[0],
419 ref.getStart() + alseqs[i].start+bounds[2]);
420 seqs[i].setDatasetSequence(ref);
422 if (segments!=null) {
423 int delshift=0; // need to shift the segments to their absolute positions in the sequence cigar frame.
424 for (int i=0; i<segments.length; i+=3) {
425 //int start=shifts.shift(segments[i]-1)+1;
426 //int end=shifts.shift(segments[i]+segments[i+1]-1)-1;
427 colsel.hideColumns(delshift+segments[i], delshift+segments[i]+segments[i+2]-1);
428 delshift+=segments[i+2];
435 * non rigorous testing
439 * @param seq Sequence
440 * @param ex_cs_gapped String
443 public static String testCigar_string(Sequence seq, String ex_cs_gapped)
445 SeqCigar c_sgapped = new SeqCigar(seq);
446 String cs_gapped = c_sgapped.getCigarstring();
447 if (!cs_gapped.equals(ex_cs_gapped))
449 System.err.println("Failed getCigarstring: incorect string '" + cs_gapped +
450 "' != " + ex_cs_gapped);
455 public static boolean testSeqRecovery(SeqCigar gen_sgapped,
458 // this is non-rigorous - start and end recovery is not tested.
459 SequenceI gen_sgapped_s = gen_sgapped.getSeq('-');
460 if (!gen_sgapped_s.getSequence().equals(s_gapped.getSequence()))
462 System.err.println("Couldn't reconstruct sequence.\n" +
463 gen_sgapped_s.getSequence() + "\n" +
464 s_gapped.getSequence());
470 public static void main(String argv[])
474 Sequence s = new Sequence("MySeq",
475 o_seq = "asdfktryasdtqwrtsaslldddptyipqqwaslchvhttt",
478 Sequence s_gapped = new Sequence("MySeq",
479 orig_gapped = "----asdf------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhttt",
481 String ex_cs_gapped = "4I4M6I6M3I11M4I12M4I9M";
482 s_gapped.setDatasetSequence(s);
484 Sequence s_subsequence_gapped = new Sequence("MySeq",
485 sub_gapped_s = "------ktryas---dtqwrtsasll----dddptyipqqwa----slchvh",
488 s_subsequence_gapped.setDatasetSequence(s);
489 SeqCigar c_null = new SeqCigar(s);
490 String cs_null = c_null.getCigarstring();
491 if (!cs_null.equals("42M"))
494 "Failed to recover ungapped sequence cigar operations:" +
495 ( (cs_null == "") ? "empty string" : cs_null));
497 testCigar_string(s_gapped, ex_cs_gapped);
498 SeqCigar gen_sgapped = SeqCigar.parseCigar(s, ex_cs_gapped);
499 if (!gen_sgapped.getCigarstring().equals(ex_cs_gapped))
501 System.err.println("Failed parseCigar(" + ex_cs_gapped +
502 ")->getCigarString()->'" + gen_sgapped.getCigarstring() +
505 testSeqRecovery(gen_sgapped, s_gapped);
506 // Test dataset resolution
507 SeqCigar sub_gapped = new SeqCigar(s_subsequence_gapped);
508 if (!testSeqRecovery(sub_gapped, s_subsequence_gapped))
510 System.err.println("Failed recovery for subsequence of dataset sequence");
513 if (sub_gapped.getWidth() != sub_gapped_s.length())
515 System.err.println("Failed getWidth()");
518 sub_gapped.getFullWidth();
519 if (sub_gapped.hasDeletedRegions())
521 System.err.println("hasDeletedRegions is incorrect.");
523 // Test start-end region SeqCigar
524 SeqCigar sub_se_gp = new SeqCigar(s_subsequence_gapped, 8, 48);
525 if (sub_se_gp.getWidth() != 41)
528 "SeqCigar(seq, start, end) not properly clipped alignsequence.");
530 System.out.println("Original sequence align:\n" + sub_gapped_s +
531 "\nReconstructed window from 8 to 48\n"
532 + "XXXXXXXX" + sub_se_gp.getSequenceString('-') + "..."
533 + "\nCigar String:" + sub_se_gp.getCigarstring() + "\n"
535 SequenceI ssgp = sub_se_gp.getSeq('-');
536 System.out.println("\t " + ssgp.getSequence());
537 for (int r = 0; r < 10; r++)
539 sub_se_gp = new SeqCigar(s_subsequence_gapped, 8, 48);
540 int sl = sub_se_gp.getWidth();
542 for (int rs = 0; rs < 10; rs++)
545 sub_se_gp.deleteRange(st, e);
546 String ssgapedseq = sub_se_gp.getSeq('-').getSequence();
547 System.out.println(st + "," + e + "\t:" + ssgapedseq);
552 SeqCigar[] set = new SeqCigar[]
554 new SeqCigar(s), new SeqCigar(s_subsequence_gapped, 8, 48),
555 new SeqCigar(s_gapped)};
556 Alignment al = new Alignment(set);
557 for (int i = 0; i < al.getHeight(); i++)
559 System.out.println("" + al.getSequenceAt(i).getName() + "\t" +
560 al.getSequenceAt(i).getStart() + "\t" +
561 al.getSequenceAt(i).getEnd() + "\t" +
562 al.getSequenceAt(i).getSequence());
566 System.out.println("Gapped.");
567 SeqCigar[] set = new SeqCigar[]
569 new SeqCigar(s), new SeqCigar(s_subsequence_gapped, 8, 48),
570 new SeqCigar(s_gapped)};
571 set[0].deleteRange(20, 25);
572 Alignment al = new Alignment(set);
573 for (int i = 0; i < al.getHeight(); i++)
575 System.out.println("" + al.getSequenceAt(i).getName() + "\t" +
576 al.getSequenceAt(i).getStart() + "\t" +
577 al.getSequenceAt(i).getEnd() + "\t" +
578 al.getSequenceAt(i).getSequence());
581 // if (!ssgapedseq.equals("ryas---dtqqwa----slchvh"))
582 // System.err.println("Subseqgaped\n------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhryas---dtqwrtsasll--qwa----slchvh\n"+ssgapedseq+"\n"+sub_se_gp.getCigarstring());