1 package jalview.datamodel;
3 import jalview.analysis.*;
4 import jalview.util.ShiftList;
5 import java.util.Vector;
11 * start(inclusive) and end(exclusive) of subsequence on refseq
13 private int start, end;
14 private SequenceI refseq = null;
16 * Reference dataset sequence for the cigar string
19 public SequenceI getRefSeq()
25 * @return int start index of cigar ops on refSeq
27 public int getStart() {
32 * @return int end index (exclusive) of cigar ops on refSeq
38 * Returns sequence as a string with cigar operations applied to it
41 public String getSequenceString(char GapChar)
43 return (length == 0) ? "" :
44 (String) getSequenceAndDeletions(refseq.getSequence().substring(start, end), GapChar)[0];
48 * recreates a gapped and edited version of RefSeq or null for an empty cigar string
51 public SequenceI getSeq(char GapChar)
54 if (refseq == null || length == 0)
58 Object[] edit_result = getSequenceAndDeletions(refseq.getSequence().substring(start,end),
60 if (edit_result == null)
63 "Implementation Error - unexpected null from getSequenceAndDeletions");
65 int bounds[] = (int[]) edit_result[1];
66 seq = new Sequence(refseq.getName(), (String) edit_result[0],
67 refseq.getStart() + start+bounds[0],
68 refseq.getStart() + start+((bounds[2]==0) ? -1 : bounds[2]));
69 // seq.checkValidRange(); probably not needed
70 seq.setDatasetSequence(refseq);
75 We don't allow this - refseq is given at construction time only
76 public void setSeq(SequenceI seq) {
81 * internal constructor - sets seq to a gapless sequence derived from seq
82 * and prepends any 'D' operations needed to get to the first residue of seq.
83 * @param seq SequenceI
84 * @param initialDeletion true to mark initial dataset sequence residues as deleted in subsequence
85 * @param _s index of first position in seq
86 * @param _e index after last position in (possibly gapped) seq
87 * @return true if gaps are present in seq
89 private boolean _setSeq(SequenceI seq, boolean initialDeletion, int _s, int _e)
91 boolean hasgaps = false;
94 throw new Error("Implementation Error - _setSeq(null,...)");
97 throw new Error("Implementation Error: _s="+_s);
98 String seq_string = seq.getSequence();
99 if (_e==0 || _e<_s || _e>seq_string.length())
100 _e=seq_string.length();
101 // resolve start and end positions relative to ungapped reference sequence
102 start = seq.findPosition(_s)-seq.getStart();
103 end = seq.findPosition(_e)-seq.getStart();
104 int l_ungapped = end-start;
105 // Find correct sequence to reference and correct start and end - if necessary
106 SequenceI ds = seq.getDatasetSequence();
109 // make a new dataset sequence
110 String ungapped = AlignSeq.extractGaps(jalview.util.Comparison.GapChars,
111 new String(seq_string));
112 l_ungapped=ungapped.length();
113 // check that we haven't just duplicated an ungapped sequence.
114 if (l_ungapped == seq.getLength())
120 ds = new Sequence(seq.getName(), ungapped,
122 seq.getStart()+ungapped.length()-1);
123 // JBPNote: this would be consistent but may not be useful
124 // seq.setDatasetSequence(ds);
127 // add in offset between seq and the dataset sequence
128 if (ds.getStart() < seq.getStart())
130 int offset=seq.getStart()-ds.getStart();
131 if (initialDeletion) {
132 // absolute cigar string
133 addDeleted(_s+offset);
137 // normal behaviour - just mark start and end subsequence
145 // any gaps to process ?
146 if (l_ungapped!=(_e-_s))
152 if (end>ds.getLength()) {
153 throw new Error("SeqCigar: Possible implementation error: sequence is longer than dataset sequence");
154 // end = ds.getLength();
161 * directly initialise a cigar object with a sequence of range, operation pairs and a sequence to apply it to.
162 * operation and range should be relative to the seq.getStart()'th residue of the dataset seq resolved from seq.
163 * @param seq SequenceI
164 * @param operation char[]
167 public SeqCigar(SequenceI seq, char operation[], int range[])
172 throw new Error("Implementation Bug. Null seq !");
174 if (operation.length != range.length)
176 throw new Error("Implementation Bug. Cigar Operation list!= range list");
179 if (operation != null)
181 this.operation = new char[operation.length + _inc_length];
182 this.range = new int[operation.length + _inc_length];
184 if (_setSeq(seq, false, 0, 0))
186 throw new Error("NOT YET Implemented: Constructing a Cigar object from a cigar string and a gapped sequence.");
188 for (int i = this.length, j = 0; j < operation.length; i++, j++)
190 char op = operation[j];
191 if (op != M && op != I && op != D)
194 "Implementation Bug. Cigar Operation '" + j + "' '" + op +
195 "' not one of '" + M + "', '" + I + "', or '" + D + "'.");
197 this.operation[i] = op;
198 this.range[i] = range[j];
200 this.length += operation.length;
204 this.operation = null;
207 if (_setSeq(seq, false,0, 0))
209 throw new Error("NOT YET Implemented: Constructing a Cigar object from a cigar string and a gapped sequence.");
215 * add range matched residues to cigar string
218 public void addMatch(int range)
220 this.addOperation(M, range);
224 * insertion and match operations based on seq to the cigar up to
225 * the endpos column of seq.
227 * @param cigar CigarBase
228 * @param seq SequenceI
229 * @param startpos int
231 * @param initialDeletions if true then initial deletions will be added from start of seq to startpos
233 protected static void addSequenceOps(CigarBase cigar, SequenceI seq,
234 int startpos, int endpos, boolean initialDeletions)
238 int p = 0, res = seq.getLength();
240 if (!initialDeletions)
246 boolean isGap = (p < res) ? jalview.util.Comparison.isGap(seq.getCharAt(p)) : true;
247 if ( (startpos <= p) && (p <= endpos))
251 if (range > 0 && op != I)
253 cigar.addOperation(op, range);
261 if (range > 0 && op != M)
263 cigar.addOperation(op, range);
274 if (range > 0 && op != D)
276 cigar.addOperation(op, range);
284 // do nothing - insertions are not made in flanking regions
291 cigar.addOperation(op, range);
296 * create a cigar string for given sequence
297 * @param seq SequenceI
299 public SeqCigar(SequenceI seq)
304 throw new Error("Implementation error for new Cigar(SequenceI)");
306 _setSeq(seq, false, 0, 0);
307 // there is still work to do
308 addSequenceOps(this, seq, 0, seq.getLength()-1, false);
312 * Create Cigar from a range of gaps and residues on a sequence object
313 * @param seq SequenceI
314 * @param start int - first column in range
315 * @param end int - last column in range
317 public SeqCigar(SequenceI seq, int start, int end)
322 throw new Error("Implementation error for new Cigar(SequenceI)");
324 _setSeq(seq, false, start, end+1);
325 // there is still work to do
326 addSequenceOps(this, seq, start, end, false);
330 * Create a cigar object from a cigar string like '[<I|D|M><range>]+'
331 * Will fail if the given seq already contains gaps (JBPNote: future implementation will fix)
332 * @param seq SequenceI object resolvable to a dataset sequence
333 * @param cigarString String
336 public static SeqCigar parseCigar(SequenceI seq, String cigarString)
339 Object[] opsandrange = parseCigarString(cigarString);
340 return new SeqCigar(seq, (char[]) opsandrange[0], (int[]) opsandrange[1]);
346 * @param alseqs SeqCigar[]
347 * @param gapCharacter char
348 * @return SequenceI[]
350 public static SequenceI[] createAlignmentSequences(SeqCigar[] alseqs,
351 char gapCharacter, ColumnSelection colsel, int[] segments)
353 SequenceI[] seqs = new SequenceI[alseqs.length];
354 StringBuffer[] g_seqs = new StringBuffer[alseqs.length];
355 String[] alseqs_string=new String[alseqs.length];
356 Object[] gs_regions = new Object[alseqs.length];
357 for (int i = 0; i < alseqs.length; i++)
359 alseqs_string[i]=alseqs[i].getRefSeq().
360 getSequence().substring(alseqs[i].start,alseqs[i].end);
361 gs_regions[i] = alseqs[i].getSequenceAndDeletions(alseqs_string[i], gapCharacter); // gapped sequence, {start, start col, end. endcol}, hidden regions {{start, end, col}})
362 if (gs_regions[i] == null)
364 throw new Error("Implementation error: " + i +
365 "'th sequence Cigar has no operations.");
367 g_seqs[i] = new StringBuffer( (String) ( (Object[]) gs_regions[i])[0]); // the visible gapped sequence
369 // Now account for insertions. (well - deletions)
370 // this is complicated because we must keep track of shifted positions in each sequence
371 ShiftList shifts = new ShiftList();
372 for (int i = 0; i < alseqs.length; i++)
374 Object[] gs_region = ( (Object[]) ( (Object[]) gs_regions[i])[2]);
375 if (gs_region != null)
378 for (int hr = 0; hr < gs_region.length; hr++)
380 int[] region = (int[]) gs_region[hr];
381 char[] insert = new char[region[1] - region[0] + 1];
382 for (int s = 0; s < insert.length; s++)
384 insert[s] = gapCharacter;
386 int inspos = shifts.shift(region[2]); // resolve insertion position in current alignment frame of reference
387 for (int s = 0; s < alseqs.length; s++)
391 if (g_seqs[s].length() <= inspos)
393 // prefix insertion with more gaps.
394 for (int l = inspos - g_seqs[s].length(); l > 0; l--)
396 g_seqs[s].append(gapCharacter); // to debug - use a diffferent gap character here
399 g_seqs[s].insert(inspos, insert);
403 g_seqs[s].insert(inspos,
404 alseqs_string[i].substring(region[0], region[1] + 1));
407 shifts.addShift(region[2], insert.length); // update shift in alignment frame of reference
409 // add a hidden column for this deletion
410 colsel.hideColumns(inspos, inspos+insert.length-1);
414 for (int i = 0; i < alseqs.length; i++)
416 int[] bounds = ( (int[]) ( (Object[]) gs_regions[i])[1]);
417 SequenceI ref = alseqs[i].getRefSeq();
418 seqs[i] = new Sequence(ref.getName(), g_seqs[i].toString(),
419 ref.getStart() + alseqs[i].start+bounds[0],
420 ref.getStart() + alseqs[i].start+(bounds[2]==0 ? -1 : bounds[2]));
421 seqs[i].setDatasetSequence(ref);
423 if (segments!=null) {
424 for (int i=0; i<segments.length; i+=3) {
425 //int start=shifts.shift(segments[i]-1)+1;
426 //int end=shifts.shift(segments[i]+segments[i+1]-1)-1;
427 colsel.hideColumns(segments[i+1], segments[i+1]+segments[i+2]-1);
434 * non rigorous testing
438 * @param seq Sequence
439 * @param ex_cs_gapped String
442 public static String testCigar_string(Sequence seq, String ex_cs_gapped)
444 SeqCigar c_sgapped = new SeqCigar(seq);
445 String cs_gapped = c_sgapped.getCigarstring();
446 if (!cs_gapped.equals(ex_cs_gapped))
448 System.err.println("Failed getCigarstring: incorect string '" + cs_gapped +
449 "' != " + ex_cs_gapped);
454 public static boolean testSeqRecovery(SeqCigar gen_sgapped,
457 // this is non-rigorous - start and end recovery is not tested.
458 SequenceI gen_sgapped_s = gen_sgapped.getSeq('-');
459 if (!gen_sgapped_s.getSequence().equals(s_gapped.getSequence()))
461 System.err.println("Couldn't reconstruct sequence.\n" +
462 gen_sgapped_s.getSequence() + "\n" +
463 s_gapped.getSequence());
469 public static void main(String argv[])
473 Sequence s = new Sequence("MySeq",
474 o_seq = "asdfktryasdtqwrtsaslldddptyipqqwaslchvhttt",
477 Sequence s_gapped = new Sequence("MySeq",
478 orig_gapped = "----asdf------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhttt",
480 String ex_cs_gapped = "4I4M6I6M3I11M4I12M4I9M";
481 s_gapped.setDatasetSequence(s);
483 Sequence s_subsequence_gapped = new Sequence("MySeq",
484 sub_gapped_s = "------ktryas---dtqwrtsasll----dddptyipqqwa----slchvh",
487 s_subsequence_gapped.setDatasetSequence(s);
488 SeqCigar c_null = new SeqCigar(s);
489 String cs_null = c_null.getCigarstring();
490 if (!cs_null.equals("42M"))
493 "Failed to recover ungapped sequence cigar operations:" +
494 ( (cs_null == "") ? "empty string" : cs_null));
496 testCigar_string(s_gapped, ex_cs_gapped);
497 SeqCigar gen_sgapped = SeqCigar.parseCigar(s, ex_cs_gapped);
498 if (!gen_sgapped.getCigarstring().equals(ex_cs_gapped))
500 System.err.println("Failed parseCigar(" + ex_cs_gapped +
501 ")->getCigarString()->'" + gen_sgapped.getCigarstring() +
504 testSeqRecovery(gen_sgapped, s_gapped);
505 // Test dataset resolution
506 SeqCigar sub_gapped = new SeqCigar(s_subsequence_gapped);
507 if (!testSeqRecovery(sub_gapped, s_subsequence_gapped))
509 System.err.println("Failed recovery for subsequence of dataset sequence");
512 if (sub_gapped.getWidth() != sub_gapped_s.length())
514 System.err.println("Failed getWidth()");
517 sub_gapped.getFullWidth();
518 if (sub_gapped.hasDeletedRegions())
520 System.err.println("hasDeletedRegions is incorrect.");
522 // Test start-end region SeqCigar
523 SeqCigar sub_se_gp = new SeqCigar(s_subsequence_gapped, 8, 48);
524 if (sub_se_gp.getWidth() != 41)
527 "SeqCigar(seq, start, end) not properly clipped alignsequence.");
529 System.out.println("Original sequence align:\n" + sub_gapped_s +
530 "\nReconstructed window from 8 to 48\n"
531 + "XXXXXXXX" + sub_se_gp.getSequenceString('-') + "..."
532 + "\nCigar String:" + sub_se_gp.getCigarstring() + "\n"
534 SequenceI ssgp = sub_se_gp.getSeq('-');
535 System.out.println("\t " + ssgp.getSequence());
536 for (int r = 0; r < 10; r++)
538 sub_se_gp = new SeqCigar(s_subsequence_gapped, 8, 48);
539 int sl = sub_se_gp.getWidth();
541 for (int rs = 0; rs < 10; rs++)
544 sub_se_gp.deleteRange(st, e);
545 String ssgapedseq = sub_se_gp.getSeq('-').getSequence();
546 System.out.println(st + "," + e + "\t:" + ssgapedseq);
551 SeqCigar[] set = new SeqCigar[]
553 new SeqCigar(s), new SeqCigar(s_subsequence_gapped, 8, 48),
554 new SeqCigar(s_gapped)};
555 Alignment al = new Alignment(set);
556 for (int i = 0; i < al.getHeight(); i++)
558 System.out.println("" + al.getSequenceAt(i).getName() + "\t" +
559 al.getSequenceAt(i).getStart() + "\t" +
560 al.getSequenceAt(i).getEnd() + "\t" +
561 al.getSequenceAt(i).getSequence());
565 System.out.println("Gapped.");
566 SeqCigar[] set = new SeqCigar[]
568 new SeqCigar(s), new SeqCigar(s_subsequence_gapped, 8, 48),
569 new SeqCigar(s_gapped)};
570 set[0].deleteRange(20, 25);
571 Alignment al = new Alignment(set);
572 for (int i = 0; i < al.getHeight(); i++)
574 System.out.println("" + al.getSequenceAt(i).getName() + "\t" +
575 al.getSequenceAt(i).getStart() + "\t" +
576 al.getSequenceAt(i).getEnd() + "\t" +
577 al.getSequenceAt(i).getSequence());
580 // if (!ssgapedseq.equals("ryas---dtqqwa----slchvh"))
581 // System.err.println("Subseqgaped\n------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhryas---dtqwrtsasll--qwa----slchvh\n"+ssgapedseq+"\n"+sub_se_gp.getCigarstring());