1 package jalview.datamodel;
3 import jalview.analysis.*;
4 import jalview.util.ShiftList;
5 import java.util.Vector;
11 * start(inclusive) and end(exclusive) of subsequence on refseq
13 private int start, end;
14 private SequenceI refseq = null;
16 * Reference dataset sequence for the cigar string
19 public SequenceI getRefSeq()
25 * @return int start index of cigar ops on refSeq
27 public int getStart() {
32 * @return int end index (exclusive) of cigar ops on refSeq
38 * Returns sequence as a string with cigar operations applied to it
41 public String getSequenceString(char GapChar)
43 return (length == 0) ? "" :
44 (String) getSequenceAndDeletions(refseq.getSequence().substring(start, end), GapChar)[0];
48 * recreates a gapped and edited version of RefSeq or null for an empty cigar string
51 public SequenceI getSeq(char GapChar)
54 if (refseq == null || length == 0)
58 Object[] edit_result = getSequenceAndDeletions(refseq.getSequence().substring(start,end),
60 if (edit_result == null)
63 "Implementation Error - unexpected null from getSequenceAndDeletions");
66 seq = new Sequence(refseq.getName(), (String) edit_result[0],
67 refseq.getStart() + start+( (int[]) edit_result[1])[0],
68 refseq.getStart() + start+( (int[]) edit_result[1])[2]);
69 seq.setDatasetSequence(refseq);
74 We don't allow this - refseq is given at construction time only
75 public void setSeq(SequenceI seq) {
80 * internal constructor - sets seq to a gapless sequence derived from seq
81 * and prepends any 'D' operations needed to get to the first residue of seq.
82 * @param seq SequenceI
83 * @param initialDeletion true to mark initial dataset sequence residues as deleted in subsequence
84 * @param _s index of first position in seq
85 * @param _e index after last position in (possibly gapped) seq
86 * @return true if gaps are present in seq
88 private boolean _setSeq(SequenceI seq, boolean initialDeletion, int _s, int _e)
90 boolean hasgaps = false;
93 throw new Error("Implementation Error - _setSeq(null,...)");
96 throw new Error("Implementation Error: _s="+_s);
97 String seq_string = seq.getSequence();
98 if (_e==0 || _e<_s || _e>seq_string.length())
99 _e=seq_string.length();
100 // resolve start and end positions relative to ungapped reference sequence
101 start = seq.findPosition(_s)-seq.getStart();
102 end = seq.findPosition(_e)-seq.getStart();
103 int l_ungapped = end-start;
104 // Find correct sequence to reference and correct start and end - if necessary
105 SequenceI ds = seq.getDatasetSequence();
108 // make a new dataset sequence
109 String ungapped = AlignSeq.extractGaps(jalview.util.Comparison.GapChars,
110 new String(seq_string));
111 l_ungapped=ungapped.length();
112 // check that we haven't just duplicated an ungapped sequence.
113 if (l_ungapped == seq.getLength())
119 ds = new Sequence(seq.getName(), ungapped,
121 seq.getStart()+ungapped.length()-1);
122 // JBPNote: this would be consistent but may not be useful
123 // seq.setDatasetSequence(ds);
126 // add in offset between seq and the dataset sequence
127 if (ds.getStart() < seq.getStart())
129 int offset=seq.getStart()-ds.getStart();
130 if (initialDeletion) {
131 // absolute cigar string
132 addDeleted(_s+offset);
136 // normal behaviour - just mark start and end subsequence
144 // any gaps to process ?
145 if (l_ungapped!=(_e-_s))
151 if (end>ds.getLength()) {
152 throw new Error("SeqCigar: Possible implementation error: sequence is longer than dataset sequence");
153 // end = ds.getLength();
160 * directly initialise a cigar object with a sequence of range, operation pairs and a sequence to apply it to.
161 * operation and range should be relative to the seq.getStart()'th residue of the dataset seq resolved from seq.
162 * @param seq SequenceI
163 * @param operation char[]
166 public SeqCigar(SequenceI seq, char operation[], int range[])
171 throw new Error("Implementation Bug. Null seq !");
173 if (operation.length != range.length)
175 throw new Error("Implementation Bug. Cigar Operation list!= range list");
178 if (operation != null)
180 this.operation = new char[operation.length + _inc_length];
181 this.range = new int[operation.length + _inc_length];
183 if (_setSeq(seq, false, 0, 0))
185 throw new Error("NOT YET Implemented: Constructing a Cigar object from a cigar string and a gapped sequence.");
187 for (int i = this.length, j = 0; j < operation.length; i++, j++)
189 char op = operation[j];
190 if (op != M && op != I && op != D)
193 "Implementation Bug. Cigar Operation '" + j + "' '" + op +
194 "' not one of '" + M + "', '" + I + "', or '" + D + "'.");
196 this.operation[i] = op;
197 this.range[i] = range[j];
199 this.length += operation.length;
203 this.operation = null;
206 if (_setSeq(seq, false,0, 0))
208 throw new Error("NOT YET Implemented: Constructing a Cigar object from a cigar string and a gapped sequence.");
214 * add range matched residues to cigar string
217 public void addMatch(int range)
219 this.addOperation(M, range);
223 * Deleted regions mean that there will be discontinuous sequence numbering in the
224 * sequence returned by getSeq(char).
225 * @return true if there deletions
227 public boolean hasDeletedRegions()
229 for (int i = 0, l = length; i < l; i++)
231 if (operation[i] == D)
241 * insertion and match operations based on seq to the cigar up to
242 * the endpos column of seq.
244 * @param cigar CigarBase
245 * @param seq SequenceI
246 * @param startpos int
248 * @param initialDeletions if true then initial deletions will be added from start of seq to startpos
250 protected static void addSequenceOps(CigarBase cigar, SequenceI seq,
251 int startpos, int endpos, boolean initialDeletions)
255 int p = 0, res = seq.getLength();
257 if (!initialDeletions)
263 boolean isGap = (p < res) ? jalview.util.Comparison.isGap(seq.getCharAt(p)) : true;
264 if ( (startpos <= p) && (p <= endpos))
268 if (range > 0 && op != I)
270 cigar.addOperation(op, range);
278 if (range > 0 && op != M)
280 cigar.addOperation(op, range);
291 if (range > 0 && op != D)
293 cigar.addOperation(op, range);
301 // do nothing - insertions are not made in flanking regions
308 cigar.addOperation(op, range);
313 * create a cigar string for given sequence
314 * @param seq SequenceI
316 public SeqCigar(SequenceI seq)
321 throw new Error("Implementation error for new Cigar(SequenceI)");
323 _setSeq(seq, false, 0, 0);
324 // there is still work to do
325 addSequenceOps(this, seq, 0, seq.getLength()-1, false);
329 * Create Cigar from a range of gaps and residues on a sequence object
330 * @param seq SequenceI
331 * @param start int - first column in range
332 * @param end int - last column in range
334 public SeqCigar(SequenceI seq, int start, int end)
339 throw new Error("Implementation error for new Cigar(SequenceI)");
341 _setSeq(seq, false, start, end+1);
342 // there is still work to do
343 addSequenceOps(this, seq, start, end, false);
347 * Create a cigar object from a cigar string like '[<I|D|M><range>]+'
348 * Will fail if the given seq already contains gaps (JBPNote: future implementation will fix)
349 * @param seq SequenceI object resolvable to a dataset sequence
350 * @param cigarString String
353 public static SeqCigar parseCigar(SequenceI seq, String cigarString)
356 Object[] opsandrange = parseCigarString(cigarString);
357 return new SeqCigar(seq, (char[]) opsandrange[0], (int[]) opsandrange[1]);
363 * @param alseqs SeqCigar[]
364 * @param gapCharacter char
365 * @return SequenceI[]
367 public static SequenceI[] createAlignmentSequences(SeqCigar[] alseqs,
368 char gapCharacter, ColumnSelection colsel)
370 SequenceI[] seqs = new SequenceI[alseqs.length];
371 Vector hiddenRegions=new Vector();
372 StringBuffer[] g_seqs = new StringBuffer[alseqs.length];
373 String[] alseqs_string=new String[alseqs.length];
374 Object[] gs_regions = new Object[alseqs.length];
375 for (int i = 0; i < alseqs.length; i++)
377 alseqs_string[i]=alseqs[i].getRefSeq().
378 getSequence().substring(alseqs[i].start,alseqs[i].end);
379 gs_regions[i] = alseqs[i].getSequenceAndDeletions(alseqs_string[i], gapCharacter); // gapped sequence, {start, start col, end. endcol}, hidden regions {{start, end, col}})
380 if (gs_regions[i] == null)
382 throw new Error("Implementation error: " + i +
383 "'th sequence Cigar has no operations.");
385 g_seqs[i] = new StringBuffer( (String) ( (Object[]) gs_regions[i])[0]); // the visible gapped sequence
387 // Now account for insertions.
388 // this is complicated because we must keep track of shifted positions in each sequence
389 ShiftList shifts = new ShiftList();
390 for (int i = 0; i < alseqs.length; i++)
392 Object[] gs_region = ( (Object[]) ( (Object[]) gs_regions[i])[2]);
393 if (gs_region != null)
396 for (int hr = 0; hr < gs_region.length; hr++)
398 int[] region = (int[]) gs_region[hr];
399 char[] insert = new char[region[1] - region[0] + 1];
400 for (int s = 0; s < insert.length; s++)
402 insert[s] = gapCharacter;
404 int inspos = shifts.shift(region[2]); // resolve insertion position in current alignment frame of reference
405 for (int s = 0; s < alseqs.length; s++)
409 if (g_seqs[s].length() <= inspos)
411 // prefix insertion with more gaps.
412 for (int l = inspos - g_seqs[s].length(); l > 0; l--)
414 g_seqs[s].append(gapCharacter); // to debug - use a diffferent gap character here
417 g_seqs[s].insert(inspos, insert);
421 g_seqs[s].insert(inspos,
422 alseqs_string[i].substring(region[0], region[1] + 1));
425 shifts.addShift(region[2], insert.length); // update shift in alignment frame of reference
426 colsel.hideColumns(inspos, inspos+insert.length-1);
430 for (int i = 0; i < alseqs.length; i++)
432 int[] bounds = ( (int[]) ( (Object[]) gs_regions[i])[1]);
433 SequenceI ref = alseqs[i].getRefSeq();
434 seqs[i] = new Sequence(ref.getName(), g_seqs[i].toString(),
435 ref.getStart() + alseqs[i].start+bounds[0],
436 ref.getStart() + alseqs[i].start+bounds[2]);
437 seqs[i].setDatasetSequence(ref);
443 * non rigorous testing
447 * @param seq Sequence
448 * @param ex_cs_gapped String
451 public static String testCigar_string(Sequence seq, String ex_cs_gapped)
453 SeqCigar c_sgapped = new SeqCigar(seq);
454 String cs_gapped = c_sgapped.getCigarstring();
455 if (!cs_gapped.equals(ex_cs_gapped))
457 System.err.println("Failed getCigarstring: incorect string '" + cs_gapped +
458 "' != " + ex_cs_gapped);
463 public static boolean testSeqRecovery(SeqCigar gen_sgapped,
466 // this is non-rigorous - start and end recovery is not tested.
467 SequenceI gen_sgapped_s = gen_sgapped.getSeq('-');
468 if (!gen_sgapped_s.getSequence().equals(s_gapped.getSequence()))
470 System.err.println("Couldn't reconstruct sequence.\n" +
471 gen_sgapped_s.getSequence() + "\n" +
472 s_gapped.getSequence());
478 public static void main(String argv[])
482 Sequence s = new Sequence("MySeq",
483 o_seq = "asdfktryasdtqwrtsaslldddptyipqqwaslchvhttt",
486 Sequence s_gapped = new Sequence("MySeq",
487 orig_gapped = "----asdf------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhttt",
489 String ex_cs_gapped = "4I4M6I6M3I11M4I12M4I9M";
490 s_gapped.setDatasetSequence(s);
492 Sequence s_subsequence_gapped = new Sequence("MySeq",
493 sub_gapped_s = "------ktryas---dtqwrtsasll----dddptyipqqwa----slchvh",
496 s_subsequence_gapped.setDatasetSequence(s);
497 SeqCigar c_null = new SeqCigar(s);
498 String cs_null = c_null.getCigarstring();
499 if (!cs_null.equals("42M"))
502 "Failed to recover ungapped sequence cigar operations:" +
503 ( (cs_null == "") ? "empty string" : cs_null));
505 testCigar_string(s_gapped, ex_cs_gapped);
506 SeqCigar gen_sgapped = SeqCigar.parseCigar(s, ex_cs_gapped);
507 if (!gen_sgapped.getCigarstring().equals(ex_cs_gapped))
509 System.err.println("Failed parseCigar(" + ex_cs_gapped +
510 ")->getCigarString()->'" + gen_sgapped.getCigarstring() +
513 testSeqRecovery(gen_sgapped, s_gapped);
514 // Test dataset resolution
515 SeqCigar sub_gapped = new SeqCigar(s_subsequence_gapped);
516 if (!testSeqRecovery(sub_gapped, s_subsequence_gapped))
518 System.err.println("Failed recovery for subsequence of dataset sequence");
521 if (sub_gapped.getWidth() != sub_gapped_s.length())
523 System.err.println("Failed getWidth()");
526 sub_gapped.getFullWidth();
527 if (sub_gapped.hasDeletedRegions())
529 System.err.println("hasDeletedRegions is incorrect.");
531 // Test start-end region SeqCigar
532 SeqCigar sub_se_gp = new SeqCigar(s_subsequence_gapped, 8, 48);
533 if (sub_se_gp.getWidth() != 41)
536 "SeqCigar(seq, start, end) not properly clipped alignsequence.");
538 System.out.println("Original sequence align:\n" + sub_gapped_s +
539 "\nReconstructed window from 8 to 48\n"
540 + "XXXXXXXX" + sub_se_gp.getSequenceString('-') + "..."
541 + "\nCigar String:" + sub_se_gp.getCigarstring() + "\n"
543 SequenceI ssgp = sub_se_gp.getSeq('-');
544 System.out.println("\t " + ssgp.getSequence());
545 for (int r = 0; r < 10; r++)
547 sub_se_gp = new SeqCigar(s_subsequence_gapped, 8, 48);
548 int sl = sub_se_gp.getWidth();
550 for (int rs = 0; rs < 10; rs++)
553 sub_se_gp.deleteRange(st, e);
554 String ssgapedseq = sub_se_gp.getSeq('-').getSequence();
555 System.out.println(st + "," + e + "\t:" + ssgapedseq);
559 SeqCigar[] set = new SeqCigar[]
561 new SeqCigar(s), new SeqCigar(s_subsequence_gapped, 8, 48),
562 new SeqCigar(s_gapped)};
563 Alignment al = new Alignment(set);
564 for (int i = 0; i < al.getHeight(); i++)
566 System.out.println("" + al.getSequenceAt(i).getName() + "\t" +
567 al.getSequenceAt(i).getStart() + "\t" +
568 al.getSequenceAt(i).getEnd() + "\t" +
569 al.getSequenceAt(i).getSequence());
573 System.out.println("Gapped.");
574 SeqCigar[] set = new SeqCigar[]
576 new SeqCigar(s), new SeqCigar(s_subsequence_gapped, 8, 48),
577 new SeqCigar(s_gapped)};
578 set[0].deleteRange(20, 25);
579 Alignment al = new Alignment(set);
580 for (int i = 0; i < al.getHeight(); i++)
582 System.out.println("" + al.getSequenceAt(i).getName() + "\t" +
583 al.getSequenceAt(i).getStart() + "\t" +
584 al.getSequenceAt(i).getEnd() + "\t" +
585 al.getSequenceAt(i).getSequence());
588 // if (!ssgapedseq.equals("ryas---dtqqwa----slchvh"))
589 // System.err.println("Subseqgaped\n------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhryas---dtqwrtsasll--qwa----slchvh\n"+ssgapedseq+"\n"+sub_se_gp.getCigarstring());