2 * Jalview - A Sequence Alignment Editor and Viewer
3 * Copyright (C) 2006 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
19 package jalview.datamodel;
21 import jalview.analysis.*;
22 import jalview.util.ShiftList;
23 import java.util.Vector;
29 * start(inclusive) and end(exclusive) of subsequence on refseq
31 private int start, end;
32 private SequenceI refseq = null;
34 * Reference dataset sequence for the cigar string
37 public SequenceI getRefSeq()
43 * @return int start index of cigar ops on refSeq
45 public int getStart() {
50 * @return int end index (exclusive) of cigar ops on refSeq
56 * Returns sequence as a string with cigar operations applied to it
59 public String getSequenceString(char GapChar)
61 return (length == 0) ? "" :
62 (String) getSequenceAndDeletions(refseq.getSequence().substring(start, end), GapChar)[0];
66 * recreates a gapped and edited version of RefSeq or null for an empty cigar string
69 public SequenceI getSeq(char GapChar)
72 if (refseq == null || length == 0)
76 Object[] edit_result = getSequenceAndDeletions(refseq.getSequence().substring(start,end),
78 if (edit_result == null)
81 "Implementation Error - unexpected null from getSequenceAndDeletions");
83 int bounds[] = (int[]) edit_result[1];
84 seq = new Sequence(refseq.getName(), (String) edit_result[0],
85 refseq.getStart() + start+bounds[0],
86 refseq.getStart() + start+((bounds[2]==0) ? -1 : bounds[2]));
87 // seq.checkValidRange(); probably not needed
88 seq.setDatasetSequence(refseq);
93 We don't allow this - refseq is given at construction time only
94 public void setSeq(SequenceI seq) {
99 * internal constructor - sets seq to a gapless sequence derived from seq
100 * and prepends any 'D' operations needed to get to the first residue of seq.
101 * @param seq SequenceI
102 * @param initialDeletion true to mark initial dataset sequence residues as deleted in subsequence
103 * @param _s index of first position in seq
104 * @param _e index after last position in (possibly gapped) seq
105 * @return true if gaps are present in seq
107 private boolean _setSeq(SequenceI seq, boolean initialDeletion, int _s, int _e)
109 boolean hasgaps = false;
112 throw new Error("Implementation Error - _setSeq(null,...)");
115 throw new Error("Implementation Error: _s="+_s);
116 String seq_string = seq.getSequence();
117 if (_e==0 || _e<_s || _e>seq_string.length())
118 _e=seq_string.length();
119 // resolve start and end positions relative to ungapped reference sequence
120 start = seq.findPosition(_s)-seq.getStart();
121 end = seq.findPosition(_e)-seq.getStart();
122 int l_ungapped = end-start;
123 // Find correct sequence to reference and correct start and end - if necessary
124 SequenceI ds = seq.getDatasetSequence();
127 // make a new dataset sequence
128 String ungapped = AlignSeq.extractGaps(jalview.util.Comparison.GapChars,
129 new String(seq_string));
130 l_ungapped=ungapped.length();
131 // check that we haven't just duplicated an ungapped sequence.
132 if (l_ungapped == seq.getLength())
138 ds = new Sequence(seq.getName(), ungapped,
140 seq.getStart()+ungapped.length()-1);
141 // JBPNote: this would be consistent but may not be useful
142 // seq.setDatasetSequence(ds);
145 // add in offset between seq and the dataset sequence
146 if (ds.getStart() < seq.getStart())
148 int offset=seq.getStart()-ds.getStart();
149 if (initialDeletion) {
150 // absolute cigar string
151 addDeleted(_s+offset);
155 // normal behaviour - just mark start and end subsequence
163 // any gaps to process ?
164 if (l_ungapped!=(_e-_s))
170 if (end>ds.getLength()) {
171 throw new Error("SeqCigar: Possible implementation error: sequence is longer than dataset sequence");
172 // end = ds.getLength();
179 * directly initialise a cigar object with a sequence of range, operation pairs and a sequence to apply it to.
180 * operation and range should be relative to the seq.getStart()'th residue of the dataset seq resolved from seq.
181 * @param seq SequenceI
182 * @param operation char[]
185 public SeqCigar(SequenceI seq, char operation[], int range[])
190 throw new Error("Implementation Bug. Null seq !");
192 if (operation.length != range.length)
194 throw new Error("Implementation Bug. Cigar Operation list!= range list");
197 if (operation != null)
199 this.operation = new char[operation.length + _inc_length];
200 this.range = new int[operation.length + _inc_length];
202 if (_setSeq(seq, false, 0, 0))
204 throw new Error("NOT YET Implemented: Constructing a Cigar object from a cigar string and a gapped sequence.");
206 for (int i = this.length, j = 0; j < operation.length; i++, j++)
208 char op = operation[j];
209 if (op != M && op != I && op != D)
212 "Implementation Bug. Cigar Operation '" + j + "' '" + op +
213 "' not one of '" + M + "', '" + I + "', or '" + D + "'.");
215 this.operation[i] = op;
216 this.range[i] = range[j];
218 this.length += operation.length;
222 this.operation = null;
225 if (_setSeq(seq, false,0, 0))
227 throw new Error("NOT YET Implemented: Constructing a Cigar object from a cigar string and a gapped sequence.");
233 * add range matched residues to cigar string
236 public void addMatch(int range)
238 this.addOperation(M, range);
242 * insertion and match operations based on seq to the cigar up to
243 * the endpos column of seq.
245 * @param cigar CigarBase
246 * @param seq SequenceI
247 * @param startpos int
249 * @param initialDeletions if true then initial deletions will be added from start of seq to startpos
251 protected static void addSequenceOps(CigarBase cigar, SequenceI seq,
252 int startpos, int endpos, boolean initialDeletions)
256 int p = 0, res = seq.getLength();
258 if (!initialDeletions)
264 boolean isGap = (p < res) ? jalview.util.Comparison.isGap(seq.getCharAt(p)) : true;
265 if ( (startpos <= p) && (p <= endpos))
269 if (range > 0 && op != I)
271 cigar.addOperation(op, range);
279 if (range > 0 && op != M)
281 cigar.addOperation(op, range);
292 if (range > 0 && op != D)
294 cigar.addOperation(op, range);
302 // do nothing - insertions are not made in flanking regions
309 cigar.addOperation(op, range);
314 * create a cigar string for given sequence
315 * @param seq SequenceI
317 public SeqCigar(SequenceI seq)
322 throw new Error("Implementation error for new Cigar(SequenceI)");
324 _setSeq(seq, false, 0, 0);
325 // there is still work to do
326 addSequenceOps(this, seq, 0, seq.getLength()-1, false);
330 * Create Cigar from a range of gaps and residues on a sequence object
331 * @param seq SequenceI
332 * @param start int - first column in range
333 * @param end int - last column in range
335 public SeqCigar(SequenceI seq, int start, int end)
340 throw new Error("Implementation error for new Cigar(SequenceI)");
342 _setSeq(seq, false, start, end+1);
343 // there is still work to do
344 addSequenceOps(this, seq, start, end, false);
348 * Create a cigar object from a cigar string like '[<I|D|M><range>]+'
349 * Will fail if the given seq already contains gaps (JBPNote: future implementation will fix)
350 * @param seq SequenceI object resolvable to a dataset sequence
351 * @param cigarString String
354 public static SeqCigar parseCigar(SequenceI seq, String cigarString)
357 Object[] opsandrange = parseCigarString(cigarString);
358 return new SeqCigar(seq, (char[]) opsandrange[0], (int[]) opsandrange[1]);
364 * @param alseqs SeqCigar[]
365 * @param gapCharacter char
366 * @return SequenceI[]
368 public static SequenceI[] createAlignmentSequences(SeqCigar[] alseqs,
369 char gapCharacter, ColumnSelection colsel, int[] segments)
371 SequenceI[] seqs = new SequenceI[alseqs.length];
372 StringBuffer[] g_seqs = new StringBuffer[alseqs.length];
373 String[] alseqs_string=new String[alseqs.length];
374 Object[] gs_regions = new Object[alseqs.length];
375 for (int i = 0; i < alseqs.length; i++)
377 alseqs_string[i]=alseqs[i].getRefSeq().
378 getSequence().substring(alseqs[i].start,alseqs[i].end);
379 gs_regions[i] = alseqs[i].getSequenceAndDeletions(alseqs_string[i], gapCharacter); // gapped sequence, {start, start col, end. endcol}, hidden regions {{start, end, col}})
380 if (gs_regions[i] == null)
382 throw new Error("Implementation error: " + i +
383 "'th sequence Cigar has no operations.");
385 g_seqs[i] = new StringBuffer( (String) ( (Object[]) gs_regions[i])[0]); // the visible gapped sequence
387 // Now account for insertions. (well - deletions)
388 // this is complicated because we must keep track of shifted positions in each sequence
389 ShiftList shifts = new ShiftList();
390 for (int i = 0; i < alseqs.length; i++)
392 Object[] gs_region = ( (Object[]) ( (Object[]) gs_regions[i])[2]);
393 if (gs_region != null)
396 for (int hr = 0; hr < gs_region.length; hr++)
398 int[] region = (int[]) gs_region[hr];
399 char[] insert = new char[region[1] - region[0] + 1];
400 for (int s = 0; s < insert.length; s++)
402 insert[s] = gapCharacter;
404 int inspos = shifts.shift(region[2]); // resolve insertion position in current alignment frame of reference
405 for (int s = 0; s < alseqs.length; s++)
409 if (g_seqs[s].length() <= inspos)
411 // prefix insertion with more gaps.
412 for (int l = inspos - g_seqs[s].length(); l > 0; l--)
414 g_seqs[s].append(gapCharacter); // to debug - use a diffferent gap character here
417 g_seqs[s].insert(inspos, insert);
421 g_seqs[s].insert(inspos,
422 alseqs_string[i].substring(region[0], region[1] + 1));
425 shifts.addShift(region[2], insert.length); // update shift in alignment frame of reference
427 // add a hidden column for this deletion
428 colsel.hideColumns(inspos, inspos+insert.length-1);
432 for (int i = 0; i < alseqs.length; i++)
434 int[] bounds = ( (int[]) ( (Object[]) gs_regions[i])[1]);
435 SequenceI ref = alseqs[i].getRefSeq();
436 seqs[i] = new Sequence(ref.getName(), g_seqs[i].toString(),
437 ref.getStart() + alseqs[i].start+bounds[0],
438 ref.getStart() + alseqs[i].start+(bounds[2]==0 ? -1 : bounds[2]));
439 seqs[i].setDatasetSequence(ref);
441 if (segments!=null) {
442 for (int i=0; i<segments.length; i+=3) {
443 //int start=shifts.shift(segments[i]-1)+1;
444 //int end=shifts.shift(segments[i]+segments[i+1]-1)-1;
445 colsel.hideColumns(segments[i+1], segments[i+1]+segments[i+2]-1);
452 * non rigorous testing
456 * @param seq Sequence
457 * @param ex_cs_gapped String
460 public static String testCigar_string(Sequence seq, String ex_cs_gapped)
462 SeqCigar c_sgapped = new SeqCigar(seq);
463 String cs_gapped = c_sgapped.getCigarstring();
464 if (!cs_gapped.equals(ex_cs_gapped))
466 System.err.println("Failed getCigarstring: incorect string '" + cs_gapped +
467 "' != " + ex_cs_gapped);
472 public static boolean testSeqRecovery(SeqCigar gen_sgapped,
475 // this is non-rigorous - start and end recovery is not tested.
476 SequenceI gen_sgapped_s = gen_sgapped.getSeq('-');
477 if (!gen_sgapped_s.getSequence().equals(s_gapped.getSequence()))
479 System.err.println("Couldn't reconstruct sequence.\n" +
480 gen_sgapped_s.getSequence() + "\n" +
481 s_gapped.getSequence());
487 public static void main(String argv[])
491 Sequence s = new Sequence("MySeq",
492 o_seq = "asdfktryasdtqwrtsaslldddptyipqqwaslchvhttt",
495 Sequence s_gapped = new Sequence("MySeq",
496 orig_gapped = "----asdf------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhttt",
498 String ex_cs_gapped = "4I4M6I6M3I11M4I12M4I9M";
499 s_gapped.setDatasetSequence(s);
501 Sequence s_subsequence_gapped = new Sequence("MySeq",
502 sub_gapped_s = "------ktryas---dtqwrtsasll----dddptyipqqwa----slchvh",
505 s_subsequence_gapped.setDatasetSequence(s);
506 SeqCigar c_null = new SeqCigar(s);
507 String cs_null = c_null.getCigarstring();
508 if (!cs_null.equals("42M"))
511 "Failed to recover ungapped sequence cigar operations:" +
512 ( (cs_null == "") ? "empty string" : cs_null));
514 testCigar_string(s_gapped, ex_cs_gapped);
515 SeqCigar gen_sgapped = SeqCigar.parseCigar(s, ex_cs_gapped);
516 if (!gen_sgapped.getCigarstring().equals(ex_cs_gapped))
518 System.err.println("Failed parseCigar(" + ex_cs_gapped +
519 ")->getCigarString()->'" + gen_sgapped.getCigarstring() +
522 testSeqRecovery(gen_sgapped, s_gapped);
523 // Test dataset resolution
524 SeqCigar sub_gapped = new SeqCigar(s_subsequence_gapped);
525 if (!testSeqRecovery(sub_gapped, s_subsequence_gapped))
527 System.err.println("Failed recovery for subsequence of dataset sequence");
530 if (sub_gapped.getWidth() != sub_gapped_s.length())
532 System.err.println("Failed getWidth()");
535 sub_gapped.getFullWidth();
536 if (sub_gapped.hasDeletedRegions())
538 System.err.println("hasDeletedRegions is incorrect.");
540 // Test start-end region SeqCigar
541 SeqCigar sub_se_gp = new SeqCigar(s_subsequence_gapped, 8, 48);
542 if (sub_se_gp.getWidth() != 41)
545 "SeqCigar(seq, start, end) not properly clipped alignsequence.");
547 System.out.println("Original sequence align:\n" + sub_gapped_s +
548 "\nReconstructed window from 8 to 48\n"
549 + "XXXXXXXX" + sub_se_gp.getSequenceString('-') + "..."
550 + "\nCigar String:" + sub_se_gp.getCigarstring() + "\n"
552 SequenceI ssgp = sub_se_gp.getSeq('-');
553 System.out.println("\t " + ssgp.getSequence());
554 for (int r = 0; r < 10; r++)
556 sub_se_gp = new SeqCigar(s_subsequence_gapped, 8, 48);
557 int sl = sub_se_gp.getWidth();
559 for (int rs = 0; rs < 10; rs++)
562 sub_se_gp.deleteRange(st, e);
563 String ssgapedseq = sub_se_gp.getSeq('-').getSequence();
564 System.out.println(st + "," + e + "\t:" + ssgapedseq);
569 SeqCigar[] set = new SeqCigar[]
571 new SeqCigar(s), new SeqCigar(s_subsequence_gapped, 8, 48),
572 new SeqCigar(s_gapped)};
573 Alignment al = new Alignment(set);
574 for (int i = 0; i < al.getHeight(); i++)
576 System.out.println("" + al.getSequenceAt(i).getName() + "\t" +
577 al.getSequenceAt(i).getStart() + "\t" +
578 al.getSequenceAt(i).getEnd() + "\t" +
579 al.getSequenceAt(i).getSequence());
583 System.out.println("Gapped.");
584 SeqCigar[] set = new SeqCigar[]
586 new SeqCigar(s), new SeqCigar(s_subsequence_gapped, 8, 48),
587 new SeqCigar(s_gapped)};
588 set[0].deleteRange(20, 25);
589 Alignment al = new Alignment(set);
590 for (int i = 0; i < al.getHeight(); i++)
592 System.out.println("" + al.getSequenceAt(i).getName() + "\t" +
593 al.getSequenceAt(i).getStart() + "\t" +
594 al.getSequenceAt(i).getEnd() + "\t" +
595 al.getSequenceAt(i).getSequence());
598 // if (!ssgapedseq.equals("ryas---dtqqwa----slchvh"))
599 // System.err.println("Subseqgaped\n------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhryas---dtqwrtsasll--qwa----slchvh\n"+ssgapedseq+"\n"+sub_se_gp.getCigarstring());