2 * Jalview - A Sequence Alignment Editor and Viewer
3 * Copyright (C) 2006 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
19 package jalview.datamodel;
21 import jalview.analysis.*;
22 import jalview.util.ShiftList;
23 import java.util.Vector;
29 * start(inclusive) and end(exclusive) of subsequence on refseq
31 private int start, end;
32 private SequenceI refseq = null;
34 * Reference dataset sequence for the cigar string
37 public SequenceI getRefSeq()
43 * @return int start index of cigar ops on refSeq
45 public int getStart() {
50 * @return int end index (exclusive) of cigar ops on refSeq
56 * Returns sequence as a string with cigar operations applied to it
59 public String getSequenceString(char GapChar)
61 return (length == 0) ? "" :
62 (String) getSequenceAndDeletions(refseq.getSequenceAsString(start, end), GapChar)[0];
66 * recreates a gapped and edited version of RefSeq or null for an empty cigar string
69 public SequenceI getSeq(char GapChar)
72 if (refseq == null || length == 0)
76 Object[] edit_result = getSequenceAndDeletions(refseq.getSequenceAsString(start,end),
78 if (edit_result == null)
81 "Implementation Error - unexpected null from getSequenceAndDeletions");
83 int bounds[] = (int[]) edit_result[1];
84 seq = new Sequence(refseq.getName(), (String) edit_result[0],
85 refseq.getStart() + start+bounds[0],
86 refseq.getStart() + start+((bounds[2]==0) ? -1 : bounds[2]));
87 // seq.checkValidRange(); probably not needed
88 seq.setDatasetSequence(refseq);
89 seq.setDescription(refseq.getDescription());
94 We don't allow this - refseq is given at construction time only
95 public void setSeq(SequenceI seq) {
100 * internal constructor - sets seq to a gapless sequence derived from seq
101 * and prepends any 'D' operations needed to get to the first residue of seq.
102 * @param seq SequenceI
103 * @param initialDeletion true to mark initial dataset sequence residues as deleted in subsequence
104 * @param _s index of first position in seq
105 * @param _e index after last position in (possibly gapped) seq
106 * @return true if gaps are present in seq
108 private boolean _setSeq(SequenceI seq, boolean initialDeletion, int _s, int _e)
110 boolean hasgaps = false;
113 throw new Error("Implementation Error - _setSeq(null,...)");
116 throw new Error("Implementation Error: _s="+_s);
117 String seq_string = seq.getSequenceAsString();
118 if (_e==0 || _e<_s || _e>seq_string.length())
119 _e=seq_string.length();
120 // resolve start and end positions relative to ungapped reference sequence
121 start = seq.findPosition(_s)-seq.getStart();
122 end = seq.findPosition(_e)-seq.getStart();
123 int l_ungapped = end-start;
124 // Find correct sequence to reference and correct start and end - if necessary
125 SequenceI ds = seq.getDatasetSequence();
128 // make a new dataset sequence
129 String ungapped = AlignSeq.extractGaps(jalview.util.Comparison.GapChars,
130 new String(seq_string));
131 l_ungapped=ungapped.length();
132 // check that we haven't just duplicated an ungapped sequence.
133 if (l_ungapped == seq.getLength())
139 ds = new Sequence(seq.getName(), ungapped,
141 seq.getStart()+ungapped.length()-1);
142 // JBPNote: this would be consistent but may not be useful
143 // seq.setDatasetSequence(ds);
146 // add in offset between seq and the dataset sequence
147 if (ds.getStart() < seq.getStart())
149 int offset=seq.getStart()-ds.getStart();
150 if (initialDeletion) {
151 // absolute cigar string
152 addDeleted(_s+offset);
156 // normal behaviour - just mark start and end subsequence
164 // any gaps to process ?
165 if (l_ungapped!=(_e-_s))
171 if (end>ds.getLength()) {
172 throw new Error("SeqCigar: Possible implementation error: sequence is longer than dataset sequence");
173 // end = ds.getLength();
180 * directly initialise a cigar object with a sequence of range, operation pairs and a sequence to apply it to.
181 * operation and range should be relative to the seq.getStart()'th residue of the dataset seq resolved from seq.
182 * @param seq SequenceI
183 * @param operation char[]
186 public SeqCigar(SequenceI seq, char operation[], int range[])
191 throw new Error("Implementation Bug. Null seq !");
193 if (operation.length != range.length)
195 throw new Error("Implementation Bug. Cigar Operation list!= range list");
198 if (operation != null)
200 this.operation = new char[operation.length + _inc_length];
201 this.range = new int[operation.length + _inc_length];
203 if (_setSeq(seq, false, 0, 0))
205 throw new Error("NOT YET Implemented: Constructing a Cigar object from a cigar string and a gapped sequence.");
207 for (int i = this.length, j = 0; j < operation.length; i++, j++)
209 char op = operation[j];
210 if (op != M && op != I && op != D)
213 "Implementation Bug. Cigar Operation '" + j + "' '" + op +
214 "' not one of '" + M + "', '" + I + "', or '" + D + "'.");
216 this.operation[i] = op;
217 this.range[i] = range[j];
219 this.length += operation.length;
223 this.operation = null;
226 if (_setSeq(seq, false,0, 0))
228 throw new Error("NOT YET Implemented: Constructing a Cigar object from a cigar string and a gapped sequence.");
234 * add range matched residues to cigar string
237 public void addMatch(int range)
239 this.addOperation(M, range);
243 * insertion and match operations based on seq to the cigar up to
244 * the endpos column of seq.
246 * @param cigar CigarBase
247 * @param seq SequenceI
248 * @param startpos int
250 * @param initialDeletions if true then initial deletions will be added from start of seq to startpos
252 protected static void addSequenceOps(CigarBase cigar, SequenceI seq,
253 int startpos, int endpos, boolean initialDeletions)
257 int p = 0, res = seq.getLength();
259 if (!initialDeletions)
265 boolean isGap = (p < res) ? jalview.util.Comparison.isGap(seq.getCharAt(p)) : true;
266 if ( (startpos <= p) && (p <= endpos))
270 if (range > 0 && op != I)
272 cigar.addOperation(op, range);
280 if (range > 0 && op != M)
282 cigar.addOperation(op, range);
293 if (range > 0 && op != D)
295 cigar.addOperation(op, range);
303 // do nothing - insertions are not made in flanking regions
310 cigar.addOperation(op, range);
315 * create a cigar string for given sequence
316 * @param seq SequenceI
318 public SeqCigar(SequenceI seq)
323 throw new Error("Implementation error for new Cigar(SequenceI)");
325 _setSeq(seq, false, 0, 0);
326 // there is still work to do
327 addSequenceOps(this, seq, 0, seq.getLength()-1, false);
331 * Create Cigar from a range of gaps and residues on a sequence object
332 * @param seq SequenceI
333 * @param start int - first column in range
334 * @param end int - last column in range
336 public SeqCigar(SequenceI seq, int start, int end)
341 throw new Error("Implementation error for new Cigar(SequenceI)");
343 _setSeq(seq, false, start, end+1);
344 // there is still work to do
345 addSequenceOps(this, seq, start, end, false);
349 * Create a cigar object from a cigar string like '[<I|D|M><range>]+'
350 * Will fail if the given seq already contains gaps (JBPNote: future implementation will fix)
351 * @param seq SequenceI object resolvable to a dataset sequence
352 * @param cigarString String
355 public static SeqCigar parseCigar(SequenceI seq, String cigarString)
358 Object[] opsandrange = parseCigarString(cigarString);
359 return new SeqCigar(seq, (char[]) opsandrange[0], (int[]) opsandrange[1]);
365 * @param alseqs SeqCigar[]
366 * @param gapCharacter char
367 * @return SequenceI[]
369 public static SequenceI[] createAlignmentSequences(SeqCigar[] alseqs,
370 char gapCharacter, ColumnSelection colsel, int[] segments)
372 SequenceI[] seqs = new SequenceI[alseqs.length];
373 StringBuffer[] g_seqs = new StringBuffer[alseqs.length];
374 String[] alseqs_string=new String[alseqs.length];
375 Object[] gs_regions = new Object[alseqs.length];
376 for (int i = 0; i < alseqs.length; i++)
378 alseqs_string[i]=alseqs[i].getRefSeq().
379 getSequenceAsString(alseqs[i].start,alseqs[i].end);
380 gs_regions[i] = alseqs[i].getSequenceAndDeletions(alseqs_string[i], gapCharacter); // gapped sequence, {start, start col, end. endcol}, hidden regions {{start, end, col}})
381 if (gs_regions[i] == null)
383 throw new Error("Implementation error: " + i +
384 "'th sequence Cigar has no operations.");
386 g_seqs[i] = new StringBuffer( (String) ( (Object[]) gs_regions[i])[0]); // the visible gapped sequence
388 // Now account for insertions. (well - deletions)
389 // this is complicated because we must keep track of shifted positions in each sequence
390 ShiftList shifts = new ShiftList();
391 for (int i = 0; i < alseqs.length; i++)
393 Object[] gs_region = ( (Object[]) ( (Object[]) gs_regions[i])[2]);
394 if (gs_region != null)
397 for (int hr = 0; hr < gs_region.length; hr++)
399 int[] region = (int[]) gs_region[hr];
400 char[] insert = new char[region[1] - region[0] + 1];
401 for (int s = 0; s < insert.length; s++)
403 insert[s] = gapCharacter;
405 int inspos = shifts.shift(region[2]); // resolve insertion position in current alignment frame of reference
406 for (int s = 0; s < alseqs.length; s++)
410 if (g_seqs[s].length() <= inspos)
412 // prefix insertion with more gaps.
413 for (int l = inspos - g_seqs[s].length(); l > 0; l--)
415 g_seqs[s].append(gapCharacter); // to debug - use a diffferent gap character here
418 g_seqs[s].insert(inspos, insert);
422 g_seqs[s].insert(inspos,
423 alseqs_string[i].substring(region[0], region[1] + 1));
426 shifts.addShift(region[2], insert.length); // update shift in alignment frame of reference
428 // add a hidden column for this deletion
429 colsel.hideColumns(inspos, inspos+insert.length-1);
433 for (int i = 0; i < alseqs.length; i++)
435 int[] bounds = ( (int[]) ( (Object[]) gs_regions[i])[1]);
436 SequenceI ref = alseqs[i].getRefSeq();
437 seqs[i] = new Sequence(ref.getName(), g_seqs[i].toString(),
438 ref.getStart() + alseqs[i].start+bounds[0],
439 ref.getStart() + alseqs[i].start+(bounds[2]==0 ? -1 : bounds[2]));
440 seqs[i].setDatasetSequence(ref);
441 seqs[i].setDescription(ref.getDescription());
443 if (segments!=null) {
444 for (int i=0; i<segments.length; i+=3) {
445 //int start=shifts.shift(segments[i]-1)+1;
446 //int end=shifts.shift(segments[i]+segments[i+1]-1)-1;
447 colsel.hideColumns(segments[i+1], segments[i+1]+segments[i+2]-1);
454 * non rigorous testing
458 * @param seq Sequence
459 * @param ex_cs_gapped String
462 public static String testCigar_string(Sequence seq, String ex_cs_gapped)
464 SeqCigar c_sgapped = new SeqCigar(seq);
465 String cs_gapped = c_sgapped.getCigarstring();
466 if (!cs_gapped.equals(ex_cs_gapped))
468 System.err.println("Failed getCigarstring: incorect string '" + cs_gapped +
469 "' != " + ex_cs_gapped);
474 public static boolean testSeqRecovery(SeqCigar gen_sgapped,
477 // this is non-rigorous - start and end recovery is not tested.
478 SequenceI gen_sgapped_s = gen_sgapped.getSeq('-');
479 if (!gen_sgapped_s.getSequence().equals(s_gapped.getSequence()))
481 System.err.println("Couldn't reconstruct sequence.\n" +
482 gen_sgapped_s.getSequenceAsString() + "\n" +
483 s_gapped.getSequenceAsString());
489 public static void main(String argv[])
493 Sequence s = new Sequence("MySeq",
494 o_seq = "asdfktryasdtqwrtsaslldddptyipqqwaslchvhttt",
497 Sequence s_gapped = new Sequence("MySeq",
498 orig_gapped = "----asdf------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhttt",
500 String ex_cs_gapped = "4I4M6I6M3I11M4I12M4I9M";
501 s_gapped.setDatasetSequence(s);
503 Sequence s_subsequence_gapped = new Sequence("MySeq",
504 sub_gapped_s = "------ktryas---dtqwrtsasll----dddptyipqqwa----slchvh",
507 s_subsequence_gapped.setDatasetSequence(s);
508 SeqCigar c_null = new SeqCigar(s);
509 String cs_null = c_null.getCigarstring();
510 if (!cs_null.equals("42M"))
513 "Failed to recover ungapped sequence cigar operations:" +
514 ( (cs_null == "") ? "empty string" : cs_null));
516 testCigar_string(s_gapped, ex_cs_gapped);
517 SeqCigar gen_sgapped = SeqCigar.parseCigar(s, ex_cs_gapped);
518 if (!gen_sgapped.getCigarstring().equals(ex_cs_gapped))
520 System.err.println("Failed parseCigar(" + ex_cs_gapped +
521 ")->getCigarString()->'" + gen_sgapped.getCigarstring() +
524 testSeqRecovery(gen_sgapped, s_gapped);
525 // Test dataset resolution
526 SeqCigar sub_gapped = new SeqCigar(s_subsequence_gapped);
527 if (!testSeqRecovery(sub_gapped, s_subsequence_gapped))
529 System.err.println("Failed recovery for subsequence of dataset sequence");
532 if (sub_gapped.getWidth() != sub_gapped_s.length())
534 System.err.println("Failed getWidth()");
537 sub_gapped.getFullWidth();
538 if (sub_gapped.hasDeletedRegions())
540 System.err.println("hasDeletedRegions is incorrect.");
542 // Test start-end region SeqCigar
543 SeqCigar sub_se_gp = new SeqCigar(s_subsequence_gapped, 8, 48);
544 if (sub_se_gp.getWidth() != 41)
547 "SeqCigar(seq, start, end) not properly clipped alignsequence.");
549 System.out.println("Original sequence align:\n" + sub_gapped_s +
550 "\nReconstructed window from 8 to 48\n"
551 + "XXXXXXXX" + sub_se_gp.getSequenceString('-') + "..."
552 + "\nCigar String:" + sub_se_gp.getCigarstring() + "\n"
554 SequenceI ssgp = sub_se_gp.getSeq('-');
555 System.out.println("\t " + ssgp.getSequenceAsString());
556 for (int r = 0; r < 10; r++)
558 sub_se_gp = new SeqCigar(s_subsequence_gapped, 8, 48);
559 int sl = sub_se_gp.getWidth();
561 for (int rs = 0; rs < 10; rs++)
564 sub_se_gp.deleteRange(st, e);
565 String ssgapedseq = sub_se_gp.getSeq('-').getSequenceAsString();
566 System.out.println(st + "," + e + "\t:" + ssgapedseq);
571 SeqCigar[] set = new SeqCigar[]
573 new SeqCigar(s), new SeqCigar(s_subsequence_gapped, 8, 48),
574 new SeqCigar(s_gapped)};
575 Alignment al = new Alignment(set);
576 for (int i = 0; i < al.getHeight(); i++)
578 System.out.println("" + al.getSequenceAt(i).getName() + "\t" +
579 al.getSequenceAt(i).getStart() + "\t" +
580 al.getSequenceAt(i).getEnd() + "\t" +
581 al.getSequenceAt(i).getSequenceAsString());
585 System.out.println("Gapped.");
586 SeqCigar[] set = new SeqCigar[]
588 new SeqCigar(s), new SeqCigar(s_subsequence_gapped, 8, 48),
589 new SeqCigar(s_gapped)};
590 set[0].deleteRange(20, 25);
591 Alignment al = new Alignment(set);
592 for (int i = 0; i < al.getHeight(); i++)
594 System.out.println("" + al.getSequenceAt(i).getName() + "\t" +
595 al.getSequenceAt(i).getStart() + "\t" +
596 al.getSequenceAt(i).getEnd() + "\t" +
597 al.getSequenceAt(i).getSequenceAsString());
600 // if (!ssgapedseq.equals("ryas---dtqqwa----slchvh"))
601 // System.err.println("Subseqgaped\n------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhryas---dtqwrtsasll--qwa----slchvh\n"+ssgapedseq+"\n"+sub_se_gp.getCigarstring());