2 * Jalview - A Sequence Alignment Editor and Viewer
3 * Copyright (C) 2007 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
19 package jalview.datamodel;
21 import jalview.analysis.*;
22 import jalview.util.*;
28 * start(inclusive) and end(exclusive) of subsequence on refseq
30 private int start, end;
31 private SequenceI refseq = null;
33 * Reference dataset sequence for the cigar string
36 public SequenceI getRefSeq()
43 * @return int start index of cigar ops on refSeq
52 * @return int end index (exclusive) of cigar ops on refSeq
60 * Returns sequence as a string with cigar operations applied to it
63 public String getSequenceString(char GapChar)
65 return (length == 0) ? "" :
66 (String) getSequenceAndDeletions(refseq.getSequenceAsString(start, end),
71 * recreates a gapped and edited version of RefSeq or null for an empty cigar string
74 public SequenceI getSeq(char GapChar)
77 if (refseq == null || length == 0)
81 Object[] edit_result = getSequenceAndDeletions(refseq.getSequenceAsString(
84 if (edit_result == null)
87 "Implementation Error - unexpected null from getSequenceAndDeletions");
89 int bounds[] = (int[]) edit_result[1];
90 seq = new Sequence(refseq.getName(), (String) edit_result[0],
91 refseq.getStart() + start + bounds[0],
92 refseq.getStart() + start +
93 ( (bounds[2] == 0) ? -1 : bounds[2]));
94 // seq.checkValidRange(); probably not needed
95 seq.setDatasetSequence(refseq);
96 seq.setDescription(refseq.getDescription());
101 We don't allow this - refseq is given at construction time only
102 public void setSeq(SequenceI seq) {
107 * internal constructor - sets seq to a gapless sequence derived from seq
108 * and prepends any 'D' operations needed to get to the first residue of seq.
109 * @param seq SequenceI
110 * @param initialDeletion true to mark initial dataset sequence residues as deleted in subsequence
111 * @param _s index of first position in seq
112 * @param _e index after last position in (possibly gapped) seq
113 * @return true if gaps are present in seq
115 private boolean _setSeq(SequenceI seq, boolean initialDeletion, int _s,
118 boolean hasgaps = false;
121 throw new Error("Implementation Error - _setSeq(null,...)");
125 throw new Error("Implementation Error: _s=" + _s);
127 String seq_string = seq.getSequenceAsString();
128 if (_e == 0 || _e < _s || _e > seq_string.length())
130 _e = seq_string.length();
132 // resolve start and end positions relative to ungapped reference sequence
133 start = seq.findPosition(_s) - seq.getStart();
134 end = seq.findPosition(_e) - seq.getStart();
135 int l_ungapped = end - start;
136 // Find correct sequence to reference and correct start and end - if necessary
137 SequenceI ds = seq.getDatasetSequence();
140 // make a new dataset sequence
141 String ungapped = AlignSeq.extractGaps(jalview.util.Comparison.GapChars,
142 new String(seq_string));
143 l_ungapped = ungapped.length();
144 // check that we haven't just duplicated an ungapped sequence.
145 if (l_ungapped == seq.getLength())
151 ds = new Sequence(seq.getName(), ungapped,
153 seq.getStart() + ungapped.length() - 1);
154 // JBPNote: this would be consistent but may not be useful
155 // seq.setDatasetSequence(ds);
158 // add in offset between seq and the dataset sequence
159 if (ds.getStart() < seq.getStart())
161 int offset = seq.getStart() - ds.getStart();
164 // absolute cigar string
165 addDeleted(_s + offset);
171 // normal behaviour - just mark start and end subsequence
179 // any gaps to process ?
180 if (l_ungapped != (_e - _s))
188 if (end > ds.getLength())
190 throw new Error("SeqCigar: Possible implementation error: sequence is longer than dataset sequence");
191 // end = ds.getLength();
198 * directly initialise a cigar object with a sequence of range, operation pairs and a sequence to apply it to.
199 * operation and range should be relative to the seq.getStart()'th residue of the dataset seq resolved from seq.
200 * @param seq SequenceI
201 * @param operation char[]
204 public SeqCigar(SequenceI seq, char operation[], int range[])
209 throw new Error("Implementation Bug. Null seq !");
211 if (operation.length != range.length)
213 throw new Error("Implementation Bug. Cigar Operation list!= range list");
216 if (operation != null)
218 this.operation = new char[operation.length + _inc_length];
219 this.range = new int[operation.length + _inc_length];
221 if (_setSeq(seq, false, 0, 0))
223 throw new Error("NOT YET Implemented: Constructing a Cigar object from a cigar string and a gapped sequence.");
225 for (int i = this.length, j = 0; j < operation.length; i++, j++)
227 char op = operation[j];
228 if (op != M && op != I && op != D)
231 "Implementation Bug. Cigar Operation '" + j + "' '" + op +
232 "' not one of '" + M + "', '" + I + "', or '" + D + "'.");
234 this.operation[i] = op;
235 this.range[i] = range[j];
237 this.length += operation.length;
241 this.operation = null;
244 if (_setSeq(seq, false, 0, 0))
246 throw new Error("NOT YET Implemented: Constructing a Cigar object from a cigar string and a gapped sequence.");
252 * add range matched residues to cigar string
255 public void addMatch(int range)
257 this.addOperation(M, range);
262 * insertion and match operations based on seq to the cigar up to
263 * the endpos column of seq.
265 * @param cigar CigarBase
266 * @param seq SequenceI
267 * @param startpos int
269 * @param initialDeletions if true then initial deletions will be added from start of seq to startpos
271 protected static void addSequenceOps(CigarBase cigar, SequenceI seq,
272 int startpos, int endpos,
273 boolean initialDeletions)
277 int p = 0, res = seq.getLength();
279 if (!initialDeletions)
286 boolean isGap = (p < res) ? jalview.util.Comparison.isGap(seq.getCharAt(p)) : true;
287 if ( (startpos <= p) && (p <= endpos))
291 if (range > 0 && op != I)
293 cigar.addOperation(op, range);
301 if (range > 0 && op != M)
303 cigar.addOperation(op, range);
314 if (range > 0 && op != D)
316 cigar.addOperation(op, range);
324 // do nothing - insertions are not made in flanking regions
331 cigar.addOperation(op, range);
336 * create a cigar string for given sequence
337 * @param seq SequenceI
339 public SeqCigar(SequenceI seq)
344 throw new Error("Implementation error for new Cigar(SequenceI)");
346 _setSeq(seq, false, 0, 0);
347 // there is still work to do
348 addSequenceOps(this, seq, 0, seq.getLength() - 1, false);
352 * Create Cigar from a range of gaps and residues on a sequence object
353 * @param seq SequenceI
354 * @param start int - first column in range
355 * @param end int - last column in range
357 public SeqCigar(SequenceI seq, int start, int end)
362 throw new Error("Implementation error for new Cigar(SequenceI)");
364 _setSeq(seq, false, start, end + 1);
365 // there is still work to do
366 addSequenceOps(this, seq, start, end, false);
370 * Create a cigar object from a cigar string like '[<I|D|M><range>]+'
371 * Will fail if the given seq already contains gaps (JBPNote: future implementation will fix)
372 * @param seq SequenceI object resolvable to a dataset sequence
373 * @param cigarString String
376 public static SeqCigar parseCigar(SequenceI seq, String cigarString)
379 Object[] opsandrange = parseCigarString(cigarString);
380 return new SeqCigar(seq, (char[]) opsandrange[0], (int[]) opsandrange[1]);
386 * @param alseqs SeqCigar[]
387 * @param gapCharacter char
388 * @return SequenceI[]
390 public static SequenceI[] createAlignmentSequences(SeqCigar[] alseqs,
391 char gapCharacter, ColumnSelection colsel, int[] segments)
393 SequenceI[] seqs = new SequenceI[alseqs.length];
394 StringBuffer[] g_seqs = new StringBuffer[alseqs.length];
395 String[] alseqs_string = new String[alseqs.length];
396 Object[] gs_regions = new Object[alseqs.length];
397 for (int i = 0; i < alseqs.length; i++)
399 alseqs_string[i] = alseqs[i].getRefSeq().
400 getSequenceAsString(alseqs[i].start, alseqs[i].end);
401 gs_regions[i] = alseqs[i].getSequenceAndDeletions(alseqs_string[i],
402 gapCharacter); // gapped sequence, {start, start col, end. endcol}, hidden regions {{start, end, col}})
403 if (gs_regions[i] == null)
405 throw new Error("Implementation error: " + i +
406 "'th sequence Cigar has no operations.");
408 g_seqs[i] = new StringBuffer( (String) ( (Object[]) gs_regions[i])[0]); // the visible gapped sequence
410 // Now account for insertions. (well - deletions)
411 // this is complicated because we must keep track of shifted positions in each sequence
412 ShiftList shifts = new ShiftList();
413 for (int i = 0; i < alseqs.length; i++)
415 Object[] gs_region = ( (Object[]) ( (Object[]) gs_regions[i])[2]);
416 if (gs_region != null)
419 for (int hr = 0; hr < gs_region.length; hr++)
421 int[] region = (int[]) gs_region[hr];
422 char[] insert = new char[region[1] - region[0] + 1];
423 for (int s = 0; s < insert.length; s++)
425 insert[s] = gapCharacter;
427 int inspos = shifts.shift(region[2]); // resolve insertion position in current alignment frame of reference
428 for (int s = 0; s < alseqs.length; s++)
432 if (g_seqs[s].length() <= inspos)
434 // prefix insertion with more gaps.
435 for (int l = inspos - g_seqs[s].length(); l > 0; l--)
437 g_seqs[s].append(gapCharacter); // to debug - use a diffferent gap character here
440 g_seqs[s].insert(inspos, insert);
444 g_seqs[s].insert(inspos,
445 alseqs_string[i].substring(region[0],
449 shifts.addShift(region[2], insert.length); // update shift in alignment frame of reference
450 if (segments == null)
452 // add a hidden column for this deletion
453 colsel.hideColumns(inspos, inspos + insert.length - 1);
458 for (int i = 0; i < alseqs.length; i++)
460 int[] bounds = ( (int[]) ( (Object[]) gs_regions[i])[1]);
461 SequenceI ref = alseqs[i].getRefSeq();
462 seqs[i] = new Sequence(ref.getName(), g_seqs[i].toString(),
463 ref.getStart() + alseqs[i].start + bounds[0],
464 ref.getStart() + alseqs[i].start +
465 (bounds[2] == 0 ? -1 : bounds[2]));
466 seqs[i].setDatasetSequence(ref);
467 seqs[i].setDescription(ref.getDescription());
469 if (segments != null)
471 for (int i = 0; i < segments.length; i += 3)
473 //int start=shifts.shift(segments[i]-1)+1;
474 //int end=shifts.shift(segments[i]+segments[i+1]-1)-1;
475 colsel.hideColumns(segments[i + 1],
476 segments[i + 1] + segments[i + 2] - 1);
483 * non rigorous testing
487 * @param seq Sequence
488 * @param ex_cs_gapped String
491 public static String testCigar_string(Sequence seq, String ex_cs_gapped)
493 SeqCigar c_sgapped = new SeqCigar(seq);
494 String cs_gapped = c_sgapped.getCigarstring();
495 if (!cs_gapped.equals(ex_cs_gapped))
497 System.err.println("Failed getCigarstring: incorect string '" + cs_gapped +
498 "' != " + ex_cs_gapped);
503 public static boolean testSeqRecovery(SeqCigar gen_sgapped,
506 // this is non-rigorous - start and end recovery is not tested.
507 SequenceI gen_sgapped_s = gen_sgapped.getSeq('-');
508 if (!gen_sgapped_s.getSequence().equals(s_gapped.getSequence()))
510 System.err.println("Couldn't reconstruct sequence.\n" +
511 gen_sgapped_s.getSequenceAsString() + "\n" +
512 s_gapped.getSequenceAsString());
518 public static void main(String argv[])
522 Sequence s = new Sequence("MySeq",
524 "asdfktryasdtqwrtsaslldddptyipqqwaslchvhttt",
527 Sequence s_gapped = new Sequence("MySeq",
529 "----asdf------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhttt",
531 String ex_cs_gapped = "4I4M6I6M3I11M4I12M4I9M";
532 s_gapped.setDatasetSequence(s);
534 Sequence s_subsequence_gapped = new Sequence("MySeq",
536 "------ktryas---dtqwrtsasll----dddptyipqqwa----slchvh",
539 s_subsequence_gapped.setDatasetSequence(s);
540 SeqCigar c_null = new SeqCigar(s);
541 String cs_null = c_null.getCigarstring();
542 if (!cs_null.equals("42M"))
545 "Failed to recover ungapped sequence cigar operations:" +
546 ( (cs_null == "") ? "empty string" : cs_null));
548 testCigar_string(s_gapped, ex_cs_gapped);
549 SeqCigar gen_sgapped = SeqCigar.parseCigar(s, ex_cs_gapped);
550 if (!gen_sgapped.getCigarstring().equals(ex_cs_gapped))
552 System.err.println("Failed parseCigar(" + ex_cs_gapped +
553 ")->getCigarString()->'" + gen_sgapped.getCigarstring() +
556 testSeqRecovery(gen_sgapped, s_gapped);
557 // Test dataset resolution
558 SeqCigar sub_gapped = new SeqCigar(s_subsequence_gapped);
559 if (!testSeqRecovery(sub_gapped, s_subsequence_gapped))
561 System.err.println("Failed recovery for subsequence of dataset sequence");
564 if (sub_gapped.getWidth() != sub_gapped_s.length())
566 System.err.println("Failed getWidth()");
569 sub_gapped.getFullWidth();
570 if (sub_gapped.hasDeletedRegions())
572 System.err.println("hasDeletedRegions is incorrect.");
574 // Test start-end region SeqCigar
575 SeqCigar sub_se_gp = new SeqCigar(s_subsequence_gapped, 8, 48);
576 if (sub_se_gp.getWidth() != 41)
579 "SeqCigar(seq, start, end) not properly clipped alignsequence.");
581 System.out.println("Original sequence align:\n" + sub_gapped_s +
582 "\nReconstructed window from 8 to 48\n"
583 + "XXXXXXXX" + sub_se_gp.getSequenceString('-') + "..."
584 + "\nCigar String:" + sub_se_gp.getCigarstring() + "\n"
586 SequenceI ssgp = sub_se_gp.getSeq('-');
587 System.out.println("\t " + ssgp.getSequenceAsString());
588 for (int r = 0; r < 10; r++)
590 sub_se_gp = new SeqCigar(s_subsequence_gapped, 8, 48);
591 int sl = sub_se_gp.getWidth();
593 for (int rs = 0; rs < 10; rs++)
596 sub_se_gp.deleteRange(st, e);
597 String ssgapedseq = sub_se_gp.getSeq('-').getSequenceAsString();
598 System.out.println(st + "," + e + "\t:" + ssgapedseq);
603 SeqCigar[] set = new SeqCigar[]
605 new SeqCigar(s), new SeqCigar(s_subsequence_gapped, 8, 48),
606 new SeqCigar(s_gapped)};
607 Alignment al = new Alignment(set);
608 for (int i = 0; i < al.getHeight(); i++)
610 System.out.println("" + al.getSequenceAt(i).getName() + "\t" +
611 al.getSequenceAt(i).getStart() + "\t" +
612 al.getSequenceAt(i).getEnd() + "\t" +
613 al.getSequenceAt(i).getSequenceAsString());
617 System.out.println("Gapped.");
618 SeqCigar[] set = new SeqCigar[]
620 new SeqCigar(s), new SeqCigar(s_subsequence_gapped, 8, 48),
621 new SeqCigar(s_gapped)};
622 set[0].deleteRange(20, 25);
623 Alignment al = new Alignment(set);
624 for (int i = 0; i < al.getHeight(); i++)
626 System.out.println("" + al.getSequenceAt(i).getName() + "\t" +
627 al.getSequenceAt(i).getStart() + "\t" +
628 al.getSequenceAt(i).getEnd() + "\t" +
629 al.getSequenceAt(i).getSequenceAsString());
632 // if (!ssgapedseq.equals("ryas---dtqqwa----slchvh"))
633 // System.err.println("Subseqgaped\n------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhryas---dtqwrtsasll--qwa----slchvh\n"+ssgapedseq+"\n"+sub_se_gp.getCigarstring());