2 * Jalview - A Sequence Alignment Editor and Viewer (Development Version 2.4.1)
3 * Copyright (C) 2009 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
19 package jalview.datamodel;
21 import java.util.Hashtable;
23 import jalview.analysis.*;
24 import jalview.util.*;
26 public class SeqCigar extends CigarSimple
29 * start(inclusive) and end(exclusive) of subsequence on refseq
31 private int start, end;
33 private SequenceI refseq = null;
35 private Hashtable seqProps;
38 * Reference dataset sequence for the cigar string
42 public SequenceI getRefSeq()
49 * @return int start index of cigar ops on refSeq
58 * @return int end index (exclusive) of cigar ops on refSeq
66 * Returns sequence as a string with cigar operations applied to it
70 public String getSequenceString(char GapChar)
// An empty cigar (length == 0) yields ""; otherwise the result is element [0]
// of getSequenceAndDeletions() applied to the refseq residues in [start, end).
72 return (length == 0) ? "" : (String) getSequenceAndDeletions(refseq
73 .getSequenceAsString(start, end), GapChar)[0];
77 * recreates a gapped and edited version of RefSeq or null for an empty cigar
82 public SequenceI getSeq(char GapChar)
// Guard for a missing reference sequence or an empty cigar. The action taken
// is not visible in this excerpt - presumably null is returned; confirm.
85 if (refseq == null || length == 0)
// Apply the cigar operations to the referenced residues refseq[start, end).
89 Object[] edit_result = getSequenceAndDeletions(refseq
90 .getSequenceAsString(start, end), GapChar);
91 if (edit_result == null)
94 "Implementation Error - unexpected null from getSequenceAndDeletions");
// edit_result[0] is the edited sequence string; edit_result[1] is an int[] of
// bounds. bounds[0] offsets the new sequence's start and bounds[2] its end,
// with a bounds[2] of 0 being replaced by -1.
96 int bounds[] = (int[]) edit_result[1];
97 seq = new Sequence(refseq.getName(), (String) edit_result[0], refseq
99 + start + bounds[0], refseq.getStart() + start
100 + ((bounds[2] == 0) ? -1 : bounds[2]));
101 seq.setDescription(refseq.getDescription());
// Remember the start/end just computed, before stored properties are
// re-applied to seq below.
102 int sstart = seq.getStart(), send = seq.getEnd();
103 // seq.checkValidRange(); probably not needed
104 // recover local properties if present
105 if (seqProps != null)
107 // this recovers dataset sequence reference as well as local features,
108 // names, start/end settings.
109 SeqsetUtils.SeqCharacterUnhash(seq, seqProps);
111 // ensure dataset sequence is up to date from local reference
112 seq.setDatasetSequence(refseq);
// Put back the start position derived from the cigar bounds.
// NOTE(review): the matching use of send is not visible in this excerpt.
113 seq.setStart(sstart);
119 * We don't allow this - refseq is given at construction time only public void
120 * setSeq(SequenceI seq) { this.seq = seq; }
123 * internal constructor - sets seq to a gapless sequence derived from seq and
124 * prepends any 'D' operations needed to get to the first residue of seq.
128 * @param initialDeletion
129 * true to mark initial dataset sequence residues as deleted in
132 * index of first position in seq
134 * index after last position in (possibly gapped) seq
135 * @return true if gaps are present in seq
137 private boolean _setSeq(SequenceI seq, boolean initialDeletion, int _s,
140 boolean hasgaps = false;
// A null seq is treated as a programming error.
143 throw new Error("Implementation Error - _setSeq(null,...)");
// _s is rejected here; the guarding condition is not visible in this excerpt.
147 throw new Error("Implementation Error: _s=" + _s);
149 String seq_string = seq.getSequenceAsString();
// An _e that is 0, smaller than _s, or beyond the sequence string falls back
// to the full length of the sequence string.
150 if (_e == 0 || _e < _s || _e > seq_string.length())
152 _e = seq_string.length();
154 // resolve start and end positions relative to ungapped reference sequence
155 start = seq.findPosition(_s) - seq.getStart();
156 end = seq.findPosition(_e) - seq.getStart();
157 int l_ungapped = end - start;
158 // Find correct sequence to reference and correct start and end - if
160 SequenceI ds = seq.getDatasetSequence();
// The condition guarding the block below is not visible - presumably it runs
// when seq has no dataset sequence (ds == null); confirm.
163 // make a new dataset sequence
164 String ungapped = AlignSeq.extractGaps(
165 jalview.util.Comparison.GapChars, new String(seq_string));
166 l_ungapped = ungapped.length();
167 // check that we haven't just duplicated an ungapped sequence.
168 if (l_ungapped == seq.getLength())
// Build a dataset sequence from the ungapped residues, starting at
// seq.getStart().
174 ds = new Sequence(seq.getName(), ungapped, seq.getStart(), seq
176 + ungapped.length() - 1);
177 // JBPNote: this would be consistent but may not be useful
178 // seq.setDatasetSequence(ds);
181 // add in offset between seq and the dataset sequence
182 if (ds.getStart() < seq.getStart())
184 int offset = seq.getStart() - ds.getStart();
187 // absolute cigar string
188 addDeleted(_s + offset);
194 // normal behaviour - just mark start and end subsequence
202 // any gaps to process ?
// The ungapped residue count differs from the number of columns in [_s, _e),
// so the range contains gaps. Presumably hasgaps is set in the line missing
// from this excerpt; confirm.
203 if (l_ungapped != (_e - _s))
209 // copy over local properties for the sequence instance of the refseq
210 seqProps = SeqsetUtils.SeqCharacterHash(seq);
// Sanity check: the resolved end lies beyond the dataset sequence.
212 if (end > ds.getLength())
215 "SeqCigar: Possible implementation error: sequence is longer than dataset sequence");
216 // end = ds.getLength();
223 * directly initialise a cigar object with a sequence of range, operation
224 * pairs and a sequence to apply it to. operation and range should be relative
225 * to the seq.getStart()'th residue of the dataset seq resolved from seq.
234 public SeqCigar(SequenceI seq, char operation[], int range[])
239 throw new Error("Implementation Bug. Null seq !");
// NOTE(review): operation.length is dereferenced here, before the
// "operation != null" test further down. A null operation array would
// therefore fail at this line, and the null-handling branch below
// (this.operation = null) looks unreachable as written - confirm.
241 if (operation.length != range.length)
244 "Implementation Bug. Cigar Operation list!= range list");
247 if (operation != null)
// Allocate room for the supplied operations plus _inc_length spare slots.
249 this.operation = new char[operation.length + _inc_length];
250 this.range = new int[operation.length + _inc_length];
// _setSeq returning true is treated as unsupported here (see the message).
252 if (_setSeq(seq, false, 0, 0))
255 "NOT YET Implemented: Constructing a Cigar object from a cigar string and a gapped sequence.");
// Copy the supplied operations in after whatever _setSeq already recorded:
// i starts at the current this.length, j walks the caller's arrays.
257 for (int i = this.length, j = 0; j < operation.length; i++, j++)
259 char op = operation[j];
// Only the M, I and D operation codes are accepted.
260 if (op != M && op != I && op != D)
262 throw new Error("Implementation Bug. Cigar Operation '" + j
263 + "' '" + op + "' not one of '" + M + "', '" + I
264 + "', or '" + D + "'.");
266 this.operation[i] = op;
267 this.range[i] = range[j];
269 this.length += operation.length;
// Branch for a null operation list (see the review note at the top).
273 this.operation = null;
276 if (_setSeq(seq, false, 0, 0))
279 "NOT YET Implemented: Constructing a Cigar object from a cigar string and a gapped sequence.");
285 * add range matched residues to cigar string
290 public void addMatch(int range)
// Delegates to addOperation with the match operation code M.
292 this.addOperation(M, range);
296 * Adds insertion and match operations based on seq to the cigar up to the
297 * endpos column of seq.
307 * @param initialDeletions
308 * if true then initial deletions will be added from start of
311 protected static void addSequenceOps(CigarBase cigar, SequenceI seq,
312 int startpos, int endpos, boolean initialDeletions)
// p is the current column; res is the number of columns in seq.
// NOTE(review): the loop header and the declarations/updates of op and range
// are not visible in this excerpt.
316 int p = 0, res = seq.getLength();
318 if (!initialDeletions)
// Columns at or beyond the end of seq (p >= res) are treated as gaps.
325 boolean isGap = (p < res) ? jalview.util.Comparison.isGap(seq
326 .getCharAt(p)) : true;
// Column p lies inside the requested window; both startpos and endpos are
// inclusive.
327 if ((startpos <= p) && (p <= endpos))
// Each of the three guarded calls below emits the pending (op, range) run
// when range > 0 and the pending op is not I, M or D respectively.
331 if (range > 0 && op != I)
333 cigar.addOperation(op, range);
341 if (range > 0 && op != M)
343 cigar.addOperation(op, range);
354 if (range > 0 && op != D)
356 cigar.addOperation(op, range);
364 // do nothing - insertions are not made in flanking regions
// Final emit of the pending run; any guard around it is not visible here.
371 cigar.addOperation(op, range);
376 * create a cigar string for given sequence
381 public SeqCigar(SequenceI seq)
// The guarding condition is not visible in this excerpt - presumably
// seq == null; confirm.
386 throw new Error("Implementation error for new Cigar(SequenceI)");
// _s = 0 and _e = 0: _setSeq replaces an _e of 0 with the full length of the
// sequence string.
388 _setSeq(seq, false, 0, 0);
389 // there is still work to do
// Record operations for columns 0 .. seq.getLength() - 1, with
// initialDeletions set to false.
390 addSequenceOps(this, seq, 0, seq.getLength() - 1, false);
394 * Create Cigar from a range of gaps and residues on a sequence object
399 * int - first column in range
401 * int - last column in range
403 public SeqCigar(SequenceI seq, int start, int end)
// The guarding condition is not visible in this excerpt - presumably
// seq == null; confirm.
408 throw new Error("Implementation error for new Cigar(SequenceI)");
// end is the last column of the range, so end + 1 is passed to _setSeq as the
// index after the last position.
410 _setSeq(seq, false, start, end + 1);
411 // there is still work to do
// addSequenceOps takes the same start and end columns unchanged.
412 addSequenceOps(this, seq, start, end, false);
416 * Create a cigar object from a cigar string like '[<I|D|M><range>]+' Will
417 * fail if the given seq already contains gaps (JBPNote: future implementation
421 * SequenceI object resolvable to a dataset sequence
426 public static SeqCigar parseCigar(SequenceI seq, String cigarString)
// parseCigarString's result is used as a two-element array: [0] is cast to the
// operation chars and [1] to the matching int ranges.
429 Object[] opsandrange = parseCigarString(cigarString);
430 return new SeqCigar(seq, (char[]) opsandrange[0],
431 (int[]) opsandrange[1]);
439 * @param gapCharacter
441 * @return SequenceI[]
443 public static SequenceI[] createAlignmentSequences(SeqCigar[] alseqs,
444 char gapCharacter, ColumnSelection colsel, int[] segments)
// Parallel arrays, one slot per input cigar: the finished sequences, the
// editable gapped strings, the raw referenced residues, and the result of
// getSequenceAndDeletions().
446 SequenceI[] seqs = new SequenceI[alseqs.length];
447 StringBuffer[] g_seqs = new StringBuffer[alseqs.length];
448 String[] alseqs_string = new String[alseqs.length];
449 Object[] gs_regions = new Object[alseqs.length];
// Pass 1: pull the residues each cigar covers from its reference sequence and
// apply the cigar operations to them.
450 for (int i = 0; i < alseqs.length; i++)
452 alseqs_string[i] = alseqs[i].getRefSeq().getSequenceAsString(
453 alseqs[i].start, alseqs[i].end);
454 gs_regions[i] = alseqs[i].getSequenceAndDeletions(alseqs_string[i],
455 gapCharacter); // gapped sequence, {start, start col, end.
456 // endcol}, hidden regions {{start, end, col}})
457 if (gs_regions[i] == null)
459 throw new Error("Implementation error: " + i
460 + "'th sequence Cigar has no operations.");
// Element [0] of the result is the gapped sequence string.
462 g_seqs[i] = new StringBuffer((String) ((Object[]) gs_regions[i])[0]); // the
467 // Now account for insertions. (well - deletions)
468 // this is complicated because we must keep track of shifted positions in
470 ShiftList shifts = new ShiftList();
// Pass 2: walk the regions in element [2] of each sequence's result.
471 for (int i = 0; i < alseqs.length; i++)
473 Object[] gs_region = ((Object[]) ((Object[]) gs_regions[i])[2]);
474 if (gs_region != null)
477 for (int hr = 0; hr < gs_region.length; hr++)
// region[0]..region[1] (inclusive) index into alseqs_string[i]; region[2] is
// the column that is passed through the shift list.
479 int[] region = (int[]) gs_region[hr];
// A run of gap characters as wide as the region.
480 char[] insert = new char[region[1] - region[0] + 1];
481 for (int s = 0; s < insert.length; s++)
483 insert[s] = gapCharacter;
485 int inspos = shifts.shift(region[2]); // resolve insertion position in
486 // current alignment frame of
// Apply the insertion to every sequence in the set.
488 for (int s = 0; s < alseqs.length; s++)
492 if (g_seqs[s].length() <= inspos)
494 // prefix insertion with more gaps.
495 for (int l = inspos - g_seqs[s].length(); l > 0; l--)
497 g_seqs[s].append(gapCharacter); // to debug - use a different
498 // gap character here
// The gap run is inserted here, while the call after it inserts the actual
// residues of sequence i. The condition choosing between the two is not
// visible - presumably s != i versus s == i; confirm.
501 g_seqs[s].insert(inspos, insert);
505 g_seqs[s].insert(inspos, alseqs_string[i].substring(
506 region[0], region[1] + 1));
509 shifts.addShift(region[2], insert.length); // update shift in
510 // alignment frame of
512 if (segments == null)
514 // add a hidden column for this deletion
// NOTE(review): colsel is dereferenced with no null check visible here -
// confirm callers always supply one.
515 colsel.hideColumns(inspos, inspos + insert.length - 1);
// Pass 3: wrap each edited string in a Sequence. Start and end are derived from
// the reference start, the cigar start and bounds[0] / bounds[2], with a
// bounds[2] of 0 being replaced by -1.
520 for (int i = 0; i < alseqs.length; i++)
522 int[] bounds = ((int[]) ((Object[]) gs_regions[i])[1]);
523 SequenceI ref = alseqs[i].getRefSeq();
524 seqs[i] = new Sequence(ref.getName(), g_seqs[i].toString(), ref
526 + alseqs[i].start + bounds[0], ref.getStart()
527 + alseqs[i].start + (bounds[2] == 0 ? -1 : bounds[2]));
528 seqs[i].setDatasetSequence(ref);
529 seqs[i].setDescription(ref.getDescription());
531 if (segments != null)
// segments is read in strides of three: [i + 1] is the first column to hide
// and [i + 2] the number of columns.
// NOTE(review): assumes segments.length is a multiple of 3; otherwise the
// [i + 1] / [i + 2] reads run past the end of the array.
533 for (int i = 0; i < segments.length; i += 3)
535 // int start=shifts.shift(segments[i]-1)+1;
536 // int end=shifts.shift(segments[i]+segments[i+1]-1)-1;
537 colsel.hideColumns(segments[i + 1], segments[i + 1]
538 + segments[i + 2] - 1);
545 * non rigorous testing
551 * @param ex_cs_gapped
555 public static String testCigar_string(Sequence seq, String ex_cs_gapped)
// Build a cigar from seq and compare its string form with the expected one.
557 SeqCigar c_sgapped = new SeqCigar(seq);
558 String cs_gapped = c_sgapped.getCigarstring();
559 if (!cs_gapped.equals(ex_cs_gapped))
// A mismatch is reported on stderr.
561 System.err.println("Failed getCigarstring: incorect string '"
562 + cs_gapped + "' != " + ex_cs_gapped);
567 public static boolean testSeqRecovery(SeqCigar gen_sgapped,
570 // this is non-rigorous - start and end recovery is not tested.
// Rebuild the gapped sequence from the cigar, using '-' as the gap character.
571 SequenceI gen_sgapped_s = gen_sgapped.getSeq('-');
// NOTE(review): if getSequence() returns an array rather than a String,
// equals() compares references and this check would always report a mismatch -
// confirm the return type. The message below compares the
// getSequenceAsString() forms.
572 if (!gen_sgapped_s.getSequence().equals(s_gapped.getSequence()))
574 System.err.println("Couldn't reconstruct sequence.\n"
575 + gen_sgapped_s.getSequenceAsString() + "\n"
576 + s_gapped.getSequenceAsString());
582 public static void main(String argv[]) throws Exception
// Ungapped reference sequence: 42 residues, numbered 39..80.
585 Sequence s = new Sequence("MySeq",
586 o_seq = "asdfktryasdtqwrtsaslldddptyipqqwaslchvhttt", 39, 80);
// Gapped version of the same residues, plus the cigar string expected for it.
588 Sequence s_gapped = new Sequence(
590 orig_gapped = "----asdf------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhttt",
592 String ex_cs_gapped = "4I4M6I6M3I11M4I12M4I9M";
593 s_gapped.setDatasetSequence(s);
// Gapped sequence that also uses s as its dataset sequence.
595 Sequence s_subsequence_gapped = new Sequence(
597 sub_gapped_s = "------ktryas---dtqwrtsasll----dddptyipqqwa----slchvh",
600 s_subsequence_gapped.setDatasetSequence(s);
// The ungapped sequence should give a single 42-residue match operation.
601 SeqCigar c_null = new SeqCigar(s);
602 String cs_null = c_null.getCigarstring();
603 if (!cs_null.equals("42M"))
// NOTE(review): cs_null == "" is a reference comparison, not a content
// comparison; it only affects the wording of this diagnostic.
606 .println("Failed to recover ungapped sequence cigar operations:"
607 + ((cs_null == "") ? "empty string" : cs_null));
609 testCigar_string(s_gapped, ex_cs_gapped);
// Round trip: parse the expected cigar string and check that it prints back
// unchanged.
610 SeqCigar gen_sgapped = SeqCigar.parseCigar(s, ex_cs_gapped);
611 if (!gen_sgapped.getCigarstring().equals(ex_cs_gapped))
613 System.err.println("Failed parseCigar(" + ex_cs_gapped
614 + ")->getCigarString()->'" + gen_sgapped.getCigarstring()
617 testSeqRecovery(gen_sgapped, s_gapped);
618 // Test dataset resolution
619 SeqCigar sub_gapped = new SeqCigar(s_subsequence_gapped);
620 if (!testSeqRecovery(sub_gapped, s_subsequence_gapped))
623 .println("Failed recovery for subsequence of dataset sequence");
// The cigar width should equal the gapped string's length.
626 if (sub_gapped.getWidth() != sub_gapped_s.length())
628 System.err.println("Failed getWidth()");
631 sub_gapped.getFullWidth();
632 if (sub_gapped.hasDeletedRegions())
634 System.err.println("hasDeletedRegions is incorrect.");
636 // Test start-end region SeqCigar
// Columns 8..48 inclusive, so a width of 41 is expected.
637 SeqCigar sub_se_gp = new SeqCigar(s_subsequence_gapped, 8, 48);
638 if (sub_se_gp.getWidth() != 41)
641 .println("SeqCigar(seq, start, end) not properly clipped alignsequence.");
643 System.out.println("Original sequence align:\n" + sub_gapped_s
644 + "\nReconstructed window from 8 to 48\n" + "XXXXXXXX"
645 + sub_se_gp.getSequenceString('-') + "..." + "\nCigar String:"
646 + sub_se_gp.getCigarstring() + "\n");
647 SequenceI ssgp = sub_se_gp.getSeq('-');
648 System.out.println("\t " + ssgp.getSequenceAsString());
// Repeated deleteRange() trials on a fresh copy of the windowed cigar. The
// definitions of st and e are not visible in this excerpt.
649 for (int r = 0; r < 10; r++)
651 sub_se_gp = new SeqCigar(s_subsequence_gapped, 8, 48);
652 int sl = sub_se_gp.getWidth();
654 for (int rs = 0; rs < 10; rs++)
657 sub_se_gp.deleteRange(st, e);
658 String ssgapedseq = sub_se_gp.getSeq('-').getSequenceAsString();
659 System.out.println(st + "," + e + "\t:" + ssgapedseq);
// Build an alignment from three cigars and print name, start, end and sequence
// for each row.
664 SeqCigar[] set = new SeqCigar[]
665 { new SeqCigar(s), new SeqCigar(s_subsequence_gapped, 8, 48),
666 new SeqCigar(s_gapped) };
667 Alignment al = new Alignment(set);
668 for (int i = 0; i < al.getHeight(); i++)
670 System.out.println("" + al.getSequenceAt(i).getName() + "\t"
671 + al.getSequenceAt(i).getStart() + "\t"
672 + al.getSequenceAt(i).getEnd() + "\t"
673 + al.getSequenceAt(i).getSequenceAsString());
// Same again, but with columns 20..25 deleted from the first cigar beforehand.
// The second "set"/"al" declarations are presumably in their own scope; the
// braces are not visible in this excerpt.
677 System.out.println("Gapped.");
678 SeqCigar[] set = new SeqCigar[]
679 { new SeqCigar(s), new SeqCigar(s_subsequence_gapped, 8, 48),
680 new SeqCigar(s_gapped) };
681 set[0].deleteRange(20, 25);
682 Alignment al = new Alignment(set);
683 for (int i = 0; i < al.getHeight(); i++)
685 System.out.println("" + al.getSequenceAt(i).getName() + "\t"
686 + al.getSequenceAt(i).getStart() + "\t"
687 + al.getSequenceAt(i).getEnd() + "\t"
688 + al.getSequenceAt(i).getSequenceAsString());
691 // if (!ssgapedseq.equals("ryas---dtqqwa----slchvh"))
692 // System.err.println("Subseqgaped\n------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhryas---dtqwrtsasll--qwa----slchvh\n"+ssgapedseq+"\n"+sub_se_gp.getCigarstring());