2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.datamodel;
23 import java.util.Enumeration;
24 import java.util.Hashtable;
26 import jalview.analysis.*;
27 import jalview.util.*;
// Cigar bound to a reference sequence; specialises CigarSimple.
29 public class SeqCigar extends CigarSimple
32 * start(inclusive) and end(exclusive) of subsequence on refseq
// Bounds of the cigar on refseq; passed elsewhere in this class as the
// arguments of refseq.getSequenceAsString(start, end).
34 private int start, end;
// Sequence that the cigar operations are applied to.
36 private SequenceI refseq = null;
// Properties captured by SeqsetUtils.SeqCharacterHash in _setSeq and
// re-applied by SeqsetUtils.SeqCharacterUnhash in getSeq.
38 private Hashtable seqProps;
41 * Reference dataset sequence for the cigar string
// Getter for the reference sequence of this cigar.
// NOTE(review): the method body is not visible in this listing; presumably
// it returns the refseq field - TODO confirm.
45 public SequenceI getRefSeq()
52 * @return int start index of cigar ops on refSeq
61 * @return int end index (exclusive) of cigar ops on refSeq
69 * Returns sequence as a string with cigar operations applied to it
// Produces the edited string for this cigar.
// GapChar is forwarded unchanged to getSequenceAndDeletions.
// Returns the empty string when length == 0; otherwise element [0] of the
// array returned by getSequenceAndDeletions for the refseq substring
// start..end.
// NOTE(review): unlike getSeq, the returned array is indexed here without a
// null check.
73 public String getSequenceString(char GapChar)
75 return (length == 0) ? "" : (String) getSequenceAndDeletions(
76 refseq.getSequenceAsString(start, end), GapChar)[0];
80 * recreates a gapped and edited version of RefSeq or null for an empty cigar
// Rebuilds a Sequence from refseq with the cigar operations applied.
// GapChar is forwarded unchanged to getSequenceAndDeletions.
// NOTE(review): the declaration of seq, the body of the first guard and the
// final return are not visible in this listing.
85 public SequenceI getSeq(char GapChar)
// Guard for a missing reference sequence or an empty cigar; the javadoc line
// preceding this method states that null is the result for an empty cigar.
88 if (refseq == null || length == 0)
// edit_result[0] is used below as the edited string and edit_result[1] as an
// int[] of bounds.
92 Object[] edit_result = getSequenceAndDeletions(
93 refseq.getSequenceAsString(start, end), GapChar);
94 if (edit_result == null)
96 throw new Error(MessageManager.getString("error.implementation_error_unexpected_null_from_get_sequence_and_deletions"));
98 int bounds[] = (int[]) edit_result[1];
// The new sequence takes the name of refseq. Its start and end are offset by
// refseq.getStart() + start; a bounds[2] of 0 is mapped to -1 for the end.
99 seq = new Sequence(refseq.getName(), (String) edit_result[0],
100 refseq.getStart() + start + bounds[0], refseq.getStart()
101 + start + ((bounds[2] == 0) ? -1 : bounds[2]));
102 seq.setDescription(refseq.getDescription());
// Capture the computed start and end before the calls below run.
103 int sstart = seq.getStart(), send = seq.getEnd();
104 // seq.checkValidRange(); probably not needed
105 // recover local properties if present
106 if (seqProps != null)
108 // this recovers dataset sequence reference as well as local features,
109 // names, start/end settings.
110 SeqsetUtils.SeqCharacterUnhash(seq, seqProps);
112 // ensure dataset sequence is up to date from local reference
113 seq.setDatasetSequence(refseq);
// Re-apply the start captured above.
// NOTE(review): send is captured above but no matching setEnd call is visible
// in this listing - confirm it exists in the full file.
114 seq.setStart(sstart);
120 * We don't allow this - refseq is given at construction time only public void
121 * setSeq(SequenceI seq) { this.seq = seq; }
124 * internal constructor - sets seq to a gapless sequence derived from seq and
125 * prepends any 'D' operations needed to get to the first residue of seq.
129 * @param initialDeletion
130 * true to mark initial dataset sequence residues as deleted in
133 * index of first position in seq
135 * index after last position in (possibly gapped) seq
136 * @return true if gaps are present in seq
// Internal initialiser: resolves the reference sequence for this cigar and
// sets start and end from the column window _s.._e of seq.
// NOTE(review): the end of the signature, the conditions guarding the first
// two throws, several branch bodies and the return are not visible in this
// listing; comments below describe only the visible lines.
138 private boolean _setSeq(SequenceI seq, boolean initialDeletion, int _s,
141 boolean hasgaps = false;
144 throw new Error(MessageManager.getString("error.implementation_error_set_seq_null"));
148 throw new Error(MessageManager.formatMessage("error.implementation_error_s", new String[]{Integer.valueOf(_s).toString()}));
150 String seq_string = seq.getSequenceAsString();
// An end of 0, an end before the start, or an end past the string all fall
// back to the full length of the sequence string.
151 if (_e == 0 || _e < _s || _e > seq_string.length())
153 _e = seq_string.length();
155 // resolve start and end positions relative to ungapped reference sequence
156 start = seq.findPosition(_s) - seq.getStart();
157 end = seq.findPosition(_e) - seq.getStart();
158 int l_ungapped = end - start;
159 // Find correct sequence to reference and correct start and end - if
161 SequenceI ds = seq.getDatasetSequence();
164 // make a new dataset sequence
// The whole sequence string is stripped of gap characters here, so
// l_ungapped is overwritten with the ungapped length of the full sequence.
165 String ungapped = AlignSeq.extractGaps(
166 jalview.util.Comparison.GapChars, new String(seq_string));
167 l_ungapped = ungapped.length();
168 // check that we haven't just duplicated an ungapped sequence.
169 if (l_ungapped == seq.getLength())
175 ds = new Sequence(seq.getName(), ungapped, seq.getStart(),
176 seq.getStart() + ungapped.length() - 1);
177 // JBPNote: this would be consistent but may not be useful
178 // seq.setDatasetSequence(ds);
181 // add in offset between seq and the dataset sequence
182 if (ds.getStart() < seq.getStart())
184 int offset = seq.getStart() - ds.getStart();
187 // absolute cigar string
188 addDeleted(_s + offset);
194 // normal behaviour - just mark start and end subsequence
202 // any gaps to process ?
// A difference between the ungapped length and the window width is the test
// used for gaps; presumably hasgaps is set in the body, which is not visible.
203 if (l_ungapped != (_e - _s))
209 // copy over local properties for the sequence instance of the refseq
210 seqProps = SeqsetUtils.SeqCharacterHash(seq);
// Fails with an Error when the resolved end runs past the dataset sequence.
212 if (end > ds.getLength())
214 throw new Error(MessageManager.getString("error.implementation_error_seqcigar_possible"));
215 // end = ds.getLength();
222 * directly initialise a cigar object with a sequence of range, operation
223 * pairs and a sequence to apply it to. operation and range should be relative
224 * to the seq.getStart()'th residue of the dataset seq resolved from seq.
// Builds a cigar for seq from parallel arrays of operations and ranges.
// Every supplied operation must be one of M, I or D, otherwise an Error is
// thrown; an Error is also thrown when the two arrays differ in length.
// NOTE(review): the guard for the first throw is not visible in this listing
// (its message key suggests a null seq check).
233 public SeqCigar(SequenceI seq, char operation[], int range[])
238 throw new Error(MessageManager.getString("error.implmentation_bug_seq_null"));
// NOTE(review): operation.length is dereferenced here, before the
// operation != null test below, so a null operation array would fail at this
// line and never reach the null handling - confirm the intended order.
240 if (operation.length != range.length)
242 throw new Error(MessageManager.getString("error.implementation_bug_cigar_operation_list_range_list"));
245 if (operation != null)
// Allocate room for the supplied operations plus _inc_length spare slots.
247 this.operation = new char[operation.length + _inc_length];
248 this.range = new int[operation.length + _inc_length];
// A true result from _setSeq (documented there as gaps present in seq) is
// rejected with an Error.
250 if (_setSeq(seq, false, 0, 0))
252 throw new Error(MessageManager.getString("error.not_yet_implemented_cigar_object_from_cigar_string"));
// Copy the supplied operations in after any entries already counted by
// this.length, validating each operation code on the way.
254 for (int i = this.length, j = 0; j < operation.length; i++, j++)
256 char op = operation[j];
257 if (op != M && op != I && op != D)
259 throw new Error(MessageManager.formatMessage("error.implementation_bug_cigar_operation", new String[]{Integer.valueOf(j).toString(),Integer.valueOf(op).toString(),Integer.valueOf(M).toString(),Integer.valueOf(I).toString(),Integer.valueOf(D).toString()}));
261 this.operation[i] = op;
262 this.range[i] = range[j];
264 this.length += operation.length;
// Presumably the alternative branch for a null operation array; the else
// keyword and neighbouring statements are not visible in this listing.
268 this.operation = null;
271 if (_setSeq(seq, false, 0, 0))
273 throw new Error(MessageManager.getString("error.not_yet_implemented_cigar_object_from_cigar_string"));
279 * add range matched residues to cigar string
// Appends a match operation (M) with the given range by delegating to
// addOperation.
284 public void addMatch(int range)
286 this.addOperation(M, range);
290 * Adds insertion and match operations based on seq to the cigar up to the
291 * endpos column of seq.
301 * @param initialDeletions
302 * if true then initial deletions will be added from start of seq to
// Walks the columns of seq and adds (operation, range) pairs to cigar through
// cigar.addOperation.
// NOTE(review): the loop header, the declarations of op and range, and the
// statements that change op and grow range are not visible in this listing;
// comments below describe only the visible lines.
305 protected static void addSequenceOps(CigarBase cigar, SequenceI seq,
306 int startpos, int endpos, boolean initialDeletions)
// p indexes into seq (see getCharAt below); res is the length of seq.
310 int p = 0, res = seq.getLength();
312 if (!initialDeletions)
// Positions at or beyond the end of seq are treated as gaps.
319 boolean isGap = (p < res) ? jalview.util.Comparison.isGap(seq
320 .getCharAt(p)) : true;
// True while p lies inside the inclusive window startpos..endpos.
321 if ((startpos <= p) && (p <= endpos))
// Emit the pending run when it is non-empty and its operation is not I.
325 if (range > 0 && op != I)
327 cigar.addOperation(op, range);
// Emit the pending run when it is non-empty and its operation is not M.
335 if (range > 0 && op != M)
337 cigar.addOperation(op, range);
// Emit the pending run when it is non-empty and its operation is not D.
348 if (range > 0 && op != D)
350 cigar.addOperation(op, range);
358 // do nothing - insertions are not made in flanking regions
// Presumably the final emit for the last pending run; its guard is not
// visible in this listing.
365 cigar.addOperation(op, range);
370 * create a cigar string for given sequence
// Creates a cigar covering the whole of seq.
// NOTE(review): the guard for the throw is not visible in this listing.
375 public SeqCigar(SequenceI seq)
380 throw new Error(MessageManager.getString("error.implementation_error_for_new_cigar"));
// Start 0 with end 0: _setSeq replaces an end of 0 with the full length of
// the sequence string. The boolean result is ignored here.
382 _setSeq(seq, false, 0, 0);
383 // there is still work to do
// Encode columns 0 to seq.getLength() - 1 with initialDeletions false.
384 addSequenceOps(this, seq, 0, seq.getLength() - 1, false);
388 * Create Cigar from a range of gaps and residues on a sequence object
393 * int - first column in range
395 * int - last column in range
// Creates a cigar for the column range start to end of seq, where end is the
// last column included.
// NOTE(review): the guard for the throw is not visible in this listing.
397 public SeqCigar(SequenceI seq, int start, int end)
402 throw new Error(MessageManager.getString("error.implementation_error_for_new_cigar"));
// _setSeq documents its end argument as the index after the last position,
// hence end + 1. The boolean result is ignored here.
404 _setSeq(seq, false, start, end + 1);
405 // there is still work to do
406 addSequenceOps(this, seq, start, end, false);
410 * Create a cigar object from a cigar string like '[<I|D|M><range>]+'. Will
411 * fail if the given seq already contains gaps (JBPNote: future implementation
415 * SequenceI object resolvable to a dataset sequence
// Parses cigarString with parseCigarString and builds a SeqCigar for seq from
// the result: element [0] is cast to char[] (operations) and element [1] to
// int[] (ranges), then both are handed to the three-argument constructor.
420 public static SeqCigar parseCigar(SequenceI seq, String cigarString)
423 Object[] opsandrange = parseCigarString(cigarString);
424 return new SeqCigar(seq, (char[]) opsandrange[0],
425 (int[]) opsandrange[1]);
429 * create an alignment from the given array of cigar sequences and gap
430 * character, and marking the given segments as visible in the given
434 * @param gapCharacter
436 * - columnSelection where hidden regions are marked
438 * - visible regions of alignment
439 * @return SequenceI[]
// Expands an array of cigars into sequences that share one column frame.
// Each cigar is first expanded against its own reference substring; the
// deleted regions reported by getSequenceAndDeletions are then spliced into
// the rows, and hidden columns are recorded on colsel.
//   alseqs       - cigars to expand, one output sequence per entry
//   gapCharacter - passed to getSequenceAndDeletions and used for padding
//   colsel       - receives the hideColumns calls
//   segments     - when null, each spliced region is hidden; when non-null,
//                  read in steps of three and used for hiding instead
// Presumably returns seqs (the return statement is not visible).
// NOTE(review): braces and some branch conditions are also not visible in
// this listing; comments below describe only the visible lines.
441 public static SequenceI[] createAlignmentSequences(SeqCigar[] alseqs,
442 char gapCharacter, ColumnSelection colsel, int[] segments)
444 SequenceI[] seqs = new SequenceI[alseqs.length];
445 StringBuffer[] g_seqs = new StringBuffer[alseqs.length];
446 String[] alseqs_string = new String[alseqs.length];
447 Object[] gs_regions = new Object[alseqs.length];
// First loop: expand every cigar on its own and keep the raw result.
448 for (int i = 0; i < alseqs.length; i++)
450 alseqs_string[i] = alseqs[i].getRefSeq().getSequenceAsString(
451 alseqs[i].start, alseqs[i].end);
452 gs_regions[i] = alseqs[i].getSequenceAndDeletions(alseqs_string[i],
453 gapCharacter); // gapped sequence, {start, start col, end,
454 // endcol}, hidden regions {{start, end, col}})
455 if (gs_regions[i] == null)
457 throw new Error(MessageManager.formatMessage("error.implementation_error_cigar_seq_no_operations", new String[]{Integer.valueOf(i).toString()}));
459 g_seqs[i] = new StringBuffer((String) ((Object[]) gs_regions[i])[0]); // the
464 // Now account for insertions. (well - deletions)
465 // this is complicated because we must keep track of shifted positions in
// shifts maps a column recorded in a region to its position after the
// splices already performed (see shift and addShift below).
467 ShiftList shifts = new ShiftList();
468 for (int i = 0; i < alseqs.length; i++)
// Element [2] of each result holds the regions for cigar i, or null.
470 Object[] gs_region = ((Object[]) ((Object[]) gs_regions[i])[2]);
471 if (gs_region != null)
474 for (int hr = 0; hr < gs_region.length; hr++)
// region[0] and region[1] bound the residues in alseqs_string[i] (both
// included); region[2] is the column used as the insertion position.
476 int[] region = (int[]) gs_region[hr];
// Gap filler of the same width as the region.
477 char[] insert = new char[region[1] - region[0] + 1];
478 for (int s = 0; s < insert.length; s++)
480 insert[s] = gapCharacter;
482 int inspos = shifts.shift(region[2]); // resolve insertion position in
483 // current alignment frame of
// NOTE(review): the condition choosing between the gap insert and the
// residue insert below is not visible; presumably the row owning the region
// (s == i) receives the residues and every other row receives gaps.
485 for (int s = 0; s < alseqs.length; s++)
// Rows shorter than the insertion position are padded first.
489 if (g_seqs[s].length() <= inspos)
491 // prefix insertion with more gaps.
492 for (int l = inspos - g_seqs[s].length(); l > 0; l--)
494 g_seqs[s].append(gapCharacter); // to debug - use a different
495 // gap character here
498 g_seqs[s].insert(inspos, insert);
502 g_seqs[s].insert(inspos,
503 alseqs_string[i].substring(region[0], region[1] + 1));
506 shifts.addShift(region[2], insert.length); // update shift in
507 // alignment frame of
509 if (segments == null)
511 // add a hidden column for this deletion
512 colsel.hideColumns(inspos, inspos + insert.length - 1);
// Final loop: wrap each expanded row in a Sequence named after its reference,
// using the same start and end arithmetic as getSeq.
517 for (int i = 0; i < alseqs.length; i++)
519 int[] bounds = ((int[]) ((Object[]) gs_regions[i])[1]);
520 SequenceI ref = alseqs[i].getRefSeq();
521 seqs[i] = new Sequence(ref.getName(), g_seqs[i].toString(),
522 ref.getStart() + alseqs[i].start + bounds[0], ref.getStart()
523 + alseqs[i].start + (bounds[2] == 0 ? -1 : bounds[2]));
524 seqs[i].setDatasetSequence(ref);
525 seqs[i].setDescription(ref.getDescription());
// With explicit segments, each triple hides segments[i + 2] columns starting
// at segments[i + 1]; segments[i] itself is not read by the visible code.
527 if (segments != null)
529 for (int i = 0; i < segments.length; i += 3)
531 // int start=shifts.shift(segments[i]-1)+1;
532 // int end=shifts.shift(segments[i]+segments[i+1]-1)-1;
533 colsel.hideColumns(segments[i + 1], segments[i + 1]
534 + segments[i + 2] - 1);
541 * non rigorous testing
547 * @param ex_cs_gapped
// Ad hoc check: builds a SeqCigar from seq, reads its cigar string and
// reports on System.err when it differs from ex_cs_gapped.
// NOTE(review): the return statement is not visible in this listing.
551 public static String testCigar_string(Sequence seq, String ex_cs_gapped)
553 SeqCigar c_sgapped = new SeqCigar(seq);
554 String cs_gapped = c_sgapped.getCigarstring();
555 if (!cs_gapped.equals(ex_cs_gapped))
557 System.err.println("Failed getCigarstring: incorect string '"
558 + cs_gapped + "' != " + ex_cs_gapped);
// Ad hoc check: rebuilds a sequence from gen_sgapped with a dash as the gap
// character and compares it with s_gapped, reporting on System.err when the
// comparison fails.
// NOTE(review): the end of the signature (where s_gapped is declared) and
// the return statements are not visible in this listing.
563 public static boolean testSeqRecovery(SeqCigar gen_sgapped,
566 // this is non-rigorous - start and end recovery is not tested.
567 SequenceI gen_sgapped_s = gen_sgapped.getSeq('-');
// NOTE(review): the return type of getSequence() is not shown here; if it is
// an array, equals compares references rather than contents - confirm, and
// consider comparing getSequenceAsString() as the message below does.
568 if (!gen_sgapped_s.getSequence().equals(s_gapped.getSequence()))
570 System.err.println("Couldn't reconstruct sequence.\n"
571 + gen_sgapped_s.getSequenceAsString() + "\n"
572 + s_gapped.getSequenceAsString());
// Manual smoke test: builds an ungapped sequence, a gapped copy and a gapped
// subsequence, then exercises cigar construction, parsing, recovery, range
// deletion and alignment creation, printing the results.
// NOTE(review): several declarations (for example o_seq, orig_gapped,
// sub_gapped_s, st and e) and constructor arguments are not visible in this
// listing.
578 public static void main(String argv[]) throws Exception
// Ungapped dataset sequence of 42 residues numbered 39 to 80.
581 Sequence s = new Sequence("MySeq",
582 o_seq = "asdfktryasdtqwrtsaslldddptyipqqwaslchvhttt", 39, 80);
584 Sequence s_gapped = new Sequence(
586 orig_gapped = "----asdf------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhttt",
// Cigar string expected for s_gapped.
588 String ex_cs_gapped = "4I4M6I6M3I11M4I12M4I9M";
589 s_gapped.setDatasetSequence(s);
591 Sequence s_subsequence_gapped = new Sequence(
593 sub_gapped_s = "------ktryas---dtqwrtsasll----dddptyipqqwa----slchvh",
596 s_subsequence_gapped.setDatasetSequence(s);
// Check 1: an ungapped sequence should yield a single 42M operation.
597 SeqCigar c_null = new SeqCigar(s);
598 String cs_null = c_null.getCigarstring();
// NOTE(review): the message below tests cs_null against the empty string
// literal with ==, which is a reference comparison in Java - confirm that
// isEmpty or equals was intended.
599 if (!cs_null.equals("42M"))
602 .println("Failed to recover ungapped sequence cigar operations:"
603 + ((cs_null == "") ? "empty string" : cs_null));
// Check 2: cigar string of the gapped sequence, then a parse round trip.
605 testCigar_string(s_gapped, ex_cs_gapped);
606 SeqCigar gen_sgapped = SeqCigar.parseCigar(s, ex_cs_gapped);
607 if (!gen_sgapped.getCigarstring().equals(ex_cs_gapped))
609 System.err.println("Failed parseCigar(" + ex_cs_gapped
610 + ")->getCigarString()->'" + gen_sgapped.getCigarstring()
613 testSeqRecovery(gen_sgapped, s_gapped);
614 // Test dataset resolution
615 SeqCigar sub_gapped = new SeqCigar(s_subsequence_gapped);
616 if (!testSeqRecovery(sub_gapped, s_subsequence_gapped))
619 .println("Failed recovery for subsequence of dataset sequence");
622 if (sub_gapped.getWidth() != sub_gapped_s.length())
624 System.err.println("Failed getWidth()");
// The result of getFullWidth is discarded; the call is only exercised.
627 sub_gapped.getFullWidth();
628 if (sub_gapped.hasDeletedRegions())
630 System.err.println("hasDeletedRegions is incorrect.");
632 // Test start-end region SeqCigar
// Columns 8 to 48 with both ends included give a width of 41.
633 SeqCigar sub_se_gp = new SeqCigar(s_subsequence_gapped, 8, 48);
634 if (sub_se_gp.getWidth() != 41)
637 .println("SeqCigar(seq, start, end) not properly clipped alignsequence.");
639 System.out.println("Original sequence align:\n" + sub_gapped_s
640 + "\nReconstructed window from 8 to 48\n" + "XXXXXXXX"
641 + sub_se_gp.getSequenceString('-') + "..." + "\nCigar String:"
642 + sub_se_gp.getCigarstring() + "\n");
643 SequenceI ssgp = sub_se_gp.getSeq('-');
644 System.out.println("\t " + ssgp.getSequenceAsString());
// Repeated deleteRange trials on a fresh window cigar; the computation of the
// st and e bounds is not visible in this listing.
645 for (int r = 0; r < 10; r++)
647 sub_se_gp = new SeqCigar(s_subsequence_gapped, 8, 48);
648 int sl = sub_se_gp.getWidth();
650 for (int rs = 0; rs < 10; rs++)
653 sub_se_gp.deleteRange(st, e);
654 String ssgapedseq = sub_se_gp.getSeq('-').getSequenceAsString();
655 System.out.println(st + "," + e + "\t:" + ssgapedseq);
// Build an Alignment from three cigars and print each row.
660 SeqCigar[] set = new SeqCigar[]
661 { new SeqCigar(s), new SeqCigar(s_subsequence_gapped, 8, 48),
662 new SeqCigar(s_gapped) };
663 Alignment al = new Alignment(set);
664 for (int i = 0; i < al.getHeight(); i++)
666 System.out.println("" + al.getSequenceAt(i).getName() + "\t"
667 + al.getSequenceAt(i).getStart() + "\t"
668 + al.getSequenceAt(i).getEnd() + "\t"
669 + al.getSequenceAt(i).getSequenceAsString());
// Same again, after deleting 20 to 25 from the first cigar.
673 System.out.println("Gapped.");
674 SeqCigar[] set = new SeqCigar[]
675 { new SeqCigar(s), new SeqCigar(s_subsequence_gapped, 8, 48),
676 new SeqCigar(s_gapped) };
677 set[0].deleteRange(20, 25);
678 Alignment al = new Alignment(set);
679 for (int i = 0; i < al.getHeight(); i++)
681 System.out.println("" + al.getSequenceAt(i).getName() + "\t"
682 + al.getSequenceAt(i).getStart() + "\t"
683 + al.getSequenceAt(i).getEnd() + "\t"
684 + al.getSequenceAt(i).getSequenceAsString());
687 // if (!ssgapedseq.equals("ryas---dtqqwa----slchvh"))
688 // System.err.println("Subseqgaped\n------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhryas---dtqwrtsasll--qwa----slchvh\n"+ssgapedseq+"\n"+sub_se_gp.getCigarstring());
692 * references to entities that this sequence cigar is associated with.
// Table of associated entities, created on first use by setGroupMembership.
// Only the keys carry information: each value stored is an empty int array.
694 private Hashtable selGroups = null;
// Records an association between this cigar and group.
// The table is created on first use; group is stored as a key with an empty
// int array as a placeholder value.
696 public void setGroupMembership(Object group)
698 if (selGroups == null)
700 selGroups = new Hashtable();
702 selGroups.put(group, new int[0]);
706 * Test for and if present remove association to group.
709 * @return true if group was associated and it was removed
// Removes the association with group when one is recorded.
// NOTE(review): the return statements are not visible in this listing; the
// javadoc line preceding this method says true is returned when the group
// was associated and has been removed.
711 public boolean removeGroupMembership(Object group)
713 if (selGroups != null && selGroups.containsKey(group))
715 selGroups.remove(group);
722 * forget all associations for this sequence.
// Forgets all associations recorded for this cigar.
// NOTE(review): the statement guarded by the null check is not visible in
// this listing; presumably it empties or drops selGroups - TODO confirm.
724 public void clearMemberships()
726 if (selGroups != null)
735 * @return null or array of all associated entities
// Collects every key of selGroups into a new Object array.
// NOTE(review): the body of the null guard and the final return are not
// visible in this listing; the javadoc line preceding this method says the
// result is null or an array of all associated entities.
737 public Object[] getAllMemberships()
739 if (selGroups == null)
// One slot per entry; filled in the order the key enumeration yields them.
743 Object[] mmbs = new Object[selGroups.size()];
744 Enumeration en = selGroups.keys();
745 for (int i = 0; en.hasMoreElements(); i++)
747 mmbs[i] = en.nextElement();
753 * Test for group membership
756 * - a selection group or some other object that may be associated
758 * @return true if sgr is associated with this seqCigar
// Tests whether sgr has been associated with this cigar.
// Returns false when no table exists yet; otherwise true when a value is
// stored under sgr (setGroupMembership always stores a non-null value).
760 public boolean isMemberOf(Object sgr)
762 return (selGroups != null) && selGroups.get(sgr) != null;