/*
* Jalview - A Sequence Alignment Editor and Viewer (Version 2.7)
* Copyright (C) 2011 J Procter, AM Waterhouse, J Engelhardt, LM Lui, G Barton, M Clamp, S Searle
*
* This file is part of Jalview.
*
* Jalview is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
*
* Jalview is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with Jalview. If not, see <http://www.gnu.org/licenses/>.
*/
package jalview.datamodel;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Vector;
import jalview.analysis.*;
import jalview.util.*;
public class SeqCigar extends CigarSimple
{
/**
* start(inclusive) and end(exclusive) of subsequence on refseq
*/
private int start, end;
/**
* reference (dataset) sequence that the cigar operations are applied to;
* assigned by _setSeq during construction
*/
private SequenceI refseq = null;
/**
* local properties of the sequence passed at construction, as captured by
* SeqsetUtils.SeqCharacterHash in _setSeq and re-applied to recovered
* sequences in getSeq
*/
private Hashtable seqProps;
/**
 * Gives access to the reference (dataset) sequence that this cigar's
 * operations are applied to.
 *
 * @return the reference sequence held by this cigar
 */
public SequenceI getRefSeq()
{
  return this.refseq;
}
/**
 * Start of the region of the reference sequence covered by the cigar
 * operations.
 *
 * @return int start index (inclusive) of cigar ops on refSeq
 */
public int getStart()
{
  return this.start;
}
/**
 * End of the region of the reference sequence covered by the cigar
 * operations.
 *
 * @return int end index (exclusive) of cigar ops on refSeq
 */
public int getEnd()
{
  return this.end;
}
/**
 * Applies the cigar operations to the [start, end) region of the reference
 * sequence and returns the resulting string.
 *
 * @param GapChar
 *          character passed to getSequenceAndDeletions as the gap symbol
 * @return the edited sequence string, or the empty string when the cigar
 *         holds no operations
 */
public String getSequenceString(char GapChar)
{
  if (length == 0)
  {
    return "";
  }
  String refResidues = refseq.getSequenceAsString(start, end);
  Object[] edited = getSequenceAndDeletions(refResidues, GapChar);
  // element 0 of the result is the edited sequence string
  return (String) edited[0];
}
/**
 * Rebuilds a gapped and edited sequence object from the reference sequence
 * by applying the cigar operations.
 *
 * @param GapChar
 *          character passed to getSequenceAndDeletions as the gap symbol
 * @return a new sequence whose dataset sequence is the reference sequence,
 *         or null when there is no reference sequence or the cigar holds no
 *         operations
 */
public SequenceI getSeq(char GapChar)
{
  if (refseq == null || length == 0)
  {
    return null;
  }
  String refResidues = refseq.getSequenceAsString(start, end);
  Object[] edited = getSequenceAndDeletions(refResidues, GapChar);
  if (edited == null)
  {
    throw new Error(
            "Implementation Error - unexpected null from getSequenceAndDeletions");
  }
  String gappedResidues = (String) edited[0];
  int[] limits = (int[]) edited[1];
  // positions are expressed relative to the reference sequence numbering
  int base = refseq.getStart() + start;
  int lastOffset = (limits[2] == 0) ? -1 : limits[2];
  Sequence rebuilt = new Sequence(refseq.getName(), gappedResidues, base
          + limits[0], base + lastOffset);
  rebuilt.setDescription(refseq.getDescription());
  // remember the computed range: restoring local properties may overwrite it
  int keptStart = rebuilt.getStart();
  int keptEnd = rebuilt.getEnd();
  if (seqProps != null)
  {
    // recovers the dataset sequence reference as well as local features,
    // names and start/end settings
    SeqsetUtils.SeqCharacterUnhash(rebuilt, seqProps);
  }
  // make sure the dataset sequence and range reflect this cigar
  rebuilt.setDatasetSequence(refseq);
  rebuilt.setStart(keptStart);
  rebuilt.setEnd(keptEnd);
  return rebuilt;
}
/*
* We don't allow this - refseq is given at construction time only:
*
* public void setSeq(SequenceI seq) { this.seq = seq; }
*/
/**
 * internal constructor helper - resolves the reference (dataset) sequence
 * for seq, records the start/end of the selected region on it, stores seq's
 * local properties, and optionally prepends a deletion covering the
 * residues that precede the region.
 *
 * @param seq
 *          SequenceI (must not be null)
 * @param initialDeletion
 *          true to mark initial dataset sequence residues as deleted in
 *          subsequence
 * @param _s
 *          index of first position in seq (must not be negative)
 * @param _e
 *          index after last position in (possibly gapped) seq; a value of
 *          0, less than _s, or beyond the sequence length is replaced by
 *          the sequence length
 * @return true if gaps are present in seq
 */
private boolean _setSeq(SequenceI seq, boolean initialDeletion, int _s,
        int _e)
{
  if (seq == null)
  {
    throw new Error("Implementation Error - _setSeq(null,...)");
  }
  if (_s < 0)
  {
    throw new Error("Implementation Error: _s=" + _s);
  }
  String gapped = seq.getSequenceAsString();
  int endCol = _e;
  if (endCol == 0 || endCol < _s || endCol > gapped.length())
  {
    endCol = gapped.length();
  }
  // start and end positions relative to the ungapped reference sequence
  start = seq.findPosition(_s) - seq.getStart();
  end = seq.findPosition(endCol) - seq.getStart();
  int ungappedLength = end - start;

  // locate (or create) the sequence that the cigar will refer to
  SequenceI dataset = seq.getDatasetSequence();
  if (dataset == null)
  {
    String residues = AlignSeq.extractGaps(
            jalview.util.Comparison.GapChars, gapped);
    ungappedLength = residues.length();
    if (ungappedLength == seq.getLength())
    {
      // seq has no gaps, so it can act as its own dataset sequence
      dataset = seq;
    }
    else
    {
      // JBPNote: calling seq.setDatasetSequence(dataset) here would be
      // consistent but may not be useful
      dataset = new Sequence(seq.getName(), residues, seq.getStart(),
              seq.getStart() + residues.length() - 1);
    }
  }

  // account for the offset between seq and the dataset sequence
  if (dataset.getStart() < seq.getStart())
  {
    int shift = seq.getStart() - dataset.getStart();
    if (!initialDeletion)
    {
      // normal behaviour - just mark start and end of the subsequence
      start += shift;
    }
    else
    {
      // absolute cigar string - leading residues become a deletion
      addDeleted(_s + shift);
      start = 0;
    }
    end += shift;
  }

  boolean gapsPresent = (ungappedLength != (endCol - _s));
  refseq = dataset;
  // keep the local properties of this particular instance of the refseq
  seqProps = SeqsetUtils.SeqCharacterHash(seq);
  if (end > dataset.getLength())
  {
    throw new Error(
            "SeqCigar: Possible implementation error: sequence is longer than dataset sequence");
  }
  return gapsPresent;
}
/**
* directly initialise a cigar object with a sequence of range, operation
* pairs and a sequence to apply it to. operation and range should be relative
* to the seq.getStart()'th residue of the dataset seq resolved from seq.
*
* @param seq
* SequenceI
* @param operation
* char[]
* @param range
* int[]
*/
public SeqCigar(SequenceI seq, char operation[], int range[])
{
super();
if (seq == null)
{
throw new Error("Implementation Bug. Null seq !");
}
if (operation.length != range.length)
{
throw new Error(
"Implementation Bug. Cigar Operation list!= range list");
}
if (operation != null)
{
this.operation = new char[operation.length + _inc_length];
this.range = new int[operation.length + _inc_length];
if (_setSeq(seq, false, 0, 0))
{
throw new Error(
"NOT YET Implemented: Constructing a Cigar object from a cigar string and a gapped sequence.");
}
for (int i = this.length, j = 0; j < operation.length; i++, j++)
{
char op = operation[j];
if (op != M && op != I && op != D)
{
throw new Error("Implementation Bug. Cigar Operation '" + j
+ "' '" + op + "' not one of '" + M + "', '" + I
+ "', or '" + D + "'.");
}
this.operation[i] = op;
this.range[i] = range[j];
}
this.length += operation.length;
}
else
{
this.operation = null;
this.range = null;
this.length = 0;
if (_setSeq(seq, false, 0, 0))
{
throw new Error(
"NOT YET Implemented: Constructing a Cigar object from a cigar string and a gapped sequence.");
}
}
}
/**
 * Appends a match (M) operation covering the given number of residues to
 * this cigar.
 *
 * @param range
 *          int - length of the match operation
 */
public void addMatch(int range)
{
  addOperation(M, range);
}
/**
 * Walks the columns of seq up to and including endpos and appends the
 * corresponding run-length encoded operations to cigar: within
 * [startpos, endpos] gaps become insertions (I) and residues become matches
 * (M). Columns beyond the end of seq are treated as gaps.
 *
 * @param cigar
 *          CigarBase that receives the operations
 * @param seq
 *          SequenceI
 * @param startpos
 *          int - first column of the region
 * @param endpos
 *          int - last column of the region (inclusive)
 * @param initialDeletions
 *          if true then the walk begins at column 0 and residues found
 *          before startpos are added as deletions (D); gaps before startpos
 *          are ignored
 */
protected static void addSequenceOps(CigarBase cigar, SequenceI seq,
        int startpos, int endpos, boolean initialDeletions)
{
  char runOp = '\0';
  int runLength = 0;
  int seqLength = seq.getLength();
  int firstCol = initialDeletions ? 0 : startpos;
  for (int col = firstCol; col <= endpos; col++)
  {
    boolean gap = (col >= seqLength)
            || jalview.util.Comparison.isGap(seq.getCharAt(col));
    char colOp;
    if (col >= startpos)
    {
      colOp = gap ? I : M;
    }
    else if (!gap)
    {
      colOp = D;
    }
    else
    {
      // insertions are not made in flanking regions
      continue;
    }
    if (runLength > 0 && runOp != colOp)
    {
      // operation changed - flush the run collected so far
      cigar.addOperation(runOp, runLength);
      runLength = 0;
    }
    runOp = colOp;
    runLength++;
  }
  if (runLength > 0)
  {
    cigar.addOperation(runOp, runLength);
  }
}
/**
* create a cigar string for given sequence
*
* @param seq
* SequenceI
*/
public SeqCigar(SequenceI seq)
{
super();
if (seq == null)
{
throw new Error("Implementation error for new Cigar(SequenceI)");
}
_setSeq(seq, false, 0, 0);
// there is still work to do
addSequenceOps(this, seq, 0, seq.getLength() - 1, false);
}
/**
 * Builds the cigar describing the gaps and residues found in a column range
 * of a sequence object.
 *
 * @param seq
 *          SequenceI - must not be null
 * @param start
 *          int - first column in range
 * @param end
 *          int - last column in range
 */
public SeqCigar(SequenceI seq, int start, int end)
{
  if (seq == null)
  {
    throw new Error("Implementation error for new Cigar(SequenceI)");
  }
  // _setSeq expects the index after the last column
  int afterLast = end + 1;
  _setSeq(seq, false, start, afterLast);
  // encode the columns of the range as cigar operations
  addSequenceOps(this, seq, start, end, false);
}
/**
 * Create a cigar object from a cigar string (for example
 * "4I4M6I6M3I11M4I12M4I9M"). Will fail if the given seq already contains
 * gaps (JBPNote: future implementation will fix)
 *
 * @param seq
 *          SequenceI object resolvable to a dataset sequence
 * @param cigarString
 *          String handed to parseCigarString
 * @return Cigar
 */
public static SeqCigar parseCigar(SequenceI seq, String cigarString)
        throws Exception
{
  Object[] parsed = parseCigarString(cigarString);
  // element 0 holds the operations, element 1 the matching ranges
  char[] ops = (char[]) parsed[0];
  int[] ranges = (int[]) parsed[1];
  return new SeqCigar(seq, ops, ranges);
}
/**
* create an alignment from the given array of cigar sequences and gap
* character, and mark hidden columns on the given column selection.
*
* Each cigar is first expanded with getSequenceAndDeletions. Every deleted
* region reported for one sequence is then re-inserted into that sequence,
* while the same number of gap characters is inserted at that column in all
* the other sequences. When segments is null each re-inserted region is
* hidden on colsel; otherwise segments is read in groups of three and columns
* segments[i+1] to segments[i+1]+segments[i+2]-1 are hidden.
*
* @param alseqs
*          cigar sequences to expand
* @param gapCharacter
*          gap symbol used when expanding and padding sequences
* @param colsel
*          - columnSelection where hidden regions are marked.
*          NOTE(review): used without a null check whenever a column has to
*          be hidden
* @param segments
*          - described by the original author as the visible regions of
*          the alignment; may be null. NOTE(review): only elements i+1 and
*          i+2 of each triple are read here - confirm layout with callers
* @return SequenceI[] one new sequence per cigar, each with the cigar's
*         reference sequence set as its dataset sequence
*/
public static SequenceI[] createAlignmentSequences(SeqCigar[] alseqs,
char gapCharacter, ColumnSelection colsel, int[] segments)
{
SequenceI[] seqs = new SequenceI[alseqs.length];
StringBuffer[] g_seqs = new StringBuffer[alseqs.length];
String[] alseqs_string = new String[alseqs.length];
Object[] gs_regions = new Object[alseqs.length];
for (int i = 0; i < alseqs.length; i++)
{
// the stretch of the reference sequence covered by this cigar
alseqs_string[i] = alseqs[i].getRefSeq().getSequenceAsString(
alseqs[i].start, alseqs[i].end);
// result layout (per the original author's note): [0] gapped sequence,
// [1] {start, start col, end, endcol}, [2] hidden regions
// {{start, end, col}}
gs_regions[i] = alseqs[i].getSequenceAndDeletions(alseqs_string[i],
gapCharacter);
if (gs_regions[i] == null)
{
throw new Error("Implementation error: " + i
+ "'th sequence Cigar has no operations.");
}
// the visible gapped sequence
g_seqs[i] = new StringBuffer((String) ((Object[]) gs_regions[i])[0]);
}
// Now account for insertions. (well - deletions)
// this is complicated because we must keep track of shifted positions in
// each sequence
ShiftList shifts = new ShiftList();
for (int i = 0; i < alseqs.length; i++)
{
Object[] gs_region = ((Object[]) ((Object[]) gs_regions[i])[2]);
if (gs_region != null)
{
for (int hr = 0; hr < gs_region.length; hr++)
{
// region[0]..region[1] (inclusive) index into alseqs_string[i];
// region[2] is the column the region is re-inserted at
int[] region = (int[]) gs_region[hr];
// run of gap characters as long as the deleted region
char[] insert = new char[region[1] - region[0] + 1];
for (int s = 0; s < insert.length; s++)
{
insert[s] = gapCharacter;
}
// resolve insertion position in current alignment frame of reference
int inspos = shifts.shift(region[2]);
for (int s = 0; s < alseqs.length; s++)
{
if (s != i)
{
if (g_seqs[s].length() <= inspos)
{
// prefix insertion with more gaps.
for (int l = inspos - g_seqs[s].length(); l > 0; l--)
{
// to debug - use a different gap character here
g_seqs[s].append(gapCharacter);
}
}
g_seqs[s].insert(inspos, insert);
}
else
{
// the sequence owning the deletion gets its residues back
g_seqs[s].insert(inspos,
alseqs_string[i].substring(region[0], region[1] + 1));
}
}
// update shift in alignment frame of reference
shifts.addShift(region[2], insert.length);
if (segments == null)
{
// add a hidden column for this deletion
colsel.hideColumns(inspos, inspos + insert.length - 1);
}
}
}
}
// wrap each expanded string in a sequence numbered relative to its reference
for (int i = 0; i < alseqs.length; i++)
{
int[] bounds = ((int[]) ((Object[]) gs_regions[i])[1]);
SequenceI ref = alseqs[i].getRefSeq();
seqs[i] = new Sequence(ref.getName(), g_seqs[i].toString(),
ref.getStart() + alseqs[i].start + bounds[0], ref.getStart()
+ alseqs[i].start + (bounds[2] == 0 ? -1 : bounds[2]));
seqs[i].setDatasetSequence(ref);
seqs[i].setDescription(ref.getDescription());
}
if (segments != null)
{
// segments is consumed three elements at a time
for (int i = 0; i < segments.length; i += 3)
{
// int start=shifts.shift(segments[i]-1)+1;
// int end=shifts.shift(segments[i]+segments[i+1]-1)-1;
colsel.hideColumns(segments[i + 1], segments[i + 1]
+ segments[i + 2] - 1);
}
}
return seqs;
}
/**
 * non rigorous testing: builds a cigar for seq and reports on System.err
 * when its cigar string differs from the expected one.
 *
 * @param seq
 *          Sequence
 * @param ex_cs_gapped
 *          String - the expected cigar string
 * @return String - the cigar string actually generated
 */
public static String testCigar_string(Sequence seq, String ex_cs_gapped)
{
  String generated = new SeqCigar(seq).getCigarstring();
  boolean matches = generated.equals(ex_cs_gapped);
  if (!matches)
  {
    System.err.println("Failed getCigarstring: incorect string '"
            + generated + "' != " + ex_cs_gapped);
  }
  return generated;
}
/**
 * non rigorous test that the sequence regenerated from a cigar (using '-'
 * as the gap character) has the same residues and gaps as the given
 * sequence. Start and end recovery is not tested. A message is written to
 * System.err on failure.
 *
 * @param gen_sgapped
 *          cigar to regenerate a sequence from
 * @param s_gapped
 *          the sequence that is expected to be recovered
 * @return true if the regenerated sequence string equals that of s_gapped
 */
public static boolean testSeqRecovery(SeqCigar gen_sgapped,
        SequenceI s_gapped)
{
  SequenceI recovered = gen_sgapped.getSeq('-');
  // getSeq returns null for an empty cigar - report that as a failure
  String actual = (recovered == null) ? null : recovered
          .getSequenceAsString();
  String expected = s_gapped.getSequenceAsString();
  // compare sequence content as strings: calling equals on the raw
  // getSequence() values would only test object identity if they are arrays
  if (!expected.equals(actual))
  {
    System.err.println("Couldn't reconstruct sequence.\n" + actual + "\n"
            + expected);
    return false;
  }
  return true;
}
/**
 * Ad-hoc, non rigorous self test: builds cigars from ungapped, gapped and
 * sub-range sequences, writes failures to System.err and prints the
 * reconstructed sequences and alignments to System.out.
 *
 * @param argv
 *          not used
 * @throws Exception
 *           if parseCigar fails
 */
public static void main(String argv[]) throws Exception
{
  // ungapped sequence used as the dataset sequence for the others
  Sequence s = new Sequence("MySeq",
          "asdfktryasdtqwrtsaslldddptyipqqwaslchvhttt", 39, 80);
  Sequence s_gapped = new Sequence(
          "MySeq",
          "----asdf------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhttt",
          39, 80);
  String ex_cs_gapped = "4I4M6I6M3I11M4I12M4I9M";
  s_gapped.setDatasetSequence(s);
  String sub_gapped_s = "------ktryas---dtqwrtsasll----dddptyipqqwa----slchvh";
  Sequence s_subsequence_gapped = new Sequence("MySeq", sub_gapped_s, 43,
          77);
  s_subsequence_gapped.setDatasetSequence(s);

  // cigar of an ungapped sequence should be a single match
  SeqCigar c_null = new SeqCigar(s);
  String cs_null = c_null.getCigarstring();
  if (!cs_null.equals("42M"))
  {
    // test the content of the string: == would compare object references
    System.err
            .println("Failed to recover ungapped sequence cigar operations:"
                    + ((cs_null.length() == 0) ? "empty string" : cs_null));
  }
  testCigar_string(s_gapped, ex_cs_gapped);

  // round trip: cigar string -> SeqCigar -> cigar string
  SeqCigar gen_sgapped = SeqCigar.parseCigar(s, ex_cs_gapped);
  if (!gen_sgapped.getCigarstring().equals(ex_cs_gapped))
  {
    System.err.println("Failed parseCigar(" + ex_cs_gapped
            + ")->getCigarString()->'" + gen_sgapped.getCigarstring()
            + "'");
  }
  testSeqRecovery(gen_sgapped, s_gapped);

  // Test dataset resolution
  SeqCigar sub_gapped = new SeqCigar(s_subsequence_gapped);
  if (!testSeqRecovery(sub_gapped, s_subsequence_gapped))
  {
    System.err
            .println("Failed recovery for subsequence of dataset sequence");
  }

  // width functions
  if (sub_gapped.getWidth() != sub_gapped_s.length())
  {
    System.err.println("Failed getWidth()");
  }
  sub_gapped.getFullWidth();
  if (sub_gapped.hasDeletedRegions())
  {
    System.err.println("hasDeletedRegions is incorrect.");
  }

  // Test start-end region SeqCigar
  SeqCigar sub_se_gp = new SeqCigar(s_subsequence_gapped, 8, 48);
  if (sub_se_gp.getWidth() != 41)
  {
    System.err
            .println("SeqCigar(seq, start, end) not properly clipped alignsequence.");
  }
  System.out.println("Original sequence align:\n" + sub_gapped_s
          + "\nReconstructed window from 8 to 48\n" + "XXXXXXXX"
          + sub_se_gp.getSequenceString('-') + "..." + "\nCigar String:"
          + sub_se_gp.getCigarstring() + "\n");
  SequenceI ssgp = sub_se_gp.getSeq('-');
  System.out.println("\t " + ssgp.getSequenceAsString());

  // repeatedly delete ranges from a fresh windowed cigar and print the result
  for (int r = 0; r < 10; r++)
  {
    sub_se_gp = new SeqCigar(s_subsequence_gapped, 8, 48);
    int sl = sub_se_gp.getWidth();
    int st = sl - 1 - r;
    for (int rs = 0; rs < 10; rs++)
    {
      int e = st + rs;
      sub_se_gp.deleteRange(st, e);
      String ssgapedseq = sub_se_gp.getSeq('-').getSequenceAsString();
      System.out.println(st + "," + e + "\t:" + ssgapedseq);
      st -= 3;
    }
  }

  // alignment built from a set of cigars
  {
    SeqCigar[] set = new SeqCigar[]
    { new SeqCigar(s), new SeqCigar(s_subsequence_gapped, 8, 48),
        new SeqCigar(s_gapped) };
    Alignment al = new Alignment(set);
    for (int i = 0; i < al.getHeight(); i++)
    {
      System.out.println("" + al.getSequenceAt(i).getName() + "\t"
              + al.getSequenceAt(i).getStart() + "\t"
              + al.getSequenceAt(i).getEnd() + "\t"
              + al.getSequenceAt(i).getSequenceAsString());
    }
  }

  // the same set again, after deleting a range from the first cigar
  {
    System.out.println("Gapped.");
    SeqCigar[] set = new SeqCigar[]
    { new SeqCigar(s), new SeqCigar(s_subsequence_gapped, 8, 48),
        new SeqCigar(s_gapped) };
    set[0].deleteRange(20, 25);
    Alignment al = new Alignment(set);
    for (int i = 0; i < al.getHeight(); i++)
    {
      System.out.println("" + al.getSequenceAt(i).getName() + "\t"
              + al.getSequenceAt(i).getStart() + "\t"
              + al.getSequenceAt(i).getEnd() + "\t"
              + al.getSequenceAt(i).getSequenceAsString());
    }
  }
  // if (!ssgapedseq.equals("ryas---dtqqwa----slchvh"))
  // System.err.println("Subseqgaped\n------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhryas---dtqwrtsasll--qwa----slchvh\n"+ssgapedseq+"\n"+sub_se_gp.getCigarstring());
}
/**
* references to entities that this sequence cigar is associated with. The
* table is created on first use by setGroupMembership; the associated
* entities are its keys and every value is an empty int array placeholder.
*/
private Hashtable selGroups = null;
/**
 * Associates this sequence cigar with the given entity, creating the
 * association table on first use.
 *
 * @param group
 *          entity to associate with this seqCigar
 */
public void setGroupMembership(Object group)
{
  if (selGroups == null)
  {
    selGroups = new Hashtable();
  }
  // only the key matters - the value is an empty placeholder
  int[] placeholder = new int[0];
  selGroups.put(group, placeholder);
}
/**
 * Removes the association to the given group, if there is one.
 *
 * @param group
 *          entity whose association should be dropped
 * @return true if group was associated and it was removed
 */
public boolean removeGroupMembership(Object group)
{
  if (selGroups == null)
  {
    return false;
  }
  // values stored in the table are never null, so a non-null result means
  // that an association existed and has now been removed
  return selGroups.remove(group) != null;
}
/**
 * Drops every association held for this sequence and discards the
 * association table.
 */
public void clearMemberships()
{
  Hashtable previous = selGroups;
  selGroups = null;
  if (previous != null)
  {
    previous.clear();
  }
}
/**
 * Lists the entities currently associated with this sequence cigar.
 *
 * @return null if no association table exists, otherwise an array of all
 *         associated entities
 */
public Object[] getAllMemberships()
{
  if (selGroups == null)
  {
    return null;
  }
  Object[] members = new Object[selGroups.size()];
  Enumeration keys = selGroups.keys();
  int next = 0;
  while (keys.hasMoreElements())
  {
    members[next] = keys.nextElement();
    next++;
  }
  return members;
}
/**
 * Test for group membership
 *
 * @param sgr
 *          - a selection group or some other object that may be associated
 *          with seqCigar
 * @return true if sgr is associated with this seqCigar
 */
public boolean isMemberOf(Object sgr)
{
  if (selGroups == null)
  {
    return false;
  }
  // stored values are never null, so key presence is equivalent to a
  // non-null lookup result
  return selGroups.containsKey(sgr);
}
}