X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fdatamodel%2FSeqCigar.java;h=c282d6e04056079f6a06f6cf0ead92bec22b6b9d;hb=f2fed0286852eee53492661a16309f6948f7ce05;hp=536e4eaed3480678db86ac1de18b64d0a79be74e;hpb=60508bc218cee42c6fa3405db19f7790acafabab;p=jalview.git diff --git a/src/jalview/datamodel/SeqCigar.java b/src/jalview/datamodel/SeqCigar.java index 536e4ea..c282d6e 100644 --- a/src/jalview/datamodel/SeqCigar.java +++ b/src/jalview/datamodel/SeqCigar.java @@ -1,299 +1,678 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 + * of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Jalview. If not, see . + * The Jalview Authors are detailed in the 'AUTHORS' file. + */ package jalview.datamodel; import jalview.analysis.AlignSeq; +import jalview.analysis.SeqsetUtils; +import jalview.util.MessageManager; +import jalview.util.ShiftList; -public class SeqCigar - extends CigarSimple +import java.util.Enumeration; +import java.util.Hashtable; + +public class SeqCigar extends CigarSimple { + /** + * start(inclusive) and end(exclusive) of subsequence on refseq + */ + private int start, end; + + private SequenceI refseq = null; + + private Hashtable seqProps; - private SequenceI refseq=null; /** * Reference dataset sequence for the cigar string + * * @return SequenceI */ - public SequenceI getRefSeq() { + public SequenceI getRefSeq() + { return refseq; } + + /** + * + * @return int start index of cigar ops on refSeq + */ + public int getStart() + { + return start; + } + + /** + * + * @return int end index (exclusive) of cigar ops on refSeq + */ + public int getEnd() + { + return end; + } + + /** + * + * @param column + * @return position in sequence for column (or -1 if no match state exists) + */ + public int findPosition(int column) + { + int w = 0, ew, p = refseq.findPosition(start); + if (column < 0) + { + return -1; + } + if (range != null) + { + for (int i = 0; i < length; i++) + { + if (operation[i] == M || operation[i] == D) + { + p += range[i]; + } + if (operation[i] == M || operation[i] == I) + { + ew = w + range[i]; + if (column < ew) + { + if (operation[i] == I) + { + return -1; + } + return p - (ew - column); + } + w = ew; + } + } + } + return -1; + } + /** * Returns sequence as a string with cigar operations applied to it + * * @return String */ + @Override public String getSequenceString(char GapChar) { - return (length==0) ? "" : (String) getSequenceAndDeletions(refseq.getSequence(), GapChar)[0]; + return (length == 0) ? "" + : (String) getSequenceAndDeletions( + refseq.getSequenceAsString(start, end), GapChar)[0]; } /** - * recreates a gapped and edited version of RefSeq or null for an empty cigar string + * recreates a gapped and edited version of RefSeq or null for an empty cigar + * string + * * @return SequenceI */ - public SequenceI getSeq(char GapChar) { + public SequenceI getSeq(char GapChar) + { Sequence seq; - if (refseq==null || length==0) + if (refseq == null || length == 0) + { return null; - Object[] edit_result=getSequenceAndDeletions(refseq.getSequence(), GapChar); - if (edit_result==null) - throw new Error("Implementation Error - unexpected null from getSequenceAndDeletions"); - - seq = new Sequence(refseq.getName(), (String) edit_result[0], refseq.getStart()+((int[]) edit_result[1])[0], refseq.getStart()+((int[]) edit_result[1])[2]); + } + Object[] edit_result = getSequenceAndDeletions( + refseq.getSequenceAsString(start, end), GapChar); + if (edit_result == null) + { + throw new Error(MessageManager.getString( + "error.implementation_error_unexpected_null_from_get_sequence_and_deletions")); + } + int bounds[] = (int[]) edit_result[1]; + seq = new Sequence(refseq.getName(), (String) edit_result[0], + refseq.getStart() + start + bounds[0], refseq.getStart() + start + + ((bounds[2] == 0) ? -1 : bounds[2])); + seq.setDescription(refseq.getDescription()); + int sstart = seq.getStart(), send = seq.getEnd(); + // seq.checkValidRange(); probably not needed + // recover local properties if present + if (seqProps != null) + { + // this recovers dataset sequence reference as well as local features, + // names, start/end settings. + SeqsetUtils.SeqCharacterUnhash(seq, seqProps); + } + // ensure dataset sequence is up to date from local reference seq.setDatasetSequence(refseq); + seq.setStart(sstart); + seq.setEnd(send); return seq; } + /* - We don't allow this - refseq is given at construction time only - public void setSeq(SequenceI seq) { - this.seq = seq; - } - */ + * We don't allow this - refseq is given at construction time only public void + * setSeq(SequenceI seq) { this.seq = seq; } + */ /** - * internal constructor - sets seq to a gapless sequence derived from seq - * and prepends any 'D' operations needed to get to the first residue of seq. - * @param seq SequenceI + * internal constructor - sets seq to a gapless sequence derived from seq and + * prepends any 'D' operations needed to get to the first residue of seq. + * + * @param seq + * SequenceI + * @param initialDeletion + * true to mark initial dataset sequence residues as deleted in + * subsequence + * @param _s + * index of first position in seq + * @param _e + * index after last position in (possibly gapped) seq * @return true if gaps are present in seq */ - private boolean _setSeq(SequenceI seq) { - boolean hasgaps=false; - - if (seq==null) - throw new Error("Implementation Error - _setSeq(null)"); - - // Find correct sequence to reference and add initial hidden offset - SequenceI ds = seq.getDatasetSequence(); - if (ds==null) { - ds = new Sequence(seq.getName(), - AlignSeq.extractGaps(jalview.util.Comparison.GapChars, new String(seq.getSequence())), - seq.getStart(), - seq.getEnd()); + private boolean _setSeq(SequenceI seq, boolean initialDeletion, int _s, + int _e) + { + boolean hasgaps = false; + if (seq == null) + { + throw new Error(MessageManager + .getString("error.implementation_error_set_seq_null")); } - // check that we haven't just duplicated an ungapped sequence. - if (ds.getLength()==seq.getLength()) { + if (_s < 0) + { + throw new Error(MessageManager + .formatMessage("error.implementation_error_s", new String[] + { Integer.valueOf(_s).toString() })); + } + String seq_string = seq.getSequenceAsString(); + if (_e == 0 || _e < _s || _e > seq_string.length()) + { + _e = seq_string.length(); + } + // resolve start and end positions relative to ungapped reference sequence + start = seq.findPosition(_s) - seq.getStart(); + end = seq.findPosition(_e) - seq.getStart(); + int l_ungapped = end - start; + // Find correct sequence to reference and correct start and end - if + // necessary + SequenceI ds = seq.getDatasetSequence(); + if (ds == null) + { + // make a new dataset sequence + String ungapped = AlignSeq.extractGaps( + jalview.util.Comparison.GapChars, new String(seq_string)); + l_ungapped = ungapped.length(); + // check that we haven't just duplicated an ungapped sequence. + if (l_ungapped == seq.getLength()) + { ds = seq; - } else { - hasgaps = true; } - this.refseq = ds; - // Adjust offset - if (ds.getStart() ds.getLength()) + { + throw new Error(MessageManager + .getString("error.implementation_error_seqcigar_possible")); + // end = ds.getLength(); + } + return hasgaps; } + /** - * directly initialise a cigar object with a sequence of range, operation pairs and a sequence to apply it to. - * operation and range should be relative to the seq.getStart()'th residue of the dataset seq resolved from seq. - * @param seq SequenceI - * @param operation char[] - * @param range int[] + * directly initialise a cigar object with a sequence of range, operation + * pairs and a sequence to apply it to. operation and range should be relative + * to the seq.getStart()'th residue of the dataset seq resolved from seq. + * + * @param seq + * SequenceI + * @param operation + * char[] + * @param range + * int[] */ - public SeqCigar(SequenceI seq, char operation[], int range[]) { + public SeqCigar(SequenceI seq, char operation[], int range[]) + { super(); - if (seq==null) - throw new Error("Implementation Bug. Null seq !"); - if (operation.length!=range.length) { - throw new Error("Implementation Bug. Cigar Operation list!= range list"); + if (seq == null) + { + throw new Error( + MessageManager.getString("error.implmentation_bug_seq_null")); + } + if (operation.length != range.length) + { + throw new Error(MessageManager.getString( + "error.implementation_bug_cigar_operation_list_range_list")); } - if (operation!=null) { - this.operation = new char[operation.length+_inc_length]; - this.range = new int[operation.length+_inc_length]; + if (operation != null) + { + this.operation = new char[operation.length + _inc_length]; + this.range = new int[operation.length + _inc_length]; - if (_setSeq(seq)) { - throw new Error("NOT YET Implemented: Constructing a Cigar object from a cigar string and a gapped sequence."); + if (_setSeq(seq, false, 0, 0)) + { + throw new Error(MessageManager.getString( + "error.not_yet_implemented_cigar_object_from_cigar_string")); } - for (int i = this.length, j=0; j < operation.length; i++,j++) + for (int i = this.length, j = 0; j < operation.length; i++, j++) { char op = operation[j]; if (op != M && op != I && op != D) { - throw new Error( - "Implementation Bug. Cigar Operation '"+j+"' '"+op+"' not one of '"+M+"', '"+I+"', or '"+D+"'."); + throw new Error(MessageManager.formatMessage( + "error.implementation_bug_cigar_operation", new String[] + { Integer.valueOf(j).toString(), + Integer.valueOf(op).toString(), + Integer.valueOf(M).toString(), + Integer.valueOf(I).toString(), + Integer.valueOf(D).toString() })); } this.operation[i] = op; this.range[i] = range[j]; } - this.length+=operation.length; - } else { + this.length += operation.length; + } + else + { this.operation = null; this.range = null; - this.length=0; - if (_setSeq(seq)) { - throw new Error("NOT YET Implemented: Constructing a Cigar object from a cigar string and a gapped sequence."); + this.length = 0; + if (_setSeq(seq, false, 0, 0)) + { + throw new Error(MessageManager.getString( + "error.not_yet_implemented_cigar_object_from_cigar_string")); } } } + /** * add range matched residues to cigar string - * @param range int + * + * @param range + * int */ - public void addMatch(int range) { + public void addMatch(int range) + { this.addOperation(M, range); } + /** - * Deleted regions mean that there will be discontinuous sequence numbering in the - * sequence returned by getSeq(char). - * @return true if there are non-terminal deletions + * Adds insertion and match operations based on seq to the cigar up to the + * endpos column of seq. + * + * @param cigar + * CigarBase + * @param seq + * SequenceI + * @param startpos + * int + * @param endpos + * int + * @param initialDeletions + * if true then initial deletions will be added from start of seq to + * startpos */ - public boolean hasDeletedRegions() { - for (int i=1, l=length-1; i 0 && op != I) + if (isGap) + { + if (range > 0 && op != I) + { + cigar.addOperation(op, range); + range = 0; + } + op = I; + range++; + } + else { - cigar.addOperation(op, range); - range = 0; + if (range > 0 && op != M) + { + cigar.addOperation(op, range); + range = 0; + } + op = M; + range++; } - op = I; - range++; } else { - if (range > 0 && op != M) + if (!isGap) { - cigar.addOperation(op, range); - range = 0; + if (range > 0 && op != D) + { + cigar.addOperation(op, range); + range = 0; + } + op = D; + range++; } - op = M; - range++; - } - } - else - { - if (!isGap) - { - if (range > 0 && op != D) + else { - cigar.addOperation(op, range); - range = 0; + // do nothing - insertions are not made in flanking regions } - op = D; - range++; - } - else - { - // do nothing - insertions are not recorded in flanking regions. } + p++; + } + if (range > 0) + { + cigar.addOperation(op, range); } } - if (range > 0) - { - cigar.addOperation(op, range); - } - } + /** * create a cigar string for given sequence - * @param seq SequenceI + * + * @param seq + * SequenceI */ - public SeqCigar(SequenceI seq) { + public SeqCigar(SequenceI seq) + { super(); if (seq == null) - throw new Error("Implementation error for new Cigar(SequenceI)"); - if (_setSeq(seq)) { - // there is still work to do - addSequenceOps(this, seq, 0, seq.getLength()); + throw new Error(MessageManager + .getString("error.implementation_error_for_new_cigar")); } + _setSeq(seq, false, 0, 0); + // there is still work to do + addSequenceOps(this, seq, 0, seq.getLength() - 1, false); } - public SeqCigar(SequenceI seq, int start, int end) { + + /** + * Create Cigar from a range of gaps and residues on a sequence object + * + * @param seq + * SequenceI + * @param start + * int - first column in range + * @param end + * int - last column in range + */ + public SeqCigar(SequenceI seq, int start, int end) + { super(); if (seq == null) - throw new Error("Implementation error for new Cigar(SequenceI)"); - if (_setSeq(seq)) { - // there is still work to do - addSequenceOps(this, seq, start, end); + throw new Error(MessageManager + .getString("error.implementation_error_for_new_cigar")); } + _setSeq(seq, false, start, end + 1); + // there is still work to do + addSequenceOps(this, seq, start, end, false); } /** - * Create a cigar object from a cigar string like '[]+' - * Will fail if the given seq already contains gaps (JBPNote: future implementation will fix) - * @param seq SequenceI object resolvable to a dataset sequence - * @param cigarString String + * Create a cigar object from a cigar string like '[]+' Will + * fail if the given seq already contains gaps (JBPNote: future implementation + * will fix) + * + * @param seq + * SequenceI object resolvable to a dataset sequence + * @param cigarString + * String * @return Cigar */ public static SeqCigar parseCigar(SequenceI seq, String cigarString) - throws Exception + throws Exception { Object[] opsandrange = parseCigarString(cigarString); - return new SeqCigar(seq, (char[]) opsandrange[0], (int[]) opsandrange[1]); + return new SeqCigar(seq, (char[]) opsandrange[0], + (int[]) opsandrange[1]); } + /** - * non rigorous testing + * create an alignment from the given array of cigar sequences and gap + * character, and marking the given segments as visible in the given + * hiddenColumns. + * + * @param alseqs + * @param gapCharacter + * @param hidden + * - hiddenColumns where hidden regions are marked + * @param segments + * - visible regions of alignment + * @return SequenceI[] */ + public static SequenceI[] createAlignmentSequences(SeqCigar[] alseqs, + char gapCharacter, HiddenColumns hidden, int[] segments) + { + SequenceI[] seqs = new SequenceI[alseqs.length]; + StringBuffer[] g_seqs = new StringBuffer[alseqs.length]; + String[] alseqs_string = new String[alseqs.length]; + Object[] gs_regions = new Object[alseqs.length]; + for (int i = 0; i < alseqs.length; i++) + { + alseqs_string[i] = alseqs[i].getRefSeq() + .getSequenceAsString(alseqs[i].start, alseqs[i].end); + gs_regions[i] = alseqs[i].getSequenceAndDeletions(alseqs_string[i], + gapCharacter); // gapped sequence, {start, start col, end. + // endcol}, hidden regions {{start, end, col}}) + if (gs_regions[i] == null) + { + throw new Error(MessageManager.formatMessage( + "error.implementation_error_cigar_seq_no_operations", + new String[] + { Integer.valueOf(i).toString() })); + } + g_seqs[i] = new StringBuffer((String) ((Object[]) gs_regions[i])[0]); // the + // visible + // gapped + // sequence + } + // Now account for insertions. (well - deletions) + // this is complicated because we must keep track of shifted positions in + // each sequence + ShiftList shifts = new ShiftList(); + for (int i = 0; i < alseqs.length; i++) + { + Object[] gs_region = ((Object[]) ((Object[]) gs_regions[i])[2]); + if (gs_region != null) + + { + for (int hr = 0; hr < gs_region.length; hr++) + { + int[] region = (int[]) gs_region[hr]; + char[] insert = new char[region[1] - region[0] + 1]; + for (int s = 0; s < insert.length; s++) + { + insert[s] = gapCharacter; + } + int inspos = shifts.shift(region[2]); // resolve insertion position in + // current alignment frame of + // reference + for (int s = 0; s < alseqs.length; s++) + { + if (s != i) + { + if (g_seqs[s].length() <= inspos) + { + // prefix insertion with more gaps. + for (int l = inspos - g_seqs[s].length(); l > 0; l--) + { + g_seqs[s].append(gapCharacter); // to debug - use a diffferent + // gap character here + } + } + g_seqs[s].insert(inspos, insert); + } + else + { + g_seqs[s].insert(inspos, + alseqs_string[i].substring(region[0], region[1] + 1)); + } + } + shifts.addShift(region[2], insert.length); // update shift in + // alignment frame of + // reference + if (segments == null) + { + // add a hidden column for this deletion + hidden.hideColumns(inspos, inspos + insert.length - 1); + } + } + } + } + for (int i = 0; i < alseqs.length; i++) + { + int[] bounds = ((int[]) ((Object[]) gs_regions[i])[1]); + SequenceI ref = alseqs[i].getRefSeq(); + seqs[i] = new Sequence(ref.getName(), g_seqs[i].toString(), + ref.getStart() + alseqs[i].start + bounds[0], + ref.getStart() + alseqs[i].start + + (bounds[2] == 0 ? -1 : bounds[2])); + seqs[i].setDatasetSequence(ref); + seqs[i].setDescription(ref.getDescription()); + SeqsetUtils.SeqCharacterUnhash(seqs[i],alseqs[i].seqProps); + } + if (segments != null) + { + for (int i = 0; i < segments.length; i += 3) + { + // int start=shifts.shift(segments[i]-1)+1; + // int end=shifts.shift(segments[i]+segments[i+1]-1)-1; + hidden.hideColumns(segments[i + 1], + segments[i + 1] + segments[i + 2] - 1); + } + } + return seqs; + } + /** - * - * @param seq Sequence - * @param ex_cs_gapped String - * @return String + * references to entities that this sequence cigar is associated with. + */ + private Hashtable selGroups = null; + + public void setGroupMembership(Object group) + { + if (selGroups == null) + { + selGroups = new Hashtable(); + } + selGroups.put(group, new int[0]); + } + + /** + * Test for and if present remove association to group. + * + * @param group + * @return true if group was associated and it was removed */ - public static String testCigar_string(Sequence seq, String ex_cs_gapped) { - SeqCigar c_sgapped = new SeqCigar(seq); - String cs_gapped = c_sgapped.getCigarstring(); - if (!cs_gapped.equals(ex_cs_gapped)) - System.err.println("Failed getCigarstring: incorect string '"+cs_gapped+"' != "+ex_cs_gapped); - return cs_gapped; + public boolean removeGroupMembership(Object group) + { + if (selGroups != null && selGroups.containsKey(group)) + { + selGroups.remove(group); + return true; + } + return false; } - public static boolean testSeqRecovery(SeqCigar gen_sgapped, SequenceI s_gapped) { - SequenceI gen_sgapped_s = gen_sgapped.getSeq('-'); - if (!gen_sgapped_s.getSequence().equals(s_gapped.getSequence())) { - System.err.println("Couldn't reconstruct sequence.\n" + - gen_sgapped_s.getSequence() + "\n" + - s_gapped.getSequence()); - return false; + + /** + * forget all associations for this sequence. + */ + public void clearMemberships() + { + if (selGroups != null) + { + selGroups.clear(); } - return true; + selGroups = null; } - public static void main(String argv[]) throws Exception { - Sequence s=new Sequence("MySeq", "asdfktryasdtqwrtsaslldddptyipqqwaslchvhttt",39,80); - String orig_gapped; - Sequence s_gapped=new Sequence("MySeq", orig_gapped="----asdf------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhttt", 39,80); - String ex_cs_gapped="4I4M6I6M3I11M4I12M4I9M"; - s_gapped.setDatasetSequence(s); - String sub_gapped_s; - Sequence s_subsequence_gapped=new Sequence("MySeq", sub_gapped_s="------ktryas---dtqwrtsasll----dddptyipqqwa----slchvh", 43,77); - - s_subsequence_gapped.setDatasetSequence(s); - SeqCigar c_null = new SeqCigar(s); - String cs_null = c_null.getCigarstring(); - if (cs_null.length()>0) - System.err.println("Failed getCigarstring: Unexpected cigar operations:"+cs_null); - testCigar_string(s_gapped, ex_cs_gapped); - SeqCigar gen_sgapped = SeqCigar.parseCigar(s, ex_cs_gapped); - if (!gen_sgapped.getCigarstring().equals(ex_cs_gapped)) - System.err.println("Failed parseCigar("+ex_cs_gapped+")->getCigarString()->'"+gen_sgapped.getCigarstring()+"'"); - testSeqRecovery(gen_sgapped, s_gapped); - // Test dataset resolution - SeqCigar sub_gapped = new SeqCigar(s_subsequence_gapped); - if (!testSeqRecovery(sub_gapped, s_subsequence_gapped)) - System.err.println("Failed recovery for subsequence of dataset sequence"); - // width functions - if (sub_gapped.getWidth()!=sub_gapped_s.length()) - System.err.println("Failed getWidth()"); - - sub_gapped.getFullWidth(); - if (sub_gapped.hasDeletedRegions()) - System.err.println("hasDeletedRegions is incorrect."); - // Test start-end region SeqCigar - SeqCigar sub_se_gp= new SeqCigar(s_subsequence_gapped, 8, 48); - if (sub_se_gp.getWidth()!=40) - System.err.println("SeqCigar(seq, start, end) not properly clipped alignsequence."); - System.out.println("Original sequence align:\n"+sub_gapped_s+"\nReconstructed window from 8 to 48\n"+"XXXXXXXX"+sub_se_gp.getSequenceString('-')+"...."+"\nCigar String:"+sub_se_gp.getCigarstring()+""); + + /** + * + * @return null or array of all associated entities + */ + public Object[] getAllMemberships() + { + if (selGroups == null) + { + return null; + } + Object[] mmbs = new Object[selGroups.size()]; + Enumeration en = selGroups.keys(); + for (int i = 0; en.hasMoreElements(); i++) + { + mmbs[i] = en.nextElement(); + } + return mmbs; } + /** + * Test for group membership + * + * @param sgr + * - a selection group or some other object that may be associated + * with seqCigar + * @return true if sgr is associated with this seqCigar + */ + public boolean isMemberOf(Object sgr) + { + return (selGroups != null) && selGroups.get(sgr) != null; + } }