*/
package jalview.analysis;
-import java.util.Locale;
-
import jalview.analysis.scoremodels.PIDModel;
import jalview.analysis.scoremodels.ScoreMatrix;
import jalview.analysis.scoremodels.ScoreModels;
import jalview.analysis.scoremodels.SimilarityParams;
+import jalview.api.analysis.SimilarityParamsI;
import jalview.datamodel.AlignmentAnnotation;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.Mapping;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceI;
+import jalview.math.MiscMath;
import jalview.util.Comparison;
import jalview.util.Format;
import jalview.util.MapList;
import java.awt.Color;
import java.awt.Graphics;
import java.io.PrintStream;
+import java.lang.IllegalArgumentException;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.HashMap;
import java.util.List;
import java.util.StringTokenizer;
+import java.util.Locale;
/**
- *
- *
* @author $author$
* @version $Revision$
*/
{
private static final int MAX_NAME_LENGTH = 30;
- private static final int GAP_OPEN_COST = 120;
+ private static final int DEFAULT_OPENCOST = 120;
+
+ private static final int DEFAULT_EXTENDCOST = 20;
+
+ private int GAP_OPEN_COST=DEFAULT_OPENCOST;
- private static final int GAP_EXTEND_COST = 20;
+ private int GAP_EXTEND_COST=DEFAULT_EXTENDCOST;
private static final int GAP_INDEX = -1;
float[][] score;
+ float alignmentScore;
+
float[][] E;
float[][] F;
int[] aseq1;
int[] aseq2;
+
+ /*
+ * matches in alignment
+ */
+ int match=-1;
public String astr1 = "";
public String astr2 = "";
+ public String indelfreeAstr1 = "";
+
+ public String indelfreeAstr2 = "";
+
/** DOCUMENT ME!! */
public int seq1start;
public float maxscore;
+ public float meanScore; //needed for PaSiMap
+
+ public int hypotheticMaxScore; // needed for PaSiMap
+
int prev = 0;
StringBuffer output = new StringBuffer();
* @param type
* molecule type, either AlignSeq.PEP or AlignSeq.DNA
*/
+ public AlignSeq(int opencost, int extcost)
+ {
+ GAP_OPEN_COST = opencost;
+ GAP_EXTEND_COST = extcost;
+ }
public AlignSeq(SequenceI s1, SequenceI s2, String type)
{
seqInit(s1, s1.getSequenceAsString(), s2, s2.getSequenceAsString(),
/**
* Creates a new AlignSeq object.
*
- * @param s1
- * DOCUMENT ME!
- * @param s2
- * DOCUMENT ME!
+ * @param s1,string1
+ * s1 reference sequence for string1
+ * @param s2,string2
+ * s2 reference sequence for string2
* @param type
- * DOCUMENT ME!
+ * molecule type, either AlignSeq.PEP or AlignSeq.DNA
*/
public AlignSeq(SequenceI s1, String string1, SequenceI s2,
String string2, String type)
string2.toUpperCase(Locale.ROOT), type);
}
+ public AlignSeq(SequenceI s1, SequenceI s2, String type, int opencost,
+ int extcost)
+ {
+ this(s1,s2,type);
+ GAP_OPEN_COST=opencost;
+ GAP_EXTEND_COST=extcost;
+ }
+
+ public AlignSeq(SequenceI s12, String string1, SequenceI s22,
+ String string2, String type2, int defaultOpencost,
+ int defaultExtendcost)
+ {
+ this(s12,string1,s22,string2,type2);
+ GAP_OPEN_COST=defaultOpencost;
+ GAP_EXTEND_COST=defaultExtendcost;
+ }
+
/**
* DOCUMENT ME!
*
}
/**
+ * returns the overall score of the alignment
+ *
+ * @return
+ */
+ public float getAlignmentScore()
+ {
+ return alignmentScore;
+ }
+
+ /**
* DOCUMENT ME!
*
* @return DOCUMENT ME!
s2.getDatasetSequence() == null ? s2 : s2.getDatasetSequence());
return alSeq2;
}
+ /**
+ * fraction of seq2 matched in the alignment
+ * @return NaN or [0..1]
+ */
+ public double getS2Coverage()
+ {
+ if (match>=0)
+ {
+ return ((double)match)/((double)s2.getEnd()-s2.getStart()+1);
+ }
+ return Double.NaN;
+ }
+ /**
+ * fraction of seq1 matched in the alignment
+ * @return NaN or [0..1]
+ */
+ public double getS1Coverage()
+ {
+ if (match>=0)
+ {
+ return ((double)match)/((double)s1.getEnd()-s1.getStart()+1);
+ }
+ return Double.NaN;
+ }
/**
* Construct score matrix for sequences with standard DNA or PEPTIDE matrix
public void seqInit(SequenceI s1, String string1, SequenceI s2,
String string2, String type)
{
+ seqInit(s1,string1,s2,string2,type,GAP_OPEN_COST,GAP_EXTEND_COST);
+ }
+ public void seqInit(SequenceI s1, String string1, SequenceI s2,
+ String string2, String type, int opening,int extension)
+ {
+ GAP_OPEN_COST=opening;
+ GAP_EXTEND_COST=extension;
this.s1 = s1;
this.s2 = s2;
setDefaultParams(type);
aseq1 = new int[seq1.length + seq2.length];
aseq2 = new int[seq1.length + seq2.length];
-
+ match=0;
StringBuilder sb1 = new StringBuilder(aseq1.length);
StringBuilder sb2 = new StringBuilder(aseq2.length);
count = (seq1.length + seq2.length) - 1;
+
while (i > 0 && j > 0)
{
aseq1[count] = seq1[i];
sb1.append(s1str.charAt(i));
aseq2[count] = seq2[j];
sb2.append(s2str.charAt(j));
-
trace = findTrace(i, j);
if (trace == 0)
{
+ match++;
i--;
j--;
}
{
aseq2[count] = seq2[j];
sb2.append(s2str.charAt(j));
+ if (aseq1[count]!=GAP_INDEX) {
+ match++;
+ }
+ }
+
+
+ /*
+ * we built the character strings backwards, so now
+ * reverse them to convert to sequence strings
+ */
+ astr1 = sb1.reverse().toString();
+ astr2 = sb2.reverse().toString();
+ }
+
+ /**
+ * DOCUMENT ME!
+ */
+ public void traceAlignmentWithEndGaps()
+ {
+ // Find the maximum score along the rhs or bottom row
+ float max = -Float.MAX_VALUE;
+
+ for (int i = 0; i < seq1.length; i++)
+ {
+ if (score[i][seq2.length - 1] > max)
+ {
+ max = score[i][seq2.length - 1];
+ maxi = i;
+ maxj = seq2.length - 1;
+ }
+ }
+
+ for (int j = 0; j < seq2.length; j++)
+ {
+ if (score[seq1.length - 1][j] > max)
+ {
+ max = score[seq1.length - 1][j];
+ maxi = seq1.length - 1;
+ maxj = j;
+ }
+ }
+
+ int i = maxi;
+ int j = maxj;
+ int trace;
+ maxscore = score[i][j] / 10f;
+
+ //prepare trailing gaps
+ while ((i < seq1.length - 1) || (j < seq2.length - 1))
+ {
+ i++;
+ j++;
+ }
+ seq1end = i + 1;
+ seq2end = j + 1;
+
+ aseq1 = new int[seq1.length + seq2.length];
+ aseq2 = new int[seq1.length + seq2.length];
+
+ StringBuilder sb1 = new StringBuilder(aseq1.length);
+ StringBuilder sb2 = new StringBuilder(aseq2.length);
+
+ count = (seq1.length + seq2.length) - 1;
+
+ //get trailing gaps
+ while ((i >= seq1.length) || (j >= seq2.length))
+ {
+ if (i >= seq1.length)
+ {
+ aseq1[count] = GAP_INDEX;
+ sb1.append("-");
+ aseq2[count] = seq2[j];
+ sb2.append(s2str.charAt(j));
+ } else if (j >= seq2.length) {
+ aseq1[count] = seq1[i];
+ sb1.append(s1str.charAt(i));
+ aseq2[count] = GAP_INDEX;
+ sb2.append("-");
+ }
+ i--;
+ j--;
+ }
+
+ while (i > 0 && j > 0)
+ {
+ aseq1[count] = seq1[i];
+ sb1.append(s1str.charAt(i));
+ aseq2[count] = seq2[j];
+ sb2.append(s2str.charAt(j));
+
+ trace = findTrace(i, j);
+
+ if (trace == 0)
+ {
+ i--;
+ j--;
+ }
+ else if (trace == 1)
+ {
+ j--;
+ aseq1[count] = GAP_INDEX;
+ sb1.replace(sb1.length() - 1, sb1.length(), "-");
+ }
+ else if (trace == -1)
+ {
+ i--;
+ aseq2[count] = GAP_INDEX;
+ sb2.replace(sb2.length() - 1, sb2.length(), "-");
+ }
+
+ count--;
+ }
+
+ seq1start = i + 1;
+ seq2start = j + 1;
+
+ aseq1[count] = seq1[i];
+ sb1.append(s1str.charAt(i));
+ aseq2[count] = seq2[j];
+ sb2.append(s2str.charAt(j));
+
+ //get initial gaps
+ while (j > 0 || i > 0)
+ {
+ if (j > 0)
+ {
+ j--;
+ sb1.append("-");
+ sb2.append(s2str.charAt(j));
+ } else if (i > 0) {
+ i--;
+ sb1.append(s1str.charAt(i));
+ sb2.append("-");
+ }
}
/*
{
int n = seq1.length;
int m = seq2.length;
-
+ final int GAP_EX_COST=GAP_EXTEND_COST;
+ final int GAP_OP_COST = GAP_OPEN_COST;
// top left hand element
score[0][0] = scoreMatrix.getPairwiseScore(s1str.charAt(0),
s2str.charAt(0)) * 10;
- E[0][0] = -GAP_EXTEND_COST;
+ E[0][0] = -GAP_EX_COST;
F[0][0] = 0;
// Calculate the top row first
for (int j = 1; j < m; j++)
{
// What should these values be? 0 maybe
- E[0][j] = max(score[0][j - 1] - GAP_OPEN_COST,
- E[0][j - 1] - GAP_EXTEND_COST);
- F[0][j] = -GAP_EXTEND_COST;
+ E[0][j] = max(score[0][j - 1] - GAP_OP_COST,
+ E[0][j - 1] - GAP_EX_COST);
+ F[0][j] = -GAP_EX_COST;
float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(0),
s2str.charAt(j));
- score[0][j] = max(pairwiseScore * 10, -GAP_OPEN_COST,
- -GAP_EXTEND_COST);
+ score[0][j] = max(pairwiseScore * 10, -GAP_OP_COST,
+ -GAP_EX_COST);
traceback[0][j] = 1;
}
// Now do the left hand column
for (int i = 1; i < n; i++)
{
- E[i][0] = -GAP_OPEN_COST;
- F[i][0] = max(score[i - 1][0] - GAP_OPEN_COST,
- F[i - 1][0] - GAP_EXTEND_COST);
+ E[i][0] = -GAP_OP_COST;
+ F[i][0] = max(score[i - 1][0] - GAP_OP_COST,
+ F[i - 1][0] - GAP_EX_COST);
float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(i),
s2str.charAt(0));
{
for (int j = 1; j < m; j++)
{
- E[i][j] = max(score[i][j - 1] - GAP_OPEN_COST,
- E[i][j - 1] - GAP_EXTEND_COST);
- F[i][j] = max(score[i - 1][j] - GAP_OPEN_COST,
- F[i - 1][j] - GAP_EXTEND_COST);
+ E[i][j] = max(score[i][j - 1] - GAP_OP_COST,
+ E[i][j - 1] - GAP_EX_COST);
+ F[i][j] = max(score[i - 1][j] - GAP_OP_COST,
+ F[i - 1][j] - GAP_EX_COST);
float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(i),
s2str.charAt(j));
public static AlignSeq doGlobalNWAlignment(SequenceI s1, SequenceI s2,
String type)
{
- AlignSeq as = new AlignSeq(s1, s2, type);
+ return doGlobalNWAlignment(s1, s2, type, DEFAULT_OPENCOST,DEFAULT_EXTENDCOST);
+ }
+ public static AlignSeq doGlobalNWAlignment(SequenceI s1, SequenceI s2,
+ String type, int opencost,int extcost)
+ {
+
+ AlignSeq as = new AlignSeq(s1, s2, type,opencost,extcost);
as.calcScoreMatrix();
as.traceAlignment();
}
return redundancy;
}
+
+ /**
+ * calculate the mean score of the alignment
+ * mean score is equal to the score of an alignmenet of two sequences with randomly shuffled AA sequence composited of the same AA as the two original sequences
+ *
+ */
+ public void meanScore()
+ {
+ int length = indelfreeAstr1.length(); //both have the same length
+ //create HashMap for counting residues in each sequence
+ HashMap<Character, Integer> seq1ResCount = new HashMap<Character, Integer>();
+ HashMap<Character, Integer> seq2ResCount = new HashMap<Character, Integer>();
+
+ // for both sequences (String indelfreeAstr1 or 2) create a key for the residue and add 1 each time its encountered
+ for (char residue: indelfreeAstr1.toCharArray())
+ {
+ seq1ResCount.putIfAbsent(residue, 0);
+ seq1ResCount.replace(residue, seq1ResCount.get(residue) + 1);
+ }
+ for (char residue: indelfreeAstr2.toCharArray())
+ {
+ seq2ResCount.putIfAbsent(residue, 0);
+ seq2ResCount.replace(residue, seq2ResCount.get(residue) + 1);
+ }
+
+ // meanscore = for each residue pair get the number of appearance and add (countA * countB * pairwiseScore(AB))
+ // divide the meanscore by the sequence length afterwards
+ float _meanscore = 0;
+ for (char resA : seq1ResCount.keySet())
+ {
+ for (char resB : seq2ResCount.keySet())
+ {
+ int countA = seq1ResCount.get(resA);
+ int countB = seq2ResCount.get(resB);
+
+ float scoreAB = scoreMatrix.getPairwiseScore(resA, resB);
+
+ _meanscore += countA * countB * scoreAB;
+ }
+ }
+ _meanscore /= length;
+ this.meanScore = _meanscore;
+ }
+
+ public float getMeanScore()
+ {
+ return this.meanScore;
+ }
+
+ /**
+ * calculate the hypothetic max score using the self-alignment of the sequences
+ */
+ public void hypotheticMaxScore()
+ {
+ int _hmsA = 0;
+ int _hmsB = 0;
+ for (char residue: indelfreeAstr1.toCharArray())
+ {
+ _hmsA += scoreMatrix.getPairwiseScore(residue, residue);
+ }
+ for (char residue: indelfreeAstr2.toCharArray())
+ {
+ _hmsB += scoreMatrix.getPairwiseScore(residue, residue);
+ }
+ this.hypotheticMaxScore = (_hmsA < _hmsB) ? _hmsA : _hmsB; // take the lower self alignment
+
+ }
+
+ public int getHypotheticMaxScore()
+ {
+ return this.hypotheticMaxScore;
+ }
+
+ /**
+ * create strings based of astr1 and astr2 but without gaps
+ */
+ public void getIndelfreeAstr()
+ {
+ int n = astr1.length(); // both have the same length
+ for (int i = 0; i < n; i++)
+ {
+ if (Character.isLetter(astr1.charAt(i)) && Character.isLetter(astr2.charAt(i))) // if both sequences dont have a gap -> add to indelfreeAstr
+ {
+ this.indelfreeAstr1 += astr1.charAt(i);
+ this.indelfreeAstr2 += astr2.charAt(i);
+ }
+ }
+ }
+
+ /**
+ * calculates the overall score of the alignment
+ * preprescore = sum of all scores - all penalties
+ * if preprescore < 1 ~ alignmentScore = Float.NaN >
+ * alignmentScore = ((preprescore - meanScore) / (hypotheticMaxScore - meanScore)) * coverage
+ */
+ public void scoreAlignment()
+ {
+
+ getIndelfreeAstr();
+ meanScore();
+ hypotheticMaxScore();
+ // cannot calculate score because denominator would be zero
+ if (this.hypotheticMaxScore == this.meanScore)
+ {
+ this.alignmentScore = Float.NaN;
+ return;
+ }
+ int n = indelfreeAstr1.length();
+
+ float score = 0;
+ boolean aGapOpen = false;
+ boolean bGapOpen = false;
+ for (int i = 0; i < n; i++)
+ {
+ char char1 = indelfreeAstr1.charAt(i);
+ char char2 = indelfreeAstr2.charAt(i);
+ boolean aIsLetter = Character.isLetter(char1);
+ boolean bIsLetter = Character.isLetter(char2);
+ if (aIsLetter && bIsLetter) // if pair -> get score
+ {
+ score += scoreMatrix.getPairwiseScore(char1, char2);
+ } else if (!aIsLetter && !bIsLetter) { // both are gap -> skip
+ } else if ((!aIsLetter && aGapOpen) || (!bIsLetter && bGapOpen)) { // one side gapopen -> score - gap_extend
+ score -= GAP_EXTEND_COST;
+ } else { // no gap open -> score - gap_open
+ score -= GAP_OPEN_COST;
+ }
+ // adjust GapOpen status in both sequences
+ aGapOpen = (!aIsLetter) ? true : false;
+ bGapOpen = (!bIsLetter) ? true : false;
+ }
+
+ float preprescore = score; // if this score < 1 --> alignment score = Float.NaN
+ score = (score - this.meanScore) / (this.hypotheticMaxScore - this.meanScore);
+ int[] _max = MiscMath.findMax(new int[]{astr1.replace("-","").length(), astr2.replace("-","").length()}); // {index of max, max}
+ float coverage = (float) n / (float) _max[1]; // indelfreeAstr length / longest sequence length
+ float prescore = score; // only debug
+ score *= coverage;
+
+ //System.out.println(String.format("prepre-score: %f, pre-score: %f, longlength: %d\nscore: %1.16f, mean: %f, max: %d", preprescore, prescore, _max[1], score, this.meanScore, this.hypotheticMaxScore));
+ float minScore = 0f;
+ this.alignmentScore = (score <= minScore) ? Float.NaN : score;
+ }
+
+ public void setScoreMatrix(ScoreMatrix sm)
+ {
+ if (sm != null) {
+ scoreMatrix = sm;
+ }
+ }
}