*/
package jalview.analysis;
+import java.util.Locale;
+
import jalview.analysis.scoremodels.PIDModel;
import jalview.analysis.scoremodels.ScoreMatrix;
import jalview.analysis.scoremodels.ScoreModels;
import jalview.datamodel.Mapping;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceI;
+import jalview.math.MiscMath;
import jalview.util.Comparison;
import jalview.util.Format;
import jalview.util.MapList;
import java.awt.Color;
import java.awt.Graphics;
+import java.io.PrintStream;
+import java.lang.IllegalArgumentException;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.HashMap;
import java.util.List;
import java.util.StringTokenizer;
*/
public class AlignSeq
{
+ private static final int MAX_NAME_LENGTH = 30;
+
+ //&!
+ private static final int GAP_OPEN_COST = 120;
+ //private static final int GAP_OPEN_COST = 100;
+
+ private static final int GAP_EXTEND_COST = 20;
+ //private static final int GAP_EXTEND_COST = 5;
+
+ private static final int GAP_INDEX = -1;
+
public static final String PEP = "pep";
public static final String DNA = "dna";
float[][] score;
+ float alignmentScore;
+
float[][] E;
float[][] F;
- int[][] traceback;
+ int[][] traceback; // todo is this actually used?
int[] seq1;
public String astr2 = "";
+ public String indelfreeAstr1 = "";
+
+ public String indelfreeAstr2 = "";
+
/** DOCUMENT ME!! */
public int seq1start;
/** DOCUMENT ME!! */
public int seq2start;
- /** DOCUMENT ME!! */
public int seq2end;
int count;
- /** DOCUMENT ME!! */
public float maxscore;
- float pid;
-
- int prev = 0;
+ public float meanScore; //needed for PaSiMap
- int gapOpen = 120;
+ public int hypotheticMaxScore; // needed for PaSiMap
- int gapExtend = 20;
+ int prev = 0;
StringBuffer output = new StringBuffer();
private ScoreMatrix scoreMatrix;
- private static final int GAP_INDEX = -1;
-
/**
* Creates a new AlignSeq object.
*
- * @param s1 first sequence for alignment
- * @param s2 second sequence for alignment
- * @param type molecule type, either AlignSeq.PEP or AlignSeq.DNA
+ * @param s1
+ * first sequence for alignment
+ * @param s2
+ * second sequence for alignment
+ * @param type
+ * molecule type, either AlignSeq.PEP or AlignSeq.DNA
*/
public AlignSeq(SequenceI s1, SequenceI s2, String type)
{
public AlignSeq(SequenceI s1, String string1, SequenceI s2,
String string2, String type)
{
- seqInit(s1, string1.toUpperCase(), s2, string2.toUpperCase(), type);
+ seqInit(s1, string1.toUpperCase(Locale.ROOT), s2,
+ string2.toUpperCase(Locale.ROOT), type);
}
/**
}
/**
+ * returns the overall score of the alignment
+ *
+ * @return
+ */
+ public float getAlignmentScore()
+ {
+ return alignmentScore;
+ }
+
+ /**
* DOCUMENT ME!
*
* @return DOCUMENT ME!
SequenceI alSeq1 = new Sequence(s1.getName(), getAStr1());
alSeq1.setStart(s1.getStart() + getSeq1Start() - 1);
alSeq1.setEnd(s1.getStart() + getSeq1End() - 1);
- alSeq1.setDatasetSequence(s1.getDatasetSequence() == null ? s1 : s1
- .getDatasetSequence());
+ alSeq1.setDatasetSequence(
+ s1.getDatasetSequence() == null ? s1 : s1.getDatasetSequence());
return alSeq1;
}
SequenceI alSeq2 = new Sequence(s2.getName(), getAStr2());
alSeq2.setStart(s2.getStart() + getSeq2Start() - 1);
alSeq2.setEnd(s2.getStart() + getSeq2End() - 1);
- alSeq2.setDatasetSequence(s2.getDatasetSequence() == null ? s2 : s2
- .getDatasetSequence());
+ alSeq2.setDatasetSequence(
+ s2.getDatasetSequence() == null ? s2 : s2.getDatasetSequence());
return alSeq2;
}
if (s1str.length() == 0 || s2str.length() == 0)
{
- output.append("ALL GAPS: "
- + (s1str.length() == 0 ? s1.getName() : " ")
- + (s2str.length() == 0 ? s2.getName() : ""));
+ output.append(
+ "ALL GAPS: " + (s1str.length() == 0 ? s1.getName() : " ")
+ + (s2str.length() == 0 ? s2.getName() : ""));
return;
}
if (!PEP.equals(moleculeType) && !DNA.equals(moleculeType))
{
output.append("Wrong type = dna or pep only");
- throw new Error(MessageManager.formatMessage(
- "error.unknown_type_dna_or_pep",
- new String[] { moleculeType }));
+ throw new Error(MessageManager
+ .formatMessage("error.unknown_type_dna_or_pep", new String[]
+ { moleculeType }));
}
type = moleculeType;
- scoreMatrix = ScoreModels.getInstance().getDefaultModel(
- PEP.equals(type));
+ scoreMatrix = ScoreModels.getInstance()
+ .getDefaultModel(PEP.equals(type));
}
/**
}
}
- // System.out.println(maxi + " " + maxj + " " + score[maxi][maxj]);
int i = maxi;
int j = maxj;
int trace;
- maxscore = score[i][j] / 10;
+ maxscore = score[i][j] / 10f;
+
+
+ aseq1 = new int[seq1.length + seq2.length];
+ aseq2 = new int[seq1.length + seq2.length];
+
+ StringBuilder sb1 = new StringBuilder(aseq1.length);
+ StringBuilder sb2 = new StringBuilder(aseq2.length);
+
+ count = (seq1.length + seq2.length) - 1;
+
+
+ while (i > 0 && j > 0)
+ {
+ aseq1[count] = seq1[i];
+ sb1.append(s1str.charAt(i));
+ aseq2[count] = seq2[j];
+ sb2.append(s2str.charAt(j));
+
+ trace = findTrace(i, j);
+
+ if (trace == 0)
+ {
+ i--;
+ j--;
+ }
+ else if (trace == 1)
+ {
+ j--;
+ aseq1[count] = GAP_INDEX;
+ sb1.replace(sb1.length() - 1, sb1.length(), "-");
+ }
+ else if (trace == -1)
+ {
+ i--;
+ aseq2[count] = GAP_INDEX;
+ sb2.replace(sb2.length() - 1, sb2.length(), "-");
+ }
+
+ count--;
+ }
+
+ seq1start = i + 1;
+ seq2start = j + 1;
+
+ if (aseq1[count] != GAP_INDEX)
+ {
+ aseq1[count] = seq1[i];
+ sb1.append(s1str.charAt(i));
+ }
+
+ if (aseq2[count] != GAP_INDEX)
+ {
+ aseq2[count] = seq2[j];
+ sb2.append(s2str.charAt(j));
+ }
+
+
+ /*
+ * we built the character strings backwards, so now
+ * reverse them to convert to sequence strings
+ */
+ astr1 = sb1.reverse().toString();
+ astr2 = sb2.reverse().toString();
+ }
+
+ /**
+ * DOCUMENT ME!
+ */
+ public void traceAlignmentWithEndGaps()
+ {
+ // Find the maximum score along the rhs or bottom row
+ float max = -Float.MAX_VALUE;
+
+ for (int i = 0; i < seq1.length; i++)
+ {
+ if (score[i][seq2.length - 1] > max)
+ {
+ max = score[i][seq2.length - 1];
+ maxi = i;
+ maxj = seq2.length - 1;
+ }
+ }
+
+ for (int j = 0; j < seq2.length; j++)
+ {
+ if (score[seq1.length - 1][j] > max)
+ {
+ max = score[seq1.length - 1][j];
+ maxi = seq1.length - 1;
+ maxj = j;
+ }
+ }
+
+ int i = maxi;
+ int j = maxj;
+ int trace;
+ maxscore = score[i][j] / 10f;
+
+ //&! get trailing gaps
+ while ((i < seq1.length - 1) || (j < seq2.length - 1))
+ {
+ i++;
+ j++;
+ }
+ seq1end = i + 1;
+ seq2end = j + 1;
- seq1end = maxi + 1;
- seq2end = maxj + 1;
aseq1 = new int[seq1.length + seq2.length];
aseq2 = new int[seq1.length + seq2.length];
count = (seq1.length + seq2.length) - 1;
+ //&! get trailing gaps
+ while ((i >= seq1.length) || (j >= seq2.length))
+ {
+ if (i >= seq1.length)
+ {
+ aseq1[count] = GAP_INDEX;
+ sb1.append("-");
+ aseq2[count] = seq2[j];
+ sb2.append(s2str.charAt(j));
+ } else if (j >= seq2.length) {
+ aseq1[count] = seq1[i];
+ sb1.append(s1str.charAt(i));
+ aseq2[count] = GAP_INDEX;
+ sb2.append("-");
+ }
+ i--;
+ j--;
+ }
+
+
while (i > 0 && j > 0)
{
aseq1[count] = seq1[i];
sb2.append(s2str.charAt(j));
}
+ //&! get initial gaps
+ while (j > 0 || i > 0)
+ {
+ if (j > 0)
+ {
+ sb1.append("-");
+ sb2.append(s2str.charAt(j));
+ j--;
+ } else if (i > 0) {
+ sb1.append(s1str.charAt(i));
+ sb2.append("-");
+ i--;
+ }
+ }
+
/*
* we built the character strings backwards, so now
* reverse them to convert to sequence strings
/**
* DOCUMENT ME!
*/
- public void printAlignment(java.io.PrintStream os)
+ public void printAlignment(PrintStream os)
{
// TODO: Use original sequence characters rather than re-translated
// characters in output
// Find the biggest id length for formatting purposes
- String s1id = s1.getName(), s2id = s2.getName();
- int maxid = s1.getName().length();
- if (s2.getName().length() > maxid)
- {
- maxid = s2.getName().length();
- }
- if (maxid > 30)
+ String s1id = getAlignedSeq1().getDisplayId(true);
+ String s2id = getAlignedSeq2().getDisplayId(true);
+ int nameLength = Math.max(s1id.length(), s2id.length());
+ if (nameLength > MAX_NAME_LENGTH)
{
- maxid = 30;
+ int truncateBy = nameLength - MAX_NAME_LENGTH;
+ nameLength = MAX_NAME_LENGTH;
// JAL-527 - truncate the sequence ids
- if (s1.getName().length() > maxid)
+ if (s1id.length() > nameLength)
{
- s1id = s1.getName().substring(0, 30);
+ int slashPos = s1id.lastIndexOf('/');
+ s1id = s1id.substring(0, slashPos - truncateBy)
+ + s1id.substring(slashPos);
}
- if (s2.getName().length() > maxid)
+ if (s2id.length() > nameLength)
{
- s2id = s2.getName().substring(0, 30);
+ int slashPos = s2id.lastIndexOf('/');
+ s2id = s2id.substring(0, slashPos - truncateBy)
+ + s2id.substring(slashPos);
}
}
- int len = 72 - maxid - 1;
+ int len = 72 - nameLength - 1;
int nochunks = ((aseq1.length - count) / len)
+ ((aseq1.length - count) % len > 0 ? 1 : 0);
- pid = 0;
+ float pid = 0f;
output.append("Score = ").append(score[maxi][maxj]).append(NEWLINE);
output.append("Length of alignment = ")
.append(String.valueOf(aseq1.length - count)).append(NEWLINE);
output.append("Sequence ");
- output.append(new Format("%" + maxid + "s").form(s1.getName()));
- output.append(" : ").append(String.valueOf(s1.getStart()))
- .append(" - ").append(String.valueOf(s1.getEnd()));
+ Format nameFormat = new Format("%" + nameLength + "s");
+ output.append(nameFormat.form(s1id));
output.append(" (Sequence length = ")
.append(String.valueOf(s1str.length())).append(")")
.append(NEWLINE);
output.append("Sequence ");
- output.append(new Format("%" + maxid + "s").form(s2.getName()));
- output.append(" : ").append(String.valueOf(s2.getStart()))
- .append(" - ").append(String.valueOf(s2.getEnd()));
+ output.append(nameFormat.form(s2id));
output.append(" (Sequence length = ")
.append(String.valueOf(s2str.length())).append(")")
.append(NEWLINE).append(NEWLINE);
for (int j = 0; j < nochunks; j++)
{
// Print the first aligned sequence
- output.append(new Format("%" + (maxid) + "s").form(s1id)).append(" ");
+ output.append(nameFormat.form(s1id)).append(" ");
for (int i = 0; i < len; i++)
{
}
output.append(NEWLINE);
- output.append(new Format("%" + (maxid) + "s").form(" ")).append(" ");
+ output.append(nameFormat.form(" ")).append(" ");
/*
* Print out the match symbols:
pid++;
output.append("|");
}
- else if (type.equals("pep"))
+ else if (PEP.equals(type))
{
if (pam250.getPairwiseScore(c1, c2) > 0)
{
// Now print the second aligned sequence
output = output.append(NEWLINE);
- output = output.append(new Format("%" + (maxid) + "s").form(s2id))
- .append(" ");
+ output = output.append(nameFormat.form(s2id)).append(" ");
for (int i = 0; i < len; i++)
{
}
pid = pid / (aseq1.length - count) * 100;
- output = output.append(new Format("Percentage ID = %2.2f\n").form(pid));
+ output.append(new Format("Percentage ID = %3.2f\n").form(pid));
+ output.append(NEWLINE);
try
{
os.print(output.toString());
public int findTrace(int i, int j)
{
int t = 0;
- // float pairwiseScore = lookup[seq1[i]][seq2[j]];
float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(i),
s2str.charAt(j));
float max = score[i - 1][j - 1] + (pairwiseScore * 10);
// top left hand element
score[0][0] = scoreMatrix.getPairwiseScore(s1str.charAt(0),
s2str.charAt(0)) * 10;
- E[0][0] = -gapExtend;
+ E[0][0] = -GAP_EXTEND_COST;
F[0][0] = 0;
// Calculate the top row first
for (int j = 1; j < m; j++)
{
// What should these values be? 0 maybe
- E[0][j] = max(score[0][j - 1] - gapOpen, E[0][j - 1] - gapExtend);
- F[0][j] = -gapExtend;
+ E[0][j] = max(score[0][j - 1] - GAP_OPEN_COST,
+ E[0][j - 1] - GAP_EXTEND_COST);
+ F[0][j] = -GAP_EXTEND_COST;
float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(0),
s2str.charAt(j));
- score[0][j] = max(pairwiseScore * 10, -gapOpen, -gapExtend);
+ score[0][j] = max(pairwiseScore * 10, -GAP_OPEN_COST,
+ -GAP_EXTEND_COST);
traceback[0][j] = 1;
}
// Now do the left hand column
for (int i = 1; i < n; i++)
{
- E[i][0] = -gapOpen;
- F[i][0] = max(score[i - 1][0] - gapOpen, F[i - 1][0] - gapExtend);
+ E[i][0] = -GAP_OPEN_COST;
+ F[i][0] = max(score[i - 1][0] - GAP_OPEN_COST,
+ F[i - 1][0] - GAP_EXTEND_COST);
float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(i),
s2str.charAt(0));
{
for (int j = 1; j < m; j++)
{
- E[i][j] = max(score[i][j - 1] - gapOpen, E[i][j - 1] - gapExtend);
- F[i][j] = max(score[i - 1][j] - gapOpen, F[i - 1][j] - gapExtend);
+ E[i][j] = max(score[i][j - 1] - GAP_OPEN_COST,
+ E[i][j - 1] - GAP_EXTEND_COST);
+ F[i][j] = max(score[i - 1][j] - GAP_OPEN_COST,
+ F[i - 1][j] - GAP_EXTEND_COST);
float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(i),
s2str.charAt(j));
- score[i][j] = max(score[i - 1][j - 1]
- + (pairwiseScore * 10), E[i][j], F[i][j]);
+ score[i][j] = max(score[i - 1][j - 1] + (pairwiseScore * 10),
+ E[i][j], F[i][j]);
traceback[i][j] = findTrace(i, j);
}
}
*/
public jalview.datamodel.Mapping getMappingFromS1(boolean allowmismatch)
{
- ArrayList<Integer> as1 = new ArrayList<Integer>(), as2 = new ArrayList<Integer>();
+ ArrayList<Integer> as1 = new ArrayList<Integer>(),
+ as2 = new ArrayList<Integer>();
int pdbpos = s2.getStart() + getSeq2Start() - 2;
int alignpos = s1.getStart() + getSeq1Start() - 2;
int lp2 = pdbpos - 3, lp1 = alignpos - 3;
pdbpos++;
}
- if (allowmismatch || c1 == c2)
+ // ignore case differences
+ if (allowmismatch || (c1 == c2) || (Math.abs(c2-c1)==('a'-'A')))
{
// extend mapping interval
if (lp1 + 1 != alignpos || lp2 + 1 != pdbpos)
}
// construct range pairs
- int[] mapseq1 = new int[as1.size() + (lastmatch ? 1 : 0)], mapseq2 = new int[as2
- .size() + (lastmatch ? 1 : 0)];
+ int[] mapseq1 = new int[as1.size() + (lastmatch ? 1 : 0)],
+ mapseq2 = new int[as2.size() + (lastmatch ? 1 : 0)];
int i = 0;
for (Integer ip : as1)
{
List<SequenceI> ochains, AlignmentI al, String dnaOrProtein,
boolean removeOldAnnots)
{
- List<SequenceI> orig = new ArrayList<SequenceI>(), repl = new ArrayList<SequenceI>();
+ List<SequenceI> orig = new ArrayList<SequenceI>(),
+ repl = new ArrayList<SequenceI>();
List<AlignSeq> aligs = new ArrayList<AlignSeq>();
if (al != null && al.getHeight() > 0)
{
}
return redundancy;
}
+
+ /**
+ * calculate the mean score of the alignment
+ * mean score is equal to the score of an alignmenet of two sequences with randomly shuffled AA sequence composited of the same AA as the two original sequences
+ *
+ */
+ public void meanScore()
+ {
+ //int length = (indelfreeAstr1.length() > indelfreeAstr2.length()) ? indelfreeAstr1.length() : indelfreeAstr2.length();
+ int length = indelfreeAstr1.length(); //both have the same length
+ //create HashMap for counting residues in each sequence
+ HashMap<Character, Integer> seq1ResCount = new HashMap<Character, Integer>();
+ HashMap<Character, Integer> seq2ResCount = new HashMap<Character, Integer>();
+
+ // for both sequences (String indelfreeAstr1 or 2) create a key for the residue and add 1 each time its encountered
+ for (char residue: indelfreeAstr1.toCharArray())
+ {
+ seq1ResCount.putIfAbsent(residue, 0);
+ seq1ResCount.replace(residue, seq1ResCount.get(residue) + 1);
+ }
+ for (char residue: indelfreeAstr2.toCharArray())
+ {
+ seq2ResCount.putIfAbsent(residue, 0);
+ seq2ResCount.replace(residue, seq2ResCount.get(residue) + 1);
+ }
+
+ // meanscore = for each residue pair get the number of appearance and add (countA * countB * pairwiseScore(AB))
+ // divide the meanscore by the sequence length afterwards
+ float _meanscore = 0;
+ for (char resA : seq1ResCount.keySet())
+ {
+ for (char resB : seq2ResCount.keySet())
+ {
+ int countA = seq1ResCount.get(resA);
+ int countB = seq2ResCount.get(resB);
+
+ float scoreAB = scoreMatrix.getPairwiseScore(resA, resB);
+
+ _meanscore += countA * countB * scoreAB;
+ }
+ }
+ _meanscore /= length;
+ this.meanScore = _meanscore;
+ }
+
+ public float getMeanScore()
+ {
+ return this.meanScore;
+ }
+
+ /**
+ * calculate the hypothetic max score using the self-alignment of the sequences
+ */
+ public void hypotheticMaxScore()
+ {
+ int _hmsA = 0;
+ int _hmsB = 0;
+ for (char residue: indelfreeAstr1.toCharArray())
+ {
+ _hmsA += scoreMatrix.getPairwiseScore(residue, residue);
+ }
+ for (char residue: indelfreeAstr2.toCharArray())
+ {
+ _hmsB += scoreMatrix.getPairwiseScore(residue, residue);
+ }
+ this.hypotheticMaxScore = (_hmsA < _hmsB) ? _hmsA : _hmsB; // take the lower self alignment
+
+ }
+
+ public int getHypotheticMaxScore()
+ {
+ return this.hypotheticMaxScore;
+ }
+
+ /**
+ * create strings based of astr1 and astr2 but without gaps
+ */
+ public void getIndelfreeAstr()
+ {
+ int n = astr1.length(); // both have the same length
+ for (int i = 0; i < n; i++)
+ {
+ if (Character.isLetter(astr1.charAt(i)) && Character.isLetter(astr2.charAt(i))) // if both sequences dont have a gap -> add to indelfreeAstr
+ {
+ this.indelfreeAstr1 += astr1.charAt(i);
+ this.indelfreeAstr2 += astr2.charAt(i);
+ }
+ }
+ }
+
+ /**
+ * calculates the overall score of the alignment
+ * preprescore = sum of all scores - all penalties
+ * if preprescore < 1 ~ alignmentScore = Float.NaN >
+ * alignmentScore = ((preprescore - meanScore) / (hypotheticMaxScore - meanScore)) * coverage
+ */
+ public void scoreAlignment() throws RuntimeException
+ {
+
+ getIndelfreeAstr();
+ meanScore();
+ hypotheticMaxScore();
+ // cannot calculate score because denominator would be zero
+ if (this.hypotheticMaxScore == this.meanScore)
+ {
+ throw new IllegalArgumentException(String.format("hypotheticMaxScore (%8.2f) == meanScore (%8.2f) - division by 0", hypotheticMaxScore, meanScore));
+ }
+ //int n = (astr1.length() > astr2.length()) ? astr1.length() : astr2.length();
+ int n = indelfreeAstr1.length();
+
+ float score = 0;
+ boolean aGapOpen = false;
+ boolean bGapOpen = false;
+ for (int i = 0; i < n; i++)
+ {
+ char char1 = indelfreeAstr1.charAt(i);
+ char char2 = indelfreeAstr2.charAt(i);
+ boolean aIsLetter = Character.isLetter(char1);
+ boolean bIsLetter = Character.isLetter(char2);
+ if (aIsLetter && bIsLetter) // if pair -> get score
+ {
+ score += scoreMatrix.getPairwiseScore(char1, char2);
+ } else if (!aIsLetter && !bIsLetter) { // both are gap -> skip
+ } else if ((!aIsLetter && aGapOpen) || (!bIsLetter && bGapOpen)) { // one side gapopen -> score - gap_extend
+ score -= GAP_EXTEND_COST;
+ } else { // no gap open -> score - gap_open
+ score -= GAP_OPEN_COST;
+ }
+ // adjust GapOpen status in both sequences
+ aGapOpen = (!aIsLetter) ? true : false;
+ bGapOpen = (!bIsLetter) ? true : false;
+ }
+
+ float preprescore = score; // if this score < 1 --> alignment score = Float.NaN
+ score = (score - this.meanScore) / (this.hypotheticMaxScore - this.meanScore);
+ int[] _max = MiscMath.findMax(new int[]{astr1.replace("-","").length(), astr2.replace("-","").length()}); // {index of max, max}
+ float coverage = (float) n / (float) _max[1]; // indelfreeAstr length / longest sequence length
+ float prescore = score; // only debug
+ score *= coverage;
+
+ System.out.println(String.format("prepre-score: %f, pre-score: %f, longlength: %d\nscore: %f, mean: %f, max: %d", preprescore, prescore, _max[1], score, this.meanScore, this.hypotheticMaxScore));
+ this.alignmentScore = (preprescore < 1) ? Float.NaN : score;
+ }
}