X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fanalysis%2FAlignSeq.java;h=4ec0457c2980a7977245355c48bfc8d2cf1410cc;hb=6c36212c1e16557e6afb1b4dba9c28864d52ad4a;hp=86dd3bc6bcfb3131a08ecdd8306d6865ed5616da;hpb=c3de0b9d18ff5f445536fd5fc818768e28676367;p=jalview.git diff --git a/src/jalview/analysis/AlignSeq.java b/src/jalview/analysis/AlignSeq.java index 86dd3bc..4ec0457 100755 --- a/src/jalview/analysis/AlignSeq.java +++ b/src/jalview/analysis/AlignSeq.java @@ -20,14 +20,18 @@ */ package jalview.analysis; +import java.util.Locale; + +import jalview.analysis.scoremodels.PIDModel; import jalview.analysis.scoremodels.ScoreMatrix; import jalview.analysis.scoremodels.ScoreModels; +import jalview.analysis.scoremodels.SimilarityParams; import jalview.datamodel.AlignmentAnnotation; import jalview.datamodel.AlignmentI; import jalview.datamodel.Mapping; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceI; -import jalview.schemes.ResidueProperties; +import jalview.math.MiscMath; import jalview.util.Comparison; import jalview.util.Format; import jalview.util.MapList; @@ -35,8 +39,11 @@ import jalview.util.MessageManager; import java.awt.Color; import java.awt.Graphics; +import java.io.PrintStream; +import java.lang.IllegalArgumentException; import java.util.ArrayList; import java.util.Arrays; +import java.util.HashMap; import java.util.List; import java.util.StringTokenizer; @@ -48,6 +55,17 @@ import java.util.StringTokenizer; */ public class AlignSeq { + private static final int MAX_NAME_LENGTH = 30; + + //&! + private static final int GAP_OPEN_COST = 120; + //private static final int GAP_OPEN_COST = 100; + + private static final int GAP_EXTEND_COST = 20; + //private static final int GAP_EXTEND_COST = 5; + + private static final int GAP_INDEX = -1; + public static final String PEP = "pep"; public static final String DNA = "dna"; @@ -56,11 +74,13 @@ public class AlignSeq float[][] score; + float alignmentScore; + float[][] E; float[][] F; - int[][] traceback; + int[][] traceback; // todo is this actually used? int[] seq1; @@ -86,6 +106,10 @@ public class AlignSeq public String astr2 = ""; + public String indelfreeAstr1 = ""; + + public String indelfreeAstr2 = ""; + /** DOCUMENT ME!! */ public int seq1start; @@ -95,41 +119,33 @@ public class AlignSeq /** DOCUMENT ME!! */ public int seq2start; - /** DOCUMENT ME!! */ public int seq2end; int count; - /** DOCUMENT ME!! */ public float maxscore; - float pid; - - int prev = 0; - - int gapOpen = 120; - - int gapExtend = 20; + public float meanScore; //needed for PaSiMap - float[][] lookup = ScoreModels.getInstance().getBlosum62().getMatrix(); + public int hypotheticMaxScore; // needed for PaSiMap - int defInt = 23; + int prev = 0; StringBuffer output = new StringBuffer(); - String type; + String type; // AlignSeq.PEP or AlignSeq.DNA - private int[] charToInt; + private ScoreMatrix scoreMatrix; /** * Creates a new AlignSeq object. * * @param s1 - * DOCUMENT ME! + * first sequence for alignment * @param s2 - * DOCUMENT ME! + * second sequence for alignment * @param type - * DOCUMENT ME! + * molecule type, either AlignSeq.PEP or AlignSeq.DNA */ public AlignSeq(SequenceI s1, SequenceI s2, String type) { @@ -150,7 +166,8 @@ public class AlignSeq public AlignSeq(SequenceI s1, String string1, SequenceI s2, String string2, String type) { - seqInit(s1, string1.toUpperCase(), s2, string2.toUpperCase(), type); + seqInit(s1, string1.toUpperCase(Locale.ROOT), s2, + string2.toUpperCase(Locale.ROOT), type); } /** @@ -164,6 +181,16 @@ public class AlignSeq } /** + * returns the overall score of the alignment + * + * @return + */ + public float getAlignmentScore() + { + return alignmentScore; + } + + /** * DOCUMENT ME! * * @return DOCUMENT ME! @@ -262,8 +289,8 @@ public class AlignSeq SequenceI alSeq1 = new Sequence(s1.getName(), getAStr1()); alSeq1.setStart(s1.getStart() + getSeq1Start() - 1); alSeq1.setEnd(s1.getStart() + getSeq1End() - 1); - alSeq1.setDatasetSequence(s1.getDatasetSequence() == null ? s1 : s1 - .getDatasetSequence()); + alSeq1.setDatasetSequence( + s1.getDatasetSequence() == null ? s1 : s1.getDatasetSequence()); return alSeq1; } @@ -276,8 +303,8 @@ public class AlignSeq SequenceI alSeq2 = new Sequence(s2.getName(), getAStr2()); alSeq2.setStart(s2.getStart() + getSeq2Start() - 1); alSeq2.setEnd(s2.getStart() + getSeq2End() - 1); - alSeq2.setDatasetSequence(s2.getDatasetSequence() == null ? s2 : s2 - .getDatasetSequence()); + alSeq2.setDatasetSequence( + s2.getDatasetSequence() == null ? s2 : s2.getDatasetSequence()); return alSeq2; } @@ -318,97 +345,143 @@ public class AlignSeq if (s1str.length() == 0 || s2str.length() == 0) { - output.append("ALL GAPS: " - + (s1str.length() == 0 ? s1.getName() : " ") - + (s2str.length() == 0 ? s2.getName() : "")); + output.append( + "ALL GAPS: " + (s1str.length() == 0 ? s1.getName() : " ") + + (s2str.length() == 0 ? s2.getName() : "")); return; } - // System.out.println("lookuip " + rt.freeMemory() + " "+ rt.totalMemory()); - seq1 = new int[s1str.length()]; - - // System.out.println("seq1 " + rt.freeMemory() +" " + rt.totalMemory()); - seq2 = new int[s2str.length()]; - - // System.out.println("seq2 " + rt.freeMemory() + " " + rt.totalMemory()); score = new float[s1str.length()][s2str.length()]; - // System.out.println("score " + rt.freeMemory() + " " + rt.totalMemory()); E = new float[s1str.length()][s2str.length()]; - // System.out.println("E " + rt.freeMemory() + " " + rt.totalMemory()); F = new float[s1str.length()][s2str.length()]; traceback = new int[s1str.length()][s2str.length()]; - // System.out.println("F " + rt.freeMemory() + " " + rt.totalMemory()); - seq1 = stringToInt(s1str, type); - - // System.out.println("seq1 " + rt.freeMemory() + " " + rt.totalMemory()); - seq2 = stringToInt(s2str, type); - - // System.out.println("Seq2 " + rt.freeMemory() + " " + rt.totalMemory()); - // long tstart = System.currentTimeMillis(); - // calcScoreMatrix(); - // long tend = System.currentTimeMillis(); - // System.out.println("Time take to calculate score matrix = " + - // (tend-tstart) + " ms"); - // printScoreMatrix(score); - // System.out.println(); - // printScoreMatrix(traceback); - // System.out.println(); - // printScoreMatrix(E); - // System.out.println(); - // /printScoreMatrix(F); - // System.out.println(); - // tstart = System.currentTimeMillis(); - // traceAlignment(); - // tend = System.currentTimeMillis(); - // System.out.println("Time take to traceback alignment = " + (tend-tstart) - // + " ms"); + seq1 = indexEncode(s1str); + + seq2 = indexEncode(s2str); } - private void setDefaultParams(String type) + private void setDefaultParams(String moleculeType) { - setType(type); + if (!PEP.equals(moleculeType) && !DNA.equals(moleculeType)) + { + output.append("Wrong type = dna or pep only"); + throw new Error(MessageManager + .formatMessage("error.unknown_type_dna_or_pep", new String[] + { moleculeType })); + } + + type = moleculeType; + scoreMatrix = ScoreModels.getInstance() + .getDefaultModel(PEP.equals(type)); + } - if (type.equals(AlignSeq.PEP)) + /** + * DOCUMENT ME! + */ + public void traceAlignment() + { + // Find the maximum score along the rhs or bottom row + float max = -Float.MAX_VALUE; + + for (int i = 0; i < seq1.length; i++) { - lookup = ScoreModels.getInstance().getDefaultModel(true).getMatrix(); + if (score[i][seq2.length - 1] > max) + { + max = score[i][seq2.length - 1]; + maxi = i; + maxj = seq2.length - 1; + } } - else if (type.equals(AlignSeq.DNA)) + + for (int j = 0; j < seq2.length; j++) { - lookup = ScoreModels.getInstance().getDefaultModel(false).getMatrix(); + if (score[seq1.length - 1][j] > max) + { + max = score[seq1.length - 1][j]; + maxi = seq1.length - 1; + maxj = j; + } } - } - private void setType(String type2) - { - this.type = type2; - if (type.equals(AlignSeq.PEP)) + int i = maxi; + int j = maxj; + int trace; + maxscore = score[i][j] / 10f; + + + aseq1 = new int[seq1.length + seq2.length]; + aseq2 = new int[seq1.length + seq2.length]; + + StringBuilder sb1 = new StringBuilder(aseq1.length); + StringBuilder sb2 = new StringBuilder(aseq2.length); + + count = (seq1.length + seq2.length) - 1; + + + while (i > 0 && j > 0) { - charToInt = ResidueProperties.aaIndex; - defInt = ResidueProperties.maxProteinIndex; + aseq1[count] = seq1[i]; + sb1.append(s1str.charAt(i)); + aseq2[count] = seq2[j]; + sb2.append(s2str.charAt(j)); + + trace = findTrace(i, j); + + if (trace == 0) + { + i--; + j--; + } + else if (trace == 1) + { + j--; + aseq1[count] = GAP_INDEX; + sb1.replace(sb1.length() - 1, sb1.length(), "-"); + } + else if (trace == -1) + { + i--; + aseq2[count] = GAP_INDEX; + sb2.replace(sb2.length() - 1, sb2.length(), "-"); + } + + count--; } - else if (type.equals(AlignSeq.DNA)) + + seq1start = i + 1; + seq2start = j + 1; + + if (aseq1[count] != GAP_INDEX) { - charToInt = ResidueProperties.nucleotideIndex; - defInt = ResidueProperties.maxNucleotideIndex; + aseq1[count] = seq1[i]; + sb1.append(s1str.charAt(i)); } - else + + if (aseq2[count] != GAP_INDEX) { - output.append("Wrong type = dna or pep only"); - throw new Error(MessageManager.formatMessage( - "error.unknown_type_dna_or_pep", new String[] { type2 })); + aseq2[count] = seq2[j]; + sb2.append(s2str.charAt(j)); } + + + /* + * we built the character strings backwards, so now + * reverse them to convert to sequence strings + */ + astr1 = sb1.reverse().toString(); + astr2 = sb2.reverse().toString(); } /** * DOCUMENT ME! */ - public void traceAlignment() + public void traceAlignmentWithEndGaps() { // Find the maximum score along the rhs or bottom row - float max = -9999; + float max = -Float.MAX_VALUE; for (int i = 0; i < seq1.length; i++) { @@ -430,33 +503,55 @@ public class AlignSeq } } - // System.out.println(maxi + " " + maxj + " " + score[maxi][maxj]); int i = maxi; int j = maxj; int trace; - maxscore = score[i][j] / 10; + maxscore = score[i][j] / 10f; + + //&! get trailing gaps + while ((i < seq1.length - 1) || (j < seq2.length - 1)) + { + i++; + j++; + } + seq1end = i + 1; + seq2end = j + 1; - seq1end = maxi + 1; - seq2end = maxj + 1; aseq1 = new int[seq1.length + seq2.length]; aseq2 = new int[seq1.length + seq2.length]; + StringBuilder sb1 = new StringBuilder(aseq1.length); + StringBuilder sb2 = new StringBuilder(aseq2.length); + count = (seq1.length + seq2.length) - 1; - while ((i > 0) && (j > 0)) + //&! get trailing gaps + while ((i >= seq1.length) || (j >= seq2.length)) { - if ((aseq1[count] != defInt) && (i >= 0)) + if (i >= seq1.length) { - aseq1[count] = seq1[i]; - astr1 = s1str.charAt(i) + astr1; + aseq1[count] = GAP_INDEX; + sb1.append("-"); + aseq2[count] = seq2[j]; + sb2.append(s2str.charAt(j)); + } else if (j >= seq2.length) { + aseq1[count] = seq1[i]; + sb1.append(s1str.charAt(i)); + aseq2[count] = GAP_INDEX; + sb2.append("-"); } + i--; + j--; + } - if ((aseq2[count] != defInt) && (j > 0)) - { - aseq2[count] = seq2[j]; - astr2 = s2str.charAt(j) + astr2; - } + + while (i > 0 && j > 0) + { + aseq1[count] = seq1[i]; + sb1.append(s1str.charAt(i)); + aseq2[count] = seq2[j]; + sb2.append(s2str.charAt(j)); trace = findTrace(i, j); @@ -468,14 +563,14 @@ public class AlignSeq else if (trace == 1) { j--; - aseq1[count] = defInt; - astr1 = "-" + astr1.substring(1); + aseq1[count] = GAP_INDEX; + sb1.replace(sb1.length() - 1, sb1.length(), "-"); } else if (trace == -1) { i--; - aseq2[count] = defInt; - astr2 = "-" + astr2.substring(1); + aseq2[count] = GAP_INDEX; + sb2.replace(sb2.length() - 1, sb2.length(), "-"); } count--; @@ -484,65 +579,86 @@ public class AlignSeq seq1start = i + 1; seq2start = j + 1; - if (aseq1[count] != defInt) + if (aseq1[count] != GAP_INDEX) { aseq1[count] = seq1[i]; - astr1 = s1str.charAt(i) + astr1; + sb1.append(s1str.charAt(i)); } - if (aseq2[count] != defInt) + if (aseq2[count] != GAP_INDEX) { aseq2[count] = seq2[j]; - astr2 = s2str.charAt(j) + astr2; + sb2.append(s2str.charAt(j)); + } + + //&! get initial gaps + while (j > 0 || i > 0) + { + if (j > 0) + { + sb1.append("-"); + sb2.append(s2str.charAt(j)); + j--; + } else if (i > 0) { + sb1.append(s1str.charAt(i)); + sb2.append("-"); + i--; + } } + + /* + * we built the character strings backwards, so now + * reverse them to convert to sequence strings + */ + astr1 = sb1.reverse().toString(); + astr2 = sb2.reverse().toString(); } /** * DOCUMENT ME! */ - public void printAlignment(java.io.PrintStream os) + public void printAlignment(PrintStream os) { // TODO: Use original sequence characters rather than re-translated // characters in output // Find the biggest id length for formatting purposes - String s1id = s1.getName(), s2id = s2.getName(); - int maxid = s1.getName().length(); - if (s2.getName().length() > maxid) + String s1id = getAlignedSeq1().getDisplayId(true); + String s2id = getAlignedSeq2().getDisplayId(true); + int nameLength = Math.max(s1id.length(), s2id.length()); + if (nameLength > MAX_NAME_LENGTH) { - maxid = s2.getName().length(); - } - if (maxid > 30) - { - maxid = 30; + int truncateBy = nameLength - MAX_NAME_LENGTH; + nameLength = MAX_NAME_LENGTH; // JAL-527 - truncate the sequence ids - if (s1.getName().length() > maxid) + if (s1id.length() > nameLength) { - s1id = s1.getName().substring(0, 30); + int slashPos = s1id.lastIndexOf('/'); + s1id = s1id.substring(0, slashPos - truncateBy) + + s1id.substring(slashPos); } - if (s2.getName().length() > maxid) + if (s2id.length() > nameLength) { - s2id = s2.getName().substring(0, 30); + int slashPos = s2id.lastIndexOf('/'); + s2id = s2id.substring(0, slashPos - truncateBy) + + s2id.substring(slashPos); } } - int len = 72 - maxid - 1; + int len = 72 - nameLength - 1; int nochunks = ((aseq1.length - count) / len) + ((aseq1.length - count) % len > 0 ? 1 : 0); - pid = 0; + float pid = 0f; output.append("Score = ").append(score[maxi][maxj]).append(NEWLINE); output.append("Length of alignment = ") .append(String.valueOf(aseq1.length - count)).append(NEWLINE); output.append("Sequence "); - output.append(new Format("%" + maxid + "s").form(s1.getName())); - output.append(" : ").append(String.valueOf(s1.getStart())) - .append(" - ").append(String.valueOf(s1.getEnd())); + Format nameFormat = new Format("%" + nameLength + "s"); + output.append(nameFormat.form(s1id)); output.append(" (Sequence length = ") .append(String.valueOf(s1str.length())).append(")") .append(NEWLINE); output.append("Sequence "); - output.append(new Format("%" + maxid + "s").form(s2.getName())); - output.append(" : ").append(String.valueOf(s2.getStart())) - .append(" - ").append(String.valueOf(s2.getEnd())); + output.append(nameFormat.form(s2id)); output.append(" (Sequence length = ") .append(String.valueOf(s2str.length())).append(")") .append(NEWLINE).append(NEWLINE); @@ -552,7 +668,7 @@ public class AlignSeq for (int j = 0; j < nochunks; j++) { // Print the first aligned sequence - output.append(new Format("%" + (maxid) + "s").form(s1id)).append(" "); + output.append(nameFormat.form(s1id)).append(" "); for (int i = 0; i < len; i++) { @@ -563,7 +679,7 @@ public class AlignSeq } output.append(NEWLINE); - output.append(new Format("%" + (maxid) + "s").form(" ")).append(" "); + output.append(nameFormat.form(" ")).append(" "); /* * Print out the match symbols: @@ -583,7 +699,7 @@ public class AlignSeq pid++; output.append("|"); } - else if (type.equals("pep")) + else if (PEP.equals(type)) { if (pam250.getPairwiseScore(c1, c2) > 0) { @@ -603,8 +719,7 @@ public class AlignSeq // Now print the second aligned sequence output = output.append(NEWLINE); - output = output.append(new Format("%" + (maxid) + "s").form(s2id)) - .append(" "); + output = output.append(nameFormat.form(s2id)).append(" "); for (int i = 0; i < len; i++) { @@ -618,7 +733,8 @@ public class AlignSeq } pid = pid / (aseq1.length - count) * 100; - output = output.append(new Format("Percentage ID = %2.2f\n").form(pid)); + output.append(new Format("Percentage ID = %3.2f\n").form(pid)); + output.append(NEWLINE); try { os.print(output.toString()); @@ -640,7 +756,9 @@ public class AlignSeq public int findTrace(int i, int j) { int t = 0; - float max = score[i - 1][j - 1] + (lookup[seq1[i]][seq2[j]] * 10); + float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(i), + s2str.charAt(j)); + float max = score[i - 1][j - 1] + (pairwiseScore * 10); if (F[i][j] > max) { @@ -684,18 +802,23 @@ public class AlignSeq int m = seq2.length; // top left hand element - score[0][0] = lookup[seq1[0]][seq2[0]] * 10; - E[0][0] = -gapExtend; + score[0][0] = scoreMatrix.getPairwiseScore(s1str.charAt(0), + s2str.charAt(0)) * 10; + E[0][0] = -GAP_EXTEND_COST; F[0][0] = 0; // Calculate the top row first for (int j = 1; j < m; j++) { // What should these values be? 0 maybe - E[0][j] = max(score[0][j - 1] - gapOpen, E[0][j - 1] - gapExtend); - F[0][j] = -gapExtend; + E[0][j] = max(score[0][j - 1] - GAP_OPEN_COST, + E[0][j - 1] - GAP_EXTEND_COST); + F[0][j] = -GAP_EXTEND_COST; - score[0][j] = max(lookup[seq1[0]][seq2[j]] * 10, -gapOpen, -gapExtend); + float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(0), + s2str.charAt(j)); + score[0][j] = max(pairwiseScore * 10, -GAP_OPEN_COST, + -GAP_EXTEND_COST); traceback[0][j] = 1; } @@ -703,10 +826,13 @@ public class AlignSeq // Now do the left hand column for (int i = 1; i < n; i++) { - E[i][0] = -gapOpen; - F[i][0] = max(score[i - 1][0] - gapOpen, F[i - 1][0] - gapExtend); + E[i][0] = -GAP_OPEN_COST; + F[i][0] = max(score[i - 1][0] - GAP_OPEN_COST, + F[i - 1][0] - GAP_EXTEND_COST); - score[i][0] = max(lookup[seq1[i]][seq2[0]] * 10, E[i][0], F[i][0]); + float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(i), + s2str.charAt(0)); + score[i][0] = max(pairwiseScore * 10, E[i][0], F[i][0]); traceback[i][0] = -1; } @@ -715,11 +841,15 @@ public class AlignSeq { for (int j = 1; j < m; j++) { - E[i][j] = max(score[i][j - 1] - gapOpen, E[i][j - 1] - gapExtend); - F[i][j] = max(score[i - 1][j] - gapOpen, F[i - 1][j] - gapExtend); - - score[i][j] = max(score[i - 1][j - 1] - + (lookup[seq1[i]][seq2[j]] * 10), E[i][j], F[i][j]); + E[i][j] = max(score[i][j - 1] - GAP_OPEN_COST, + E[i][j - 1] - GAP_EXTEND_COST); + F[i][j] = max(score[i - 1][j] - GAP_OPEN_COST, + F[i - 1][j] - GAP_EXTEND_COST); + + float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(i), + s2str.charAt(j)); + score[i][j] = max(score[i - 1][j - 1] + (pairwiseScore * 10), + E[i][j], F[i][j]); traceback[i][j] = findTrace(i, j); } } @@ -804,45 +934,24 @@ public class AlignSeq } /** - * DOCUMENT ME! + * Converts the character string to an array of integers which are the + * corresponding indices to the characters in the score matrix * * @param s - * DOCUMENT ME! - * @param type - * DOCUMENT ME! * - * @return DOCUMENT ME! + * @return */ - int[] stringToInt(String s, String type) + int[] indexEncode(String s) { - int[] seq1 = new int[s.length()]; + int[] encoded = new int[s.length()]; for (int i = 0; i < s.length(); i++) { - // String ss = s.substring(i, i + 1).toUpperCase(); char c = s.charAt(i); - if ('a' <= c && c <= 'z') - { - // TO UPPERCASE !!! - c -= ('a' - 'A'); - } - - try - { - seq1[i] = charToInt[c]; // set accordingly from setType - if (seq1[i] < 0 || seq1[i] > defInt) // set from setType: 23 for - // peptides, or 4 for NA. - { - seq1[i] = defInt; - } - - } catch (Exception e) - { - seq1[i] = defInt; - } + encoded[i] = scoreMatrix.getMatrixIndex(c); } - return seq1; + return encoded; } /** @@ -862,7 +971,7 @@ public class AlignSeq public static void displayMatrix(Graphics g, int[][] mat, int n, int m, int psize) { - // TODO method dosen't seem to be referenced anywhere delete?? + // TODO method doesn't seem to be referenced anywhere delete?? int max = -1000; int min = 1000; @@ -926,7 +1035,8 @@ public class AlignSeq */ public jalview.datamodel.Mapping getMappingFromS1(boolean allowmismatch) { - ArrayList as1 = new ArrayList(), as2 = new ArrayList(); + ArrayList as1 = new ArrayList(), + as2 = new ArrayList(); int pdbpos = s2.getStart() + getSeq2Start() - 2; int alignpos = s1.getStart() + getSeq1Start() - 2; int lp2 = pdbpos - 3, lp1 = alignpos - 3; @@ -945,7 +1055,8 @@ public class AlignSeq pdbpos++; } - if (allowmismatch || c1 == c2) + // ignore case differences + if (allowmismatch || (c1 == c2) || (Math.abs(c2-c1)==('a'-'A'))) { // extend mapping interval if (lp1 + 1 != alignpos || lp2 + 1 != pdbpos) @@ -970,8 +1081,8 @@ public class AlignSeq } // construct range pairs - int[] mapseq1 = new int[as1.size() + (lastmatch ? 1 : 0)], mapseq2 = new int[as2 - .size() + (lastmatch ? 1 : 0)]; + int[] mapseq1 = new int[as1.size() + (lastmatch ? 1 : 0)], + mapseq2 = new int[as2.size() + (lastmatch ? 1 : 0)]; int i = 0; for (Integer ip : as1) { @@ -1014,7 +1125,8 @@ public class AlignSeq List ochains, AlignmentI al, String dnaOrProtein, boolean removeOldAnnots) { - List orig = new ArrayList(), repl = new ArrayList(); + List orig = new ArrayList(), + repl = new ArrayList(); List aligs = new ArrayList(); if (al != null && al.getHeight() > 0) { @@ -1036,8 +1148,8 @@ public class AlignSeq bestm = msq; } } - System.out.println("Best Score for " + (matches.size() + 1) + " :" - + bestscore); + // System.out.println("Best Score for " + (matches.size() + 1) + " :" + // + bestscore); matches.add(bestm); aligns.add(bestaseq); al.deleteSequence(bestm); @@ -1126,6 +1238,8 @@ public class AlignSeq // long start = System.currentTimeMillis(); + SimilarityParams pidParams = new SimilarityParams(true, true, true, + true); float pid; String seqi, seqj; for (int i = 0; i < height; i++) @@ -1166,7 +1280,7 @@ public class AlignSeq seqj = ug; } } - pid = Comparison.PID(seqi, seqj); + pid = (float) PIDModel.computePID(seqi, seqj, pidParams); // use real sequence length rather than string length if (lngth[j] < lngth[i]) @@ -1182,4 +1296,147 @@ public class AlignSeq } return redundancy; } + + /** + * calculate the mean score of the alignment + * mean score is equal to the score of an alignmenet of two sequences with randomly shuffled AA sequence composited of the same AA as the two original sequences + * + */ + public void meanScore() + { + //int length = (indelfreeAstr1.length() > indelfreeAstr2.length()) ? indelfreeAstr1.length() : indelfreeAstr2.length(); + int length = indelfreeAstr1.length(); //both have the same length + //create HashMap for counting residues in each sequence + HashMap seq1ResCount = new HashMap(); + HashMap seq2ResCount = new HashMap(); + + // for both sequences (String indelfreeAstr1 or 2) create a key for the residue and add 1 each time its encountered + for (char residue: indelfreeAstr1.toCharArray()) + { + seq1ResCount.putIfAbsent(residue, 0); + seq1ResCount.replace(residue, seq1ResCount.get(residue) + 1); + } + for (char residue: indelfreeAstr2.toCharArray()) + { + seq2ResCount.putIfAbsent(residue, 0); + seq2ResCount.replace(residue, seq2ResCount.get(residue) + 1); + } + + // meanscore = for each residue pair get the number of appearance and add (countA * countB * pairwiseScore(AB)) + // divide the meanscore by the sequence length afterwards + float _meanscore = 0; + for (char resA : seq1ResCount.keySet()) + { + for (char resB : seq2ResCount.keySet()) + { + int countA = seq1ResCount.get(resA); + int countB = seq2ResCount.get(resB); + + float scoreAB = scoreMatrix.getPairwiseScore(resA, resB); + + _meanscore += countA * countB * scoreAB; + } + } + _meanscore /= length; + this.meanScore = _meanscore; + } + + public float getMeanScore() + { + return this.meanScore; + } + + /** + * calculate the hypothetic max score using the self-alignment of the sequences + */ + public void hypotheticMaxScore() + { + int _hmsA = 0; + int _hmsB = 0; + for (char residue: indelfreeAstr1.toCharArray()) + { + _hmsA += scoreMatrix.getPairwiseScore(residue, residue); + } + for (char residue: indelfreeAstr2.toCharArray()) + { + _hmsB += scoreMatrix.getPairwiseScore(residue, residue); + } + this.hypotheticMaxScore = (_hmsA < _hmsB) ? _hmsA : _hmsB; // take the lower self alignment + + } + + public int getHypotheticMaxScore() + { + return this.hypotheticMaxScore; + } + + /** + * create strings based of astr1 and astr2 but without gaps + */ + public void getIndelfreeAstr() + { + int n = astr1.length(); // both have the same length + for (int i = 0; i < n; i++) + { + if (Character.isLetter(astr1.charAt(i)) && Character.isLetter(astr2.charAt(i))) // if both sequences dont have a gap -> add to indelfreeAstr + { + this.indelfreeAstr1 += astr1.charAt(i); + this.indelfreeAstr2 += astr2.charAt(i); + } + } + } + + /** + * calculates the overall score of the alignment + * preprescore = sum of all scores - all penalties + * if preprescore < 1 ~ alignmentScore = Float.NaN > + * alignmentScore = ((preprescore - meanScore) / (hypotheticMaxScore - meanScore)) * coverage + */ + public void scoreAlignment() throws RuntimeException + { + + getIndelfreeAstr(); + meanScore(); + hypotheticMaxScore(); + // cannot calculate score because denominator would be zero + if (this.hypotheticMaxScore == this.meanScore) + { + throw new IllegalArgumentException(String.format("hypotheticMaxScore (%8.2f) == meanScore (%8.2f) - division by 0", hypotheticMaxScore, meanScore)); + } + //int n = (astr1.length() > astr2.length()) ? astr1.length() : astr2.length(); + int n = indelfreeAstr1.length(); + + float score = 0; + boolean aGapOpen = false; + boolean bGapOpen = false; + for (int i = 0; i < n; i++) + { + char char1 = indelfreeAstr1.charAt(i); + char char2 = indelfreeAstr2.charAt(i); + boolean aIsLetter = Character.isLetter(char1); + boolean bIsLetter = Character.isLetter(char2); + if (aIsLetter && bIsLetter) // if pair -> get score + { + score += scoreMatrix.getPairwiseScore(char1, char2); + } else if (!aIsLetter && !bIsLetter) { // both are gap -> skip + } else if ((!aIsLetter && aGapOpen) || (!bIsLetter && bGapOpen)) { // one side gapopen -> score - gap_extend + score -= GAP_EXTEND_COST; + } else { // no gap open -> score - gap_open + score -= GAP_OPEN_COST; + } + // adjust GapOpen status in both sequences + aGapOpen = (!aIsLetter) ? true : false; + bGapOpen = (!bIsLetter) ? true : false; + } + + float preprescore = score; // if this score < 1 --> alignment score = Float.NaN + score = (score - this.meanScore) / (this.hypotheticMaxScore - this.meanScore); + int[] _max = MiscMath.findMax(new int[]{astr1.replace("-","").length(), astr2.replace("-","").length()}); // {index of max, max} + float coverage = (float) n / (float) _max[1]; // indelfreeAstr length / longest sequence length + float prescore = score; // only debug + score *= coverage; + + System.out.println(String.format("prepre-score: %f, pre-score: %f, longlength: %d\nscore: %f, mean: %f, max: %d", preprescore, prescore, _max[1], score, this.meanScore, this.hypotheticMaxScore)); + this.alignmentScore = (preprescore < 1) ? Float.NaN : score; + } }