2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import java.util.Locale;
25 import jalview.analysis.scoremodels.PIDModel;
26 import jalview.analysis.scoremodels.ScoreMatrix;
27 import jalview.analysis.scoremodels.ScoreModels;
28 import jalview.analysis.scoremodels.SimilarityParams;
29 import jalview.datamodel.AlignmentAnnotation;
30 import jalview.datamodel.AlignmentI;
31 import jalview.datamodel.Mapping;
32 import jalview.datamodel.Sequence;
33 import jalview.datamodel.SequenceI;
34 import jalview.math.MiscMath;
35 import jalview.util.Comparison;
36 import jalview.util.Format;
37 import jalview.util.MapList;
38 import jalview.util.MessageManager;
40 import java.awt.Color;
41 import java.awt.Graphics;
42 import java.io.PrintStream;
43 import java.lang.IllegalArgumentException;
44 import java.util.ArrayList;
45 import java.util.Arrays;
46 import java.util.HashMap;
47 import java.util.List;
48 import java.util.StringTokenizer;
58 private static final int MAX_NAME_LENGTH = 30;
61 private static final int GAP_OPEN_COST = 120;
62 //private static final int GAP_OPEN_COST = 100;
64 private static final int GAP_EXTEND_COST = 20;
65 //private static final int GAP_EXTEND_COST = 5;
67 private static final int GAP_INDEX = -1;
69 public static final String PEP = "pep";
71 public static final String DNA = "dna";
73 private static final String NEWLINE = System.lineSeparator();
83 int[][] traceback; // todo is this actually used?
105 public String astr1 = "";
107 public String astr2 = "";
109 public String indelfreeAstr1 = "";
111 public String indelfreeAstr2 = "";
114 public int seq1start;
120 public int seq2start;
126 public float maxscore;
128 public float meanScore; //needed for PaSiMap
130 public int hypotheticMaxScore; // needed for PaSiMap
134 StringBuffer output = new StringBuffer();
136 String type; // AlignSeq.PEP or AlignSeq.DNA
138 private ScoreMatrix scoreMatrix;
141 * Creates a new AlignSeq object.
144 * first sequence for alignment
146 * second sequence for alignment
148 * molecule type, either AlignSeq.PEP or AlignSeq.DNA
150 public AlignSeq(SequenceI s1, SequenceI s2, String type)
152 seqInit(s1, s1.getSequenceAsString(), s2, s2.getSequenceAsString(),
157 * Creates a new AlignSeq object.
166 public AlignSeq(SequenceI s1, String string1, SequenceI s2,
167 String string2, String type)
169 seqInit(s1, string1.toUpperCase(Locale.ROOT), s2,
170 string2.toUpperCase(Locale.ROOT), type);
176 * @return DOCUMENT ME!
178 public float getMaxScore()
184 * returns the overall score of the alignment
188 public float getAlignmentScore()
190 return alignmentScore;
196 * @return DOCUMENT ME!
198 public int getSeq2Start()
206 * @return DOCUMENT ME!
208 public int getSeq2End()
216 * @return DOCUMENT ME!
218 public int getSeq1Start()
226 * @return DOCUMENT ME!
228 public int getSeq1End()
236 * @return DOCUMENT ME!
238 public String getOutput()
240 return output.toString();
246 * @return DOCUMENT ME!
248 public String getAStr1()
256 * @return DOCUMENT ME!
258 public String getAStr2()
266 * @return DOCUMENT ME!
268 public int[] getASeq1()
276 * @return DOCUMENT ME!
278 public int[] getASeq2()
285 * @return aligned instance of Seq 1
287 public SequenceI getAlignedSeq1()
289 SequenceI alSeq1 = new Sequence(s1.getName(), getAStr1());
290 alSeq1.setStart(s1.getStart() + getSeq1Start() - 1);
291 alSeq1.setEnd(s1.getStart() + getSeq1End() - 1);
292 alSeq1.setDatasetSequence(
293 s1.getDatasetSequence() == null ? s1 : s1.getDatasetSequence());
299 * @return aligned instance of Seq 2
301 public SequenceI getAlignedSeq2()
303 SequenceI alSeq2 = new Sequence(s2.getName(), getAStr2());
304 alSeq2.setStart(s2.getStart() + getSeq2Start() - 1);
305 alSeq2.setEnd(s2.getStart() + getSeq2End() - 1);
306 alSeq2.setDatasetSequence(
307 s2.getDatasetSequence() == null ? s2 : s2.getDatasetSequence());
312 * Construct score matrix for sequences with standard DNA or PEPTIDE matrix
317 * - string to use for s1
321 * - string to use for s2
325 public void seqInit(SequenceI s1, String string1, SequenceI s2,
326 String string2, String type)
330 setDefaultParams(type);
331 seqInit(string1, string2);
335 * construct score matrix for string1 and string2 (after removing any existing
341 private void seqInit(String string1, String string2)
343 s1str = extractGaps(jalview.util.Comparison.GapChars, string1);
344 s2str = extractGaps(jalview.util.Comparison.GapChars, string2);
346 if (s1str.length() == 0 || s2str.length() == 0)
349 "ALL GAPS: " + (s1str.length() == 0 ? s1.getName() : " ")
350 + (s2str.length() == 0 ? s2.getName() : ""));
354 score = new float[s1str.length()][s2str.length()];
356 E = new float[s1str.length()][s2str.length()];
358 F = new float[s1str.length()][s2str.length()];
359 traceback = new int[s1str.length()][s2str.length()];
361 seq1 = indexEncode(s1str);
363 seq2 = indexEncode(s2str);
366 private void setDefaultParams(String moleculeType)
368 if (!PEP.equals(moleculeType) && !DNA.equals(moleculeType))
370 output.append("Wrong type = dna or pep only");
371 throw new Error(MessageManager
372 .formatMessage("error.unknown_type_dna_or_pep", new String[]
377 scoreMatrix = ScoreModels.getInstance()
378 .getDefaultModel(PEP.equals(type));
384 public void traceAlignment()
386 // Find the maximum score along the rhs or bottom row
387 float max = -Float.MAX_VALUE;
389 for (int i = 0; i < seq1.length; i++)
391 if (score[i][seq2.length - 1] > max)
393 max = score[i][seq2.length - 1];
395 maxj = seq2.length - 1;
399 for (int j = 0; j < seq2.length; j++)
401 if (score[seq1.length - 1][j] > max)
403 max = score[seq1.length - 1][j];
404 maxi = seq1.length - 1;
412 maxscore = score[i][j] / 10f;
415 aseq1 = new int[seq1.length + seq2.length];
416 aseq2 = new int[seq1.length + seq2.length];
418 StringBuilder sb1 = new StringBuilder(aseq1.length);
419 StringBuilder sb2 = new StringBuilder(aseq2.length);
421 count = (seq1.length + seq2.length) - 1;
424 while (i > 0 && j > 0)
426 aseq1[count] = seq1[i];
427 sb1.append(s1str.charAt(i));
428 aseq2[count] = seq2[j];
429 sb2.append(s2str.charAt(j));
431 trace = findTrace(i, j);
441 aseq1[count] = GAP_INDEX;
442 sb1.replace(sb1.length() - 1, sb1.length(), "-");
444 else if (trace == -1)
447 aseq2[count] = GAP_INDEX;
448 sb2.replace(sb2.length() - 1, sb2.length(), "-");
457 if (aseq1[count] != GAP_INDEX)
459 aseq1[count] = seq1[i];
460 sb1.append(s1str.charAt(i));
463 if (aseq2[count] != GAP_INDEX)
465 aseq2[count] = seq2[j];
466 sb2.append(s2str.charAt(j));
471 * we built the character strings backwards, so now
472 * reverse them to convert to sequence strings
474 astr1 = sb1.reverse().toString();
475 astr2 = sb2.reverse().toString();
481 public void traceAlignmentWithEndGaps()
483 // Find the maximum score along the rhs or bottom row
484 float max = -Float.MAX_VALUE;
486 for (int i = 0; i < seq1.length; i++)
488 if (score[i][seq2.length - 1] > max)
490 max = score[i][seq2.length - 1];
492 maxj = seq2.length - 1;
496 for (int j = 0; j < seq2.length; j++)
498 if (score[seq1.length - 1][j] > max)
500 max = score[seq1.length - 1][j];
501 maxi = seq1.length - 1;
509 maxscore = score[i][j] / 10f;
511 //&! get trailing gaps
512 while ((i < seq1.length - 1) || (j < seq2.length - 1))
521 aseq1 = new int[seq1.length + seq2.length];
522 aseq2 = new int[seq1.length + seq2.length];
524 StringBuilder sb1 = new StringBuilder(aseq1.length);
525 StringBuilder sb2 = new StringBuilder(aseq2.length);
527 count = (seq1.length + seq2.length) - 1;
529 //&! get trailing gaps
530 while ((i >= seq1.length) || (j >= seq2.length))
532 if (i >= seq1.length)
534 aseq1[count] = GAP_INDEX;
536 aseq2[count] = seq2[j];
537 sb2.append(s2str.charAt(j));
538 } else if (j >= seq2.length) {
539 aseq1[count] = seq1[i];
540 sb1.append(s1str.charAt(i));
541 aseq2[count] = GAP_INDEX;
549 while (i > 0 && j > 0)
551 aseq1[count] = seq1[i];
552 sb1.append(s1str.charAt(i));
553 aseq2[count] = seq2[j];
554 sb2.append(s2str.charAt(j));
556 trace = findTrace(i, j);
566 aseq1[count] = GAP_INDEX;
567 sb1.replace(sb1.length() - 1, sb1.length(), "-");
569 else if (trace == -1)
572 aseq2[count] = GAP_INDEX;
573 sb2.replace(sb2.length() - 1, sb2.length(), "-");
582 if (aseq1[count] != GAP_INDEX)
584 aseq1[count] = seq1[i];
585 sb1.append(s1str.charAt(i));
588 if (aseq2[count] != GAP_INDEX)
590 aseq2[count] = seq2[j];
591 sb2.append(s2str.charAt(j));
594 //&! get initial gaps
595 while (j > 0 || i > 0)
600 sb2.append(s2str.charAt(j));
603 sb1.append(s1str.charAt(i));
610 * we built the character strings backwards, so now
611 * reverse them to convert to sequence strings
613 astr1 = sb1.reverse().toString();
614 astr2 = sb2.reverse().toString();
620 public void printAlignment(PrintStream os)
622 // TODO: Use original sequence characters rather than re-translated
623 // characters in output
624 // Find the biggest id length for formatting purposes
625 String s1id = getAlignedSeq1().getDisplayId(true);
626 String s2id = getAlignedSeq2().getDisplayId(true);
627 int nameLength = Math.max(s1id.length(), s2id.length());
628 if (nameLength > MAX_NAME_LENGTH)
630 int truncateBy = nameLength - MAX_NAME_LENGTH;
631 nameLength = MAX_NAME_LENGTH;
632 // JAL-527 - truncate the sequence ids
633 if (s1id.length() > nameLength)
635 int slashPos = s1id.lastIndexOf('/');
636 s1id = s1id.substring(0, slashPos - truncateBy)
637 + s1id.substring(slashPos);
639 if (s2id.length() > nameLength)
641 int slashPos = s2id.lastIndexOf('/');
642 s2id = s2id.substring(0, slashPos - truncateBy)
643 + s2id.substring(slashPos);
646 int len = 72 - nameLength - 1;
647 int nochunks = ((aseq1.length - count) / len)
648 + ((aseq1.length - count) % len > 0 ? 1 : 0);
651 output.append("Score = ").append(score[maxi][maxj]).append(NEWLINE);
652 output.append("Length of alignment = ")
653 .append(String.valueOf(aseq1.length - count)).append(NEWLINE);
654 output.append("Sequence ");
655 Format nameFormat = new Format("%" + nameLength + "s");
656 output.append(nameFormat.form(s1id));
657 output.append(" (Sequence length = ")
658 .append(String.valueOf(s1str.length())).append(")")
660 output.append("Sequence ");
661 output.append(nameFormat.form(s2id));
662 output.append(" (Sequence length = ")
663 .append(String.valueOf(s2str.length())).append(")")
664 .append(NEWLINE).append(NEWLINE);
666 ScoreMatrix pam250 = ScoreModels.getInstance().getPam250();
668 for (int j = 0; j < nochunks; j++)
670 // Print the first aligned sequence
671 output.append(nameFormat.form(s1id)).append(" ");
673 for (int i = 0; i < len; i++)
675 if ((i + (j * len)) < astr1.length())
677 output.append(astr1.charAt(i + (j * len)));
681 output.append(NEWLINE);
682 output.append(nameFormat.form(" ")).append(" ");
685 * Print out the match symbols:
686 * | for exact match (ignoring case)
687 * . if PAM250 score is positive
690 for (int i = 0; i < len; i++)
692 if ((i + (j * len)) < astr1.length())
694 char c1 = astr1.charAt(i + (j * len));
695 char c2 = astr2.charAt(i + (j * len));
696 boolean sameChar = Comparison.isSameResidue(c1, c2, false);
697 if (sameChar && !Comparison.isGap(c1))
702 else if (PEP.equals(type))
704 if (pam250.getPairwiseScore(c1, c2) > 0)
720 // Now print the second aligned sequence
721 output = output.append(NEWLINE);
722 output = output.append(nameFormat.form(s2id)).append(" ");
724 for (int i = 0; i < len; i++)
726 if ((i + (j * len)) < astr2.length())
728 output.append(astr2.charAt(i + (j * len)));
732 output.append(NEWLINE).append(NEWLINE);
735 pid = pid / (aseq1.length - count) * 100;
736 output.append(new Format("Percentage ID = %3.2f\n").form(pid));
737 output.append(NEWLINE);
740 os.print(output.toString());
741 } catch (Exception ex)
754 * @return DOCUMENT ME!
756 public int findTrace(int i, int j)
759 float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(i),
761 float max = score[i - 1][j - 1] + (pairwiseScore * 10);
768 else if (F[i][j] == max)
782 else if (E[i][j] == max)
799 public void calcScoreMatrix()
804 // top left hand element
805 score[0][0] = scoreMatrix.getPairwiseScore(s1str.charAt(0),
806 s2str.charAt(0)) * 10;
807 E[0][0] = -GAP_EXTEND_COST;
810 // Calculate the top row first
811 for (int j = 1; j < m; j++)
813 // What should these values be? 0 maybe
814 E[0][j] = max(score[0][j - 1] - GAP_OPEN_COST,
815 E[0][j - 1] - GAP_EXTEND_COST);
816 F[0][j] = -GAP_EXTEND_COST;
818 float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(0),
820 score[0][j] = max(pairwiseScore * 10, -GAP_OPEN_COST,
826 // Now do the left hand column
827 for (int i = 1; i < n; i++)
829 E[i][0] = -GAP_OPEN_COST;
830 F[i][0] = max(score[i - 1][0] - GAP_OPEN_COST,
831 F[i - 1][0] - GAP_EXTEND_COST);
833 float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(i),
835 score[i][0] = max(pairwiseScore * 10, E[i][0], F[i][0]);
836 traceback[i][0] = -1;
839 // Now do all the other rows
840 for (int i = 1; i < n; i++)
842 for (int j = 1; j < m; j++)
844 E[i][j] = max(score[i][j - 1] - GAP_OPEN_COST,
845 E[i][j - 1] - GAP_EXTEND_COST);
846 F[i][j] = max(score[i - 1][j] - GAP_OPEN_COST,
847 F[i - 1][j] - GAP_EXTEND_COST);
849 float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(i),
851 score[i][j] = max(score[i - 1][j - 1] + (pairwiseScore * 10),
853 traceback[i][j] = findTrace(i, j);
859 * Returns the given sequence with all of the given gap characters removed.
862 * a string of characters to be treated as gaps
868 public static String extractGaps(String gapChars, String seq)
870 if (gapChars == null || seq == null)
874 StringTokenizer str = new StringTokenizer(seq, gapChars);
875 StringBuilder newString = new StringBuilder(seq.length());
877 while (str.hasMoreTokens())
879 newString.append(str.nextToken());
882 return newString.toString();
895 * @return DOCUMENT ME!
897 private static float max(float f1, float f2, float f3)
922 * @return DOCUMENT ME!
924 private static float max(float f1, float f2)
937 * Converts the character string to an array of integers which are the
938 * corresponding indices to the characters in the score matrix
944 int[] indexEncode(String s)
946 int[] encoded = new int[s.length()];
948 for (int i = 0; i < s.length(); i++)
950 char c = s.charAt(i);
951 encoded[i] = scoreMatrix.getMatrixIndex(c);
971 public static void displayMatrix(Graphics g, int[][] mat, int n, int m,
974 // TODO method doesn't seem to be referenced anywhere delete??
978 for (int i = 0; i < n; i++)
980 for (int j = 0; j < m; j++)
982 if (mat[i][j] >= max)
987 if (mat[i][j] <= min)
994 System.out.println(max + " " + min);
996 for (int i = 0; i < n; i++)
998 for (int j = 0; j < m; j++)
1003 // System.out.println(mat[i][j]);
1004 float score = (float) (mat[i][j] - min) / (float) (max - min);
1005 g.setColor(new Color(score, 0, 0));
1006 g.fillRect(x, y, psize, psize);
1008 // System.out.println(x + " " + y + " " + score);
1014 * Compute a globally optimal needleman and wunsch alignment between two
1020 * AlignSeq.DNA or AlignSeq.PEP
1022 public static AlignSeq doGlobalNWAlignment(SequenceI s1, SequenceI s2,
1025 AlignSeq as = new AlignSeq(s1, s2, type);
1027 as.calcScoreMatrix();
1028 as.traceAlignment();
1034 * @return mapping from positions in S1 to corresponding positions in S2
1036 public jalview.datamodel.Mapping getMappingFromS1(boolean allowmismatch)
1038 ArrayList<Integer> as1 = new ArrayList<Integer>(),
1039 as2 = new ArrayList<Integer>();
1040 int pdbpos = s2.getStart() + getSeq2Start() - 2;
1041 int alignpos = s1.getStart() + getSeq1Start() - 2;
1042 int lp2 = pdbpos - 3, lp1 = alignpos - 3;
1043 boolean lastmatch = false;
1044 // and now trace the alignment onto the atom set.
1045 for (int i = 0; i < astr1.length(); i++)
1047 char c1 = astr1.charAt(i), c2 = astr2.charAt(i);
1058 // ignore case differences
1059 if (allowmismatch || (c1 == c2) || (Math.abs(c2-c1)==('a'-'A')))
1061 // extend mapping interval
1062 if (lp1 + 1 != alignpos || lp2 + 1 != pdbpos)
1064 as1.add(Integer.valueOf(alignpos));
1065 as2.add(Integer.valueOf(pdbpos));
1073 // extend mapping interval
1076 as1.add(Integer.valueOf(lp1));
1077 as2.add(Integer.valueOf(lp2));
1082 // construct range pairs
1084 int[] mapseq1 = new int[as1.size() + (lastmatch ? 1 : 0)],
1085 mapseq2 = new int[as2.size() + (lastmatch ? 1 : 0)];
1087 for (Integer ip : as1)
1093 for (Integer ip : as2)
1100 mapseq1[mapseq1.length - 1] = alignpos;
1101 mapseq2[mapseq2.length - 1] = pdbpos;
1103 MapList map = new MapList(mapseq1, mapseq2, 1, 1);
1105 jalview.datamodel.Mapping mapping = new Mapping(map);
1111 * matches ochains against al and populates seqs with the best match between
1112 * each ochain and the set in al
1116 * @param dnaOrProtein
1117 * @param removeOldAnnots
1118 * when true, old annotation is cleared before new annotation
1120 * @return List<List<SequenceI> originals, List<SequenceI> replacement,
1121 * List<AlignSeq> alignment between each>
1123 public static List<List<? extends Object>> replaceMatchingSeqsWith(
1124 List<SequenceI> seqs, List<AlignmentAnnotation> annotations,
1125 List<SequenceI> ochains, AlignmentI al, String dnaOrProtein,
1126 boolean removeOldAnnots)
1128 List<SequenceI> orig = new ArrayList<SequenceI>(),
1129 repl = new ArrayList<SequenceI>();
1130 List<AlignSeq> aligs = new ArrayList<AlignSeq>();
1131 if (al != null && al.getHeight() > 0)
1133 ArrayList<SequenceI> matches = new ArrayList<SequenceI>();
1134 ArrayList<AlignSeq> aligns = new ArrayList<AlignSeq>();
1136 for (SequenceI sq : ochains)
1138 SequenceI bestm = null;
1139 AlignSeq bestaseq = null;
1140 float bestscore = 0;
1141 for (SequenceI msq : al.getSequences())
1143 AlignSeq aseq = doGlobalNWAlignment(msq, sq, dnaOrProtein);
1144 if (bestm == null || aseq.getMaxScore() > bestscore)
1146 bestscore = aseq.getMaxScore();
1151 // System.out.println("Best Score for " + (matches.size() + 1) + " :"
1154 aligns.add(bestaseq);
1155 al.deleteSequence(bestm);
1157 for (int p = 0, pSize = seqs.size(); p < pSize; p++)
1159 SequenceI sq, sp = seqs.get(p);
1161 if ((q = ochains.indexOf(sp)) > -1)
1163 seqs.set(p, sq = matches.get(q));
1166 sq.setName(sp.getName());
1167 sq.setDescription(sp.getDescription());
1169 sq.transferAnnotation(sp,
1170 sp2sq = aligns.get(q).getMappingFromS1(false));
1171 aligs.add(aligns.get(q));
1173 for (int ap = 0; ap < annotations.size();)
1175 if (annotations.get(ap).sequenceRef == sp)
1181 if (removeOldAnnots)
1183 annotations.remove(ap);
1187 AlignmentAnnotation alan = annotations.remove(ap);
1188 alan.liftOver(sq, sp2sq);
1189 alan.setSequenceRef(sq);
1190 sq.addAlignmentAnnotation(alan);
1198 if (sq.getAnnotation() != null && sq.getAnnotation().length > 0)
1200 annotations.addAll(inspos == -1 ? annotations.size() : inspos,
1201 Arrays.asList(sq.getAnnotation()));
1206 return Arrays.asList(orig, repl, aligs);
1210 * compute the PID vector used by the redundancy filter.
1212 * @param originalSequences
1213 * - sequences in alignment that are to filtered
1215 * - null or strings to be analysed (typically, visible portion of
1216 * each sequence in alignment)
1218 * - first column in window for calculation
1220 * - last column in window for calculation
1222 * - if true then use ungapped sequence to compute PID
1223 * @return vector containing maximum PID for i-th sequence and any sequences
1224 * longer than that seuqence
1226 public static float[] computeRedundancyMatrix(
1227 SequenceI[] originalSequences, String[] omitHidden, int start,
1228 int end, boolean ungapped)
1230 int height = originalSequences.length;
1231 float[] redundancy = new float[height];
1232 int[] lngth = new int[height];
1233 for (int i = 0; i < height; i++)
1239 // long start = System.currentTimeMillis();
1241 SimilarityParams pidParams = new SimilarityParams(true, true, true,
1245 for (int i = 0; i < height; i++)
1248 for (int j = 0; j < i; j++)
1255 if (omitHidden == null)
1257 seqi = originalSequences[i].getSequenceAsString(start, end);
1258 seqj = originalSequences[j].getSequenceAsString(start, end);
1262 seqi = omitHidden[i];
1263 seqj = omitHidden[j];
1267 String ug = AlignSeq.extractGaps(Comparison.GapChars, seqi);
1268 lngth[i] = ug.length();
1276 String ug = AlignSeq.extractGaps(Comparison.GapChars, seqj);
1277 lngth[j] = ug.length();
1283 pid = (float) PIDModel.computePID(seqi, seqj, pidParams);
1285 // use real sequence length rather than string length
1286 if (lngth[j] < lngth[i])
1288 redundancy[j] = Math.max(pid, redundancy[j]);
1292 redundancy[i] = Math.max(pid, redundancy[i]);
1301 * calculate the mean score of the alignment
1302 * mean score is equal to the score of an alignmenet of two sequences with randomly shuffled AA sequence composited of the same AA as the two original sequences
1305 public void meanScore()
1307 //int length = (indelfreeAstr1.length() > indelfreeAstr2.length()) ? indelfreeAstr1.length() : indelfreeAstr2.length();
1308 int length = indelfreeAstr1.length(); //both have the same length
1309 //create HashMap for counting residues in each sequence
1310 HashMap<Character, Integer> seq1ResCount = new HashMap<Character, Integer>();
1311 HashMap<Character, Integer> seq2ResCount = new HashMap<Character, Integer>();
1313 // for both sequences (String indelfreeAstr1 or 2) create a key for the residue and add 1 each time its encountered
1314 for (char residue: indelfreeAstr1.toCharArray())
1316 seq1ResCount.putIfAbsent(residue, 0);
1317 seq1ResCount.replace(residue, seq1ResCount.get(residue) + 1);
1319 for (char residue: indelfreeAstr2.toCharArray())
1321 seq2ResCount.putIfAbsent(residue, 0);
1322 seq2ResCount.replace(residue, seq2ResCount.get(residue) + 1);
1325 // meanscore = for each residue pair get the number of appearance and add (countA * countB * pairwiseScore(AB))
1326 // divide the meanscore by the sequence length afterwards
1327 float _meanscore = 0;
1328 for (char resA : seq1ResCount.keySet())
1330 for (char resB : seq2ResCount.keySet())
1332 int countA = seq1ResCount.get(resA);
1333 int countB = seq2ResCount.get(resB);
1335 float scoreAB = scoreMatrix.getPairwiseScore(resA, resB);
1337 _meanscore += countA * countB * scoreAB;
1340 _meanscore /= length;
1341 this.meanScore = _meanscore;
1344 public float getMeanScore()
1346 return this.meanScore;
1350 * calculate the hypothetic max score using the self-alignment of the sequences
1352 public void hypotheticMaxScore()
1356 for (char residue: indelfreeAstr1.toCharArray())
1358 _hmsA += scoreMatrix.getPairwiseScore(residue, residue);
1360 for (char residue: indelfreeAstr2.toCharArray())
1362 _hmsB += scoreMatrix.getPairwiseScore(residue, residue);
1364 this.hypotheticMaxScore = (_hmsA < _hmsB) ? _hmsA : _hmsB; // take the lower self alignment
1368 public int getHypotheticMaxScore()
1370 return this.hypotheticMaxScore;
1374 * create strings based of astr1 and astr2 but without gaps
1376 public void getIndelfreeAstr()
1378 int n = astr1.length(); // both have the same length
1379 for (int i = 0; i < n; i++)
1381 if (Character.isLetter(astr1.charAt(i)) && Character.isLetter(astr2.charAt(i))) // if both sequences dont have a gap -> add to indelfreeAstr
1383 this.indelfreeAstr1 += astr1.charAt(i);
1384 this.indelfreeAstr2 += astr2.charAt(i);
1390 * calculates the overall score of the alignment
1391 * preprescore = sum of all scores - all penalties
1392 * if preprescore < 1 ~ alignmentScore = Float.NaN >
1393 * alignmentScore = ((preprescore - meanScore) / (hypotheticMaxScore - meanScore)) * coverage
1395 public void scoreAlignment() throws RuntimeException
1400 hypotheticMaxScore();
1401 // cannot calculate score because denominator would be zero
1402 if (this.hypotheticMaxScore == this.meanScore)
1404 throw new IllegalArgumentException(String.format("hypotheticMaxScore (%8.2f) == meanScore (%8.2f) - division by 0", hypotheticMaxScore, meanScore));
1406 //int n = (astr1.length() > astr2.length()) ? astr1.length() : astr2.length();
1407 int n = indelfreeAstr1.length();
1410 boolean aGapOpen = false;
1411 boolean bGapOpen = false;
1412 for (int i = 0; i < n; i++)
1414 char char1 = indelfreeAstr1.charAt(i);
1415 char char2 = indelfreeAstr2.charAt(i);
1416 boolean aIsLetter = Character.isLetter(char1);
1417 boolean bIsLetter = Character.isLetter(char2);
1418 if (aIsLetter && bIsLetter) // if pair -> get score
1420 score += scoreMatrix.getPairwiseScore(char1, char2);
1421 } else if (!aIsLetter && !bIsLetter) { // both are gap -> skip
1422 } else if ((!aIsLetter && aGapOpen) || (!bIsLetter && bGapOpen)) { // one side gapopen -> score - gap_extend
1423 score -= GAP_EXTEND_COST;
1424 } else { // no gap open -> score - gap_open
1425 score -= GAP_OPEN_COST;
1427 // adjust GapOpen status in both sequences
1428 aGapOpen = (!aIsLetter) ? true : false;
1429 bGapOpen = (!bIsLetter) ? true : false;
1432 float preprescore = score; // if this score < 1 --> alignment score = Float.NaN
1433 score = (score - this.meanScore) / (this.hypotheticMaxScore - this.meanScore);
1434 int[] _max = MiscMath.findMax(new int[]{astr1.replace("-","").length(), astr2.replace("-","").length()}); // {index of max, max}
1435 float coverage = (float) n / (float) _max[1]; // indelfreeAstr length / longest sequence length
1436 float prescore = score; // only debug
1439 System.out.println(String.format("prepre-score: %f, pre-score: %f, longlength: %d\nscore: %f, mean: %f, max: %d", preprescore, prescore, _max[1], score, this.meanScore, this.hypotheticMaxScore));
1440 this.alignmentScore = (preprescore < 1) ? Float.NaN : score;