2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import jalview.analysis.scoremodels.PIDModel;
24 import jalview.analysis.scoremodels.ScoreMatrix;
25 import jalview.analysis.scoremodels.ScoreModels;
26 import jalview.analysis.scoremodels.SimilarityParams;
27 import jalview.datamodel.AlignmentAnnotation;
28 import jalview.datamodel.AlignmentI;
29 import jalview.datamodel.Mapping;
30 import jalview.datamodel.Sequence;
31 import jalview.datamodel.SequenceI;
32 import jalview.math.MiscMath;
33 import jalview.util.Comparison;
34 import jalview.util.Format;
35 import jalview.util.MapList;
36 import jalview.util.MessageManager;
38 import java.awt.Color;
39 import java.awt.Graphics;
40 import java.io.PrintStream;
41 import java.lang.IllegalArgumentException;
42 import java.util.ArrayList;
43 import java.util.Arrays;
44 import java.util.HashMap;
45 import java.util.List;
46 import java.util.StringTokenizer;
47 import java.util.Locale;
55 private static final int MAX_NAME_LENGTH = 30;
57 private static final int DEFAULT_OPENCOST = 120;
59 private static final int DEFAULT_EXTENDCOST = 20;
61 private int GAP_OPEN_COST=DEFAULT_OPENCOST;
63 private int GAP_EXTEND_COST=DEFAULT_EXTENDCOST;
65 private static final int GAP_INDEX = -1;
67 public static final String PEP = "pep";
69 public static final String DNA = "dna";
71 private static final String NEWLINE = System.lineSeparator();
81 int[][] traceback; // todo is this actually used?
103 public String astr1 = "";
105 public String astr2 = "";
107 public String indelfreeAstr1 = "";
109 public String indelfreeAstr2 = "";
112 public int seq1start;
118 public int seq2start;
124 public float maxscore;
126 public float meanScore; //needed for PaSiMap
128 public int hypotheticMaxScore; // needed for PaSiMap
132 StringBuffer output = new StringBuffer();
134 String type; // AlignSeq.PEP or AlignSeq.DNA
136 private ScoreMatrix scoreMatrix;
139 * Creates a new AlignSeq object.
142 * first sequence for alignment
144 * second sequence for alignment
146 * molecule type, either AlignSeq.PEP or AlignSeq.DNA
148 public AlignSeq(SequenceI s1, SequenceI s2, String type)
150 seqInit(s1, s1.getSequenceAsString(), s2, s2.getSequenceAsString(),
155 * Creates a new AlignSeq object.
158 * s1 reference sequence for string1
160 * s2 reference sequence for string2
162 * molecule type, either AlignSeq.PEP or AlignSeq.DNA
164 public AlignSeq(SequenceI s1, String string1, SequenceI s2,
165 String string2, String type)
167 seqInit(s1, string1.toUpperCase(Locale.ROOT), s2,
168 string2.toUpperCase(Locale.ROOT), type);
171 public AlignSeq(SequenceI s1, SequenceI s2, String type, int opencost,
175 GAP_OPEN_COST=opencost;
176 GAP_EXTEND_COST=extcost;
179 public AlignSeq(SequenceI s12, String string1, SequenceI s22,
180 String string2, String type2, int defaultOpencost,
181 int defaultExtendcost)
183 this(s12,string1,s22,string2,type2);
184 GAP_OPEN_COST=defaultOpencost;
185 GAP_EXTEND_COST=defaultExtendcost;
191 * @return DOCUMENT ME!
193 public float getMaxScore()
199 * returns the overall score of the alignment
203 public float getAlignmentScore()
205 return alignmentScore;
211 * @return DOCUMENT ME!
213 public int getSeq2Start()
221 * @return DOCUMENT ME!
223 public int getSeq2End()
231 * @return DOCUMENT ME!
233 public int getSeq1Start()
241 * @return DOCUMENT ME!
243 public int getSeq1End()
251 * @return DOCUMENT ME!
253 public String getOutput()
255 return output.toString();
261 * @return DOCUMENT ME!
263 public String getAStr1()
271 * @return DOCUMENT ME!
273 public String getAStr2()
281 * @return DOCUMENT ME!
283 public int[] getASeq1()
291 * @return DOCUMENT ME!
293 public int[] getASeq2()
300 * @return aligned instance of Seq 1
302 public SequenceI getAlignedSeq1()
304 SequenceI alSeq1 = new Sequence(s1.getName(), getAStr1());
305 alSeq1.setStart(s1.getStart() + getSeq1Start() - 1);
306 alSeq1.setEnd(s1.getStart() + getSeq1End() - 1);
307 alSeq1.setDatasetSequence(
308 s1.getDatasetSequence() == null ? s1 : s1.getDatasetSequence());
314 * @return aligned instance of Seq 2
316 public SequenceI getAlignedSeq2()
318 SequenceI alSeq2 = new Sequence(s2.getName(), getAStr2());
319 alSeq2.setStart(s2.getStart() + getSeq2Start() - 1);
320 alSeq2.setEnd(s2.getStart() + getSeq2End() - 1);
321 alSeq2.setDatasetSequence(
322 s2.getDatasetSequence() == null ? s2 : s2.getDatasetSequence());
327 * Construct score matrix for sequences with standard DNA or PEPTIDE matrix
332 * - string to use for s1
336 * - string to use for s2
340 public void seqInit(SequenceI s1, String string1, SequenceI s2,
341 String string2, String type)
343 seqInit(s1,string1,s2,string2,type,GAP_OPEN_COST,GAP_EXTEND_COST);
345 public void seqInit(SequenceI s1, String string1, SequenceI s2,
346 String string2, String type, int opening,int extension)
348 GAP_OPEN_COST=opening;
349 GAP_EXTEND_COST=extension;
352 setDefaultParams(type);
353 seqInit(string1, string2);
357 * construct score matrix for string1 and string2 (after removing any existing
363 private void seqInit(String string1, String string2)
365 s1str = extractGaps(jalview.util.Comparison.GapChars, string1);
366 s2str = extractGaps(jalview.util.Comparison.GapChars, string2);
368 if (s1str.length() == 0 || s2str.length() == 0)
371 "ALL GAPS: " + (s1str.length() == 0 ? s1.getName() : " ")
372 + (s2str.length() == 0 ? s2.getName() : ""));
376 score = new float[s1str.length()][s2str.length()];
378 E = new float[s1str.length()][s2str.length()];
380 F = new float[s1str.length()][s2str.length()];
381 traceback = new int[s1str.length()][s2str.length()];
383 seq1 = indexEncode(s1str);
385 seq2 = indexEncode(s2str);
388 private void setDefaultParams(String moleculeType)
390 if (!PEP.equals(moleculeType) && !DNA.equals(moleculeType))
392 output.append("Wrong type = dna or pep only");
393 throw new Error(MessageManager
394 .formatMessage("error.unknown_type_dna_or_pep", new String[]
399 scoreMatrix = ScoreModels.getInstance()
400 .getDefaultModel(PEP.equals(type));
406 public void traceAlignment()
408 // Find the maximum score along the rhs or bottom row
409 float max = -Float.MAX_VALUE;
411 for (int i = 0; i < seq1.length; i++)
413 if (score[i][seq2.length - 1] > max)
415 max = score[i][seq2.length - 1];
417 maxj = seq2.length - 1;
421 for (int j = 0; j < seq2.length; j++)
423 if (score[seq1.length - 1][j] > max)
425 max = score[seq1.length - 1][j];
426 maxi = seq1.length - 1;
434 maxscore = score[i][j] / 10f;
437 aseq1 = new int[seq1.length + seq2.length];
438 aseq2 = new int[seq1.length + seq2.length];
440 StringBuilder sb1 = new StringBuilder(aseq1.length);
441 StringBuilder sb2 = new StringBuilder(aseq2.length);
443 count = (seq1.length + seq2.length) - 1;
446 while (i > 0 && j > 0)
448 aseq1[count] = seq1[i];
449 sb1.append(s1str.charAt(i));
450 aseq2[count] = seq2[j];
451 sb2.append(s2str.charAt(j));
453 trace = findTrace(i, j);
463 aseq1[count] = GAP_INDEX;
464 sb1.replace(sb1.length() - 1, sb1.length(), "-");
466 else if (trace == -1)
469 aseq2[count] = GAP_INDEX;
470 sb2.replace(sb2.length() - 1, sb2.length(), "-");
479 if (aseq1[count] != GAP_INDEX)
481 aseq1[count] = seq1[i];
482 sb1.append(s1str.charAt(i));
485 if (aseq2[count] != GAP_INDEX)
487 aseq2[count] = seq2[j];
488 sb2.append(s2str.charAt(j));
493 * we built the character strings backwards, so now
494 * reverse them to convert to sequence strings
496 astr1 = sb1.reverse().toString();
497 astr2 = sb2.reverse().toString();
503 public void traceAlignmentWithEndGaps()
505 // Find the maximum score along the rhs or bottom row
506 float max = -Float.MAX_VALUE;
508 for (int i = 0; i < seq1.length; i++)
510 if (score[i][seq2.length - 1] > max)
512 max = score[i][seq2.length - 1];
514 maxj = seq2.length - 1;
518 for (int j = 0; j < seq2.length; j++)
520 if (score[seq1.length - 1][j] > max)
522 max = score[seq1.length - 1][j];
523 maxi = seq1.length - 1;
531 maxscore = score[i][j] / 10f;
533 //prepare trailing gaps
534 while ((i < seq1.length - 1) || (j < seq2.length - 1))
542 aseq1 = new int[seq1.length + seq2.length];
543 aseq2 = new int[seq1.length + seq2.length];
545 StringBuilder sb1 = new StringBuilder(aseq1.length);
546 StringBuilder sb2 = new StringBuilder(aseq2.length);
548 count = (seq1.length + seq2.length) - 1;
551 while ((i >= seq1.length) || (j >= seq2.length))
553 if (i >= seq1.length)
555 aseq1[count] = GAP_INDEX;
557 aseq2[count] = seq2[j];
558 sb2.append(s2str.charAt(j));
559 } else if (j >= seq2.length) {
560 aseq1[count] = seq1[i];
561 sb1.append(s1str.charAt(i));
562 aseq2[count] = GAP_INDEX;
569 while (i > 0 && j > 0)
571 aseq1[count] = seq1[i];
572 sb1.append(s1str.charAt(i));
573 aseq2[count] = seq2[j];
574 sb2.append(s2str.charAt(j));
576 trace = findTrace(i, j);
586 aseq1[count] = GAP_INDEX;
587 sb1.replace(sb1.length() - 1, sb1.length(), "-");
589 else if (trace == -1)
592 aseq2[count] = GAP_INDEX;
593 sb2.replace(sb2.length() - 1, sb2.length(), "-");
602 aseq1[count] = seq1[i];
603 sb1.append(s1str.charAt(i));
604 aseq2[count] = seq2[j];
605 sb2.append(s2str.charAt(j));
608 while (j > 0 || i > 0)
614 sb2.append(s2str.charAt(j));
617 sb1.append(s1str.charAt(i));
623 * we built the character strings backwards, so now
624 * reverse them to convert to sequence strings
626 astr1 = sb1.reverse().toString();
627 astr2 = sb2.reverse().toString();
633 public void printAlignment(PrintStream os)
635 // TODO: Use original sequence characters rather than re-translated
636 // characters in output
637 // Find the biggest id length for formatting purposes
638 String s1id = getAlignedSeq1().getDisplayId(true);
639 String s2id = getAlignedSeq2().getDisplayId(true);
640 int nameLength = Math.max(s1id.length(), s2id.length());
641 if (nameLength > MAX_NAME_LENGTH)
643 int truncateBy = nameLength - MAX_NAME_LENGTH;
644 nameLength = MAX_NAME_LENGTH;
645 // JAL-527 - truncate the sequence ids
646 if (s1id.length() > nameLength)
648 int slashPos = s1id.lastIndexOf('/');
649 s1id = s1id.substring(0, slashPos - truncateBy)
650 + s1id.substring(slashPos);
652 if (s2id.length() > nameLength)
654 int slashPos = s2id.lastIndexOf('/');
655 s2id = s2id.substring(0, slashPos - truncateBy)
656 + s2id.substring(slashPos);
659 int len = 72 - nameLength - 1;
660 int nochunks = ((aseq1.length - count) / len)
661 + ((aseq1.length - count) % len > 0 ? 1 : 0);
664 output.append("Score = ").append(score[maxi][maxj]).append(NEWLINE);
665 output.append("Length of alignment = ")
666 .append(String.valueOf(aseq1.length - count)).append(NEWLINE);
667 output.append("Sequence ");
668 Format nameFormat = new Format("%" + nameLength + "s");
669 output.append(nameFormat.form(s1id));
670 output.append(" (Sequence length = ")
671 .append(String.valueOf(s1str.length())).append(")")
673 output.append("Sequence ");
674 output.append(nameFormat.form(s2id));
675 output.append(" (Sequence length = ")
676 .append(String.valueOf(s2str.length())).append(")")
677 .append(NEWLINE).append(NEWLINE);
679 ScoreMatrix pam250 = ScoreModels.getInstance().getPam250();
681 for (int j = 0; j < nochunks; j++)
683 // Print the first aligned sequence
684 output.append(nameFormat.form(s1id)).append(" ");
686 for (int i = 0; i < len; i++)
688 if ((i + (j * len)) < astr1.length())
690 output.append(astr1.charAt(i + (j * len)));
694 output.append(NEWLINE);
695 output.append(nameFormat.form(" ")).append(" ");
698 * Print out the match symbols:
699 * | for exact match (ignoring case)
700 * . if PAM250 score is positive
703 for (int i = 0; i < len; i++)
705 if ((i + (j * len)) < astr1.length())
707 char c1 = astr1.charAt(i + (j * len));
708 char c2 = astr2.charAt(i + (j * len));
709 boolean sameChar = Comparison.isSameResidue(c1, c2, false);
710 if (sameChar && !Comparison.isGap(c1))
715 else if (PEP.equals(type))
717 if (pam250.getPairwiseScore(c1, c2) > 0)
733 // Now print the second aligned sequence
734 output = output.append(NEWLINE);
735 output = output.append(nameFormat.form(s2id)).append(" ");
737 for (int i = 0; i < len; i++)
739 if ((i + (j * len)) < astr2.length())
741 output.append(astr2.charAt(i + (j * len)));
745 output.append(NEWLINE).append(NEWLINE);
748 pid = pid / (aseq1.length - count) * 100;
749 output.append(new Format("Percentage ID = %3.2f\n").form(pid));
750 output.append(NEWLINE);
753 os.print(output.toString());
754 } catch (Exception ex)
767 * @return DOCUMENT ME!
769 public int findTrace(int i, int j)
772 float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(i),
774 float max = score[i - 1][j - 1] + (pairwiseScore * 10);
781 else if (F[i][j] == max)
795 else if (E[i][j] == max)
812 public void calcScoreMatrix()
816 final int GAP_EX_COST=GAP_EXTEND_COST;
817 final int GAP_OP_COST = GAP_OPEN_COST;
818 // top left hand element
819 score[0][0] = scoreMatrix.getPairwiseScore(s1str.charAt(0),
820 s2str.charAt(0)) * 10;
821 E[0][0] = -GAP_EX_COST;
824 // Calculate the top row first
825 for (int j = 1; j < m; j++)
827 // What should these values be? 0 maybe
828 E[0][j] = max(score[0][j - 1] - GAP_OP_COST,
829 E[0][j - 1] - GAP_EX_COST);
830 F[0][j] = -GAP_EX_COST;
832 float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(0),
834 score[0][j] = max(pairwiseScore * 10, -GAP_OP_COST,
840 // Now do the left hand column
841 for (int i = 1; i < n; i++)
843 E[i][0] = -GAP_OP_COST;
844 F[i][0] = max(score[i - 1][0] - GAP_OP_COST,
845 F[i - 1][0] - GAP_EX_COST);
847 float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(i),
849 score[i][0] = max(pairwiseScore * 10, E[i][0], F[i][0]);
850 traceback[i][0] = -1;
853 // Now do all the other rows
854 for (int i = 1; i < n; i++)
856 for (int j = 1; j < m; j++)
858 E[i][j] = max(score[i][j - 1] - GAP_OP_COST,
859 E[i][j - 1] - GAP_EX_COST);
860 F[i][j] = max(score[i - 1][j] - GAP_OP_COST,
861 F[i - 1][j] - GAP_EX_COST);
863 float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(i),
865 score[i][j] = max(score[i - 1][j - 1] + (pairwiseScore * 10),
867 traceback[i][j] = findTrace(i, j);
873 * Returns the given sequence with all of the given gap characters removed.
876 * a string of characters to be treated as gaps
882 public static String extractGaps(String gapChars, String seq)
884 if (gapChars == null || seq == null)
888 StringTokenizer str = new StringTokenizer(seq, gapChars);
889 StringBuilder newString = new StringBuilder(seq.length());
891 while (str.hasMoreTokens())
893 newString.append(str.nextToken());
896 return newString.toString();
909 * @return DOCUMENT ME!
911 private static float max(float f1, float f2, float f3)
936 * @return DOCUMENT ME!
938 private static float max(float f1, float f2)
951 * Converts the character string to an array of integers which are the
952 * corresponding indices to the characters in the score matrix
958 int[] indexEncode(String s)
960 int[] encoded = new int[s.length()];
962 for (int i = 0; i < s.length(); i++)
964 char c = s.charAt(i);
965 encoded[i] = scoreMatrix.getMatrixIndex(c);
985 public static void displayMatrix(Graphics g, int[][] mat, int n, int m,
988 // TODO method doesn't seem to be referenced anywhere delete??
992 for (int i = 0; i < n; i++)
994 for (int j = 0; j < m; j++)
996 if (mat[i][j] >= max)
1001 if (mat[i][j] <= min)
1008 System.out.println(max + " " + min);
1010 for (int i = 0; i < n; i++)
1012 for (int j = 0; j < m; j++)
1017 // System.out.println(mat[i][j]);
1018 float score = (float) (mat[i][j] - min) / (float) (max - min);
1019 g.setColor(new Color(score, 0, 0));
1020 g.fillRect(x, y, psize, psize);
1022 // System.out.println(x + " " + y + " " + score);
1028 * Compute a globally optimal needleman and wunsch alignment between two
1034 * AlignSeq.DNA or AlignSeq.PEP
1036 public static AlignSeq doGlobalNWAlignment(SequenceI s1, SequenceI s2,
1039 return doGlobalNWAlignment(s1, s2, type, DEFAULT_OPENCOST,DEFAULT_EXTENDCOST);
1041 public static AlignSeq doGlobalNWAlignment(SequenceI s1, SequenceI s2,
1042 String type, int opencost,int extcost)
1045 AlignSeq as = new AlignSeq(s1, s2, type,opencost,extcost);
1047 as.calcScoreMatrix();
1048 as.traceAlignment();
1054 * @return mapping from positions in S1 to corresponding positions in S2
1056 public jalview.datamodel.Mapping getMappingFromS1(boolean allowmismatch)
1058 ArrayList<Integer> as1 = new ArrayList<Integer>(),
1059 as2 = new ArrayList<Integer>();
1060 int pdbpos = s2.getStart() + getSeq2Start() - 2;
1061 int alignpos = s1.getStart() + getSeq1Start() - 2;
1062 int lp2 = pdbpos - 3, lp1 = alignpos - 3;
1063 boolean lastmatch = false;
1064 // and now trace the alignment onto the atom set.
1065 for (int i = 0; i < astr1.length(); i++)
1067 char c1 = astr1.charAt(i), c2 = astr2.charAt(i);
1078 // ignore case differences
1079 if (allowmismatch || (c1 == c2) || (Math.abs(c2-c1)==('a'-'A')))
1081 // extend mapping interval
1082 if (lp1 + 1 != alignpos || lp2 + 1 != pdbpos)
1084 as1.add(Integer.valueOf(alignpos));
1085 as2.add(Integer.valueOf(pdbpos));
1093 // extend mapping interval
1096 as1.add(Integer.valueOf(lp1));
1097 as2.add(Integer.valueOf(lp2));
1102 // construct range pairs
1104 int[] mapseq1 = new int[as1.size() + (lastmatch ? 1 : 0)],
1105 mapseq2 = new int[as2.size() + (lastmatch ? 1 : 0)];
1107 for (Integer ip : as1)
1113 for (Integer ip : as2)
1120 mapseq1[mapseq1.length - 1] = alignpos;
1121 mapseq2[mapseq2.length - 1] = pdbpos;
1123 MapList map = new MapList(mapseq1, mapseq2, 1, 1);
1125 jalview.datamodel.Mapping mapping = new Mapping(map);
1131 * matches ochains against al and populates seqs with the best match between
1132 * each ochain and the set in al
1136 * @param dnaOrProtein
1137 * @param removeOldAnnots
1138 * when true, old annotation is cleared before new annotation
1140 * @return List<List<SequenceI> originals, List<SequenceI> replacement,
1141 * List<AlignSeq> alignment between each>
1143 public static List<List<? extends Object>> replaceMatchingSeqsWith(
1144 List<SequenceI> seqs, List<AlignmentAnnotation> annotations,
1145 List<SequenceI> ochains, AlignmentI al, String dnaOrProtein,
1146 boolean removeOldAnnots)
1148 List<SequenceI> orig = new ArrayList<SequenceI>(),
1149 repl = new ArrayList<SequenceI>();
1150 List<AlignSeq> aligs = new ArrayList<AlignSeq>();
1151 if (al != null && al.getHeight() > 0)
1153 ArrayList<SequenceI> matches = new ArrayList<SequenceI>();
1154 ArrayList<AlignSeq> aligns = new ArrayList<AlignSeq>();
1156 for (SequenceI sq : ochains)
1158 SequenceI bestm = null;
1159 AlignSeq bestaseq = null;
1160 float bestscore = 0;
1161 for (SequenceI msq : al.getSequences())
1163 AlignSeq aseq = doGlobalNWAlignment(msq, sq, dnaOrProtein);
1164 if (bestm == null || aseq.getMaxScore() > bestscore)
1166 bestscore = aseq.getMaxScore();
1171 // System.out.println("Best Score for " + (matches.size() + 1) + " :"
1174 aligns.add(bestaseq);
1175 al.deleteSequence(bestm);
1177 for (int p = 0, pSize = seqs.size(); p < pSize; p++)
1179 SequenceI sq, sp = seqs.get(p);
1181 if ((q = ochains.indexOf(sp)) > -1)
1183 seqs.set(p, sq = matches.get(q));
1186 sq.setName(sp.getName());
1187 sq.setDescription(sp.getDescription());
1189 sq.transferAnnotation(sp,
1190 sp2sq = aligns.get(q).getMappingFromS1(false));
1191 aligs.add(aligns.get(q));
1193 for (int ap = 0; ap < annotations.size();)
1195 if (annotations.get(ap).sequenceRef == sp)
1201 if (removeOldAnnots)
1203 annotations.remove(ap);
1207 AlignmentAnnotation alan = annotations.remove(ap);
1208 alan.liftOver(sq, sp2sq);
1209 alan.setSequenceRef(sq);
1210 sq.addAlignmentAnnotation(alan);
1218 if (sq.getAnnotation() != null && sq.getAnnotation().length > 0)
1220 annotations.addAll(inspos == -1 ? annotations.size() : inspos,
1221 Arrays.asList(sq.getAnnotation()));
1226 return Arrays.asList(orig, repl, aligs);
1230 * compute the PID vector used by the redundancy filter.
1232 * @param originalSequences
1233 * - sequences in alignment that are to filtered
1235 * - null or strings to be analysed (typically, visible portion of
1236 * each sequence in alignment)
1238 * - first column in window for calculation
1240 * - last column in window for calculation
1242 * - if true then use ungapped sequence to compute PID
1243 * @return vector containing maximum PID for i-th sequence and any sequences
1244 * longer than that seuqence
1246 public static float[] computeRedundancyMatrix(
1247 SequenceI[] originalSequences, String[] omitHidden, int start,
1248 int end, boolean ungapped)
1250 int height = originalSequences.length;
1251 float[] redundancy = new float[height];
1252 int[] lngth = new int[height];
1253 for (int i = 0; i < height; i++)
1259 // long start = System.currentTimeMillis();
1261 SimilarityParams pidParams = new SimilarityParams(true, true, true,
1265 for (int i = 0; i < height; i++)
1268 for (int j = 0; j < i; j++)
1275 if (omitHidden == null)
1277 seqi = originalSequences[i].getSequenceAsString(start, end);
1278 seqj = originalSequences[j].getSequenceAsString(start, end);
1282 seqi = omitHidden[i];
1283 seqj = omitHidden[j];
1287 String ug = AlignSeq.extractGaps(Comparison.GapChars, seqi);
1288 lngth[i] = ug.length();
1296 String ug = AlignSeq.extractGaps(Comparison.GapChars, seqj);
1297 lngth[j] = ug.length();
1303 pid = (float) PIDModel.computePID(seqi, seqj, pidParams);
1305 // use real sequence length rather than string length
1306 if (lngth[j] < lngth[i])
1308 redundancy[j] = Math.max(pid, redundancy[j]);
1312 redundancy[i] = Math.max(pid, redundancy[i]);
1321 * calculate the mean score of the alignment
1322 * mean score is equal to the score of an alignmenet of two sequences with randomly shuffled AA sequence composited of the same AA as the two original sequences
1325 public void meanScore()
1327 int length = indelfreeAstr1.length(); //both have the same length
1328 //create HashMap for counting residues in each sequence
1329 HashMap<Character, Integer> seq1ResCount = new HashMap<Character, Integer>();
1330 HashMap<Character, Integer> seq2ResCount = new HashMap<Character, Integer>();
1332 // for both sequences (String indelfreeAstr1 or 2) create a key for the residue and add 1 each time its encountered
1333 for (char residue: indelfreeAstr1.toCharArray())
1335 seq1ResCount.putIfAbsent(residue, 0);
1336 seq1ResCount.replace(residue, seq1ResCount.get(residue) + 1);
1338 for (char residue: indelfreeAstr2.toCharArray())
1340 seq2ResCount.putIfAbsent(residue, 0);
1341 seq2ResCount.replace(residue, seq2ResCount.get(residue) + 1);
1344 // meanscore = for each residue pair get the number of appearance and add (countA * countB * pairwiseScore(AB))
1345 // divide the meanscore by the sequence length afterwards
1346 float _meanscore = 0;
1347 for (char resA : seq1ResCount.keySet())
1349 for (char resB : seq2ResCount.keySet())
1351 int countA = seq1ResCount.get(resA);
1352 int countB = seq2ResCount.get(resB);
1354 float scoreAB = scoreMatrix.getPairwiseScore(resA, resB);
1356 _meanscore += countA * countB * scoreAB;
1359 _meanscore /= length;
1360 this.meanScore = _meanscore;
1363 public float getMeanScore()
1365 return this.meanScore;
1369 * calculate the hypothetic max score using the self-alignment of the sequences
1371 public void hypotheticMaxScore()
1375 for (char residue: indelfreeAstr1.toCharArray())
1377 _hmsA += scoreMatrix.getPairwiseScore(residue, residue);
1379 for (char residue: indelfreeAstr2.toCharArray())
1381 _hmsB += scoreMatrix.getPairwiseScore(residue, residue);
1383 this.hypotheticMaxScore = (_hmsA < _hmsB) ? _hmsA : _hmsB; // take the lower self alignment
1387 public int getHypotheticMaxScore()
1389 return this.hypotheticMaxScore;
1393 * create strings based of astr1 and astr2 but without gaps
1395 public void getIndelfreeAstr()
1397 int n = astr1.length(); // both have the same length
1398 for (int i = 0; i < n; i++)
1400 if (Character.isLetter(astr1.charAt(i)) && Character.isLetter(astr2.charAt(i))) // if both sequences dont have a gap -> add to indelfreeAstr
1402 this.indelfreeAstr1 += astr1.charAt(i);
1403 this.indelfreeAstr2 += astr2.charAt(i);
1409 * calculates the overall score of the alignment
1410 * preprescore = sum of all scores - all penalties
1411 * if preprescore < 1 ~ alignmentScore = Float.NaN >
1412 * alignmentScore = ((preprescore - meanScore) / (hypotheticMaxScore - meanScore)) * coverage
1414 public void scoreAlignment()
1419 hypotheticMaxScore();
1420 // cannot calculate score because denominator would be zero
1421 if (this.hypotheticMaxScore == this.meanScore)
1423 this.alignmentScore = Float.NaN;
1426 int n = indelfreeAstr1.length();
1429 boolean aGapOpen = false;
1430 boolean bGapOpen = false;
1431 for (int i = 0; i < n; i++)
1433 char char1 = indelfreeAstr1.charAt(i);
1434 char char2 = indelfreeAstr2.charAt(i);
1435 boolean aIsLetter = Character.isLetter(char1);
1436 boolean bIsLetter = Character.isLetter(char2);
1437 if (aIsLetter && bIsLetter) // if pair -> get score
1439 score += scoreMatrix.getPairwiseScore(char1, char2);
1440 } else if (!aIsLetter && !bIsLetter) { // both are gap -> skip
1441 } else if ((!aIsLetter && aGapOpen) || (!bIsLetter && bGapOpen)) { // one side gapopen -> score - gap_extend
1442 score -= GAP_EXTEND_COST;
1443 } else { // no gap open -> score - gap_open
1444 score -= GAP_OPEN_COST;
1446 // adjust GapOpen status in both sequences
1447 aGapOpen = (!aIsLetter) ? true : false;
1448 bGapOpen = (!bIsLetter) ? true : false;
1451 float preprescore = score; // if this score < 1 --> alignment score = Float.NaN
1452 score = (score - this.meanScore) / (this.hypotheticMaxScore - this.meanScore);
1453 int[] _max = MiscMath.findMax(new int[]{astr1.replace("-","").length(), astr2.replace("-","").length()}); // {index of max, max}
1454 float coverage = (float) n / (float) _max[1]; // indelfreeAstr length / longest sequence length
1455 float prescore = score; // only debug
1458 //System.out.println(String.format("prepre-score: %f, pre-score: %f, longlength: %d\nscore: %1.16f, mean: %f, max: %d", preprescore, prescore, _max[1], score, this.meanScore, this.hypotheticMaxScore));
1459 float minScore = 1f;
1460 this.alignmentScore = (preprescore < minScore) ? Float.NaN : score;