2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import jalview.analysis.scoremodels.PIDModel;
24 import jalview.analysis.scoremodels.ScoreMatrix;
25 import jalview.analysis.scoremodels.ScoreModels;
26 import jalview.analysis.scoremodels.SimilarityParams;
27 import jalview.datamodel.AlignmentAnnotation;
28 import jalview.datamodel.AlignmentI;
29 import jalview.datamodel.Mapping;
30 import jalview.datamodel.Sequence;
31 import jalview.datamodel.SequenceI;
32 import jalview.math.MiscMath;
33 import jalview.util.Comparison;
34 import jalview.util.Format;
35 import jalview.util.MapList;
36 import jalview.util.MessageManager;
38 import java.awt.Color;
39 import java.awt.Graphics;
40 import java.io.PrintStream;
41 import java.lang.IllegalArgumentException;
42 import java.util.ArrayList;
43 import java.util.Arrays;
44 import java.util.HashMap;
45 import java.util.List;
46 import java.util.StringTokenizer;
47 import java.util.Locale;
55 private static final int MAX_NAME_LENGTH = 30;
57 private static final int DEFAULT_OPENCOST = 120;
59 private static final int DEFAULT_EXTENDCOST = 20;
61 private int GAP_OPEN_COST=DEFAULT_OPENCOST;
63 private int GAP_EXTEND_COST=DEFAULT_EXTENDCOST;
65 private static final int GAP_INDEX = -1;
67 public static final String PEP = "pep";
69 public static final String DNA = "dna";
71 private static final String NEWLINE = System.lineSeparator();
81 int[][] traceback; // todo is this actually used?
103 public String astr1 = "";
105 public String astr2 = "";
107 public String indelfreeAstr1 = "";
109 public String indelfreeAstr2 = "";
112 public int seq1start;
118 public int seq2start;
124 public float maxscore;
126 public float meanScore; //needed for PaSiMap
128 public int hypotheticMaxScore; // needed for PaSiMap
132 StringBuffer output = new StringBuffer();
134 String type; // AlignSeq.PEP or AlignSeq.DNA
136 private ScoreMatrix scoreMatrix;
139 * Creates a new AlignSeq object.
142 * first sequence for alignment
144 * second sequence for alignment
146 * molecule type, either AlignSeq.PEP or AlignSeq.DNA
148 public AlignSeq(int opencost, int extcost)
150 GAP_OPEN_COST = opencost;
151 GAP_EXTEND_COST = extcost;
153 public AlignSeq(SequenceI s1, SequenceI s2, String type)
155 seqInit(s1, s1.getSequenceAsString(), s2, s2.getSequenceAsString(),
160 * Creates a new AlignSeq object.
163 * s1 reference sequence for string1
165 * s2 reference sequence for string2
167 * molecule type, either AlignSeq.PEP or AlignSeq.DNA
169 public AlignSeq(SequenceI s1, String string1, SequenceI s2,
170 String string2, String type)
172 seqInit(s1, string1.toUpperCase(Locale.ROOT), s2,
173 string2.toUpperCase(Locale.ROOT), type);
176 public AlignSeq(SequenceI s1, SequenceI s2, String type, int opencost,
180 GAP_OPEN_COST=opencost;
181 GAP_EXTEND_COST=extcost;
184 public AlignSeq(SequenceI s12, String string1, SequenceI s22,
185 String string2, String type2, int defaultOpencost,
186 int defaultExtendcost)
188 this(s12,string1,s22,string2,type2);
189 GAP_OPEN_COST=defaultOpencost;
190 GAP_EXTEND_COST=defaultExtendcost;
196 * @return DOCUMENT ME!
198 public float getMaxScore()
204 * returns the overall score of the alignment
208 public float getAlignmentScore()
210 return alignmentScore;
216 * @return DOCUMENT ME!
218 public int getSeq2Start()
226 * @return DOCUMENT ME!
228 public int getSeq2End()
236 * @return DOCUMENT ME!
238 public int getSeq1Start()
246 * @return DOCUMENT ME!
248 public int getSeq1End()
256 * @return DOCUMENT ME!
258 public String getOutput()
260 return output.toString();
266 * @return DOCUMENT ME!
268 public String getAStr1()
276 * @return DOCUMENT ME!
278 public String getAStr2()
286 * @return DOCUMENT ME!
288 public int[] getASeq1()
296 * @return DOCUMENT ME!
298 public int[] getASeq2()
305 * @return aligned instance of Seq 1
307 public SequenceI getAlignedSeq1()
309 SequenceI alSeq1 = new Sequence(s1.getName(), getAStr1());
310 alSeq1.setStart(s1.getStart() + getSeq1Start() - 1);
311 alSeq1.setEnd(s1.getStart() + getSeq1End() - 1);
312 alSeq1.setDatasetSequence(
313 s1.getDatasetSequence() == null ? s1 : s1.getDatasetSequence());
319 * @return aligned instance of Seq 2
321 public SequenceI getAlignedSeq2()
323 SequenceI alSeq2 = new Sequence(s2.getName(), getAStr2());
324 alSeq2.setStart(s2.getStart() + getSeq2Start() - 1);
325 alSeq2.setEnd(s2.getStart() + getSeq2End() - 1);
326 alSeq2.setDatasetSequence(
327 s2.getDatasetSequence() == null ? s2 : s2.getDatasetSequence());
332 * Construct score matrix for sequences with standard DNA or PEPTIDE matrix
337 * - string to use for s1
341 * - string to use for s2
345 public void seqInit(SequenceI s1, String string1, SequenceI s2,
346 String string2, String type)
348 seqInit(s1,string1,s2,string2,type,GAP_OPEN_COST,GAP_EXTEND_COST);
350 public void seqInit(SequenceI s1, String string1, SequenceI s2,
351 String string2, String type, int opening,int extension)
353 GAP_OPEN_COST=opening;
354 GAP_EXTEND_COST=extension;
357 setDefaultParams(type);
358 seqInit(string1, string2);
362 * construct score matrix for string1 and string2 (after removing any existing
368 private void seqInit(String string1, String string2)
370 s1str = extractGaps(jalview.util.Comparison.GapChars, string1);
371 s2str = extractGaps(jalview.util.Comparison.GapChars, string2);
373 if (s1str.length() == 0 || s2str.length() == 0)
376 "ALL GAPS: " + (s1str.length() == 0 ? s1.getName() : " ")
377 + (s2str.length() == 0 ? s2.getName() : ""));
381 score = new float[s1str.length()][s2str.length()];
383 E = new float[s1str.length()][s2str.length()];
385 F = new float[s1str.length()][s2str.length()];
386 traceback = new int[s1str.length()][s2str.length()];
388 seq1 = indexEncode(s1str);
390 seq2 = indexEncode(s2str);
393 private void setDefaultParams(String moleculeType)
395 if (!PEP.equals(moleculeType) && !DNA.equals(moleculeType))
397 output.append("Wrong type = dna or pep only");
398 throw new Error(MessageManager
399 .formatMessage("error.unknown_type_dna_or_pep", new String[]
404 scoreMatrix = ScoreModels.getInstance()
405 .getDefaultModel(PEP.equals(type));
411 public void traceAlignment()
413 // Find the maximum score along the rhs or bottom row
414 float max = -Float.MAX_VALUE;
416 for (int i = 0; i < seq1.length; i++)
418 if (score[i][seq2.length - 1] > max)
420 max = score[i][seq2.length - 1];
422 maxj = seq2.length - 1;
426 for (int j = 0; j < seq2.length; j++)
428 if (score[seq1.length - 1][j] > max)
430 max = score[seq1.length - 1][j];
431 maxi = seq1.length - 1;
439 maxscore = score[i][j] / 10f;
442 aseq1 = new int[seq1.length + seq2.length];
443 aseq2 = new int[seq1.length + seq2.length];
445 StringBuilder sb1 = new StringBuilder(aseq1.length);
446 StringBuilder sb2 = new StringBuilder(aseq2.length);
448 count = (seq1.length + seq2.length) - 1;
451 while (i > 0 && j > 0)
453 aseq1[count] = seq1[i];
454 sb1.append(s1str.charAt(i));
455 aseq2[count] = seq2[j];
456 sb2.append(s2str.charAt(j));
458 trace = findTrace(i, j);
468 aseq1[count] = GAP_INDEX;
469 sb1.replace(sb1.length() - 1, sb1.length(), "-");
471 else if (trace == -1)
474 aseq2[count] = GAP_INDEX;
475 sb2.replace(sb2.length() - 1, sb2.length(), "-");
484 if (aseq1[count] != GAP_INDEX)
486 aseq1[count] = seq1[i];
487 sb1.append(s1str.charAt(i));
490 if (aseq2[count] != GAP_INDEX)
492 aseq2[count] = seq2[j];
493 sb2.append(s2str.charAt(j));
498 * we built the character strings backwards, so now
499 * reverse them to convert to sequence strings
501 astr1 = sb1.reverse().toString();
502 astr2 = sb2.reverse().toString();
508 public void traceAlignmentWithEndGaps()
510 // Find the maximum score along the rhs or bottom row
511 float max = -Float.MAX_VALUE;
513 for (int i = 0; i < seq1.length; i++)
515 if (score[i][seq2.length - 1] > max)
517 max = score[i][seq2.length - 1];
519 maxj = seq2.length - 1;
523 for (int j = 0; j < seq2.length; j++)
525 if (score[seq1.length - 1][j] > max)
527 max = score[seq1.length - 1][j];
528 maxi = seq1.length - 1;
536 maxscore = score[i][j] / 10f;
538 //prepare trailing gaps
539 while ((i < seq1.length - 1) || (j < seq2.length - 1))
547 aseq1 = new int[seq1.length + seq2.length];
548 aseq2 = new int[seq1.length + seq2.length];
550 StringBuilder sb1 = new StringBuilder(aseq1.length);
551 StringBuilder sb2 = new StringBuilder(aseq2.length);
553 count = (seq1.length + seq2.length) - 1;
556 while ((i >= seq1.length) || (j >= seq2.length))
558 if (i >= seq1.length)
560 aseq1[count] = GAP_INDEX;
562 aseq2[count] = seq2[j];
563 sb2.append(s2str.charAt(j));
564 } else if (j >= seq2.length) {
565 aseq1[count] = seq1[i];
566 sb1.append(s1str.charAt(i));
567 aseq2[count] = GAP_INDEX;
574 while (i > 0 && j > 0)
576 aseq1[count] = seq1[i];
577 sb1.append(s1str.charAt(i));
578 aseq2[count] = seq2[j];
579 sb2.append(s2str.charAt(j));
581 trace = findTrace(i, j);
591 aseq1[count] = GAP_INDEX;
592 sb1.replace(sb1.length() - 1, sb1.length(), "-");
594 else if (trace == -1)
597 aseq2[count] = GAP_INDEX;
598 sb2.replace(sb2.length() - 1, sb2.length(), "-");
607 aseq1[count] = seq1[i];
608 sb1.append(s1str.charAt(i));
609 aseq2[count] = seq2[j];
610 sb2.append(s2str.charAt(j));
613 while (j > 0 || i > 0)
619 sb2.append(s2str.charAt(j));
622 sb1.append(s1str.charAt(i));
628 * we built the character strings backwards, so now
629 * reverse them to convert to sequence strings
631 astr1 = sb1.reverse().toString();
632 astr2 = sb2.reverse().toString();
638 public void printAlignment(PrintStream os)
640 // TODO: Use original sequence characters rather than re-translated
641 // characters in output
642 // Find the biggest id length for formatting purposes
643 String s1id = getAlignedSeq1().getDisplayId(true);
644 String s2id = getAlignedSeq2().getDisplayId(true);
645 int nameLength = Math.max(s1id.length(), s2id.length());
646 if (nameLength > MAX_NAME_LENGTH)
648 int truncateBy = nameLength - MAX_NAME_LENGTH;
649 nameLength = MAX_NAME_LENGTH;
650 // JAL-527 - truncate the sequence ids
651 if (s1id.length() > nameLength)
653 int slashPos = s1id.lastIndexOf('/');
654 s1id = s1id.substring(0, slashPos - truncateBy)
655 + s1id.substring(slashPos);
657 if (s2id.length() > nameLength)
659 int slashPos = s2id.lastIndexOf('/');
660 s2id = s2id.substring(0, slashPos - truncateBy)
661 + s2id.substring(slashPos);
664 int len = 72 - nameLength - 1;
665 int nochunks = ((aseq1.length - count) / len)
666 + ((aseq1.length - count) % len > 0 ? 1 : 0);
669 output.append("Score = ").append(score[maxi][maxj]).append(NEWLINE);
670 output.append("Length of alignment = ")
671 .append(String.valueOf(aseq1.length - count)).append(NEWLINE);
672 output.append("Sequence ");
673 Format nameFormat = new Format("%" + nameLength + "s");
674 output.append(nameFormat.form(s1id));
675 output.append(" (Sequence length = ")
676 .append(String.valueOf(s1str.length())).append(")")
678 output.append("Sequence ");
679 output.append(nameFormat.form(s2id));
680 output.append(" (Sequence length = ")
681 .append(String.valueOf(s2str.length())).append(")")
682 .append(NEWLINE).append(NEWLINE);
684 ScoreMatrix pam250 = ScoreModels.getInstance().getPam250();
686 for (int j = 0; j < nochunks; j++)
688 // Print the first aligned sequence
689 output.append(nameFormat.form(s1id)).append(" ");
691 for (int i = 0; i < len; i++)
693 if ((i + (j * len)) < astr1.length())
695 output.append(astr1.charAt(i + (j * len)));
699 output.append(NEWLINE);
700 output.append(nameFormat.form(" ")).append(" ");
703 * Print out the match symbols:
704 * | for exact match (ignoring case)
705 * . if PAM250 score is positive
708 for (int i = 0; i < len; i++)
710 if ((i + (j * len)) < astr1.length())
712 char c1 = astr1.charAt(i + (j * len));
713 char c2 = astr2.charAt(i + (j * len));
714 boolean sameChar = Comparison.isSameResidue(c1, c2, false);
715 if (sameChar && !Comparison.isGap(c1))
720 else if (PEP.equals(type))
722 if (pam250.getPairwiseScore(c1, c2) > 0)
738 // Now print the second aligned sequence
739 output = output.append(NEWLINE);
740 output = output.append(nameFormat.form(s2id)).append(" ");
742 for (int i = 0; i < len; i++)
744 if ((i + (j * len)) < astr2.length())
746 output.append(astr2.charAt(i + (j * len)));
750 output.append(NEWLINE).append(NEWLINE);
753 pid = pid / (aseq1.length - count) * 100;
754 output.append(new Format("Percentage ID = %3.2f\n").form(pid));
755 output.append(NEWLINE);
758 os.print(output.toString());
759 } catch (Exception ex)
772 * @return DOCUMENT ME!
774 public int findTrace(int i, int j)
777 float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(i),
779 float max = score[i - 1][j - 1] + (pairwiseScore * 10);
786 else if (F[i][j] == max)
800 else if (E[i][j] == max)
817 public void calcScoreMatrix()
821 final int GAP_EX_COST=GAP_EXTEND_COST;
822 final int GAP_OP_COST = GAP_OPEN_COST;
823 // top left hand element
824 score[0][0] = scoreMatrix.getPairwiseScore(s1str.charAt(0),
825 s2str.charAt(0)) * 10;
826 E[0][0] = -GAP_EX_COST;
829 // Calculate the top row first
830 for (int j = 1; j < m; j++)
832 // What should these values be? 0 maybe
833 E[0][j] = max(score[0][j - 1] - GAP_OP_COST,
834 E[0][j - 1] - GAP_EX_COST);
835 F[0][j] = -GAP_EX_COST;
837 float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(0),
839 score[0][j] = max(pairwiseScore * 10, -GAP_OP_COST,
845 // Now do the left hand column
846 for (int i = 1; i < n; i++)
848 E[i][0] = -GAP_OP_COST;
849 F[i][0] = max(score[i - 1][0] - GAP_OP_COST,
850 F[i - 1][0] - GAP_EX_COST);
852 float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(i),
854 score[i][0] = max(pairwiseScore * 10, E[i][0], F[i][0]);
855 traceback[i][0] = -1;
858 // Now do all the other rows
859 for (int i = 1; i < n; i++)
861 for (int j = 1; j < m; j++)
863 E[i][j] = max(score[i][j - 1] - GAP_OP_COST,
864 E[i][j - 1] - GAP_EX_COST);
865 F[i][j] = max(score[i - 1][j] - GAP_OP_COST,
866 F[i - 1][j] - GAP_EX_COST);
868 float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(i),
870 score[i][j] = max(score[i - 1][j - 1] + (pairwiseScore * 10),
872 traceback[i][j] = findTrace(i, j);
878 * Returns the given sequence with all of the given gap characters removed.
881 * a string of characters to be treated as gaps
887 public static String extractGaps(String gapChars, String seq)
889 if (gapChars == null || seq == null)
893 StringTokenizer str = new StringTokenizer(seq, gapChars);
894 StringBuilder newString = new StringBuilder(seq.length());
896 while (str.hasMoreTokens())
898 newString.append(str.nextToken());
901 return newString.toString();
914 * @return DOCUMENT ME!
916 private static float max(float f1, float f2, float f3)
941 * @return DOCUMENT ME!
943 private static float max(float f1, float f2)
956 * Converts the character string to an array of integers which are the
957 * corresponding indices to the characters in the score matrix
963 int[] indexEncode(String s)
965 int[] encoded = new int[s.length()];
967 for (int i = 0; i < s.length(); i++)
969 char c = s.charAt(i);
970 encoded[i] = scoreMatrix.getMatrixIndex(c);
990 public static void displayMatrix(Graphics g, int[][] mat, int n, int m,
993 // TODO method doesn't seem to be referenced anywhere delete??
997 for (int i = 0; i < n; i++)
999 for (int j = 0; j < m; j++)
1001 if (mat[i][j] >= max)
1006 if (mat[i][j] <= min)
1013 jalview.bin.Console.outPrintln(max + " " + min);
1015 for (int i = 0; i < n; i++)
1017 for (int j = 0; j < m; j++)
1022 // jalview.bin.Console.outPrintln(mat[i][j]);
1023 float score = (float) (mat[i][j] - min) / (float) (max - min);
1024 g.setColor(new Color(score, 0, 0));
1025 g.fillRect(x, y, psize, psize);
1027 // jalview.bin.Console.outPrintln(x + " " + y + " " + score);
1033 * Compute a globally optimal needleman and wunsch alignment between two
1039 * AlignSeq.DNA or AlignSeq.PEP
1041 public static AlignSeq doGlobalNWAlignment(SequenceI s1, SequenceI s2,
1044 return doGlobalNWAlignment(s1, s2, type, DEFAULT_OPENCOST,DEFAULT_EXTENDCOST);
1046 public static AlignSeq doGlobalNWAlignment(SequenceI s1, SequenceI s2,
1047 String type, int opencost,int extcost)
1050 AlignSeq as = new AlignSeq(s1, s2, type,opencost,extcost);
1052 as.calcScoreMatrix();
1053 as.traceAlignment();
1059 * @return mapping from positions in S1 to corresponding positions in S2
1061 public jalview.datamodel.Mapping getMappingFromS1(boolean allowmismatch)
1063 ArrayList<Integer> as1 = new ArrayList<Integer>(),
1064 as2 = new ArrayList<Integer>();
1065 int pdbpos = s2.getStart() + getSeq2Start() - 2;
1066 int alignpos = s1.getStart() + getSeq1Start() - 2;
1067 int lp2 = pdbpos - 3, lp1 = alignpos - 3;
1068 boolean lastmatch = false;
1069 // and now trace the alignment onto the atom set.
1070 for (int i = 0; i < astr1.length(); i++)
1072 char c1 = astr1.charAt(i), c2 = astr2.charAt(i);
1083 // ignore case differences
1084 if (allowmismatch || (c1 == c2) || (Math.abs(c2 - c1) == ('a' - 'A')))
1086 // extend mapping interval
1087 if (lp1 + 1 != alignpos || lp2 + 1 != pdbpos)
1089 as1.add(Integer.valueOf(alignpos));
1090 as2.add(Integer.valueOf(pdbpos));
1098 // extend mapping interval
1101 as1.add(Integer.valueOf(lp1));
1102 as2.add(Integer.valueOf(lp2));
1107 // construct range pairs
1109 int[] mapseq1 = new int[as1.size() + (lastmatch ? 1 : 0)],
1110 mapseq2 = new int[as2.size() + (lastmatch ? 1 : 0)];
1112 for (Integer ip : as1)
1118 for (Integer ip : as2)
1125 mapseq1[mapseq1.length - 1] = alignpos;
1126 mapseq2[mapseq2.length - 1] = pdbpos;
1128 MapList map = new MapList(mapseq1, mapseq2, 1, 1);
1130 jalview.datamodel.Mapping mapping = new Mapping(map);
1136 * matches ochains against al and populates seqs with the best match between
1137 * each ochain and the set in al
1141 * @param dnaOrProtein
1142 * @param removeOldAnnots
1143 * when true, old annotation is cleared before new annotation
1145 * @return List<List<SequenceI> originals, List<SequenceI> replacement,
1146 * List<AlignSeq> alignment between each>
1148 public static List<List<? extends Object>> replaceMatchingSeqsWith(
1149 List<SequenceI> seqs, List<AlignmentAnnotation> annotations,
1150 List<SequenceI> ochains, AlignmentI al, String dnaOrProtein,
1151 boolean removeOldAnnots)
1153 List<SequenceI> orig = new ArrayList<SequenceI>(),
1154 repl = new ArrayList<SequenceI>();
1155 List<AlignSeq> aligs = new ArrayList<AlignSeq>();
1156 if (al != null && al.getHeight() > 0)
1158 ArrayList<SequenceI> matches = new ArrayList<SequenceI>();
1159 ArrayList<AlignSeq> aligns = new ArrayList<AlignSeq>();
1161 for (SequenceI sq : ochains)
1163 SequenceI bestm = null;
1164 AlignSeq bestaseq = null;
1165 float bestscore = 0;
1166 for (SequenceI msq : al.getSequences())
1168 AlignSeq aseq = doGlobalNWAlignment(msq, sq, dnaOrProtein);
1169 if (bestm == null || aseq.getMaxScore() > bestscore)
1171 bestscore = aseq.getMaxScore();
1176 // jalview.bin.Console.outPrintln("Best Score for " + (matches.size() +
1180 aligns.add(bestaseq);
1181 al.deleteSequence(bestm);
1183 for (int p = 0, pSize = seqs.size(); p < pSize; p++)
1185 SequenceI sq, sp = seqs.get(p);
1187 if ((q = ochains.indexOf(sp)) > -1)
1189 seqs.set(p, sq = matches.get(q));
1192 sq.setName(sp.getName());
1193 sq.setDescription(sp.getDescription());
1195 sq.transferAnnotation(sp,
1196 sp2sq = aligns.get(q).getMappingFromS1(false));
1197 aligs.add(aligns.get(q));
1199 for (int ap = 0; ap < annotations.size();)
1201 if (annotations.get(ap).sequenceRef == sp)
1207 if (removeOldAnnots)
1209 annotations.remove(ap);
1213 AlignmentAnnotation alan = annotations.remove(ap);
1214 alan.liftOver(sq, sp2sq);
1215 alan.setSequenceRef(sq);
1216 sq.addAlignmentAnnotation(alan);
1224 if (sq.getAnnotation() != null && sq.getAnnotation().length > 0)
1226 annotations.addAll(inspos == -1 ? annotations.size() : inspos,
1227 Arrays.asList(sq.getAnnotation()));
1232 return Arrays.asList(orig, repl, aligs);
1236 * compute the PID vector used by the redundancy filter.
1238 * @param originalSequences
1239 * - sequences in alignment that are to filtered
1241 * - null or strings to be analysed (typically, visible portion of
1242 * each sequence in alignment)
1244 * - first column in window for calculation
1246 * - last column in window for calculation
1248 * - if true then use ungapped sequence to compute PID
1249 * @return vector containing maximum PID for i-th sequence and any sequences
1250 * longer than that seuqence
1252 public static float[] computeRedundancyMatrix(
1253 SequenceI[] originalSequences, String[] omitHidden, int start,
1254 int end, boolean ungapped)
1256 int height = originalSequences.length;
1257 float[] redundancy = new float[height];
1258 int[] lngth = new int[height];
1259 for (int i = 0; i < height; i++)
1265 // long start = System.currentTimeMillis();
1267 SimilarityParams pidParams = new SimilarityParams(true, true, true,
1271 for (int i = 0; i < height; i++)
1274 for (int j = 0; j < i; j++)
1281 if (omitHidden == null)
1283 seqi = originalSequences[i].getSequenceAsString(start, end);
1284 seqj = originalSequences[j].getSequenceAsString(start, end);
1288 seqi = omitHidden[i];
1289 seqj = omitHidden[j];
1293 String ug = AlignSeq.extractGaps(Comparison.GapChars, seqi);
1294 lngth[i] = ug.length();
1302 String ug = AlignSeq.extractGaps(Comparison.GapChars, seqj);
1303 lngth[j] = ug.length();
1309 pid = (float) PIDModel.computePID(seqi, seqj, pidParams);
1311 // use real sequence length rather than string length
1312 if (lngth[j] < lngth[i])
1314 redundancy[j] = Math.max(pid, redundancy[j]);
1318 redundancy[i] = Math.max(pid, redundancy[i]);
1327 * calculate the mean score of the alignment
1328 * mean score is equal to the score of an alignmenet of two sequences with randomly shuffled AA sequence composited of the same AA as the two original sequences
1331 public void meanScore()
1333 int length = indelfreeAstr1.length(); //both have the same length
1334 //create HashMap for counting residues in each sequence
1335 HashMap<Character, Integer> seq1ResCount = new HashMap<Character, Integer>();
1336 HashMap<Character, Integer> seq2ResCount = new HashMap<Character, Integer>();
1338 // for both sequences (String indelfreeAstr1 or 2) create a key for the residue and add 1 each time its encountered
1339 for (char residue: indelfreeAstr1.toCharArray())
1341 seq1ResCount.putIfAbsent(residue, 0);
1342 seq1ResCount.replace(residue, seq1ResCount.get(residue) + 1);
1344 for (char residue: indelfreeAstr2.toCharArray())
1346 seq2ResCount.putIfAbsent(residue, 0);
1347 seq2ResCount.replace(residue, seq2ResCount.get(residue) + 1);
1350 // meanscore = for each residue pair get the number of appearance and add (countA * countB * pairwiseScore(AB))
1351 // divide the meanscore by the sequence length afterwards
1352 float _meanscore = 0;
1353 for (char resA : seq1ResCount.keySet())
1355 for (char resB : seq2ResCount.keySet())
1357 int countA = seq1ResCount.get(resA);
1358 int countB = seq2ResCount.get(resB);
1360 float scoreAB = scoreMatrix.getPairwiseScore(resA, resB);
1362 _meanscore += countA * countB * scoreAB;
1365 _meanscore /= length;
1366 this.meanScore = _meanscore;
1369 public float getMeanScore()
1371 return this.meanScore;
1375 * calculate the hypothetic max score using the self-alignment of the sequences
1377 public void hypotheticMaxScore()
1381 for (char residue: indelfreeAstr1.toCharArray())
1383 _hmsA += scoreMatrix.getPairwiseScore(residue, residue);
1385 for (char residue: indelfreeAstr2.toCharArray())
1387 _hmsB += scoreMatrix.getPairwiseScore(residue, residue);
1389 this.hypotheticMaxScore = (_hmsA < _hmsB) ? _hmsA : _hmsB; // take the lower self alignment
1393 public int getHypotheticMaxScore()
1395 return this.hypotheticMaxScore;
1399 * create strings based of astr1 and astr2 but without gaps
1401 public void getIndelfreeAstr()
1403 int n = astr1.length(); // both have the same length
1404 for (int i = 0; i < n; i++)
1406 if (Character.isLetter(astr1.charAt(i)) && Character.isLetter(astr2.charAt(i))) // if both sequences dont have a gap -> add to indelfreeAstr
1408 this.indelfreeAstr1 += astr1.charAt(i);
1409 this.indelfreeAstr2 += astr2.charAt(i);
1415 * calculates the overall score of the alignment
1416 * preprescore = sum of all scores - all penalties
1417 * if preprescore < 1 ~ alignmentScore = Float.NaN >
1418 * alignmentScore = ((preprescore - meanScore) / (hypotheticMaxScore - meanScore)) * coverage
1420 public void scoreAlignment()
1425 hypotheticMaxScore();
1426 // cannot calculate score because denominator would be zero
1427 if (this.hypotheticMaxScore == this.meanScore)
1429 this.alignmentScore = Float.NaN;
1432 int n = indelfreeAstr1.length();
1435 boolean aGapOpen = false;
1436 boolean bGapOpen = false;
1437 for (int i = 0; i < n; i++)
1439 char char1 = indelfreeAstr1.charAt(i);
1440 char char2 = indelfreeAstr2.charAt(i);
1441 boolean aIsLetter = Character.isLetter(char1);
1442 boolean bIsLetter = Character.isLetter(char2);
1443 if (aIsLetter && bIsLetter) // if pair -> get score
1445 score += scoreMatrix.getPairwiseScore(char1, char2);
1446 } else if (!aIsLetter && !bIsLetter) { // both are gap -> skip
1447 } else if ((!aIsLetter && aGapOpen) || (!bIsLetter && bGapOpen)) { // one side gapopen -> score - gap_extend
1448 score -= GAP_EXTEND_COST;
1449 } else { // no gap open -> score - gap_open
1450 score -= GAP_OPEN_COST;
1452 // adjust GapOpen status in both sequences
1453 aGapOpen = (!aIsLetter) ? true : false;
1454 bGapOpen = (!bIsLetter) ? true : false;
1457 float preprescore = score; // if this score < 1 --> alignment score = Float.NaN
1458 score = (score - this.meanScore) / (this.hypotheticMaxScore - this.meanScore);
1459 int[] _max = MiscMath.findMax(new int[]{astr1.replace("-","").length(), astr2.replace("-","").length()}); // {index of max, max}
1460 float coverage = (float) n / (float) _max[1]; // indelfreeAstr length / longest sequence length
1461 float prescore = score; // only debug
1464 //System.out.println(String.format("prepre-score: %f, pre-score: %f, longlength: %d\nscore: %1.16f, mean: %f, max: %d", preprescore, prescore, _max[1], score, this.meanScore, this.hypotheticMaxScore));
1465 float minScore = 0f;
1466 this.alignmentScore = (score <= minScore) ? Float.NaN : score;
1472 alignmentScore = 0f;
1475 traceback = null; // todo is this actually used?
1488 indelfreeAstr1 = "";
1489 indelfreeAstr2 = "";
1496 meanScore = 0f; //needed for PaSiMap
1497 hypotheticMaxScore = 0; // needed for PaSiMap
1499 StringBuffer output = new StringBuffer();
1500 String type = null; // AlignSeq.PEP or AlignSeq.DNA