2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import jalview.analysis.scoremodels.PIDModel;
24 import jalview.analysis.scoremodels.ScoreMatrix;
25 import jalview.analysis.scoremodels.ScoreModels;
26 import jalview.analysis.scoremodels.SimilarityParams;
27 import jalview.api.analysis.SimilarityParamsI;
28 import jalview.datamodel.AlignmentAnnotation;
29 import jalview.datamodel.AlignmentI;
30 import jalview.datamodel.Mapping;
31 import jalview.datamodel.Sequence;
32 import jalview.datamodel.SequenceI;
33 import jalview.math.MiscMath;
34 import jalview.util.Comparison;
35 import jalview.util.Format;
36 import jalview.util.MapList;
37 import jalview.util.MessageManager;
39 import java.awt.Color;
40 import java.awt.Graphics;
41 import java.io.PrintStream;
42 import java.lang.IllegalArgumentException;
43 import java.util.ArrayList;
44 import java.util.Arrays;
45 import java.util.HashMap;
46 import java.util.List;
47 import java.util.StringTokenizer;
48 import java.util.Locale;
56 private static final int MAX_NAME_LENGTH = 30;
58 private static final int DEFAULT_OPENCOST = 120;
60 private static final int DEFAULT_EXTENDCOST = 20;
62 private int GAP_OPEN_COST=DEFAULT_OPENCOST;
64 private int GAP_EXTEND_COST=DEFAULT_EXTENDCOST;
66 private static final int GAP_INDEX = -1;
68 public static final String PEP = "pep";
70 public static final String DNA = "dna";
72 private static final String NEWLINE = System.lineSeparator();
82 int[][] traceback; // todo is this actually used?
105 * matches in alignment
109 public String astr1 = "";
111 public String astr2 = "";
113 public String indelfreeAstr1 = "";
115 public String indelfreeAstr2 = "";
118 public int seq1start;
124 public int seq2start;
130 public float maxscore;
132 public float meanScore; //needed for PaSiMap
134 public int hypotheticMaxScore; // needed for PaSiMap
138 StringBuffer output = new StringBuffer();
140 String type; // AlignSeq.PEP or AlignSeq.DNA
142 private ScoreMatrix scoreMatrix;
145 * Creates a new AlignSeq object.
148 * first sequence for alignment
150 * second sequence for alignment
152 * molecule type, either AlignSeq.PEP or AlignSeq.DNA
154 public AlignSeq(int opencost, int extcost)
156 GAP_OPEN_COST = opencost;
157 GAP_EXTEND_COST = extcost;
159 public AlignSeq(SequenceI s1, SequenceI s2, String type)
161 seqInit(s1, s1.getSequenceAsString(), s2, s2.getSequenceAsString(),
166 * Creates a new AlignSeq object.
169 * s1 reference sequence for string1
171 * s2 reference sequence for string2
173 * molecule type, either AlignSeq.PEP or AlignSeq.DNA
175 public AlignSeq(SequenceI s1, String string1, SequenceI s2,
176 String string2, String type)
178 seqInit(s1, string1.toUpperCase(Locale.ROOT), s2,
179 string2.toUpperCase(Locale.ROOT), type);
182 public AlignSeq(SequenceI s1, SequenceI s2, String type, int opencost,
186 GAP_OPEN_COST=opencost;
187 GAP_EXTEND_COST=extcost;
190 public AlignSeq(SequenceI s12, String string1, SequenceI s22,
191 String string2, String type2, int defaultOpencost,
192 int defaultExtendcost)
194 this(s12,string1,s22,string2,type2);
195 GAP_OPEN_COST=defaultOpencost;
196 GAP_EXTEND_COST=defaultExtendcost;
202 * @return DOCUMENT ME!
204 public float getMaxScore()
210 * returns the overall score of the alignment
214 public float getAlignmentScore()
216 return alignmentScore;
222 * @return DOCUMENT ME!
224 public int getSeq2Start()
232 * @return DOCUMENT ME!
234 public int getSeq2End()
242 * @return DOCUMENT ME!
244 public int getSeq1Start()
252 * @return DOCUMENT ME!
254 public int getSeq1End()
262 * @return DOCUMENT ME!
264 public String getOutput()
266 return output.toString();
272 * @return DOCUMENT ME!
274 public String getAStr1()
282 * @return DOCUMENT ME!
284 public String getAStr2()
292 * @return DOCUMENT ME!
294 public int[] getASeq1()
302 * @return DOCUMENT ME!
304 public int[] getASeq2()
311 * @return aligned instance of Seq 1
313 public SequenceI getAlignedSeq1()
315 SequenceI alSeq1 = new Sequence(s1.getName(), getAStr1());
316 alSeq1.setStart(s1.getStart() + getSeq1Start() - 1);
317 alSeq1.setEnd(s1.getStart() + getSeq1End() - 1);
318 alSeq1.setDatasetSequence(
319 s1.getDatasetSequence() == null ? s1 : s1.getDatasetSequence());
325 * @return aligned instance of Seq 2
327 public SequenceI getAlignedSeq2()
329 SequenceI alSeq2 = new Sequence(s2.getName(), getAStr2());
330 alSeq2.setStart(s2.getStart() + getSeq2Start() - 1);
331 alSeq2.setEnd(s2.getStart() + getSeq2End() - 1);
332 alSeq2.setDatasetSequence(
333 s2.getDatasetSequence() == null ? s2 : s2.getDatasetSequence());
337 * fraction of seq2 matched in the alignment
338 * @return NaN or [0..1]
340 public double getS2Coverage()
344 return ((double)match)/((double)s2.getEnd()-s2.getStart()+1);
349 * fraction of seq1 matched in the alignment
350 * @return NaN or [0..1]
352 public double getS1Coverage()
356 return ((double)match)/((double)s1.getEnd()-s1.getStart()+1);
362 * Construct score matrix for sequences with standard DNA or PEPTIDE matrix
367 * - string to use for s1
371 * - string to use for s2
375 public void seqInit(SequenceI s1, String string1, SequenceI s2,
376 String string2, String type)
378 seqInit(s1,string1,s2,string2,type,GAP_OPEN_COST,GAP_EXTEND_COST);
380 public void seqInit(SequenceI s1, String string1, SequenceI s2,
381 String string2, String type, int opening,int extension)
383 GAP_OPEN_COST=opening;
384 GAP_EXTEND_COST=extension;
387 setDefaultParams(type);
388 seqInit(string1, string2);
392 * construct score matrix for string1 and string2 (after removing any existing
398 private void seqInit(String string1, String string2)
400 s1str = extractGaps(jalview.util.Comparison.GapChars, string1);
401 s2str = extractGaps(jalview.util.Comparison.GapChars, string2);
403 if (s1str.length() == 0 || s2str.length() == 0)
406 "ALL GAPS: " + (s1str.length() == 0 ? s1.getName() : " ")
407 + (s2str.length() == 0 ? s2.getName() : ""));
411 score = new float[s1str.length()][s2str.length()];
413 E = new float[s1str.length()][s2str.length()];
415 F = new float[s1str.length()][s2str.length()];
416 traceback = new int[s1str.length()][s2str.length()];
418 seq1 = indexEncode(s1str);
420 seq2 = indexEncode(s2str);
423 private void setDefaultParams(String moleculeType)
425 if (!PEP.equals(moleculeType) && !DNA.equals(moleculeType))
427 output.append("Wrong type = dna or pep only");
428 throw new Error(MessageManager
429 .formatMessage("error.unknown_type_dna_or_pep", new String[]
434 scoreMatrix = ScoreModels.getInstance()
435 .getDefaultModel(PEP.equals(type));
441 public void traceAlignment()
443 // Find the maximum score along the rhs or bottom row
444 float max = -Float.MAX_VALUE;
446 for (int i = 0; i < seq1.length; i++)
448 if (score[i][seq2.length - 1] > max)
450 max = score[i][seq2.length - 1];
452 maxj = seq2.length - 1;
456 for (int j = 0; j < seq2.length; j++)
458 if (score[seq1.length - 1][j] > max)
460 max = score[seq1.length - 1][j];
461 maxi = seq1.length - 1;
469 maxscore = score[i][j] / 10f;
474 aseq1 = new int[seq1.length + seq2.length];
475 aseq2 = new int[seq1.length + seq2.length];
477 StringBuilder sb1 = new StringBuilder(aseq1.length);
478 StringBuilder sb2 = new StringBuilder(aseq2.length);
480 count = (seq1.length + seq2.length) - 1;
483 while (i > 0 && j > 0)
485 aseq1[count] = seq1[i];
486 sb1.append(s1str.charAt(i));
487 aseq2[count] = seq2[j];
488 sb2.append(s2str.charAt(j));
489 trace = findTrace(i, j);
500 aseq1[count] = GAP_INDEX;
501 sb1.replace(sb1.length() - 1, sb1.length(), "-");
503 else if (trace == -1)
506 aseq2[count] = GAP_INDEX;
507 sb2.replace(sb2.length() - 1, sb2.length(), "-");
516 if (aseq1[count] != GAP_INDEX)
518 aseq1[count] = seq1[i];
519 sb1.append(s1str.charAt(i));
522 if (aseq2[count] != GAP_INDEX)
524 aseq2[count] = seq2[j];
525 sb2.append(s2str.charAt(j));
526 if (aseq1[count]!=GAP_INDEX) {
533 * we built the character strings backwards, so now
534 * reverse them to convert to sequence strings
536 astr1 = sb1.reverse().toString();
537 astr2 = sb2.reverse().toString();
543 public void traceAlignmentWithEndGaps()
545 // Find the maximum score along the rhs or bottom row
546 float max = -Float.MAX_VALUE;
548 for (int i = 0; i < seq1.length; i++)
550 if (score[i][seq2.length - 1] > max)
552 max = score[i][seq2.length - 1];
554 maxj = seq2.length - 1;
558 for (int j = 0; j < seq2.length; j++)
560 if (score[seq1.length - 1][j] > max)
562 max = score[seq1.length - 1][j];
563 maxi = seq1.length - 1;
571 maxscore = score[i][j] / 10f;
573 //prepare trailing gaps
574 while ((i < seq1.length - 1) || (j < seq2.length - 1))
582 aseq1 = new int[seq1.length + seq2.length];
583 aseq2 = new int[seq1.length + seq2.length];
585 StringBuilder sb1 = new StringBuilder(aseq1.length);
586 StringBuilder sb2 = new StringBuilder(aseq2.length);
588 count = (seq1.length + seq2.length) - 1;
591 while ((i >= seq1.length) || (j >= seq2.length))
593 if (i >= seq1.length)
595 aseq1[count] = GAP_INDEX;
597 aseq2[count] = seq2[j];
598 sb2.append(s2str.charAt(j));
599 } else if (j >= seq2.length) {
600 aseq1[count] = seq1[i];
601 sb1.append(s1str.charAt(i));
602 aseq2[count] = GAP_INDEX;
609 while (i > 0 && j > 0)
611 aseq1[count] = seq1[i];
612 sb1.append(s1str.charAt(i));
613 aseq2[count] = seq2[j];
614 sb2.append(s2str.charAt(j));
616 trace = findTrace(i, j);
626 aseq1[count] = GAP_INDEX;
627 sb1.replace(sb1.length() - 1, sb1.length(), "-");
629 else if (trace == -1)
632 aseq2[count] = GAP_INDEX;
633 sb2.replace(sb2.length() - 1, sb2.length(), "-");
642 aseq1[count] = seq1[i];
643 sb1.append(s1str.charAt(i));
644 aseq2[count] = seq2[j];
645 sb2.append(s2str.charAt(j));
648 while (j > 0 || i > 0)
654 sb2.append(s2str.charAt(j));
657 sb1.append(s1str.charAt(i));
663 * we built the character strings backwards, so now
664 * reverse them to convert to sequence strings
666 astr1 = sb1.reverse().toString();
667 astr2 = sb2.reverse().toString();
673 public void printAlignment(PrintStream os)
675 // TODO: Use original sequence characters rather than re-translated
676 // characters in output
677 // Find the biggest id length for formatting purposes
678 String s1id = getAlignedSeq1().getDisplayId(true);
679 String s2id = getAlignedSeq2().getDisplayId(true);
680 int nameLength = Math.max(s1id.length(), s2id.length());
681 if (nameLength > MAX_NAME_LENGTH)
683 int truncateBy = nameLength - MAX_NAME_LENGTH;
684 nameLength = MAX_NAME_LENGTH;
685 // JAL-527 - truncate the sequence ids
686 if (s1id.length() > nameLength)
688 int slashPos = s1id.lastIndexOf('/');
689 s1id = s1id.substring(0, slashPos - truncateBy)
690 + s1id.substring(slashPos);
692 if (s2id.length() > nameLength)
694 int slashPos = s2id.lastIndexOf('/');
695 s2id = s2id.substring(0, slashPos - truncateBy)
696 + s2id.substring(slashPos);
699 int len = 72 - nameLength - 1;
700 int nochunks = ((aseq1.length - count) / len)
701 + ((aseq1.length - count) % len > 0 ? 1 : 0);
704 output.append("Score = ").append(score[maxi][maxj]).append(NEWLINE);
705 output.append("Length of alignment = ")
706 .append(String.valueOf(aseq1.length - count)).append(NEWLINE);
707 output.append("Sequence ");
708 Format nameFormat = new Format("%" + nameLength + "s");
709 output.append(nameFormat.form(s1id));
710 output.append(" (Sequence length = ")
711 .append(String.valueOf(s1str.length())).append(")")
713 output.append("Sequence ");
714 output.append(nameFormat.form(s2id));
715 output.append(" (Sequence length = ")
716 .append(String.valueOf(s2str.length())).append(")")
717 .append(NEWLINE).append(NEWLINE);
719 ScoreMatrix pam250 = ScoreModels.getInstance().getPam250();
721 for (int j = 0; j < nochunks; j++)
723 // Print the first aligned sequence
724 output.append(nameFormat.form(s1id)).append(" ");
726 for (int i = 0; i < len; i++)
728 if ((i + (j * len)) < astr1.length())
730 output.append(astr1.charAt(i + (j * len)));
734 output.append(NEWLINE);
735 output.append(nameFormat.form(" ")).append(" ");
738 * Print out the match symbols:
739 * | for exact match (ignoring case)
740 * . if PAM250 score is positive
743 for (int i = 0; i < len; i++)
745 if ((i + (j * len)) < astr1.length())
747 char c1 = astr1.charAt(i + (j * len));
748 char c2 = astr2.charAt(i + (j * len));
749 boolean sameChar = Comparison.isSameResidue(c1, c2, false);
750 if (sameChar && !Comparison.isGap(c1))
755 else if (PEP.equals(type))
757 if (pam250.getPairwiseScore(c1, c2) > 0)
773 // Now print the second aligned sequence
774 output = output.append(NEWLINE);
775 output = output.append(nameFormat.form(s2id)).append(" ");
777 for (int i = 0; i < len; i++)
779 if ((i + (j * len)) < astr2.length())
781 output.append(astr2.charAt(i + (j * len)));
785 output.append(NEWLINE).append(NEWLINE);
788 pid = pid / (aseq1.length - count) * 100;
789 output.append(new Format("Percentage ID = %3.2f\n").form(pid));
790 output.append(NEWLINE);
793 os.print(output.toString());
794 } catch (Exception ex)
807 * @return DOCUMENT ME!
809 public int findTrace(int i, int j)
812 float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(i),
814 float max = score[i - 1][j - 1] + (pairwiseScore * 10);
821 else if (F[i][j] == max)
835 else if (E[i][j] == max)
852 public void calcScoreMatrix()
856 final int GAP_EX_COST=GAP_EXTEND_COST;
857 final int GAP_OP_COST = GAP_OPEN_COST;
858 // top left hand element
859 score[0][0] = scoreMatrix.getPairwiseScore(s1str.charAt(0),
860 s2str.charAt(0)) * 10;
861 E[0][0] = -GAP_EX_COST;
864 // Calculate the top row first
865 for (int j = 1; j < m; j++)
867 // What should these values be? 0 maybe
868 E[0][j] = max(score[0][j - 1] - GAP_OP_COST,
869 E[0][j - 1] - GAP_EX_COST);
870 F[0][j] = -GAP_EX_COST;
872 float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(0),
874 score[0][j] = max(pairwiseScore * 10, -GAP_OP_COST,
880 // Now do the left hand column
881 for (int i = 1; i < n; i++)
883 E[i][0] = -GAP_OP_COST;
884 F[i][0] = max(score[i - 1][0] - GAP_OP_COST,
885 F[i - 1][0] - GAP_EX_COST);
887 float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(i),
889 score[i][0] = max(pairwiseScore * 10, E[i][0], F[i][0]);
890 traceback[i][0] = -1;
893 // Now do all the other rows
894 for (int i = 1; i < n; i++)
896 for (int j = 1; j < m; j++)
898 E[i][j] = max(score[i][j - 1] - GAP_OP_COST,
899 E[i][j - 1] - GAP_EX_COST);
900 F[i][j] = max(score[i - 1][j] - GAP_OP_COST,
901 F[i - 1][j] - GAP_EX_COST);
903 float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(i),
905 score[i][j] = max(score[i - 1][j - 1] + (pairwiseScore * 10),
907 traceback[i][j] = findTrace(i, j);
913 * Returns the given sequence with all of the given gap characters removed.
916 * a string of characters to be treated as gaps
922 public static String extractGaps(String gapChars, String seq)
924 if (gapChars == null || seq == null)
928 StringTokenizer str = new StringTokenizer(seq, gapChars);
929 StringBuilder newString = new StringBuilder(seq.length());
931 while (str.hasMoreTokens())
933 newString.append(str.nextToken());
936 return newString.toString();
949 * @return DOCUMENT ME!
951 private static float max(float f1, float f2, float f3)
976 * @return DOCUMENT ME!
978 private static float max(float f1, float f2)
991 * Converts the character string to an array of integers which are the
992 * corresponding indices to the characters in the score matrix
998 int[] indexEncode(String s)
1000 int[] encoded = new int[s.length()];
1002 for (int i = 0; i < s.length(); i++)
1004 char c = s.charAt(i);
1005 encoded[i] = scoreMatrix.getMatrixIndex(c);
1025 public static void displayMatrix(Graphics g, int[][] mat, int n, int m,
1028 // TODO method doesn't seem to be referenced anywhere delete??
1032 for (int i = 0; i < n; i++)
1034 for (int j = 0; j < m; j++)
1036 if (mat[i][j] >= max)
1041 if (mat[i][j] <= min)
1048 jalview.bin.Console.outPrintln(max + " " + min);
1050 for (int i = 0; i < n; i++)
1052 for (int j = 0; j < m; j++)
1057 // jalview.bin.Console.outPrintln(mat[i][j]);
1058 float score = (float) (mat[i][j] - min) / (float) (max - min);
1059 g.setColor(new Color(score, 0, 0));
1060 g.fillRect(x, y, psize, psize);
1062 // jalview.bin.Console.outPrintln(x + " " + y + " " + score);
1068 * Compute a globally optimal needleman and wunsch alignment between two
1074 * AlignSeq.DNA or AlignSeq.PEP
1076 public static AlignSeq doGlobalNWAlignment(SequenceI s1, SequenceI s2,
1079 return doGlobalNWAlignment(s1, s2, type, DEFAULT_OPENCOST,DEFAULT_EXTENDCOST);
1081 public static AlignSeq doGlobalNWAlignment(SequenceI s1, SequenceI s2,
1082 String type, int opencost,int extcost)
1085 AlignSeq as = new AlignSeq(s1, s2, type,opencost,extcost);
1087 as.calcScoreMatrix();
1088 as.traceAlignment();
1094 * @return mapping from positions in S1 to corresponding positions in S2
1096 public jalview.datamodel.Mapping getMappingFromS1(boolean allowmismatch)
1098 ArrayList<Integer> as1 = new ArrayList<Integer>(),
1099 as2 = new ArrayList<Integer>();
1100 int pdbpos = s2.getStart() + getSeq2Start() - 2;
1101 int alignpos = s1.getStart() + getSeq1Start() - 2;
1102 int lp2 = pdbpos - 3, lp1 = alignpos - 3;
1103 boolean lastmatch = false;
1104 // and now trace the alignment onto the atom set.
1105 for (int i = 0; i < astr1.length(); i++)
1107 char c1 = astr1.charAt(i), c2 = astr2.charAt(i);
1118 // ignore case differences
1119 if (allowmismatch || (c1 == c2) || (Math.abs(c2 - c1) == ('a' - 'A')))
1121 // extend mapping interval
1122 if (lp1 + 1 != alignpos || lp2 + 1 != pdbpos)
1124 as1.add(Integer.valueOf(alignpos));
1125 as2.add(Integer.valueOf(pdbpos));
1133 // extend mapping interval
1136 as1.add(Integer.valueOf(lp1));
1137 as2.add(Integer.valueOf(lp2));
1142 // construct range pairs
1144 int[] mapseq1 = new int[as1.size() + (lastmatch ? 1 : 0)],
1145 mapseq2 = new int[as2.size() + (lastmatch ? 1 : 0)];
1147 for (Integer ip : as1)
1153 for (Integer ip : as2)
1160 mapseq1[mapseq1.length - 1] = alignpos;
1161 mapseq2[mapseq2.length - 1] = pdbpos;
1163 MapList map = new MapList(mapseq1, mapseq2, 1, 1);
1165 jalview.datamodel.Mapping mapping = new Mapping(map);
1171 * matches ochains against al and populates seqs with the best match between
1172 * each ochain and the set in al
1176 * @param dnaOrProtein
1177 * @param removeOldAnnots
1178 * when true, old annotation is cleared before new annotation
1180 * @return List<List<SequenceI> originals, List<SequenceI> replacement,
1181 * List<AlignSeq> alignment between each>
1183 public static List<List<? extends Object>> replaceMatchingSeqsWith(
1184 List<SequenceI> seqs, List<AlignmentAnnotation> annotations,
1185 List<SequenceI> ochains, AlignmentI al, String dnaOrProtein,
1186 boolean removeOldAnnots)
1188 List<SequenceI> orig = new ArrayList<SequenceI>(),
1189 repl = new ArrayList<SequenceI>();
1190 List<AlignSeq> aligs = new ArrayList<AlignSeq>();
1191 if (al != null && al.getHeight() > 0)
1193 ArrayList<SequenceI> matches = new ArrayList<SequenceI>();
1194 ArrayList<AlignSeq> aligns = new ArrayList<AlignSeq>();
1196 for (SequenceI sq : ochains)
1198 SequenceI bestm = null;
1199 AlignSeq bestaseq = null;
1200 float bestscore = 0;
1201 for (SequenceI msq : al.getSequences())
1203 AlignSeq aseq = doGlobalNWAlignment(msq, sq, dnaOrProtein);
1204 if (bestm == null || aseq.getMaxScore() > bestscore)
1206 bestscore = aseq.getMaxScore();
1211 // jalview.bin.Console.outPrintln("Best Score for " + (matches.size() +
1215 aligns.add(bestaseq);
1216 al.deleteSequence(bestm);
1218 for (int p = 0, pSize = seqs.size(); p < pSize; p++)
1220 SequenceI sq, sp = seqs.get(p);
1222 if ((q = ochains.indexOf(sp)) > -1)
1224 seqs.set(p, sq = matches.get(q));
1227 sq.setName(sp.getName());
1228 sq.setDescription(sp.getDescription());
1230 sq.transferAnnotation(sp,
1231 sp2sq = aligns.get(q).getMappingFromS1(false));
1232 aligs.add(aligns.get(q));
1234 for (int ap = 0; ap < annotations.size();)
1236 if (annotations.get(ap).sequenceRef == sp)
1242 if (removeOldAnnots)
1244 annotations.remove(ap);
1248 AlignmentAnnotation alan = annotations.remove(ap);
1249 alan.liftOver(sq, sp2sq);
1250 alan.setSequenceRef(sq);
1251 sq.addAlignmentAnnotation(alan);
1259 if (sq.getAnnotation() != null && sq.getAnnotation().length > 0)
1261 annotations.addAll(inspos == -1 ? annotations.size() : inspos,
1262 Arrays.asList(sq.getAnnotation()));
1267 return Arrays.asList(orig, repl, aligs);
1271 * compute the PID vector used by the redundancy filter.
1273 * @param originalSequences
1274 * - sequences in alignment that are to filtered
1276 * - null or strings to be analysed (typically, visible portion of
1277 * each sequence in alignment)
1279 * - first column in window for calculation
1281 * - last column in window for calculation
1283 * - if true then use ungapped sequence to compute PID
1284 * @return vector containing maximum PID for i-th sequence and any sequences
1285 * longer than that seuqence
1287 public static float[] computeRedundancyMatrix(
1288 SequenceI[] originalSequences, String[] omitHidden, int start,
1289 int end, boolean ungapped)
1291 int height = originalSequences.length;
1292 float[] redundancy = new float[height];
1293 int[] lngth = new int[height];
1294 for (int i = 0; i < height; i++)
1300 // long start = System.currentTimeMillis();
1302 SimilarityParams pidParams = new SimilarityParams(true, true, true,
1306 for (int i = 0; i < height; i++)
1309 for (int j = 0; j < i; j++)
1316 if (omitHidden == null)
1318 seqi = originalSequences[i].getSequenceAsString(start, end);
1319 seqj = originalSequences[j].getSequenceAsString(start, end);
1323 seqi = omitHidden[i];
1324 seqj = omitHidden[j];
1328 String ug = AlignSeq.extractGaps(Comparison.GapChars, seqi);
1329 lngth[i] = ug.length();
1337 String ug = AlignSeq.extractGaps(Comparison.GapChars, seqj);
1338 lngth[j] = ug.length();
1344 pid = (float) PIDModel.computePID(seqi, seqj, pidParams);
1346 // use real sequence length rather than string length
1347 if (lngth[j] < lngth[i])
1349 redundancy[j] = Math.max(pid, redundancy[j]);
1353 redundancy[i] = Math.max(pid, redundancy[i]);
1362 * calculate the mean score of the alignment
1363 * mean score is equal to the score of an alignmenet of two sequences with randomly shuffled AA sequence composited of the same AA as the two original sequences
1366 public void meanScore()
1368 int length = indelfreeAstr1.length(); //both have the same length
1369 //create HashMap for counting residues in each sequence
1370 HashMap<Character, Integer> seq1ResCount = new HashMap<Character, Integer>();
1371 HashMap<Character, Integer> seq2ResCount = new HashMap<Character, Integer>();
1373 // for both sequences (String indelfreeAstr1 or 2) create a key for the residue and add 1 each time its encountered
1374 for (char residue: indelfreeAstr1.toCharArray())
1376 seq1ResCount.putIfAbsent(residue, 0);
1377 seq1ResCount.replace(residue, seq1ResCount.get(residue) + 1);
1379 for (char residue: indelfreeAstr2.toCharArray())
1381 seq2ResCount.putIfAbsent(residue, 0);
1382 seq2ResCount.replace(residue, seq2ResCount.get(residue) + 1);
1385 // meanscore = for each residue pair get the number of appearance and add (countA * countB * pairwiseScore(AB))
1386 // divide the meanscore by the sequence length afterwards
1387 float _meanscore = 0;
1388 for (char resA : seq1ResCount.keySet())
1390 for (char resB : seq2ResCount.keySet())
1392 int countA = seq1ResCount.get(resA);
1393 int countB = seq2ResCount.get(resB);
1395 float scoreAB = scoreMatrix.getPairwiseScore(resA, resB);
1397 _meanscore += countA * countB * scoreAB;
1400 _meanscore /= length;
1401 this.meanScore = _meanscore;
1404 public float getMeanScore()
1406 return this.meanScore;
1410 * calculate the hypothetic max score using the self-alignment of the sequences
1412 public void hypotheticMaxScore()
1416 for (char residue: indelfreeAstr1.toCharArray())
1418 _hmsA += scoreMatrix.getPairwiseScore(residue, residue);
1420 for (char residue: indelfreeAstr2.toCharArray())
1422 _hmsB += scoreMatrix.getPairwiseScore(residue, residue);
1424 this.hypotheticMaxScore = (_hmsA < _hmsB) ? _hmsA : _hmsB; // take the lower self alignment
1428 public int getHypotheticMaxScore()
1430 return this.hypotheticMaxScore;
1434 * create strings based of astr1 and astr2 but without gaps
1436 public void getIndelfreeAstr()
1438 int n = astr1.length(); // both have the same length
1439 for (int i = 0; i < n; i++)
1441 if (Character.isLetter(astr1.charAt(i)) && Character.isLetter(astr2.charAt(i))) // if both sequences dont have a gap -> add to indelfreeAstr
1443 this.indelfreeAstr1 += astr1.charAt(i);
1444 this.indelfreeAstr2 += astr2.charAt(i);
1450 * calculates the overall score of the alignment
1451 * preprescore = sum of all scores - all penalties
1452 * if preprescore < 1 ~ alignmentScore = Float.NaN >
1453 * alignmentScore = ((preprescore - meanScore) / (hypotheticMaxScore - meanScore)) * coverage
1455 public void scoreAlignment()
1460 hypotheticMaxScore();
1461 // cannot calculate score because denominator would be zero
1462 if (this.hypotheticMaxScore == this.meanScore)
1464 this.alignmentScore = Float.NaN;
1467 int n = indelfreeAstr1.length();
1470 boolean aGapOpen = false;
1471 boolean bGapOpen = false;
1472 for (int i = 0; i < n; i++)
1474 char char1 = indelfreeAstr1.charAt(i);
1475 char char2 = indelfreeAstr2.charAt(i);
1476 boolean aIsLetter = Character.isLetter(char1);
1477 boolean bIsLetter = Character.isLetter(char2);
1478 if (aIsLetter && bIsLetter) // if pair -> get score
1480 score += scoreMatrix.getPairwiseScore(char1, char2);
1481 } else if (!aIsLetter && !bIsLetter) { // both are gap -> skip
1482 } else if ((!aIsLetter && aGapOpen) || (!bIsLetter && bGapOpen)) { // one side gapopen -> score - gap_extend
1483 score -= GAP_EXTEND_COST;
1484 } else { // no gap open -> score - gap_open
1485 score -= GAP_OPEN_COST;
1487 // adjust GapOpen status in both sequences
1488 aGapOpen = (!aIsLetter) ? true : false;
1489 bGapOpen = (!bIsLetter) ? true : false;
1492 float preprescore = score; // if this score < 1 --> alignment score = Float.NaN
1493 score = (score - this.meanScore) / (this.hypotheticMaxScore - this.meanScore);
1494 int[] _max = MiscMath.findMax(new int[]{astr1.replace("-","").length(), astr2.replace("-","").length()}); // {index of max, max}
1495 float coverage = (float) n / (float) _max[1]; // indelfreeAstr length / longest sequence length
1496 float prescore = score; // only debug
1499 //System.out.println(String.format("prepre-score: %f, pre-score: %f, longlength: %d\nscore: %1.16f, mean: %f, max: %d", preprescore, prescore, _max[1], score, this.meanScore, this.hypotheticMaxScore));
1500 float minScore = 0f;
1501 this.alignmentScore = (score <= minScore) ? Float.NaN : score;
1504 public void setScoreMatrix(ScoreMatrix sm)