2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis.scoremodels;
23 import jalview.api.AlignmentViewPanel;
24 import jalview.api.FeatureRenderer;
25 import jalview.api.analysis.DistanceScoreModelI;
26 import jalview.api.analysis.SimilarityParamsI;
27 import jalview.api.analysis.ViewBasedAnalysisI;
28 import jalview.datamodel.AlignmentView;
29 import jalview.datamodel.SeqCigar;
30 import jalview.datamodel.SequenceFeature;
31 import jalview.math.Matrix;
32 import jalview.math.MatrixI;
33 import jalview.util.SetUtils;
35 import java.util.HashMap;
36 import java.util.HashSet;
37 import java.util.List;
41 public class FeatureDistanceModel implements DistanceScoreModelI,
44 private static final String NAME = "Sequence Feature Similarity";
46 private String description;
53 public FeatureDistanceModel()
// Stores the result of view.cloneFeatureRenderer() in the fr field; fr is what
// findDistances and findFeatureTypesAtColumn later query for feature data.
// NOTE(review): the return statement of this method is not visible in this
// excerpt - confirm what it reports to the caller.
58 public boolean configureFromAlignmentView(AlignmentViewPanel view)
61 fr = view.cloneFeatureRenderer();
66 * Calculates a distance measure [i][j] between each pair of sequences as the
67 * average number of feature types they have but do not share. That is, find
68 * the feature types each sequence pair has at each column, ignore the types
69 * they have in common, and count the rest. The totals are normalised by the
70 * number of columns processed.
72 * The parameters argument provides settings for treatment of gap-residue
73 * aligned positions, and whether the score is over the longer or shorter of
74 * each pair of sequences
// Builds a symmetric pairwise distance matrix for the sequences in seqData.
// For every visited column the value of SetUtils.countDisjunction for each
// sequence pair is added to distances[i][j] (j > i); the sums are then divided
// by cpwidth and copied into distances[j][i].
// NOTE(review): this excerpt appears to have lines elided (braces, comment
// delimiters, call continuations, and wherever cpwidth is incremented) - read
// the complete method before changing it.
80 public MatrixI findDistances(AlignmentView seqData,
81 SimilarityParamsI params)
83 SeqCigar[] seqs = seqData.getSequences();
84 int noseqs = seqs.length;
// cpwidth starts at zero and is the divisor used in the normalisation loop
85 int cpwidth = 0;// = seqData.getWidth();
// one row and one column per sequence; a new double[][] is zero-filled
86 double[][] distances = new double[noseqs][noseqs];
87 List<String> dft = null;
90 dft = fr.getDisplayedFeatureTypes();
// nothing to score when there are no displayed feature types: the zero-filled
// matrix is returned as is
92 if (dft == null || dft.isEmpty())
94 return new Matrix(distances);
97 // need to get real position for view position
98 int[] viscont = seqData.getVisibleContigs();
101 * scan each column, compute and add to each distance[i, j]
102 * the number of feature types that seqi and seqj do not share
// viscont is consumed two entries at a time: viscont[vc] is the first column
// of a range and viscont[vc + 1] the last (the inner loop uses <=)
// NOTE(review): an odd-length viscont would make viscont[vc + 1] read past
// the end - presumably the array always holds pairs; confirm
104 for (int vc = 0; vc < viscont.length; vc += 2)
106 for (int cpos = viscont[vc]; cpos <= viscont[vc + 1]; cpos++)
111 * first record feature types in this column for each sequence
113 Map<SeqCigar, Set<String>> sfap = findFeatureTypesAtColumn(
117 * count feature types on either i'th or j'th sequence but not both
118 * and add this 'distance' measure to the total for [i, j] for j > i
// only the upper triangle (j > i) is accumulated here
120 for (int i = 0; i < (noseqs - 1); i++)
122 for (int j = i + 1; j < noseqs; j++)
124 SeqCigar sc1 = seqs[i];
125 SeqCigar sc2 = seqs[j];
126 Set<String> set1 = sfap.get(sc1);
127 Set<String> set2 = sfap.get(sc2);
// a sequence with no entry in sfap for this column is treated as a gap
128 boolean gap1 = set1 == null;
129 boolean gap2 = set2 == null;
132 * gap-gap always scores zero
133 * residue-residue is always scored
134 * include gap-residue score if params say to do so
// NOTE(review): when params.includeGaps() is true, set1 and/or set2 can be
// null in the call below (including the gap-gap case) - presumably
// SetUtils.countDisjunction accepts nulls and returns 0 for two nulls; confirm
136 if ((!gap1 && !gap2) || params.includeGaps())
138 int seqDistance = SetUtils.countDisjunction(set1, set2);
139 distances[i][j] += seqDistance;
147 * normalise the distance scores (summed over columns) by the
148 * number of visible columns used in the calculation
149 * and fill in the bottom half of the matrix
151 // TODO JAL-2424 cpwidth may be out by 1 - affects scores but not tree shape
// the diagonal is never written in this loop, so it keeps its initial 0.0
152 for (int i = 0; i < noseqs; i++)
154 for (int j = i + 1; j < noseqs; j++)
// NOTE(review): this is a double divided by an int; if cpwidth were still 0
// here (e.g. viscont empty) the result would be NaN rather than an
// exception - confirm whether that case is reachable
156 distances[i][j] /= cpwidth;
157 distances[j][i] = distances[i][j];
160 return new Matrix(distances);
164 * Builds and returns a map containing a (possibly empty) set (one per
165 * SeqCigar) of visible feature types at the given column position. The map
166 * has no entry for sequences which are gapped at the column position.
169 * @param columnPosition
// For each SeqCigar: converts columnPosition to a sequence position with
// seq.findPosition, asks fr.findFeaturesAtRes for the features on the
// reference sequence, and stores the set of their types under that SeqCigar.
// NOTE(review): the check that skips gapped sequences, the second argument to
// findFeaturesAtRes and the return statement are not visible in this excerpt.
172 protected Map<SeqCigar, Set<String>> findFeatureTypesAtColumn(
173 SeqCigar[] seqs, int columnPosition)
175 Map<SeqCigar, Set<String>> sfap = new HashMap<SeqCigar, Set<String>>();
176 for (SeqCigar seq : seqs)
178 int spos = seq.findPosition(columnPosition);
// a HashSet, so a type carried by several features is recorded only once
181 Set<String> types = new HashSet<String>();
// NOTE(review): fr is dereferenced with no null check visible here -
// presumably callers only reach this after fr has been set; confirm
182 List<SequenceFeature> sfs = fr.findFeaturesAtRes(seq.getRefSeq(),
184 for (SequenceFeature sf : sfs)
186 types.add(sf.getType());
// the set is stored even when it is empty (no features were returned)
188 sfap.put(seq, types);
195 public String getName()
201 public String getDescription()
207 public boolean isDNA()
213 public boolean isProtein()
// Returns a fixed, human-readable sentence describing how this model scores
// sequences; the text does not depend on any instance state.
219 public String toString()
221 return "Score between sequences based on hamming distance between binary vectors marking features displayed at each column";