2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis.scoremodels;
23 import jalview.api.AlignmentViewPanel;
24 import jalview.api.FeatureRenderer;
25 import jalview.api.analysis.ScoreModelI;
26 import jalview.api.analysis.SimilarityParamsI;
27 import jalview.datamodel.AlignmentView;
28 import jalview.datamodel.SeqCigar;
29 import jalview.datamodel.SequenceFeature;
30 import jalview.math.Matrix;
31 import jalview.math.MatrixI;
32 import jalview.util.SetUtils;
34 import java.util.HashMap;
35 import java.util.HashSet;
36 import java.util.List;
40 public class FeatureDistanceModel extends DistanceScoreModel
// Fixed label for this score model.
// NOTE(review): no read of NAME is visible in this listing; presumably it is
// what getName() returns - confirm.
42 private static final String NAME = "Sequence Feature Similarity";
// NOTE(review): no assignment to this field is visible in this listing.
44 private String description;
// Public no-argument constructor. getInstance() creates new instances
// reflectively from the runtime class, which depends on a no-argument
// constructor being available.
// NOTE(review): the constructor body is not visible in this listing.
51 public FeatureDistanceModel()
56 public ScoreModelI getInstance(AlignmentViewPanel view)
58 FeatureDistanceModel instance;
61 instance = this.getClass().newInstance();
62 instance.configureFromAlignmentView(view);
64 } catch (InstantiationException | IllegalAccessException e)
66 System.err.println("Error in " + getClass().getName()
67 + ".getInstance(): " + e.getMessage());
// Stores the result of view.cloneFeatureRenderer() in the fr field, which
// findDistances and findFeatureTypesAtColumn later query for feature data.
// Package-private; getInstance() calls it on each newly created instance.
// NOTE(review): the return statement is not visible in this listing, so the
// meaning of the boolean result cannot be confirmed here.
72 boolean configureFromAlignmentView(AlignmentViewPanel view)
75 fr = view.cloneFeatureRenderer();
80 * Calculates a distance measure [i][j] between each pair of sequences as the
81 * average number of features they have but do not share. That is, find the
82 * features each sequence pair has at each column, ignore feature types they
83 * have in common, and count the rest. The totals are normalised by the number
84 * of columns processed.
86 * The parameters argument provides settings for treatment of gap-residue
87 * aligned positions, and whether the score is over the longer or shorter of
88 * each pair of sequences
// Builds a symmetric noseqs x noseqs distance matrix for the sequences in
// seqData, accumulating per-column counts of feature types that a pair of
// sequences does not share, then dividing by cpwidth.
//   seqData - supplies the sequences (getSequences) and the visible column
//             ranges (getVisibleContigs)
//   params  - only includeGaps() is consulted in the visible code
// Returns a Matrix wrapping the distances array; the diagonal is never
// written by the visible code and so keeps its initial value of 0.
// NOTE(review): several lines of this method are not visible in this listing
// (braces, any guard around the fr lookup, the update of cpwidth, and the
// second argument of the findFeatureTypesAtColumn call).
94 public MatrixI findDistances(AlignmentView seqData,
95 SimilarityParamsI params)
97 SeqCigar[] seqs = seqData.getSequences();
98 int noseqs = seqs.length;
// divisor used for normalisation at the end of the method
// NOTE(review): the statement that increments cpwidth is not visible here
99 int cpwidth = 0;// = seqData.getWidth();
100 double[][] distances = new double[noseqs][noseqs];
101 List<String> dft = null;
104 dft = fr.getDisplayedFeatureTypes();
// no displayed feature types: return the matrix still holding all zeros
106 if (dft == null || dft.isEmpty())
108 return new Matrix(distances);
111 // need to get real position for view position
112 int[] viscont = seqData.getVisibleContigs();
115 * scan each column, compute and add to each distance[i, j]
116 * the number of feature types that seqi and seqj do not share
// viscont is read as consecutive [start, end] pairs, both ends inclusive
118 for (int vc = 0; vc < viscont.length; vc += 2)
120 for (int cpos = viscont[vc]; cpos <= viscont[vc + 1]; cpos++)
125 * first record feature types in this column for each sequence
127 Map<SeqCigar, Set<String>> sfap = findFeatureTypesAtColumn(seqs,
131 * count feature types on either i'th or j'th sequence but not both
132 * and add this 'distance' measure to the total for [i, j] for j > i
134 for (int i = 0; i < (noseqs - 1); i++)
136 for (int j = i + 1; j < noseqs; j++)
138 SeqCigar sc1 = seqs[i];
139 SeqCigar sc2 = seqs[j];
140 Set<String> set1 = sfap.get(sc1);
141 Set<String> set2 = sfap.get(sc2);
// a sequence with no entry in the map is treated as gapped at this column
142 boolean gap1 = set1 == null;
143 boolean gap2 = set2 == null;
146 * gap-gap always scores zero
147 * residue-residue is always scored
148 * include gap-residue score if params say to do so
150 if ((!gap1 && !gap2) || params.includeGaps())
// NOTE(review): when includeGaps() is true, set1 and/or set2 may be null
// here, so SetUtils.countDisjunction must accept null arguments (and return
// 0 for two nulls for the gap-gap rule above to hold) - confirm
152 int seqDistance = SetUtils.countDisjunction(set1, set2);
153 distances[i][j] += seqDistance;
161 * normalise the distance scores (summed over columns) by the
162 * number of visible columns used in the calculation
163 * and fill in the bottom half of the matrix
165 // TODO JAL-2424 cpwidth may be out by 1 - affects scores but not tree shape
166 for (int i = 0; i < noseqs; i++)
168 for (int j = i + 1; j < noseqs; j++)
// NOTE(review): if cpwidth is still 0 here (e.g. viscont is empty), this
// floating-point division yields NaN rather than throwing - consider a guard
170 distances[i][j] /= cpwidth;
// mirror the upper triangle into the lower triangle
171 distances[j][i] = distances[i][j];
174 return new Matrix(distances);
178 * Builds and returns a map containing a (possibly empty) set (one per
179 * SeqCigar) of visible feature types at the given column position. The map
180 * does not include entries for features which straddle a gapped column
184 * @param columnPosition
// Collects, for each sequence, the set of feature type names that fr reports
// at the sequence position corresponding to columnPosition.
//   seqs           - the sequences to inspect
//   columnPosition - column index passed to SeqCigar.findPosition
// NOTE(review): the gap test applied to spos and the return statement are
// not visible in this listing; presumably sfap is returned - confirm.
188 protected Map<SeqCigar, Set<String>> findFeatureTypesAtColumn(
189 SeqCigar[] seqs, int columnPosition)
191 Map<SeqCigar, Set<String>> sfap = new HashMap<SeqCigar, Set<String>>();
192 for (SeqCigar seq : seqs)
// translate the column into a position within this sequence
194 int spos = seq.findPosition(columnPosition);
198 * position is not a gap
200 Set<String> types = new HashSet<String>();
// NOTE(review): fr and the returned list are dereferenced without a visible
// null check - confirm fr is always set and the lookup never returns null
201 List<SequenceFeature> sfs = fr.findFeaturesAtResidue(
202 seq.getRefSeq(), spos);
// a Set is used, so a type occurring on several features is recorded once
203 for (SequenceFeature sf : sfs)
205 types.add(sf.getType());
207 sfap.put(seq, types);
// NOTE(review): body not visible in this listing; presumably returns the
// NAME constant - confirm.
214 public String getName()
// NOTE(review): body not visible in this listing; presumably returns the
// description field - confirm.
220 public String getDescription()
// NOTE(review): body not visible in this listing, so the value returned for
// nucleotide data cannot be confirmed here.
226 public boolean isDNA()
// NOTE(review): body not visible in this listing, so the value returned for
// protein data cannot be confirmed here.
232 public boolean isProtein()
// Returns a fixed one-sentence summary of how this model scores sequences.
238 public String toString()
240 return "Score between sequences based on hamming distance between binary vectors marking features displayed at each column";