2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis.scoremodels;
23 import jalview.api.AlignmentViewPanel;
24 import jalview.api.FeatureRenderer;
25 import jalview.api.analysis.DistanceScoreModelI;
26 import jalview.api.analysis.SimilarityParamsI;
27 import jalview.api.analysis.ViewBasedAnalysisI;
28 import jalview.datamodel.AlignmentView;
29 import jalview.datamodel.SeqCigar;
30 import jalview.datamodel.SequenceFeature;
31 import jalview.math.Matrix;
32 import jalview.math.MatrixI;
33 import jalview.util.SetUtils;
35 import java.util.HashMap;
36 import java.util.HashSet;
37 import java.util.List;
41 public class FeatureDistanceModel implements DistanceScoreModelI,
// Configures this model from an alignment view panel: a clone of the panel's
// feature renderer is stored in 'fr', which the other methods of this class
// read when looking up displayed feature types and features at a residue.
// NOTE(review): the return statement is not visible in this extract - confirm
// which boolean value is returned against the full file.
47 public boolean configureFromAlignmentView(AlignmentViewPanel view)
50 fr = view.cloneFeatureRenderer();
55 * Calculates a distance measure [i][j] between each pair of sequences as the
56 * average number of features they have but do not share. That is, find the
57 * features each sequence pair has at each column, ignore feature types they
58 * have in common, and count the rest. The totals are normalised by the number
59 * of columns processed.
61 * The parameters argument provides settings for treatment of gap-residue
62 * aligned positions, and whether the score is over the longer or shorter of
63 * each pair of sequences
// Computes a symmetric noseqs x noseqs matrix of pairwise distances between
// the sequences of seqData, derived from the feature types found at each
// visible column.
// seqData - supplies the sequences (getSequences) and the visible column
//           ranges (getVisibleContigs)
// params  - only includeGaps() is consulted in the lines shown here
// Returns a Matrix wrapping the distances array; the array is returned
// unfilled (all zeros) when there are no displayed feature types.
69 public MatrixI findDistances(AlignmentView seqData,
70 SimilarityParamsI params)
72 SeqCigar[] seqs = seqData.getSequences();
73 int noseqs = seqs.length;
74 int cpwidth = 0;// = seqData.getWidth();
// cpwidth is the divisor used for normalisation at the end of the method.
// NOTE(review): no statement updating cpwidth is visible in this extract;
// presumably it counts the columns scanned - confirm, because dividing a
// double by a zero int yields NaN or Infinity rather than throwing.
75 double[][] distances = new double[noseqs][noseqs];
76 List<String> dft = null;
79 dft = fr.getDisplayedFeatureTypes();
81 if (dft == null || dft.isEmpty())
83 return new Matrix(distances);
86 // need to get real position for view position
// viscont is read in pairs: viscont[vc] is the first column and
// viscont[vc + 1] the last column (inclusive) of each range that is scanned
87 int[] viscont = seqData.getVisibleContigs();
90 * scan each column, compute and add to each distance[i, j]
91 * the number of feature types that seqi and seqj do not share
93 for (int vc = 0; vc < viscont.length; vc += 2)
95 for (int cpos = viscont[vc]; cpos <= viscont[vc + 1]; cpos++)
100 * first record feature types in this column for each sequence
102 Map<SeqCigar, Set<String>> sfap = findFeatureTypesAtColumn(
106 * count feature types on either i'th or j'th sequence but not both
107 * and add this 'distance' measure to the total for [i, j] for j > i
109 for (int i = 0; i < (noseqs - 1); i++)
111 for (int j = i + 1; j < noseqs; j++)
113 SeqCigar sc1 = seqs[i];
114 SeqCigar sc2 = seqs[j];
// a null set means the map holds no entry for that sequence, which is
// treated below as the sequence being gapped at this column
115 Set<String> set1 = sfap.get(sc1);
116 Set<String> set2 = sfap.get(sc2);
117 boolean gap1 = set1 == null;
118 boolean gap2 = set2 == null;
121 * gap-gap always scores zero
122 * residue-residue is always scored
123 * include gap-residue score if params say to do so
125 if ((!gap1 && !gap2) || params.includeGaps())
// NOTE(review): when includeGaps() is true, set1 and/or set2 can be null
// here - confirm that SetUtils.countDisjunction accepts null arguments
127 int seqDistance = SetUtils.countDisjunction(set1, set2);
128 distances[i][j] += seqDistance;
136 * normalise the distance scores (summed over columns) by the
137 * number of visible columns used in the calculation
138 * and fill in the bottom half of the matrix
140 // TODO JAL-2424 cpwidth may be out by 1 - affects scores but not tree shape
// only entries with j > i were accumulated above; each is divided by cpwidth
// and then mirrored into [j][i]; the diagonal is never written
141 for (int i = 0; i < noseqs; i++)
143 for (int j = i + 1; j < noseqs; j++)
145 distances[i][j] /= cpwidth;
146 distances[j][i] = distances[i][j];
149 return new Matrix(distances);
153 * Builds and returns a map containing a (possibly empty) set (one per
154 * SeqCigar) of visible feature types at the given column position. The map
155 * has no entry for sequences which are gapped at the column position.
158 * @param columnPosition
// Collects, per sequence, the distinct types of the features that the
// feature renderer 'fr' reports for that sequence at the given column.
// seqs           - the sequences to inspect
// columnPosition - the column, converted to a per-sequence position with
//                  SeqCigar.findPosition
// NOTE(review): the check that skips gapped sequences, the second argument
// passed to findFeaturesAtRes and the return statement are not visible in
// this extract; presumably the populated map 'sfap' is returned - confirm
// against the full file.
161 protected Map<SeqCigar, Set<String>> findFeatureTypesAtColumn(
162 SeqCigar[] seqs, int columnPosition)
164 Map<SeqCigar, Set<String>> sfap = new HashMap<SeqCigar, Set<String>>();
165 for (SeqCigar seq : seqs)
167 int spos = seq.findPosition(columnPosition);
// a HashSet is used, so each feature type is recorded once per sequence
// however many features of that type are found
170 Set<String> types = new HashSet<String>();
171 List<SequenceFeature> sfs = fr.findFeaturesAtRes(seq.getRefSeq(),
173 for (SequenceFeature sf : sfs)
175 types.add(sf.getType());
177 sfap.put(seq, types);
// Returns the fixed name string of this score model.
184 public String getName()
186 return "Sequence Feature Similarity";
// NOTE(review): the method body is not visible in this extract; presumably
// this reports whether the model can be applied to nucleotide data - confirm
// the returned value against the full file.
190 public boolean isDNA()
// NOTE(review): the method body is not visible in this extract; presumably
// this reports whether the model can be applied to protein data - confirm
// the returned value against the full file.
196 public boolean isProtein()
// Returns a fixed one-sentence description of how this model scores
// sequences.
202 public String toString()
204 return "Score between sequences based on hamming distance between binary vectors marking features displayed at each column";