2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis.scoremodels;
23 import jalview.api.AlignmentViewPanel;
24 import jalview.api.FeatureRenderer;
25 import jalview.api.analysis.SimilarityParamsI;
26 import jalview.api.analysis.ViewBasedAnalysisI;
27 import jalview.datamodel.AlignmentView;
28 import jalview.datamodel.SeqCigar;
29 import jalview.datamodel.SequenceFeature;
30 import jalview.math.Matrix;
31 import jalview.math.MatrixI;
32 import jalview.util.SetUtils;
34 import java.util.HashMap;
35 import java.util.HashSet;
36 import java.util.List;
40 public class FeatureDistanceModel extends DistanceScoreModel implements
43 private static final String NAME = "Sequence Feature Similarity";
45 private String description;
52 public FeatureDistanceModel()
57 public boolean configureFromAlignmentView(AlignmentViewPanel view)
60 fr = view.cloneFeatureRenderer();
65 * Calculates a distance measure [i][j] between each pair of sequences as the
66 * average number of features they have but do not share. That is, find the
67 * features each sequence pair has at each column, ignore feature types they
68 * have in common, and count the rest. The totals are normalised by the number
69 * of columns processed.
71 * The parameters argument provides settings for treatment of gap-residue
72 * aligned positions, and whether the score is over the longer or shorter of
73 * each pair of sequences
79 public MatrixI findDistances(AlignmentView seqData,
80 SimilarityParamsI params)
82 SeqCigar[] seqs = seqData.getSequences();
83 int noseqs = seqs.length;
84 int cpwidth = 0;// = seqData.getWidth();
85 double[][] distances = new double[noseqs][noseqs];
86 List<String> dft = null;
89 dft = fr.getDisplayedFeatureTypes();
91 if (dft == null || dft.isEmpty())
93 return new Matrix(distances);
96 // need to get real position for view position
97 int[] viscont = seqData.getVisibleContigs();
100 * scan each column, compute and add to each distance[i, j]
101 * the number of feature types that seqi and seqj do not share
103 for (int vc = 0; vc < viscont.length; vc += 2)
105 for (int cpos = viscont[vc]; cpos <= viscont[vc + 1]; cpos++)
110 * first record feature types in this column for each sequence
112 Map<SeqCigar, Set<String>> sfap = findFeatureTypesAtColumn(
116 * count feature types on either i'th or j'th sequence but not both
117 * and add this 'distance' measure to the total for [i, j] for j > i
119 for (int i = 0; i < (noseqs - 1); i++)
121 for (int j = i + 1; j < noseqs; j++)
123 SeqCigar sc1 = seqs[i];
124 SeqCigar sc2 = seqs[j];
125 Set<String> set1 = sfap.get(sc1);
126 Set<String> set2 = sfap.get(sc2);
127 boolean gap1 = set1 == null;
128 boolean gap2 = set2 == null;
131 * gap-gap always scores zero
132 * residue-residue is always scored
133 * include gap-residue score if params say to do so
135 if ((!gap1 && !gap2) || params.includeGaps())
137 int seqDistance = SetUtils.countDisjunction(set1, set2);
138 distances[i][j] += seqDistance;
146 * normalise the distance scores (summed over columns) by the
147 * number of visible columns used in the calculation
148 * and fill in the bottom half of the matrix
150 // TODO JAL-2424 cpwidth may be out by 1 - affects scores but not tree shape
151 for (int i = 0; i < noseqs; i++)
153 for (int j = i + 1; j < noseqs; j++)
155 distances[i][j] /= cpwidth;
156 distances[j][i] = distances[i][j];
159 return new Matrix(distances);
163 * Builds and returns a map containing a (possibly empty) list (one per
164 * SeqCigar) of visible feature types at the given column position. The map
165 * has no entry for sequences which are gapped at the column position.
168 * @param columnPosition
171 protected Map<SeqCigar, Set<String>> findFeatureTypesAtColumn(
172 SeqCigar[] seqs, int columnPosition)
174 Map<SeqCigar, Set<String>> sfap = new HashMap<SeqCigar, Set<String>>();
175 for (SeqCigar seq : seqs)
177 int spos = seq.findPosition(columnPosition);
180 Set<String> types = new HashSet<String>();
181 List<SequenceFeature> sfs = fr.findFeaturesAtRes(seq.getRefSeq(),
183 for (SequenceFeature sf : sfs)
185 types.add(sf.getType());
187 sfap.put(seq, types);
194 public String getName()
200 public String getDescription()
206 public boolean isDNA()
212 public boolean isProtein()
218 public String toString()
220 return "Score between sequences based on hamming distance between binary vectors marking features displayed at each column";