X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fanalysis%2Fscoremodels%2FFeatureScoreModel.java;h=6da55c33fc5c29385b8f7b190215a94ed76e4808;hb=4724cafdcdcc4f3acc77c09261afe56f518a1e87;hp=69538d553cdc08612e96967b779badb10c09c750;hpb=db4eacee27b836db4126dca551887bfc6652d72a;p=jalview.git diff --git a/src/jalview/analysis/scoremodels/FeatureScoreModel.java b/src/jalview/analysis/scoremodels/FeatureScoreModel.java index 69538d5..6da55c3 100644 --- a/src/jalview/analysis/scoremodels/FeatureScoreModel.java +++ b/src/jalview/analysis/scoremodels/FeatureScoreModel.java @@ -23,14 +23,14 @@ package jalview.analysis.scoremodels; import jalview.api.analysis.ScoreModelI; import jalview.api.analysis.ViewBasedAnalysisI; import jalview.datamodel.AlignmentView; +import jalview.datamodel.SeqCigar; import jalview.datamodel.SequenceFeature; -import jalview.datamodel.SequenceI; -import jalview.util.Comparison; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Hashtable; +import java.util.HashMap; +import java.util.HashSet; import java.util.List; +import java.util.Map; +import java.util.Set; public class FeatureScoreModel implements ScoreModelI, ViewBasedAnalysisI { @@ -44,84 +44,66 @@ public class FeatureScoreModel implements ScoreModelI, ViewBasedAnalysisI return true; } + /** + * Calculates a distance measure [i][j] between each pair of sequences as the + * average number of features they have but do not share. That is, find the + * features each sequence pair has at each column, ignore feature types they + * have in common, and count the rest. The totals are normalised by the number + * of columns processed. + */ @Override public float[][] findDistances(AlignmentView seqData) { - int nofeats = 0; - List dft = Arrays.asList(fr.getDisplayedFeatureTypes()); - - if (dft != null) - { - nofeats = dft.size(); - } - - SequenceI[] sequenceString = seqData.getVisibleAlignment( - Comparison.GapChars.charAt(0)).getSequencesArray(); - int noseqs = sequenceString.length; - int cpwidth = seqData.getWidth(); + List dft = fr.getDisplayedFeatureTypes(); + SeqCigar[] seqs = seqData.getSequences(); + int noseqs = seqs.length; + int cpwidth = 0;// = seqData.getWidth(); float[][] distance = new float[noseqs][noseqs]; - if (nofeats == 0) + if (dft.isEmpty()) { - for (float[] d : distance) - { - for (int i = 0; i < d.length; d[i++] = 0f) - { - ; - } - } return distance; } - float max = 0; - for (int cpos = 0; cpos < cpwidth; cpos++) + + // need to get real position for view position + int[] viscont = seqData.getVisibleContigs(); + + /* + * scan each column, compute and add to each distance[i, j] + * the number of feature types that seqi and seqj do not share + */ + for (int vc = 0; vc < viscont.length; vc += 2) { - // get visible features at cpos under view's display settings and compare - // them - List> sfap = new ArrayList>(); - for (int i = 0; i < noseqs; i++) - { - Hashtable types = new Hashtable(); - List sfs = fr.findFeaturesAtRes(sequenceString[i], - sequenceString[i].findPosition(cpos)); - for (SequenceFeature sf : sfs) - { - types.put(sf.getType(), sf); - } - sfap.add(types); - } - for (int i = 0; i < (noseqs - 1); i++) + for (int cpos = viscont[vc]; cpos <= viscont[vc + 1]; cpos++) { - if (cpos == 0) - { - distance[i][i] = 0f; - } - for (int j = i + 1; j < noseqs; j++) + cpwidth++; + + /* + * first pass: record features types in column for each sequence + */ + Map> sfap = findFeatureTypesAtColumn( + seqs, cpos); + + /* + * count feature types on either i'th or j'th sequence but not both + * and add this 'distance' measure to the total for [i, j] for j > i + */ + for (int i = 0; i < (noseqs - 1); i++) { - int sfcommon = 0; - // compare the two lists of features... - Hashtable fi = sfap.get(i), fk, fj = sfap - .get(j); - if (fi.size() > fj.size()) + for (int j = i + 1; j < noseqs; j++) { - fk = fj; + int seqDistance = countUnsharedFeatureTypes(sfap.get(seqs[i]), + sfap.get(seqs[j])); + distance[i][j] += seqDistance; + // distance[j][i] += distance[i][j]; } - else - { - fk = fi; - fi = fj; - } - for (String k : fi.keySet()) - { - SequenceFeature sfj = fk.get(k); - if (sfj != null) - { - sfcommon++; - } - } - distance[i][j] += (fi.size() + fk.size() - 2f * sfcommon); - distance[j][i] += distance[i][j]; } } } + + /* + * normalise the distance scores (summed over columns) by the + * number of visible columns used in the calculation + */ for (int i = 0; i < noseqs; i++) { for (int j = i + 1; j < noseqs; j++) @@ -133,6 +115,62 @@ public class FeatureScoreModel implements ScoreModelI, ViewBasedAnalysisI return distance; } + /** + * Returns the count of values that are set1 or set2 but not in both + * + * @param set1 + * @param set2 + * @return + */ + protected int countUnsharedFeatureTypes(Set set1, Set set2) + { + int size1 = set1.size(); + int size2 = set2.size(); + Set smallerSet = size1 < size2 ? set1 : set2; + Set largerSet = (smallerSet == set1 ? set2 : set1); + int inCommon = 0; + for (String k : smallerSet) + { + if (largerSet.contains(k)) + { + inCommon++; + } + } + + int notInCommon = (size1 - inCommon) + (size2 - inCommon); + return notInCommon; + } + + /** + * Builds and returns a list (one per SeqCigar) of visible feature types at + * the given column position + * + * @param seqs + * @param columnPosition + * @return + */ + protected Map> findFeatureTypesAtColumn( + SeqCigar[] seqs, int columnPosition) + { + Map> sfap = new HashMap>(); + for (SeqCigar seq : seqs) + { + Set types = new HashSet(); + int spos = seq.findPosition(columnPosition); + if (spos != -1) + { + List sfs = fr.findFeaturesAtRes(seq.getRefSeq(), + spos); + for (SequenceFeature sf : sfs) + { + types.add(sf.getType()); + } + } + sfap.put(seq, types); + } + return sfap; + } + @Override public String getName() { @@ -151,6 +189,7 @@ public class FeatureScoreModel implements ScoreModelI, ViewBasedAnalysisI return true; } + @Override public String toString() { return "Score between sequences based on hamming distance between binary vectors marking features displayed at each column";