1 package jalview.ws.datamodel.alphafold;
4 import java.io.BufferedInputStream;
6 import java.io.FileInputStream;
7 import java.io.IOException;
8 import java.util.ArrayList;
9 import java.util.BitSet;
10 import java.util.HashMap;
11 import java.util.Iterator;
12 import java.util.List;
15 import org.json.simple.JSONObject;
17 import jalview.analysis.AverageDistanceEngine;
18 import jalview.bin.Console;
19 import jalview.datamodel.BinaryNode;
20 import jalview.datamodel.ContactListI;
21 import jalview.datamodel.ContactListImpl;
22 import jalview.datamodel.ContactListProviderI;
23 import jalview.datamodel.ContactMatrixI;
24 import jalview.datamodel.SequenceDummy;
25 import jalview.datamodel.SequenceI;
26 import jalview.io.DataSourceType;
27 import jalview.io.FileFormatException;
28 import jalview.io.FileParse;
29 import jalview.util.MapUtils;
30 import jalview.ws.dbsources.EBIAlfaFold;
// Contact matrix holding a PAE (predicted alignment error) score table,
// exposed through the ContactMatrixI interface.
32 public class PAEContactMatrix implements ContactMatrixI
// Sequence this matrix is associated with; setRefSeq() walks it back along
// its getDatasetSequence() chain.
35 SequenceI refSeq = null;
38 * the length that refSeq is expected to be (excluding gaps, of course)
// Row and column extents of the score table; both are zero until a
// constructor or parser assigns them.
42 int maxrow = 0, maxcol = 0;
// NOTE(review): indices1/indices2 are not read or written by any code
// visible alongside this declaration - confirm they are still needed.
44 int[] indices1, indices2;
// Associates this matrix with a reference sequence.
// The refSeq field is walked along getDatasetSequence() until no further
// dataset sequence exists, and the expected length is taken from the
// inclusive start/end range of the sequence that was passed in (hence + 1).
// NOTE(review): the loop dereferences the refSeq field, so it is presumably
// assigned from _refSeq before the loop runs - confirm.
50 private void setRefSeq(SequenceI _refSeq)
53 while (refSeq.getDatasetSequence() != null)
55 refSeq = refSeq.getDatasetSequence();
57 length = _refSeq.getEnd() - _refSeq.getStart() + 1;
// Builds a PAE matrix for _refSeq from a parsed JSON dictionary.
// The layout is chosen from the dictionary's keys: when
// MapUtils.containsAKey finds neither "predicted_aligned_error" nor "pae"
// the version 1 parser is used; the version 2 parser handles the other case.
// Declared to throw FileFormatException.
60 @SuppressWarnings("unchecked")
61 public PAEContactMatrix(SequenceI _refSeq, Map<String, Object> pae_obj) throws FileFormatException
64 // convert the lists to primitive arrays and store
66 if (!MapUtils.containsAKey(pae_obj, "predicted_aligned_error", "pae"))
68 parse_version_1_pAE(pae_obj);
// NOTE(review): presumably the else branch of the key test above - confirm.
73 parse_version_2_pAE(pae_obj);
78 * construct a sequence associated PAE matrix directly from a float array
// Builds a PAE matrix for _refSeq directly from an array of score rows.
// Each row's length is compared against maxcol while scanning, and maxrow
// is set to the number of rows supplied.
83 public PAEContactMatrix(SequenceI _refSeq, float[][] matrix)
87 for (float[] row : matrix)
// NOTE(review): presumably widens maxcol to the longest row seen - confirm.
89 if (row.length > maxcol)
102 maxrow = matrix.length;
108 * parse a sane JSON representation of the pAE
// Parses the current JSON layout: a list of score rows stored under
// "predicted_aligned_error" or "pae", with an optional maximum stored under
// "max_predicted_aligned_error" or "max_pae".
112 @SuppressWarnings("unchecked")
113 private void parse_version_2_pAE(Map<String, Object> pae_obj)
116 // look for a maxscore element - if there is one...
119 // this is never going to be reached by the integer rounding.. or is it ?
120 maxscore = ((Double) MapUtils.getFirst(pae_obj,
121 "max_predicted_aligned_error", "max_pae")).floatValue();
122 } catch (Throwable t)
124 // ignore if a key is not found.
126 List<List<Long>> scoreRows = ((List<List<Long>>) MapUtils
127 .getFirst(pae_obj, "predicted_aligned_error", "pae"));
// The table is allocated square, sized in both dimensions by the row count.
// NOTE(review): a row holding more values than there are rows would overrun
// the allocation - confirm inputs are always square.
128 elements = new float[scoreRows.size()][scoreRows.size()];
129 int row = 0, col = 0;
130 for (List<Long> scoreRow : scoreRows)
132 Iterator<Long> scores = scoreRow.iterator();
133 while (scores.hasNext())
// Although declared as Long, each value is inspected at runtime because the
// list may also hold Double instances.
135 Object d = scores.next();
137 if (d instanceof Double)
// NOTE(review): longValue() drops the fractional part of a Double score
// before it is stored as a float; floatValue() looks like what was
// intended - confirm.
139 elements[row][col++] = ((Double) d).longValue();
143 elements[row][col++] = (float) ((Long) d).longValue();
// col has already been advanced, so col - 1 is the cell just written; keep
// maxscore at the largest value stored so far.
146 if (maxscore < elements[row][col - 1])
148 maxscore = elements[row][col - 1];
159 * v1 format got ditched 28th July 2022 see
160 * https://alphafold.ebi.ac.uk/faq#:~:text=We%20updated%20the%20PAE%20JSON%20file%20format%20on%2028th%20July%202022
// Parses the legacy JSON layout: parallel lists "residue1", "residue2" and
// "distance" supplying a row index, a column index and a score per entry.
164 @SuppressWarnings("unchecked")
165 private void parse_version_1_pAE(Map<String, Object> pae_obj)
167 // assume indices are with respect to range defined by _refSeq on the
169 Iterator<Long> rows = ((List<Long>) pae_obj.get("residue1")).iterator();
170 Iterator<Long> cols = ((List<Long>) pae_obj.get("residue2")).iterator();
171 // two pass - to allocate the elements array
172 while (rows.hasNext())
174 int row = rows.next().intValue();
175 int col = cols.next().intValue();
// Second pass: restart both index iterators so they can be read in step
// with the scores.
186 rows = ((List<Long>) pae_obj.get("residue1")).iterator();
187 cols = ((List<Long>) pae_obj.get("residue2")).iterator();
188 Iterator<Double> scores = ((List<Double>) pae_obj.get("distance"))
// NOTE(review): maxrow and maxcol are presumably established by the first
// pass above - confirm.
190 elements = new float[maxrow][maxcol];
191 while (scores.hasNext())
193 float escore = scores.next().floatValue();
194 int row = rows.next().intValue();
195 int col = cols.next().intValue();
// Incoming indices are shifted down by one, i.e. treated as 1-based.
204 elements[row - 1][col - 1] = escore;
// NOTE(review): unlike parse_version_2_pAE, no fallback is visible here for
// a dictionary lacking both maximum keys; the cast and floatValue() call
// would presumably fail in that case - confirm.
207 maxscore = ((Double) MapUtils.getFirst(pae_obj,
208 "max_predicted_aligned_error", "max_pae")).floatValue();
// Returns the contacts for one column of the matrix as a ContactListI.
// The requested column is range-checked against the table, and the result
// wraps an anonymous ContactListProviderI that reads from elements[_column].
212 public ContactListI getContactList(final int _column)
214 if (_column < 0 || _column >= elements.length)
219 return new ContactListImpl(new ContactListProviderI()
222 public int getPosition()
228 public int getContactHeight()
234 public double getContactAt(int column)
// The position is bounds-checked against this row's length before the row
// is indexed.
236 if (column < 0 || column >= elements[_column].length)
240 return elements[_column][column];
246 public float getMin()
252 public float getMax()
258 public boolean hasReferenceSeq()
260 return (refSeq != null);
264 public SequenceI getReferenceSeq()
270 public String getAnnotDescr()
272 return "Predicted Alignment Error"+((refSeq==null) ? "" : (" for " + refSeq.getName()));
276 public String getAnnotLabel()
278 StringBuilder label = new StringBuilder("PAE Matrix");
279 //if (this.getReferenceSeq() != null)
281 // label.append(":").append(this.getReferenceSeq().getDisplayId(false));
283 return label.toString();
286 public static final String PAEMATRIX = "PAE_MATRIX";
289 public String getType()
295 public int getWidth()
301 public int getHeight()
305 List<BitSet> groups=null;
307 public boolean hasGroups()
313 public String getNewick()
318 public boolean hasTree()
320 return newick!=null && newick.length()>0;
324 String treeType=null;
// Clusters this matrix and partitions the result into groups.
// An AverageDistanceEngine is built over the matrix, its tree is serialised
// to a Newick string, and the tree is cut to obtain one root node per group.
// thresh - cut threshold; abs - when true thresh is compared with the tree
// height as an absolute value, otherwise it must lie strictly between 0 and 1.
325 public void makeGroups(float thresh,boolean abs)
327 AverageDistanceEngine clusterer = new AverageDistanceEngine(null, null, this);
328 double height = clusterer.findHeight(clusterer.getTopNode());
329 newick = new jalview.io.NewickFile(clusterer.getTopNode(),false,true).print();
331 Console.trace("Newick string\n"+newick);
333 List<BinaryNode> nodegroups;
// Cut the tree only when the threshold is usable: below the tree height in
// absolute mode, or a proportion inside (0, 1) otherwise.
334 if (abs ? height > thresh : 0 < thresh && thresh < 1)
// An absolute threshold is converted to a proportion of the tree height.
336 float cut = abs ? (float) (thresh / height) : thresh;
337 Console.debug("Threshold "+cut+" for height="+height);
339 nodegroups = clusterer.groupNodes(cut);
// Otherwise the top node is the only group root.
343 nodegroups = new ArrayList<BinaryNode>();
344 nodegroups.add(clusterer.getTopNode());
// One BitSet per group root, with a bit set for the Integer element of each
// leaf found beneath that root.
348 groups = new ArrayList<>();
349 for (BinaryNode root:nodegroups)
351 BitSet gpset=new BitSet();
352 for (BinaryNode leaf:clusterer.findLeaves(root))
354 gpset.set((Integer)leaf.element());
360 public void updateGroups(List<BitSet> colGroups)
// Looks up the group for a column by scanning the stored groups, and defers
// to the ContactMatrixI default implementation when the scan yields nothing.
// NOTE(review): the groups field can be null (hasCutHeight() tests for it),
// and iterating a null list throws - confirm callers only reach this after
// groups have been assigned.
368 public BitSet getGroupsFor(int column)
// NOTE(review): presumably returns the first group whose bit for this column
// is set - confirm.
370 for (BitSet gp:groups) {
376 return ContactMatrixI.super.getGroupsFor(column);
// Colour assigned to each group, keyed by the group's BitSet.
// NOTE(review): BitSet is mutable and hashes by content, so changing a group
// after it has been used as a key makes its entry unreachable - confirm
// groups are not modified once coloured.
379 HashMap<BitSet,Color> colorMap = new HashMap<>();
// Returns the colour for a group, starting from a lookup in colorMap.
381 public Color getColourForGroup(BitSet bs)
386 Color groupCol=colorMap.get(bs);
394 public void setColorForGroup(BitSet bs,Color color)
396 colorMap.put(bs,color);
398 public void restoreGroups(List<BitSet> newgroups, String treeMethod,
399 String tree, double thresh2)
408 public boolean hasCutHeight() {
409 return groups!=null && thresh!=0;
412 public double getCutHeight()
417 public String getTreeMethod()
422 public static void validateContactMatrixFile(String fileName) throws FileFormatException,IOException
424 FileInputStream infile=null;
426 infile = new FileInputStream(new File(fileName));
427 } catch (Throwable t)
429 new IOException("Couldn't open "+fileName,t);
433 JSONObject paeDict=null;
435 paeDict = EBIAlfaFold.parseJSONtoPAEContactMatrix(infile);
436 } catch (Throwable t)
438 new FileFormatException("Couldn't parse "+fileName+" as a JSON dict or array containing a dict");
441 PAEContactMatrix matrix = new PAEContactMatrix(new SequenceDummy("Predicted"), (Map<String,Object>)paeDict);
442 if (matrix.getWidth()<=0)
444 throw new FileFormatException("No data in PAE matrix read from '"+fileName+"'");