/*
 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
 * Copyright (C) $$Year-Rel$$ The Jalview Authors
 * 
 * This file is part of Jalview.
 * 
 * Jalview is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License 
 * as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *  
 * Jalview is distributed in the hope that it will be useful, but 
 * WITHOUT ANY WARRANTY; without even the implied warranty 
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR 
 * PURPOSE.  See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
 * The Jalview Authors are detailed in the 'AUTHORS' file.
 */
package jalview.analysis;

import jalview.bin.Cache;
import jalview.bin.Console;
import jalview.datamodel.AlignmentAnnotation;
import jalview.datamodel.HiddenMarkovModel;
import jalview.datamodel.PDBEntry;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;

import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Vector;

import static java.lang.String.format;

/**
 * Static helpers for temporarily renaming a set of sequences to unambiguous
 * 'safe' names and later restoring their original names and properties.
 */
public class SeqsetUtils
{
  /**
   * Snapshot of the properties of a sequence captured by
   * {@link SeqsetUtils#SeqCharacterHash(SequenceI)} and written back by
   * {@link SeqsetUtils#SeqCharacterUnhash(SequenceI, SequenceInfo)}. Instances
   * are opaque to callers: they can only be created and read by the enclosing
   * class.
   */
  public static class SequenceInfo
  {
    private final String name;

    private final int start;

    private final int end;

    private Optional<String> description = Optional.empty();

    private Optional<List<SequenceFeature>> features = Optional.empty();

    private Optional<List<PDBEntry>> pdbId = Optional.empty();

    private Optional<SequenceI> dataset = Optional.empty();

    private Optional<HiddenMarkovModel> hmm = Optional.empty();

    private Optional<AlignmentAnnotation[]> searchScores = Optional.empty();

    private SequenceInfo(String name, int start, int end)
    {
      this.name = name;
      this.start = start;
      this.end = end;
    }
  }

  /**
   * Store essential properties of a sequence in a {@link SequenceInfo} for
   * later recovery. Name, start and end are always recorded; description,
   * dataset sequence, HMM profile and the annotations returned by
   * {@code seq.getAnnotation("Search Scores")} are recorded when available.
   * Sequence features and PDB entries are recorded only when the sequence has
   * no dataset sequence.
   * 
   * @param seq
   *          the sequence whose properties are captured
   * @return the captured properties
   */
  public static SequenceInfo SeqCharacterHash(SequenceI seq)
  {
    SequenceInfo sqinfo = new SequenceInfo(seq.getName(), seq.getStart(),
            seq.getEnd());
    sqinfo.description = Optional.ofNullable(seq.getDescription());
    sqinfo.dataset = Optional.ofNullable(seq.getDatasetSequence());
    if (!sqinfo.dataset.isPresent())
    {
      // features are copied; the PDB entry list is stored as returned by
      // the sequence (an empty list stands in for null)
      List<SequenceFeature> feats = new ArrayList<>(
              seq.getFeatures().getAllFeatures());
      List<PDBEntry> pdbEntries = Objects.requireNonNullElse(
              seq.getAllPDBEntries(), new ArrayList<>());
      sqinfo.features = Optional.of(feats);
      sqinfo.pdbId = Optional.of(pdbEntries);
    }
    if (seq.hasHMMProfile())
    {
      sqinfo.hmm = Optional.of(seq.getHMM());
    }
    sqinfo.searchScores = Optional
            .ofNullable(seq.getAnnotation("Search Scores"));
    return sqinfo;
  }

  /**
   * Recover essential properties of a sequence from a {@link SequenceInfo}
   * produced by {@link #SeqCharacterHash(SequenceI)}. TODO: replace these
   * methods with something more elegant.
   * 
   * @param sq
   *          the sequence to update
   * @param sqinfo
   *          the stored properties; when null the sequence is left untouched
   * @return true if the name of {@code sq} was set from {@code sqinfo}; false
   *         if {@code sqinfo} is null or holds no name
   */
  public static boolean SeqCharacterUnhash(SequenceI sq,
          SequenceInfo sqinfo)
  {
    if (sqinfo == null)
    {
      return false;
    }
    if (sqinfo.name != null)
    {
      sq.setName(sqinfo.name);
    }
    sq.setStart(sqinfo.start);
    sq.setEnd(sqinfo.end);
    // empty PDB/feature lists are skipped so that existing values on sq are
    // not replaced with nothing
    if (sqinfo.pdbId.isPresent() && !sqinfo.pdbId.get().isEmpty())
    {
      sq.setPDBId(new Vector<>(sqinfo.pdbId.get()));
    }
    if (sqinfo.features.isPresent() && !sqinfo.features.get().isEmpty())
    {
      sq.setSequenceFeatures(sqinfo.features.get());
    }
    if (sqinfo.description.isPresent())
    {
      sq.setDescription(sqinfo.description.get());
    }
    if (sqinfo.dataset.isPresent())
    {
      if (sqinfo.features.isPresent())
      {
        // SeqCharacterHash only records features when there is no dataset
        // sequence, so having both indicates an inconsistent SequenceInfo
        Console.warn("Setting dataset sequence for a sequence which has "
                + "sequence features. Dataset sequence features will not be visible.");
        assert false;
      }
      sq.setDatasetSequence(sqinfo.dataset.get());
    }
    if (sqinfo.hmm.isPresent())
    {
      sq.setHMM(new HiddenMarkovModel(sqinfo.hmm.get(), sq));
    }
    if (sqinfo.searchScores.isPresent())
    {
      for (AlignmentAnnotation score : sqinfo.searchScores.get())
      {
        sq.addAlignmentAnnotation(score);
      }
    }
    return sqinfo.name != null;
  }

  /**
   * Form of the unique name used in uniquify for the i'th sequence in an
   * ordered vector of sequences.
   * 
   * @param i
   *          index of the sequence
   * @return {@code "Sequence"} followed by the decimal value of {@code i}
   */
  public static String unique_name(int i)
  {
    // plain concatenation always yields ASCII digits, whereas
    // String.format("%d") localises digits for the default locale
    return "Sequence" + i;
  }

  /**
   * Generates a map of SeqCharacterHash properties for each sequence in a
   * sequence set, and optionally renames the sequences to an unambiguous
   * 'safe' name.
   * 
   * @param sequences
   *          the sequences to record
   * @param write_names
   *          set this to rename each of the sequences to its
   *          unique_name(index) name
   * @return map from unique_name(index) to the stored properties of the
   *         sequence at that index, to be passed to
   * @see #deuniquify(Map, SequenceI[]) to recover original names (and
   *      properties) for renamed sequences
   */
  public static Map<String, SequenceInfo> uniquify(SequenceI[] sequences,
          boolean write_names)
  {
    // Generate a safely named sequence set and a map to recover the
    // sequence names
    Map<String, SequenceInfo> map = new HashMap<>();
    for (int i = 0; i < sequences.length; i++)
    {
      String safename = unique_name(i);
      // capture the properties before the name is overwritten
      map.put(safename, SeqCharacterHash(sequences[i]));
      if (write_names)
      {
        sequences[i].setName(safename);
      }
    }
    return map;
  }

  /**
   * Recover unsafe sequence names and original properties for a sequence set
   * using a map generated by uniquify(sequences,true). No warnings are
   * logged; this delegates to
   * {@link #deuniquify(Map, SequenceI[], boolean)} with {@code quiet} set to
   * true.
   * 
   * @param map
   *          map of safe name to stored properties
   * @param sequences
   *          the sequences to restore
   * @return always true, since unmatched sequences are only reported when not
   *         quiet
   */
  public static boolean deuniquify(Map<String, SequenceInfo> map,
          SequenceI[] sequences)
  {
    return deuniquify(map, sequences, true);
  }

  /**
   * Recover unsafe sequence names and original properties for a sequence set
   * using a map generated by uniquify(sequences,true).
   * 
   * @param map
   *          map of safe name to stored properties
   * @param sequences
   *          the sequences to restore
   * @param quiet
   *          when true, don't complain about map keys without a matching
   *          sequence or sequences without any data in the map
   * @return false only when {@code quiet} is false and at least one sequence
   *         had no entry in the map; true otherwise
   */
  public static boolean deuniquify(Map<String, SequenceInfo> map,
          SequenceI[] sequences, boolean quiet)
  {
    SequenceIdMatcher matcher = new SequenceIdMatcher(sequences);
    List<SequenceI> unmatched = new ArrayList<>(sequences.length);
    for (SequenceI sequence : sequences)
    {
      unmatched.add(sequence);
    }

    for (String key : map.keySet())
    {
      try
      {
        SequenceI msq = matcher.findIdMatch(key);
        if (msq != null)
        {
          SequenceInfo sqinfo = map.get(key);
          unmatched.remove(msq);
          SeqCharacterUnhash(msq, sqinfo);
        }
        else if (!quiet)
        {
          Console.warn(
                  format("Can't find '%s' in uniquified alignment", key));
        }
      } catch (ClassCastException ccastex)
      {
        // only expected if the map was populated through a raw type
        if (!quiet)
        {
          Console.error(
                  "Unexpected object in SeqSet map : " + key.getClass());
        }
      }
    }

    if (!unmatched.isEmpty() && !quiet)
    {
      StringBuilder sb = new StringBuilder(
              "Did not find match for sequences: ");
      String separator = "";
      for (SequenceI sequence : unmatched)
      {
        sb.append(separator).append(sequence.getName());
        separator = ", ";
      }
      Console.warn(sb.toString());
      return false;
    }
    return true;
  }

  /**
   * Returns a subset of the given sequences, including only those that
   * contain at least one residue, in their original order.
   * 
   * @param sequences
   *          the sequences to filter
   * @return the sequences that are non-empty after gap characters are
   *         removed, or null if there are none
   */
  public static SequenceI[] getNonEmptySequenceSet(SequenceI[] sequences)
  {
    List<SequenceI> nonEmpty = new ArrayList<>(sequences.length);
    for (SequenceI sequence : sequences)
    {
      String ungapped = AlignSeq.extractGaps(
              jalview.util.Comparison.GapChars,
              sequence.getSequenceAsString());
      if (ungapped.length() > 0)
      {
        nonEmpty.add(sequence);
      }
    }
    if (nonEmpty.isEmpty())
    {
      return null; // no minimal set
    }
    return nonEmpty.toArray(new SequenceI[0]);
  }
}