/*
* Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
* Copyright (C) $$Year-Rel$$ The Jalview Authors
*
* This file is part of Jalview.
*
* Jalview is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* Jalview is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Jalview. If not, see <http://www.gnu.org/licenses/>.
* The Jalview Authors are detailed in the 'AUTHORS' file.
*/
package jalview.analysis;
import jalview.bin.Cache;
import jalview.datamodel.AlignmentAnnotation;
import jalview.datamodel.HiddenMarkovModel;
import jalview.datamodel.PDBEntry;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Vector;
import static java.lang.String.format;
public class SeqsetUtils
{
public static class SequenceInfo {
private String name;
private int start;
private int end;
private Optional description = Optional.empty();
private Optional> features = Optional.empty();
private Optional> pdbId = Optional.empty();
private Optional dataset = Optional.empty();
private Optional hmm = Optional.empty();
private Optional searchScores = Optional.empty();
private SequenceInfo(String name, int start, int end) {
this.name = name;
this.start = start;
this.end = end;
}
}
/**
* Store essential properties of a sequence in a hashtable for later recovery
* Keys are Name, Start, End, SeqFeatures, PdbId, HMM
*
* @param seq
* SequenceI
* @return Hashtable
*/
public static SequenceInfo SeqCharacterHash(SequenceI seq)
{
SequenceInfo sqinfo = new SequenceInfo(seq.getName(), seq.getStart(), seq.getEnd());
sqinfo.description = Optional.ofNullable(seq.getDescription());
sqinfo.dataset = Optional.ofNullable(seq.getDatasetSequence());
if (!sqinfo.dataset.isPresent())
{
ArrayList feats = new ArrayList<>(
seq.getFeatures().getAllFeatures());
sqinfo.features = Optional.of(feats);
sqinfo.pdbId = Optional.of(Objects.requireNonNullElse(
seq.getAllPDBEntries(), new ArrayList<>()));
}
if (seq.hasHMMProfile())
{
sqinfo.hmm = Optional.of(seq.getHMM());
}
sqinfo.searchScores = Optional.ofNullable(seq.getAnnotation("Search Scores"));
return sqinfo;
}
/**
* Recover essential properties of a sequence from a hashtable TODO: replace
* these methods with something more elegant.
*
* @param sq
* SequenceI
* @param sqinfo
* Hashtable
* @return boolean true if name was not updated from sqinfo Name entry
*/
public static boolean SeqCharacterUnhash(SequenceI sq, SequenceInfo sqinfo)
{
if (sqinfo == null)
{
return false;
}
if (sqinfo.name != null)
{
sq.setName(sqinfo.name);
}
sq.setStart(sqinfo.start);
sq.setEnd(sqinfo.end);
if (sqinfo.pdbId.isPresent() && !sqinfo.pdbId.get().isEmpty())
sq.setPDBId(new Vector<>(sqinfo.pdbId.get()));
if (sqinfo.features.isPresent() && !sqinfo.features.get().isEmpty())
sq.setSequenceFeatures(sqinfo.features.get());
if (sqinfo.description.isPresent())
sq.setDescription(sqinfo.description.get());
if (sqinfo.dataset.isPresent())
{
if (sqinfo.features.isPresent())
{
Cache.log.warn("Setting dataset sequence for a sequence which has " +
"sequence features. Dataset sequence features will not be visible.");
assert false;
}
sq.setDatasetSequence(sqinfo.dataset.get());
}
if (sqinfo.hmm.isPresent())
sq.setHMM(new HiddenMarkovModel(sqinfo.hmm.get(), sq));
if (sqinfo.searchScores.isPresent())
{
for (AlignmentAnnotation score : sqinfo.searchScores.get())
{
sq.addAlignmentAnnotation(score);
}
}
return sqinfo.name != null;
}
/**
* Form of the unique name used in uniquify for the i'th sequence in an
* ordered vector of sequences.
*
* @param i
* int
* @return String
*/
public static String unique_name(int i)
{
return String.format("Sequence%d", i);
}
/**
* Generates a hash of SeqCharacterHash properties for each sequence in a
* sequence set, and optionally renames the sequences to an unambiguous 'safe'
* name.
*
* @param sequences
* SequenceI[]
* @param write_names
* boolean set this to rename each of the sequences to its
* unique_name(index) name
* @return Hashtable to be passed to
* @see deuniquify to recover original names (and properties) for renamed
* sequences
*/
public static Map uniquify(SequenceI[] sequences,
boolean write_names)
{
// Generate a safely named sequence set and a hash to recover the sequence
// names
HashMap map = new HashMap<>();
// String[] un_names = new String[sequences.length];
for (int i = 0; i < sequences.length; i++)
{
String safename = unique_name(i);
map.put(safename, SeqCharacterHash(sequences[i]));
if (write_names)
{
sequences[i].setName(safename);
}
}
return map;
}
/**
* recover unsafe sequence names and original properties for a sequence set
* using a map generated by
*
* @see uniquify(sequences,true)
* @param map
* Hashtable
* @param sequences
* SequenceI[]
* @return boolean
*/
public static boolean deuniquify(Map map,
SequenceI[] sequences)
{
return deuniquify(map, sequences, true);
}
/**
* recover unsafe sequence names and original properties for a sequence set
* using a map generated by
*
* @see uniquify(sequences,true)
* @param map
* Hashtable
* @param sequences
* SequenceI[]
* @param quiet
* when false, don't complain about sequences without any data in the
* map.
* @return boolean
*/
public static boolean deuniquify(Map map,
SequenceI[] sequences, boolean quiet)
{
jalview.analysis.SequenceIdMatcher matcher = new SequenceIdMatcher(
sequences);
SequenceI msq = null;
Iterator keys = map.keySet().iterator();
Vector unmatched = new Vector<>();
for (int i = 0, j = sequences.length; i < j; i++)
{
unmatched.addElement(sequences[i]);
}
while (keys.hasNext())
{
String key = keys.next();
try {
if ((msq = matcher.findIdMatch((String) key)) != null)
{
SequenceInfo sqinfo = map.get(key);
unmatched.removeElement(msq);
SeqCharacterUnhash(msq, sqinfo);
}
else
{
if (!quiet)
{
Cache.log.warn(format("Can't find '%s' in uniquified alignment",
key));
}
}
} catch (ClassCastException ccastex) {
if (!quiet)
{
Cache.log.error("Unexpected object in SeqSet map : "+ key.getClass());
}
}
}
if (unmatched.size() > 0 && !quiet)
{
StringBuilder sb = new StringBuilder("Did not find match for sequences: ");
Enumeration i = unmatched.elements();
sb.append(i.nextElement().getName());
for (; i.hasMoreElements();)
{
sb.append(", " + i.nextElement().getName());
}
Cache.log.warn(sb.toString());
return false;
}
return true;
}
/**
* returns a subset of the sequenceI seuqences, including only those that
* contain at least one residue.
*
* @param sequences
* SequenceI[]
* @return SequenceI[]
*/
public static SequenceI[] getNonEmptySequenceSet(SequenceI[] sequences)
{
// Identify first row of alignment with residues for prediction
boolean ungapped[] = new boolean[sequences.length];
int msflen = 0;
for (int i = 0, j = sequences.length; i < j; i++)
{
String tempseq = jalview.analysis.AlignSeq.extractGaps(
jalview.util.Comparison.GapChars,
sequences[i].getSequenceAsString());
if (tempseq.length() == 0)
{
ungapped[i] = false;
}
else
{
ungapped[i] = true;
msflen++;
}
}
if (msflen == 0)
{
return null; // no minimal set
}
// compose minimal set
SequenceI[] mset = new SequenceI[msflen];
for (int i = 0, j = sequences.length, k = 0; i < j; i++)
{
if (ungapped[i])
{
mset[k++] = sequences[i];
}
}
ungapped = null;
return mset;
}
}