*/
package jalview.analysis;
-import java.util.Enumeration;
-import java.util.List;
-import java.util.Vector;
-import java.util.Hashtable;
-
import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.Alignment;
import jalview.datamodel.AlignmentI;
-import jalview.datamodel.DBRefSource;
import jalview.datamodel.DBRefEntry;
+import jalview.datamodel.DBRefSource;
+import jalview.datamodel.Mapping;
import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
+import jalview.io.gff.SequenceOntology;
+import jalview.schemes.ResidueProperties;
+import jalview.util.DBRefUtils;
+import jalview.util.MapList;
+import jalview.util.MappingUtils;
+import jalview.util.StringUtils;
import jalview.ws.SequenceFetcher;
import jalview.ws.seqfetcher.ASequenceFetcher;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map.Entry;
+import java.util.Vector;
+
/**
* Functions for cross-referencing sequence databases. user must first specify
* if cross-referencing from protein or dna (set dna==true)
public class CrossRef
{
/**
- * get the DNA or protein references for a protein or dna sequence
+ * Select just the DNA or protein database references from the given set
*
- * @param dna
- * @param rfs
+ * @param fromDna
+ * if true, select protein database references
+ * (DBRefSource.PROTEINDBS), i.e. cross-references from DNA; if false,
+ * select DNA coding database references (DBRefSource.DNACODINGDBS)
+ * @param refs
+ * the set of references to select from; passed unchanged to
+ * DBRefUtils.selectRefs
* @return
+ * whatever DBRefUtils.selectRefs returns for the chosen database list
*/
- public static DBRefEntry[] findXDbRefs(boolean dna, DBRefEntry[] rfs)
- {
- if (dna)
- {
- rfs = jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS);
- }
- else
- {
- rfs = jalview.util.DBRefUtils.selectRefs(rfs,
- DBRefSource.DNACODINGDBS); // could attempt to find other cross
- // refs and return here - ie PDB xrefs
- // (not dna, not protein seq)
- }
- return rfs;
- }
-
- public static Hashtable classifyDbRefs(DBRefEntry[] rfs)
+ public static DBRefEntry[] findXDbRefs(boolean fromDna, DBRefEntry[] refs)
{
- Hashtable classes = new Hashtable();
- classes.put(DBRefSource.PROTEINDBS,
- jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS));
- classes.put(DBRefSource.DNACODINGDBS, jalview.util.DBRefUtils
- .selectRefs(rfs, DBRefSource.DNACODINGDBS));
- classes.put(DBRefSource.DOMAINDBS,
- jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.DOMAINDBS));
- // classes.put(OTHER, )
- return classes;
+ return DBRefUtils.selectRefs(refs, fromDna ? DBRefSource.PROTEINDBS
+ : DBRefSource.DNACODINGDBS);
+ // could attempt to find other cross
+ // refs here - ie PDB xrefs
+ // (not dna, not protein seq)
}
/**
SequenceI[] seqs, AlignmentI dataset)
{
String[] dbrefs = null;
- Vector refs = new Vector();
- for (int s = 0; s < seqs.length; s++)
+ List<String> refs = new ArrayList<String>();
+ for (SequenceI seq : seqs)
{
- if (seqs[s] != null)
+ if (seq != null)
{
-
- SequenceI dss = seqs[s];
+ SequenceI dss = seq;
while (dss.getDatasetSequence() != null)
{
dss = dss.getDatasetSequence();
}
DBRefEntry[] rfs = findXDbRefs(dna, dss.getDBRef());
- for (int r = 0; rfs != null && r < rfs.length; r++)
+ if (rfs != null)
{
- if (!refs.contains(rfs[r].getSource()))
+ for (DBRefEntry ref : rfs)
{
- refs.addElement(rfs[r].getSource());
+ if (!refs.contains(ref.getSource()))
+ {
+ refs.add(ref.getSource());
+ }
}
}
if (dataset != null)
{
// search for references to this sequence's direct references.
- DBRefEntry[] lrfs = CrossRef
- .findXDbRefs(!dna, seqs[s].getDBRef());
- Vector rseqs = new Vector();
- CrossRef.searchDatasetXrefs(seqs[s], !dna, lrfs, dataset, rseqs,
+ DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seq.getDBRef());
+ List<SequenceI> rseqs = new ArrayList<SequenceI>();
+ CrossRef.searchDatasetXrefs(seq, !dna, lrfs, dataset, rseqs,
null); // don't need to specify codon frame for mapping here
- Enumeration lr = rseqs.elements();
- while (lr.hasMoreElements())
+ for (SequenceI rs : rseqs)
{
- SequenceI rs = (SequenceI) lr.nextElement();
DBRefEntry[] xrs = findXDbRefs(dna, rs.getDBRef());
- for (int r = 0; rfs != null && r < rfs.length; r++)
+ if (xrs != null)
{
- if (!refs.contains(rfs[r].getSource()))
+ for (DBRefEntry ref : xrs)
{
- refs.addElement(rfs[r].getSource());
+ if (!refs.contains(ref.getSource()))
+ {
+ refs.add(ref.getSource());
+ }
}
}
+ // looks like copy and paste - change rfs to xrs?
+ // for (int r = 0; rfs != null && r < rfs.length; r++)
+ // {
+ // if (!refs.contains(rfs[r].getSource()))
+ // {
+ // refs.add(rfs[r].getSource());
+ // }
+ // }
}
}
}
if (refs.size() > 0)
{
dbrefs = new String[refs.size()];
- refs.copyInto(dbrefs);
+ refs.toArray(dbrefs);
}
return dbrefs;
}
- /*
- * if (dna) { if (rfs[r].hasMap()) { // most likely this is a protein cross
- * reference if (!refs.contains(rfs[r].getSource())) {
- * refs.addElement(rfs[r].getSource()); } } }
- */
public static boolean hasCdnaMap(SequenceI[] seqs)
{
+ // TODO unused - remove?
String[] reftypes = findSequenceXrefTypes(false, seqs);
for (int s = 0; s < reftypes.length; s++)
{
public static SequenceI[] getCdnaMap(SequenceI[] seqs)
{
+ // TODO unused - remove?
Vector cseqs = new Vector();
for (int s = 0; s < seqs.length; s++)
{
/**
*
* @param seqs
+ * sequences whose xrefs are being retrieved
* @param dna
+ * true if sequences are nucleotide
* @param source
* @param dataset
* alignment to search for product sequences.
public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
String source, AlignmentI dataset)
{
- Vector rseqs = new Vector();
- Alignment ral = null;
- AlignedCodonFrame cf = new AlignedCodonFrame(0); // nominal width
- for (int s = 0; s < seqs.length; s++)
+ List<SequenceI> rseqs = new ArrayList<SequenceI>();
+ AlignedCodonFrame cf = new AlignedCodonFrame();
+ for (SequenceI seq : seqs)
{
- SequenceI dss = seqs[s];
+ SequenceI dss = seq;
while (dss.getDatasetSequence() != null)
{
dss = dss.getDatasetSequence();
if ((xrfs == null || xrfs.length == 0) && dataset != null)
{
System.out.println("Attempting to find ds Xrefs refs.");
- DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seqs[s].getDBRef()); // less
- // ambiguous
- // would
- // be a
- // 'find
- // primary
- // dbRefEntry'
- // method.
+ // FIXME should be dss not seq here?
+ DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seq.getDBRef());
+ // less ambiguous would be a 'find primary dbRefEntry' method.
// filter for desired source xref here
found = CrossRef.searchDatasetXrefs(dss, !dna, lrfs, dataset,
rseqs, cf);
}
for (int r = 0; xrfs != null && r < xrfs.length; r++)
{
- if (source != null && !source.equals(xrfs[r].getSource()))
+ DBRefEntry xref = xrfs[r];
+ if (source != null && !source.equals(xref.getSource()))
+ {
continue;
- if (xrfs[r].hasMap())
+ }
+ if (xref.hasMap())
{
- if (xrfs[r].getMap().getTo() != null)
+ if (xref.getMap().getTo() != null)
{
- Sequence rsq = new Sequence(xrfs[r].getMap().getTo());
- rseqs.addElement(rsq);
- if (xrfs[r].getMap().getMap().getFromRatio() != xrfs[r]
+ SequenceI rsq = new Sequence(xref.getMap().getTo());
+ rseqs.add(rsq);
+ if (xref.getMap().getMap().getFromRatio() != xref
.getMap().getMap().getToRatio())
{
// get sense of map correct for adding to product alignment.
if (dna)
{
// map is from dna seq to a protein product
- cf.addMap(dss, rsq, xrfs[r].getMap().getMap());
+ cf.addMap(dss, rsq, xref.getMap().getMap());
}
else
{
// map should be from protein seq to its coding dna
- cf.addMap(rsq, dss, xrfs[r].getMap().getMap().getInverse());
+ cf.addMap(rsq, dss, xref.getMap().getMap().getInverse());
}
+
+ /*
+ * compute peptide variants from dna variants
+ */
+ rsq.createDatasetSequence();
+ computeProteinVariants(seq, rsq, xref.getMap().getMap());
}
found = true;
}
// xrefs on this sequence.
if (dataset != null)
{
- found |= searchDataset(dss, xrfs[r], dataset, rseqs, cf); // ,false,!dna);
+ found |= searchDataset(dss, xref, dataset, rseqs, cf); // ,false,!dna);
if (found)
+ {
xrfs[r] = null; // we've recovered seqs for this one.
+ }
}
}
}
for (int r = 0; r < xrfs.length; r++)
{
if (xrfs[r] != null)
+ {
t[l++] = xrfs[r];
+ }
}
xrfs = t;
try
{
- retrieved = sftch.getSequences(xrfs); // problem here is we don't
- // know which of xrfs
- // resulted in which
+ retrieved = sftch.getSequences(xrfs, !dna);
+ // problem here is we don't know which of xrfs resulted in which
// retrieved element
} catch (Exception e)
{
System.err
.println("Problem whilst retrieving cross references for Sequence : "
- + seqs[s].getName());
+ + seq.getName());
e.printStackTrace();
}
if (retrieved != null)
for (int rs = 0; rs < retrieved.length; rs++)
{
// TODO: examine each sequence for 'redundancy'
- jalview.datamodel.DBRefEntry[] dbr = retrieved[rs]
- .getDBRef();
+ DBRefEntry[] dbr = retrieved[rs].getDBRef();
if (dbr != null && dbr.length > 0)
{
for (int di = 0; di < dbr.length; di++)
{
// find any entry where we should put in the sequence being
// cross-referenced into the map
- jalview.datamodel.Mapping map = dbr[di].getMap();
+ Mapping map = dbr[di].getMap();
if (map != null)
{
if (map.getTo() != null && map.getMap() != null)
}
}
retrieved[rs].updatePDBIds();
- rseqs.addElement(retrieved[rs]);
+ rseqs.add(retrieved[rs]);
}
}
}
}
}
}
+
+ Alignment ral = null;
if (rseqs.size() > 0)
{
SequenceI[] rsqs = new SequenceI[rseqs.size()];
- rseqs.copyInto(rsqs);
+ rseqs.toArray(rsqs);
ral = new Alignment(rsqs);
- if (cf != null && cf.getProtMappings() != null)
+ if (cf != null && !cf.isEmpty())
{
ral.addCodonFrame(cf);
}
* @return true if matches were found.
*/
private static boolean searchDatasetXrefs(SequenceI sequenceI,
- boolean dna, DBRefEntry[] lrfs, AlignmentI dataset, Vector rseqs,
- AlignedCodonFrame cf)
+ boolean dna, DBRefEntry[] lrfs, AlignmentI dataset,
+ List<SequenceI> rseqs, AlignedCodonFrame cf)
{
boolean found = false;
if (lrfs == null)
+ {
return false;
+ }
for (int i = 0; i < lrfs.length; i++)
{
DBRefEntry xref = new DBRefEntry(lrfs[i]);
* @return true if one or more unique sequences were found and added
*/
public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
- AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf)
+ AlignmentI dataset, List<SequenceI> rseqs, AlignedCodonFrame cf)
{
return searchDataset(sequenceI, xrf, dataset, rseqs, cf, true, false);
}
* @return true if relationship found and sequence added.
*/
public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
- AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf,
+ AlignmentI dataset, List<SequenceI> rseqs, AlignedCodonFrame cf,
boolean direct, boolean dna)
{
boolean found = false;
SequenceI[] typer = new SequenceI[1];
if (dataset == null)
+ {
return false;
+ }
if (dataset.getSequences() == null)
{
System.err.println("Empty dataset sequence set - NO VECTOR");
synchronized (ds = dataset.getSequences())
{
for (SequenceI nxt : ds)
+ {
if (nxt != null)
{
if (nxt.getDatasetSequence() != null)
{
if (!rseqs.contains(nxt))
{
- rseqs.addElement(nxt);
- boolean foundmap = cf != null; // don't search if we aren't
- // given
- // a codon map object
+ rseqs.add(nxt);
+ boolean foundmap = cf != null;
+ // don't search if we aren't given a codon map object
for (int r = 0; foundmap && r < cands.length; r++)
{
if (cands[r].hasMap())
}
}
+ }
}
return found;
}
/**
+   * Computes variants in the peptide product generated by variants in dna, and
+   * adds them as features of type SequenceOntology.SEQUENCE_VARIANT on the
+   * peptide's dataset sequence.
+   * <p>
+   * Only single-position dna features (begin == end) whose type is a sequence
+   * variant, which map to a peptide position, and which carry an "alleles"
+   * value (comma-separated) are considered. Features whose codon cannot be
+   * resolved to three dna positions are skipped.
+   *
+   * @param dnaSeq
+   *          the sequence whose features are scanned and whose bases are read
+   *          at the codon positions
+   * @param peptide
+   *          the protein sequence; its dataset sequence must already exist as
+   *          the new features are added to it
+   * @param dnaToProtein
+   *          the mapping used to convert dna positions to peptide positions
+   *          (and back to codon positions)
+   * @return the number of variant features added
+   */
+  protected static int computeProteinVariants(SequenceI dnaSeq,
+          SequenceI peptide, MapList dnaToProtein)
+  {
+    /*
+     * map from peptide position to the bases (current base first, then any
+     * alleles) recorded for each of the three codon positions; LinkedHashMap
+     * means features are added in the order peptide positions are first met
+     */
+    LinkedHashMap<Integer, String[][]> variants = new LinkedHashMap<Integer, String[][]>();
+    SequenceOntology so = SequenceOntology.getInstance();
+
+    SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures();
+    if (dnaFeatures == null)
+    {
+      return 0;
+    }
+
+    int[] lastCodon = null;
+    int lastPeptidePosition = 0;
+
+    /*
+     * build a map of codon variations for peptides
+     */
+    for (SequenceFeature sf : dnaFeatures)
+    {
+      int dnaCol = sf.getBegin();
+      if (dnaCol != sf.getEnd())
+      {
+        // not handling multi-locus variant features
+        continue;
+      }
+      if (!so.isSequenceVariant(sf.getType()))
+      {
+        continue;
+      }
+      int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol);
+      if (mapsTo == null)
+      {
+        // feature doesn't lie within coding region
+        continue;
+      }
+
+      /*
+       * extract dna variants to a string array; this is checked BEFORE an
+       * entry is registered for the peptide position, so a feature without
+       * alleles can never leave an entry whose codon arrays are all null
+       */
+      String alls = (String) sf.getValue("alleles");
+      if (alls == null)
+      {
+        continue;
+      }
+      String[] alleles = alls.split(",");
+      int peptidePosition = mapsTo[0];
+
+      /*
+       * get this peptide's codon positions e.g. [3, 4, 5] or [4, 7, 10]
+       * (reusing the previous lookup if it was for the same peptide position)
+       */
+      int[] codon = peptidePosition == lastPeptidePosition ? lastCodon
+              : MappingUtils.flattenRanges(dnaToProtein.locateInFrom(
+                      peptidePosition, peptidePosition));
+      lastPeptidePosition = peptidePosition;
+      lastCodon = codon;
+      if (codon == null || codon.length < 3)
+      {
+        // incomplete codon - cannot translate, so skip this variant
+        continue;
+      }
+
+      String[][] codonVariants = variants.get(peptidePosition);
+      if (codonVariants == null)
+      {
+        codonVariants = new String[3][];
+        variants.put(peptidePosition, codonVariants);
+      }
+
+      /*
+       * save nucleotide (and this variant) for each codon position
+       */
+      for (int codonPos = 0; codonPos < 3; codonPos++)
+      {
+        String nucleotide = String.valueOf(dnaSeq
+                .getCharAt(codon[codonPos] - 1));
+        if (codon[codonPos] == dnaCol)
+        {
+          /*
+           * record current dna base and its alleles
+           */
+          String[] dnaVariants = new String[alleles.length + 1];
+          dnaVariants[0] = nucleotide;
+          System.arraycopy(alleles, 0, dnaVariants, 1, alleles.length);
+          codonVariants[codonPos] = dnaVariants;
+        }
+        else if (codonVariants[codonPos] == null)
+        {
+          /*
+           * record current dna base only
+           * (at least until we find any variation and overwrite it)
+           */
+          codonVariants[codonPos] = new String[] { nucleotide };
+        }
+      }
+    }
+
+    /*
+     * scan codon variations, compute peptide variants and add to peptide sequence
+     */
+    int count = 0;
+    for (Entry<Integer, String[][]> variant : variants.entrySet())
+    {
+      int peptidePos = variant.getKey();
+      String[][] codonVariants = variant.getValue();
+      String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); // 0-based
+      List<String> peptideVariants = computePeptideVariants(codonVariants,
+              residue);
+      if (!peptideVariants.isEmpty())
+      {
+        Collections.sort(peptideVariants);
+        String desc = StringUtils.listToDelimitedString(peptideVariants,
+                ", ");
+        SequenceFeature sf = new SequenceFeature(
+                SequenceOntology.SEQUENCE_VARIANT, desc, peptidePos,
+                peptidePos, Float.NaN, null);
+        peptide.getDatasetSequence().addSequenceFeature(sf);
+        count++;
+      }
+    }
+    return count;
+  }
+
+  /**
+   * Returns a non-redundant list of all peptide translations generated by the
+   * given dna variants, excluding the current residue value. A codon that
+   * contains "-" is reported as "-"; codons for which
+   * ResidueProperties.codonTranslate returns null are ignored.
+   *
+   * @param codonVariants
+   *          an array of base values for codon positions 1, 2, 3
+   * @param residue
+   *          the current residue translation
+   * @return the distinct variant translations in the order first generated;
+   *         empty (never null) if there are none, or if bases are not
+   *         supplied for all three codon positions
+   */
+  protected static List<String> computePeptideVariants(
+          String[][] codonVariants, String residue)
+  {
+    List<String> result = new ArrayList<String>();
+
+    /*
+     * guard against an incomplete codon description, which would otherwise
+     * throw a NullPointerException / ArrayIndexOutOfBoundsException below
+     */
+    if (codonVariants == null || codonVariants.length < 3
+            || codonVariants[0] == null || codonVariants[1] == null
+            || codonVariants[2] == null)
+    {
+      return result;
+    }
+
+    for (String base1 : codonVariants[0])
+    {
+      for (String base2 : codonVariants[1])
+      {
+        for (String base3 : codonVariants[2])
+        {
+          String codon = base1 + base2 + base3;
+          // TODO: report frameshift/insertion/deletion
+          // and multiple-base variants?!
+          String peptide = codon.contains("-") ? "-" : ResidueProperties
+                  .codonTranslate(codon);
+          if (peptide != null && !result.contains(peptide)
+                  && !peptide.equals(residue))
+          {
+            result.add(peptide);
+          }
+        }
+      }
+    }
+    return result;
+  }
+
+ /**
+ * Placeholder for computing a list of all peptide variants given dna
+ * variants. NOT YET IMPLEMENTED: the body only allocates two unused locals,
+ * runs an empty three-iteration loop, and always returns null.
+ *
+ * @param dnaSeq
+ * the coding dna sequence (currently unused)
+ * @param codonVariants
+ * variant features for each codon position (null if no variant);
+ * currently unused
+ * @param residue
+ * the canonical protein translation (currently unused)
+ * @return always null in the current stub, so callers must not rely on a
+ * list being returned
+ */
+ protected static List<String> computePeptideVariants(SequenceI dnaSeq,
+ SequenceFeature[] codonVariants, String residue)
+ {
+ List<String> result = new ArrayList<String>();
+ int[][] dnaVariants = new int[3][];
+ for (int i = 0; i < 3; i++)
+ {
+
+ }
+ // TODO implement - this is still an auto-generated method stub
+ return null;
+ }
+
+ /**
* precalculate different products that can be found for seqs in dataset and
* return them.
*