label.linked_view_title = Linked cDNA and protein view
label.align = Align
label.extract_scores = Extract Scores
-label.get_cross_refs = Get Cross References
+label.get_cross_refs = Get Cross-References
label.sort_alignment_new_tree = Sort Alignment With New Tree
label.add_sequences = Add Sequences
label.new_window = New Window
import jalview.datamodel.AlignedCodon;
import jalview.datamodel.AlignedCodonFrame;
+import jalview.datamodel.Alignment;
import jalview.datamodel.AlignmentAnnotation;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.DBRefEntry;
+import jalview.datamodel.DBRefSource;
+import jalview.datamodel.FeatureProperties;
import jalview.datamodel.Mapping;
import jalview.datamodel.SearchResults;
import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceGroup;
import jalview.datamodel.SequenceI;
import jalview.schemes.ResidueProperties;
+import jalview.util.DBRefUtils;
import jalview.util.MapList;
+import jalview.util.MappingUtils;
/**
* grab bag of useful alignment manipulation operations Expect these to be
}
return false;
}
+
+ /**
+ * Constructs an alignment consisting of the mapped exon regions in the given
+ * nucleotide sequences, and updates mappings to match.
+ * <p>
+ * For each dna sequence, mappings are looked up for its dataset sequence. A
+ * sequence for which none is found contributes nothing to the result;
+ * otherwise only the first mapping found is used to build the exon-only
+ * sequence.
+ *
+ * @param dna
+ * aligned dna sequences
+ * @param mappings
+ * from dna to protein; this set is modified in place - it is cleared
+ * and refilled with the new mappings built for the exon-only sequences
+ * @return an alignment whose sequences are the exon-only parts of the dna
+ * sequences. NOTE(review): as written this never returns null - if no
+ * exons are found the Alignment is constructed from an empty array, and
+ * mappings is left empty
+ */
+ public static AlignmentI makeExonAlignment(SequenceI[] dna,
+ Set<AlignedCodonFrame> mappings)
+ {
+ Set<AlignedCodonFrame> newMappings = new HashSet<AlignedCodonFrame>();
+ List<SequenceI> exonSequences = new ArrayList<SequenceI>();
+
+ for (SequenceI dnaSeq : dna)
+ {
+ final SequenceI ds = dnaSeq.getDatasetSequence();
+ List<AlignedCodonFrame> seqMappings = MappingUtils
+ .findMappingsForSequence(ds, mappings);
+ if (!seqMappings.isEmpty())
+ {
+ /*
+ * We assume here that only one protein mapping is expected per dna
+ * sequence. Mapping to multiple protein sequences is conceivable but
+ * undefined. Splitting a mapping to one protein sequence across
+ * multiple mappings is possible but pathological. Need closer
+ * constraints on the contents of AlignedCodonFrame.
+ *
+ * Consequently only the first mapping found (seqMappings.get(0)) is
+ * used; any others are ignored and do not survive in the new mappings.
+ */
+ AlignedCodonFrame newMapping = new AlignedCodonFrame();
+ final SequenceI exonSequence = makeExonSequence(ds,
+ seqMappings.get(0), newMapping);
+ exonSequences.add(exonSequence);
+ newMappings.add(newMapping);
+ }
+ }
+ AlignmentI al = new Alignment(
+ exonSequences.toArray(new SequenceI[exonSequences.size()]));
+ al.setDataset(null);
+
+ /*
+ * Replace the old mappings with the new ones. This empties the caller's
+ * set first, so mappings for sequences not passed in dna are also dropped.
+ */
+ mappings.clear();
+ mappings.addAll(newMappings);
+
+ return al;
+ }
+
+ /**
+ * Helper method to make an exon-only sequence and populate its mapping to
+ * protein
+ * <p>
+ * For example, if ggCCaTTcGAg has mappings [3, 4, 6, 7, 9, 10] to protein
+ * then generate a sequence CCTTGA with mapping [1, 6] to the same protein
+ * residues
+ * <p>
+ * The new sequence is named with the dna sequence name, followed by "|" and
+ * the accession id of a coding database reference found on the mapped protein
+ * (or the default coding feature name for EMBL if there is none).
+ * <p>
+ * NOTE(review): the Mapping obtained from acf for dnaSeq is used without a
+ * null check - confirm callers only pass a sequence that acf actually maps.
+ *
+ * @param dnaSeq
+ * a dna dataset sequence
+ * @param acf
+ * holds the current mapping of the sequence to protein
+ * @param newMapping
+ * the new mapping to populate, from the exon-only sequence
+ * @return the exon-only sequence, with a dataset sequence created for it; the
+ * mapping added to newMapping is from that dataset sequence
+ */
+ protected static SequenceI makeExonSequence(SequenceI dnaSeq,
+ AlignedCodonFrame acf, AlignedCodonFrame newMapping)
+ {
+ Mapping mapping = acf.getMappingForSequence(dnaSeq);
+ final char[] dna = dnaSeq.getSequence();
+ StringBuilder newSequence = new StringBuilder(dnaSeq.getLength());
+
+ /*
+ * Get the codon regions as { [2, 5], [7, 12], [14, 14] etc }
+ *
+ * Range positions are used as 1-based indices into the character array
+ * (dna[pos - 1]), both ends inclusive. NOTE(review): this presumably
+ * assumes the sequence numbering starts at 1 - confirm for sequences with
+ * a different start position.
+ */
+ List<int[]> exonRanges = mapping.getMap().getFromRanges();
+ for (int[] range : exonRanges)
+ {
+ for (int pos = range[0]; pos <= range[1]; pos++)
+ {
+ newSequence.append(dna[pos - 1]);
+ }
+ }
+
+ SequenceI exon = new Sequence(dnaSeq.getName(), newSequence.toString());
+
+ /*
+ * Locate any xrefs to CDS database on the protein product and attach to the
+ * CDS sequence. Also add as a sub-token of the sequence name.
+ *
+ * A copy of every matching xref is added to the new sequence; if there is
+ * more than one, the accession id of the last one is used in the name.
+ */
+ // default to "CDS" if we can't locate an actual gene id
+ String cdsAccId = FeatureProperties.getCodingFeature(DBRefSource.EMBL);
+ DBRefEntry[] cdsRefs = DBRefUtils.selectRefs(
+ mapping.getTo().getDBRef(), DBRefSource.CODINGDBS);
+ if (cdsRefs != null)
+ {
+ for (DBRefEntry cdsRef : cdsRefs)
+ {
+ exon.addDBRef(new DBRefEntry(cdsRef));
+ cdsAccId = cdsRef.getAccessionId();
+ }
+ }
+ exon.setName(exon.getName() + "|" + cdsAccId);
+ exon.createDatasetSequence();
+
+ /*
+ * Build new mappings - from the same protein regions, but now to contiguous
+ * exons: a single range [1, length of the new sequence] mapped with a 3:1
+ * ratio to the 'to' ranges of the original mapping
+ */
+ List<int[]> exonRange = new ArrayList<int[]>();
+ exonRange.add(new int[]
+ { 1, newSequence.length() });
+ MapList map = new MapList(exonRange, mapping.getMap().getToRanges(), 3, 1);
+ newMapping.addMap(exon.getDatasetSequence(), mapping.getTo(), map);
+
+ return exon;
+ }
}
*/
package jalview.analysis;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Vector;
+
import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.Alignment;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.DBRefSource;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceI;
+import jalview.util.DBRefUtils;
import jalview.ws.SequenceFetcher;
import jalview.ws.seqfetcher.ASequenceFetcher;
-import java.util.Enumeration;
-import java.util.Hashtable;
-import java.util.List;
-import java.util.Vector;
-
/**
* Functions for cross-referencing sequence databases. user must first specify
* if cross-referencing from protein or dna (set dna==true)
public class CrossRef
{
/**
- * get the DNA or protein references for a protein or dna sequence
+ * Select just the DNA or protein references for a protein or dna sequence
*
- * @param dna
- * @param rfs
+ * @param fromDna
+ * if true, select references from DNA (i.e. Protein databases), else
+ * DNA database references
+ * @param refs
+ * a set of references to select from
* @return
*/
- public static DBRefEntry[] findXDbRefs(boolean dna, DBRefEntry[] rfs)
+ public static DBRefEntry[] findXDbRefs(boolean fromDna, DBRefEntry[] refs)
{
- if (dna)
- {
- rfs = jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS);
- }
- else
- {
- rfs = jalview.util.DBRefUtils.selectRefs(rfs,
- DBRefSource.DNACODINGDBS); // could attempt to find other cross
- // refs and return here - ie PDB xrefs
- // (not dna, not protein seq)
- }
- return rfs;
- }
-
- public static Hashtable classifyDbRefs(DBRefEntry[] rfs)
- {
- Hashtable classes = new Hashtable();
- classes.put(DBRefSource.PROTEINDBS,
- jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS));
- classes.put(DBRefSource.DNACODINGDBS, jalview.util.DBRefUtils
- .selectRefs(rfs, DBRefSource.DNACODINGDBS));
- classes.put(DBRefSource.DOMAINDBS,
- jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.DOMAINDBS));
- // classes.put(OTHER, )
- return classes;
+ return DBRefUtils.selectRefs(refs, fromDna ? DBRefSource.PROTEINDBS
+ : DBRefSource.DNACODINGDBS);
+ // could attempt to find other cross
+ // refs here - ie PDB xrefs
+ // (not dna, not protein seq)
}
/**
SequenceI[] seqs, AlignmentI dataset)
{
String[] dbrefs = null;
- Vector refs = new Vector();
+ List<String> refs = new ArrayList<String>();
for (int s = 0; s < seqs.length; s++)
{
if (seqs[s] != null)
{
-
SequenceI dss = seqs[s];
while (dss.getDatasetSequence() != null)
{
{
if (!refs.contains(rfs[r].getSource()))
{
- refs.addElement(rfs[r].getSource());
+ refs.add(rfs[r].getSource());
}
}
if (dataset != null)
// search for references to this sequence's direct references.
DBRefEntry[] lrfs = CrossRef
.findXDbRefs(!dna, seqs[s].getDBRef());
- Vector rseqs = new Vector();
+ List<SequenceI> rseqs = new ArrayList<SequenceI>();
CrossRef.searchDatasetXrefs(seqs[s], !dna, lrfs, dataset, rseqs,
null); // don't need to specify codon frame for mapping here
- Enumeration lr = rseqs.elements();
- while (lr.hasMoreElements())
+ for (SequenceI rs : rseqs)
{
- SequenceI rs = (SequenceI) lr.nextElement();
- DBRefEntry[] xrs = findXDbRefs(dna, rs.getDBRef());
+ DBRefEntry[] xrs = findXDbRefs(dna, rs.getDBRef()); // not used??
for (int r = 0; rfs != null && r < rfs.length; r++)
{
if (!refs.contains(rfs[r].getSource()))
{
- refs.addElement(rfs[r].getSource());
+ refs.add(rfs[r].getSource());
}
}
}
if (refs.size() > 0)
{
dbrefs = new String[refs.size()];
- refs.copyInto(dbrefs);
+ refs.toArray(dbrefs);
}
return dbrefs;
}
public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
String source, AlignmentI dataset)
{
- Vector rseqs = new Vector();
+ List<SequenceI> rseqs = new ArrayList<SequenceI>();
Alignment ral = null;
AlignedCodonFrame cf = new AlignedCodonFrame(); // nominal width
for (int s = 0; s < seqs.length; s++)
if ((xrfs == null || xrfs.length == 0) && dataset != null)
{
System.out.println("Attempting to find ds Xrefs refs.");
- DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seqs[s].getDBRef()); // less
- // ambiguous
- // would
- // be a
- // 'find
- // primary
- // dbRefEntry'
- // method.
+ DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seqs[s].getDBRef());
+ // less ambiguous would be a 'find primary dbRefEntry' method.
// filter for desired source xref here
found = CrossRef.searchDatasetXrefs(dss, !dna, lrfs, dataset,
rseqs, cf);
{
if (xrfs[r].getMap().getTo() != null)
{
- Sequence rsq = new Sequence(xrfs[r].getMap().getTo());
- rseqs.addElement(rsq);
+ SequenceI rsq = new Sequence(xrfs[r].getMap().getTo());
+ rseqs.add(rsq);
if (xrfs[r].getMap().getMap().getFromRatio() != xrfs[r]
.getMap().getMap().getToRatio())
{
}
}
retrieved[rs].updatePDBIds();
- rseqs.addElement(retrieved[rs]);
+ rseqs.add(retrieved[rs]);
}
}
}
if (rseqs.size() > 0)
{
SequenceI[] rsqs = new SequenceI[rseqs.size()];
- rseqs.copyInto(rsqs);
+ rseqs.toArray(rsqs);
ral = new Alignment(rsqs);
if (cf != null && cf.getProtMappings() != null)
{
* @return true if matches were found.
*/
private static boolean searchDatasetXrefs(SequenceI sequenceI,
- boolean dna, DBRefEntry[] lrfs, AlignmentI dataset, Vector rseqs,
+ boolean dna, DBRefEntry[] lrfs, AlignmentI dataset,
+ List<SequenceI> rseqs,
AlignedCodonFrame cf)
{
boolean found = false;
* @return true if one or more unique sequences were found and added
*/
public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
- AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf)
+ AlignmentI dataset, List<SequenceI> rseqs, AlignedCodonFrame cf)
{
return searchDataset(sequenceI, xrf, dataset, rseqs, cf, true, false);
}
* @return true if relationship found and sequence added.
*/
public static boolean searchDataset(SequenceI sequenceI, DBRefEntry xrf,
- AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf,
+ AlignmentI dataset, List<SequenceI> rseqs, AlignedCodonFrame cf,
boolean direct, boolean dna)
{
boolean found = false;
{
if (!rseqs.contains(nxt))
{
- rseqs.addElement(nxt);
- boolean foundmap = cf != null; // don't search if we aren't
- // given
- // a codon map object
+ rseqs.add(nxt);
+ boolean foundmap = cf != null;
+ // don't search if we aren't given a codon map object
for (int r = 0; foundmap && r < cands.length; r++)
{
if (cands[r].hasMap())
*/
package jalview.analysis;
-import java.util.*;
+import java.util.Enumeration;
+import java.util.Hashtable;
+import java.util.Vector;
-import jalview.datamodel.*;
+import jalview.datamodel.PDBEntry;
+import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
public class SeqsetUtils
{
}
sqinfo.put("SeqFeatures", sfeat);
sqinfo.put("PdbId", (seq.getPDBId() != null) ? seq.getPDBId()
- : new Vector());
+ : new Vector<PDBEntry>());
sqinfo.put("datasetSequence",
(seq.getDatasetSequence() != null) ? seq.getDatasetSequence()
: new Sequence("THISISAPLACEHOLDER", ""));
Integer start = (Integer) sqinfo.get("Start");
Integer end = (Integer) sqinfo.get("End");
Vector sfeatures = (Vector) sqinfo.get("SeqFeatures");
- Vector pdbid = (Vector) sqinfo.get("PdbId");
+ Vector<PDBEntry> pdbid = (Vector<PDBEntry>) sqinfo.get("PdbId");
String description = (String) sqinfo.get("Description");
Sequence seqds = (Sequence) sqinfo.get("datasetSequence");
if (oldname == null)
*/
package jalview.datamodel;
-import jalview.analysis.AlignmentUtils;
-import jalview.io.FastaFile;
-import jalview.util.MessageManager;
-
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Set;
import java.util.Vector;
+import jalview.analysis.AlignmentUtils;
+import jalview.io.FastaFile;
+import jalview.util.MessageManager;
+
/**
* Data structure to hold and manipulate a multiple sequence alignment
*/
* identically. If this is nucleotide and the other is protein, make 3 gaps
* for each gap in the protein sequences. If this is protein and the other is
* nucleotide, insert a gap for each 3 gaps (or part thereof) between
- * nucleotide bases. Does nothing if alignment of protein from cDNA is
- * requested (not yet implemented).
+ * nucleotide bases. If this is protein and the other is nucleotide, gaps
+ * are inserted in the protein to match the relative ordering of codons in the
+ * nucleotide.
*
* Parameters control whether gaps in exon (mapped) and intron (unmapped)
* regions are preserved. Gaps that connect introns to exons are treated
{ thisGapChar, thisGapChar, thisGapChar }) : String
.valueOf(thisGapChar);
+ // TODO handle intron regions? Needs a 'holistic' alignment of dna,
+ // not just sequence by sequence. But how to 'gap' intron regions?
+
/*
* Get mappings from 'that' alignment's sequences to this.
*/
public class FeatureProperties
{
+ private static final String EMBL_CODING_FEATURE = "CDS";
+
public static final String EXONPOS = "exon number";
public static final String EXONPRODUCT = "product";
*/
public static boolean isCodingFeature(String dbrefsource, String type)
{
- return ((dbrefsource == null
- || dbrefsource.equalsIgnoreCase(DBRefSource.EMBL) || dbrefsource
- .equalsIgnoreCase(DBRefSource.EMBLCDS)) && type
- .equalsIgnoreCase("CDS"));
+ if (type.equalsIgnoreCase(EMBL_CODING_FEATURE))
+ {
+ return (dbrefsource == null
+ || dbrefsource.equalsIgnoreCase(DBRefSource.EMBL) || dbrefsource
+ .equalsIgnoreCase(DBRefSource.EMBLCDS));
+ }
+ return false;
+ }
+
+ /**
+ * Returns the coding feature name for a database source. Currently just
+ * hard-coded to return CDS for EMBL/EMBLCDS, else null.
+ *
+ * @param dbrefsource
+ * the database source name; compared ignoring case against
+ * DBRefSource.EMBL and DBRefSource.EMBLCDS. May be null.
+ * @return the coding feature name ("CDS") if the source is EMBL or EMBLCDS,
+ * or null for any other source, including a null source
+ */
+ public static String getCodingFeature(String dbrefsource)
+ {
+ if (DBRefSource.EMBL.equalsIgnoreCase(dbrefsource)
+ || DBRefSource.EMBLCDS.equalsIgnoreCase(dbrefsource))
+ {
+ return EMBL_CODING_FEATURE;
+ }
+ return null;
 }
}
*/
package jalview.datamodel;
-import jalview.analysis.AlignSeq;
-import jalview.util.StringUtils;
-
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import fr.orsay.lri.varna.models.rna.RNA;
+import jalview.analysis.AlignSeq;
+import jalview.util.StringUtils;
+
/**
*
* Implements the SequenceI interface for a char[] based sequence object.
int end;
- Vector pdbIds;
+ Vector<PDBEntry> pdbIds;
String vamsasId;
{
if (pdbIds == null)
{
- pdbIds = new Vector();
+ pdbIds = new Vector<PDBEntry>();
}
if (!pdbIds.contains(entry))
{
* @param id
* DOCUMENT ME!
*/
- public void setPDBId(Vector ids);
+ public void setPDBId(Vector<PDBEntry> ids);
/**
- * DOCUMENT ME!
+ * Returns the PDB entries held for this sequence, as a Vector (may be null
+ * if none have been added)
*
* @return DOCUMENT ME!
*/
- public Vector getPDBId();
+ public Vector<PDBEntry> getPDBId();
/**
* add entry to the vector of PDBIds, if it isn't in the list already
*/
package jalview.util;
-import jalview.datamodel.DBRefEntry;
-import jalview.datamodel.PDBEntry;
-import jalview.datamodel.SequenceI;
-
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Hashtable;
+import java.util.List;
import java.util.Map;
-import java.util.Vector;
+
+import com.stevesoft.pat.Regex;
+
+import jalview.datamodel.DBRefEntry;
+import jalview.datamodel.DBRefSource;
+import jalview.datamodel.PDBEntry;
+import jalview.datamodel.SequenceI;
public class DBRefUtils
{
+ private static Map<String, String> canonicalSourceNameLookup = new HashMap<String, String>();
+
+ private static Map<String, String> dasCoordinateSystemsLookup = new HashMap<String, String>();
+
+ static
+ {
+ // TODO load these from a resource file?
+ canonicalSourceNameLookup.put("uniprotkb/swiss-prot",
+ DBRefSource.UNIPROT);
+ canonicalSourceNameLookup.put("uniprotkb/trembl", DBRefSource.UNIPROT);
+ canonicalSourceNameLookup.put("pdb", DBRefSource.PDB);
+
+ dasCoordinateSystemsLookup.put("pdbresnum", DBRefSource.PDB);
+ dasCoordinateSystemsLookup.put("uniprot", DBRefSource.UNIPROT);
+ dasCoordinateSystemsLookup.put("embl", DBRefSource.EMBL);
+ // dasCoordinateSystemsLookup.put("embl", DBRefSource.EMBLCDS);
+ }
+
/**
* Utilities for handling DBRef objects and their collections.
*/
* @return boolean true if Source DBRefEntry is compatible with DAS
* CoordinateSystem name
*/
- public static Hashtable DasCoordinateSystemsLookup = null;
public static boolean isDasCoordinateSystem(String string,
DBRefEntry dBRefEntry)
{
- if (DasCoordinateSystemsLookup == null)
+ if (string == null || dBRefEntry == null)
{
- // TODO: Make a DasCoordinateSystemsLookup properties resource
- // Initialise
- DasCoordinateSystemsLookup = new Hashtable();
- DasCoordinateSystemsLookup.put("pdbresnum",
- jalview.datamodel.DBRefSource.PDB);
- DasCoordinateSystemsLookup.put("uniprot",
- jalview.datamodel.DBRefSource.UNIPROT);
- DasCoordinateSystemsLookup.put("EMBL",
- jalview.datamodel.DBRefSource.EMBL);
- // DasCoordinateSystemsLookup.put("EMBL",
- // jalview.datamodel.DBRefSource.EMBLCDS);
+ return false;
}
-
- String coordsys = (String) DasCoordinateSystemsLookup.get(string
+ String coordsys = dasCoordinateSystemsLookup.get(string
.toLowerCase());
- if (coordsys != null)
- {
- return coordsys.equals(dBRefEntry.getSource());
- }
- return false;
+ return coordsys == null ? false : coordsys.equals(dBRefEntry
+ .getSource());
}
- public static Hashtable CanonicalSourceNameLookup = null;
-
/**
* look up source in an internal list of database reference sources and return
* the canonical jalview name for the source, or the original string if it has
*/
public static String getCanonicalName(String source)
{
- if (CanonicalSourceNameLookup == null)
+ if (source == null)
{
- CanonicalSourceNameLookup = new Hashtable();
- CanonicalSourceNameLookup.put("uniprotkb/swiss-prot",
- jalview.datamodel.DBRefSource.UNIPROT);
- CanonicalSourceNameLookup.put("uniprotkb/trembl",
- jalview.datamodel.DBRefSource.UNIPROT);
- CanonicalSourceNameLookup.put("pdb",
- jalview.datamodel.DBRefSource.PDB);
+ return null;
}
- String canonical = (String) CanonicalSourceNameLookup.get(source
+ String canonical = canonicalSourceNameLookup.get(source
.toLowerCase());
- if (canonical == null)
- {
- return source;
- }
- return canonical;
+ return canonical == null ? source : canonical;
}
/**
- * find RefEntry corresponding to a particular pattern the equals method of
- * each entry is used, from String attributes right down to Mapping
- * attributes.
+ * Returns an array of those references that match the given entry, or null if
+ * no matches. Currently uses a comparator which matches if
+ * <ul>
+ * <li>database sources are the same</li>
+ * <li>accession ids are the same</li>
+ * <li>both have no mapping, or the mappings are the same</li>
+ * </ul>
*
* @param ref
* Set of references to search
* @param entry
- * pattern to collect - null any entry for wildcard match
+ * pattern to match
* @return
*/
public static DBRefEntry[] searchRefs(DBRefEntry[] ref, DBRefEntry entry)
matchDbAndIdAndEitherMapOrEquivalentMapList);
}
- public static DBRefEntry[] searchRefs(DBRefEntry[] ref, DBRefEntry entry,
+ /**
+ * Returns an array of those references that match the given entry, according
+ * to the given comparator. Returns null if no matches.
+ *
+ * @param refs
+ * an array of database references to search
+ * @param entry
+ * an entry to compare against
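+ * @param comparator
+ * the rule used to decide whether entry matches each reference
+ * @return the matching references, or null if refs or entry is null or there
+ * are no matches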
+ */
+ static DBRefEntry[] searchRefs(DBRefEntry[] refs, DBRefEntry entry,
DbRefComp comparator)
{
- if (ref == null || entry == null)
+ if (refs == null || entry == null)
{
return null;
}
- Vector rfs = new Vector();
- for (int i = 0; i < ref.length; i++)
+ List<DBRefEntry> rfs = new ArrayList<DBRefEntry>();
+ for (int i = 0; i < refs.length; i++)
{
- if (comparator.matches(entry, ref[i]))
+ if (comparator.matches(entry, refs[i]))
{
- rfs.addElement(ref[i]);
+ rfs.add(refs[i]);
}
}
- // TODO Auto-generated method stub
- if (rfs.size() > 0)
- {
- DBRefEntry[] rf = new DBRefEntry[rfs.size()];
- rfs.copyInto(rf);
- return rf;
- }
- return null;
+ return rfs.size() == 0 ? null : rfs.toArray(new DBRefEntry[rfs.size()]);
}
- public interface DbRefComp
+ interface DbRefComp
{
public boolean matches(DBRefEntry refa, DBRefEntry refb);
}
};
/**
- * used by file parsers to generate DBRefs from annotation within file (eg
- * stockholm)
+ * Parses a DBRefEntry and adds it to the sequence, also a PDBEntry if the
+ * database is PDB.
+ * <p>
+ * Used by file parsers to generate DBRefs from annotation within file (eg
+ * Stockholm)
*
* @param dbname
* @param version
* @param acn
* @param seq
- * where to anotate with reference
+ * where to annotate with reference
* @return parsed version of entry that was added to seq (if any)
*/
public static DBRefEntry parseToDbRef(SequenceI seq, String dbname,
DBRefEntry ref = null;
if (dbname != null)
{
- String locsrc = jalview.util.DBRefUtils.getCanonicalName(dbname);
- if (locsrc.equals(jalview.datamodel.DBRefSource.PDB))
+ String locsrc = DBRefUtils.getCanonicalName(dbname);
+ if (locsrc.equals(DBRefSource.PDB))
{
- // check for chaincode and mapping
- // PFAM style stockhom PDB citation
- com.stevesoft.pat.Regex r = new com.stevesoft.pat.Regex(
+ /*
+ * Check for PFAM style Stockholm PDB accession id citation e.g.
+ * "1WRI A; 7-80;"
+ */
+ Regex r = new com.stevesoft.pat.Regex(
"([0-9][0-9A-Za-z]{3})\\s*(.?)\\s*;\\s*([0-9]+)-([0-9]+)");
if (r.search(acn.trim()))
{
{
chaincode = " ";
}
- String mapstart = r.stringMatched(3);
- String mapend = r.stringMatched(4);
+ // String mapstart = r.stringMatched(3);
+ // String mapend = r.stringMatched(4);
if (chaincode.equals(" "))
{
chaincode = "_";
{
UniprotEntry entry = (UniprotEntry) entries.elementAt(i);
Enumeration e = entry.getDbReference().elements();
- Vector onlyPdbEntries = new Vector();
+ Vector<PDBEntry> onlyPdbEntries = new Vector();
Vector dbxrefs = new Vector();
while (e.hasMoreElements())
{
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import org.junit.Test;
import jalview.datamodel.Annotation;
import jalview.datamodel.DBRefEntry;
import jalview.datamodel.Mapping;
+import jalview.datamodel.SearchResults;
+import jalview.datamodel.SearchResults.Match;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceI;
import jalview.io.AppletFormatAdapter;
import jalview.io.FormatAdapter;
import jalview.util.MapList;
+import jalview.util.MappingUtils;
public class AlignmentUtilsTests
{
assertTrue(AlignmentUtils.haveCrossRef(seq1, seq2));
assertTrue(AlignmentUtils.haveCrossRef(seq2, seq1));
}
+
+ /**
+ * Test the method that extracts the exon-only part of a dna alignment.
+ */
+ @Test
+ public void testMakeExonAlignment()
+ {
+ SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
+ SequenceI dna2 = new Sequence("dna2", "GGGcccTTTaaaCCC");
+ SequenceI pep1 = new Sequence("pep1", "GF");
+ SequenceI pep2 = new Sequence("pep2", "GFP");
+ dna1.createDatasetSequence();
+ dna2.createDatasetSequence();
+ pep1.createDatasetSequence();
+ pep2.createDatasetSequence();
+
+ Set<AlignedCodonFrame> mappings = new HashSet<AlignedCodonFrame>();
+ MapList map = new MapList(new int[]
+ { 4, 6, 10, 12 }, new int[]
+ { 1, 2 }, 3, 1);
+ AlignedCodonFrame acf = new AlignedCodonFrame();
+ acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
+ mappings.add(acf);
+ map = new MapList(new int[]
+ { 1, 3, 7, 9, 13, 15 }, new int[]
+ { 1, 3 }, 3, 1);
+ acf = new AlignedCodonFrame();
+ acf.addMap(dna2.getDatasetSequence(), pep2.getDatasetSequence(), map);
+ mappings.add(acf);
+
+ AlignmentI exons = AlignmentUtils.makeExonAlignment(new SequenceI[]
+ { dna1, dna2 }, mappings);
+ assertEquals(2, exons.getSequences().size());
+ assertEquals("GGGTTT", exons.getSequenceAt(0).getSequenceAsString());
+ assertEquals("GGGTTTCCC", exons.getSequenceAt(1).getSequenceAsString());
+
+ /*
+ * Verify updated mappings
+ */
+ assertEquals(2, mappings.size());
+
+ /*
+ * Mapping from pep1 to GGGTTT in first new exon sequence
+ */
+ List<AlignedCodonFrame> pep1Mapping = MappingUtils
+ .findMappingsForSequence(pep1, mappings);
+ assertEquals(1, pep1Mapping.size());
+ // map G to GGG
+ SearchResults sr = MappingUtils.buildSearchResults(pep1, 1, mappings);
+ assertEquals(1, sr.getResults().size());
+ Match m = sr.getResults().get(0);
+ assertEquals(exons.getSequenceAt(0).getDatasetSequence(),
+ m.getSequence());
+ assertEquals(1, m.getStart());
+ assertEquals(3, m.getEnd());
+ // map F to TTT
+ sr = MappingUtils.buildSearchResults(pep1, 2, mappings);
+ m = sr.getResults().get(0);
+ assertEquals(exons.getSequenceAt(0).getDatasetSequence(),
+ m.getSequence());
+ assertEquals(4, m.getStart());
+ assertEquals(6, m.getEnd());
+
+ /*
+ * Mapping from pep2 to GGGTTTCCC in second new exon sequence
+ */
+ List<AlignedCodonFrame> pep2Mapping = MappingUtils
+ .findMappingsForSequence(pep2, mappings);
+ assertEquals(1, pep2Mapping.size());
+ // map G to GGG
+ sr = MappingUtils.buildSearchResults(pep2, 1, mappings);
+ assertEquals(1, sr.getResults().size());
+ m = sr.getResults().get(0);
+ assertEquals(exons.getSequenceAt(1).getDatasetSequence(),
+ m.getSequence());
+ assertEquals(1, m.getStart());
+ assertEquals(3, m.getEnd());
+ // map F to TTT
+ sr = MappingUtils.buildSearchResults(pep2, 2, mappings);
+ m = sr.getResults().get(0);
+ assertEquals(exons.getSequenceAt(1).getDatasetSequence(),
+ m.getSequence());
+ assertEquals(4, m.getStart());
+ assertEquals(6, m.getEnd());
+ // map P to CCC
+ sr = MappingUtils.buildSearchResults(pep2, 3, mappings);
+ m = sr.getResults().get(0);
+ assertEquals(exons.getSequenceAt(1).getDatasetSequence(),
+ m.getSequence());
+ assertEquals(7, m.getStart());
+ assertEquals(9, m.getEnd());
+ }
+
+ /**
+ * Test the method that makes an exon-only sequence from a DNA sequence and
+ * its product mapping. Test includes the expected case that the DNA sequence
+ * already has a protein product (Uniprot translation) which in turn has an
+ * x-ref to the EMBLCDS record.
+ * <p>
+ * Verifies the exon-only residues, the "name|accession" naming, and that the
+ * protein's EMBLCDS reference is copied to the new sequence.
+ */
+ @Test
+ public void testMakeExonSequence()
+ {
+ SequenceI dna1 = new Sequence("dna1", "aaaGGGcccTTTaaa");
+ SequenceI pep1 = new Sequence("pep1", "GF");
+ dna1.createDatasetSequence();
+ pep1.createDatasetSequence();
+ pep1.getDatasetSequence().addDBRef(
+ new DBRefEntry("EMBLCDS", "2", "A12345"));
+
+ /*
+ * Make the mapping from dna to protein. The protein sequence has a DBRef to
+ * EMBLCDS|A12345.
+ *
+ * NOTE(review): the 'mappings' set is populated here but not read by the
+ * rest of this test. Also, the mapping is added for dna1's dataset
+ * sequence, whereas dna1 itself is what is passed to makeExonSequence
+ * below - confirm this is intended.
+ */
+ Set<AlignedCodonFrame> mappings = new HashSet<AlignedCodonFrame>();
+ MapList map = new MapList(new int[]
+ { 4, 6, 10, 12 }, new int[]
+ { 1, 2 }, 3, 1);
+ AlignedCodonFrame acf = new AlignedCodonFrame();
+ acf.addMap(dna1.getDatasetSequence(), pep1.getDatasetSequence(), map);
+ mappings.add(acf);
+
+ AlignedCodonFrame newMapping = new AlignedCodonFrame();
+ SequenceI exon = AlignmentUtils.makeExonSequence(dna1, acf, newMapping);
+
+ assertEquals("GGGTTT", exon.getSequenceAsString());
+ assertEquals("dna1|A12345", exon.getName());
+ assertEquals(1, exon.getDBRef().length);
+ DBRefEntry cdsRef = exon.getDBRef()[0];
+ assertEquals("EMBLCDS", cdsRef.getSource());
+ assertEquals("2", cdsRef.getVersion());
+ assertEquals("A12345", cdsRef.getAccessionId());
+
+ }
}
--- /dev/null
+package jalview.analysis;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertSame;
+
+import org.junit.Test;
+
+import jalview.datamodel.DBRefEntry;
+
+public class CrossRefTest
+{
+ @Test
+ public void testFindXDbRefs()
+ {
+ DBRefEntry ref1 = new DBRefEntry("UNIPROT", "1", "A123");
+ DBRefEntry ref2 = new DBRefEntry("UNIPROTKB/TREMBL", "1", "A123");
+ DBRefEntry ref3 = new DBRefEntry("pdb", "1", "A123");
+ DBRefEntry ref4 = new DBRefEntry("EMBLCDSPROTEIN", "1", "A123");
+ DBRefEntry ref5 = new DBRefEntry("embl", "1", "A123");
+ DBRefEntry ref6 = new DBRefEntry("emblCDS", "1", "A123");
+ DBRefEntry ref7 = new DBRefEntry("GeneDB", "1", "A123");
+ DBRefEntry ref8 = new DBRefEntry("PFAM", "1", "A123");
+ DBRefEntry[] refs = new DBRefEntry[]
+ { ref1, ref2, ref3, ref4, ref5, ref6, ref7, ref8 };
+
+ /*
+ * Just the DNA refs:
+ */
+ DBRefEntry[] found = CrossRef.findXDbRefs(false, refs);
+ assertEquals(3, found.length);
+ assertSame(ref5, found[0]);
+ assertSame(ref6, found[1]);
+ assertSame(ref7, found[2]);
+
+ /*
+ * Just the protein refs:
+ */
+ found = CrossRef.findXDbRefs(true, refs);
+ assertEquals(4, found.length);
+ assertSame(ref1, found[0]);
+ assertSame(ref2, found[1]);
+ assertSame(ref3, found[2]);
+ assertSame(ref4, found[3]);
+ }
+
+}
--- /dev/null
+package jalview.util;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertSame;
+import static org.junit.Assert.assertTrue;
+
+import org.junit.Test;
+
+import jalview.datamodel.DBRefEntry;
+import jalview.datamodel.DBRefSource;
+import jalview.datamodel.Mapping;
+import jalview.datamodel.PDBEntry;
+import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceI;
+
+public class DBRefUtilsTest
+{
+
+ /**
+ * Test the method that selects DBRefEntry items whose source is in a supplied
+ * list
+ */
+ @Test
+ public void testSelectRefs()
+ {
+ assertNull(DBRefUtils.selectRefs(null, null));
+ assertNull(DBRefUtils.selectRefs(null, DBRefSource.CODINGDBS));
+
+ DBRefEntry ref1 = new DBRefEntry("EMBL", "1.2", "A12345");
+ DBRefEntry ref2 = new DBRefEntry("UNIPROT", "1.2", "A12346");
+ // Source is converted to upper-case by this constructor!
+ DBRefEntry ref3 = new DBRefEntry("Uniprot", "1.2", "A12347");
+ DBRefEntry[] dbrefs = new DBRefEntry[]
+ { ref1, ref2, ref3 };
+ String[] sources = new String[]
+ { "EMBL", "UNIPROT" };
+
+ DBRefEntry[] selected = DBRefUtils.selectRefs(dbrefs, sources);
+ assertEquals(3, selected.length);
+ assertSame(ref1, selected[0]);
+ assertSame(ref2, selected[1]);
+ assertSame(ref3, selected[2]);
+
+ sources = new String[]
+ { "EMBL" };
+ selected = DBRefUtils.selectRefs(dbrefs, sources);
+ assertEquals(1, selected.length);
+ assertSame(ref1, selected[0]);
+
+ sources = new String[]
+ { "UNIPROT" };
+ selected = DBRefUtils.selectRefs(dbrefs, sources);
+ assertEquals(2, selected.length);
+ assertSame(ref2, selected[0]);
+ assertSame(ref3, selected[1]);
+
+ sources = new String[]
+ { "Uniprot", "EMBLCDS" };
+ selected = DBRefUtils.selectRefs(dbrefs, sources);
+ assertNull(selected);
+ }
+
+ /**
+ * Test the method that converts (currently three) database names to a
+ * canonical name (not case-sensitive)
+ */
+ @Test
+ public void testGetCanonicalName()
+ {
+ assertNull(DBRefUtils.getCanonicalName(null));
+ assertEquals("", DBRefUtils.getCanonicalName(""));
+ assertEquals("PDB", DBRefUtils.getCanonicalName("pdb"));
+ assertEquals("PDB", DBRefUtils.getCanonicalName("Pdb"));
+ assertEquals("UNIPROT",
+ DBRefUtils.getCanonicalName("uniprotkb/swiss-prot"));
+ assertEquals("UNIPROT", DBRefUtils.getCanonicalName("uniprotkb/trembl"));
+ assertEquals("UNIPROT",
+ DBRefUtils.getCanonicalName("UNIPROTKB/SWISS-PROT"));
+ assertEquals("UNIPROT", DBRefUtils.getCanonicalName("UNIPROTKB/TREMBL"));
+ assertEquals("UNIPROTKB/SWISS-CHEESE",
+ DBRefUtils.getCanonicalName("UNIPROTKB/SWISS-CHEESE"));
+ }
+
+ /**
+  * Checks DBRefUtils.isDasCoordinateSystem for PDB, UNIPROT and EMBL
+  * references, covering null arguments and coordinate system / source names
+  * given in differing case.
+  */
+ @Test
+ public void testIsDasCoordinateSystem()
+ {
+   // a null coordinate system name and/or a null reference: expect false
+   assertFalse(DBRefUtils.isDasCoordinateSystem(null, null));
+   assertFalse(DBRefUtils.isDasCoordinateSystem("pdbresnum", null));
+   DBRefEntry pdbRef = new DBRefEntry("PDB", "v1", "a1");
+   assertFalse(DBRefUtils.isDasCoordinateSystem(null, pdbRef));
+
+   // "pdbresnum" (in either case) paired with a PDB reference: expect true
+   pdbRef = new DBRefEntry("PDB", "v1", "a1");
+   assertTrue(DBRefUtils.isDasCoordinateSystem("pdbresnum", pdbRef));
+   pdbRef = new DBRefEntry("PDB", "v1", "a1");
+   assertTrue(DBRefUtils.isDasCoordinateSystem("PDBRESNUM", pdbRef));
+
+   // "pdb" is converted to upper-case in DBRefEntry constructor
+   DBRefEntry lowerCasePdbRef = new DBRefEntry("pdb", "v1", "a1");
+   assertTrue(DBRefUtils.isDasCoordinateSystem("pdbresnum",
+           lowerCasePdbRef));
+   // the plain name "pdb" is not accepted as the coordinate system
+   lowerCasePdbRef = new DBRefEntry("pdb", "v1", "a1");
+   assertFalse(DBRefUtils.isDasCoordinateSystem("pdb", lowerCasePdbRef));
+
+   // Uniprot, with the case of either argument varied: expect true
+   final DBRefEntry mixedCaseUniprotRef = new DBRefEntry("Uniprot", "v1",
+           "a1");
+   assertTrue(DBRefUtils.isDasCoordinateSystem("UNIPROT",
+           mixedCaseUniprotRef));
+   final DBRefEntry upperCaseUniprotRef = new DBRefEntry("UNIPROT", "v1",
+           "a1");
+   assertTrue(DBRefUtils.isDasCoordinateSystem("Uniprot",
+           upperCaseUniprotRef));
+
+   // coordinate system name and reference source disagree: expect false
+   final DBRefEntry mismatchedRef = new DBRefEntry("pdb", "v1", "a1");
+   assertFalse(DBRefUtils.isDasCoordinateSystem("UNIPROTKB",
+           mismatchedRef));
+
+   // EMBL in upper and in lower case: expect true
+   final DBRefEntry upperCaseEmblRef = new DBRefEntry("EMBL", "v1", "a1");
+   assertTrue(DBRefUtils.isDasCoordinateSystem("EMBL", upperCaseEmblRef));
+   final DBRefEntry lowerCaseEmblRef = new DBRefEntry("embl", "v1", "a1");
+   assertTrue(DBRefUtils.isDasCoordinateSystem("embl", lowerCaseEmblRef));
+ }
+
+ /**
+  * Test 'parsing' a DBRef for a source other than PDB: the reference returned
+  * should be the single reference held by the sequence, carry the values
+  * supplied, and leave the sequence without any PDB id.
+  */
+ @Test
+ public void testParseToDbRef()
+ {
+   final SequenceI sequence = new Sequence("Seq1", "ABCD");
+   final DBRefEntry parsed = DBRefUtils.parseToDbRef(sequence, "EMBL",
+           "1.2", "a7890");
+
+   // the returned reference is the only one on the sequence
+   final DBRefEntry[] sequenceRefs = sequence.getDBRef();
+   assertEquals(1, sequenceRefs.length);
+   assertSame(parsed, sequenceRefs[0]);
+
+   // source, version and accession id are as supplied
+   assertEquals("EMBL", parsed.getSource());
+   assertEquals("1.2", parsed.getVersion());
+   assertEquals("a7890", parsed.getAccessionId());
+
+   // nothing PDB-related is expected on the sequence
+   assertNull(sequence.getPDBId());
+ }
+
+ /**
+  * Test 'parsing' a DBRef given in Stockholm PDB format: as well as the
+  * reference added to the sequence, a PDB entry holding the PDB id and chain
+  * code is expected on the sequence.
+  */
+ @Test
+ public void testParseToDbRef_PDB()
+ {
+   final SequenceI sequence = new Sequence("Seq1", "ABCD");
+   final DBRefEntry parsed = DBRefUtils.parseToDbRef(sequence, "pdb",
+           "1.2", "1WRI A; 7-80;");
+
+   // the returned reference is the only one on the sequence
+   final DBRefEntry[] sequenceRefs = sequence.getDBRef();
+   assertEquals(1, sequenceRefs.length);
+   assertSame(parsed, sequenceRefs[0]);
+
+   // source is expected in upper-case, version as supplied
+   assertEquals("PDB", parsed.getSource());
+   assertEquals("1.2", parsed.getVersion());
+   // DBRef id is pdbId + chain code
+   assertEquals("1WRIA", parsed.getAccessionId());
+
+   // a single PDB entry with id and chain set, but no file or type
+   assertEquals(1, sequence.getPDBId().size());
+   final PDBEntry pdbEntry = sequence.getPDBId().get(0);
+   assertEquals("1WRI", pdbEntry.getId());
+   assertNull(pdbEntry.getFile());
+   assertEquals("A", pdbEntry.getProperty().get("CHAIN"));
+   assertNull(pdbEntry.getType());
+ }
+
+ /**
+ * Test the method that searches for matches references - case when we are
+ * matching a reference with no mappings
+ */
+ @Test
+ public void testSearchRefs_noMapping()
+ {
+ DBRefEntry target = new DBRefEntry("EMBL", "2", "A1234");
+
+ DBRefEntry ref1 = new DBRefEntry("EMBL", "1", "A1234"); // matches
+ // constructor changes embl to EMBL
+ DBRefEntry ref2 = new DBRefEntry("embl", "1", "A1234"); // matches
+ // constructor does not upper-case accession id
+ DBRefEntry ref3 = new DBRefEntry("EMBL", "1", "a1234"); // no match
+ DBRefEntry ref4 = new DBRefEntry("EMBLCDS", "1", "A1234"); // no match
+ // ref5 matches although it has a mapping - ignored
+ DBRefEntry ref5 = new DBRefEntry("EMBL", "1", "A1234");
+ ref5.setMap(new Mapping(new MapList(new int[]
+ { 1, 1 }, new int[]
+ { 1, 1 }, 1, 1)));
+
+ DBRefEntry[] matches = DBRefUtils.searchRefs(new DBRefEntry[]
+ { ref1, ref2, ref3, ref4, ref5 }, target);
+ assertEquals(3, matches.length);
+ assertSame(ref1, matches[0]);
+ assertSame(ref2, matches[1]);
+ assertSame(ref5, matches[2]);
+ }
+
+ /**
+  * Test the method that searches for matching references - case when the
+  * reference being matched has a mapping. Candidates here share the target's
+  * source and accession id, so only their mappings decide the outcome.
+  */
+ @Test
+ public void testSearchRefs_withMapping()
+ {
+   final DBRefEntry target = new DBRefEntry("EMBL", "2", "A1234");
+   final MapList targetMapList = new MapList(new int[]
+   { 1, 1 }, new int[]
+   { 1, 1 }, 1, 1);
+   target.setMap(new Mapping(targetMapList));
+
+   // no map: matches
+   final DBRefEntry unmapped = new DBRefEntry("EMBL", "1", "A1234");
+
+   // a separate but equal map: matches
+   final DBRefEntry equallyMapped = new DBRefEntry("EMBL", "1", "A1234");
+   final MapList equalMapList = new MapList(new int[]
+   { 1, 1 }, new int[]
+   { 1, 1 }, 1, 1);
+   equallyMapped.setMap(new Mapping(equalMapList));
+
+   // different map (same ranges, but last two arguments are 2, 2): no match
+   final DBRefEntry differentlyMapped = new DBRefEntry("EMBL", "1",
+           "A1234");
+   final MapList differentMapList = new MapList(new int[]
+   { 1, 1 }, new int[]
+   { 1, 1 }, 2, 2);
+   differentlyMapped.setMap(new Mapping(differentMapList));
+
+   final DBRefEntry[] candidates = new DBRefEntry[]
+   { unmapped, equallyMapped, differentlyMapped };
+   final DBRefEntry[] matches = DBRefUtils.searchRefs(candidates, target);
+
+   assertEquals(2, matches.length);
+   assertSame(unmapped, matches[0]);
+   assertSame(equallyMapped, matches[1]);
+ }
+}
* Simple test of mapping with introns involved.
*/
@Test
- public void testBuildSearchResults_withIntro()
+ public void testBuildSearchResults_withIntron()
{
final Sequence seq1 = new Sequence("Seq1", "C-G-TAGA-GCAGCTT");
seq1.createDatasetSequence();