import jalview.datamodel.DBRefSource;
import jalview.datamodel.Mapping;
import jalview.datamodel.Sequence;
-import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
-import jalview.io.gff.SequenceOntology;
-import jalview.schemes.ResidueProperties;
import jalview.util.DBRefUtils;
-import jalview.util.MapList;
-import jalview.util.MappingUtils;
-import jalview.util.StringUtils;
import jalview.ws.SequenceFetcher;
import jalview.ws.seqfetcher.ASequenceFetcher;
import java.util.ArrayList;
-import java.util.Collections;
-import java.util.LinkedHashMap;
import java.util.List;
-import java.util.Map.Entry;
import java.util.Vector;
/**
// map should be from protein seq to its coding dna
cf.addMap(rsq, dss, xref.getMap().getMap().getInverse());
}
-
- /*
- * compute peptide variants from dna variants
- */
- rsq.createDatasetSequence();
- computeProteinVariants(seq, rsq, xref.getMap().getMap());
}
found = true;
}
}
/**
- * Computes variants in peptide product generated by variants in dna, and adds
- * them as sequence_variant features on the protein sequence. Returns the
- * number of variant features added.
- *
- * @param dnaSeq
- * @param peptide
- * @param dnaToProtein
- */
- protected static int computeProteinVariants(SequenceI dnaSeq,
- SequenceI peptide, MapList dnaToProtein)
- {
- /*
- * map from peptide position to all variant features of the codon for it
- * LinkedHashMap ensures we add the peptide features in sequence order
- */
- LinkedHashMap<Integer, String[][]> variants = new LinkedHashMap<Integer, String[][]>();
- SequenceOntology so = SequenceOntology.getInstance();
-
- SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures();
- if (dnaFeatures == null)
- {
- return 0;
- }
-
- int[] lastCodon = null;
- int lastPeptidePostion = 0;
-
- /*
- * build a map of codon variations for peptides
- */
- for (SequenceFeature sf : dnaFeatures)
- {
- int dnaCol = sf.getBegin();
- if (dnaCol != sf.getEnd())
- {
- // not handling multi-locus variant features
- continue;
- }
- if (so.isSequenceVariant(sf.getType()))
- {
- int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol);
- if (mapsTo == null)
- {
- // feature doesn't lie within coding region
- continue;
- }
- int peptidePosition = mapsTo[0];
- String[][] codonVariants = variants.get(peptidePosition);
- if (codonVariants == null)
- {
- codonVariants = new String[3][];
- variants.put(peptidePosition, codonVariants);
- }
-
- /*
- * extract dna variants to a string array
- */
- String alls = (String) sf.getValue("alleles");
- if (alls == null)
- {
- continue;
- }
- String[] alleles = alls.split(",");
-
- /*
- * get this peptides codon positions e.g. [3, 4, 5] or [4, 7, 10]
- */
- int[] codon = peptidePosition == lastPeptidePostion ? lastCodon
- : MappingUtils.flattenRanges(dnaToProtein.locateInFrom(
- peptidePosition, peptidePosition));
- lastPeptidePostion = peptidePosition;
- lastCodon = codon;
-
- /*
- * save nucleotide (and this variant) for each codon position
- */
- for (int codonPos = 0; codonPos < 3; codonPos++)
- {
- String nucleotide = String.valueOf(dnaSeq
- .getCharAt(codon[codonPos] - 1));
- if (codon[codonPos] == dnaCol)
- {
- /*
- * record current dna base and its alleles
- */
- String[] dnaVariants = new String[alleles.length + 1];
- dnaVariants[0] = nucleotide;
- System.arraycopy(alleles, 0, dnaVariants, 1, alleles.length);
- codonVariants[codonPos] = dnaVariants;
- }
- else if (codonVariants[codonPos] == null)
- {
- /*
- * record current dna base only
- * (at least until we find any variation and overwrite it)
- */
- codonVariants[codonPos] = new String[] { nucleotide };
- }
- }
- }
- }
-
- /*
- * scan codon variations, compute peptide variants and add to peptide sequence
- */
- int count = 0;
- for (Entry<Integer, String[][]> variant : variants.entrySet())
- {
- int peptidePos = variant.getKey();
- String[][] codonVariants = variant.getValue();
- String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); // 0-based
- List<String> peptideVariants = computePeptideVariants(codonVariants,
- residue);
- if (!peptideVariants.isEmpty())
- {
- Collections.sort(peptideVariants);
- String desc = StringUtils.listToDelimitedString(peptideVariants,
- ", ");
- SequenceFeature sf = new SequenceFeature(
- SequenceOntology.SEQUENCE_VARIANT, desc, peptidePos,
- peptidePos, Float.NaN, null);
- peptide.getDatasetSequence().addSequenceFeature(sf);
- count++;
- }
- }
- return count;
- }
-
- /**
- * Returns a non-redundant list of all peptide translations generated by the
- * given dna variants, excluding the current residue value
- *
- * @param codonVariants
- * an array of base values for codon positions 1, 2, 3
- * @param residue
- * the current residue translation
- * @return
- */
- protected static List<String> computePeptideVariants(
- String[][] codonVariants, String residue)
- {
- List<String> result = new ArrayList<String>();
- for (String base1 : codonVariants[0])
- {
- for (String base2 : codonVariants[1])
- {
- for (String base3 : codonVariants[2])
- {
- String codon = base1 + base2 + base3;
- // TODO: report frameshift/insertion/deletion
- // and multiple-base variants?!
- String peptide = codon.contains("-") ? "-" : ResidueProperties
- .codonTranslate(codon);
- if (peptide != null && !result.contains(peptide)
- && !peptide.equals(residue))
- {
- result.add(peptide);
- }
- }
- }
- }
- return result;
- }
-
- /**
- * Computes a list of all peptide variants given dna variants
- *
- * @param dnaSeq
- * the coding dna sequence
- * @param codonVariants
- * variant features for each codon position (null if no variant)
- * @param residue
- * the canonical protein translation
- * @return
- */
- protected static List<String> computePeptideVariants(SequenceI dnaSeq,
- SequenceFeature[] codonVariants, String residue)
- {
- List<String> result = new ArrayList<String>();
- int[][] dnaVariants = new int[3][];
- for (int i = 0; i < 3; i++)
- {
-
- }
- // TODO Auto-generated method stub
- return null;
- }
-
- /**
* precalculate different products that can be found for seqs in dataset and
* return them.
*
}
/**
- * Answers true unless the feature type is 'transcript' or 'exon' (or a
- * sub-type in the Sequence Ontology). These features are only retrieved in
- * order to identify the exon sequence loci, and are redundant information on
- * the exon sequence itself.
+ * Answers true unless the feature type is 'transcript' (or a sub-type in the
+ * Sequence Ontology).
*/
@Override
protected boolean retainFeature(SequenceFeature sf, String accessionId)
{
- SequenceOntology so = SequenceOntology.getInstance();
- String type = sf.getType();
-
- if (isTranscript(type) || so.isA(type, SequenceOntology.EXON))
+ if (isTranscript(sf.getType()))
{
return false;
}
{
/*
* fetch cds features on genomic sequence (to identify the CDS regions)
- * and variation features (to retain)
+ * and exon and variation features (to retain for display)
*/
private static final EnsemblFeatureType[] FEATURES_TO_FETCH = {
- EnsemblFeatureType.cds, EnsemblFeatureType.variation };
+ EnsemblFeatureType.cds, EnsemblFeatureType.exon,
+ EnsemblFeatureType.variation };
/**
* Constructor
import jalview.io.FastaFile;
import jalview.io.FileParse;
import jalview.io.gff.SequenceOntology;
+import jalview.schemes.ResidueProperties;
import jalview.util.DBRefUtils;
import jalview.util.MapList;
+import jalview.util.MappingUtils;
+import jalview.util.StringUtils;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
+import java.util.LinkedHashMap;
import java.util.List;
+import java.util.Map.Entry;
/**
* Base class for Ensembl sequence fetchers
DBRefEntry dbr = new DBRefEntry(getDbSource(), getDbVersion(),
accId, map);
querySeq.getDatasetSequence().addDBRef(dbr);
+
+ /*
+ * compute peptide variants from dna variants and add as
+ * sequence features on the protein sequence ta-da
+ */
+ computeProteinFeatures(querySeq, proteinSeq, mapList);
}
} catch (Exception e)
{
}
/**
+ * Maps exon features from dna to protein, and computes variants in peptide
+ * product generated by variants in dna, and adds them as sequence_variant
+ * features on the protein sequence. Returns the number of variant features
+ * added.
+ *
+ * @param dnaSeq
+ * @param peptide
+ * @param dnaToProtein
+ */
+ static int computeProteinFeatures(SequenceI dnaSeq,
+ SequenceI peptide, MapList dnaToProtein)
+ {
+ while (dnaSeq.getDatasetSequence() != null)
+ {
+ dnaSeq = dnaSeq.getDatasetSequence();
+ }
+ while (peptide.getDatasetSequence() != null)
+ {
+ peptide = peptide.getDatasetSequence();
+ }
+
+ mapExonsToProtein(dnaSeq, peptide, dnaToProtein);
+
+ LinkedHashMap<Integer, String[][]> variants = buildDnaVariantsMap(
+ dnaSeq, dnaToProtein);
+
+ /*
+ * scan codon variations, compute peptide variants and add to peptide sequence
+ */
+ int count = 0;
+ for (Entry<Integer, String[][]> variant : variants.entrySet())
+ {
+ int peptidePos = variant.getKey();
+ String[][] codonVariants = variant.getValue();
+ String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); // 0-based
+ List<String> peptideVariants = computePeptideVariants(codonVariants,
+ residue);
+ if (!peptideVariants.isEmpty())
+ {
+ Collections.sort(peptideVariants);
+ String desc = StringUtils.listToDelimitedString(peptideVariants,
+ ", ");
+ SequenceFeature sf = new SequenceFeature(
+ SequenceOntology.SEQUENCE_VARIANT, desc, peptidePos,
+ peptidePos, 0f, null);
+ peptide.addSequenceFeature(sf);
+ count++;
+ }
+ }
+ return count;
+ }
+
+ /**
+ * Transfers exon features to the corresponding mapped regions of the protein
+ * sequence. This is useful because it allows visualisation of exon boundaries
+ * on the peptide (using 'colour by label' for the exon name). Returns the
+ * number of features written.
+ *
+ * @param dnaSeq
+ * @param peptide
+ * @param dnaToProtein
+ */
+ static int mapExonsToProtein(SequenceI dnaSeq, SequenceI peptide,
+ MapList dnaToProtein)
+ {
+ SequenceFeature[] sfs = dnaSeq.getSequenceFeatures();
+ if (sfs == null)
+ {
+ return 0;
+ }
+
+ SequenceOntology so = SequenceOntology.getInstance();
+ int count = 0;
+
+ for (SequenceFeature sf : sfs)
+ {
+ if (so.isA(sf.getType(), SequenceOntology.EXON))
+ {
+ int start = sf.getBegin();
+ int end = sf.getEnd();
+ int[] mapsTo = dnaToProtein.locateInTo(start, end);
+ if (mapsTo != null)
+ {
+ SequenceFeature copy = new SequenceFeature(SequenceOntology.EXON,
+ sf.getDescription(), mapsTo[0], mapsTo[1], 0f, null);
+ peptide.addSequenceFeature(copy);
+ count++;
+ }
+ }
+ }
+ return count;
+ }
+
+ /**
+ * Builds a map whose key is position in the protein sequence, and value is an
+ * array of all variants for the coding codon positions
+ *
+ * @param dnaSeq
+ * @param dnaToProtein
+ * @return
+ */
+ static LinkedHashMap<Integer, String[][]> buildDnaVariantsMap(
+ SequenceI dnaSeq, MapList dnaToProtein)
+ {
+ /*
+ * map from peptide position to all variant features of the codon for it
+ * LinkedHashMap ensures we add the peptide features in sequence order
+ */
+ LinkedHashMap<Integer, String[][]> variants = new LinkedHashMap<Integer, String[][]>();
+ SequenceOntology so = SequenceOntology.getInstance();
+
+ SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures();
+ if (dnaFeatures == null)
+ {
+ return variants;
+ }
+
+ int[] lastCodon = null;
+ int lastPeptidePostion = 0;
+
+ /*
+ * build a map of codon variations for peptides
+ */
+ for (SequenceFeature sf : dnaFeatures)
+ {
+ int dnaCol = sf.getBegin();
+ if (dnaCol != sf.getEnd())
+ {
+ // not handling multi-locus variant features
+ continue;
+ }
+ if (so.isSequenceVariant(sf.getType()))
+ {
+ int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol);
+ if (mapsTo == null)
+ {
+ // feature doesn't lie within coding region
+ continue;
+ }
+ int peptidePosition = mapsTo[0];
+ String[][] codonVariants = variants.get(peptidePosition);
+ if (codonVariants == null)
+ {
+ codonVariants = new String[3][];
+ variants.put(peptidePosition, codonVariants);
+ }
+
+ /*
+ * extract dna variants to a string array
+ */
+ String alls = (String) sf.getValue("alleles");
+ if (alls == null)
+ {
+ continue;
+ }
+ String[] alleles = alls.split(",");
+
+ /*
+ * get this peptides codon positions e.g. [3, 4, 5] or [4, 7, 10]
+ */
+ int[] codon = peptidePosition == lastPeptidePostion ? lastCodon
+ : MappingUtils.flattenRanges(dnaToProtein.locateInFrom(
+ peptidePosition, peptidePosition));
+ lastPeptidePostion = peptidePosition;
+ lastCodon = codon;
+
+ /*
+ * save nucleotide (and this variant) for each codon position
+ */
+ for (int codonPos = 0; codonPos < 3; codonPos++)
+ {
+ String nucleotide = String.valueOf(dnaSeq
+ .getCharAt(codon[codonPos] - 1));
+ if (codon[codonPos] == dnaCol)
+ {
+ /*
+ * record current dna base and its alleles
+ */
+ String[] dnaVariants = new String[alleles.length + 1];
+ dnaVariants[0] = nucleotide;
+ System.arraycopy(alleles, 0, dnaVariants, 1, alleles.length);
+ codonVariants[codonPos] = dnaVariants;
+ }
+ else if (codonVariants[codonPos] == null)
+ {
+ /*
+ * record current dna base only
+ * (at least until we find any variation and overwrite it)
+ */
+ codonVariants[codonPos] = new String[] { nucleotide };
+ }
+ }
+ }
+ }
+ return variants;
+ }
+
+ /**
+ * Returns a non-redundant list of all peptide translations generated by the
+ * given dna variants, excluding the current residue value
+ *
+ * @param codonVariants
+ * an array of base values for codon positions 1, 2, 3
+ * @param residue
+ * the current residue translation
+ * @return
+ */
+ static List<String> computePeptideVariants(
+ String[][] codonVariants, String residue)
+ {
+ List<String> result = new ArrayList<String>();
+ for (String base1 : codonVariants[0])
+ {
+ for (String base2 : codonVariants[1])
+ {
+ for (String base3 : codonVariants[2])
+ {
+ String codon = base1 + base2 + base3;
+ // TODO: report frameshift/insertion/deletion
+ // and multiple-base variants?!
+ String peptide = codon.contains("-") ? "-" : ResidueProperties
+ .codonTranslate(codon);
+ if (peptide != null && !result.contains(peptide)
+ && !peptide.equals(residue))
+ {
+ result.add(peptide);
+ }
+ }
+ }
+ }
+ return result;
+ }
+
+ /**
* Answers true if the feature type is either 'NMD_transcript_variant' or
* 'transcript' or one of its sub-types in the Sequence Ontology. This is
* needed because NMD_transcript_variant behaves like 'transcript' in Ensembl
desc = target.split(" ")[0];
}
- if (SequenceOntology.getInstance().isSequenceVariant(sf.getType()))
+ SequenceOntology so = SequenceOntology.getInstance();
+ String type = sf.getType();
+ if (so.isSequenceVariant(type))
{
/*
* Ensembl returns dna variants as 'alleles'
}
/*
- * Ensembl returns gene name as 'Name' for a transcript
+ * extract 'Name' for a transcript (to show gene name)
+ * or an exon (so 'colour by label' shows exon boundaries)
*/
- if (EnsemblSeqProxy.isTranscript(sf.getType()))
+ if (EnsemblSeqProxy.isTranscript(type)
+ || so.isA(type, SequenceOntology.EXON))
{
desc = StringUtils.listToDelimitedString(attributes.get("Name"), ",");
}
{
int start = Integer.parseInt(gff[START_COL]);
int end = Integer.parseInt(gff[END_COL]);
- float score = Float.NaN;
+
+ /*
+ * default 'score' is 0 rather than Float.NaN as the latter currently
+ * disables the 'graduated colour => colour by label' option
+ */
+ float score = 0f;
try
{
score = Float.parseFloat(gff[SCORE_COL]);
} catch (NumberFormatException nfe)
{
- // e.g. '.' - leave as NaN to indicate no score
+ // e.g. '.' - leave as zero
}
SequenceFeature sf = new SequenceFeature(gff[TYPE_COL],