From: gmungoc
Date: Tue, 15 Mar 2016 15:37:39 +0000 (+0000)
Subject: Merge branch 'develop' into features/JAL-653_JAL-1766_htslib_refseqsupport
X-Git-Tag: Release_2_10_0~296^2~2
X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=refs%2Fheads%2Fmungo_develop;hp=81764c0a4a0a1807ac2a1cab2e4d99d588d09669;p=jalview.git
Merge branch 'develop' into features/JAL-653_JAL-1766_htslib_refseqsupport
---
diff --git a/.classpath b/.classpath
index 473d937..cad9e2b 100644
--- a/.classpath
+++ b/.classpath
@@ -49,6 +49,7 @@
+
@@ -66,5 +67,7 @@
+
+
diff --git a/.settings/org.eclipse.jdt.core.prefs b/.settings/org.eclipse.jdt.core.prefs
index 3f8ca28..8a5e7a7 100644
--- a/.settings/org.eclipse.jdt.core.prefs
+++ b/.settings/org.eclipse.jdt.core.prefs
@@ -21,7 +21,7 @@ org.eclipse.jdt.core.formatter.alignment_for_assignment=0
org.eclipse.jdt.core.formatter.alignment_for_binary_expression=16
org.eclipse.jdt.core.formatter.alignment_for_compact_if=16
org.eclipse.jdt.core.formatter.alignment_for_conditional_expression=80
-org.eclipse.jdt.core.formatter.alignment_for_enum_constants=0
+org.eclipse.jdt.core.formatter.alignment_for_enum_constants=16
org.eclipse.jdt.core.formatter.alignment_for_expressions_in_array_initializer=16
org.eclipse.jdt.core.formatter.alignment_for_method_declaration=0
org.eclipse.jdt.core.formatter.alignment_for_multiple_fields=16
@@ -60,7 +60,7 @@ org.eclipse.jdt.core.formatter.brace_position_for_switch=next_line
org.eclipse.jdt.core.formatter.brace_position_for_type_declaration=next_line
org.eclipse.jdt.core.formatter.comment.clear_blank_lines_in_block_comment=false
org.eclipse.jdt.core.formatter.comment.clear_blank_lines_in_javadoc_comment=false
-org.eclipse.jdt.core.formatter.comment.format_block_comments=true
+org.eclipse.jdt.core.formatter.comment.format_block_comments=false
org.eclipse.jdt.core.formatter.comment.format_header=false
org.eclipse.jdt.core.formatter.comment.format_html=true
org.eclipse.jdt.core.formatter.comment.format_javadoc_comments=true
diff --git a/THIRDPARTYLIBS b/THIRDPARTYLIBS
index 3094939..c6c817a 100644
--- a/THIRDPARTYLIBS
+++ b/THIRDPARTYLIBS
@@ -45,6 +45,9 @@ jfreesvg-2.1.jar : GPL v3 licensed library from the JFree suite: http://www.jfre
quaqua: v.8.0 (latest stable) by Randel S Hofer. LGPL and BSD Modified license: downloaded from http://www.randelshofer.ch/quaqua/
+lib/htsjdk-1.133.jar: built from maven master at https://github.com/samtools/htsjdk MIT License to Broad Institute
+
+
Additional dependencies
examples/javascript/deployJava.js : http://java.com/js/deployJava.js
diff --git a/examples/exampleFeatures.txt b/examples/exampleFeatures.txt
index 0bb8b7e..dfadb50 100755
--- a/examples/exampleFeatures.txt
+++ b/examples/exampleFeatures.txt
@@ -1,23 +1,5 @@
-#-------------------------------------------------------------------------------
-# Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
-# Copyright (C) $$Year-Rel$$ The Jalview Authors
-#
-# This file is part of Jalview.
-#
-# Jalview is free software: you can redistribute it and/or
-# modify it under the terms of the GNU General Public License
-# as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
-#
-# Jalview is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty
-# of MERCHANTABILITY or FITNESS FOR A PARTICULAR
-# PURPOSE. See the GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License along with Jalview. If not, see <http://www.gnu.org/licenses/>.
-# The Jalview Authors are detailed in the 'AUTHORS' file.
-#-------------------------------------------------------------------------------
-ST-TURN-IIL 705b23
-GAMMA-TURN-CLASSIC 788763
+ST-TURN-IIL blue|255,0,255|absolute|20.0|95.0|below|66.0
+GAMMA-TURN-CLASSIC red|0,255,255|20.0|95.0|below|66.0
BETA-TURN-IR 9a6a94
BETA-TURN-IL d6a6ca
BETA-BULGE 1dc451
diff --git a/examples/testdata/exonerateoutput.gff b/examples/testdata/exonerateoutput.gff
index 3ea68dc..d3b5f9b 100644
--- a/examples/testdata/exonerateoutput.gff
+++ b/examples/testdata/exonerateoutput.gff
@@ -1,3 +1,7 @@
+# (exonerate delimits GFF with [START|END] OF GFF DUMP)
+# --- START OF GFF DUMP ---
+#
+#
##gff-version 2
##source-version exonerate:protein2genome:local 2.2.0
##date 2015-01-16
@@ -9,5 +13,8 @@
contig_1146 exonerate:protein2genome:local gene 8534 11269 3652 - . gene_id 0 ; sequence DDB_G0269124 ; gene_orientation .
contig_1146 exonerate:protein2genome:local cds 8534 11269 . - .
contig_1146 exonerate:protein2genome:local exon 8534 11269 . - . insertions 3 ; deletions 6
+#TODO need to understand why the GFF feature is from 11269 but Align is from 11270
contig_1146 exonerate:protein2genome:local similarity 8534 11269 3652 - . alignment_id 0 ; Query DDB_G0269124 ; Align 11270 143 120 ; Align 11150 187 282 ; Align 10865 281 888 ; Align 9977 578 1068 ; Align 8909 935 375
+# and a made-up alignment to a sequence in exonerateseqs.fa
+contig_1146 exonerate:protein2genome:local similarity 8534 11269 3652 - . alignment_id 0 ; Query DDB_G0280897 ; Align 11270 143 120
# --- END OF GFF DUMP ---
diff --git a/examples/testdata/simpleGff3.gff b/examples/testdata/simpleGff3.gff
new file mode 100644
index 0000000..d363bae
--- /dev/null
+++ b/examples/testdata/simpleGff3.gff
@@ -0,0 +1,28 @@
+##gff-version 2
+# exonerate output in gff2 format; not gff3 because
+# - 'similarity' is not a Sequence Ontology term
+# - attributes' name/values are separated by space ' ' not equals '='
+##source-version exonerate:protein2genome:local 2.2.0
+##date 2015-01-16
+##type DNA
+#
+# exonerate run with --showtargetgff generates 'features on the target' i.e. mappings to the query
+# tab-delimited
+# seqname source feature start end score strand frame attributes
+#
+seq1 exonerate:protein2genome:local gene 8 11 3652 - . gene_id 0 ; sequence seq2 ; gene_orientation .
+seq1 exonerate:protein2genome:local cds 9 11 . - .
+seq1 exonerate:protein2genome:local exon 9 11 . - . insertions 3 ; deletions 6
+#seq1 exonerate:protein2genome:local similarity 8 11 3652 - . alignment_id 0 ; Query seq2 ; Align 11 1 3
+seq1 exonerate:protein2genome:local similarity 9 11 3652 - . alignment_id 0 ; Query seq2 ; Align 11 1 3
+#
+# appending FASTA sequences is strictly a GFF3 format feature
+# but Jalview is able to handle this mixture of GFF2 / GFF3 :-)
+#
+##FASTA
+>seq1
+ACTACGACACGACGACGACGACG
+>seq2
+CDEQEATGTQDAQEQAQC
+
+
diff --git a/examples/testdata/simplegff3.gff b/examples/testdata/simplegff3.gff
deleted file mode 100644
index 2ac5421..0000000
--- a/examples/testdata/simplegff3.gff
+++ /dev/null
@@ -1,19 +0,0 @@
-##gff-version 2
-##source-version exonerate:protein2genome:local 2.2.0
-##date 2015-01-16
-##type DNA
-#
-#
-# seqname source feature start end score strand frame attributes
-#
-seq1 exonerate:protein2genome:local gene 8 11 3652 - . gene_id 0 ; sequence seq2 ; gene_orientation .
-seq1 exonerate:protein2genome:local cds 9 11 . - .
-seq1 exonerate:protein2genome:local exon 9 11 . - . insertions 3 ; deletions 6
-seq1 exonerate:protein2genome:local similarity 8 11 3652 - . alignment_id 0 ; Query seq2 ; Align 11 1 3
-##FASTA
->seq1
-ACTACGACACGACGACGACGACG
->seq2
-CDEQEATGTQDAQEQAQC
-
-
diff --git a/help/html/calculations/quality.html b/help/html/calculations/quality.html
index 4f05b06..448efef 100755
--- a/help/html/calculations/quality.html
+++ b/help/html/calculations/quality.html
@@ -44,7 +44,7 @@
conserved BLOSUM62 score (which is higher). This value is normalised
for each column, and then plotted on a scale from 0 to 1.
- Multiple alignment algorithms using the BLOSUM 62 substition
+
Multiple alignment algorithms using the BLOSUM 62 substitution
matrices should, in theory, maximise alignment quality for an
un-gapped alignment, and locally maximise quality for gapped
alignments.
diff --git a/help/html/calculations/scorematrices.html b/help/html/calculations/scorematrices.html
index 5fb900f..f6bffb0 100644
--- a/help/html/calculations/scorematrices.html
+++ b/help/html/calculations/scorematrices.html
@@ -33,7 +33,7 @@
matrix, and (since 2.8.1) is available for Tree and PCA
calculations.
Simple Nucleotide
- Substition is a (fairly) arbitrary DNA/RNA substitution matrix.
+ Substitution is a (fairly) arbitrary DNA/RNA substitution matrix.
diff --git a/help/html/features/featuresFormat.html b/help/html/features/featuresFormat.html
index 84bc5d4..9f33b7b 100755
--- a/help/html/features/featuresFormat.html
+++ b/help/html/features/featuresFormat.html
@@ -83,7 +83,7 @@
label
Indicate that the feature
description should be used to create a colour for features of
this type.
Note: if no threshold value is
- needed then the final '|' may be ommitted.
This
+ needed then the final '|' may be omitted.
This
keyword was added in Jalview 2.6
@@ -122,7 +122,7 @@
If your sequence annotation is already available in GFF Format (see
- http://www.sanger.ac.uk/resources/software/gff/spec.html),
+ gmod.org/wiki/GFF2),
then you can leave it as is, after first adding a line containing
only 'GFF' after any Jalview feature colour definitions (this
mixed format capability was added in Jalview 2.6). Alternately,
@@ -141,7 +141,7 @@
This format allows two alternate ways of referring to a sequence,
- either by its text ID, or its index in an associated alignment.
+ either by its text ID, or its index (base 0) in an associated alignment.
Normally, sequence features are associated with sequences rather than
alignments, and the sequenceIndex field is given as "-1". In
order to specify a sequence by its index in a particular alignment,
diff --git a/help/html/features/featuresettings.html b/help/html/features/featuresettings.html
index 849d7b1..9164afd 100755
--- a/help/html/features/featuresettings.html
+++ b/help/html/features/featuresettings.html
@@ -112,8 +112,7 @@
ordering based on the average length of each feature type.
- The transparency slider setting (currently
- only available in the application version) controls the visibility
+ The transparency slider setting controls the visibility
of features rendered below other features. Reducing the transparency
will mean that features at the top of the list can obscure features
lower down, and increasing it allows the user to 'see through' the
diff --git a/lib/biojava-core-4.1.0.jar b/lib/biojava-core-4.1.0.jar
new file mode 100644
index 0000000..5a09c1f
Binary files /dev/null and b/lib/biojava-core-4.1.0.jar differ
diff --git a/lib/biojava-ontology-4.1.0.jar b/lib/biojava-ontology-4.1.0.jar
new file mode 100644
index 0000000..80737d5
Binary files /dev/null and b/lib/biojava-ontology-4.1.0.jar differ
diff --git a/lib/htsjdk-1.133.jar b/lib/htsjdk-1.133.jar
new file mode 100644
index 0000000..f084258
Binary files /dev/null and b/lib/htsjdk-1.133.jar differ
diff --git a/resources/lang/Messages.properties b/resources/lang/Messages.properties
index e428989..4ab8732 100644
--- a/resources/lang/Messages.properties
+++ b/resources/lang/Messages.properties
@@ -133,6 +133,7 @@ action.using_jmol = Using Jmol
action.link = Link
action.group_link = Group Link
action.show_chain = Show Chain
+label.highlight_selection = Highlight Selection
action.show_group = Show Group
action.fetch_db_references = Fetch DB References
action.view_flanking_regions = Show flanking regions
@@ -217,6 +218,8 @@ label.above_identity_threshold = Above Identity Threshold
label.show_sequence_features = Show Sequence Features
label.nucleotide = Nucleotide
label.protein = Protein
+label.nucleotides = Nucleotides
+label.proteins = Proteins
label.to_new_alignment = To New Alignment
label.to_this_alignment = Add To This Alignment
label.apply_colour_to_all_groups = Apply Colour To All Groups
@@ -704,7 +707,9 @@ label.load_tree_for_sequence_set = Load a tree for this sequence set
label.export_image = Export Image
label.vamsas_store = VAMSAS store
label.translate_cDNA = Translate as cDNA
-label.linked_view_title = Linked cDNA and protein view
+label.reverse = Reverse
+label.reverse_complement = Reverse Complement
+label.linked_view_title = Linked CDS and protein view
label.align = Align
label.extract_scores = Extract Scores
label.get_cross_refs = Get Cross-References
@@ -1282,6 +1287,7 @@ exception.pdb_server_unreachable = Jalview is unable to reach the PDBe Solr serv
label.nw_mapping = Needleman & Wunsch Alignment
label.sifts_mapping = SIFTs Mapping
label.mapping_method = Sequence \u27f7 Structure mapping method
+label.mapping_method = Sequence \u27f7 Structure mapping method
status.waiting_for_user_to_select_output_file = Waiting for user to select {0} file.
status.cancelled_image_export_operation = Cancelled {0} export operation.
-info.error_creating_file = Error creating {0} file.
\ No newline at end of file
+info.error_creating_file = Error creating {0} file.
diff --git a/resources/so-xp-simple.obo.zip b/resources/so-xp-simple.obo.zip
new file mode 100644
index 0000000..d150da0
Binary files /dev/null and b/resources/so-xp-simple.obo.zip differ
diff --git a/src/jalview/analysis/AAFrequency.java b/src/jalview/analysis/AAFrequency.java
index 5227795..3d61b11 100755
--- a/src/jalview/analysis/AAFrequency.java
+++ b/src/jalview/analysis/AAFrequency.java
@@ -32,7 +32,6 @@ import jalview.util.QuickSort;
import java.util.Arrays;
import java.util.Hashtable;
import java.util.List;
-import java.util.Set;
/**
* Takes in a vector or array of sequences and column start and column end and
@@ -520,7 +519,7 @@ public class AAFrequency
Hashtable[] hconsensus)
{
final char gapCharacter = alignment.getGapCharacter();
- Set mappings = alignment.getCodonFrames();
+ List mappings = alignment.getCodonFrames();
if (mappings == null || mappings.isEmpty())
{
return;
@@ -541,12 +540,16 @@ public class AAFrequency
{
continue;
}
- char[] codon = MappingUtils.findCodonFor(seq, col, mappings);
- int codonEncoded = CodingUtils.encodeCodon(codon);
- if (codonEncoded >= 0)
+ List codons = MappingUtils
+ .findCodonsFor(seq, col, mappings);
+ for (char[] codon : codons)
{
- codonCounts[codonEncoded + 2]++;
- ungappedCount++;
+ int codonEncoded = CodingUtils.encodeCodon(codon);
+ if (codonEncoded >= 0)
+ {
+ codonCounts[codonEncoded + 2]++;
+ ungappedCount++;
+ }
}
}
codonCounts[1] = ungappedCount;
diff --git a/src/jalview/analysis/AlignmentSorter.java b/src/jalview/analysis/AlignmentSorter.java
index 007d538..5ee4bcb 100755
--- a/src/jalview/analysis/AlignmentSorter.java
+++ b/src/jalview/analysis/AlignmentSorter.java
@@ -32,6 +32,7 @@ import jalview.util.MessageManager;
import jalview.util.QuickSort;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.List;
/**
@@ -719,12 +720,13 @@ public class AlignmentSorter
int start, int stop, AlignmentI alignment, String method)
{
sortByFeature(featureLabel == null ? null
- : new String[] { featureLabel }, groupLabel == null ? null
- : new String[] { groupLabel }, start, stop, alignment, method);
+ : Arrays.asList(new String[] { featureLabel }),
+ groupLabel == null ? null
+ : Arrays.asList(new String[]{ groupLabel }), start, stop, alignment, method);
}
private static boolean containsIgnoreCase(final String lab,
- final String[] labs)
+ final List labs)
{
if (labs == null)
{
@@ -734,9 +736,9 @@ public class AlignmentSorter
{
return false;
}
- for (int q = 0; q < labs.length; q++)
+ for (String label : labs)
{
- if (labs[q] != null && lab.equalsIgnoreCase(labs[q]))
+ if (lab.equalsIgnoreCase(label))
{
return true;
}
@@ -744,8 +746,8 @@ public class AlignmentSorter
return false;
}
- public static void sortByFeature(String[] featureLabels,
- String[] groupLabels, int start, int stop, AlignmentI alignment,
+ public static void sortByFeature(List featureLabels,
+ List groupLabels, int start, int stop, AlignmentI alignment,
String method)
{
if (method != FEATURE_SCORE && method != FEATURE_LABEL
@@ -761,14 +763,19 @@ public class AlignmentSorter
scoreLabel.append(start + stop + method);
// This doesn't quite work yet - we'd like to have a canonical ordering that
// can be preserved from call to call
- for (int i = 0; featureLabels != null && i < featureLabels.length; i++)
+ if (featureLabels != null)
{
- scoreLabel.append(featureLabels[i] == null ? "null"
- : featureLabels[i]);
+ for (String label : featureLabels)
+ {
+ scoreLabel.append(label);
+ }
}
- for (int i = 0; groupLabels != null && i < groupLabels.length; i++)
+ if (groupLabels != null)
{
- scoreLabel.append(groupLabels[i] == null ? "null" : groupLabels[i]);
+ for (String label : groupLabels)
+ {
+ scoreLabel.append(label);
+ }
}
/*
diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java
index da5bc2f..db69823 100644
--- a/src/jalview/analysis/AlignmentUtils.java
+++ b/src/jalview/analysis/AlignmentUtils.java
@@ -28,27 +28,35 @@ import jalview.datamodel.AlignmentI;
import jalview.datamodel.DBRefEntry;
import jalview.datamodel.DBRefSource;
import jalview.datamodel.FeatureProperties;
+import jalview.datamodel.IncompleteCodonException;
import jalview.datamodel.Mapping;
import jalview.datamodel.SearchResults;
import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceGroup;
import jalview.datamodel.SequenceI;
+import jalview.io.gff.SequenceOntologyFactory;
+import jalview.io.gff.SequenceOntologyI;
import jalview.schemes.ResidueProperties;
+import jalview.util.Comparison;
import jalview.util.DBRefUtils;
import jalview.util.MapList;
import jalview.util.MappingUtils;
+import jalview.util.StringUtils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
-import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
+import java.util.NoSuchElementException;
import java.util.Set;
import java.util.TreeMap;
@@ -315,7 +323,7 @@ public class AlignmentUtils
}
else
{
- MapList map = mapProteinSequenceToCdna(aaSeq, cdnaSeq);
+ MapList map = mapCdnaToProtein(aaSeq, cdnaSeq);
if (map != null)
{
acf.addMap(cdnaSeq, aaSeq, map);
@@ -338,12 +346,12 @@ public class AlignmentUtils
* Answers true if the mappings include one between the given (dataset)
* sequences.
*/
- public static boolean mappingExists(Set set,
+ public static boolean mappingExists(List mappings,
SequenceI aaSeq, SequenceI cdnaSeq)
{
- if (set != null)
+ if (mappings != null)
{
- for (AlignedCodonFrame acf : set)
+ for (AlignedCodonFrame acf : mappings)
{
if (cdnaSeq == acf.getDnaForAaSeq(aaSeq))
{
@@ -355,16 +363,22 @@ public class AlignmentUtils
}
/**
- * Build a mapping (if possible) of a protein to a cDNA sequence. The cDNA
- * must be three times the length of the protein, possibly after ignoring
- * start and/or stop codons, and must translate to the protein. Returns null
- * if no mapping is determined.
+ * Builds a mapping (if possible) of a cDNA to a protein sequence.
+ *
+ * - first checks if the cdna translates exactly to the protein sequence
+ * - else checks for translation after removing a STOP codon
+ * - else checks for translation after removing a START codon
+ * - if that fails, inspect CDS features on the cDNA sequence
+ *
+ * Returns null if no mapping is determined.
*
- * @param proteinSeqs
+ * @param proteinSeq
+ * the aligned protein sequence
* @param cdnaSeq
+ * the aligned cdna sequence
* @return
*/
- public static MapList mapProteinSequenceToCdna(SequenceI proteinSeq,
+ public static MapList mapCdnaToProtein(SequenceI proteinSeq,
SequenceI cdnaSeq)
{
/*
@@ -394,7 +408,7 @@ public class AlignmentUtils
final int proteinEnd = proteinSeq.getEnd();
/*
- * If lengths don't match, try ignoring stop codon.
+ * If lengths don't match, try ignoring stop codon (if present)
*/
if (cdnaLength != mappedLength && cdnaLength > 2)
{
@@ -425,17 +439,20 @@ public class AlignmentUtils
cdnaLength -= 3;
}
- if (cdnaLength != mappedLength)
+ if (translatesAs(cdnaSeqChars, startOffset, aaSeqChars))
{
- return null;
- }
- if (!translatesAs(cdnaSeqChars, startOffset, aaSeqChars))
- {
- return null;
+ /*
+ * protein is translation of dna (+/- start/stop codons)
+ */
+ MapList map = new MapList(new int[] { cdnaStart, cdnaEnd }, new int[]
+ { proteinStart, proteinEnd }, 3, 1);
+ return map;
}
- MapList map = new MapList(new int[] { cdnaStart, cdnaEnd }, new int[] {
- proteinStart, proteinEnd }, 3, 1);
- return map;
+
+ /*
+ * translation failed - try mapping CDS annotated regions of dna
+ */
+ return mapCdsToProtein(cdnaSeq, proteinSeq);
}
/**
@@ -456,16 +473,18 @@ public class AlignmentUtils
return false;
}
- int aaResidue = 0;
- for (int i = cdnaStart; i < cdnaSeqChars.length - 2
- && aaResidue < aaSeqChars.length; i += 3, aaResidue++)
+ int aaPos = 0;
+ int dnaPos = cdnaStart;
+ for (; dnaPos < cdnaSeqChars.length - 2
+ && aaPos < aaSeqChars.length; dnaPos += 3, aaPos++)
{
- String codon = String.valueOf(cdnaSeqChars, i, 3);
+ String codon = String.valueOf(cdnaSeqChars, dnaPos, 3);
final String translated = ResidueProperties.codonTranslate(codon);
+
/*
* allow * in protein to match untranslatable in dna
*/
- final char aaRes = aaSeqChars[aaResidue];
+ final char aaRes = aaSeqChars[aaPos];
if ((translated == null || "STOP".equals(translated)) && aaRes == '*')
{
continue;
@@ -478,8 +497,32 @@ public class AlignmentUtils
return false;
}
}
- // fail if we didn't match all of the aa sequence
- return (aaResidue == aaSeqChars.length);
+
+ /*
+ * check we matched all of the protein sequence
+ */
+ if (aaPos != aaSeqChars.length)
+ {
+ return false;
+ }
+
+ /*
+ * check we matched all of the dna except
+ * for optional trailing STOP codon
+ */
+ if (dnaPos == cdnaSeqChars.length)
+ {
+ return true;
+ }
+ if (dnaPos == cdnaSeqChars.length - 3)
+ {
+ String codon = String.valueOf(cdnaSeqChars, dnaPos, 3);
+ if ("STOP".equals(ResidueProperties.codonTranslate(codon)))
+ {
+ return true;
+ }
+ }
+ return false;
}
/**
@@ -514,8 +557,8 @@ public class AlignmentUtils
/*
* Locate the aligned source sequence whose dataset sequence is mapped. We
- * just take the first match here (as we can't align cDNA like more than one
- * protein sequence).
+ * just take the first match here (as we can't align like more than one
+ * sequence).
*/
SequenceI alignFrom = null;
AlignedCodonFrame mapping = null;
@@ -541,8 +584,8 @@ public class AlignmentUtils
/**
* Align sequence 'alignTo' the same way as 'alignFrom', using the mapping to
* match residues and codons. Flags control whether existing gaps in unmapped
- * (intron) and mapped (exon) regions are preserved or not. Gaps linking intro
- * and exon are only retained if both flags are set.
+ * (intron) and mapped (exon) regions are preserved or not. Gaps between
+ * intron and exon are only retained if both flags are set.
*
* @param alignTo
* @param alignFrom
@@ -558,9 +601,6 @@ public class AlignmentUtils
boolean preserveUnmappedGaps)
{
// TODO generalise to work for Protein-Protein, dna-dna, dna-protein
- final char[] thisSeq = alignTo.getSequence();
- final char[] thatAligned = alignFrom.getSequence();
- StringBuilder thisAligned = new StringBuilder(2 * thisSeq.length);
// aligned and dataset sequence positions, all base zero
int thisSeqPos = 0;
@@ -570,13 +610,17 @@ public class AlignmentUtils
char myGapChar = myGap.charAt(0);
int ratio = myGap.length();
- /*
- * Traverse the aligned protein sequence.
- */
int fromOffset = alignFrom.getStart() - 1;
int toOffset = alignTo.getStart() - 1;
int sourceGapMappedLength = 0;
boolean inExon = false;
+ final char[] thisSeq = alignTo.getSequence();
+ final char[] thatAligned = alignFrom.getSequence();
+ StringBuilder thisAligned = new StringBuilder(2 * thisSeq.length);
+
+ /*
+ * Traverse the 'model' aligned sequence
+ */
for (char sourceChar : thatAligned)
{
if (sourceChar == sourceGap)
@@ -586,7 +630,7 @@ public class AlignmentUtils
}
/*
- * Found a residue. Locate its mapped codon (start) position.
+ * Found a non-gap character. Locate its mapped region if any.
*/
sourceDsPos++;
// Note mapping positions are base 1, our sequence positions base 0
@@ -595,11 +639,13 @@ public class AlignmentUtils
if (mappedPos == null)
{
/*
- * Abort realignment if unmapped protein. Or could ignore it??
+ * unmapped position; treat like a gap
*/
- System.err.println("Can't align: no codon mapping to residue "
- + sourceDsPos + "(" + sourceChar + ")");
- return;
+ sourceGapMappedLength += ratio;
+ // System.err.println("Can't align: no codon mapping to residue "
+ // + sourceDsPos + "(" + sourceChar + ")");
+ // return;
+ continue;
}
int mappedCodonStart = mappedPos[0]; // position (1...) of codon start
@@ -669,8 +715,8 @@ public class AlignmentUtils
}
/*
- * At end of protein sequence. Copy any remaining dna sequence, optionally
- * including (intron) gaps. We do not copy trailing gaps in protein.
+ * At end of model aligned sequence. Copy any remaining target sequence, optionally
+ * including (intron) gaps.
*/
while (thisSeqPos < thisSeq.length)
{
@@ -679,6 +725,20 @@ public class AlignmentUtils
{
thisAligned.append(c);
}
+ sourceGapMappedLength--;
+ }
+
+ /*
+ * finally add gaps to pad for any trailing source gaps or
+ * unmapped characters
+ */
+ if (preserveUnmappedGaps)
+ {
+ while (sourceGapMappedLength > 0)
+ {
+ thisAligned.append(myGapChar);
+ sourceGapMappedLength--;
+ }
}
/*
@@ -907,33 +967,152 @@ public class AlignmentUtils
public static int alignProteinAsDna(AlignmentI protein, AlignmentI dna)
{
List unmappedProtein = new ArrayList();
+ Map> alignedCodons = buildCodonColumnsMap(
+ protein, dna, unmappedProtein);
+ return alignProteinAs(protein, alignedCodons, unmappedProtein);
+ }
+
+ /**
+ * Builds a map whose key is an aligned codon position (3 alignment column
+ * numbers base 0), and whose value is a map from protein sequence to each
+ * protein's peptide residue for that codon. The map generates an ordering of
+ * the codons, and allows us to read off the peptides at each position in
+ * order to assemble 'aligned' protein sequences.
+ *
+ * @param protein
+ * the protein alignment
+ * @param dna
+ * the coding dna alignment
+ * @param unmappedProtein
+ * any unmapped proteins are added to this list
+ * @return
+ */
+ protected static Map> buildCodonColumnsMap(
+ AlignmentI protein, AlignmentI dna,
+ List unmappedProtein)
+ {
+ /*
+ * maintain a list of any proteins with no mappings - these will be
+ * rendered 'as is' in the protein alignment as we can't align them
+ */
unmappedProtein.addAll(protein.getSequences());
- Set mappings = protein.getCodonFrames();
+ List mappings = protein.getCodonFrames();
/*
* Map will hold, for each aligned codon position e.g. [3, 5, 6], a map of
* {dnaSequence, {proteinSequence, codonProduct}} at that position. The
* comparator keeps the codon positions ordered.
*/
- Map> alignedCodons = new TreeMap>(
+ Map> alignedCodons = new TreeMap>(
new CodonComparator());
+
for (SequenceI dnaSeq : dna.getSequences())
{
for (AlignedCodonFrame mapping : mappings)
{
- Mapping seqMap = mapping.getMappingForSequence(dnaSeq);
SequenceI prot = mapping.findAlignedSequence(
dnaSeq.getDatasetSequence(), protein);
if (prot != null)
{
+ Mapping seqMap = mapping.getMappingForSequence(dnaSeq);
addCodonPositions(dnaSeq, prot, protein.getGapCharacter(),
seqMap, alignedCodons);
unmappedProtein.remove(prot);
}
}
}
- return alignProteinAs(protein, alignedCodons, unmappedProtein);
+
+ /*
+ * Finally add any unmapped peptide start residues (e.g. for incomplete
+ * codons) as if at the codon position before the second residue
+ */
+ int mappedSequenceCount = protein.getHeight() - unmappedProtein.size();
+ addUnmappedPeptideStarts(alignedCodons, mappedSequenceCount);
+
+ return alignedCodons;
+ }
+
+ /**
+ * Scans for any protein mapped from position 2 (meaning unmapped start
+ * position e.g. an incomplete codon), and synthesizes a 'codon' for it at the
+ * preceding position in the alignment
+ *
+ * @param alignedCodons
+ * the codon-to-peptide map
+ * @param mappedSequenceCount
+ * the number of distinct sequences in the map
+ */
+ protected static void addUnmappedPeptideStarts(
+ Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
+ int mappedSequenceCount)
+ {
+ // TODO delete this ugly hack once JAL-2022 is resolved
+ // i.e. we can model startPhase > 0 (incomplete start codon)
+
+ List<SequenceI> sequencesChecked = new ArrayList<SequenceI>();
+ AlignedCodon lastCodon = null;
+ Map<SequenceI, AlignedCodon> toAdd = new HashMap<SequenceI, AlignedCodon>();
+
+ for (Entry<AlignedCodon, Map<SequenceI, AlignedCodon>> entry : alignedCodons
+ .entrySet())
+ {
+ for (Entry<SequenceI, AlignedCodon> sequenceCodon : entry.getValue()
+ .entrySet())
+ {
+ SequenceI seq = sequenceCodon.getKey();
+ if (sequencesChecked.contains(seq))
+ {
+ continue;
+ }
+ sequencesChecked.add(seq);
+ AlignedCodon codon = sequenceCodon.getValue();
+ if (codon.peptideCol > 1)
+ {
+ System.err
+ .println("Problem mapping protein with >1 unmapped start positions: "
+ + seq.getName());
+ }
+ else if (codon.peptideCol == 1)
+ {
+ /*
+ * first position (peptideCol == 0) was unmapped - add it
+ */
+ if (lastCodon != null)
+ {
+ AlignedCodon firstPeptide = new AlignedCodon(lastCodon.pos1,
+ lastCodon.pos2, lastCodon.pos3, String.valueOf(seq
+ .getCharAt(0)), 0);
+ toAdd.put(seq, firstPeptide);
+ }
+ else
+ {
+ /*
+ * unmapped residue at start of alignment (no prior column) -
+ * 'insert' at nominal codon [0, 0, 0]
+ */
+ AlignedCodon firstPeptide = new AlignedCodon(0, 0, 0,
+ String.valueOf(seq.getCharAt(0)), 0);
+ toAdd.put(seq, firstPeptide);
+ }
+ }
+ if (sequencesChecked.size() == mappedSequenceCount)
+ {
+ // no need to check past first mapped position in all sequences
+ break;
+ }
+ }
+ lastCodon = entry.getKey();
+ }
+
+ /*
+ * add any new codons safely after iterating over the map
+ */
+ for (Entry<SequenceI, AlignedCodon> startCodon : toAdd.entrySet())
+ {
+ addCodonToMap(alignedCodons, startCodon.getValue(),
+ startCodon.getKey());
+ }
+ }
/**
@@ -948,7 +1127,7 @@ public class AlignmentUtils
* @return
*/
protected static int alignProteinAs(AlignmentI protein,
- Map> alignedCodons,
+ Map> alignedCodons,
List unmappedProtein)
{
/*
@@ -970,12 +1149,13 @@ public class AlignmentUtils
int column = 0;
for (AlignedCodon codon : alignedCodons.keySet())
{
- final Map columnResidues = alignedCodons
+ final Map columnResidues = alignedCodons
.get(codon);
- for (Entry entry : columnResidues.entrySet())
+ for (Entry entry : columnResidues.entrySet())
{
// place translated codon at its column position in sequence
- entry.getKey().getSequence()[column] = entry.getValue().charAt(0);
+ entry.getKey().getSequence()[column] = entry.getValue().product
+ .charAt(0);
}
column++;
}
@@ -1000,23 +1180,51 @@ public class AlignmentUtils
*/
static void addCodonPositions(SequenceI dna, SequenceI protein,
char gapChar, Mapping seqMap,
- Map> alignedCodons)
+ Map> alignedCodons)
{
Iterator codons = seqMap.getCodonIterator(dna, gapChar);
+
+ /*
+ * add codon positions, and their peptide translations, to the alignment
+ * map, while remembering the first codon mapped
+ */
while (codons.hasNext())
{
- AlignedCodon codon = codons.next();
- Map seqProduct = alignedCodons.get(codon);
- if (seqProduct == null)
+ try
+ {
+ AlignedCodon codon = codons.next();
+ addCodonToMap(alignedCodons, codon, protein);
+ } catch (IncompleteCodonException e)
{
- seqProduct = new HashMap();
- alignedCodons.put(codon, seqProduct);
+ // possible incomplete trailing codon - ignore
+ } catch (NoSuchElementException e)
+ {
+ // possibly peptide lacking STOP
}
- seqProduct.put(protein, codon.product);
}
}
/**
+ * Helper method to add a codon-to-peptide entry to the aligned codons map
+ *
+ * @param alignedCodons
+ * @param codon
+ * @param protein
+ */
+ protected static void addCodonToMap(
+ Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
+ AlignedCodon codon, SequenceI protein)
+ {
+ Map<SequenceI, AlignedCodon> seqProduct = alignedCodons.get(codon);
+ if (seqProduct == null)
+ {
+ seqProduct = new HashMap<SequenceI, AlignedCodon>();
+ alignedCodons.put(codon, seqProduct);
+ }
+ seqProduct.put(protein, codon);
+ }
+
+ /**
* Returns true if a cDNA/Protein mapping either exists, or could be made,
* between at least one pair of sequences in the two alignments. Currently,
* the logic is:
@@ -1048,7 +1256,7 @@ public class AlignmentUtils
}
AlignmentI dna = al1.isNucleotide() ? al1 : al2;
AlignmentI protein = dna == al1 ? al2 : al1;
- Set mappings = protein.getCodonFrames();
+ List mappings = protein.getCodonFrames();
for (SequenceI dnaSeq : dna.getSequences())
{
for (SequenceI proteinSeq : protein.getSequences())
@@ -1072,7 +1280,7 @@ public class AlignmentUtils
* @return
*/
protected static boolean isMappable(SequenceI dnaSeq,
- SequenceI proteinSeq, Set mappings)
+ SequenceI proteinSeq, List mappings)
{
if (dnaSeq == null || proteinSeq == null)
{
@@ -1084,13 +1292,13 @@ public class AlignmentUtils
SequenceI proteinDs = proteinSeq.getDatasetSequence() == null ? proteinSeq
: proteinSeq.getDatasetSequence();
- /*
- * Already mapped?
- */
for (AlignedCodonFrame mapping : mappings)
{
if (proteinDs == mapping.getAaForDnaSeq(dnaDs))
{
+ /*
+ * already mapped
+ */
return true;
}
}
@@ -1099,7 +1307,7 @@ public class AlignmentUtils
* Just try to make a mapping (it is not yet stored), test whether
* successful.
*/
- return mapProteinSequenceToCdna(proteinDs, dnaDs) != null;
+ return mapCdnaToProtein(proteinDs, dnaDs) != null;
}
/**
@@ -1301,21 +1509,31 @@ public class AlignmentUtils
}
/**
- * Constructs an alignment consisting of the mapped exon regions in the given
- * nucleotide sequences, and updates mappings to match.
+ * Constructs an alignment consisting of the mapped (CDS) regions in the given
+ * nucleotide sequences, and updates mappings to match. The new sequences are
+ * aligned as per the original sequence, with entirely gapped columns (codon
+ * interrupted by intron) omitted.
*
* @param dna
* aligned dna sequences
* @param mappings
* from dna to protein; these are replaced with new mappings
- * @return an alignment whose sequences are the exon-only parts of the dna
- * sequences (or null if no exons are found)
+ * @param al
+ * @return an alignment whose sequences are the cds-only parts of the dna
+ * sequences (or null if no mappings are found)
*/
- public static AlignmentI makeExonAlignment(SequenceI[] dna,
- Set mappings)
+ public static AlignmentI makeCdsAlignment(SequenceI[] dna,
+ List mappings, AlignmentI al)
{
- Set newMappings = new LinkedHashSet();
- List exonSequences = new ArrayList();
+ List cdsColumns = findCdsColumns(dna);
+
+ /*
+ * create CDS sequences and new mappings
+ * (from cdna to cds, and cds to peptide)
+ */
+ List newMappings = new ArrayList();
+ List cdsSequences = new ArrayList();
+ char gap = al.getGapCharacter();
for (SequenceI dnaSeq : dna)
{
@@ -1325,18 +1543,30 @@ public class AlignmentUtils
for (AlignedCodonFrame acf : seqMappings)
{
AlignedCodonFrame newMapping = new AlignedCodonFrame();
- final List mappedExons = makeExonSequences(ds, acf,
- newMapping);
- if (!mappedExons.isEmpty())
+ final List mappedCds = makeCdsSequences(dnaSeq, acf,
+ cdsColumns, newMapping, gap);
+ if (!mappedCds.isEmpty())
{
- exonSequences.addAll(mappedExons);
+ cdsSequences.addAll(mappedCds);
newMappings.add(newMapping);
}
}
}
- AlignmentI al = new Alignment(
- exonSequences.toArray(new SequenceI[exonSequences.size()]));
- al.setDataset(null);
+ AlignmentI newAl = new Alignment(
+ cdsSequences.toArray(new SequenceI[cdsSequences.size()]));
+
+ /*
+ * add new sequences to the shared dataset, set it on the new alignment
+ */
+ List dsseqs = al.getDataset().getSequences();
+ for (SequenceI seq : newAl.getSequences())
+ {
+ if (!dsseqs.contains(seq.getDatasetSequence()))
+ {
+ dsseqs.add(seq.getDatasetSequence());
+ }
+ }
+ newAl.setDataset(al.getDataset());
/*
* Replace the old mappings with the new ones
@@ -1344,90 +1574,741 @@ public class AlignmentUtils
mappings.clear();
mappings.addAll(newMappings);
- return al;
+ return newAl;
+ }
+
+ /**
+  * Returns a consolidated list of column ranges where at least one sequence
+  * has a CDS feature. This assumes CDS features are on genomic sequence i.e.
+  * are for contiguous CDS ranges (no gaps).
+  *
+  * @param seqs
+  *          the sequences whose CDS features are inspected
+  * @return [start, end] column ranges, ordered by start column and then
+  *         passed through MapList.coalesceRanges
+  */
+ public static List<int[]> findCdsColumns(SequenceI[] seqs)
+ {
+   // TODO use refactored code from AlignViewController
+   // markColumnsContainingFeatures, not reinvent the wheel!
+
+   List<int[]> result = new ArrayList<int[]>();
+   for (SequenceI seq : seqs)
+   {
+     result.addAll(findCdsColumns(seq));
+   }
+
+   /*
+    * sort and compact the list into ascending, non-overlapping ranges
+    */
+   Collections.sort(result, new Comparator<int[]>()
+   {
+     @Override
+     public int compare(int[] o1, int[] o2)
+     {
+       // order by range start only; Integer.compare avoids overflow
+       return Integer.compare(o1[0], o2[0]);
+     }
+   });
+   return MapList.coalesceRanges(result);
+ }
+
+ /**
+  * Returns the column range [start, end] of each feature on the sequence
+  * whose type is CDS according to the Sequence Ontology check
+  * (SequenceOntologyI.isA). Feature begin and end positions are converted to
+  * columns using SequenceI.findIndex.
+  *
+  * @param seq
+  *          the sequence whose features are inspected
+  * @return a list of column ranges in feature order; empty if the sequence
+  *         has no features or no CDS features
+  */
+ public static List<int[]> findCdsColumns(SequenceI seq)
+ {
+   List<int[]> result = new ArrayList<int[]>();
+   SequenceOntologyI so = SequenceOntologyFactory.getInstance();
+   SequenceFeature[] sfs = seq.getSequenceFeatures();
+   if (sfs == null)
+   {
+     return result;
+   }
+   for (SequenceFeature sf : sfs)
+   {
+     if (so.isA(sf.getType(), SequenceOntologyI.CDS))
+     {
+       int colStart = seq.findIndex(sf.getBegin());
+       int colEnd = seq.findIndex(sf.getEnd());
+       result.add(new int[] { colStart, colEnd });
+     }
+   }
+   return result;
+ }
+
+ /**
+  * Answers true if all sequences have a gap at (or do not extend to) the
+  * specified column position (base 1). Also answers true if the list is null
+  * or empty, since no sequence then has a non-gap character in the column.
+  *
+  * @param seqs
+  *          the sequences to test (may be null)
+  * @param col
+  *          the column position, base 1
+  * @return false as soon as any sequence has a non-gap at the column, else
+  *         true
+  */
+ public static boolean isGappedColumn(List<SequenceI> seqs, int col)
+ {
+   if (seqs == null)
+   {
+     return true;
+   }
+   for (SequenceI seq : seqs)
+   {
+     // getCharAt is base 0, col is base 1
+     if (!Comparison.isGap(seq.getCharAt(col - 1)))
+     {
+       return false;
+     }
+   }
+   return true;
+ }
+
+ /**
+  * Returns the column ranges (base 1) of each aligned sequence that are
+  * involved in any mapping. This is a helper method for aligning protein
+  * products of aligned transcripts.
+  *
+  * @param mappedSequences
+  *          (possibly gapped) dna sequences
+  * @param mappings
+  *          the mappings searched for each sequence
+  * @return one list per input sequence, in input order, holding a
+  *         [startColumn, endColumn] pair for each 'from' range of each
+  *         mapping found for that sequence
+  */
+ protected static List<List<int[]>> getMappedColumns(
+         List<SequenceI> mappedSequences, List<AlignedCodonFrame> mappings)
+ {
+   List<List<int[]>> result = new ArrayList<List<int[]>>();
+   for (SequenceI seq : mappedSequences)
+   {
+     List<int[]> columns = new ArrayList<int[]>();
+     List<AlignedCodonFrame> seqMappings = MappingUtils
+             .findMappingsForSequence(seq, mappings);
+     for (AlignedCodonFrame mapping : seqMappings)
+     {
+       List<Mapping> maps = mapping.getMappingsForSequence(seq);
+       for (Mapping map : maps)
+       {
+         /*
+          * Get the codon regions as { [2, 5], [7, 12], [14, 14] etc }
+          * Find and add the overall aligned column range for each
+          */
+         for (int[] cdsRange : map.getMap().getFromRanges())
+         {
+           int startCol = seq.findIndex(cdsRange[0]);
+           int endCol = seq.findIndex(cdsRange[1]);
+           columns.add(new int[] { startCol, endCol });
+         }
+       }
+     }
+     result.add(columns);
+   }
+   return result;
}
/**
- * Helper method to make exon-only sequences and populate their mappings to
+ * Helper method to make cds-only sequences and populate their mappings to
* protein products
*
* For example, if ggCCaTTcGAg has mappings [3, 4, 6, 7, 9, 10] to protein
* then generate a sequence CCTTGA with mapping [1, 6] to the same protein
* residues
*
- * Typically eukaryotic dna will include exons encoding for a single peptide
+ * Typically eukaryotic dna will include cds encoding for a single peptide
* sequence i.e. return a single result. Bacterial dna may have overlapping
- * exon mappings coding for multiple peptides so return multiple results
+ * cds mappings coding for multiple peptides so return multiple results
* (example EMBL KF591215).
*
* @param dnaSeq
- * a dna dataset sequence
+ * a dna aligned sequence
* @param mapping
* containing one or more mappings of the sequence to protein
- * @param newMapping
- * the new mapping to populate, from the exon-only sequences to their
+ * @param ungappedCdsColumns
+ * @param newMappings
+ * the new mapping to populate, from the cds-only sequences to their
* mapped protein sequences
* @return
*/
- protected static List makeExonSequences(SequenceI dnaSeq,
- AlignedCodonFrame mapping, AlignedCodonFrame newMapping)
+ protected static List makeCdsSequences(SequenceI dnaSeq,
+ AlignedCodonFrame mapping, List ungappedCdsColumns,
+ AlignedCodonFrame newMappings, char gapChar)
{
- List exonSequences = new ArrayList();
+ List cdsSequences = new ArrayList();
List seqMappings = mapping.getMappingsForSequence(dnaSeq);
- final char[] dna = dnaSeq.getSequence();
+
for (Mapping seqMapping : seqMappings)
{
- StringBuilder newSequence = new StringBuilder(dnaSeq.getLength());
+ SequenceI cds = makeCdsSequence(dnaSeq, seqMapping,
+ ungappedCdsColumns, gapChar);
+ cds.createDatasetSequence();
+ cdsSequences.add(cds);
/*
- * Get the codon regions as { [2, 5], [7, 12], [14, 14] etc }
+ * add new mappings, from dna to cds, and from cds to peptide
*/
- final List dnaExonRanges = seqMapping.getMap().getFromRanges();
- for (int[] range : dnaExonRanges)
+ MapList dnaToCds = addCdsMappings(dnaSeq.getDatasetSequence(), cds,
+ seqMapping, newMappings);
+
+ /*
+ * transfer any features on dna that overlap the CDS
+ */
+ transferFeatures(dnaSeq, cds, dnaToCds, null, SequenceOntologyI.CDS);
+ }
+ return cdsSequences;
+ }
+
+ /**
+ * Transfers co-located features on 'fromSeq' to 'toSeq', adjusting the
+ * feature start/end ranges, optionally omitting specified feature types.
+ * Returns the number of features copied.
+ *
+ * @param fromSeq
+ * @param toSeq
+ * @param select
+ * if not null, only features of this type are copied (including
+ * subtypes in the Sequence Ontology)
+ * @param mapping
+ * the mapping from 'fromSeq' to 'toSeq'
+ * @param omitting
+ */
+ public static int transferFeatures(SequenceI fromSeq, SequenceI toSeq,
+ MapList mapping, String select, String... omitting)
+ {
+ SequenceI copyTo = toSeq;
+ while (copyTo.getDatasetSequence() != null)
+ {
+ copyTo = copyTo.getDatasetSequence();
+ }
+
+ SequenceOntologyI so = SequenceOntologyFactory.getInstance();
+ int count = 0;
+ SequenceFeature[] sfs = fromSeq.getSequenceFeatures();
+ if (sfs != null)
+ {
+ for (SequenceFeature sf : sfs)
{
- for (int pos = range[0]; pos <= range[1]; pos++)
+ String type = sf.getType();
+ if (select != null && !so.isA(type, select))
{
- newSequence.append(dna[pos - 1]);
+ continue;
+ }
+ boolean omit = false;
+ for (String toOmit : omitting)
+ {
+ if (type.equals(toOmit))
+ {
+ omit = true;
+ }
+ }
+ if (omit)
+ {
+ continue;
+ }
+
+ /*
+ * locate the mapped range - null if either start or end is
+ * not mapped (no partial overlaps are calculated)
+ */
+ int start = sf.getBegin();
+ int end = sf.getEnd();
+ int[] mappedTo = mapping.locateInTo(start, end);
+ /*
+ * if whole exon range doesn't map, try interpreting it
+ * as 5' or 3' exon overlapping the CDS range
+ */
+ if (mappedTo == null)
+ {
+ mappedTo = mapping.locateInTo(end, end);
+ if (mappedTo != null)
+ {
+ /*
+ * end of exon is in CDS range - 5' overlap
+ * to a range from the start of the peptide
+ */
+ mappedTo[0] = 1;
+ }
+ }
+ if (mappedTo == null)
+ {
+ mappedTo = mapping.locateInTo(start, start);
+ if (mappedTo != null)
+ {
+ /*
+ * start of exon is in CDS range - 3' overlap
+ * to a range up to the end of the peptide
+ */
+ mappedTo[1] = toSeq.getLength();
+ }
+ }
+ if (mappedTo != null)
+ {
+ SequenceFeature copy = new SequenceFeature(sf);
+ copy.setBegin(Math.min(mappedTo[0], mappedTo[1]));
+ copy.setEnd(Math.max(mappedTo[0], mappedTo[1]));
+ copyTo.addSequenceFeature(copy);
+ count++;
}
}
+ }
+ return count;
+ }
- SequenceI exon = new Sequence(dnaSeq.getName(),
- newSequence.toString());
+ /**
+ * Creates and adds mappings
+ *
+ * - from cds to peptide
+ * - from dna to cds
+ *
+ * and returns the dna-to-cds mapping
+ *
+ * @param dnaSeq
+ * @param cdsSeq
+ * @param dnaMapping
+ * @param newMappings
+ * @return
+ */
+ protected static MapList addCdsMappings(SequenceI dnaSeq,
+ SequenceI cdsSeq, Mapping dnaMapping,
+ AlignedCodonFrame newMappings)
+ {
+ cdsSeq.createDatasetSequence();
- /*
- * Locate any xrefs to CDS database on the protein product and attach to
- * the CDS sequence. Also add as a sub-token of the sequence name.
- */
- // default to "CDS" if we can't locate an actual gene id
- String cdsAccId = FeatureProperties
- .getCodingFeature(DBRefSource.EMBL);
- DBRefEntry[] cdsRefs = DBRefUtils.selectRefs(seqMapping.getTo()
- .getDBRefs(), DBRefSource.CODINGDBS);
- if (cdsRefs != null)
+ /*
+ * CDS to peptide is just a contiguous 3:1 mapping, with
+ * the peptide ranges taken unchanged from the dna mapping
+ */
+ List cdsRanges = new ArrayList();
+ SequenceI cdsDataset = cdsSeq.getDatasetSequence();
+ cdsRanges.add(new int[] { 1, cdsDataset.getLength() });
+ MapList cdsToPeptide = new MapList(cdsRanges, dnaMapping.getMap()
+ .getToRanges(), 3, 1);
+ newMappings.addMap(cdsDataset, dnaMapping.getTo(), cdsToPeptide);
+
+ /*
+ * dna 'from' ranges map 1:1 to the contiguous extracted CDS
+ */
+ MapList dnaToCds = new MapList(dnaMapping.getMap().getFromRanges(),
+ cdsRanges, 1, 1);
+ newMappings.addMap(dnaSeq, cdsDataset, dnaToCds);
+ return dnaToCds;
+ }
+
+ /**
+ * Makes and returns a CDS-only sequence, where the CDS regions are identified
+ * as the 'from' ranges of the mapping on the dna.
+ *
+ * @param dnaSeq
+ * nucleotide sequence
+ * @param seqMapping
+ * mappings from CDS regions of nucleotide
+ * @param ungappedCdsColumns
+ * @return
+ */
+ protected static SequenceI makeCdsSequence(SequenceI dnaSeq,
+ Mapping seqMapping, List ungappedCdsColumns, char gapChar)
+ {
+ int cdsWidth = MappingUtils.getLength(ungappedCdsColumns);
+
+ /*
+ * populate CDS columns with the aligned
+ * column character if that column is mapped (which may be a gap
+ * if an intron interrupts a codon), else with a gap
+ */
+ List fromRanges = seqMapping.getMap().getFromRanges();
+ char[] cdsChars = new char[cdsWidth];
+ int pos = 0;
+ for (int[] columns : ungappedCdsColumns)
+ {
+ for (int i = columns[0]; i <= columns[1]; i++)
{
- for (DBRefEntry cdsRef : cdsRefs)
+ char dnaChar = dnaSeq.getCharAt(i - 1);
+ if (Comparison.isGap(dnaChar))
{
- exon.addDBRef(new DBRefEntry(cdsRef));
- cdsAccId = cdsRef.getAccessionId();
+ cdsChars[pos] = gapChar;
+ }
+ else
+ {
+ int seqPos = dnaSeq.findPosition(i - 1);
+ if (MappingUtils.contains(fromRanges, seqPos))
+ {
+ cdsChars[pos] = dnaChar;
+ }
+ else
+ {
+ cdsChars[pos] = gapChar;
+ }
}
+ pos++;
}
- exon.setName(exon.getName() + "|" + cdsAccId);
- exon.createDatasetSequence();
+ }
+ SequenceI cdsSequence = new Sequence(dnaSeq.getName(),
+ String.valueOf(cdsChars));
+
+ transferDbRefs(seqMapping.getTo(), cdsSequence);
+
+ return cdsSequence;
+ }
+
+ /**
+  * Copies database references for coding databases (those selected by
+  * DBRefSource.CODINGDBS) from one sequence to another, and ensures the
+  * target's name contains an accession id as a "|"-separated suffix. The id
+  * used is that of the last reference copied, or the value of
+  * FeatureProperties.getCodingFeature(DBRefSource.EMBL) if none was found.
+  *
+  * @param from
+  *          the sequence whose database references are searched
+  * @param to
+  *          the sequence that receives copies of the references, and whose
+  *          name may be extended
+  */
+ protected static void transferDbRefs(SequenceI from, SequenceI to)
+ {
+   DBRefEntry[] codingRefs = DBRefUtils.selectRefs(from.getDBRefs(),
+           DBRefSource.CODINGDBS);
+
+   // fallback token, replaced by a real accession if any ref is found
+   String accession = FeatureProperties.getCodingFeature(DBRefSource.EMBL);
+   if (codingRefs != null)
+   {
+     for (int i = 0; i < codingRefs.length; i++)
+     {
+       DBRefEntry ref = codingRefs[i];
+       to.addDBRef(new DBRefEntry(ref));
+       accession = ref.getAccessionId();
+     }
+   }
+
+   String currentName = to.getName();
+   if (currentName.contains(accession))
+   {
+     return;
+   }
+   to.setName(currentName + "|" + accession);
+ }
+ /**
+ * Returns a mapping from dna to protein by inspecting sequence features of
+ * type "CDS" on the dna.
+ *
+ * The total length of the CDS ranges is divided by 3 (integer division) to
+ * give a codon count. A mapping is returned only if that count equals the
+ * peptide length, or the peptide length plus one (the extra codon is then
+ * assumed to be a stop codon). A leading 'X' on the peptide is excluded from
+ * both the peptide length and the mapped peptide range.
+ *
+ * @param dnaSeq
+ * the sequence whose CDS features supply the 'from' ranges
+ * @param proteinSeq
+ * the peptide whose start..end supplies the single 'to' range
+ * @return a 3:1 mapping from the CDS ranges to the peptide range, or null if
+ * the codon count matches neither length described above
+ */
+ public static MapList mapCdsToProtein(SequenceI dnaSeq,
+ SequenceI proteinSeq)
+ {
+ List ranges = findCdsPositions(dnaSeq);
+ int mappedDnaLength = MappingUtils.getLength(ranges);
+
+ int proteinLength = proteinSeq.getLength();
+ int proteinStart = proteinSeq.getStart();
+ int proteinEnd = proteinSeq.getEnd();
+
+ /*
+ * incomplete start codon may mean X at start of peptide
+ * we ignore both for mapping purposes
+ */
+ if (proteinSeq.getCharAt(0) == 'X')
+ {
+ // todo JAL-2022 support startPhase > 0
+ proteinStart++;
+ proteinLength--;
+ }
+ List proteinRange = new ArrayList();
+
+ /*
+ * dna length should map to protein (or protein plus stop codon)
+ */
+ // integer division: up to two trailing bases beyond a whole codon are
+ // not counted
+ int codesForResidues = mappedDnaLength / 3;
+ if (codesForResidues == (proteinLength + 1))
+ {
+ // assuming extra codon is for STOP and not in peptide
+ codesForResidues--;
+ }
+ if (codesForResidues == proteinLength)
+ {
+ // NOTE(review): 'ranges' is passed unchanged, so in the stop codon case
+ // the 'from' ranges still include the extra codon - confirm that
+ // MapList accepts this
+ proteinRange.add(new int[] { proteinStart, proteinEnd });
+ return new MapList(ranges, proteinRange, 3, 1);
+ }
+ return null;
+ }
+
+ /**
+ * Returns a list of CDS ranges found (as sequence positions base 1), i.e. of
+ * start/end positions of sequence features of type "CDS" (or a sub-type of
+ * CDS in the Sequence Ontology)
+ *
+ * @param dnaSeq
+ * @return
+ */
+ public static List findCdsPositions(SequenceI dnaSeq)
+ {
+ List result = new ArrayList();
+ SequenceFeature[] sfs = dnaSeq.getSequenceFeatures();
+ if (sfs == null)
+ {
+ return result;
+ }
+ SequenceOntologyI so = SequenceOntologyFactory.getInstance();
+ for (SequenceFeature sf : sfs)
+ {
/*
- * Build new mappings - from the same protein regions, but now to
- * contiguous exons
+ * process a CDS feature (or a sub-type of CDS)
*/
- List exonRange = new ArrayList();
- exonRange.add(new int[] { 1, newSequence.length() });
- MapList map = new MapList(exonRange, seqMapping.getMap()
- .getToRanges(), 3, 1);
- newMapping.addMap(exon.getDatasetSequence(), seqMapping.getTo(), map);
- MapList cdsToDnaMap = new MapList(dnaExonRanges, exonRange, 1, 1);
- newMapping.addMap(dnaSeq, exon.getDatasetSequence(), cdsToDnaMap);
+ if (so.isA(sf.getType(), SequenceOntologyI.CDS))
+ {
+ int phase = 0;
+ try {
+ phase = Integer.parseInt(sf.getPhase());
+ } catch (NumberFormatException e)
+ {
+ // ignore
+ }
+ /*
+ * phase > 0 on first codon means 5' incomplete - skip to the start
+ * of the next codon; example ENST00000496384
+ */
+ int begin = sf.getBegin();
+ int end = sf.getEnd();
+ if (result.isEmpty())
+ {
+ // TODO JAL-2022 support start phase > 0
+ begin += phase;
+ if (begin > end)
+ {
+ continue; // shouldn't happen?
+ }
+ }
+ result.add(new int[] { begin, end });
+ }
+ }
+ return result;
+ }
+
+ /**
+  * Maps exon features from dna to protein, and computes variants in peptide
+  * product generated by variants in dna, and adds them as sequence_variant
+  * features on the protein sequence. Returns the number of variant features
+  * added.
+  *
+  * Both sequences are first resolved to their dataset sequences (by following
+  * getDatasetSequence until it returns null), so features are read from and
+  * added to those. Finally the peptide's features, if it has any, are sorted
+  * in place by start position and then end position.
+  *
+  * @param dnaSeq
+  *          the nucleotide sequence carrying exon and variant features
+  * @param peptide
+  *          the protein sequence to add features to
+  * @param dnaToProtein
+  *          the mapping from dna positions to peptide positions
+  * @return the number of sequence_variant features added to the peptide
+  */
+ public static int computeProteinFeatures(SequenceI dnaSeq,
+         SequenceI peptide, MapList dnaToProtein)
+ {
+   while (dnaSeq.getDatasetSequence() != null)
+   {
+     dnaSeq = dnaSeq.getDatasetSequence();
+   }
+   while (peptide.getDatasetSequence() != null)
+   {
+     peptide = peptide.getDatasetSequence();
+   }
+
+   transferFeatures(dnaSeq, peptide, dnaToProtein,
+           SequenceOntologyI.EXON);
+
+   LinkedHashMap<Integer, String[][]> variants = buildDnaVariantsMap(
+           dnaSeq, dnaToProtein);
+
+   /*
+    * scan codon variations, compute peptide variants and add to peptide sequence
+    */
+   int count = 0;
+   for (Entry<Integer, String[][]> variant : variants.entrySet())
+   {
+     int peptidePos = variant.getKey();
+     String[][] codonVariants = variant.getValue();
+     String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); // 0-based
+     List<String> peptideVariants = computePeptideVariants(codonVariants,
+             residue);
+     if (!peptideVariants.isEmpty())
+     {
+       String desc = StringUtils.listToDelimitedString(peptideVariants,
+               ", ");
+       SequenceFeature sf = new SequenceFeature(
+               SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos,
+               peptidePos, 0f, null);
+       peptide.addSequenceFeature(sf);
+       count++;
+     }
+   }
+
+   /*
+    * ugly sort to get sequence features in start position order
+    * - would be better to store in Sequence as a TreeSet instead?
+    * (guarded: the features array is null if nothing was transferred or
+    * added and the peptide had no features of its own)
+    */
+   SequenceFeature[] peptideFeatures = peptide.getSequenceFeatures();
+   if (peptideFeatures != null)
+   {
+     Arrays.sort(peptideFeatures, new Comparator<SequenceFeature>()
+     {
+       @Override
+       public int compare(SequenceFeature o1, SequenceFeature o2)
+       {
+         int c = Integer.compare(o1.getBegin(), o2.getBegin());
+         return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd()) : c;
+       }
+     });
+   }
+   return count;
+ }
+
+ /**
+  * Builds a map whose key is position in the protein sequence, and value is an
+  * array of all variants for the coding codon positions. Each value is a
+  * String[3][] with one array per codon position, holding the current base at
+  * that position followed by any alleles recorded for it.
+  *
+  * Only features that start and end at the same position, pass the Sequence
+  * Ontology check for sequence_variant, map into the protein, and carry an
+  * "alleles" value contribute. A map entry is created only once such a
+  * feature is found, so all three arrays of every returned entry are non-null.
+  *
+  * @param dnaSeq
+  *          the nucleotide sequence whose features are scanned
+  * @param dnaToProtein
+  *          the mapping used to locate peptide positions and codons
+  * @return the variants map, in order of first qualifying feature per peptide
+  *         position; empty if the sequence has no features
+  */
+ static LinkedHashMap<Integer, String[][]> buildDnaVariantsMap(
+         SequenceI dnaSeq, MapList dnaToProtein)
+ {
+   /*
+    * map from peptide position to all variant features of the codon for it
+    * LinkedHashMap ensures we add the peptide features in sequence order
+    */
+   LinkedHashMap<Integer, String[][]> variants = new LinkedHashMap<Integer, String[][]>();
+   SequenceOntologyI so = SequenceOntologyFactory.getInstance();
+
+   SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures();
+   if (dnaFeatures == null)
+   {
+     return variants;
+   }
+
+   int dnaStart = dnaSeq.getStart();
+   int[] lastCodon = null;
+   int lastPeptidePosition = 0;
+
+   /*
+    * build a map of codon variations for peptides
+    */
+   for (SequenceFeature sf : dnaFeatures)
+   {
+     int dnaCol = sf.getBegin();
+     if (dnaCol != sf.getEnd())
+     {
+       // not handling multi-locus variant features
+       continue;
+     }
+     if (!so.isA(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT))
+     {
+       continue;
+     }
+     int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol);
+     if (mapsTo == null)
+     {
+       // feature doesn't lie within coding region
+       continue;
+     }
+
+     /*
+      * extract dna variants to a string array; do this before creating a
+      * map entry, so that a variant with no alleles cannot leave an entry
+      * whose codon position arrays are all null
+      */
+     String alls = (String) sf.getValue("alleles");
+     if (alls == null)
+     {
+       continue;
+     }
+     String[] alleles = alls.toUpperCase().split(",");
+     for (int i = 0; i < alleles.length; i++)
+     {
+       alleles[i] = alleles[i].trim(); // lose any space characters "A, G"
+     }
+
+     int peptidePosition = mapsTo[0];
+     String[][] codonVariants = variants.get(peptidePosition);
+     if (codonVariants == null)
+     {
+       codonVariants = new String[3][];
+       variants.put(peptidePosition, codonVariants);
+     }
+
+     /*
+      * get this peptide's codon positions e.g. [3, 4, 5] or [4, 7, 10]
+      * (reusing the previous lookup if the peptide position is unchanged)
+      */
+     int[] codon = peptidePosition == lastPeptidePosition ? lastCodon
+             : MappingUtils.flattenRanges(dnaToProtein.locateInFrom(
+                     peptidePosition, peptidePosition));
+     lastPeptidePosition = peptidePosition;
+     lastCodon = codon;
+
+     /*
+      * save nucleotide (and this variant) for each codon position
+      */
+     for (int codonPos = 0; codonPos < 3; codonPos++)
+     {
+       String nucleotide = String.valueOf(
+               dnaSeq.getCharAt(codon[codonPos] - dnaStart))
+               .toUpperCase();
+       if (codonVariants[codonPos] == null)
+       {
+         /*
+          * record current dna base
+          */
+         codonVariants[codonPos] = new String[] { nucleotide };
+       }
+       if (codon[codonPos] == dnaCol)
+       {
+         /*
+          * add alleles to dna base (and any previously found alleles)
+          */
+         String[] known = codonVariants[codonPos];
+         String[] dnaVariants = new String[alleles.length + known.length];
+         System.arraycopy(known, 0, dnaVariants, 0, known.length);
+         System.arraycopy(alleles, 0, dnaVariants, known.length,
+                 alleles.length);
+         codonVariants[codonPos] = dnaVariants;
+       }
+     }
+   }
+   return variants;
+ }
- exonSequences.add(exon);
+ /**
+ * Returns a sorted, non-redundant list of all peptide translations generated
+ * by the given dna variants, excluding the current residue value
+ *
+ * Every combination of one base from each of the three codon positions is
+ * translated. A combination containing "-" yields "-"; a combination longer
+ * than three characters is skipped.
+ *
+ * @param codonVariants
+ * an array of base values (acgtACGT) for codon positions 1, 2, 3
+ * (each of the three arrays is iterated, so none may be null)
+ * @param residue
+ * the current residue translation
+ * @return the distinct translations that differ (ignoring case) from residue,
+ * in String natural order except that "STOP" sorts last
+ */
+ static List computePeptideVariants(
+ String[][] codonVariants, String residue)
+ {
+ List result = new ArrayList();
+ for (String base1 : codonVariants[0])
+ {
+ for (String base2 : codonVariants[1])
+ {
+ for (String base3 : codonVariants[2])
+ {
+ String codon = base1 + base2 + base3;
+ /*
+ * get peptide translation of codon e.g. GAT -> D
+ * note that variants which are not single alleles,
+ * e.g. multibase variants or HGMD_MUTATION etc
+ * are ignored here
+ */
+ String peptide = codon.contains("-") ? "-"
+ : (codon.length() > 3 ? null : ResidueProperties
+ .codonTranslate(codon));
+ // a null translation is skipped, as are repeats and the current residue
+ if (peptide != null && !result.contains(peptide)
+ && !peptide.equalsIgnoreCase(residue))
+ {
+ result.add(peptide);
+ }
+ }
+ }
}
- return exonSequences;
+
+ /*
+ * sort alphabetically with STOP at the end
+ */
+ Collections.sort(result, new Comparator()
+ {
+
+ @Override
+ public int compare(String o1, String o2)
+ {
+ // "STOP" compares greater than anything else, so it ends up last
+ if ("STOP".equals(o1))
+ {
+ return 1;
+ }
+ else if ("STOP".equals(o2))
+ {
+ return -1;
+ }
+ else
+ {
+ return o1.compareTo(o2);
+ }
+ }
+ });
+ return result;
}
}
diff --git a/src/jalview/analysis/CrossRef.java b/src/jalview/analysis/CrossRef.java
index a71e614..7d09a3b 100644
--- a/src/jalview/analysis/CrossRef.java
+++ b/src/jalview/analysis/CrossRef.java
@@ -25,9 +25,12 @@ import jalview.datamodel.Alignment;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.DBRefEntry;
import jalview.datamodel.DBRefSource;
+import jalview.datamodel.Mapping;
import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.util.DBRefUtils;
+import jalview.util.MapList;
import jalview.ws.SequenceFetcher;
import jalview.ws.seqfetcher.ASequenceFetcher;
@@ -44,6 +47,27 @@ import java.util.Vector;
*/
public class CrossRef
{
+ /*
+ * A sub-class that ignores Parent attribute when comparing sequence
+ * features. This avoids 'duplicate' CDS features that only
+ * differ in their parent Transcript ids.
+ */
+ class MySequenceFeature extends SequenceFeature
+ {
+ /*
+ * Wrapper used when comparing sequence features: equals() delegates to
+ * feat.equals(o, true) on the wrapped feature.
+ * NOTE(review): presumably the 'true' argument is what makes the comparison
+ * ignore the Parent attribute - confirm against
+ * SequenceFeature.equals(Object, boolean).
+ * NOTE(review): hashCode() is not overridden alongside equals() in this
+ * class, and the constructor only stores the wrapped feature, so instances
+ * look intended purely as comparison probes (e.g. the argument to
+ * List.contains) rather than as features in their own right - confirm.
+ */
+ // the feature whose equals(Object, boolean) is delegated to
+ private SequenceFeature feat;
+
+ MySequenceFeature(SequenceFeature sf)
+ {
+ this.feat = sf;
+ }
+
+ @Override
+ public boolean equals(Object o)
+ {
+ return feat.equals(o, true);
+ }
+ }
+
/**
* Select just the DNA or protein references for a protein or dna sequence
*
@@ -88,41 +112,54 @@ public class CrossRef
{
String[] dbrefs = null;
List refs = new ArrayList();
- for (int s = 0; s < seqs.length; s++)
+ for (SequenceI seq : seqs)
{
- if (seqs[s] != null)
+ if (seq != null)
{
- SequenceI dss = seqs[s];
+ SequenceI dss = seq;
while (dss.getDatasetSequence() != null)
{
dss = dss.getDatasetSequence();
}
DBRefEntry[] rfs = findXDbRefs(dna, dss.getDBRefs());
- for (int r = 0; rfs != null && r < rfs.length; r++)
+ if (rfs != null)
{
- if (!refs.contains(rfs[r].getSource()))
+ for (DBRefEntry ref : rfs)
{
- refs.add(rfs[r].getSource());
+ if (!refs.contains(ref.getSource()))
+ {
+ refs.add(ref.getSource());
+ }
}
}
if (dataset != null)
{
// search for references to this sequence's direct references.
- DBRefEntry[] lrfs = CrossRef
- .findXDbRefs(!dna, seqs[s].getDBRefs());
+ DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seq.getDBRefs());
List rseqs = new ArrayList();
- CrossRef.searchDatasetXrefs(seqs[s], !dna, lrfs, dataset, rseqs,
+ CrossRef.searchDatasetXrefs(seq, !dna, lrfs, dataset, rseqs,
null); // don't need to specify codon frame for mapping here
for (SequenceI rs : rseqs)
{
- DBRefEntry[] xrs = findXDbRefs(dna, rs.getDBRefs()); // not used??
- for (int r = 0; rfs != null && r < rfs.length; r++)
+ DBRefEntry[] xrs = findXDbRefs(dna, rs.getDBRefs());
+ if (xrs != null)
{
- if (!refs.contains(rfs[r].getSource()))
+ for (DBRefEntry ref : xrs)
{
- refs.add(rfs[r].getSource());
+ if (!refs.contains(ref.getSource()))
+ {
+ refs.add(ref.getSource());
+ }
}
}
+ // looks like copy and paste - change rfs to xrs?
+ // for (int r = 0; rfs != null && r < rfs.length; r++)
+ // {
+ // if (!refs.contains(rfs[r].getSource()))
+ // {
+ // refs.add(rfs[r].getSource());
+ // }
+ // }
}
}
}
@@ -135,13 +172,9 @@ public class CrossRef
return dbrefs;
}
- /*
- * if (dna) { if (rfs[r].hasMap()) { // most likely this is a protein cross
- * reference if (!refs.contains(rfs[r].getSource())) {
- * refs.addElement(rfs[r].getSource()); } } }
- */
public static boolean hasCdnaMap(SequenceI[] seqs)
{
+ // TODO unused - remove?
String[] reftypes = findSequenceXrefTypes(false, seqs);
for (int s = 0; s < reftypes.length; s++)
{
@@ -156,6 +189,7 @@ public class CrossRef
public static SequenceI[] getCdnaMap(SequenceI[] seqs)
{
+ // TODO unused - remove?
Vector cseqs = new Vector();
for (int s = 0; s < seqs.length; s++)
{
@@ -186,34 +220,29 @@ public class CrossRef
/**
*
- * @param dna
- * @param seqs
- * @return
- */
- public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
- String source)
- {
- return findXrefSequences(seqs, dna, source, null);
- }
-
- /**
- *
* @param seqs
+ * sequences whose xrefs are being retrieved
* @param dna
+ * true if sequences are nucleotide
* @param source
- * @param dataset
- * alignment to search for product sequences.
+ * @param al
+ * alignment to search for cross-referenced sequences (and possibly
+ * add to)
+ * @param addedPeers
+ * a list of sequences to add to if 'peers' to the original sequences
+ * are found e.g. alternative protein products for a protein's gene
* @return products (as dataset sequences)
*/
- public static Alignment findXrefSequences(SequenceI[] seqs, boolean dna,
- String source, AlignmentI dataset)
+ public static Alignment findXrefSequences(SequenceI[] seqs,
+ final boolean dna, final String source, AlignmentI al,
+ List addedPeers)
{
+ AlignmentI dataset = al.getDataset() == null ? al : al.getDataset();
List rseqs = new ArrayList();
- Alignment ral = null;
- AlignedCodonFrame cf = new AlignedCodonFrame(); // nominal width
- for (int s = 0; s < seqs.length; s++)
+ AlignedCodonFrame cf = new AlignedCodonFrame();
+ for (SequenceI seq : seqs)
{
- SequenceI dss = seqs[s];
+ SequenceI dss = seq;
while (dss.getDatasetSequence() != null)
{
dss = dss.getDatasetSequence();
@@ -223,7 +252,8 @@ public class CrossRef
if ((xrfs == null || xrfs.length == 0) && dataset != null)
{
System.out.println("Attempting to find ds Xrefs refs.");
- DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seqs[s].getDBRefs());
+ // FIXME should be dss not seq here?
+ DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seq.getDBRefs());
// less ambiguous would be a 'find primary dbRefEntry' method.
// filter for desired source xref here
found = CrossRef.searchDatasetXrefs(dss, !dna, lrfs, dataset,
@@ -231,29 +261,30 @@ public class CrossRef
}
for (int r = 0; xrfs != null && r < xrfs.length; r++)
{
- if (source != null && !source.equals(xrfs[r].getSource()))
+ DBRefEntry xref = xrfs[r];
+ if (source != null && !source.equals(xref.getSource()))
{
continue;
}
- if (xrfs[r].hasMap())
+ if (xref.hasMap())
{
- if (xrfs[r].getMap().getTo() != null)
+ if (xref.getMap().getTo() != null)
{
- SequenceI rsq = new Sequence(xrfs[r].getMap().getTo());
+ SequenceI rsq = new Sequence(xref.getMap().getTo());
rseqs.add(rsq);
- if (xrfs[r].getMap().getMap().getFromRatio() != xrfs[r]
+ if (xref.getMap().getMap().getFromRatio() != xref
.getMap().getMap().getToRatio())
{
// get sense of map correct for adding to product alignment.
if (dna)
{
// map is from dna seq to a protein product
- cf.addMap(dss, rsq, xrfs[r].getMap().getMap());
+ cf.addMap(dss, rsq, xref.getMap().getMap());
}
else
{
// map should be from protein seq to its coding dna
- cf.addMap(rsq, dss, xrfs[r].getMap().getMap().getInverse());
+ cf.addMap(rsq, dss, xref.getMap().getMap().getInverse());
}
}
found = true;
@@ -265,7 +296,9 @@ public class CrossRef
// xrefs on this sequence.
if (dataset != null)
{
- found |= searchDataset(dss, xrfs[r], dataset, rseqs, cf); // ,false,!dna);
+ found |= searchDataset(dss, xref, dataset, rseqs, cf, false,
+ !dna);
+ // ,false,!dna);
if (found)
{
xrfs[r] = null; // we've recovered seqs for this one.
@@ -313,31 +346,34 @@ public class CrossRef
xrfs = t;
try
{
- retrieved = sftch.getSequences(xrfs); // problem here is we don't
- // know which of xrfs
- // resulted in which
+ retrieved = sftch.getSequences(xrfs, !dna);
+ // problem here is we don't know which of xrfs resulted in which
// retrieved element
} catch (Exception e)
{
System.err
.println("Problem whilst retrieving cross references for Sequence : "
- + seqs[s].getName());
+ + seq.getName());
e.printStackTrace();
}
+
if (retrieved != null)
{
+ updateDbrefMappings(dna, seq, xrfs, retrieved, cf);
+
+ List copiedFeatures = new ArrayList();
+ CrossRef me = new CrossRef();
for (int rs = 0; rs < retrieved.length; rs++)
{
// TODO: examine each sequence for 'redundancy'
- jalview.datamodel.DBRefEntry[] dbr = retrieved[rs]
- .getDBRefs();
+ DBRefEntry[] dbr = retrieved[rs].getDBRefs();
if (dbr != null && dbr.length > 0)
{
for (int di = 0; di < dbr.length; di++)
{
// find any entry where we should put in the sequence being
// cross-referenced into the map
- jalview.datamodel.Mapping map = dbr[di].getMap();
+ Mapping map = dbr[di].getMap();
if (map != null)
{
if (map.getTo() != null && map.getMap() != null)
@@ -352,17 +388,54 @@ public class CrossRef
int sf = map.getMap().getToLowest();
int st = map.getMap().getToHighest();
SequenceI mappedrg = ms.getSubSequence(sf, st);
- SequenceI loc = dss.getSubSequence(sf, st);
+ // SequenceI loc = dss.getSubSequence(sf, st);
if (mappedrg.getLength() > 0
- && mappedrg.getSequenceAsString().equals(
- loc.getSequenceAsString()))
+ && ms.getSequenceAsString().equals(
+ dss.getSequenceAsString()))
+ // && mappedrg.getSequenceAsString().equals(
+ // loc.getSequenceAsString()))
{
- System.err
- .println("Mapping updated for retrieved crossreference");
+ String msg = "Mapping updated from "
+ + ms.getName()
+ + " to retrieved crossreference "
+ + dss.getName();
+ System.out.println(msg);
// method to update all refs of existing To on
// retrieved sequence with dss and merge any props
// on To onto dss.
map.setTo(dss);
+ /*
+ * copy sequence features as well, avoiding
+ * duplication (e.g. from 2 transcripts)
+ */
+ SequenceFeature[] sfs = ms
+ .getSequenceFeatures();
+ if (sfs != null)
+ {
+ for (SequenceFeature feat : sfs)
+ {
+ /*
+ * we override SequenceFeature.equals here (but
+ * not elsewhere) to ignore Parent attribute
+ * TODO not quite working yet!
+ */
+ if (!copiedFeatures
+ .contains(me.new MySequenceFeature(
+ feat)))
+ {
+ dss.addSequenceFeature(feat);
+ copiedFeatures.add(feat);
+ }
+ }
+ }
+ cf.addMap(retrieved[rs].getDatasetSequence(),
+ dss, map.getMap());
+ }
+ else
+ {
+ addedPeers.add(map.getTo());
+ cf.addMap(retrieved[rs].getDatasetSequence(),
+ map.getTo(), map.getMap());
}
} catch (Exception e)
{
@@ -382,12 +455,12 @@ public class CrossRef
}
}
}
+
+ Alignment ral = null;
if (rseqs.size() > 0)
{
- SequenceI[] rsqs = new SequenceI[rseqs.size()];
- rseqs.toArray(rsqs);
- ral = new Alignment(rsqs);
- if (cf != null && cf.getProtMappings() != null)
+ ral = new Alignment(rseqs.toArray(new SequenceI[rseqs.size()]));
+ if (cf != null && !cf.isEmpty())
{
ral.addCodonFrame(cf);
}
@@ -396,6 +469,69 @@ public class CrossRef
}
/**
+   * Updates any empty mappings in the cross-references with one to a compatible
+   * retrieved sequence if found, and adds any new mappings to the
+   * AlignedCodonFrame. Cross-references that already have a map, or whose
+   * "source|accession" id matches none of the retrieved sequences, are left
+   * unchanged.
+   *
+   * @param dna
+   *          if true, mapFrom is added to the AlignedCodonFrame as the dna
+   *          sequence and each matched sequence as its product; if false the
+   *          roles are swapped
+   * @param mapFrom
+   *          the sequence whose cross-references are being resolved
+   * @param xrefs
+   *          the cross-references to inspect; only those without a map are
+   *          updated
+   * @param retrieved
+   *          the retrieved sequences, matched to cross-references by id
+   * @param acf
+   *          receives one mapping for each (cross-reference, matched sequence)
+   *          pair for which a mapping could be computed
+   */
+  static void updateDbrefMappings(boolean dna, SequenceI mapFrom,
+          DBRefEntry[] xrefs, SequenceI[] retrieved, AlignedCodonFrame acf)
+  {
+    SequenceIdMatcher matcher = new SequenceIdMatcher(retrieved);
+    for (DBRefEntry xref : xrefs)
+    {
+      if (xref.hasMap())
+      {
+        continue;
+      }
+      String targetSeqName = xref.getSource() + "|"
+              + xref.getAccessionId();
+      SequenceI[] matches = matcher.findAllIdMatches(targetSeqName);
+      if (matches == null)
+      {
+        /*
+         * nothing was retrieved for this cross-reference; carry on with the
+         * next one (returning here would leave every later cross-reference
+         * unmapped)
+         */
+        continue;
+      }
+      for (SequenceI seq : matches)
+      {
+        MapList mapping;
+        if (dna)
+        {
+          mapping = AlignmentUtils.mapCdnaToProtein(seq, mapFrom);
+        }
+        else
+        {
+          mapping = AlignmentUtils.mapCdnaToProtein(mapFrom, seq);
+          if (mapping != null)
+          {
+            mapping = mapping.getInverse();
+          }
+        }
+        if (mapping == null)
+        {
+          // no compatible mapping to this candidate; try the next match
+          continue;
+        }
+
+        // NOTE(review): if several candidates map, the last one wins for
+        // xref's map while all of them are added to acf - confirm intended
+        xref.setMap(new Mapping(seq, mapping));
+        if (dna)
+        {
+          AlignmentUtils.computeProteinFeatures(mapFrom, seq, mapping);
+          acf.addMap(mapFrom, seq, mapping);
+        }
+        else
+        {
+          acf.addMap(seq, mapFrom, mapping.getInverse());
+        }
+      }
+    }
+  }
+
+ /**
* find references to lrfs in the cross-reference set of each sequence in
* dataset (that is not equal to sequenceI) Identifies matching DBRefEntry
* based on source and accession string only - Map and Version are nulled.
diff --git a/src/jalview/analysis/Dna.java b/src/jalview/analysis/Dna.java
index 2939e3c..be138f3 100644
--- a/src/jalview/analysis/Dna.java
+++ b/src/jalview/analysis/Dna.java
@@ -806,4 +806,154 @@ public class Dna
}
}
}
+
+  /**
+   * Returns an alignment consisting of the reversed (and optionally
+   * complemented) sequences set in this object's constructor. Each new
+   * sequence is built by reverseSequence from the name of the selected
+   * sequence and its sequence string, and a dataset alignment is created for
+   * the result before it is returned.
+   *
+   * @param complement
+   *          if true, each base is also complemented
+   * @return a new alignment holding the reversed sequences
+   */
+  public AlignmentI reverseCdna(boolean complement)
+  {
+    int sSize = selection.size();
+    // typed list: a raw List would make toArray() return Object[]
+    List<SequenceI> reversed = new ArrayList<SequenceI>(sSize);
+    for (int s = 0; s < sSize; s++)
+    {
+      SequenceI newseq = reverseSequence(selection.get(s).getName(),
+              seqstring[s], complement);
+
+      if (newseq != null)
+      {
+        reversed.add(newseq);
+      }
+    }
+
+    SequenceI[] newseqs = reversed.toArray(new SequenceI[reversed.size()]);
+    // keep the concrete type locally so no downcast is needed below
+    Alignment al = new Alignment(newseqs);
+    al.createDatasetAlignment();
+    return al;
+  }
+
+  /**
+   * Builds a new sequence holding the characters of the given sequence string
+   * in reverse order, complementing each one with getComplement if requested.
+   * The new sequence is named as the original with "|rev" (or "|revcomp" when
+   * complementing) appended, and is created with start 1 and end equal to its
+   * length.
+   *
+   * @param seqName
+   *          name of the original sequence
+   * @param sequence
+   *          the sequence string to reverse
+   * @param complement
+   *          if true, each character is also complemented
+   * @return the new reversed sequence
+   */
+  public static SequenceI reverseSequence(String seqName, String sequence,
+          boolean complement)
+  {
+    String suffix = "|rev";
+    if (complement)
+    {
+      suffix += "comp";
+    }
+
+    char[] forward = sequence.toCharArray();
+    int length = forward.length;
+    char[] backward = new char[length];
+
+    // fill the target array from its last position down to its first
+    int target = length;
+    for (char base : forward)
+    {
+      target--;
+      if (complement)
+      {
+        backward[target] = getComplement(base);
+      }
+      else
+      {
+        backward[target] = base;
+      }
+    }
+    return new Sequence(seqName + suffix, backward, 1, length);
+  }
+
+  /**
+   * Answers the complement of a nucleotide character, keeping its case. a/c/g/t
+   * are swapped pairwise (a-t, c-g), u is complemented to a, and the ambiguity
+   * codes are swapped as r-y, k-m, b-v and d-h (as on
+   * http://reverse-complement.com/). Any other character is returned unchanged.
+   *
+   * @param c
+   *          the character to complement
+   * @return the complement, or c itself if it has no entry in the lookup
+   */
+  public static char getComplement(char c)
+  {
+    // parallel lookup strings: position i of 'bases' maps to position i of
+    // 'complements'
+    final String bases = "aAcCgGtTuUrRyYkKmMbBvVdDhH";
+    final String complements = "tTgGcCaAaAyYrRmMkKvVbBhHdD";
+
+    int found = bases.indexOf(c);
+    if (found < 0)
+    {
+      return c;
+    }
+    return complements.charAt(found);
+  }
}
diff --git a/src/jalview/analysis/SequenceIdMatcher.java b/src/jalview/analysis/SequenceIdMatcher.java
index 8351686..b89287c 100755
--- a/src/jalview/analysis/SequenceIdMatcher.java
+++ b/src/jalview/analysis/SequenceIdMatcher.java
@@ -304,7 +304,7 @@ public class SequenceIdMatcher
}
if (s instanceof SeqIdName)
{
- return this.equals((SeqIdName) s);
+ return this.equals(((SeqIdName) s).id);
}
else
{
@@ -332,25 +332,8 @@ public class SequenceIdMatcher
* todo: (JBPNote) Set separator characters appropriately
*
* @param s
- * SeqIdName
* @return boolean
*/
- public boolean equals(SeqIdName s)
- {
- // TODO: JAL-732 patch for cases when name includes a list of IDs, and the
- // match contains one ID flanked
- if (id.length() > s.id.length())
- {
- return id.startsWith(s.id) ? (WORD_SEP.indexOf(id.charAt(s.id
- .length())) > -1) : false;
- }
- else
- {
- return s.id.startsWith(id) ? (s.id.equals(id) ? true : (WORD_SEP
- .indexOf(s.id.charAt(id.length())) > -1)) : false;
- }
- }
-
public boolean equals(String s)
{
if (id.length() > s.length())
diff --git a/src/jalview/analysis/scoremodels/FeatureScoreModel.java b/src/jalview/analysis/scoremodels/FeatureScoreModel.java
index 69538d5..1ca3342 100644
--- a/src/jalview/analysis/scoremodels/FeatureScoreModel.java
+++ b/src/jalview/analysis/scoremodels/FeatureScoreModel.java
@@ -28,7 +28,6 @@ import jalview.datamodel.SequenceI;
import jalview.util.Comparison;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.Hashtable;
import java.util.List;
@@ -48,12 +47,9 @@ public class FeatureScoreModel implements ScoreModelI, ViewBasedAnalysisI
public float[][] findDistances(AlignmentView seqData)
{
int nofeats = 0;
- List dft = Arrays.asList(fr.getDisplayedFeatureTypes());
+ List dft = fr.getDisplayedFeatureTypes();
- if (dft != null)
- {
- nofeats = dft.size();
- }
+ nofeats = dft.size();
SequenceI[] sequenceString = seqData.getVisibleAlignment(
Comparison.GapChars.charAt(0)).getSequencesArray();
@@ -151,6 +147,7 @@ public class FeatureScoreModel implements ScoreModelI, ViewBasedAnalysisI
return true;
}
+ @Override
public String toString()
{
return "Score between sequences based on hamming distance between binary vectors marking features displayed at each column";
diff --git a/src/jalview/api/AlignViewControllerI.java b/src/jalview/api/AlignViewControllerI.java
index a2f2204..17a1563 100644
--- a/src/jalview/api/AlignViewControllerI.java
+++ b/src/jalview/api/AlignViewControllerI.java
@@ -20,6 +20,8 @@
*/
package jalview.api;
+import java.util.List;
+
/**
* prototype abstract controller for a Jalview alignment view
*
@@ -71,7 +73,7 @@ public interface AlignViewControllerI
* @param typ
* list of feature names or null to use currently displayed features
*/
- void sortAlignmentByFeatureScore(String[] typ);
+ void sortAlignmentByFeatureScore(List typ);
/**
* sort the alignment or current selection by distribution of the given set of
@@ -80,7 +82,7 @@ public interface AlignViewControllerI
* @param typ
* list of feature names or null to use currently displayed features
*/
- void sortAlignmentByFeatureDensity(String[] typ);
+ void sortAlignmentByFeatureDensity(List typ);
/**
* add a features file of some kind to the current view
diff --git a/src/jalview/api/FeatureColourI.java b/src/jalview/api/FeatureColourI.java
new file mode 100644
index 0000000..d8363f4
--- /dev/null
+++ b/src/jalview/api/FeatureColourI.java
@@ -0,0 +1,82 @@
+package jalview.api;
+
+import java.awt.Color;
+
+/**
+ * Describes how a sequence feature type is coloured: either with a single
+ * colour (getColour), or, when isGraduatedColour answers true, by a scheme
+ * described by a minimum and maximum colour, an optional threshold, and/or
+ * colouring by label.
+ */
+public interface FeatureColourI
+{
+
+ /**
+ * Answers true when either isColourByLabel, isAboveThreshold or
+ * isBelowThreshold answers true
+ *
+ * @return
+ */
+ boolean isGraduatedColour();
+
+ /**
+ * Returns the feature colour (when isGraduatedColour answers false)
+ *
+ * @return
+ */
+ Color getColour();
+
+ /**
+ * Returns the minimum colour (when isGraduatedColour answers true)
+ *
+ * @return
+ */
+ Color getMinColour();
+
+ /**
+ * Returns the maximum colour (when isGraduatedColour answers true)
+ *
+ * @return
+ */
+ Color getMaxColour();
+
+ /**
+ * Answers true if the feature is coloured by label (description); only
+ * applicable when isGraduatedColour answers true
+ *
+ * @return
+ */
+ boolean isColourByLabel();
+
+ /**
+ * Answers true if the feature is coloured below a threshold value; only
+ * applicable when isGraduatedColour answers true
+ *
+ * @return
+ */
+ boolean isBelowThreshold();
+
+ /**
+ * Answers true if the feature is coloured above a threshold value; only
+ * applicable when isGraduatedColour answers true
+ *
+ * @return
+ */
+ boolean isAboveThreshold();
+
+ /**
+ * Answers true if the threshold is the min (or max) of the colour range; only
+ * applicable when isGraduatedColour answers true
+ *
+ * @return
+ */
+ boolean isThresholdMinMax();
+
+ /**
+ * Returns the threshold value (if any), else zero
+ *
+ * @return
+ */
+ float getThreshold();
+
+ /**
+ * NOTE(review): the meaning of this flag is not documented here. Presumably
+ * it answers true if the graduated colour runs from the minimum colour at
+ * low values to the maximum colour at high values - confirm against the
+ * implementing classes before relying on it.
+ *
+ * @return
+ */
+ boolean isLowToHigh();
+}
diff --git a/src/jalview/api/FeatureRenderer.java b/src/jalview/api/FeatureRenderer.java
index 0d0adaa..5b15cad 100644
--- a/src/jalview/api/FeatureRenderer.java
+++ b/src/jalview/api/FeatureRenderer.java
@@ -149,19 +149,19 @@ public interface FeatureRenderer
boolean isTransparencyAvailable();
/**
- * get current displayed types
+ * get current displayed types, in ordering of rendering (on top last)
*
- * @return
+ * @return a (possibly empty) list of feature types
*/
- String[] getDisplayedFeatureTypes();
+ List getDisplayedFeatureTypes();
/**
* get current displayed groups
*
- * @return
+ * @return a (possibly empty) list of feature groups
*/
- String[] getDisplayedFeatureGroups();
+ List getDisplayedFeatureGroups();
/**
* display all features of these types
diff --git a/src/jalview/api/FeatureSettingsModelI.java b/src/jalview/api/FeatureSettingsModelI.java
index 5474f4e..c0fc523 100644
--- a/src/jalview/api/FeatureSettingsModelI.java
+++ b/src/jalview/api/FeatureSettingsModelI.java
@@ -20,7 +20,76 @@
*/
package jalview.api;
-public interface FeatureSettingsModelI
+import java.util.Comparator;
+
+/**
+ * An interface that describes the settings configurable in the Feature Settings
+ * dialog.
+ *
+ * @author gmcarstairs
+ */
+public interface FeatureSettingsModelI extends Comparator
{
+ // note Java 8 will allow default implementations of these methods in the
+ // interface, simplifying instantiating classes
+
+ /**
+ * Answers true if the specified feature type is displayed
+ *
+ * @param type
+ * @return
+ */
+ boolean isFeatureDisplayed(String type);
+
+ /**
+ * Answers true if the specified feature group is displayed
+ *
+ * @param group
+ * @return
+ */
+ boolean isGroupDisplayed(String group);
+
+ /**
+ * Returns the colour (or graduated colour) for the feature type, or null if
+ * not known
+ *
+ * @param type
+ * @return
+ */
+ FeatureColourI getFeatureColour(String type);
+
+ /**
+ * Returns the transparency value, from 0 (fully transparent) to 1 (fully
+ * opaque)
+ *
+ * @return
+ */
+ float getTransparency();
+
+ /**
+ * Returns -1 if feature1 is displayed before (below) feature2, +1 if
+ * feature1 is displayed after (on top of) feature2, or 0 if we don't care.
+ *
+ *
+ * Note that this is the opposite ordering to how features are displayed in
+ * the feature settings dialogue. FeatureRendererModel.setFeaturePriority
+ * takes care of converting between the two.
+ *
+ * @param feature1
+ * @param feature2
+ * @return
+ */
+ @Override
+ int compare(String feature1, String feature2);
+
+ /**
+ * Answers true if features should be initially sorted so that features with a
+ * shorter average length are displayed on top of those with a longer average
+ * length
+ *
+ * @return
+ */
+ boolean optimiseOrder();
+
}
diff --git a/src/jalview/api/FeaturesDisplayedI.java b/src/jalview/api/FeaturesDisplayedI.java
index bda1360..32b0565 100644
--- a/src/jalview/api/FeaturesDisplayedI.java
+++ b/src/jalview/api/FeaturesDisplayedI.java
@@ -44,6 +44,6 @@ public interface FeaturesDisplayedI
int getVisibleFeatureCount();
- int getRegisterdFeaturesCount();
+ int getRegisteredFeaturesCount();
}
diff --git a/src/jalview/api/FeaturesSourceI.java b/src/jalview/api/FeaturesSourceI.java
new file mode 100644
index 0000000..8f8d8c1
--- /dev/null
+++ b/src/jalview/api/FeaturesSourceI.java
@@ -0,0 +1,8 @@
+package jalview.api;
+
+/**
+ * A tagging interface to mark a source of sequence features. It declares no
+ * methods; it only allows a source to be recognised by its type.
+ */
+public interface FeaturesSourceI
+{
+}
diff --git a/src/jalview/appletgui/AlignFrame.java b/src/jalview/appletgui/AlignFrame.java
index b7e7899..e5f0053 100644
--- a/src/jalview/appletgui/AlignFrame.java
+++ b/src/jalview/appletgui/AlignFrame.java
@@ -364,18 +364,15 @@ public class AlignFrame extends EmbmenuFrame implements ActionListener,
public boolean parseFeaturesFile(String file, String type,
boolean autoenabledisplay)
{
- // TODO: test if importing a features file onto an alignment which already
- // has features with links overwrites the original links.
-
- Hashtable featureLinks = new Hashtable();
boolean featuresFile = false;
try
{
- featuresFile = new jalview.io.FeaturesFile(file, type).parse(viewport
- .getAlignment(), alignPanel.seqPanel.seqCanvas
- .getFeatureRenderer().getFeatureColours(), featureLinks,
- true, viewport.applet.getDefaultParameter("relaxedidmatch",
- false));
+ Map colours = alignPanel.seqPanel.seqCanvas
+ .getFeatureRenderer().getFeatureColours();
+ boolean relaxedIdMatching = viewport.applet.getDefaultParameter(
+ "relaxedidmatch", false);
+ featuresFile = new FeaturesFile(file, type).parse(
+ viewport.getAlignment(), colours, true, relaxedIdMatching);
} catch (Exception ex)
{
ex.printStackTrace();
@@ -383,10 +380,6 @@ public class AlignFrame extends EmbmenuFrame implements ActionListener,
if (featuresFile)
{
- if (featureLinks.size() > 0)
- {
- alignPanel.seqPanel.seqCanvas.getFeatureRenderer().featureLinks = featureLinks;
- }
if (autoenabledisplay)
{
viewport.setShowSequenceFeatures(true);
@@ -1420,15 +1413,16 @@ public class AlignFrame extends EmbmenuFrame implements ActionListener,
public String outputFeatures(boolean displayTextbox, String format)
{
String features;
+ FeaturesFile formatter = new FeaturesFile();
if (format.equalsIgnoreCase("Jalview"))
{
- features = new FeaturesFile().printJalviewFormat(viewport
+ features = formatter.printJalviewFormat(viewport
.getAlignment().getSequencesArray(),
getDisplayedFeatureCols());
}
else
{
- features = new FeaturesFile().printGFFFormat(viewport.getAlignment()
+ features = formatter.printGffFormat(viewport.getAlignment()
.getSequencesArray(), getDisplayedFeatureCols());
}
diff --git a/src/jalview/appletgui/CutAndPasteTransfer.java b/src/jalview/appletgui/CutAndPasteTransfer.java
index 70a7319..bbaeb68 100644
--- a/src/jalview/appletgui/CutAndPasteTransfer.java
+++ b/src/jalview/appletgui/CutAndPasteTransfer.java
@@ -22,6 +22,7 @@ package jalview.appletgui;
import jalview.analysis.AlignmentUtils;
import jalview.api.ComplexAlignFile;
+import jalview.api.FeaturesSourceI;
import jalview.bin.JalviewLite;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.ColumnSelection;
@@ -116,6 +117,7 @@ public class CutAndPasteTransfer extends Panel implements ActionListener,
addSequences.setVisible(false);
}
+ @Override
public void actionPerformed(ActionEvent evt)
{
if (evt.getSource() == accept)
@@ -223,7 +225,7 @@ public class CutAndPasteTransfer extends Panel implements ActionListener,
{
AlignmentI al = null;
- String format = new IdentifyFile().Identify(text,
+ String format = new IdentifyFile().identify(text,
AppletFormatAdapter.PASTE);
AppletFormatAdapter afa = new AppletFormatAdapter(alignFrame.alignPanel);
try
@@ -277,6 +279,10 @@ public class CutAndPasteTransfer extends Panel implements ActionListener,
{
af = new AlignFrame(al, alignFrame.viewport.applet,
"Cut & Paste input - " + format, false);
+ if (source instanceof FeaturesSourceI)
+ {
+ af.getAlignViewport().setShowSequenceFeatures(true);
+ }
}
af.statusBar
@@ -490,6 +496,7 @@ public class CutAndPasteTransfer extends Panel implements ActionListener,
this.add(textarea, java.awt.BorderLayout.CENTER);
}
+ @Override
public void mousePressed(MouseEvent evt)
{
if (textarea.getText().startsWith(
@@ -499,18 +506,22 @@ public class CutAndPasteTransfer extends Panel implements ActionListener,
}
}
+ @Override
public void mouseReleased(MouseEvent evt)
{
}
+ @Override
public void mouseClicked(MouseEvent evt)
{
}
+ @Override
public void mouseEntered(MouseEvent evt)
{
}
+ @Override
public void mouseExited(MouseEvent evt)
{
}
diff --git a/src/jalview/appletgui/FeatureRenderer.java b/src/jalview/appletgui/FeatureRenderer.java
index 4655ba5..4391fa2 100644
--- a/src/jalview/appletgui/FeatureRenderer.java
+++ b/src/jalview/appletgui/FeatureRenderer.java
@@ -43,7 +43,6 @@ import java.awt.TextArea;
import java.awt.TextField;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
-import java.util.Hashtable;
/**
* DOCUMENT ME!
@@ -54,11 +53,6 @@ import java.util.Hashtable;
public class FeatureRenderer extends
jalview.renderer.seqfeatures.FeatureRenderer
{
-
- // Holds web links for feature groups and feature types
- // in the form label|link
- Hashtable featureLinks = null;
-
/**
* Creates a new FeatureRenderer object.
*
@@ -154,6 +148,7 @@ public class FeatureRenderer extends
super(null);
}
+ @Override
public void paint(Graphics g)
{
Dimension d = getSize();
@@ -227,6 +222,7 @@ public class FeatureRenderer extends
overlaps.addItemListener(new java.awt.event.ItemListener()
{
+ @Override
public void itemStateChanged(java.awt.event.ItemEvent e)
{
int index = overlaps.getSelectedIndex();
@@ -344,6 +340,7 @@ public class FeatureRenderer extends
dialog.buttonPanel.add(deleteButton, 1);
deleteButton.addActionListener(new ActionListener()
{
+ @Override
public void actionPerformed(ActionEvent evt)
{
deleteFeature = true;
@@ -370,6 +367,7 @@ public class FeatureRenderer extends
// TODO: render the graduated color in the box.
colourPanel.addMouseListener(new java.awt.event.MouseAdapter()
{
+ @Override
public void mousePressed(java.awt.event.MouseEvent evt)
{
if (!colourPanel.isGcol)
diff --git a/src/jalview/appletgui/FeatureSettings.java b/src/jalview/appletgui/FeatureSettings.java
index 1c156dc..584a69a 100755
--- a/src/jalview/appletgui/FeatureSettings.java
+++ b/src/jalview/appletgui/FeatureSettings.java
@@ -57,6 +57,7 @@ import java.awt.event.MouseListener;
import java.awt.event.MouseMotionListener;
import java.awt.event.WindowAdapter;
import java.awt.event.WindowEvent;
+import java.util.Arrays;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.List;
@@ -165,6 +166,7 @@ public class FeatureSettings extends Panel implements ItemListener,
final FeatureSettings me = this;
frame.addWindowListener(new WindowAdapter()
{
+ @Override
public void windowClosing(WindowEvent e)
{
if (me.av.featureSettings == me)
@@ -185,6 +187,7 @@ public class FeatureSettings extends Panel implements ItemListener,
width, height);
}
+ @Override
public void paint(Graphics g)
{
g.setColor(Color.black);
@@ -212,10 +215,11 @@ public class FeatureSettings extends Panel implements ItemListener,
scr.addActionListener(new ActionListener()
{
+ @Override
public void actionPerformed(ActionEvent e)
{
- me.ap.alignFrame.avc
- .sortAlignmentByFeatureScore(new String[] { type });
+ me.ap.alignFrame.avc.sortAlignmentByFeatureScore(Arrays
+ .asList(new String[] { type }));
}
});
@@ -224,10 +228,11 @@ public class FeatureSettings extends Panel implements ItemListener,
dens.addActionListener(new ActionListener()
{
+ @Override
public void actionPerformed(ActionEvent e)
{
- me.ap.alignFrame.avc
- .sortAlignmentByFeatureDensity(new String[] { type });
+ me.ap.alignFrame.avc.sortAlignmentByFeatureDensity(Arrays
+ .asList(new String[] { type }));
}
});
@@ -258,6 +263,7 @@ public class FeatureSettings extends Panel implements ItemListener,
mxcol.addActionListener(new ActionListener()
{
+ @Override
public void actionPerformed(ActionEvent e)
{
if (typeCol instanceof Color)
@@ -312,9 +318,7 @@ public class FeatureSettings extends Panel implements ItemListener,
for (String group : fr.getFeatureGroups())
{
boolean vis = fr.checkGroupVisibility(group, false);
- Checkbox check = new MyCheckbox(group, vis,
- (fr.featureLinks != null && fr.featureLinks
- .containsKey(group)));
+ Checkbox check = new MyCheckbox(group, vis, false);
check.addMouseListener(this);
check.setFont(new Font("Serif", Font.BOLD, 12));
check.addItemListener(groupItemListener);
@@ -452,10 +456,7 @@ public class FeatureSettings extends Panel implements ItemListener,
selected = true;
}
- check = new MyCheckbox(
- type,
- selected,
- (fr.featureLinks != null && fr.featureLinks.containsKey(type)),
+ check = new MyCheckbox(type, selected, false,
fr.getFeatureStyle(type));
check.addMouseListener(this);
@@ -474,6 +475,7 @@ public class FeatureSettings extends Panel implements ItemListener,
}
}
+ @Override
public void actionPerformed(ActionEvent evt)
{
for (int i = 0; i < featurePanel.getComponentCount(); i++)
@@ -486,6 +488,7 @@ public class FeatureSettings extends Panel implements ItemListener,
private ItemListener groupItemListener = new ItemListener()
{
+ @Override
public void itemStateChanged(ItemEvent evt)
{
Checkbox source = (Checkbox) evt.getSource();
@@ -500,6 +503,7 @@ public class FeatureSettings extends Panel implements ItemListener,
};
};
+ @Override
public void itemStateChanged(ItemEvent evt)
{
selectionChanged();
@@ -533,22 +537,7 @@ public class FeatureSettings extends Panel implements ItemListener,
boolean dragging = false;
- public void mousePressed(MouseEvent evt)
- {
-
- selectedCheck = (MyCheckbox) evt.getSource();
-
- if (fr.featureLinks != null
- && fr.featureLinks.containsKey(selectedCheck.type))
- {
- if (evt.getX() > selectedCheck.stringWidth + 20)
- {
- evt.consume();
- }
- }
-
- }
-
+ @Override
public void mouseDragged(MouseEvent evt)
{
if (((Component) evt.getSource()).getParent() != featurePanel)
@@ -558,6 +547,7 @@ public class FeatureSettings extends Panel implements ItemListener,
dragging = true;
}
+ @Override
public void mouseReleased(MouseEvent evt)
{
if (((Component) evt.getSource()).getParent() != featurePanel)
@@ -633,14 +623,17 @@ public class FeatureSettings extends Panel implements ItemListener,
ap.paintAlignment(true);
}
+ @Override
public void mouseEntered(MouseEvent evt)
{
}
+ @Override
public void mouseExited(MouseEvent evt)
{
}
+ @Override
public void mouseClicked(MouseEvent evt)
{
MyCheckbox check = (MyCheckbox) evt.getSource();
@@ -648,16 +641,6 @@ public class FeatureSettings extends Panel implements ItemListener,
{
this.popupSort(check, fr.getMinMax(), evt.getX(), evt.getY());
}
- if (fr.featureLinks != null && fr.featureLinks.containsKey(check.type))
- {
- if (evt.getX() > check.stringWidth + 20)
- {
- evt.consume();
- String link = fr.featureLinks.get(check.type).toString();
- ap.alignFrame.showURL(link.substring(link.indexOf("|") + 1),
- link.substring(0, link.indexOf("|")));
- }
- }
if (check.getParent() != featurePanel)
{
@@ -680,10 +663,12 @@ public class FeatureSettings extends Panel implements ItemListener,
}
}
+ @Override
public void mouseMoved(MouseEvent evt)
{
}
+ @Override
public void adjustmentValueChanged(AdjustmentEvent evt)
{
fr.setTransparency((100 - transparency.getValue()) / 100f);
@@ -764,6 +749,7 @@ public class FeatureSettings extends Panel implements ItemListener,
updateColor(featureStyle);
}
+ @Override
public void paint(Graphics g)
{
Dimension d = getSize();
@@ -802,4 +788,9 @@ public class FeatureSettings extends Panel implements ItemListener,
}
}
+ /**
+ * Does nothing: no action is taken when the mouse is pressed.
+ *
+ * @param e
+ *          the mouse event (ignored)
+ */
+ @Override
+ public void mousePressed(MouseEvent e)
+ {
+ // intentionally empty
+ }
+
}
diff --git a/src/jalview/bin/Jalview.java b/src/jalview/bin/Jalview.java
index 462f5a7..8fe3bca 100755
--- a/src/jalview/bin/Jalview.java
+++ b/src/jalview/bin/Jalview.java
@@ -377,7 +377,7 @@ public class Jalview
protocol = jalview.io.AppletFormatAdapter.checkProtocol(file);
- format = new jalview.io.IdentifyFile().Identify(file, protocol);
+ format = new jalview.io.IdentifyFile().identify(file, protocol);
AlignFrame af = fileLoader.LoadFileWaitTillLoaded(file, protocol,
format);
@@ -627,7 +627,7 @@ public class Jalview
}
else
{
- format = new jalview.io.IdentifyFile().Identify(file, protocol);
+ format = new jalview.io.IdentifyFile().identify(file, protocol);
}
startUpAlframe = fileLoader.LoadFileWaitTillLoaded(file, protocol,
diff --git a/src/jalview/bin/JalviewLite.java b/src/jalview/bin/JalviewLite.java
index 36a7cff..ae84ba5 100644
--- a/src/jalview/bin/JalviewLite.java
+++ b/src/jalview/bin/JalviewLite.java
@@ -850,7 +850,7 @@ public class JalviewLite extends Applet implements
{
AlignmentI al = null;
- String format = new IdentifyFile().Identify(text,
+ String format = new IdentifyFile().identify(text,
AppletFormatAdapter.PASTE);
try
{
@@ -1967,7 +1967,7 @@ public class JalviewLite extends Applet implements
return null;
}
String resolvedFile = resolveFileProtocol(fileParam);
- String format = new IdentifyFile().Identify(resolvedFile, protocol);
+ String format = new IdentifyFile().identify(resolvedFile, protocol);
dbgMsg("File identified as '" + format + "'");
AlignmentI al = null;
try
diff --git a/src/jalview/bin/JalviewLiteURLRetrieve.java b/src/jalview/bin/JalviewLiteURLRetrieve.java
index 6be1016..fd88028 100644
--- a/src/jalview/bin/JalviewLiteURLRetrieve.java
+++ b/src/jalview/bin/JalviewLiteURLRetrieve.java
@@ -113,7 +113,7 @@ public class JalviewLiteURLRetrieve extends Applet
String format = getParameter("format");
if (format == null || format.length() == 0)
{
- format = new jalview.io.IdentifyFile().Identify(file, protocol);
+ format = new jalview.io.IdentifyFile().identify(file, protocol);
System.out.println("Format is " + format);
}
else
diff --git a/src/jalview/controller/AlignViewController.java b/src/jalview/controller/AlignViewController.java
index 6a7e222..ca2ae6d 100644
--- a/src/jalview/controller/AlignViewController.java
+++ b/src/jalview/controller/AlignViewController.java
@@ -37,7 +37,6 @@ import jalview.io.FeaturesFile;
import jalview.util.MessageManager;
import java.awt.Color;
-import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;
@@ -317,46 +316,23 @@ public class AlignViewController implements AlignViewControllerI
}
@Override
- public void sortAlignmentByFeatureDensity(String[] typ)
+ public void sortAlignmentByFeatureDensity(List typ)
{
sortBy(typ, "Sort by Density", AlignmentSorter.FEATURE_DENSITY);
}
- protected void sortBy(String[] typ, String methodText, final String method)
+ protected void sortBy(List typ, String methodText,
+ final String method)
{
FeatureRenderer fr = alignPanel.getFeatureRenderer();
- if (typ == null)
+ if (typ == null && fr != null)
{
- typ = fr == null ? null : fr.getDisplayedFeatureTypes();
+ typ = fr.getDisplayedFeatureTypes();
}
- String gps[] = null;
- gps = fr == null ? null : fr.getDisplayedFeatureGroups();
- if (typ != null)
+ List gps = null;
+ if (fr != null)
{
- List types = new ArrayList();
- for (String displayed : typ)
- {
- if (displayed != null)
- {
- types.add(displayed);
- }
- }
- typ = new String[types.size()];
- types.toArray(typ);
- }
- if (gps != null)
- {
- List grps = new ArrayList();
-
- for (int i = 0; i < gps.length; i++)
- {
- if (gps[i] != null)
- {
- grps.add(gps[i]);
- }
- }
- gps = new String[grps.size()];
- grps.toArray(gps);
+ gps = fr.getDisplayedFeatureGroups();
}
AlignmentI al = viewport.getAlignment();
@@ -381,7 +357,7 @@ public class AlignViewController implements AlignViewControllerI
}
@Override
- public void sortAlignmentByFeatureScore(String[] typ)
+ public void sortAlignmentByFeatureScore(List typ)
{
sortBy(typ, "Sort by Feature Score", AlignmentSorter.FEATURE_SCORE);
}
@@ -393,7 +369,7 @@ public class AlignViewController implements AlignViewControllerI
boolean featuresFile = false;
try
{
- featuresFile = new FeaturesFile(file, protocol).parse(viewport
+ featuresFile = new FeaturesFile(false, file, protocol).parse(viewport
.getAlignment().getDataset(), alignPanel.getFeatureRenderer()
.getFeatureColours(), false, relaxedIdMatching);
} catch (Exception ex)
diff --git a/src/jalview/datamodel/AlignedCodon.java b/src/jalview/datamodel/AlignedCodon.java
index 6179831..39a1853 100644
--- a/src/jalview/datamodel/AlignedCodon.java
+++ b/src/jalview/datamodel/AlignedCodon.java
@@ -27,32 +27,38 @@ package jalview.datamodel;
*
* Example: in "G-AT-C-GA" the aligned codons are (0, 2, 3) and (5, 7, 8).
*
- * JBPComment: Is this useful anywhere other than jalview.analysis.Dna ?
- *
* @author gmcarstairs
*
*/
public final class AlignedCodon
{
+ // base 1 aligned sequence position (base 0)
public final int pos1;
+ // base 2 aligned sequence position (base 0)
public final int pos2;
+ // base 3 aligned sequence position (base 0)
public final int pos3;
+ // peptide aligned sequence position (base 0)
+ public final int peptideCol;
+
+ // peptide coded for by this codon
public final String product;
public AlignedCodon(int i, int j, int k)
{
- this(i, j, k, null);
+ this(i, j, k, null, 0);
}
- public AlignedCodon(int i, int j, int k, String prod)
+ public AlignedCodon(int i, int j, int k, String prod, int prodCol)
{
pos1 = i;
pos2 = j;
pos3 = k;
product = prod;
+ peptideCol = prodCol;
}
/**
diff --git a/src/jalview/datamodel/AlignedCodonFrame.java b/src/jalview/datamodel/AlignedCodonFrame.java
index 9c642cf..3fc8c28 100644
--- a/src/jalview/datamodel/AlignedCodonFrame.java
+++ b/src/jalview/datamodel/AlignedCodonFrame.java
@@ -33,23 +33,40 @@ import java.util.List;
public class AlignedCodonFrame
{
- /**
- * tied array of na Sequence objects.
+ /*
+ * Data bean to hold mappings from one sequence to another
*/
- private SequenceI[] dnaSeqs = null;
+ private class SequenceToSequenceMapping
+ {
+ private SequenceI fromSeq;
- /**
- * tied array of Mappings to protein sequence Objects and SequenceI[]
- * aaSeqs=null; MapLists where each maps from the corresponding dnaSeqs
- * element to corresponding aaSeqs element
- */
- private Mapping[] dnaToProt = null;
+ private Mapping mapping;
+
+ SequenceToSequenceMapping(SequenceI from, Mapping map)
+ {
+ this.fromSeq = from;
+ this.mapping = map;
+ }
+
+ /**
+ * Readable representation for debugging only, not guaranteed not to change
+ */
+ @Override
+ public String toString()
+ {
+ return String.format("From %s %s", fromSeq.getName(),
+ mapping.toString());
+ }
+ }
+
+ private List mappings;
/**
* Constructor
*/
public AlignedCodonFrame()
{
+ mappings = new ArrayList();
}
/**
@@ -62,68 +79,75 @@ public class AlignedCodonFrame
*/
public void addMap(SequenceI dnaseq, SequenceI aaseq, MapList map)
{
- int nlen = 1;
- if (dnaSeqs != null)
- {
- nlen = dnaSeqs.length + 1;
- }
- SequenceI[] ndna = new SequenceI[nlen];
- Mapping[] ndtp = new Mapping[nlen];
- if (dnaSeqs != null)
- {
- System.arraycopy(dnaSeqs, 0, ndna, 0, dnaSeqs.length);
- System.arraycopy(dnaToProt, 0, ndtp, 0, dnaSeqs.length);
- }
- dnaSeqs = ndna;
- dnaToProt = ndtp;
- nlen--;
- dnaSeqs[nlen] = (dnaseq.getDatasetSequence() == null) ? dnaseq : dnaseq
- .getDatasetSequence();
- Mapping mp = new Mapping(map);
// JBPNote DEBUG! THIS !
// dnaseq.transferAnnotation(aaseq, mp);
// aaseq.transferAnnotation(dnaseq, new Mapping(map.getInverse()));
- mp.to = (aaseq.getDatasetSequence() == null) ? aaseq : aaseq
+
+ SequenceI fromSeq = (dnaseq.getDatasetSequence() == null) ? dnaseq
+ : dnaseq.getDatasetSequence();
+ SequenceI toSeq = (aaseq.getDatasetSequence() == null) ? aaseq : aaseq
.getDatasetSequence();
- dnaToProt[nlen] = mp;
+
+ /*
+ * if we already hold a mapping between these sequences, just add to it
+ */
+ for (SequenceToSequenceMapping ssm : mappings)
+ {
+ if (ssm.fromSeq == fromSeq && ssm.mapping.to == toSeq)
+ {
+ ssm.mapping.map.addMapList(map);
+ return;
+ }
+ }
+
+ /*
+ * otherwise, add a new sequence mapping
+ */
+ Mapping mp = new Mapping(toSeq, map);
+ mappings.add(new SequenceToSequenceMapping(fromSeq, mp));
}
public SequenceI[] getdnaSeqs()
{
- return dnaSeqs;
+ // TODO return a list instead?
+ // return dnaSeqs;
+ List seqs = new ArrayList();
+ for (SequenceToSequenceMapping ssm : mappings)
+ {
+ seqs.add(ssm.fromSeq);
+ }
+ return seqs.toArray(new SequenceI[seqs.size()]);
}
public SequenceI[] getAaSeqs()
{
- if (dnaToProt == null)
+ // TODO not used - remove?
+ List seqs = new ArrayList();
+ for (SequenceToSequenceMapping ssm : mappings)
{
- return null;
- }
- SequenceI[] sqs = new SequenceI[dnaToProt.length];
- for (int sz = 0; sz < dnaToProt.length; sz++)
- {
- sqs[sz] = dnaToProt[sz].to;
+ seqs.add(ssm.mapping.to);
}
- return sqs;
+ return seqs.toArray(new SequenceI[seqs.size()]);
}
public MapList[] getdnaToProt()
{
- if (dnaToProt == null)
- {
- return null;
- }
- MapList[] sqs = new MapList[dnaToProt.length];
- for (int sz = 0; sz < dnaToProt.length; sz++)
+ List maps = new ArrayList();
+ for (SequenceToSequenceMapping ssm : mappings)
{
- sqs[sz] = dnaToProt[sz].map;
+ maps.add(ssm.mapping.map);
}
- return sqs;
+ return maps.toArray(new MapList[maps.size()]);
}
public Mapping[] getProtMappings()
{
- return dnaToProt;
+ List maps = new ArrayList();
+ for (SequenceToSequenceMapping ssm : mappings)
+ {
+ maps.add(ssm.mapping);
+ }
+ return maps.toArray(new Mapping[maps.size()]);
}
/**
@@ -135,18 +159,14 @@ public class AlignedCodonFrame
*/
public Mapping getMappingForSequence(SequenceI seq)
{
- if (dnaSeqs == null)
- {
- return null;
- }
SequenceI seqDs = seq.getDatasetSequence();
seqDs = seqDs != null ? seqDs : seq;
- for (int ds = 0; ds < dnaSeqs.length; ds++)
+ for (SequenceToSequenceMapping ssm : mappings)
{
- if (dnaSeqs[ds] == seqDs || dnaToProt[ds].to == seqDs)
+ if (ssm.fromSeq == seqDs || ssm.mapping.to == seqDs)
{
- return dnaToProt[ds];
+ return ssm.mapping;
}
}
return null;
@@ -161,16 +181,12 @@ public class AlignedCodonFrame
*/
public SequenceI getAaForDnaSeq(SequenceI dnaSeqRef)
{
- if (dnaSeqs == null)
- {
- return null;
- }
SequenceI dnads = dnaSeqRef.getDatasetSequence();
- for (int ds = 0; ds < dnaSeqs.length; ds++)
+ for (SequenceToSequenceMapping ssm : mappings)
{
- if (dnaSeqs[ds] == dnaSeqRef || dnaSeqs[ds] == dnads)
+ if (ssm.fromSeq == dnaSeqRef || ssm.fromSeq == dnads)
{
- return dnaToProt[ds].to;
+ return ssm.mapping.to;
}
}
return null;
@@ -183,16 +199,12 @@ public class AlignedCodonFrame
*/
public SequenceI getDnaForAaSeq(SequenceI aaSeqRef)
{
- if (dnaToProt == null)
- {
- return null;
- }
SequenceI aads = aaSeqRef.getDatasetSequence();
- for (int as = 0; as < dnaToProt.length; as++)
+ for (SequenceToSequenceMapping ssm : mappings)
{
- if (dnaToProt[as].to == aaSeqRef || dnaToProt[as].to == aads)
+ if (ssm.mapping.to == aaSeqRef || ssm.mapping.to == aads)
{
- return dnaSeqs[as];
+ return ssm.fromSeq;
}
}
return null;
@@ -224,36 +236,30 @@ public class AlignedCodonFrame
public void markMappedRegion(SequenceI seq, int index,
SearchResults results)
{
- if (dnaToProt == null)
- {
- return;
- }
int[] codon;
SequenceI ds = seq.getDatasetSequence();
- for (int mi = 0; mi < dnaToProt.length; mi++)
+ for (SequenceToSequenceMapping ssm : mappings)
{
- if (dnaSeqs[mi] == seq || dnaSeqs[mi] == ds)
+ if (ssm.fromSeq == seq || ssm.fromSeq == ds)
{
- // DEBUG System.err.println("dna pos "+index);
- codon = dnaToProt[mi].map.locateInTo(index, index);
+ codon = ssm.mapping.map.locateInTo(index, index);
if (codon != null)
{
for (int i = 0; i < codon.length; i += 2)
{
- results.addResult(dnaToProt[mi].to, codon[i], codon[i + 1]);
+ results.addResult(ssm.mapping.to, codon[i], codon[i + 1]);
}
}
}
- else if (dnaToProt[mi].to == seq || dnaToProt[mi].to == ds)
+ else if (ssm.mapping.to == seq || ssm.mapping.to == ds)
{
- // DEBUG System.err.println("aa pos "+index);
{
- codon = dnaToProt[mi].map.locateInFrom(index, index);
+ codon = ssm.mapping.map.locateInFrom(index, index);
if (codon != null)
{
for (int i = 0; i < codon.length; i += 2)
{
- results.addResult(dnaSeqs[mi], codon[i], codon[i + 1]);
+ results.addResult(ssm.fromSeq, codon[i], codon[i + 1]);
}
}
}
@@ -282,13 +288,15 @@ public class AlignedCodonFrame
* Adapted from markMappedRegion().
*/
MapList ml = null;
- for (int i = 0; i < dnaToProt.length; i++)
+ int i = 0;
+ for (SequenceToSequenceMapping ssm : mappings)
{
- if (dnaSeqs[i] == seq)
+ if (ssm.fromSeq == seq)
{
ml = getdnaToProt()[i];
break;
}
+ i++;
}
return ml == null ? null : ml.locateInFrom(aaPos, aaPos);
}
@@ -307,18 +315,16 @@ public class AlignedCodonFrame
/*
* Search mapped protein ('to') sequences first.
*/
- if (this.dnaToProt != null)
+ for (SequenceToSequenceMapping ssm : mappings)
{
- for (int i = 0; i < dnaToProt.length; i++)
+ if (ssm.fromSeq == seq)
{
- if (this.dnaSeqs[i] == seq)
+ for (SequenceI sourceAligned : al.getSequences())
{
- for (SequenceI sourceAligned : al.getSequences())
+ if (ssm.mapping.to == sourceAligned.getDatasetSequence()
+ || ssm.mapping.to == sourceAligned)
{
- if (this.dnaToProt[i].to == sourceAligned.getDatasetSequence())
- {
- return sourceAligned;
- }
+ return sourceAligned;
}
}
}
@@ -327,18 +333,15 @@ public class AlignedCodonFrame
/*
* Then try mapped dna sequences.
*/
- if (this.dnaToProt != null)
+ for (SequenceToSequenceMapping ssm : mappings)
{
- for (int i = 0; i < dnaToProt.length; i++)
+ if (ssm.mapping.to == seq)
{
- if (this.dnaToProt[i].to == seq)
+ for (SequenceI sourceAligned : al.getSequences())
{
- for (SequenceI sourceAligned : al.getSequences())
+ if (ssm.fromSeq == sourceAligned.getDatasetSequence())
{
- if (this.dnaSeqs[i] == sourceAligned.getDatasetSequence())
- {
- return sourceAligned;
- }
+ return sourceAligned;
}
}
}
@@ -348,31 +351,45 @@ public class AlignedCodonFrame
}
/**
- * Returns the region in the 'mappedFrom' sequence's dataset that is mapped to
- * position 'pos' (base 1) in the 'mappedTo' sequence's dataset. The region is
- * a set of start/end position pairs.
+ * Returns the region in the target sequence's dataset that is mapped to the
+ * given position (base 1) in the query sequence's dataset. The region is a
+ * set of start/end position pairs.
*
- * @param mappedFrom
- * @param mappedTo
- * @param pos
+ * @param target
+ * @param query
+ * @param queryPos
* @return
*/
- public int[] getMappedRegion(SequenceI mappedFrom, SequenceI mappedTo,
- int pos)
+ public int[] getMappedRegion(SequenceI target, SequenceI query,
+ int queryPos)
{
- SequenceI targetDs = mappedFrom.getDatasetSequence() == null ? mappedFrom
- : mappedFrom.getDatasetSequence();
- SequenceI sourceDs = mappedTo.getDatasetSequence() == null ? mappedTo
- : mappedTo.getDatasetSequence();
- if (targetDs == null || sourceDs == null || dnaToProt == null)
+ SequenceI targetDs = target.getDatasetSequence() == null ? target
+ : target.getDatasetSequence();
+ SequenceI queryDs = query.getDatasetSequence() == null ? query : query
+ .getDatasetSequence();
+ if (targetDs == null || queryDs == null /*|| dnaToProt == null*/)
{
return null;
}
- for (int mi = 0; mi < dnaToProt.length; mi++)
+ for (SequenceToSequenceMapping ssm : mappings)
{
- if (dnaSeqs[mi] == targetDs && dnaToProt[mi].to == sourceDs)
+ /*
+ * try mapping from target to query
+ */
+ if (ssm.fromSeq == targetDs && ssm.mapping.to == queryDs)
+ {
+ int[] codon = ssm.mapping.map.locateInFrom(queryPos, queryPos);
+ if (codon != null)
+ {
+ return codon;
+ }
+ }
+ /*
+ * else try mapping from query to target
+ */
+ else if (ssm.fromSeq == queryDs && ssm.mapping.to == targetDs)
{
- int[] codon = dnaToProt[mi].map.locateInFrom(pos, pos);
+ int[] codon = ssm.mapping.map.locateInTo(queryPos, queryPos);
if (codon != null)
{
return codon;
@@ -383,8 +400,10 @@ public class AlignedCodonFrame
}
/**
- * Returns the DNA codon for the given position (base 1) in a mapped protein
- * sequence, or null if no mapping is found.
+ * Returns the mapped DNA codons for the given position in a protein sequence,
+ * or null if no mapping is found. Returns a list of (e.g.) ['g', 'c', 't']
+ * codons. There may be more than one codon mapped to the protein if (for
+ * example), there are mappings to cDNA variants.
*
* @param protein
* the peptide dataset sequence
@@ -392,41 +411,36 @@ public class AlignedCodonFrame
* residue position (base 1) in the peptide sequence
* @return
*/
- public char[] getMappedCodon(SequenceI protein, int aaPos)
+ public List getMappedCodons(SequenceI protein, int aaPos)
{
- if (dnaToProt == null)
- {
- return null;
- }
MapList ml = null;
SequenceI dnaSeq = null;
- for (int i = 0; i < dnaToProt.length; i++)
+ List result = new ArrayList();
+
+ for (SequenceToSequenceMapping ssm : mappings)
{
- if (dnaToProt[i].to == protein)
+ if (ssm.mapping.to == protein)
{
- ml = getdnaToProt()[i];
- dnaSeq = dnaSeqs[i];
- break;
+ ml = ssm.mapping.map;
+ dnaSeq = ssm.fromSeq;
+
+ int[] codonPos = ml.locateInFrom(aaPos, aaPos);
+ if (codonPos == null)
+ {
+ return null;
+ }
+
+ /*
+ * Read off the mapped nucleotides (converting to position base 0)
+ */
+ codonPos = MappingUtils.flattenRanges(codonPos);
+ char[] dna = dnaSeq.getSequence();
+ int start = dnaSeq.getStart();
+ result.add(new char[] { dna[codonPos[0] - start],
+ dna[codonPos[1] - start], dna[codonPos[2] - start] });
}
}
- if (ml == null)
- {
- return null;
- }
- int[] codonPos = ml.locateInFrom(aaPos, aaPos);
- if (codonPos == null)
- {
- return null;
- }
-
- /*
- * Read off the mapped nucleotides (converting to position base 0)
- */
- codonPos = MappingUtils.flattenRanges(codonPos);
- char[] dna = dnaSeq.getSequence();
- int start = dnaSeq.getStart();
- return new char[] { dna[codonPos[0] - start], dna[codonPos[1] - start],
- dna[codonPos[2] - start] };
+ return result.isEmpty() ? null : result;
}
/**
@@ -439,18 +453,14 @@ public class AlignedCodonFrame
public List getMappingsForSequence(SequenceI seq)
{
List result = new ArrayList();
- if (dnaSeqs == null)
- {
- return result;
- }
List related = new ArrayList();
SequenceI seqDs = seq.getDatasetSequence();
seqDs = seqDs != null ? seqDs : seq;
- for (int ds = 0; ds < dnaSeqs.length; ds++)
+ for (SequenceToSequenceMapping ssm : mappings)
{
- final Mapping mapping = dnaToProt[ds];
- if (dnaSeqs[ds] == seqDs || mapping.to == seqDs)
+ final Mapping mapping = ssm.mapping;
+ if (ssm.fromSeq == seqDs || mapping.to == seqDs)
{
if (!related.contains(mapping.to))
{
@@ -461,4 +471,171 @@ public class AlignedCodonFrame
}
return result;
}
+
+ /**
+ * Tests whether the given sequence is substitutable for one or more dummy
+ * sequences in this mapping. Only counts candidates; nothing is replaced.
+ *
+ * @param seq
+ * the real sequence offered as a replacement
+ * @return true if realiseWith(seq, false) finds at least one candidate
+ */
+ public boolean isRealisableWith(SequenceI seq)
+ {
+ return realiseWith(seq, false) > 0;
+ }
+
+ /**
+ * Replace any matchable mapped dummy sequences with the given real one, by
+ * delegating to realiseWith(seq, true).
+ *
+ * @param seq the real sequence to substitute for matching dummy sequences
+ * @return the count of sequence mappings instantiated
+ */
+ public int realiseWith(SequenceI seq)
+ {
+ return realiseWith(seq, true);
+ }
+
+ /**
+ * Returns the number of mapped dummy sequences that could be replaced with
+ * the given real sequence, optionally performing the replacements.
+ *
+ * @param seq
+ * a sequence; its dataset sequence is used if it has one
+ * @param doUpdate
+ * if true, performs replacements, else only counts
+ * @return the count of 'map from' and 'map to' dummy sequences matched
+ */
+ protected int realiseWith(SequenceI seq, boolean doUpdate)
+ {
+ SequenceI ds = seq.getDatasetSequence() != null ? seq
+ .getDatasetSequence() : seq;
+ int count = 0;
+
+ /*
+ * check for replaceable DNA ('map from') sequences
+ */
+ for (SequenceToSequenceMapping ssm : mappings)
+ {
+ SequenceI dna = ssm.fromSeq;
+ if (dna instanceof SequenceDummy
+ && dna.getName().equals(ds.getName()))
+ {
+ Mapping mapping = ssm.mapping;
+ int mapStart = mapping.getMap().getFromLowest();
+ int mapEnd = mapping.getMap().getFromHighest();
+ boolean mappable = couldRealiseSequence(dna, ds, mapStart, mapEnd);
+ if (mappable)
+ {
+ count++;
+ if (doUpdate)
+ {
+ // TODO: new method ? ds.realise(dna);
+ // might want to copy database refs as well
+ ds.setSequenceFeatures(dna.getSequenceFeatures());
+ // the real sequence now takes the dummy's place as 'map from'
+ ssm.fromSeq = ds;
+ System.out.println("Realised mapped sequence " + ds.getName());
+ }
+ }
+ }
+
+ /*
+ * check for replaceable protein ('map to') sequences
+ */
+ Mapping mapping = ssm.mapping;
+ SequenceI prot = mapping.getTo();
+ int mapStart = mapping.getMap().getToLowest();
+ int mapEnd = mapping.getMap().getToHighest();
+ boolean mappable = couldRealiseSequence(prot, ds, mapStart, mapEnd);
+ if (mappable)
+ {
+ count++;
+ if (doUpdate)
+ {
+ // features come from the dummy being replaced, i.e. prot (not dna);
+ // TODO might want to copy database refs as well
+ ds.setSequenceFeatures(prot.getSequenceFeatures());
+ ssm.mapping.setTo(ds);
+ }
+ }
+ }
+ return count;
+ }
+
+ /**
+ * Helper method to test whether a 'real' sequence could replace a 'dummy'
+ * sequence in the map. The criteria are that they have the same name, and
+ * that the mapped region overlaps the candidate sequence.
+ *
+ * @param existing the mapped sequence; only a SequenceDummy is replaceable
+ * @param replacement the candidate; must not itself be a SequenceDummy
+ * @param mapStart one end of the mapped region
+ * @param mapEnd the other end of the mapped region
+ * @return true if replacement could stand in for existing
+ */
+ protected static boolean couldRealiseSequence(SequenceI existing,
+ SequenceI replacement, int mapStart, int mapEnd)
+ {
+ if (existing instanceof SequenceDummy
+ && !(replacement instanceof SequenceDummy)
+ && existing.getName().equals(replacement.getName()))
+ {
+ // closed-interval overlap test, tolerant of reversed map bounds; unlike
+ // an endpoint-only check it accepts a mapping enclosing the sequence
+ int lo = Math.min(mapStart, mapEnd);
+ int hi = Math.max(mapStart, mapEnd);
+ if (lo <= replacement.getEnd() && hi >= replacement.getStart())
+ {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Change any mapping to the given sequence to be to its dataset sequence
+ * instead. For use when mappings are created before their referenced
+ * sequences are instantiated, for example when parsing GFF data.
+ *
+ * @param seq ignored if null or if it has no dataset sequence
+ */
+ public void updateToDataset(SequenceI seq)
+ {
+ if (seq == null || seq.getDatasetSequence() == null)
+ {
+ return;
+ }
+ SequenceI ds = seq.getDatasetSequence();
+
+ for (SequenceToSequenceMapping ssm : mappings)
+ {
+ /*
+ * 'from' sequences (matched by object identity)
+ */
+ if (ssm.fromSeq == seq)
+ {
+ ssm.fromSeq = ds;
+ }
+
+ /*
+ * 'to' sequences (matched by object identity)
+ */
+ if (ssm.mapping.to == seq)
+ {
+ ssm.mapping.to = ds;
+ }
+ }
+ }
+
+ /**
+ * Answers true if this object contains no mappings
+ *
+ * @return true if the list of sequence-to-sequence mappings is empty
+ */
+ public boolean isEmpty()
+ {
+ return mappings.isEmpty();
+ }
}
diff --git a/src/jalview/datamodel/Alignment.java b/src/jalview/datamodel/Alignment.java
index 7ea9985..1134857 100755
--- a/src/jalview/datamodel/Alignment.java
+++ b/src/jalview/datamodel/Alignment.java
@@ -22,13 +22,14 @@ package jalview.datamodel;
import jalview.analysis.AlignmentUtils;
import jalview.io.FastaFile;
+import jalview.util.Comparison;
import jalview.util.MessageManager;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Hashtable;
-import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -47,8 +48,7 @@ public class Alignment implements AlignmentI
protected List sequences;
- protected List groups = java.util.Collections
- .synchronizedList(new ArrayList());
+ protected List groups;
protected char gapCharacter = '-';
@@ -60,20 +60,21 @@ public class Alignment implements AlignmentI
public boolean hasRNAStructure = false;
- /** DOCUMENT ME!! */
public AlignmentAnnotation[] annotations;
- HiddenSequences hiddenSequences = new HiddenSequences(this);
+ HiddenSequences hiddenSequences;
public Hashtable alignmentProperties;
- private Set codonFrameList = new LinkedHashSet();
+ private List codonFrameList;
private void initAlignment(SequenceI[] seqs)
{
- int i = 0;
+ groups = Collections.synchronizedList(new ArrayList());
+ hiddenSequences = new HiddenSequences(this);
+ codonFrameList = new ArrayList();
- if (jalview.util.Comparison.isNucleotide(seqs))
+ if (Comparison.isNucleotide(seqs))
{
type = NUCLEOTIDE;
}
@@ -82,10 +83,9 @@ public class Alignment implements AlignmentI
type = PROTEIN;
}
- sequences = java.util.Collections
- .synchronizedList(new ArrayList());
+ sequences = Collections.synchronizedList(new ArrayList());
- for (i = 0; i < seqs.length; i++)
+ for (int i = 0; i < seqs.length; i++)
{
sequences.add(seqs[i]);
}
@@ -104,13 +104,12 @@ public class Alignment implements AlignmentI
seqs[i] = new Sequence(seqs[i]);
}
+ initAlignment(seqs);
+
/*
- * Share the same dataset sequence mappings (if any). TODO: find a better
- * place for these to live (alignment dataset?).
+ * Share the same dataset sequence mappings (if any).
*/
- this.codonFrameList = ((Alignment) al).codonFrameList;
-
- initAlignment(seqs);
+ this.setCodonFrames(al.getCodonFrames());
}
/**
@@ -991,25 +990,7 @@ public class Alignment implements AlignmentI
{
if (dataset == null && data == null)
{
- // Create a new dataset for this alignment.
- // Can only be done once, if dataset is not null
- // This will not be performed
- SequenceI[] seqs = new SequenceI[getHeight()];
- SequenceI currentSeq;
- for (int i = 0; i < getHeight(); i++)
- {
- currentSeq = getSequenceAt(i);
- if (currentSeq.getDatasetSequence() != null)
- {
- seqs[i] = currentSeq.getDatasetSequence();
- }
- else
- {
- seqs[i] = currentSeq.createDatasetSequence();
- }
- }
-
- dataset = new Alignment(seqs);
+ createDatasetAlignment();
}
else if (dataset == null && data != null)
{
@@ -1040,6 +1021,37 @@ public class Alignment implements AlignmentI
}
/**
+ * Creates a new dataset alignment for this alignment and moves any codon
+ * frame mappings onto it. Does nothing if a dataset has already been set.
+ */
+ public void createDatasetAlignment()
+ {
+ if (dataset != null)
+ {
+ return;
+ }
+ SequenceI[] seqs = new SequenceI[getHeight()];
+ SequenceI currentSeq;
+ for (int i = 0; i < getHeight(); i++)
+ {
+ currentSeq = getSequenceAt(i);
+ if (currentSeq.getDatasetSequence() != null)
+ {
+ seqs[i] = currentSeq.getDatasetSequence();
+ }
+ else
+ {
+ seqs[i] = currentSeq.createDatasetSequence();
+ }
+ }
+
+ dataset = new Alignment(seqs);
+ // hand the mappings over to the dataset; this alignment keeps none itself
+ dataset.codonFrameList = this.codonFrameList;
+ this.codonFrameList = null;
+ }
+
+ /**
* reference count for number of alignments referencing this one.
*/
int alignmentRefs = 0;
@@ -1261,19 +1273,17 @@ public class Alignment implements AlignmentI
return alignmentProperties;
}
- /*
- * (non-Javadoc)
- *
- * @see
- * jalview.datamodel.AlignmentI#addCodonFrame(jalview.datamodel.AlignedCodonFrame
- * )
+ /**
+ * Adds the given mapping to the stored set. Note this may be held on the
+ * dataset alignment.
*/
@Override
public void addCodonFrame(AlignedCodonFrame codons)
{
- if (codons != null)
+ List acfs = getCodonFrames();
+ if (codons != null && acfs != null && !acfs.contains(codons))
{
- codonFrameList.add(codons);
+ acfs.add(codons);
}
}
@@ -1291,7 +1301,7 @@ public class Alignment implements AlignmentI
return null;
}
List cframes = new ArrayList();
- for (AlignedCodonFrame acf : codonFrameList)
+ for (AlignedCodonFrame acf : getCodonFrames())
{
if (acf.involvesSequence(seq))
{
@@ -1302,42 +1312,50 @@ public class Alignment implements AlignmentI
}
/**
- * Sets the codon frame mappings (replacing any existing mappings).
+ * Sets the codon frame mappings (replacing any existing mappings). Note the
+ * mappings are set on the dataset alignment instead if there is one.
*
* @see jalview.datamodel.AlignmentI#setCodonFrames()
*/
@Override
- public void setCodonFrames(Set acfs)
+ public void setCodonFrames(List acfs)
{
- this.codonFrameList = acfs;
+ if (dataset != null)
+ {
+ dataset.setCodonFrames(acfs);
+ }
+ else
+ {
+ this.codonFrameList = acfs;
+ }
}
/**
* Returns the set of codon frame mappings. Any changes to the returned set
- * will affect the alignment.
+ * will affect the alignment. The mappings are held on (and read from) the
+ * dataset alignment if there is one.
*
* @see jalview.datamodel.AlignmentI#getCodonFrames()
*/
@Override
- public Set getCodonFrames()
+ public List getCodonFrames()
{
- return codonFrameList;
+ return dataset != null ? dataset.getCodonFrames() : codonFrameList;
}
- /*
- * (non-Javadoc)
- *
- * @seejalview.datamodel.AlignmentI#removeCodonFrame(jalview.datamodel.
- * AlignedCodonFrame)
+ /**
+ * Removes the given mapping from the stored set. Note that the mappings are
+ * held on the dataset alignment if there is one.
*/
@Override
public boolean removeCodonFrame(AlignedCodonFrame codons)
{
- if (codons == null || codonFrameList == null)
+ List acfs = getCodonFrames();
+ if (codons == null || acfs == null)
{
return false;
}
- return codonFrameList.remove(codons);
+ return acfs.remove(codons);
}
@Override
@@ -1383,7 +1401,7 @@ public class Alignment implements AlignmentI
addAnnotation(alan[a]);
}
- this.codonFrameList.addAll(toappend.getCodonFrames());
+ getCodonFrames().addAll(toappend.getCodonFrames());
List sg = toappend.getGroups();
if (sg != null)
@@ -1595,6 +1613,7 @@ public class Alignment implements AlignmentI
*
* @return the representative sequence for this group
*/
+ @Override
public SequenceI getSeqrep()
{
return seqrep;
@@ -1607,6 +1626,7 @@ public class Alignment implements AlignmentI
* @param seqrep
* the seqrep to set (null means no sequence representative)
*/
+ @Override
public void setSeqrep(SequenceI seqrep)
{
this.seqrep = seqrep;
@@ -1616,6 +1636,7 @@ public class Alignment implements AlignmentI
*
* @return true if group has a sequence representative
*/
+ @Override
public boolean hasSeqrep()
{
return seqrep != null;
@@ -1748,4 +1769,46 @@ public class Alignment implements AlignmentI
}
return hasValidSeq;
}
+
+ /**
+ * Update any mappings to 'virtual' sequences to compatible real ones, if
+ * present in the added sequences. Returns a count of mappings updated.
+ *
+ * @param seqs the sequences to offer to every codon frame mapping
+ * @return the sum of the realiseWith counts over all sequences and mappings
+ */
+ @Override
+ public int realiseMappings(List seqs)
+ {
+ int count = 0;
+ for (SequenceI seq : seqs)
+ {
+ for (AlignedCodonFrame mapping : getCodonFrames())
+ {
+ count += mapping.realiseWith(seq);
+ }
+ }
+ return count;
+ }
+
+ /**
+ * Returns the first AlignedCodonFrame that has a mapping between the given
+ * dataset sequences, as judged by getAaForDnaSeq(mapFrom) == mapTo
+ *
+ * @param mapFrom the 'map from' sequence
+ * @param mapTo the 'map to' sequence, compared by object identity
+ * @return the first matching codon frame, or null if there is none
+ */
+ @Override
+ public AlignedCodonFrame getMapping(SequenceI mapFrom, SequenceI mapTo)
+ {
+ for (AlignedCodonFrame acf : getCodonFrames())
+ {
+ if (acf.getAaForDnaSeq(mapFrom) == mapTo)
+ {
+ return acf;
+ }
+ }
+ return null;
+ }
}
diff --git a/src/jalview/datamodel/AlignmentI.java b/src/jalview/datamodel/AlignmentI.java
index de79488..396ef2d 100755
--- a/src/jalview/datamodel/AlignmentI.java
+++ b/src/jalview/datamodel/AlignmentI.java
@@ -375,12 +375,12 @@ public interface AlignmentI extends AnnotatedCollectionI
*
* @return
*/
- Set getCodonFrames();
+ List getCodonFrames();
/**
- * Set the codon frame mappings (replacing any existing set).
+ * Set the codon frame mappings (replacing any existing list).
*/
- void setCodonFrames(Set acfs);
+ void setCodonFrames(List acfs);
/**
* get codon frames involving sequenceI
@@ -524,4 +524,23 @@ public interface AlignmentI extends AnnotatedCollectionI
* @return
*/
public boolean hasValidSequence();
+
+ /**
+ * Update any mappings to 'virtual' sequences to compatible real ones, if
+ * present in the added sequences. Returns a count of mappings updated.
+ *
+ * @param seqs the added sequences to try as replacements
+ * @return the count of mappings updated
+ */
+ int realiseMappings(List seqs);
+
+ /**
+ * Returns the first AlignedCodonFrame that has a mapping between the given
+ * dataset sequences
+ *
+ * @param mapFrom the dataset sequence mapped from
+ * @param mapTo the dataset sequence mapped to
+ * @return the first codon frame holding such a mapping
+ */
+ AlignedCodonFrame getMapping(SequenceI mapFrom, SequenceI mapTo);
}
diff --git a/src/jalview/datamodel/DBRefSource.java b/src/jalview/datamodel/DBRefSource.java
index 6a676cf..8783e4f 100755
--- a/src/jalview/datamodel/DBRefSource.java
+++ b/src/jalview/datamodel/DBRefSource.java
@@ -79,62 +79,18 @@ public class DBRefSource
public static final String GENEDB = "GeneDB".toUpperCase();
/**
- * List of databases whose sequences might have coding regions annotated
- */
- public static final String[] DNACODINGDBS = { EMBL, EMBLCDS, GENEDB };
-
- public static final String[] CODINGDBS = { EMBLCDS, GENEDB };
-
- public static final String[] PROTEINDBS = { UNIPROT, PDB, UNIPROTKB,
- EMBLCDSProduct };
-
- public static final String[] PROTEINSEQ = { UNIPROT, UNIPROTKB,
- EMBLCDSProduct };
-
- public static final String[] PROTEINSTR = { PDB };
-
- public static final String[] DOMAINDBS = { PFAM, RFAM };
-
- /**
- * set of unique DBRefSource property constants. These could be used to
- * reconstruct the above groupings
- */
- public static final Object SEQDB = "SQ";
-
- /**
- * database of nucleic acid sequences
- */
- public static final Object DNASEQDB = "NASQ";
-
- /**
- * database of amino acid sequences
- */
- public static final Object PROTSEQDB = "PROTSQ";
-
- /**
- * database of cDNA sequences
- */
- public static final Object CODINGSEQDB = "CODING";
-
- /**
- * database of na sequences with exon annotation
+ * Ensembl
*/
- public static final Object DNACODINGSEQDB = "XONCODING";
+ public static final String ENSEMBL = "ENSEMBL";
/**
- * DB returns several sequences associated with a protein/nucleotide domain
+ * List of databases whose sequences might have coding regions annotated
*/
- public static final Object DOMAINDB = "DOMAIN";
+ public static final String[] DNACODINGDBS = { EMBL, EMBLCDS, GENEDB,
+ ENSEMBL };
- /**
- * DB query can take multiple accession codes concatenated by a separator.
- * Value of property indicates maximum number of accession codes to send at a
- * time.
- */
- public static final Object MULTIACC = "MULTIACC";
+ public static final String[] CODINGDBS = { EMBLCDS, GENEDB, ENSEMBL };
- /**
- * DB query returns an alignment for each accession provided.
- */
- public static final Object ALIGNMENTDB = "ALIGNMENTS";
+ public static final String[] PROTEINDBS = { UNIPROT, PDB, UNIPROTKB,
+ EMBLCDSProduct, ENSEMBL }; // Ensembl ENSP* entries are protein
}
diff --git a/src/jalview/datamodel/Mapping.java b/src/jalview/datamodel/Mapping.java
index 6c619ce..bd83fe9 100644
--- a/src/jalview/datamodel/Mapping.java
+++ b/src/jalview/datamodel/Mapping.java
@@ -155,8 +155,9 @@ public class Mapping
int[] alignedCodon = getAlignedCodon(codon);
String peptide = getPeptide();
+ int peptideCol = toPosition - 1 - Mapping.this.to.getStart();
return new AlignedCodon(alignedCodon[0], alignedCodon[1],
- alignedCodon[2], peptide);
+ alignedCodon[2], peptide, peptideCol);
}
/**
@@ -164,6 +165,8 @@ public class Mapping
* sequence.
*
* @return
+ * @throws NoSuchElementException
+ * if the 'toRange' is exhausted (nothing to map to)
*/
private String getPeptide()
{
@@ -693,6 +696,7 @@ public class Mapping
*
* @see java.lang.Object#finalize()
*/
+ @Override
protected void finalize() throws Throwable
{
map = null;
@@ -700,9 +704,28 @@ public class Mapping
super.finalize();
}
+ /**
+ * Returns an iterator which can serve up the aligned codon column positions
+ * and their corresponding peptide products
+ *
+ * @param seq
+ * an aligned (i.e. possibly gapped) sequence
+ * @param gapChar
+ * @return
+ */
public Iterator getCodonIterator(SequenceI seq, char gapChar)
{
return new AlignedCodonIterator(seq, gapChar);
}
+ /**
+ * Readable representation for debugging only, not guaranteed not to change
+ */
+ @Override
+ public String toString()
+ {
+ return String.format("%s %s", this.map.toString(), this.to == null ? ""
+ : this.to.getName());
+ }
+
}
diff --git a/src/jalview/datamodel/MappingType.java b/src/jalview/datamodel/MappingType.java
new file mode 100644
index 0000000..c0c69aa
--- /dev/null
+++ b/src/jalview/datamodel/MappingType.java
@@ -0,0 +1,84 @@
+package jalview.datamodel;
+
+/**
+ * An enumeration of the kinds of mapping (from nucleotide or peptide, to
+ * nucleotide or peptide), and the corresponding word lengths
+ */
+public enum MappingType
+{
+ NucleotideToPeptide(3, 1)
+ {
+ @Override
+ public MappingType getInverse()
+ {
+ return PeptideToNucleotide;
+ }
+ },
+ PeptideToNucleotide(1, 3)
+ {
+ @Override
+ public MappingType getInverse()
+ {
+ return NucleotideToPeptide;
+ }
+ },
+ NucleotideToNucleotide(1, 1)
+ {
+ @Override
+ public MappingType getInverse()
+ {
+ return NucleotideToNucleotide;
+ }
+ },
+ PeptideToPeptide(1, 1)
+ {
+ @Override
+ public MappingType getInverse()
+ {
+ return PeptideToPeptide;
+ }
+ };
+
+ /*
+ * word length in the 'from' sequence, e.g. 3 for NucleotideToPeptide
+ */
+ private final int fromRatio;
+
+ /*
+ * word length in the 'to' sequence, e.g. 1 for NucleotideToPeptide
+ */
+ private final int toRatio;
+
+ /**
+ * @param fromSize word length in the 'from' sequence
+ * @param toSize word length in the 'to' sequence
+ */
+ private MappingType(int fromSize, int toSize)
+ {
+ fromRatio = fromSize;
+ toRatio = toSize;
+ }
+
+ /**
+ * Answers the mapping type for the opposite direction
+ *
+ * @return the type whose from and to are swapped relative to this one
+ */
+ public abstract MappingType getInverse();
+
+ /**
+ * @return the word length on the 'from' side of the mapping
+ */
+ public int getFromRatio()
+ {
+ return fromRatio;
+ }
+
+ /**
+ * @return the word length on the 'to' side of the mapping
+ */
+ public int getToRatio()
+ {
+ return toRatio;
+ }
+}
diff --git a/src/jalview/datamodel/SearchResults.java b/src/jalview/datamodel/SearchResults.java
index ad0e472..b9db461 100755
--- a/src/jalview/datamodel/SearchResults.java
+++ b/src/jalview/datamodel/SearchResults.java
@@ -67,8 +67,22 @@ public class SearchResults
public Match(SequenceI seq, int start, int end)
{
sequence = seq;
- this.start = start;
- this.end = end;
+
+ /*
+ * always hold in forwards order, even if given in reverse order
+ * (such as from a mapping to a reverse strand); this avoids
+ * trouble for routines that highlight search results etc
+ */
+ if (start <= end)
+ {
+ this.start = start;
+ this.end = end;
+ }
+ else
+ {
+ this.start = end;
+ this.end = start;
+ }
}
public SequenceI getSequence()
diff --git a/src/jalview/datamodel/Sequence.java b/src/jalview/datamodel/Sequence.java
index ac2f9c1..3ea510b 100755
--- a/src/jalview/datamodel/Sequence.java
+++ b/src/jalview/datamodel/Sequence.java
@@ -265,11 +265,10 @@ public class Sequence extends ASequence implements SequenceI
}
if (seq.getAllPDBEntries() != null)
{
- Vector ids = seq.getAllPDBEntries();
- Enumeration e = ids.elements();
- while (e.hasMoreElements())
+ Vector ids = seq.getAllPDBEntries();
+ for (PDBEntry pdb : ids)
{
- this.addPDBId(new PDBEntry((PDBEntry) e.nextElement()));
+ this.addPDBId(new PDBEntry(pdb));
}
}
}
@@ -289,6 +288,7 @@ public class Sequence extends ASequence implements SequenceI
@Override
public synchronized void addSequenceFeature(SequenceFeature sf)
{
+ // TODO add to dataset sequence instead if there is one?
if (sequenceFeatures == null)
{
sequenceFeatures = new SequenceFeature[0];
@@ -611,17 +611,15 @@ public class Sequence extends ASequence implements SequenceI
}
/**
- * DOCUMENT ME!
- *
- * @param i
- * DOCUMENT ME!
+ * Returns the character of the aligned sequence at the given position (base
+ * zero), or space if the position is not within the sequence's bounds
*
- * @return DOCUMENT ME!
+ * @return
*/
@Override
public char getCharAt(int i)
{
- if (i < sequence.length)
+ if (i >= 0 && i < sequence.length)
{
return sequence[i];
}
@@ -931,6 +929,7 @@ public class Sequence extends ASequence implements SequenceI
@Override
public void addDBRef(DBRefEntry entry)
{
+ // TODO add to dataset sequence instead if there is one?
if (dbrefs == null)
{
dbrefs = new DBRefEntry[0];
@@ -964,6 +963,7 @@ public class Sequence extends ASequence implements SequenceI
@Override
public void setDatasetSequence(SequenceI seq)
{
+ // TODO check for circular reference before setting?
datasetSequence = seq;
}
diff --git a/src/jalview/datamodel/SequenceDummy.java b/src/jalview/datamodel/SequenceDummy.java
index 7e3c187..172c25f 100644
--- a/src/jalview/datamodel/SequenceDummy.java
+++ b/src/jalview/datamodel/SequenceDummy.java
@@ -20,7 +20,7 @@
*/
package jalview.datamodel;
-public class SequenceDummy extends Sequence implements SequenceI
+public class SequenceDummy extends Sequence
{
public SequenceDummy(String sequenceId)
{
@@ -50,4 +50,14 @@ public class SequenceDummy extends Sequence implements SequenceI
{
return dummy;
}
+
+ /**
+ * Always suppress /start-end for display name as we don't know it
+ *
+ * @param jvsuffix
+ * ignored: the superclass method is always called with false
+ * @return the result of super.getDisplayId(false)
+ */
+ @Override
+ public String getDisplayId(boolean jvsuffix)
+ {
+ // required for correct behaviour of SequenceIdMatcher
+ return super.getDisplayId(false);
+ }
}
diff --git a/src/jalview/datamodel/SequenceFeature.java b/src/jalview/datamodel/SequenceFeature.java
index 1b6498f..8146400 100755
--- a/src/jalview/datamodel/SequenceFeature.java
+++ b/src/jalview/datamodel/SequenceFeature.java
@@ -20,7 +20,8 @@
*/
package jalview.datamodel;
-import java.util.Hashtable;
+import java.util.HashMap;
+import java.util.Map;
import java.util.Vector;
/**
@@ -31,6 +32,15 @@ import java.util.Vector;
*/
public class SequenceFeature
{
+ private static final String STATUS = "status";
+
+ private static final String STRAND = "STRAND";
+
+ // private key for Phase designed not to conflict with real GFF data
+ private static final String PHASE = "!Phase";
+
+ private static final String ATTRIBUTES = "ATTRIBUTES";
+
public int begin;
public int end;
@@ -41,7 +51,7 @@ public class SequenceFeature
public String description;
- public Hashtable otherDetails;
+ public Map otherDetails;
public Vector links;
@@ -54,9 +64,9 @@ public class SequenceFeature
}
/**
- * Constructs a duplicate feature. Note: Uses clone on the otherDetails so
- * only shallow copies are made of additional properties and method will
- * silently fail if unclonable objects are found in the hash.
+ * Constructs a duplicate feature. Note: makes a shallow copy of the
+ * otherDetails map, so the new and original SequenceFeature may reference the
+ * same objects in the map.
*
* @param cpy
*/
@@ -83,10 +93,11 @@ public class SequenceFeature
{
try
{
- otherDetails = (Hashtable) cpy.otherDetails.clone();
+ otherDetails = (Map) ((HashMap) cpy.otherDetails)
+ .clone();
} catch (Exception e)
{
- // Uncloneable objects in the otherDetails - don't complain
+ // ignore
}
}
if (cpy.links != null && cpy.links.size() > 0)
@@ -105,7 +116,7 @@ public class SequenceFeature
{
this.type = type;
this.description = desc;
- setValue("status", status);
+ setValue(STATUS, status);
this.begin = begin;
this.end = end;
this.featureGroup = featureGroup;
@@ -122,23 +133,93 @@ public class SequenceFeature
this.featureGroup = featureGroup;
}
- public boolean equals(SequenceFeature sf)
+ /**
+ * Two features are considered equal if they have the same type, group,
+ * description, start, end, score, phase, strand, and (if present) 'Name',
+ * 'ID' and 'Parent' attributes.
+ *
+ * Note we need to check Parent to distinguish the same exon occurring in
+ * different transcripts (in Ensembl GFF). This allows assembly of transcript
+ * sequences from their component exon regions.
+ *
+ * @param o
+ * the object to compare with this feature
+ * @return the result of equals(o, false), that is, with 'Parent' compared
+ */
+ @Override
+ public boolean equals(Object o)
+ {
+ return equals(o, false);
+ }
+
+ /**
+ * Overloaded method allows the equality test to optionally ignore the
+ * 'Parent' attribute of a feature. This supports avoiding adding many
+ * superficially duplicate 'exon' or CDS features to genomic or protein
+ * sequence.
+ *
+ * @param o
+ * @param ignoreParent
+ * @return
+ */
+ public boolean equals(Object o, boolean ignoreParent)
{
+ if (o == null || !(o instanceof SequenceFeature))
+ {
+ return false;
+ }
+
+ SequenceFeature sf = (SequenceFeature) o;
if (begin != sf.begin || end != sf.end || score != sf.score)
{
return false;
}
- if (!(type + description + featureGroup).equals(sf.type
- + sf.description + sf.featureGroup))
+ if (getStrand() != sf.getStrand())
{
return false;
}
+ if (!(type + description + featureGroup + getPhase()).equals(sf.type
+ + sf.description + sf.featureGroup + sf.getPhase()))
+ {
+ return false;
+ }
+ if (!equalAttribute(getValue("ID"), sf.getValue("ID")))
+ {
+ return false;
+ }
+ if (!equalAttribute(getValue("Name"), sf.getValue("Name")))
+ {
+ return false;
+ }
+ if (!ignoreParent)
+ {
+ if (!equalAttribute(getValue("Parent"), sf.getValue("Parent")))
+ {
+ return false;
+ }
+ }
return true;
}
/**
+ * Returns true if both values are null, or are both non-null and equal
+ *
+ * @param att1
+ * @param att2
+ * @return
+ */
+  protected static boolean equalAttribute(Object att1, Object att2)
+  {
+    if (att1 == null)
+    {
+      // two nulls match; otherwise ask the non-null value to compare itself
+      // with null
+      return att2 == null ? true : att2.equals(att1);
+    }
+    return att1.equals(att2);
+  }
+
+ /**
* DOCUMENT ME!
*
* @return DOCUMENT ME!
@@ -229,7 +310,7 @@ public class SequenceFeature
}
/**
- * Used for getting values which are not in the basic set. eg STRAND, FRAME
+ * Used for getting values which are not in the basic set. eg STRAND, PHASE
* for GFF file
*
* @param key
@@ -248,6 +329,20 @@ public class SequenceFeature
}
/**
+ * Returns a property value for the given key if known, else the specified
+ * default value
+ *
+ * @param key
+ * @param defaultValue
+ * @return
+ */
+  public Object getValue(String key, Object defaultValue)
+  {
+    Object found = getValue(key);
+    if (found != null)
+    {
+      return found;
+    }
+    // nothing held for this key, so fall back to the caller's default
+    return defaultValue;
+  }
+
+ /**
* Used for setting values which are not in the basic set. eg STRAND, FRAME
* for GFF file
*
@@ -262,7 +357,7 @@ public class SequenceFeature
{
if (otherDetails == null)
{
- otherDetails = new Hashtable();
+ otherDetails = new HashMap();
}
otherDetails.put(key, value);
@@ -275,20 +370,22 @@ public class SequenceFeature
*/
public void setStatus(String status)
{
- setValue("status", status);
+ setValue(STATUS, status);
}
public String getStatus()
{
- if (otherDetails != null)
- {
- String stat = (String) otherDetails.get("status");
- if (stat != null)
- {
- return new String(stat);
- }
- }
- return null;
+ return (String) getValue(STATUS);
+ }
+
+ /**
+ * Stores the given string as the value for the "ATTRIBUTES" key, using
+ * setValue
+ *
+ * @param attr
+ */
+ public void setAttributes(String attr)
+ {
+ setValue(ATTRIBUTES, attr);
+ }
+
+ /**
+ * Returns the value held for the "ATTRIBUTES" key, cast to String, or null
+ * if getValue returns null for that key
+ *
+ * @return
+ */
+ public String getAttributes()
+ {
+ return (String) getValue(ATTRIBUTES);
+ }
public void setPosition(int pos)
@@ -302,23 +399,67 @@ public class SequenceFeature
return begin;
}
+ /**
+ * Return 1 for forward strand ('+' in GFF), -1 for reverse strand ('-' in
+ * GFF), and 0 for unknown or not (validly) specified
+ *
+ * @return
+ */
public int getStrand()
{
- String str;
- if (otherDetails == null
- || (str = otherDetails.get("STRAND").toString()) == null)
- {
- return 0;
- }
- if (str.equals("-"))
- {
- return -1;
- }
- if (str.equals("+"))
+ int strand = 0;
+ if (otherDetails != null)
{
- return 1;
+ Object str = otherDetails.get(STRAND);
+ if ("-".equals(str))
+ {
+ strand = -1;
+ }
+ else if ("+".equals(str))
+ {
+ strand = 1;
+ }
}
- return 0;
+ return strand;
+ }
+
+ /**
+ * Stores the given string as the value for the "STRAND" key. Note that
+ * getStrand() only recognises the values "+" and "-", and reports any other
+ * value as 0.
+ *
+ * @param strand
+ */
+ public void setStrand(String strand)
+ {
+ setValue(STRAND, strand);
+ }
+
+ /**
+ * Stores the given string as the value for the private "!Phase" key (a key
+ * chosen so as not to conflict with real GFF data)
+ *
+ * @param phase
+ */
+ public void setPhase(String phase)
+ {
+ setValue(PHASE, phase);
+ }
+
+ public String getPhase()
+ {
+ return (String) getValue(PHASE);
}
+ /**
+ * Readable representation, for debug only, not guaranteed not to change
+ * between versions
+ *
+ * @return the begin, end, type and description, separated by single spaces
+ */
+ @Override
+ public String toString()
+ {
+ return String.format("%d %d %s %s", getBegin(), getEnd(), getType(),
+ getDescription());
+ }
+
+ /**
+ * Overridden to ensure that whenever two objects are equal, they have the
+ * same hashCode
+ *
+ * @return a value derived from type, description, feature group, the 'ID',
+ * 'Name' and 'Parent' values, phase, begin, end, score and strand
+ */
+ @Override
+ public int hashCode()
+ {
+ // the text properties are concatenated (a null contributes "null")
+ String s = getType() + getDescription() + getFeatureGroup()
+ + getValue("ID") + getValue("Name") + getValue("Parent")
+ + getPhase();
+ // the score contributes only its value after the cast to int
+ return s.hashCode() + getBegin() + getEnd() + (int) getScore()
+ + getStrand();
+ }
}
diff --git a/src/jalview/ext/ensembl/EnsemblCdna.java b/src/jalview/ext/ensembl/EnsemblCdna.java
new file mode 100644
index 0000000..856be74
--- /dev/null
+++ b/src/jalview/ext/ensembl/EnsemblCdna.java
@@ -0,0 +1,125 @@
+package jalview.ext.ensembl;
+
+import jalview.datamodel.SequenceFeature;
+import jalview.io.gff.SequenceOntologyFactory;
+import jalview.io.gff.SequenceOntologyI;
+
+import java.util.Arrays;
+import java.util.List;
+
+import com.stevesoft.pat.Regex;
+
+/**
+ * A client to fetch CDNA sequence from Ensembl (i.e. that part of the genomic
+ * sequence that is transcribed to RNA, but not necessarily translated to
+ * protein)
+ * 
+ * @author gmcarstairs
+ *
+ */
+public class EnsemblCdna extends EnsemblSeqProxy
+{
+  /*
+   * the database names returned by getCrossReferenceDatabases();
+   * 30/01/16 also found Vega_transcript, OTTT, ENS_LRG_transcript, UCSC,
+   * HGNC_trans_name, RefSeq_mRNA, RefSeq_mRNA_predicted
+   */
+  private static final List<String> CROSS_REFERENCES = Arrays.asList(
+          "Uniprot/SWISSPROT", "Uniprot/SPTREMBL");
+
+  /*
+   * accepts ENST or ENSG with 11 digits
+   * or ENSMUST or similar for other species
+   * or CCDS followed by at least 3 digits or dots (e.g. CCDSnnnnn.nn)
+   */
+  private static final Regex ACCESSION_REGEX = new Regex(
+          "(ENS([A-Z]{3}|)[TG][0-9]{11}$)" + "|" + "(CCDS[0-9.]{3,}$)");
+
+  /*
+   * fetch exon features on genomic sequence (to identify the cdna regions)
+   * and cds and variation features (to retain)
+   */
+  private static final EnsemblFeatureType[] FEATURES_TO_FETCH = {
+      EnsemblFeatureType.exon, EnsemblFeatureType.cds,
+      EnsemblFeatureType.variation };
+
+  /**
+   * Default constructor (to use rest.ensembl.org)
+   */
+  public EnsemblCdna()
+  {
+    super();
+  }
+
+  /**
+   * Constructor given the target domain to fetch data from
+   * 
+   * @param d
+   */
+  public EnsemblCdna(String d)
+  {
+    super(d);
+  }
+
+  @Override
+  public String getDbName()
+  {
+    return "ENSEMBL (CDNA)";
+  }
+
+  @Override
+  protected EnsemblSeqType getSourceEnsemblType()
+  {
+    return EnsemblSeqType.CDNA;
+  }
+
+  @Override
+  public Regex getAccessionValidator()
+  {
+    return ACCESSION_REGEX;
+  }
+
+  @Override
+  protected EnsemblFeatureType[] getFeaturesToFetch()
+  {
+    return FEATURES_TO_FETCH;
+  }
+
+  /**
+   * Answers false if the feature type is 'transcript' (or a sub-type in the
+   * Sequence Ontology), otherwise the result of featureMayBelong for the
+   * feature and accession id
+   */
+  @Override
+  protected boolean retainFeature(SequenceFeature sf, String accessionId)
+  {
+    if (isTranscript(sf.getType()))
+    {
+      return false;
+    }
+    return featureMayBelong(sf, accessionId);
+  }
+
+  /**
+   * Answers true if the sequence feature type is 'exon' (or a subtype of exon
+   * in the Sequence Ontology), and the Parent of the feature is the transcript
+   * we are retrieving
+   */
+  @Override
+  protected boolean identifiesSequence(SequenceFeature sf, String accId)
+  {
+    if (!SequenceOntologyFactory.getInstance().isA(sf.getType(),
+            SequenceOntologyI.EXON))
+    {
+      return false;
+    }
+    String parentFeature = (String) sf.getValue(PARENT);
+    return ("transcript:" + accId).equals(parentFeature);
+  }
+
+  @Override
+  protected List<String> getCrossReferenceDatabases()
+  {
+    return CROSS_REFERENCES;
+  }
+
+}
diff --git a/src/jalview/ext/ensembl/EnsemblCds.java b/src/jalview/ext/ensembl/EnsemblCds.java
new file mode 100644
index 0000000..2086eba
--- /dev/null
+++ b/src/jalview/ext/ensembl/EnsemblCds.java
@@ -0,0 +1,118 @@
+package jalview.ext.ensembl;
+
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
+import jalview.io.gff.SequenceOntologyFactory;
+import jalview.io.gff.SequenceOntologyI;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A client for direct fetching of CDS sequences from Ensembl (i.e. that part of
+ * the genomic sequence that is translated to protein)
+ * 
+ * TODO: not currently used as CDS sequences are computed from CDS features on
+ * transcripts - delete this class?
+ * 
+ * @author gmcarstairs
+ *
+ */
+public class EnsemblCds extends EnsemblSeqProxy
+{
+  /*
+   * fetch cds features on genomic sequence (to identify the CDS regions)
+   * and exon and variation features (to retain for display)
+   */
+  private static final EnsemblFeatureType[] FEATURES_TO_FETCH = {
+      EnsemblFeatureType.cds, EnsemblFeatureType.exon,
+      EnsemblFeatureType.variation };
+
+  /**
+   * Default constructor (to use rest.ensembl.org)
+   */
+  public EnsemblCds()
+  {
+    super();
+  }
+
+  /**
+   * Constructor given the target domain to fetch data from
+   * 
+   * @param d
+   */
+  public EnsemblCds(String d)
+  {
+    super(d);
+  }
+
+  @Override
+  public String getDbName()
+  {
+    return "ENSEMBL (CDS)";
+  }
+
+  @Override
+  protected EnsemblSeqType getSourceEnsemblType()
+  {
+    return EnsemblSeqType.CDS;
+  }
+
+  @Override
+  protected EnsemblFeatureType[] getFeaturesToFetch()
+  {
+    return FEATURES_TO_FETCH;
+  }
+
+  /**
+   * Answers false if the feature type is 'CDS' (or a sub-type of CDS in the
+   * Sequence Ontology), otherwise the result of featureMayBelong for the
+   * feature and accession id. CDS features are only retrieved in order to
+   * identify the cds sequence range, and are redundant information on the cds
+   * sequence itself.
+   */
+  @Override
+  protected boolean retainFeature(SequenceFeature sf, String accessionId)
+  {
+    if (isCds(sf))
+    {
+      return false;
+    }
+    return featureMayBelong(sf, accessionId);
+  }
+
+  /**
+   * Answers true if the sequence feature type is 'CDS' (or a subtype of CDS in
+   * the Sequence Ontology), and the Parent of the feature is the transcript we
+   * are retrieving
+   */
+  @Override
+  protected boolean identifiesSequence(SequenceFeature sf, String accId)
+  {
+    if (!isCds(sf))
+    {
+      return false;
+    }
+    String parentFeature = (String) sf.getValue(PARENT);
+    return ("transcript:" + accId).equals(parentFeature);
+  }
+
+  /**
+   * Answers true if the feature's type is 'CDS' or a sub-type of CDS in the
+   * Sequence Ontology
+   * 
+   * @param sf
+   * @return
+   */
+  private static boolean isCds(SequenceFeature sf)
+  {
+    return SequenceOntologyFactory.getInstance().isA(sf.getType(),
+            SequenceOntologyI.CDS);
+  }
+
+  /**
+   * Overrides this method to trivially return a range which is the whole of the
+   * nucleotide sequence. This is both faster than scanning for CDS features,
+   * and also means we don't need to keep CDS features on CDS sequence (where
+   * they are redundant information).
+   * 
+   * @param dnaSeq
+   * @return a list holding the single range [1, length of dnaSeq]
+   */
+  @Override
+  protected List<int[]> getCdsRanges(SequenceI dnaSeq)
+  {
+    List<int[]> ranges = new ArrayList<int[]>();
+    ranges.add(new int[] { 1, dnaSeq.getLength() });
+    return ranges;
+  }
+
+}
diff --git a/src/jalview/ext/ensembl/EnsemblFeatures.java b/src/jalview/ext/ensembl/EnsemblFeatures.java
new file mode 100644
index 0000000..0547433
--- /dev/null
+++ b/src/jalview/ext/ensembl/EnsemblFeatures.java
@@ -0,0 +1,141 @@
+package jalview.ext.ensembl;
+
+import jalview.datamodel.Alignment;
+import jalview.datamodel.AlignmentI;
+import jalview.io.FeaturesFile;
+import jalview.io.FileParse;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A client for fetching and processing Ensembl feature data in GFF format by
+ * calling the overlap REST service
+ * 
+ * @author gmcarstairs
+ * @see http://rest.ensembl.org/documentation/info/overlap_id
+ */
+class EnsemblFeatures extends EnsemblRestClient
+{
+  /*
+   * The default features to retrieve from Ensembl
+   * can override in getSequenceRecords parameter
+   */
+  private EnsemblFeatureType[] featuresWanted = { EnsemblFeatureType.cds,
+      EnsemblFeatureType.exon, EnsemblFeatureType.variation };
+
+  /**
+   * Default constructor (to use rest.ensembl.org)
+   */
+  public EnsemblFeatures()
+  {
+    super();
+  }
+
+  /**
+   * Constructor given the target domain to fetch data from
+   * 
+   * @param d
+   */
+  public EnsemblFeatures(String d)
+  {
+    super(d);
+  }
+
+  @Override
+  public String getDbName()
+  {
+    return "ENSEMBL (features)";
+  }
+
+  /**
+   * Makes a query to the REST overlap endpoint for the given sequence
+   * identifier. This returns an 'alignment' consisting of one 'dummy sequence'
+   * (the genomic sequence for which overlap features are returned by the
+   * service). This sequence will have on it sequence features which are the
+   * real information of interest, such as CDS regions or sequence variations.
+   */
+  @Override
+  public AlignmentI getSequenceRecords(String query) throws IOException
+  {
+    // TODO: use a vararg String... for getSequenceRecords instead?
+    List<String> queries = new ArrayList<String>();
+    queries.add(query);
+    FileParse fp = getSequenceReader(queries);
+    FeaturesFile fr = new FeaturesFile(fp);
+    return new Alignment(fr.getSeqsAsArray());
+  }
+
+  /**
+   * Returns a URL for the REST overlap endpoint. Only the first identifier in
+   * the list is used.
+   * 
+   * @param ids
+   * @return
+   */
+  @Override
+  protected URL getUrl(List ids) throws MalformedURLException
+  {
+    // a local, unshared buffer has no need of StringBuffer's synchronisation
+    StringBuilder urlstring = new StringBuilder(128);
+    urlstring.append(getDomain()).append("/overlap/id/")
+            .append(ids.get(0));
+
+    // @see https://github.com/Ensembl/ensembl-rest/wiki/Output-formats
+    urlstring.append("?content-type=text/x-gff3");
+
+    /*
+     * specify features to retrieve
+     * @see http://rest.ensembl.org/documentation/info/overlap_id
+     * could make the list a configurable entry in jalview.properties
+     */
+    for (EnsemblFeatureType feature : featuresWanted)
+    {
+      urlstring.append("&feature=").append(feature.name());
+    }
+
+    return new URL(urlstring.toString());
+  }
+
+  @Override
+  protected boolean useGetRequest()
+  {
+    return true;
+  }
+
+  /**
+   * Returns the MIME type for GFF3. For GET requests the Content-type header
+   * describes the required encoding of the response.
+   */
+  @Override
+  protected String getRequestMimeType(boolean multipleIds)
+  {
+    return "text/x-gff3";
+  }
+
+  /**
+   * Returns the MIME type for GFF3.
+   */
+  @Override
+  protected String getResponseMimeType()
+  {
+    return "text/x-gff3";
+  }
+
+  /**
+   * Overloaded method that allows a list of features to retrieve to be
+   * specified. Note that the given feature types replace those held by this
+   * object, so they also apply to any later request made with it (they are
+   * not reset to the defaults afterwards).
+   * 
+   * @param accId
+   * @param features
+   * @return
+   * @throws IOException
+   */
+  protected AlignmentI getSequenceRecords(String accId,
+          EnsemblFeatureType[] features) throws IOException
+  {
+    featuresWanted = features;
+    return getSequenceRecords(accId);
+  }
+}
diff --git a/src/jalview/ext/ensembl/EnsemblGene.java b/src/jalview/ext/ensembl/EnsemblGene.java
new file mode 100644
index 0000000..fa1e474
--- /dev/null
+++ b/src/jalview/ext/ensembl/EnsemblGene.java
@@ -0,0 +1,571 @@
+package jalview.ext.ensembl;
+
+import jalview.api.FeatureColourI;
+import jalview.api.FeatureSettingsModelI;
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
+import jalview.io.gff.SequenceOntologyFactory;
+import jalview.io.gff.SequenceOntologyI;
+import jalview.schemes.FeatureColourAdapter;
+import jalview.schemes.FeatureSettingsAdapter;
+import jalview.util.MapList;
+import jalview.util.StringUtils;
+
+import java.awt.Color;
+import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import com.stevesoft.pat.Regex;
+
+/**
+ * A class that fetches genomic sequence and all transcripts for an Ensembl gene
+ * 
+ * @author gmcarstairs
+ */
+public class EnsemblGene extends EnsemblSeqProxy
+{
+  private static final List<String> CROSS_REFERENCES = Arrays
+          .asList("CCDS");
+
+  private static final String GENE_PREFIX = "gene:";
+
+  /*
+   * accepts anything as we will attempt lookup of gene or 
+   * transcript id or gene name
+   */
+  private static final Regex ACCESSION_REGEX = new Regex(".*");
+
+  private static final EnsemblFeatureType[] FEATURES_TO_FETCH = {
+      EnsemblFeatureType.gene, EnsemblFeatureType.transcript,
+      EnsemblFeatureType.exon, EnsemblFeatureType.cds,
+      EnsemblFeatureType.variation };
+
+  /**
+   * Default constructor (to use rest.ensembl.org)
+   */
+  public EnsemblGene()
+  {
+    super();
+  }
+
+  /**
+   * Constructor given the target domain to fetch data from
+   * 
+   * @param d
+   */
+  public EnsemblGene(String d)
+  {
+    super(d);
+  }
+
+  @Override
+  public String getDbName()
+  {
+    return "ENSEMBL";
+  }
+
+  @Override
+  protected EnsemblFeatureType[] getFeaturesToFetch()
+  {
+    return FEATURES_TO_FETCH;
+  }
+
+  @Override
+  protected EnsemblSeqType getSourceEnsemblType()
+  {
+    return EnsemblSeqType.GENOMIC;
+  }
+
+  /**
+   * Returns an alignment containing the gene(s) for the given gene or
+   * transcript identifier, or external identifier (e.g. Uniprot id). If given a
+   * gene name or external identifier, returns any related gene sequences found
+   * for model organisms. If only a single gene is queried for, then its
+   * transcripts are also retrieved and added to the alignment.
+   * <p>
+   * Method:
+   * <ul>
+   * <li>resolves a transcript identifier by looking up its parent gene id</li>
+   * <li>resolves an external identifier by looking up xref-ed gene ids</li>
+   * <li>fetches the gene sequence</li>
+   * <li>fetches features on the sequence</li>
+   * <li>identifies "transcript" features whose Parent is the requested gene</li>
+   * <li>fetches the transcript sequence for each transcript</li>
+   * <li>makes a mapping from the gene to each transcript</li>
+   * <li>copies features from gene to transcript sequences</li>
+   * <li>fetches the protein sequence for each transcript, maps and saves it as
+   * a cross-reference</li>
+   * <li>aligns each transcript against the gene sequence based on the position
+   * mappings</li>
+   * </ul>
+   * 
+   * @param query
+   *          one or more identifiers separated by a space
+   * @return an alignment containing one or more genes, and possibly
+   *         transcripts, or null
+   */
+  @Override
+  public AlignmentI getSequenceRecords(String query) throws Exception
+  {
+    // todo: tidy up handling of one or multiple accession ids
+    String[] queries = query.split(getAccessionSeparator());
+
+    /*
+     * if given a transcript id, look up its gene parent
+     */
+    if (isTranscriptIdentifier(query))
+    {
+      // we are assuming all transcripts have the same gene parent here
+      query = new EnsemblLookup(getDomain()).getParent(queries[0]);
+      if (query == null)
+      {
+        return null;
+      }
+    }
+
+    /*
+     * if given a gene or other external name, lookup and fetch 
+     * the corresponding gene for all model organisms 
+     */
+    if (!isGeneIdentifier(query))
+    {
+      List<String> geneIds = new EnsemblSymbol(getDomain()).getIds(query);
+      // a failed lookup may give null (as allowed for in
+      // getGeneIdentifiersForName) as well as an empty list
+      if (geneIds == null || geneIds.isEmpty())
+      {
+        return null;
+      }
+      String theIds = StringUtils.listToDelimitedString(geneIds,
+              getAccessionSeparator());
+      return getSequenceRecords(theIds);
+    }
+
+    /*
+     * fetch the gene sequence(s) with features and xrefs
+     */
+    AlignmentI al = super.getSequenceRecords(query);
+
+    /*
+     * if we retrieved a single gene, get its transcripts as well
+     * (guarding against nothing having been retrieved)
+     */
+    if (al != null && al.getHeight() == 1)
+    {
+      getTranscripts(al, query);
+    }
+
+    return al;
+  }
+
+  /**
+   * Attempts to get Ensembl stable identifiers for model organisms for a gene
+   * name by calling the xrefs symbol REST service to resolve the gene name.
+   * 
+   * @param query
+   * @return the first identifier found which is a gene identifier, or null if
+   *         there is none
+   */
+  protected String getGeneIdentifiersForName(String query)
+  {
+    List<String> ids = new EnsemblSymbol(getDomain()).getIds(query);
+    if (ids != null)
+    {
+      for (String id : ids)
+      {
+        if (isGeneIdentifier(id))
+        {
+          return id;
+        }
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Constructs all transcripts for the gene, as identified by "transcript"
+   * features whose Parent is the requested gene. The coding transcript
+   * sequences (i.e. with introns omitted) are added to the alignment.
+   * 
+   * @param al
+   * @param accId
+   * @throws Exception
+   */
+  protected void getTranscripts(AlignmentI al, String accId)
+          throws Exception
+  {
+    SequenceI gene = al.getSequenceAt(0);
+    List<SequenceFeature> transcriptFeatures = getTranscriptFeatures(accId,
+            gene);
+
+    for (SequenceFeature transcriptFeature : transcriptFeatures)
+    {
+      makeTranscript(transcriptFeature, al, gene);
+    }
+
+    clearGeneFeatures(gene);
+  }
+
+  /**
+   * Remove unwanted features (transcript, exon, CDS) from the gene sequence
+   * after we have used them to derive transcripts and transfer features
+   * 
+   * @param gene
+   */
+  protected void clearGeneFeatures(SequenceI gene)
+  {
+    SequenceFeature[] sfs = gene.getSequenceFeatures();
+    if (sfs == null)
+    {
+      return;
+    }
+
+    SequenceOntologyI so = SequenceOntologyFactory.getInstance();
+    List<SequenceFeature> filtered = new ArrayList<SequenceFeature>();
+    for (SequenceFeature sf : sfs)
+    {
+      String type = sf.getType();
+      if (!isTranscript(type) && !so.isA(type, SequenceOntologyI.EXON)
+              && !so.isA(type, SequenceOntologyI.CDS))
+      {
+        filtered.add(sf);
+      }
+    }
+    gene.setSequenceFeatures(filtered
+            .toArray(new SequenceFeature[filtered.size()]));
+  }
+
+  /**
+   * Constructs a spliced transcript sequence by finding 'exon' features for the
+   * given id (or failing that 'CDS'). Copies features on to the new sequence.
+   * 'Aligns' the new sequence against the gene sequence by padding with gaps,
+   * and adds it to the alignment.
+   * 
+   * @param transcriptFeature
+   * @param al
+   *          the alignment to which to add the new sequence
+   * @param gene
+   *          the parent gene sequence, with features
+   * @return the new transcript sequence, or null if the feature has no
+   *         'transcript_id' value
+   */
+  SequenceI makeTranscript(SequenceFeature transcriptFeature,
+          AlignmentI al, SequenceI gene)
+  {
+    String accId = getTranscriptId(transcriptFeature);
+    if (accId == null)
+    {
+      return null;
+    }
+
+    /*
+     * NB we are mapping from gene sequence (not genome), so do not
+     * need to check for reverse strand (gene and transcript sequences 
+     * are in forward sense)
+     */
+
+    /*
+     * make a gene-length sequence filled with gaps
+     * we will fill in the bases for transcript regions
+     */
+    char[] seqChars = new char[gene.getLength()];
+    Arrays.fill(seqChars, al.getGapCharacter());
+
+    /*
+     * look for exon features of the transcript, failing that for CDS
+     * (for example ENSG00000124610 has 1 CDS but no exon features)
+     */
+    String parentId = "transcript:" + accId;
+    List<SequenceFeature> splices = findFeatures(gene,
+            SequenceOntologyI.EXON, parentId);
+    if (splices.isEmpty())
+    {
+      splices = findFeatures(gene, SequenceOntologyI.CDS, parentId);
+    }
+
+    int transcriptLength = 0;
+    final char[] geneChars = gene.getSequence();
+    int offset = gene.getStart(); // to convert to 0-based positions
+    List<int[]> mappedFrom = new ArrayList<int[]>();
+
+    for (SequenceFeature sf : splices)
+    {
+      int start = sf.getBegin() - offset;
+      int end = sf.getEnd() - offset;
+      int spliceLength = end - start + 1;
+      System.arraycopy(geneChars, start, seqChars, start, spliceLength);
+      transcriptLength += spliceLength;
+      mappedFrom.add(new int[] { sf.getBegin(), sf.getEnd() });
+    }
+
+    Sequence transcript = new Sequence(accId, seqChars, 1, transcriptLength);
+
+    /*
+     * Ensembl has gene name as transcript Name
+     * EnsemblGenomes doesn't, but has a url-encoded description field
+     */
+    String description = (String) transcriptFeature.getValue(NAME);
+    if (description == null)
+    {
+      description = (String) transcriptFeature.getValue(DESCRIPTION);
+    }
+    if (description != null)
+    {
+      try
+      {
+        transcript.setDescription(URLDecoder.decode(description, "UTF-8"));
+      } catch (UnsupportedEncodingException e)
+      {
+        e.printStackTrace(); // as if
+      } catch (IllegalArgumentException e)
+      {
+        // thrown for a malformed %-escape, i.e. the text is not url-encoded
+        // after all (e.g. a name with a literal '%'); use it as supplied
+        transcript.setDescription(description);
+      }
+    }
+    transcript.createDatasetSequence();
+
+    al.addSequence(transcript);
+
+    /*
+     * transfer features to the new sequence; we use EnsemblCdna to do this,
+     * to filter out unwanted features types (see method retainFeature)
+     */
+    List<int[]> mapTo = new ArrayList<int[]>();
+    mapTo.add(new int[] { 1, transcriptLength });
+    MapList mapping = new MapList(mappedFrom, mapTo, 1, 1);
+    EnsemblCdna cdna = new EnsemblCdna(getDomain());
+    cdna.transferFeatures(gene.getSequenceFeatures(),
+            transcript.getDatasetSequence(), mapping, parentId);
+
+    /*
+     * fetch and save cross-references
+     */
+    cdna.getCrossReferences(transcript);
+
+    /*
+     * and finally fetch the protein product and save as a cross-reference
+     */
+    cdna.addProteinProduct(transcript);
+
+    return transcript;
+  }
+
+  /**
+   * Returns the 'transcript_id' property of the sequence feature (or null)
+   * 
+   * @param feature
+   * @return
+   */
+  protected String getTranscriptId(SequenceFeature feature)
+  {
+    return (String) feature.getValue("transcript_id");
+  }
+
+  /**
+   * Returns a list of the transcript features on the sequence whose Parent is
+   * the gene for the accession id.
+   * 
+   * @param accId
+   * @param geneSequence
+   * @return a (possibly empty) list, never null
+   */
+  protected List<SequenceFeature> getTranscriptFeatures(String accId,
+          SequenceI geneSequence)
+  {
+    List<SequenceFeature> transcriptFeatures = new ArrayList<SequenceFeature>();
+
+    String parentIdentifier = GENE_PREFIX + accId;
+    SequenceFeature[] sfs = geneSequence.getSequenceFeatures();
+
+    if (sfs != null)
+    {
+      for (SequenceFeature sf : sfs)
+      {
+        if (isTranscript(sf.getType()))
+        {
+          String parent = (String) sf.getValue(PARENT);
+          if (parentIdentifier.equals(parent))
+          {
+            transcriptFeatures.add(sf);
+          }
+        }
+      }
+    }
+
+    return transcriptFeatures;
+  }
+
+  @Override
+  public String getDescription()
+  {
+    return "Fetches all transcripts and variant features for a gene or transcript";
+  }
+
+  /**
+   * Default test query is a gene id (can also enter a transcript id)
+   */
+  @Override
+  public String getTestQuery()
+  {
+    return "ENSG00000157764"; // BRAF, 5 transcripts, reverse strand
+    // ENSG00000090266 // NDUFB2, 15 transcripts, forward strand
+    // ENSG00000101812 // H2BFM histone, 3 transcripts, forward strand
+    // ENSG00000123569 // H2BFWT histone, 2 transcripts, reverse strand
+  }
+
+  /**
+   * Answers true for a feature of type 'gene' (or a sub-type of gene in the
+   * Sequence Ontology), whose ID is the accession we are retrieving
+   */
+  @Override
+  protected boolean identifiesSequence(SequenceFeature sf, String accId)
+  {
+    if (SequenceOntologyFactory.getInstance().isA(sf.getType(),
+            SequenceOntologyI.GENE))
+    {
+      String id = (String) sf.getValue(ID);
+      if ((GENE_PREFIX + accId).equals(id))
+      {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Answers true unless feature type is 'gene', or 'transcript' with a parent
+   * which is a different gene. We need the gene features to identify the range,
+   * but it is redundant information on the gene sequence. Checking the parent
+   * allows us to drop transcript features which belong to different
+   * (overlapping) genes.
+   */
+  @Override
+  protected boolean retainFeature(SequenceFeature sf, String accessionId)
+  {
+    SequenceOntologyI so = SequenceOntologyFactory.getInstance();
+    String type = sf.getType();
+    if (so.isA(type, SequenceOntologyI.GENE))
+    {
+      return false;
+    }
+    if (isTranscript(type))
+    {
+      String parent = (String) sf.getValue(PARENT);
+      if (!(GENE_PREFIX + accessionId).equals(parent))
+      {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Answers false. This allows an optimisation - a single 'gene' feature is all
+   * that is needed to identify the positions of the gene on the genomic
+   * sequence.
+   */
+  @Override
+  protected boolean isSpliceable()
+  {
+    return false;
+  }
+
+  @Override
+  protected List<String> getCrossReferenceDatabases()
+  {
+    // found these for ENSG00000157764 on 30/01/2016:
+    // return new String[] {"Vega_gene", "OTTG", "ENS_LRG_gene", "ArrayExpress",
+    // "EntrezGene", "HGNC", "MIM_GENE", "MIM_MORBID", "WikiGene"};
+    return CROSS_REFERENCES;
+  }
+
+  /**
+   * Override to do nothing as Ensembl doesn't return a protein sequence for a
+   * gene identifier
+   */
+  @Override
+  protected void addProteinProduct(SequenceI querySeq)
+  {
+  }
+
+  @Override
+  public Regex getAccessionValidator()
+  {
+    return ACCESSION_REGEX;
+  }
+
+  /**
+   * Returns a descriptor for suitable feature display settings with
+   * <ul>
+   * <li>only exon or sequence_variant features (or their subtypes in the
+   * Sequence Ontology) visible</li>
+   * <li>variant features coloured red</li>
+   * <li>exon features coloured by label (exon name)</li>
+   * <li>variants displayed above (on top of) exons</li>
+   * </ul>
+   */
+  @Override
+  public FeatureSettingsModelI getFeatureColourScheme()
+  {
+    return new FeatureSettingsAdapter()
+    {
+      SequenceOntologyI so = SequenceOntologyFactory.getInstance();
+
+      @Override
+      public boolean isFeatureDisplayed(String type)
+      {
+        return (so.isA(type, SequenceOntologyI.EXON) || so.isA(type,
+                SequenceOntologyI.SEQUENCE_VARIANT));
+      }
+
+      @Override
+      public FeatureColourI getFeatureColour(String type)
+      {
+        if (so.isA(type, SequenceOntologyI.EXON))
+        {
+          return new FeatureColourAdapter()
+          {
+            @Override
+            public boolean isColourByLabel()
+            {
+              return true;
+            }
+          };
+        }
+        if (so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT))
+        {
+          return new FeatureColourAdapter()
+          {
+
+            @Override
+            public Color getColour()
+            {
+              return Color.RED;
+            }
+          };
+        }
+        return null;
+      }
+
+      /**
+       * order to render sequence_variant after exon after the rest; two
+       * features in the same one of these three groups compare as equal (as
+       * the Comparator contract requires compare(a, b) and compare(b, a) to
+       * have opposite signs)
+       */
+      @Override
+      public int compare(String feature1, String feature2)
+      {
+        int rank1 = renderRank(feature1);
+        int rank2 = renderRank(feature2);
+        return rank1 < rank2 ? -1 : (rank1 == rank2 ? 0 : 1);
+      }
+
+      /**
+       * Answers 2 for sequence_variant (or sub-type), 1 for exon (or
+       * sub-type), 0 for anything else; higher ranks are ordered later
+       */
+      private int renderRank(String type)
+      {
+        if (so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT))
+        {
+          return 2;
+        }
+        if (so.isA(type, SequenceOntologyI.EXON))
+        {
+          return 1;
+        }
+        return 0;
+      }
+    };
+  }
+
+}
diff --git a/src/jalview/ext/ensembl/EnsemblGenome.java b/src/jalview/ext/ensembl/EnsemblGenome.java
new file mode 100644
index 0000000..20987e1
--- /dev/null
+++ b/src/jalview/ext/ensembl/EnsemblGenome.java
@@ -0,0 +1,95 @@
+package jalview.ext.ensembl;
+
+import jalview.datamodel.SequenceFeature;
+
+/**
+ * A client to fetch genomic sequence from Ensembl
+ *
+ * TODO: not currently used - delete?
+ *
+ * @author gmcarstairs
+ *
+ */
+public class EnsemblGenome extends EnsemblSeqProxy
+{
+ /*
+ * fetch transcript features on genomic sequence (to identify the transcript
+ * regions) and cds, exon and variation features (to retain)
+ */
+ private static final EnsemblFeatureType[] FEATURES_TO_FETCH = {
+ EnsemblFeatureType.transcript, EnsemblFeatureType.exon,
+ EnsemblFeatureType.cds, EnsemblFeatureType.variation };
+
+ /**
+ * Default constructor (to use rest.ensembl.org)
+ */
+ public EnsemblGenome()
+ {
+ super();
+ }
+
+ /**
+ * Constructor given the target domain to fetch data from
+ *
+ * @param d
+ */
+ public EnsemblGenome(String d)
+ {
+ super(d);
+ }
+
+ @Override
+ public String getDbName()
+ {
+ return "ENSEMBL (Genomic)";
+ }
+
+ @Override
+ protected EnsemblSeqType getSourceEnsemblType()
+ {
+ return EnsemblSeqType.GENOMIC;
+ }
+
+ @Override
+ protected EnsemblFeatureType[] getFeaturesToFetch()
+ {
+ return FEATURES_TO_FETCH;
+ }
+
+ /**
+ * Answers true unless the feature type is 'transcript' (or a sub-type of
+ * transcript in the Sequence Ontology), or has a parent other than the given
+ * accession id. Transcript features are only retrieved in order to identify
+ * the transcript sequence range, and are redundant information on the
+ * transcript sequence itself.
+ */
+ @Override
+ protected boolean retainFeature(SequenceFeature sf, String accessionId)
+ {
+ if (isTranscript(sf.getType()))
+ {
+ return false;
+ }
+ return featureMayBelong(sf, accessionId);
+ }
+
+ /**
+ * Answers true if the sequence feature type is 'transcript' (or a subtype of
+ * transcript in the Sequence Ontology), and the ID of the feature is the
+ * transcript we are retrieving
+ */
+ @Override
+ protected boolean identifiesSequence(SequenceFeature sf, String accId)
+ {
+ if (isTranscript(sf.getType()))
+ {
+ String id = (String) sf.getValue(ID);
+ if (("transcript:" + accId).equals(id))
+ {
+ return true;
+ }
+ }
+ return false;
+ }
+
+}
diff --git a/src/jalview/ext/ensembl/EnsemblGenomes.java b/src/jalview/ext/ensembl/EnsemblGenomes.java
new file mode 100644
index 0000000..9ba2e1c
--- /dev/null
+++ b/src/jalview/ext/ensembl/EnsemblGenomes.java
@@ -0,0 +1,46 @@
+package jalview.ext.ensembl;
+
+
+/**
+ * A class to behave much like EnsemblGene but referencing the ensemblgenomes
+ * domain and data
+ *
+ * @author gmcarstairs
+ *
+ */
+public class EnsemblGenomes extends EnsemblGene
+{
+ /**
+ * Constructor sets domain to rest.ensemblgenomes.org instead of the 'usual'
+ * rest.ensembl.org
+ */
+ public EnsemblGenomes()
+ {
+ super(ENSEMBL_GENOMES_REST);
+ }
+
+ @Override
+ public boolean isGeneIdentifier(String query)
+ {
+ return true;
+ }
+
+ @Override
+ public String getDbName()
+ {
+ return "EnsemblGenomes";
+ }
+
+ @Override
+ public String getTestQuery()
+ {
+ return "DDB_G0283883";
+ }
+
+ @Override
+ public String getDbSource()
+ {
+ return "EnsemblGenomes";
+ }
+
+}
diff --git a/src/jalview/ext/ensembl/EnsemblLookup.java b/src/jalview/ext/ensembl/EnsemblLookup.java
new file mode 100644
index 0000000..c5945ae
--- /dev/null
+++ b/src/jalview/ext/ensembl/EnsemblLookup.java
@@ -0,0 +1,160 @@
+package jalview.ext.ensembl;
+
+import jalview.datamodel.AlignmentI;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.List;
+
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.json.simple.parser.ParseException;
+
+/**
+ * A client for the Ensembl lookup REST endpoint; used to find the Parent gene
+ * identifier given a transcript identifier.
+ *
+ * @author gmcarstairs
+ *
+ */
+public class EnsemblLookup extends EnsemblRestClient
+{
+
+ /**
+ * Default constructor (to use rest.ensembl.org)
+ */
+ public EnsemblLookup()
+ {
+ super();
+ }
+
+ /**
+ * Constructor given the target domain to fetch data from
+ *
+ * @param
+ */
+ public EnsemblLookup(String d)
+ {
+ super(d);
+ }
+
+ @Override
+ public String getDbName()
+ {
+ return "ENSEMBL";
+ }
+
+ @Override
+ public AlignmentI getSequenceRecords(String queries) throws Exception
+ {
+ return null;
+ }
+
+ @Override
+ protected URL getUrl(List ids) throws MalformedURLException
+ {
+ String identifier = ids.get(0);
+ return getUrl(identifier);
+ }
+
+ /**
+ * @param identifier
+ * @return
+ */
+ protected URL getUrl(String identifier)
+ {
+ String url = getDomain() + "/lookup/id/" + identifier
+ + "?content-type=application/json";
+ try
+ {
+ return new URL(url);
+ } catch (MalformedURLException e)
+ {
+ return null;
+ }
+ }
+
+ @Override
+ protected boolean useGetRequest()
+ {
+ return true;
+ }
+
+ @Override
+ protected String getRequestMimeType(boolean multipleIds)
+ {
+ return "application/json";
+ }
+
+ @Override
+ protected String getResponseMimeType()
+ {
+ return "application/json";
+ }
+
+ /**
+ * Calls the Ensembl lookup REST endpoint and retrieves the 'Parent' for the
+ * given identifier, or null if not found
+ *
+ * @param identifier
+ * @return
+ */
+ public String getParent(String identifier)
+ {
+ List ids = Arrays.asList(new String[] { identifier });
+
+ BufferedReader br = null;
+ try
+ {
+ URL url = getUrl(identifier);
+ if (url != null)
+ {
+ br = getHttpResponse(url, ids);
+ }
+ return (parseResponse(br));
+ } catch (IOException e)
+ {
+ // ignore
+ return null;
+ } finally
+ {
+ if (br != null)
+ {
+ try
+ {
+ br.close();
+ } catch (IOException e)
+ {
+ // ignore
+ }
+ }
+ }
+ }
+
+ /**
+ * Parses "Parent" from the JSON response and returns the value, or null if
+ * not found
+ *
+ * @param br
+ * @return
+ * @throws IOException
+ */
+ protected String parseResponse(BufferedReader br) throws IOException
+ {
+ String parent = null;
+ JSONParser jp = new JSONParser();
+ try
+ {
+ JSONObject val = (JSONObject) jp.parse(br);
+ parent = val.get("Parent").toString();
+ } catch (ParseException e)
+ {
+ // ignore
+ }
+ return parent;
+ }
+
+}
diff --git a/src/jalview/ext/ensembl/EnsemblProtein.java b/src/jalview/ext/ensembl/EnsemblProtein.java
new file mode 100644
index 0000000..97796a5
--- /dev/null
+++ b/src/jalview/ext/ensembl/EnsemblProtein.java
@@ -0,0 +1,137 @@
+package jalview.ext.ensembl;
+
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.SequenceFeature;
+
+import java.util.Arrays;
+import java.util.List;
+
+import com.stevesoft.pat.Regex;
+
+/**
+ * A client to fetch protein translated sequence for an Ensembl identifier
+ *
+ * @author gmcarstairs
+ *
+ */
+public class EnsemblProtein extends EnsemblSeqProxy
+{
+ /*
+ * accepts ENSP with 11 digits
+ * or ENSMUSP or similar for other species
+ * or CCDSnnnnn.nn with at least 3 digits
+ */
+ private static final Regex ACCESSION_REGEX = new Regex(
+ "(ENS([A-Z]{3}|)P[0-9]{11}$)" + "|" + "(CCDS[0-9.]{3,}$)");
+
+ private static final List CROSSREFS = Arrays.asList(new String[] {
+ "PDB", "Uniprot/SPTREMBL", "Uniprot/SWISSPROT" });
+
+ /**
+ * Default constructor (to use rest.ensembl.org)
+ */
+ public EnsemblProtein()
+ {
+ super();
+ }
+
+ /**
+ * Constructor given the target domain to fetch data from
+ *
+ * @param d
+ */
+ public EnsemblProtein(String d)
+ {
+ super(d);
+ }
+
+ @Override
+ public String getDbName()
+ {
+ return "ENSEMBL (Protein)";
+ }
+
+ @Override
+ protected EnsemblSeqType getSourceEnsemblType()
+ {
+ return EnsemblSeqType.PROTEIN;
+ }
+
+ /**
+ * Returns false, as this fetcher does not retrieve DNA sequences.
+ */
+ @Override
+ public boolean isDnaCoding()
+ {
+ return false;
+ }
+
+ /**
+ * Test query is to the protein translation of transcript ENST00000288602
+ */
+ @Override
+ public String getTestQuery()
+ {
+ return "ENSP00000288602";
+ }
+
+ /**
+ * Overrides base class method to do nothing - genomic features are not
+ * applicable to the protein product sequence
+ */
+ @Override
+ protected void addFeaturesAndProduct(String accId, AlignmentI alignment)
+ {
+ }
+
+ @Override
+ protected EnsemblFeatureType[] getFeaturesToFetch()
+ {
+ // not applicable - can't fetch genomic features for a protein sequence
+ return null;
+ }
+
+ @Override
+ protected boolean identifiesSequence(SequenceFeature sf, String accId)
+ {
+ // not applicable - protein sequence is not a 'subset' of genomic sequence
+ return false;
+ }
+
+ @Override
+ protected List getCrossReferenceDatabases()
+ {
+ return CROSSREFS;
+ }
+
+ @Override
+ public Regex getAccessionValidator()
+ {
+ return ACCESSION_REGEX;
+ }
+
+ /**
+ * Returns an accession id for a query, including conversion of ENST* to
+ * ENSP*. This supports querying for the protein sequence for a transcript
+ * (ENST identifier) and returning the ENSP identifier.
+ */
+ @Override
+ public String getAccessionIdFromQuery(String query)
+ {
+ String accId = super.getAccessionIdFromQuery(query);
+
+ /*
+ * ensure last character before (11) digits is P
+ * ENST00000288602 -> ENSP00000288602
+ * ENSMUST00000288602 -> ENSMUSP00000288602
+ */
+ if (accId != null && accId.length() >= 12)
+ {
+ char[] chars = accId.toCharArray();
+ chars[chars.length - 12] = 'P';
+ accId = new String(chars);
+ }
+ return accId;
+ }
+
+}
diff --git a/src/jalview/ext/ensembl/EnsemblRestClient.java b/src/jalview/ext/ensembl/EnsemblRestClient.java
new file mode 100644
index 0000000..34f8816
--- /dev/null
+++ b/src/jalview/ext/ensembl/EnsemblRestClient.java
@@ -0,0 +1,361 @@
+package jalview.ext.ensembl;
+
+import jalview.io.FileParse;
+
+import java.io.BufferedReader;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.HttpURLConnection;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.List;
+
+import javax.ws.rs.HttpMethod;
+
+import com.stevesoft.pat.Regex;
+
+/**
+ * Base class for Ensembl REST service clients
+ *
+ * @author gmcarstairs
+ */
+abstract class EnsemblRestClient extends EnsemblSequenceFetcher
+{
+ private final static String ENSEMBL_REST = "http://rest.ensembl.org";
+
+ protected final static String ENSEMBL_GENOMES_REST = "http://rest.ensemblgenomes.org";
+
+ // @see https://github.com/Ensembl/ensembl-rest/wiki/Output-formats
+ private static final String PING_URL = "http://rest.ensembl.org/info/ping.json";
+
+ private final static long RETEST_INTERVAL = 10000L; // 10 seconds
+
+ private static final Regex TRANSCRIPT_REGEX = new Regex(
+ "(ENS)([A-Z]{3}|)T[0-9]{11}$");
+
+ private static final Regex GENE_REGEX = new Regex(
+ "(ENS)([A-Z]{3}|)G[0-9]{11}$");
+
+ private String domain = ENSEMBL_REST;
+
+ private static boolean ensemblRestAvailable = false;
+
+ private static long lastCheck = -1;
+
+ /*
+ * absolute time to wait till if we overloaded the REST service
+ */
+ private static long retryAfter;
+
+ protected volatile boolean inProgress = false;
+
+ /**
+ * Default constructor to use rest.ensembl.org
+ */
+ public EnsemblRestClient()
+ {
+ this(ENSEMBL_REST);
+ }
+
+ /**
+ * Constructor given the target domain to fetch data from
+ *
+ * @param d
+ */
+ public EnsemblRestClient(String d)
+ {
+ domain = d;
+ }
+
+ String getDomain()
+ {
+ return domain;
+ }
+
+ void setDomain(String d)
+ {
+ domain = d;
+ }
+
+ public boolean isTranscriptIdentifier(String query)
+ {
+ return query == null ? false : TRANSCRIPT_REGEX.search(query);
+ }
+
+ public boolean isGeneIdentifier(String query)
+ {
+ return query == null ? false : GENE_REGEX.search(query);
+ }
+
+ @Override
+ public boolean queryInProgress()
+ {
+ return inProgress;
+ }
+
+ @Override
+ public StringBuffer getRawRecords()
+ {
+ return null;
+ }
+
+ /**
+ * Returns the URL for the client http request
+ *
+ * @param ids
+ * @return
+ * @throws MalformedURLException
+ */
+ protected abstract URL getUrl(List ids)
+ throws MalformedURLException;
+
+ /**
+ * Returns true if client uses GET method, false if it uses POST
+ *
+ * @return
+ */
+ protected abstract boolean useGetRequest();
+
+ /**
+ * Return the desired value for the Content-Type request header
+ *
+ * @param multipleIds
+ *
+ * @return
+ * @see https://github.com/Ensembl/ensembl-rest/wiki/HTTP-Headers
+ */
+ protected abstract String getRequestMimeType(boolean multipleIds);
+
+ /**
+ * Return the desired value for the Accept request header
+ *
+ * @return
+ * @see https://github.com/Ensembl/ensembl-rest/wiki/HTTP-Headers
+ */
+ protected abstract String getResponseMimeType();
+
+ /**
+ * Tries to connect to Ensembl's REST 'ping' endpoint, and returns true if
+ * successful, else false
+ *
+ * @return
+ */
+ private boolean checkEnsembl()
+ {
+ try
+ {
+ // note this format works for both ensembl and ensemblgenomes
+ // info/ping.json works for ensembl only (March 2016)
+ URL ping = new URL(getDomain()
+ + "/info/ping?content-type=application/json");
+ HttpURLConnection conn = (HttpURLConnection) ping.openConnection();
+ int rc = conn.getResponseCode();
+ conn.disconnect();
+ if (rc >= 200 && rc < 300)
+ {
+ return true;
+ }
+ } catch (Throwable t)
+ {
+ System.err.println("Error connecting to " + PING_URL + ": "
+ + t.getMessage());
+ }
+ return false;
+ }
+
+ /**
+ * returns a reader to a Fasta response from the Ensembl sequence endpoint
+ *
+ * @param ids
+ * @return
+ * @throws IOException
+ */
+ protected FileParse getSequenceReader(List ids)
+ throws IOException
+ {
+ URL url = getUrl(ids);
+
+ BufferedReader reader = getHttpResponse(url, ids);
+ FileParse fp = new FileParse(reader, url.toString(), "HTTP_POST");
+ return fp;
+ }
+
+ /**
+ * Writes the HTTP request and gets the response as a reader.
+ *
+ * @param url
+ * @param ids
+ * written as Json POST body if more than one
+ * @return
+ * @throws IOException
+ * if response code was not 200, or other I/O error
+ */
+ protected BufferedReader getHttpResponse(URL url, List ids)
+ throws IOException
+ {
+ // long now = System.currentTimeMillis();
+ HttpURLConnection connection = (HttpURLConnection) url.openConnection();
+
+ /*
+ * POST method allows multiple queries in one request; it is supported for
+ * sequence queries, but not for overlap
+ */
+ boolean multipleIds = ids.size() > 1;// useGetRequest();
+ connection.setRequestMethod(multipleIds ? HttpMethod.POST
+ : HttpMethod.GET);
+ connection.setRequestProperty("Content-Type",
+ getRequestMimeType(multipleIds));
+ connection.setRequestProperty("Accept", getResponseMimeType());
+
+ connection.setUseCaches(false);
+ connection.setDoInput(true);
+ connection.setDoOutput(multipleIds);
+
+ if (multipleIds)
+ {
+ writePostBody(connection, ids);
+ }
+
+ InputStream response = connection.getInputStream();
+ int responseCode = connection.getResponseCode();
+
+ if (responseCode != 200)
+ {
+ /*
+ * note: a GET request for an invalid id returns an error code e.g. 415
+ * but POST request returns 200 and an empty Fasta response
+ */
+ throw new IOException(
+ "Response code was not 200. Detected response was "
+ + responseCode);
+ }
+ // System.out.println(getClass().getName() + " took "
+ // + (System.currentTimeMillis() - now) + "ms to fetch");
+
+ checkRateLimits(connection);
+
+ BufferedReader reader = null;
+ reader = new BufferedReader(new InputStreamReader(response, "UTF-8"));
+ return reader;
+ }
+
+ /**
+ * Inspect response headers for any sign of server overload and respect any
+ * 'retry-after' directive
+ *
+ * @see https://github.com/Ensembl/ensembl-rest/wiki/Rate-Limits
+ * @param connection
+ */
+ void checkRateLimits(HttpURLConnection connection)
+ {
+ // number of requests allowed per time interval:
+ String limit = connection.getHeaderField("X-RateLimit-Limit");
+ // length of quota time interval in seconds:
+ // String period = connection.getHeaderField("X-RateLimit-Period");
+ // seconds remaining until usage quota is reset:
+ String reset = connection.getHeaderField("X-RateLimit-Reset");
+ // number of requests remaining from quota for current period:
+ String remaining = connection.getHeaderField("X-RateLimit-Remaining");
+ // number of seconds to wait before retrying (if remaining == 0)
+ String retryDelay = connection.getHeaderField("Retry-After");
+
+ // to test:
+ // retryDelay = "5";
+
+ if (retryDelay != null)
+ {
+ System.err.println("Ensembl REST service rate limit exceeded, wait "
+ + retryDelay + " seconds before retrying");
+ try
+ {
+ retryAfter = System.currentTimeMillis()
+ + (1000 * Integer.valueOf(retryDelay));
+ } catch (NumberFormatException e)
+ {
+ System.err.println("Unexpected value for Retry-After: "
+ + retryDelay);
+ }
+ }
+ else
+ {
+ retryAfter = 0;
+ // debug:
+ // System.out.println(String.format(
+ // "%s Ensembl requests remaining of %s (reset in %ss)",
+ // remaining, limit, reset));
+ }
+ }
+ /**
+ * Rechecks if Ensembl is responding, unless the last check was successful and
+ * the retest interval has not yet elapsed. Returns true if Ensembl is up,
+ * else false.
+ *
+ * @return
+ */
+ protected boolean isEnsemblAvailable()
+ {
+ long now = System.currentTimeMillis();
+
+ /*
+ * check if we are waiting for 'Retry-After' to expire
+ */
+ if (retryAfter > now)
+ {
+ System.err.println("Still " + (1 + (retryAfter - now) / 1000)
+ + " secs to wait before retrying Ensembl");
+ return false;
+ }
+ else
+ {
+ retryAfter = 0;
+ }
+
+ boolean retest = now - lastCheck > RETEST_INTERVAL;
+ if (ensemblRestAvailable && !retest)
+ {
+ return true;
+ }
+ ensemblRestAvailable = checkEnsembl();
+ lastCheck = now;
+ return ensemblRestAvailable;
+ }
+
+ /**
+ * Constructs, writes and flushes the POST body of the request, containing the
+ * query ids in JSON format
+ *
+ * @param connection
+ * @param ids
+ * @throws IOException
+ */
+ protected void writePostBody(HttpURLConnection connection,
+ List ids) throws IOException
+ {
+ boolean first;
+ StringBuilder postBody = new StringBuilder(64);
+ postBody.append("{\"ids\":[");
+ first = true;
+ for (String id : ids)
+ {
+ if (!first)
+ {
+ postBody.append(",");
+ }
+ first = false;
+ postBody.append("\"");
+ postBody.append(id.trim());
+ postBody.append("\"");
+ }
+ postBody.append("]}");
+ byte[] thepostbody = postBody.toString().getBytes();
+ connection.setRequestProperty("Content-Length",
+ Integer.toString(thepostbody.length));
+ DataOutputStream wr = new DataOutputStream(connection.getOutputStream());
+ wr.write(thepostbody);
+ wr.flush();
+ wr.close();
+ }
+
+}
diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java
new file mode 100644
index 0000000..b4c708d
--- /dev/null
+++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java
@@ -0,0 +1,892 @@
+package jalview.ext.ensembl;
+
+import jalview.analysis.AlignmentUtils;
+import jalview.analysis.Dna;
+import jalview.datamodel.Alignment;
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.DBRefEntry;
+import jalview.datamodel.DBRefSource;
+import jalview.datamodel.Mapping;
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
+import jalview.exceptions.JalviewException;
+import jalview.io.FastaFile;
+import jalview.io.FileParse;
+import jalview.io.gff.SequenceOntologyFactory;
+import jalview.io.gff.SequenceOntologyI;
+import jalview.util.DBRefUtils;
+import jalview.util.MapList;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+
+/**
+ * Base class for Ensembl sequence fetchers
+ *
+ * @see http://rest.ensembl.org/documentation/info/sequence_id
+ * @author gmcarstairs
+ */
+public abstract class EnsemblSeqProxy extends EnsemblRestClient
+{
+ private static final String ALLELES = "alleles";
+
+ /*
+ * the default list returned by getCrossReferenceDatabases()
+ */
+ private static final List CROSS_REFERENCES = Arrays
+ .asList(new String[] { "CCDS", "Uniprot/SWISSPROT",
+ "Uniprot/SPTREMBL" });
+
+ protected static final String CONSEQUENCE_TYPE = "consequence_type";
+
+ protected static final String PARENT = "Parent";
+
+ // presumably the key of a feature's 'ID' attribute - confirm against the
+ // code that reads it
+ protected static final String ID = "ID";
+
+ protected static final String NAME = "Name";
+
+ protected static final String DESCRIPTION = "description";
+
+ /**
+  * The values accepted for the 'type' parameter of the /sequence REST service
+  */
+ public enum EnsemblSeqType
+ {
+   /**
+    * type=genomic to fetch full dna including introns
+    */
+   GENOMIC("genomic"),
+
+   /**
+    * type=cdna to fetch dna including UTRs
+    */
+   CDNA("cdna"),
+
+   /**
+    * type=cds to fetch coding dna excluding UTRs
+    */
+   CDS("cds"),
+
+   /**
+    * type=protein to fetch peptide product sequence
+    */
+   PROTEIN("protein");
+
+   /*
+    * the 'type' parameter value that requests this flavour of
+    * an Ensembl sequence
+    */
+   private final String type;
+
+   EnsemblSeqType(String paramValue)
+   {
+     this.type = paramValue;
+   }
+
+   /**
+    * Returns the value to supply as the 'type' request parameter
+    */
+   public String getType()
+   {
+     return this.type;
+   }
+
+ }
+
+ /**
+ * A comparator to sort ranges into ascending start position order
+ */
+ private class RangeSorter implements Comparator
+ {
+ boolean forwards;
+
+ RangeSorter(boolean forward)
+ {
+ forwards = forward;
+ }
+
+ @Override
+ public int compare(int[] o1, int[] o2)
+ {
+ return (forwards ? 1 : -1) * Integer.compare(o1[0], o2[0]);
+ }
+
+ }
+
+ /**
+ * Default constructor (to use rest.ensembl.org); delegates to the superclass
+ * default constructor
+ */
+ public EnsemblSeqProxy()
+ {
+ super();
+ }
+
+ /**
+ * Constructor given the target domain to fetch data from
+ *
+ * @param d
+ *          the domain, passed on to the superclass constructor
+ */
+ public EnsemblSeqProxy(String d)
+ {
+ super(d);
+ }
+
+ /**
+  * Makes the sequence queries to Ensembl's REST service and returns an
+  * alignment consisting of the returned sequences. Queries are made in
+  * batches of at most getMaximumQueryCount() ids; if a batch fails, the
+  * problem is reported on stderr and no further batches are attempted.
+  * Features, protein products and cross-references are then added for
+  * whatever was retrieved. The 'in progress' flag is set for the duration
+  * of the call, and always cleared before it returns.
+  *
+  * @param query
+  *          one or more accession ids, delimited by the accession separator
+  * @return the alignment, or null if no sequences were retrieved
+  */
+ @Override
+ public AlignmentI getSequenceRecords(String query) throws Exception
+ {
+   // TODO use a String... query vararg instead?
+
+   // danger: accession separator used as a regex here, a string elsewhere
+   // in this case it is ok (it is just a space), but (e.g.) '\' would not be
+   List allIds = Arrays.asList(query
+           .split(getAccessionSeparator()));
+   AlignmentI alignment = null;
+   inProgress = true;
+
+   try
+   {
+     /*
+      * execute queries, if necessary in batches of the
+      * maximum allowed number of ids
+      */
+     int maxQueryCount = getMaximumQueryCount();
+     for (int v = 0, vSize = allIds.size(); v < vSize; v += maxQueryCount)
+     {
+       int p = Math.min(vSize, v + maxQueryCount);
+       List ids = allIds.subList(v, p);
+       try
+       {
+         alignment = fetchSequences(ids, alignment);
+       } catch (Throwable r)
+       {
+         // v is an offset into the ids, so convert it to a count of chunks
+         String msg = "Aborting ID retrieval after " + (v / maxQueryCount)
+                 + " chunks. Unexpected problem (" + r.getLocalizedMessage()
+                 + ")";
+         System.err.println(msg);
+         break;
+       }
+     }
+
+     if (alignment == null)
+     {
+       return null;
+     }
+
+     /*
+      * fetch and transfer genomic sequence features,
+      * fetch protein product and add as cross-reference
+      */
+     for (String accId : allIds)
+     {
+       addFeaturesAndProduct(accId, alignment);
+     }
+
+     for (SequenceI seq : alignment.getSequences())
+     {
+       getCrossReferences(seq);
+     }
+
+     return alignment;
+   } finally
+   {
+     /*
+      * clear the flag on every exit path, so that queryInProgress() does
+      * not report true for ever after a successful fetch
+      */
+     inProgress = false;
+   }
+ }
+
+ /**
+  * Retrieves features for the accession id (via an EnsemblFeatures client
+  * for this domain), and transfers them from the first sequence returned
+  * onto the sequence of that name in the alignment. If the transfer reports
+  * success, the protein product is then fetched and added by
+  * addProteinProduct. Does nothing if the alignment is null or no features
+  * sequence is returned; an I/O error is reported on stderr.
+  *
+  * @param accId
+  * @param alignment
+  */
+ protected void addFeaturesAndProduct(String accId, AlignmentI alignment)
+ {
+   if (alignment == null)
+   {
+     return;
+   }
+
+   try
+   {
+     /*
+      * get 'dummy' genomic sequence with exon, cds and variation features
+      */
+     EnsemblFeatures featureFetcher = new EnsemblFeatures(getDomain());
+     EnsemblFeatureType[] wanted = getFeaturesToFetch();
+     AlignmentI fetched = featureFetcher.getSequenceRecords(accId, wanted);
+     if (fetched.getHeight() <= 0)
+     {
+       return;
+     }
+     SequenceI genomicSequence = fetched.getSequenceAt(0);
+     if (genomicSequence == null)
+     {
+       return;
+     }
+
+     /*
+      * transfer features to the query sequence; if that worked, fetch and
+      * map the protein product, and add it as a cross-reference of the
+      * retrieved sequence
+      */
+     SequenceI target = alignment.findName(accId);
+     boolean transferred = transferFeatures(accId, genomicSequence, target);
+     if (transferred)
+     {
+       addProteinProduct(target);
+     }
+   } catch (IOException e)
+   {
+     System.err.println("Error transferring Ensembl features: "
+             + e.getMessage());
+   }
+ }
+
+ /**
+ * Returns those sequence feature types to fetch from Ensembl. We may want
+ * features either because they are of interest to the user, or as means to
+ * identify the locations of the sequence on the genomic sequence (CDS
+ * features identify CDS, exon features identify cDNA etc).
+ *
+ * @return the feature types to request
+ */
+ protected abstract EnsemblFeatureType[] getFeaturesToFetch();
+
+ /**
+  * Fetches the protein product for the sequence (by its name, from an
+  * EnsemblProtein client for this domain), maps the sequence's CDS to it,
+  * and if a mapping is found adds the protein as a cross-reference on the
+  * sequence's dataset sequence and computes features on the protein. Any
+  * exception is reported on stderr.
+  *
+  * @param querySeq
+  */
+ protected void addProteinProduct(SequenceI querySeq)
+ {
+   String accId = querySeq.getName();
+   try
+   {
+     EnsemblProtein proteinFetcher = new EnsemblProtein(getDomain());
+     AlignmentI protein = proteinFetcher.getSequenceRecords(accId);
+     boolean nothingFound = protein == null || protein.getHeight() == 0;
+     if (nothingFound)
+     {
+       System.out.println("No protein product found for " + accId);
+       return;
+     }
+     SequenceI proteinSeq = protein.getSequenceAt(0);
+
+     /*
+      * need dataset sequences (to be the subject of mappings)
+      */
+     proteinSeq.createDatasetSequence();
+     querySeq.createDatasetSequence();
+
+     MapList cdsToProtein = AlignmentUtils.mapCdsToProtein(querySeq,
+             proteinSeq);
+     if (cdsToProtein == null)
+     {
+       return;
+     }
+
+     // clunky: ensure Uniprot xref if we have one is on mapped sequence
+     SequenceI proteinDataset = proteinSeq.getDatasetSequence();
+     proteinDataset.setSourceDBRef(proteinSeq.getSourceDBRef());
+     Mapping mapping = new Mapping(proteinDataset, cdsToProtein);
+     DBRefEntry productRef = new DBRefEntry(getDbSource(), getDbVersion(),
+             accId, mapping);
+     querySeq.getDatasetSequence().addDBRef(productRef);
+
+     /*
+      * copy exon features to protein, compute peptide variants from dna
+      * variants and add as features on the protein sequence ta-da
+      */
+     AlignmentUtils.computeProteinFeatures(querySeq, proteinSeq,
+             cdsToProtein);
+   } catch (Exception e)
+   {
+     System.err
+             .println(String.format("Error retrieving protein for %s: %s",
+                     accId, e.getMessage()));
+   }
+ }
+
+ /**
+  * Fetches database xrefs from Ensembl (via an EnsemblXref client for this
+  * domain) and adds them to the root dataset sequence of the given sequence,
+  * i.e. the one reached by following getDatasetSequence() until it is null.
+  *
+  * @param seq
+  */
+ protected void getCrossReferences(SequenceI seq)
+ {
+   for (SequenceI ds = seq.getDatasetSequence(); ds != null; ds = seq
+           .getDatasetSequence())
+   {
+     seq = ds;
+   }
+
+   EnsemblXref xrefFetcher = new EnsemblXref(getDomain());
+   List xrefs = xrefFetcher.getCrossReferences(seq.getName(),
+           getCrossReferenceDatabases());
+   for (DBRefEntry xref : xrefs)
+   {
+     seq.addDBRef(xref);
+
+     /*
+      * Save any Uniprot xref to be the reference for SIFTS mapping
+      */
+     boolean isUniprot = DBRefSource.UNIPROT.equals(xref.getSource());
+     if (isUniprot)
+     {
+       seq.setSourceDBRef(xref);
+     }
+   }
+ }
+
+ /**
+ * Returns a list of database names to be used when fetching cross-references.
+ * Specifically, the names are used to filter data returned by the Ensembl
+ * xrefs REST service on the value in field 'dbname'.
+ *
+ * @return the shared CROSS_REFERENCES list
+ */
+ protected List getCrossReferenceDatabases()
+ {
+ return CROSS_REFERENCES;
+ }
+
+ /**
+  * Retrieves sequences (as Fasta) for the accession ids and appends them to
+  * the alignment, creating it if it is null. A sequence lacking a
+  * description is given the database name as description. A sequence whose
+  * name (or its name with "ENSP" replaced by "ENST") is one of the ids is
+  * given an ENSEMBL database reference.
+  *
+  * @param ids
+  * @param alignment
+  * @return the extended (or created) alignment; the alignment passed in if
+  *         no sequences were returned
+  * @throws JalviewException
+  *           if the Ensembl REST service is not available
+  * @throws IOException
+  *           if the response is a single empty sequence, or on I/O error
+  */
+ protected AlignmentI fetchSequences(List ids, AlignmentI alignment)
+         throws JalviewException, IOException
+ {
+   if (!isEnsemblAvailable())
+   {
+     inProgress = false;
+     throw new JalviewException("ENSEMBL Rest API not available.");
+   }
+
+   FastaFile fasta = new FastaFile(getSequenceReader(ids));
+   if (fasta.hasWarningMessage())
+   {
+     String warning = String.format(
+             "Warning when retrieving %d ids %s\n%s", ids.size(),
+             ids.toString(), fasta.getWarningMessage());
+     System.out.println(warning);
+   }
+   else if (fasta.getSeqs().size() != ids.size())
+   {
+     String shortfall = String.format(
+             "Only retrieved %d sequences for %d query strings", fasta
+                     .getSeqs().size(), ids.size());
+     System.out.println(shortfall);
+   }
+
+   /*
+    * POST request has returned an empty FASTA file e.g. for invalid id
+    */
+   boolean emptyResponse = fasta.getSeqs().size() == 1
+           && fasta.getSeqs().get(0).getLength() == 0;
+   if (emptyResponse)
+   {
+     throw new IOException("No data returned for " + ids);
+   }
+
+   if (fasta.getSeqs().isEmpty())
+   {
+     return alignment;
+   }
+
+   AlignmentI fetched = new Alignment(fasta.getSeqsAsArray());
+   for (SequenceI sq : fetched.getSequences())
+   {
+     if (sq.getDescription() == null)
+     {
+       sq.setDescription(getDbName());
+     }
+     String name = sq.getName();
+     boolean wasQueried = ids.contains(name)
+             || ids.contains(name.replace("ENSP", "ENST"));
+     if (wasQueried)
+     {
+       DBRefUtils.parseToDbRef(sq, DBRefSource.ENSEMBL, "0", name);
+     }
+   }
+
+   if (alignment == null)
+   {
+     return fetched;
+   }
+   alignment.append(fetched);
+   return alignment;
+ }
+
+ /**
+  * Returns the URL for the sequence/id REST call. A single id is included in
+  * the URL path; if there is not exactly one id, none is included (multiple
+  * ids go in the POST body instead).
+  *
+  * @param ids
+  * @return
+  * @throws MalformedURLException
+  */
+ @Override
+ protected URL getUrl(List ids) throws MalformedURLException
+ {
+   String address = getDomain() + "/sequence/id";
+   if (ids.size() == 1)
+   {
+     address += "/" + ids.get(0);
+   }
+   // @see https://github.com/Ensembl/ensembl-rest/wiki/Output-formats
+   address += "?type=" + getSourceEnsemblType().getType();
+   address += "&Accept=text/x-fasta";
+
+   return new URL(address);
+ }
+
+ /**
+ * A sequence/id POST request currently allows up to 50 queries
+ *
+ * @return 50
+ * @see http://rest.ensembl.org/documentation/info/sequence_id_post
+ */
+ @Override
+ public int getMaximumQueryCount()
+ {
+ return 50;
+ }
+
+ /**
+ * Returns false
+ */
+ @Override
+ protected boolean useGetRequest()
+ {
+ return false;
+ }
+
+ /**
+  * Returns "application/json" for a request with multiple ids, else
+  * "text/x-fasta"
+  */
+ @Override
+ protected String getRequestMimeType(boolean multipleIds)
+ {
+   if (multipleIds)
+   {
+     return "application/json";
+   }
+   return "text/x-fasta";
+ }
+
+ /**
+ * Returns "text/x-fasta"
+ */
+ @Override
+ protected String getResponseMimeType()
+ {
+ return "text/x-fasta";
+ }
+
+ /**
+ * Returns the sequence type to request; its getType() value is sent as the
+ * 'type' parameter of the sequence query URL
+ *
+ * @return the configured sequence return type for this source
+ */
+ protected abstract EnsemblSeqType getSourceEnsemblType();
+
+ /**
+  * Returns a mapping from [start, end] genomic ranges to the sequence being
+  * retrieved.
+  *
+  * The correspondence between the frames of reference is made by locating
+  * those features on the genomic sequence which identify the retrieved
+  * sequence (as decided by identifiesSequence). Specifically
+  * <ul>
+  * <li>genomic sequence is identified by "transcript" features with
+  * ID=transcript:transcriptId</li>
+  * <li>cdna sequence is identified by "exon" features with
+  * Parent=transcript:transcriptId</li>
+  * <li>cds sequence is identified by "CDS" features with
+  * Parent=transcript:transcriptId</li>
+  * </ul>
+  *
+  * The genomic ranges are sorted to run forwards (for positive strand) or
+  * backwards (for negative strand). Aborts and returns null if both positive
+  * and negative strand are found (this should not normally happen). Also
+  * returns null if the source sequence has no features, or none of them
+  * identifies the retrieved sequence.
+  *
+  * @param sourceSequence
+  * @param accId
+  * @param start
+  *          the start position of the sequence we are mapping to
+  * @return
+  */
+ protected MapList getGenomicRangesFromFeatures(SequenceI sourceSequence,
+         String accId, int start)
+ {
+   SequenceFeature[] features = sourceSequence.getSequenceFeatures();
+   if (features == null)
+   {
+     return null;
+   }
+
+   /*
+    * generously initial size for number of cds regions
+    * (worst case titin Q8WZ42 has c. 313 exons)
+    */
+   List ranges = new ArrayList(100);
+   int totalLength = 0;
+   int direction = 1; // forward
+
+   for (SequenceFeature feature : features)
+   {
+     /*
+      * accept the target feature type or a specialisation of it
+      * (e.g. coding_exon for exon)
+      */
+     if (!identifiesSequence(feature, accId))
+     {
+       continue;
+     }
+
+     int strand = feature.getStrand();
+     if (strand == 0)
+     {
+       strand = 1; // treat unknown as forward
+     }
+
+     /*
+      * direction has been set by an earlier feature if (and only if)
+      * we have already saved a range
+      */
+     if (!ranges.isEmpty() && strand != direction)
+     {
+       // abort - mix of forward and backward
+       System.err.println("Error: forward and backward strand for "
+               + accId);
+       return null;
+     }
+     direction = strand;
+
+     /*
+      * add to CDS ranges, semi-sorted forwards/backwards
+      */
+     int begin = feature.getBegin();
+     int end = feature.getEnd();
+     if (strand < 0)
+     {
+       ranges.add(0, new int[] { end, begin });
+     }
+     else
+     {
+       ranges.add(new int[] { begin, end });
+     }
+     totalLength += Math.abs(end - begin + 1);
+
+     if (!isSpliceable())
+     {
+       /*
+        * 'gene' sequence is contiguous so we can stop as soon as its
+        * identifying feature has been found
+        */
+       break;
+     }
+   }
+
+   if (ranges.isEmpty())
+   {
+     System.out.println("Failed to identify target sequence for " + accId
+             + " from genomic features");
+     return null;
+   }
+
+   /*
+    * a final sort is needed since Ensembl returns CDS sorted within source
+    * (havana / ensembl_havana)
+    */
+   Collections.sort(ranges, new RangeSorter(direction == 1));
+
+   int mappedEnd = start + totalLength - 1;
+   List to = Arrays.asList(new int[] { start, mappedEnd });
+
+   return new MapList(ranges, to, 1, 1);
+ }
+
+ /**
+ * Answers true if the sequence being retrieved may occupy discontiguous
+ * regions on the genomic sequence. This default implementation answers true.
+ */
+ protected boolean isSpliceable()
+ {
+ return true;
+ }
+
+ /**
+ * Returns true if the sequence feature marks positions of the genomic
+ * sequence feature which are within the sequence being retrieved. For
+ * example, an 'exon' feature whose parent is the target transcript marks the
+ * cdna positions of the transcript.
+ *
+ * @param sf the genomic sequence feature to test
+ * @param accId the accession id of the sequence being retrieved
+ * @return true if the feature identifies (part of) the retrieved sequence
+ */
+ protected abstract boolean identifiesSequence(SequenceFeature sf,
+ String accId);
+
+ /**
+  * Copies the sequence feature onto the target sequence, with its begin and
+  * end positions converted by the mapping. A feature which the mapping cannot
+  * locate on the target (no overlap) is silently skipped.
+  *
+  * @param sf the feature to copy (it is not modified)
+  * @param targetSequence the sequence which receives the copy
+  * @param mapping
+  *          mapping from the sequence feature's coordinates to the target
+  *          sequence
+  * @param forwardStrand if false, sequence_variant alleles are complemented
+  */
+ protected void transferFeature(SequenceFeature sf,
+         SequenceI targetSequence, MapList mapping, boolean forwardStrand)
+ {
+   int[] located = mapping.locateInTo(sf.getBegin(), sf.getEnd());
+   if (located == null)
+   {
+     // the feature does not overlap the mapped region
+     return;
+   }
+
+   SequenceFeature mapped = new SequenceFeature(sf);
+   mapped.setBegin(Math.min(located[0], located[1]));
+   mapped.setEnd(Math.max(located[0], located[1]));
+   targetSequence.addSequenceFeature(mapped);
+
+   // a sequence_variant on the reverse strand needs complemented alleles
+   if (forwardStrand)
+   {
+     return;
+   }
+   SequenceOntologyI ontology = SequenceOntologyFactory.getInstance();
+   if (ontology.isA(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT))
+   {
+     reverseComplementAlleles(mapped);
+   }
+ }
+
+ /**
+  * Replaces the feature's 'alleles' value with its reverse complement, and
+  * sets the feature description (and any raw attributes text) to match
+  *
+  * @param sf the feature to update in place; unchanged if it has no alleles
+  */
+ static void reverseComplementAlleles(SequenceFeature sf)
+ {
+   final String original = (String) sf.getValue(ALLELES);
+   if (original == null)
+   {
+     return;
+   }
+   String[] parts = original.split(",");
+   StringBuilder buffer = new StringBuilder(original.length());
+   for (int i = 0; i < parts.length; i++)
+   {
+     reverseComplementAllele(buffer, parts[i]);
+   }
+   final String reversed = buffer.toString();
+   sf.setValue(ALLELES, reversed);
+   sf.setDescription(reversed);
+
+   // keep the "alleles=" entry of the raw attributes in step, so that
+   // 'output as GFF' also shows the reverse complemented alleles
+   String attributes = sf.getAttributes();
+   if (attributes == null)
+   {
+     return;
+   }
+   String before = ALLELES + "=" + original;
+   sf.setAttributes(attributes.replace(before, ALLELES + "=" + reversed));
+ }
+
+ /**
+  * Appends the 'reverse complement' of one allele to the buffer, preceded by
+  * a comma if the buffer already holds something
+  *
+  * @param complement the buffer being built up (modified)
+  * @param allele a single allele value; "HGMD_MUTATION" is copied unchanged
+  */
+ static void reverseComplementAllele(StringBuilder complement,
+         String allele)
+ {
+   boolean first = complement.length() == 0;
+   if (!first)
+   {
+     complement.append(",");
+   }
+   if ("HGMD_MUTATION".equalsIgnoreCase(allele))
+   {
+     // a marker value rather than bases, so copy it as it is
+     complement.append(allele);
+     return;
+   }
+   int last = allele.length() - 1;
+   for (int pos = last; pos >= 0; pos--)
+   {
+     complement.append(Dna.getComplement(allele.charAt(pos)));
+   }
+ }
+
+ /**
+  * Transfers features from sourceSequence to targetSequence
+  *
+  * @param accessionId identifies which source features belong to the target
+  * @param sourceSequence the sequence whose features are copied; may be null
+  * @param targetSequence the sequence which receives them; may be null
+  * @return true if any features were transferred, else false
+  */
+ protected boolean transferFeatures(String accessionId,
+         SequenceI sourceSequence, SequenceI targetSequence)
+ {
+   boolean missingSequence = sourceSequence == null
+           || targetSequence == null;
+   if (missingSequence)
+   {
+     return false;
+   }
+
+   SequenceFeature[] sourceFeatures = sourceSequence.getSequenceFeatures();
+   int targetStart = targetSequence.getStart();
+   MapList genomicMap = getGenomicRangesFromFeatures(sourceSequence,
+           accessionId, targetStart);
+   if (genomicMap == null)
+   {
+     // no mapping could be derived from the source features
+     return false;
+   }
+
+   /*
+    * delegate the filtering, coordinate conversion and copying
+    */
+   return transferFeatures(sourceFeatures, targetSequence, genomicMap, accessionId);
+ }
+
+ /**
+  * Transfer features to the target sequence. The start/end positions are
+  * converted using the mapping. Features which do not overlap are ignored.
+  * Features rejected by retainFeature for the given identifier are skipped.
+  *
+  * @param features the candidate features (this array is sorted in place)
+  * @param targetSequence the sequence which receives the features
+  * @param mapping converts feature coordinates to target coordinates
+  * @param parentId identifier passed to retainFeature for each feature
+  * @return true if any feature passed retainFeature (even if it did not map)
+  */
+ protected boolean transferFeatures(SequenceFeature[] features,
+         SequenceI targetSequence, MapList mapping, String parentId)
+ {
+   final boolean forward = mapping.isFromForwardStrand();
+
+   // order by start (forward strand) or by end descending (reverse
+   // strand), so that features reach the target in 'forwards' order
+   sortFeatures(features, forward);
+
+   boolean anyRetained = false;
+   for (int i = 0; i < features.length; i++)
+   {
+     SequenceFeature candidate = features[i];
+     if (!retainFeature(candidate, parentId))
+     {
+       continue;
+     }
+     // NB a retained feature counts even if it does not overlap the mapping
+     transferFeature(candidate, targetSequence, mapping, forward);
+     anyRetained = true;
+   }
+   return anyRetained;
+ }
+
+ /**
+  * Sorts the array in place, by start position ascending (if on forward
+  * strand), or by end position descending (if on reverse strand)
+  *
+  * @param features the features to sort; must not be null
+  * @param forwardStrand true to order by begin, false to order by end (desc)
+  */
+ protected static void sortFeatures(SequenceFeature[] features,
+         final boolean forwardStrand)
+ {
+   Arrays.sort(features, new Comparator<SequenceFeature>()
+   {
+     @Override
+     public int compare(SequenceFeature o1, SequenceFeature o2)
+     {
+       if (forwardStrand)
+       {
+         return Integer.compare(o1.getBegin(), o2.getBegin());
+       }
+       else
+       {
+         return Integer.compare(o2.getEnd(), o1.getEnd());
+       }
+     }
+   });
+ }
+
+ /**
+ * Answers true if the feature type is one we want to keep for the sequence.
+ * Some features are only retrieved in order to identify the sequence range,
+ * and may then be discarded as redundant information (e.g. "CDS" feature for
+ * a CDS sequence). This default implementation keeps every feature.
+ */
+ @SuppressWarnings("unused")
+ protected boolean retainFeature(SequenceFeature sf, String accessionId)
+ {
+ return true; // default keeps everything; override as required
+ }
+
+ /**
+  * Answers false only if the feature has a Parent value which does not
+  * contain the given accession id. A feature with no Parent, or with a
+  * Parent that contains the identifier, may belong, so the answer is true.
+  *
+  * @param sf the feature whose Parent value is inspected
+  * @param identifier the accession id looked for within the Parent value
+  * @return false if the feature's Parent is for a different accession id
+  */
+ protected boolean featureMayBelong(SequenceFeature sf, String identifier)
+ {
+   String owner = (String) sf.getValue(PARENT);
+   if (owner == null)
+   {
+     // no parent recorded, so the feature cannot be excluded
+     return true;
+   }
+   // 'contains' allows for a prefix such as "gene:" or "transcript:"
+   return owner.contains(identifier);
+ }
+
+ @Override
+ public String getDescription()
+ {
+   String seqType = getSourceEnsemblType().getType();
+   return "Ensembl " + seqType + " sequence with variant features";
+ }
+
+ /**
+  * Returns a (possibly empty) list of features on the sequence which have the
+  * specified sequence ontology type (or a sub-type of it), and the given
+  * identifier as parent. Features with no parent value are never included.
+  *
+  * @param sequence the sequence whose features are searched
+  * @param type the sequence ontology term the feature type must match
+  * @param parentId the exact Parent value required (no prefix is added)
+  * @return the matching features, in the order held on the sequence
+  */
+ protected List<SequenceFeature> findFeatures(SequenceI sequence,
+         String type, String parentId)
+ {
+   List<SequenceFeature> result = new ArrayList<SequenceFeature>();
+   SequenceFeature[] sfs = sequence.getSequenceFeatures();
+   if (sfs == null)
+   {
+     return result;
+   }
+   SequenceOntologyI so = SequenceOntologyFactory.getInstance();
+   for (SequenceFeature sf : sfs)
+   {
+     // Parent may be absent, so compare null-safely (was parent.equals(..))
+     if (so.isA(sf.getType(), type) && parentId != null
+             && parentId.equals(sf.getValue(PARENT)))
+     {
+       result.add(sf);
+     }
+   }
+   return result;
+ }
+
+ /**
+  * Answers true if the feature type is either 'NMD_transcript_variant' or
+  * 'transcript' or one of its sub-types in the Sequence Ontology. This is
+  * needed because NMD_transcript_variant behaves like 'transcript' in Ensembl
+  * although strictly speaking it is not (it is a sub-type of
+  * sequence_variant).
+  *
+  * @param featureType the feature type (ontology term) to test
+  * @return true if the type should be treated as a transcript
+  */
+ public static boolean isTranscript(String featureType)
+ {
+   boolean nmd = SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(featureType);
+   return nmd || SequenceOntologyFactory.getInstance().isA(featureType,
+           SequenceOntologyI.TRANSCRIPT);
+ }
+}
diff --git a/src/jalview/ext/ensembl/EnsemblSequenceFetcher.java b/src/jalview/ext/ensembl/EnsemblSequenceFetcher.java
new file mode 100644
index 0000000..9a4952e
--- /dev/null
+++ b/src/jalview/ext/ensembl/EnsemblSequenceFetcher.java
@@ -0,0 +1,93 @@
+package jalview.ext.ensembl;
+
+import jalview.datamodel.DBRefSource;
+import jalview.ws.seqfetcher.DbSourceProxyImpl;
+
+import com.stevesoft.pat.Regex;
+
+/**
+ * A base class for Ensembl sequence fetchers
+ *
+ * @author gmcarstairs
+ */
+abstract class EnsemblSequenceFetcher extends DbSourceProxyImpl
+{
+ /*
+ * accepts ENS + optional 3-letter species code (e.g. ENSMUSP) + one of
+ * G/T/E/P + 11 digits, or CCDS followed by at least 3 characters that are
+ * digits or '.'; NB the pattern is anchored at its end ($) but not its start
+ */
+ private static final Regex ACCESSION_REGEX = new Regex(
+ "(ENS([A-Z]{3}|)[GTEP]{1}[0-9]{11}$)" + "|" + "(CCDS[0-9.]{3,}$)");
+
+ /*
+ * possible values for the 'feature' parameter of the /overlap REST service
+ * @see http://rest.ensembl.org/documentation/info/overlap_id
+ */
+ protected enum EnsemblFeatureType
+ {
+ gene, transcript, cds, exon, repeat, simple, misc, variation,
+ somatic_variation, structural_variation, somatic_structural_variation,
+ constrained, regulatory
+ }
+
+ @Override
+ public String getDbSource()
+ {
+ // NB ensure Uniprot xrefs are canonicalised from "Ensembl" to "ENSEMBL"
+ return DBRefSource.ENSEMBL; // "ENSEMBL"
+ }
+
+ @Override
+ public String getDbVersion()
+ {
+ return "0";
+ }
+
+ @Override
+ public String getAccessionSeparator()
+ {
+ return " ";
+ }
+
+ /**
+ * Ensembl accession are ENST + 11 digits for human transcript, ENSG for human
+ * gene. Other species insert 3 letters e.g. ENSMUST..., ENSMUSG...
+ *
+ * @see http://www.ensembl.org/Help/View?id=151
+ */
+ @Override
+ public Regex getAccessionValidator()
+ {
+ return ACCESSION_REGEX;
+ }
+
+ @Override
+ public boolean isValidReference(String accession)
+ {
+ return getAccessionValidator().search(accession);
+ }
+
+ @Override
+ public int getTier()
+ {
+ return 0;
+ }
+
+ /**
+ * Default test query is a transcript
+ */
+ @Override
+ public String getTestQuery()
+ {
+ // has CDS on reverse strand:
+ return "ENST00000288602";
+ // alternative with CDS on forward strand: ENST00000461457
+ }
+
+ @Override
+ public boolean isDnaCoding()
+ {
+ return true;
+ }
+}
diff --git a/src/jalview/ext/ensembl/EnsemblSymbol.java b/src/jalview/ext/ensembl/EnsemblSymbol.java
new file mode 100644
index 0000000..08f26c7
--- /dev/null
+++ b/src/jalview/ext/ensembl/EnsemblSymbol.java
@@ -0,0 +1,138 @@
+package jalview.ext.ensembl;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.json.simple.parser.ParseException;
+
+/**
+ * A client for the Ensembl xrefs/symbol REST service, used to look up the
+ * Ensembl gene identifiers recorded for a symbol
+ *
+ * @see http://rest.ensembl.org/documentation/info/xref_external
+ * @author gmcarstairs
+ */
+public class EnsemblSymbol extends EnsemblXref
+{
+  /**
+   * Constructor given the target domain to fetch data from
+   *
+   * @param d the domain (base URL) of the REST service
+   */
+  public EnsemblSymbol(String d)
+  {
+    super(d);
+  }
+
+  /**
+   * Returns the first "id" value in gene identifier format from the JSON
+   * response, or null if none found. Entries without an "id" are skipped.
+   *
+   * @param br reader positioned at the start of the JSON response
+   * @return the first gene identifier found, or null
+   * @throws IOException if the response cannot be read
+   */
+  protected String parseResponse(BufferedReader br)
+          throws IOException
+  {
+    JSONParser jp = new JSONParser();
+    String result = null;
+    try
+    {
+      JSONArray responses = (JSONArray) jp.parse(br);
+      Iterator<?> rvals = responses.iterator();
+      while (rvals.hasNext())
+      {
+        JSONObject val = (JSONObject) rvals.next();
+        Object id = val.get("id"); // null if the entry has no "id" key
+        if (id != null && isGeneIdentifier(id.toString()))
+        {
+          result = id.toString();
+          break;
+        }
+      }
+    } catch (ParseException e)
+    {
+      // unparseable response: treat as 'no gene id found'
+    }
+    return result;
+  }
+
+  protected URL getUrl(String id, Species species)
+  {
+    String url = getDomain() + "/xrefs/symbol/" + species.toString() + "/"
+            + id
+            + "?content-type=application/json";
+    try
+    {
+      return new URL(url);
+    } catch (MalformedURLException e)
+    {
+      return null;
+    }
+  }
+
+  /**
+   * Calls the Ensembl xrefs REST 'symbol' endpoint and retrieves any gene ids
+   * for the given identifier, for any known model organisms
+   *
+   * @param identifier one or more symbols, delimited by the accession separator
+   * @return the gene ids found (possibly empty, never null)
+   */
+  public List<String> getIds(String identifier)
+  {
+    List<String> result = new ArrayList<String>();
+    List<String> ids = new ArrayList<String>();
+    ids.add(identifier);
+
+    String[] queries = identifier.split(getAccessionSeparator());
+    try
+    {
+      for (String query : queries)
+      {
+        for (Species taxon : Species.values())
+        {
+          URL url = taxon.isModelOrganism() ? getUrl(query, taxon) : null;
+          if (url == null)
+          {
+            continue; // not a model organism, or no valid URL could be built
+          }
+          BufferedReader br = getHttpResponse(url, ids); // one per request
+          try
+          {
+            String geneId = br == null ? null : parseResponse(br);
+            if (geneId != null)
+            {
+              result.add(geneId);
+            }
+          } finally
+          {
+            if (br != null)
+            {
+              try
+              {
+                br.close();
+              } catch (IOException e)
+              {
+                // ignore
+              }
+            }
+          }
+        }
+      }
+    } catch (IOException e)
+    {
+      // best effort: give up on remaining lookups, keep the ids found so far
+    }
+    return result;
+  }
+
+}
diff --git a/src/jalview/ext/ensembl/EnsemblXref.java b/src/jalview/ext/ensembl/EnsemblXref.java
new file mode 100644
index 0000000..f19b4d4
--- /dev/null
+++ b/src/jalview/ext/ensembl/EnsemblXref.java
@@ -0,0 +1,188 @@
+package jalview.ext.ensembl;
+
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.DBRefEntry;
+import jalview.util.DBRefUtils;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.json.simple.parser.ParseException;
+
+/**
+ * A class to fetch cross-references from Ensembl by calling the /xrefs REST
+ * service
+ *
+ * @author gmcarstairs
+ * @see http://rest.ensembl.org/documentation/info/xref_id
+ */
+class EnsemblXref extends EnsemblRestClient
+{
+
+  /**
+   * Constructor given the target domain to fetch data from
+   *
+   * @param d the domain (base URL) of the REST service
+   */
+  public EnsemblXref(String d)
+  {
+    super(d);
+  }
+
+  @Override
+  public String getDbName()
+  {
+    return "ENSEMBL (xref)";
+  }
+
+  @Override
+  public AlignmentI getSequenceRecords(String queries) throws Exception
+  {
+    return null;
+  }
+
+  @Override
+  protected URL getUrl(List<String> ids) throws MalformedURLException
+  {
+    return getUrl(ids.get(0));
+  }
+
+  @Override
+  protected boolean useGetRequest()
+  {
+    return true;
+  }
+
+  @Override
+  protected String getRequestMimeType(boolean multipleIds)
+  {
+    return "application/json";
+  }
+
+  @Override
+  protected String getResponseMimeType()
+  {
+    return "application/json";
+  }
+
+  /**
+   * Calls the Ensembl xrefs REST endpoint and retrieves any cross-references
+   * ("primary_id") for the given identifier (Ensembl accession id) and database
+   * names. The "dbname" returned by Ensembl is canonicalised to Jalview's
+   * standard version, and a DBRefEntry constructed. If no databases are
+   * specified, all available cross-references are retrieved.
+   *
+   * @param identifier the Ensembl accession id to look up
+   * @param databases names (as given by Ensembl) to keep; null or empty for all
+   * @return the cross-references found (empty if none, or if the call failed)
+   */
+  public List<DBRefEntry> getCrossReferences(String identifier,
+          List<String> databases)
+  {
+    List<DBRefEntry> result = new ArrayList<DBRefEntry>();
+    List<String> ids = new ArrayList<String>();
+    ids.add(identifier);
+
+    BufferedReader br = null;
+    try
+    {
+      URL url = getUrl(identifier);
+      br = url == null ? null : getHttpResponse(url, ids);
+      if (br != null) // nothing to parse without a URL and a response
+      {
+        result = parseResponse(br, databases);
+      }
+    } catch (IOException e)
+    {
+      // ignore
+    } finally
+    {
+      if (br != null)
+      {
+        try
+        {
+          br.close();
+        } catch (IOException e)
+        {
+          // ignore
+        }
+      }
+    }
+
+    return result;
+  }
+
+  /**
+   * Parses "primary_id" and "dbname" values from the JSON response and
+   * constructs a DBRefEntry if the dbname is in the list supplied. Returns a
+   * list of DBRefEntry created. Entries lacking either value are skipped.
+   *
+   * @param br reader positioned at the start of the JSON response
+   * @param databases names (as given by Ensembl) to keep; null or empty for all
+   * @return the entries created (possibly empty, never null)
+   * @throws IOException if the response cannot be read
+   */
+  protected List<DBRefEntry> parseResponse(BufferedReader br,
+          List<String> databases)
+          throws IOException
+  {
+    JSONParser jp = new JSONParser();
+    List<DBRefEntry> result = new ArrayList<DBRefEntry>();
+    try
+    {
+      JSONArray responses = (JSONArray) jp.parse(br);
+      Iterator<?> rvals = responses.iterator();
+      while (rvals.hasNext())
+      {
+        JSONObject val = (JSONObject) rvals.next();
+        Object db = val.get("dbname");
+        Object id = val.get("primary_id");
+        if (db == null || id == null)
+        {
+          continue; // incomplete entry: cannot build a cross-reference
+        }
+        String dbName = db.toString();
+        if (databases == null || databases.isEmpty()
+                || databases.contains(dbName))
+        {
+          dbName = DBRefUtils.getCanonicalName(dbName);
+          result.add(new DBRefEntry(dbName, "0", id.toString()));
+        }
+      }
+    } catch (ParseException e)
+    {
+      // unparseable response: return whatever has been collected
+    }
+    return result;
+  }
+
+  /**
+   * Returns the URL for the REST endpoint to fetch all cross-references for an
+   * identifier. Note this may return protein cross-references for nucleotide.
+   * Filter the returned list as required.
+   *
+   * @param identifier the Ensembl accession id to look up
+   * @return the URL, or null if it is malformed
+   */
+  protected URL getUrl(String identifier)
+  {
+    String url = getDomain() + "/xrefs/id/" + identifier
+            + "?content-type=application/json&all_levels=1";
+    try
+    {
+      return new URL(url);
+    } catch (MalformedURLException e)
+    {
+      return null;
+    }
+  }
+
+}
diff --git a/src/jalview/ext/ensembl/Species.java b/src/jalview/ext/ensembl/Species.java
new file mode 100644
index 0000000..d8a00a5
--- /dev/null
+++ b/src/jalview/ext/ensembl/Species.java
@@ -0,0 +1,32 @@
+package jalview.ext.ensembl;
+
+/**
+ * Selected species identifiers used by Ensembl, each with a model organism flag
+ *
+ * @author gmcarstairs
+ * @see http://rest.ensembl.org/info/species?content-type=text/xml
+ */
+enum Species
+{
+ /*
+ * using any suitably readable alias as the enum name; these are all
+ * valid species parameters to Ensembl REST services where applicable
+ */
+ human(true), mouse(true), s_cerevisiae(true), cow(false), pig(false),
+ rat(true), celegans(true), sheep(false), horse(false), gorilla(false),
+ rabbit(false), gibbon(false), dog(false), orangutan(false),
+ xenopus(true), chimpanzee(false), cat(false), zebrafish(true), chicken(
+ true), dmelanogaster(true);
+
+ boolean modelOrganism; // the flag given to the constructor
+
+ private Species(boolean model)
+ {
+ this.modelOrganism = model;
+ }
+
+ boolean isModelOrganism()
+ {
+ return modelOrganism;
+ }
+}
diff --git a/src/jalview/ext/htsjdk/HtsContigDb.java b/src/jalview/ext/htsjdk/HtsContigDb.java
new file mode 100644
index 0000000..f3b5098
--- /dev/null
+++ b/src/jalview/ext/htsjdk/HtsContigDb.java
@@ -0,0 +1,210 @@
+package jalview.ext.htsjdk;
+
+import htsjdk.samtools.SAMSequenceDictionary;
+import htsjdk.samtools.SAMSequenceRecord;
+import htsjdk.samtools.reference.ReferenceSequence;
+import htsjdk.samtools.reference.ReferenceSequenceFile;
+import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
+import htsjdk.samtools.util.StringUtil;
+import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceI;
+
+import java.io.File;
+import java.math.BigInteger;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * a source of sequence data accessed via the HTSJDK
+ *
+ * @author jprocter
+ *
+ */
+public class HtsContigDb
+{
+
+ private String name;
+
+ private File dbLocation;
+
+ private htsjdk.samtools.reference.ReferenceSequenceFile refFile = null;
+
+ public HtsContigDb(String name, File descriptor) throws Exception
+ {
+ if (descriptor.isFile())
+ {
+ this.name = name;
+ dbLocation = descriptor;
+ }
+ initSource(); // NOTE(review): also runs when descriptor is not a file (dbLocation still null) - confirm intended
+ }
+
+ private void initSource() throws Exception
+ {
+ if (refFile != null)
+ {
+ return; // reference file already opened
+ }
+
+ refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(
+ dbLocation, true);
+ if (refFile == null || refFile.getSequenceDictionary() == null)
+ {
+ // NOTE(review): fallback is disabled, so nothing happens here - refFile = initSequenceDictionaryFor(dbLocation);
+ }
+
+ }
+
+
+ SAMSequenceDictionary rrefDict = null;
+ private ReferenceSequenceFile initSequenceDictionaryFor(File dbLocation2) throws Exception
+ {
+   rrefDict = getDictionary(dbLocation2, true);
+   if (rrefDict == null)
+   {
+     return null;
+   }
+   // a dictionary was obtained, so open the reference file itself
+   return ReferenceSequenceFileFactory.getReferenceSequenceFile(dbLocation2, true);
+ }
+ /**
+ * code below hacked out from picard ----
+ *
+ * picard/src/java/picard/sam/CreateSequenceDictionary.java
+ * https://github.com/
+ * broadinstitute/picard/commit/270580d3e28123496576f0b91b3433179bb5d876
+ */
+
+
+ /*
+ * The MIT License
+ *
+ * Copyright (c) 2009 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+ /**
+ *
+ * @param f
+ * @param truncate
+ * @return
+ * @throws Exception
+ */
+ SAMSequenceDictionary getDictionary(File f, boolean truncate)
+ throws Exception
+ {
+ if (md5 == null)
+ {
+ initCreateSequenceDictionary();
+ }
+ final ReferenceSequenceFile refSeqFile = ReferenceSequenceFileFactory
+ .getReferenceSequenceFile(f, truncate);
+ ReferenceSequence refSeq;
+ List ret = new ArrayList();
+ Set