From 2ea52575d29eccae73a4262f52f59ff3fcf713b1 Mon Sep 17 00:00:00 2001 From: jprocter Date: Thu, 12 Apr 2007 14:46:07 +0000 Subject: [PATCH] basic support for multi-exon CDS features --- src/jalview/datamodel/xdb/embl/EmblEntry.java | 165 +++++++++++++++++++- .../datamodel/xdb/embl/EmblFeatureLocations.java | 18 +++ 2 files changed, 176 insertions(+), 7 deletions(-) diff --git a/src/jalview/datamodel/xdb/embl/EmblEntry.java b/src/jalview/datamodel/xdb/embl/EmblEntry.java index 22437e7..6a1146d 100644 --- a/src/jalview/datamodel/xdb/embl/EmblEntry.java +++ b/src/jalview/datamodel/xdb/embl/EmblEntry.java @@ -5,6 +5,8 @@ import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; +import java.util.Enumeration; +import java.util.Hashtable; import java.util.Iterator; import java.util.Vector; @@ -165,7 +167,148 @@ public class EmblEntry { public void setVersion(String version) { this.version = version; } +/* + * EMBL Feature support is limited. The text below is included for the benefit of + * any developer working on improving EMBL feature import in Jalview. + * Extract from EMBL feature specification + * see http://www.embl-ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.html +3.5 Location +3.5.1 Purpose +The location indicates the region of the presented sequence which corresponds +to a feature. + +3.5.2 Format and conventions +The location contains at least one sequence location descriptor and may +contain one or more operators with one or more sequence location descriptors. +Base numbers refer to the numbering in the entry. This numbering designates +the first base (5' end) of the presented sequence as base 1. +Base locations beyond the range of the presented sequence may not be used in +location descriptors, the only exception being location in a remote entry (see +3.5.2.1, e). + +Location operators and descriptors are discussed in more detail below. 
+ +3.5.2.1 Location descriptors + +The location descriptor can be one of the following: +(a) a single base number +(b) a site between two indicated adjoining bases +(c) a single base chosen from within a specified range of bases (not allowed for new + entries) +(d) the base numbers delimiting a sequence span +(e) a remote entry identifier followed by a local location descriptor + (i.e., a-d) + +A site between two adjoining nucleotides, such as endonucleolytic cleavage +site, is indicated by listing the two points separated by a carat (^). The +permitted formats for this descriptor are n^n+1 (for example 55^56), or, for +circular molecules, n^1, where "n" is the full length of the molecule, ie +1000^1 for circular molecule with length 1000. + +A single base chosen from a range of bases is indicated by the first base +number and the last base number of the range separated by a single period +(e.g., '12.21' indicates a single base taken from between the indicated +points). From October 2006 the usage of this descriptor is restricted : +it is illegal to use "a single base from a range" (c) either on its own or +in combination with the "sequence span" (d) descriptor for newly created entries. +The existing entries where such descriptors exist are going to be retrofitted. + +Sequence spans are indicated by the starting base number and the ending base +number separated by two periods (e.g., '34..456'). The '<' and '>' symbols may +be used with the starting and ending base numbers to indicate that an end +point is beyond the specified base number. The starting and ending base +positions can be represented as distinct base numbers ('34..456') or a site +between two indicated adjoining bases. + +A location in a remote entry (not the entry to which the feature table +belongs) can be specified by giving the accession-number and sequence version +of the remote entry, followed by a colon ":", followed by a location +descriptor which applies to that entry's sequence (i.e. 
J12345.1:1..15, see +also examples below) + +3.5.2.2 Operators + +The location operator is a prefix that specifies what must be done to the +indicated sequence to find or construct the location corresponding to the +feature. A list of operators is given below with their definitions and most +common format. + +complement(location) +Find the complement of the presented sequence in the span specified by " +location" (i.e., read the complement of the presented strand in its 5'-to-3' +direction) + +join(location,location, ... location) +The indicated elements should be joined (placed end-to-end) to form one +contiguous sequence + +order(location,location, ... location) +The elements can be found in the +specified order (5' to 3' direction), but nothing is implied about the +reasonableness about joining them + +Note : location operator "complement" can be used in combination with either " +join" or "order" within the same location; combinations of "join" and "order" +within the same location (nested operators) are illegal. + + + +3.5.3 Location examples + +The following is a list of common location descriptors with their meanings: + +Location Description + +467 Points to a single base in the presented sequence + +340..565 Points to a continuous range of bases bounded by and + including the starting and ending bases + +<345..500 Indicates that the exact lower boundary point of a feature + is unknown. 
The location begins at some base previous to + the first base specified (which need not be contained in + the presented sequence) and continues to and includes the + ending base + +<1..888 The feature starts before the first sequenced base and + continues to and includes base 888 + +1..>888 The feature starts at the first sequenced base and + continues beyond base 888 + +102.110 Indicates that the exact location is unknown but that it is + one of the bases between bases 102 and 110, inclusive + +123^124 Points to a site between bases 123 and 124 + +join(12..78,134..202) Regions 12 to 78 and 134 to 202 should be joined to form + one contiguous sequence + + +complement(34..126) Start at the base complementary to 126 and finish at the + base complementary to base 34 (the feature is on the strand + complementary to the presented strand) + + +complement(join(2691..4571,4918..5163)) + Joins regions 2691 to 4571 and 4918 to 5163, then + complements the joined segments (the feature is on the + strand complementary to the presented strand) + +join(complement(4918..5163),complement(2691..4571)) + Complements regions 4918 to 5163 and 2691 to 4571, then + joins the complemented segments (the feature is on the + strand complementary to the presented strand) + +J00194.1:100..202 Points to bases 100 to 202, inclusive, in the entry (in + this database) with primary accession number 'J00194' + +join(1..100,J00194.1:100..202) + Joins region 1..100 of the existing entry with the region + 100..202 of remote entry J00194 + + */ /** * Recover annotated sequences from EMBL file * @param noNa don't return nucleic acid sequences @@ -215,6 +358,7 @@ public class EmblEntry { String prseq=null; String prname=new String(); String prid=null; + Hashtable vals=new Hashtable(); int prstart=1; // get qualifiers if (feature.getQualifiers()!=null && feature.getQualifiers().size()>0) { @@ -234,13 +378,12 @@ public class EmblEntry { { prstart = Integer.parseInt(q.getValue()); } - else { - // throw 
anything else into the title - if (prname.length()==0) { - prname = q.getValue(); - } else { - prname = prname + q.getName()+":"+q.getValue(); - } + else + if (q.getName().equals("product")){ + prname = q.getValue(); + } else { + // throw anything else into the additional properties hash + vals.put(q.getName(), q.getValue()); } } } @@ -262,6 +405,14 @@ public class EmblEntry { sf.setType(feature.getName()); sf.setFeatureGroup(jalview.datamodel.DBRefSource.EMBL); sf.setDescription("Exon "+(1+xint)+" for protein '"+prname+"' EMBLCDS:"+prid); + if (vals!=null && vals.size()>0) { + Enumeration kv = vals.keys(); /* enumerate the qualifier names (keys) so each name/value pair is copied onto the feature; elements() would enumerate the values instead */ + while (kv.hasMoreElements()) { + Object key=kv.nextElement(); + if (key!=null) + sf.setValue(key.toString(), vals.get(key)); + } + } dna.addSequenceFeature(sf); } } diff --git a/src/jalview/datamodel/xdb/embl/EmblFeatureLocations.java b/src/jalview/datamodel/xdb/embl/EmblFeatureLocations.java index 878042d..bbd3948 100644 --- a/src/jalview/datamodel/xdb/embl/EmblFeatureLocations.java +++ b/src/jalview/datamodel/xdb/embl/EmblFeatureLocations.java @@ -45,6 +45,7 @@ public class EmblFeatureLocations { } /** * Return all location elements as start-end pairs on referenced sequence + * TODO: pass back complement and 'less than or more than' range information * @return int[] { start1, end1, ... 
} */ public int[] getElementRanges() { @@ -61,6 +62,23 @@ public class EmblFeatureLocations { } return se; } + if ("join".equalsIgnoreCase(locationType)) { /* constant-first comparison: a null locationType falls through to the guard below instead of throwing here */ + int[] se = new int[locElements.size()*2]; + int sepos=0; + for (Iterator le=locElements.iterator();le.hasNext();) { + EmblFeatureLocElement loce = (EmblFeatureLocElement) le.next(); + BasePosition bp[] = loce.getBasePositions(); + if (bp.length==2) { /* NOTE(review): elements without exactly two base positions are skipped, leaving trailing zero pairs in se - confirm callers tolerate this */ + se[sepos++] = Integer.parseInt(bp[0].getPos()); + se[sepos++] = Integer.parseInt(bp[1].getPos()); + } + } + return se; + } + if (locationType!=null) + { + jalview.bin.Cache.log.error("EmblFeatureLocations.getElementRanges cannot deal with locationType=='"+locationType+"'"); + } return null; } } \ No newline at end of file -- 1.7.10.2