From: jprocter Date: Fri, 13 Jul 2007 15:04:44 +0000 (+0000) Subject: debugged mappings and translated sequence recovery X-Git-Tag: Release_2_4~350 X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=385e63c78d289d4beae9c1ad2b187c0ea311ffbc;p=jalview.git debugged mappings and translated sequence recovery --- diff --git a/src/jalview/datamodel/DBRefEntry.java b/src/jalview/datamodel/DBRefEntry.java index e5eecbb..8bbb5f7 100755 --- a/src/jalview/datamodel/DBRefEntry.java +++ b/src/jalview/datamodel/DBRefEntry.java @@ -102,4 +102,18 @@ public Mapping getMap() { public void setMap(Mapping map) { this.map = map; } +public boolean hasMap() +{ + // TODO Auto-generated method stub + return map!=null; +} +/** + * + * @return source+":"+accessionId + */ +public String getSrcAccString() +{ + return ((source!=null) ? source : "") + + ":" + ((accessionId!=null) ? accessionId : ""); +} } diff --git a/src/jalview/datamodel/Mapping.java b/src/jalview/datamodel/Mapping.java index 0063faa..e54b53e 100644 --- a/src/jalview/datamodel/Mapping.java +++ b/src/jalview/datamodel/Mapping.java @@ -1,121 +1,157 @@ package jalview.datamodel; +import java.util.Vector; + import jalview.util.MapList; -public class Mapping { +public class Mapping +{ /** - * Contains the - * start-end pairs mapping from - * the associated sequence to the - * sequence in the database - * coordinate system - * it also takes care of step difference between coordinate systems + * Contains the start-end pairs mapping from the associated sequence to the + * sequence in the database coordinate system it also takes care of step + * difference between coordinate systems */ - MapList map=null; + MapList map = null; + /** * The seuqence that map maps the associated seuqence to (if any). */ - SequenceI to=null; - public Mapping(MapList map) { + SequenceI to = null; + + public Mapping(MapList map) + { super(); this.map = map; } - public Mapping(SequenceI to, MapList map) { + + public Mapping(SequenceI to, MapList map) + { this(map); this.to = to; } + /** * create a new mapping from - * @param to the sequence being mapped - * @param exon int[] {start,end,start,end} series on associated sequence - * @param is int[] {start,end,...} ranges on the reference frame being mapped to - * @param i step size on associated sequence - * @param j step size on mapped frame + * + * @param to + * the sequence being mapped + * @param exon + * int[] {start,end,start,end} series on associated sequence + * @param is + * int[] {start,end,...} ranges on the reference frame being mapped + * to + * @param i + * step size on associated sequence + * @param j + * step size on mapped frame */ public Mapping(SequenceI to, int[] exon, int[] is, int i, int j) { this(to, new MapList(exon, is, i, j)); } + /** - * create a duplicate (and independent) mapping object with - * the same reference to any SequenceI being mapped to. + * create a duplicate (and independent) mapping object with the same reference + * to any SequenceI being mapped to. + * * @param map2 */ public Mapping(Mapping map2) { - if (map2!=this && map2!=null) { - if (map2.map!=null) + if (map2 != this && map2 != null) + { + if (map2.map != null) { - map=new MapList(map2.map); + map = new MapList(map2.map); } to = map2.to; } } + /** * @return the map */ - public MapList getMap() { + public MapList getMap() + { return map; } /** - * @param map the map to set + * @param map + * the map to set */ - public void setMap(MapList map) { + public void setMap(MapList map) + { this.map = map; } + /** * Equals that compares both the to references and MapList mappings. + * * @param other * @return */ - public boolean equals(Mapping other) { - if (other==null) + public boolean equals(Mapping other) + { + if (other == null) return false; - if (other==this) + if (other == this) return true; - if (other.to!=to) + if (other.to != to) return false; - if ((map!=null && other.map==null) || (map==null && other.map!=null)) + if ((map != null && other.map == null) + || (map == null && other.map != null)) return false; if (map.equals(other.map)) return true; return false; } + /** - * get the 'initial' position in the associated - * sequence for a position in the mapped reference frame + * get the 'initial' position in the associated sequence for a position in the + * mapped reference frame + * * @param mpos * @return */ public int getPosition(int mpos) { - if (map!=null) { + if (map != null) + { int[] mp = map.shiftTo(mpos); - if (mp!=null) + if (mp != null) { return mp[0]; } } return mpos; } + /** - * gets boundary in direction of mapping - * @param position in mapped reference frame - * @return int{start, end} positions in associated sequence (in direction of mapped word) + * gets boundary in direction of mapping + * + * @param position + * in mapped reference frame + * @return int{start, end} positions in associated sequence (in direction of + * mapped word) */ - public int[] getWord(int mpos) { - if (map!=null) { + public int[] getWord(int mpos) + { + if (map != null) + { return map.getToWord(mpos); } return null; } + /** * width of mapped unit in associated sequence - * + * */ - public int getWidth() { - if (map!=null) { + public int getWidth() + { + if (map != null) + { return map.getFromRatio(); } return 1; @@ -123,103 +159,263 @@ public class Mapping { /** * width of unit in mapped reference frame + * * @return */ - public int getMappedWidth() { - if (map!=null) { + public int getMappedWidth() + { + if (map != null) + { return map.getToRatio(); } return 1; } + /** - * get mapped position in the associated - * reference frame for position pos in the - * associated sequence. + * get mapped position in the associated reference frame for position pos in + * the associated sequence. + * * @param pos * @return */ - public int getMappedPosition(int pos) { - if (map!=null) { + public int getMappedPosition(int pos) + { + if (map != null) + { int[] mp = map.shiftFrom(pos); - if (mp!=null) + if (mp != null) { return mp[0]; } } return pos; } - public int[] getMappedWord(int pos) { - if (map!=null) { + + public int[] getMappedWord(int pos) + { + if (map != null) + { int[] mp = map.shiftFrom(pos); - if (mp!=null) + if (mp != null) { - return new int[] { mp[0], mp[0]+mp[2]*(map.getToRatio()-1)}; + return new int[] + { mp[0], mp[0] + mp[2] * (map.getToRatio() - 1) }; } } return null; } + /** - * locates the region of feature f in the associated sequence's reference frame + * locates the region of feature f in the associated sequence's reference + * frame + * * @param f * @return one or more features corresponding to f */ public SequenceFeature[] locateFeature(SequenceFeature f) { - if (true) { // f.getBegin()!=f.getEnd()) { - if (map!=null) { + if (true) + { // f.getBegin()!=f.getEnd()) { + if (map != null) + { int[] frange = map.locateInFrom(f.getBegin(), f.getEnd()); - SequenceFeature[] vf = new SequenceFeature[frange.length/2]; - for (int i=0,v=0;i2) - vf[v].setDescription(f.getDescription() +"\nPart "+v); + vf[v].setEnd(frange[i + 1]); + if (frange.length > 2) + vf[v].setDescription(f.getDescription() + "\nPart " + v); } return vf; } } - if (false) //else + if (false) // else { int[] word = getWord(f.getBegin()); - if (word[0]word[1]) + if (word[0] > word[1]) { f.setEnd(word[0]); - } else { + } + else + { f.setEnd(word[1]); } } // give up and just return the feature. - return new SequenceFeature[] { f }; + return new SequenceFeature[] + { f }; } - /** - * return a series of contigs on the associated sequence corresponding to - * the from,to interval on the mapped reference frame + /** + * return a series of contigs on the associated sequence corresponding to the + * from,to interval on the mapped reference frame + * * @param from * @param to - * @return + * @return int[] { from_i, to_i for i=1 to n contiguous regions in the + * associated sequence} */ - public int[] locateRange(int from, int to) { - //TODO - return null; + public int[] locateRange(int from, int to) + { + if (map != null) + { + if (from <= to) + { + from = (map.getToLowest() < from) ? from : map.getToLowest(); + to = (map.getToHighest() > to) ? to : map.getToHighest(); + if (from > to) + return null; + } + else + { + from = (map.getToHighest() > from) ? from : map.getToHighest(); + to = (map.getToLowest() < to) ? to : map.getToLowest(); + if (from < to) + return null; + } + return map.locateInFrom(from, to); + } + return new int[] + { from, to }; } + /** - * return a series of contigs on the mapped reference frame corresponding to - * the from,to interval on the associated sequence + * return a series of mapped contigs mapped from a range on the associated + * sequence + * * @param from * @param to * @return */ - public int[] locateMappedRange(int from, int to) { - //TODO - return null; + public int[] locateMappedRange(int from, int to) + { + if (map != null) + { + + if (from <= to) + { + from = (map.getFromLowest() < from) ? from : map.getFromLowest(); + to = (map.getFromHighest() > to) ? to : map.getFromHighest(); + if (from > to) + return null; + } + else + { + from = (map.getFromHighest() > from) ? from : map.getFromHighest(); + to = (map.getFromLowest() < to) ? to : map.getFromLowest(); + if (from < to) + return null; + } + return map.locateInTo(from, to); + } + return new int[] + { from, to }; + } + + /** + * return a new mapping object with a maplist modifed to only map the visible + * regions defined by viscontigs. + * + * @param viscontigs + * @return + */ + public Mapping intersectVisContigs(int[] viscontigs) + { + Mapping copy = new Mapping(this); + if (map != null) + { + int vpos = 0; + int apos = 0; + Vector toRange = new Vector(); + Vector fromRange = new Vector(); + for (int vc = 0; vc < viscontigs.length; vc += 2) + { + // find a mapped range in this visible region + int[] mpr = locateMappedRange(1+viscontigs[vc], viscontigs[vc + 1]-1); + if (mpr != null) + { + for (int m = 0; m < mpr.length; m += 2) + { + toRange.addElement(new int[] + { mpr[m], mpr[m + 1] }); + int[] xpos = locateRange(mpr[m], mpr[m + 1]); + for (int x = 0; x < xpos.length; x += 2) + { + fromRange.addElement(new int[] + { xpos[x], xpos[x + 1] }); + } + } + } + } + int[] from = new int[fromRange.size()*2]; + int[] to = new int[toRange.size()*2]; + int[] r; + for (int f=0,fSize=fromRange.size(); f Protein exon map and a range of visContigs + */ + MapList fk = new MapList(new int[] { 1,6,8,13,15,23}, new int[] { 1,7}, 3, 1); + Mapping m = new Mapping(fk); + Mapping m_1 = m.intersectVisContigs(new int[] {fk.getFromLowest(), fk.getFromHighest()}); + Mapping m_2 = m.intersectVisContigs(new int[] {1,7,11,20}); + System.out.println(""+m_1.map.getFromRanges()); + + + } + /** + * get the sequence being mapped to - if any + * @return null or a dataset sequence + */ + public SequenceI getTo() + { + return to; + } + /** + * set the dataset sequence being mapped to if any + * @param tto + */ + public void setTo(SequenceI tto) + { + to = tto; + } + + /* (non-Javadoc) + * @see java.lang.Object#finalize() + */ + protected void finalize() throws Throwable + { + map = null; + to = null; + super.finalize(); } + } diff --git a/src/jalview/datamodel/xdb/embl/EmblEntry.java b/src/jalview/datamodel/xdb/embl/EmblEntry.java index a22eff7..906122d 100644 --- a/src/jalview/datamodel/xdb/embl/EmblEntry.java +++ b/src/jalview/datamodel/xdb/embl/EmblEntry.java @@ -3,6 +3,7 @@ package jalview.datamodel.xdb.embl; import jalview.datamodel.DBRefEntry; import jalview.datamodel.DBRefSource; import jalview.datamodel.FeatureProperties; +import jalview.datamodel.Mapping; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; @@ -405,7 +406,6 @@ public class EmblEntry { for (Iterator i = features.iterator(); i.hasNext();) { - boolean nextFeature=false; EmblFeature feature = (EmblFeature) i.next(); if (!noNa) { @@ -416,133 +416,9 @@ public class EmblEntry ; } } - if (FeatureProperties.isCodingFeature(DBRefSource.EMBL, "CDS")) + if (FeatureProperties.isCodingFeature(sourceDb, feature.getName())) { - // extract coding region(s) - jalview.datamodel.Mapping map = null; - int[] exon = null; - if (feature.locations != null && feature.locations.size() > 0) - { - for (Enumeration locs = feature.locations.elements(); locs - .hasMoreElements();) - { - EmblFeatureLocations loc = (EmblFeatureLocations) locs - .nextElement(); - int[] se = loc.getElementRanges(accession); - if (exon == null) - { - exon = se; - } - else - { - int[] t = new int[exon.length + se.length]; - System.arraycopy(exon, 0, t, 0, exon.length); - System.arraycopy(se, 0, t, exon.length, se.length); - exon = t; - } - } - } - String prseq = null; - String prname = new String(); - String prid = null; - Hashtable vals = new Hashtable(); - int prstart = 1; - // get qualifiers - if (feature.getQualifiers() != null - && feature.getQualifiers().size() > 0) - { - for (Iterator quals = feature.getQualifiers().iterator(); quals - .hasNext();) - { - Qualifier q = (Qualifier) quals.next(); - if (q.getName().equals("translation")) - { - prseq = q.getValues()[0]; - } - else if (q.getName().equals("protein_id")) - { - prid = q.getValues()[0]; - } - else if (q.getName().equals("codon_start")) - { - prstart = Integer.parseInt(q.getValues()[0]); - } - else if (q.getName().equals("product")) - { - prname = q.getValues()[0]; - } - else - { - // throw anything else into the additional properties hash - vals.put(q.getName(), q.getValues().toString()); - } - } - } - Sequence product = null; - if (prseq != null && prname != null && prid != null) - { - // extract proteins. - if (!noPeptide) - { - product = new Sequence(sourceDb + "|" + "EMBLCDS|" + prid - + "|" + prname, prseq, prstart, prstart - + prseq.length() - 1); - product.setDescription("Protein Product from " + sourceDb); - seqs.add(product); - } - // we have everything - create the mapping and perhaps the protein - // sequence - map = new jalview.datamodel.Mapping(product, exon, new int[] - { prstart, prstart + prseq.length() - 1 }, 3, 1); - // add cds feature to dna seq - this may include the stop codon - for (int xint = 0; xint < exon.length; xint += 2) - { - SequenceFeature sf = new SequenceFeature(); - sf.setBegin(exon[xint]); - sf.setEnd(exon[xint + 1]); - sf.setType(feature.getName()); - sf.setFeatureGroup(jalview.datamodel.DBRefSource.EMBL); - sf.setDescription("Exon " + (1 + xint) + " for protein '" - + prname + "' EMBLCDS:" + prid); - sf.setValue(FeatureProperties.EXONPOS, new Integer(1+xint)); - sf.setValue(FeatureProperties.EXONPRODUCT, prname); - if (vals != null && vals.size() > 0) - { - Enumeration kv = vals.elements(); - while (kv.hasMoreElements()) - { - Object key = kv.nextElement(); - if (key != null) - sf.setValue(key.toString(), vals.get(key)); - } - } - dna.addSequenceFeature(sf); - } - } - // add dbRefs to sequence - if (feature.dbRefs != null && feature.dbRefs.size() > 0) - { - for (Iterator dbr = feature.dbRefs.iterator(); dbr.hasNext();) - { - DBRefEntry ref = (DBRefEntry) dbr.next(); - ref.setSource(jalview.util.DBRefUtils.getCanonicalName(ref - .getSource())); - // Hard code the kind of protein product accessions that EMBL cite - if (ref.getSource().equals( - jalview.datamodel.DBRefSource.UNIPROT)) - { - ref.setMap(map); - } - if (product != null) - { - DBRefEntry pref = new DBRefEntry(ref.getSource(), ref - .getVersion(), ref.getAccessionId()); - pref.setMap(null); // reference is direct - } - dna.addDBRef(ref); - } - } - + parseCodingFeature(feature, sourceDb, seqs, dna, noPeptide); } else { @@ -558,15 +434,17 @@ public class EmblEntry } } } - } catch (Exception e) + } + catch (Exception e) { System.err.println("EMBL Record Features parsing error!"); - System.err.println("Please report the following to help@jalview.org :"); - System.err.println("EMBL Record "+accession); - System.err.println("Resulted in exception: "+e.getMessage()); + System.err + .println("Please report the following to help@jalview.org :"); + System.err.println("EMBL Record " + accession); + System.err.println("Resulted in exception: " + e.getMessage()); e.printStackTrace(System.err); } - if (!noNa && dna!=null) + if (!noNa && dna != null) { seqs.add(dna); } @@ -578,4 +456,218 @@ public class EmblEntry } return sqs; } + + /** + * attempt to extract coding region and product from a feature and properly decorate it with annotations. + * @param feature coding feature + * @param sourceDb source database for the EMBLXML + * @param seqs place where sequences go + * @param dna parent dna sequence for this record + * @param noPeptide flag for generation of Peptide sequence objects + */ + private void parseCodingFeature(EmblFeature feature, String sourceDb, Vector seqs, Sequence dna, boolean noPeptide) + { + boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS); + // extract coding region(s) + jalview.datamodel.Mapping map = null; + int[] exon = null; + if (feature.locations != null && feature.locations.size() > 0) + { + for (Enumeration locs = feature.locations.elements(); locs + .hasMoreElements();) + { + EmblFeatureLocations loc = (EmblFeatureLocations) locs + .nextElement(); + int[] se = loc.getElementRanges(accession); + if (exon == null) + { + exon = se; + } + else + { + int[] t = new int[exon.length + se.length]; + System.arraycopy(exon, 0, t, 0, exon.length); + System.arraycopy(se, 0, t, exon.length, se.length); + exon = t; + } + } + } + String prseq = null; + String prname = new String(); + String prid = null; + Hashtable vals = new Hashtable(); + int prstart = 1; + // get qualifiers + if (feature.getQualifiers() != null + && feature.getQualifiers().size() > 0) + { + for (Iterator quals = feature.getQualifiers().iterator(); quals + .hasNext();) + { + Qualifier q = (Qualifier) quals.next(); + if (q.getName().equals("translation")) + { + StringBuffer prsq = new StringBuffer(q.getValues()[0]); + int p = prsq.indexOf(" "); + while (p > -1) + { + prsq.deleteCharAt(p); + p = prsq.indexOf(" ", p); + } + prseq = prsq.toString(); + prsq = null; + + } + else if (q.getName().equals("protein_id")) + { + prid = q.getValues()[0]; + } + else if (q.getName().equals("codon_start")) + { + prstart = Integer.parseInt(q.getValues()[0]); + } + else if (q.getName().equals("product")) + { + prname = q.getValues()[0]; + } + else + { + // throw anything else into the additional properties hash + vals.put(q.getName(), q.getValues().toString()); + } + } + } + Sequence product = null; + if (prseq != null && prname != null && prid != null) + { + // extract proteins. + product = new Sequence(sourceDb + "|" + "EMBLCDS|" + prid + +((prname.length()==0) ? "" : " " + prname), prseq, prstart, prstart + + prseq.length() - 1); + product.setDescription("Protein Product from " + sourceDb); + + if (!noPeptide) + { + // Protein is also added to vector of sequences returned + seqs.add(product); + } + // we have everything - create the mapping and perhaps the protein + // sequence + if (exon == null || exon.length == 0) + { + System.err + .println("Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect (" + + sourceDb + ":" + getAccession() + ")"); + if (prseq.length() * 3 == dna.getSequence().length) + { + // this might occur for CDS sequences where no features are + // marked. + exon = new int[] + { dna.getStart(), dna.getEnd() }; + map = new jalview.datamodel.Mapping(product, exon, + new int[] + { prstart, prstart + prseq.length() - 1 }, 3, 1); + } + if ((prseq.length() + 1) * 3 == dna.getSequence().length) + { + exon = new int[] + { dna.getStart(), dna.getEnd() - 3 }; + map = new jalview.datamodel.Mapping(product, exon, + new int[] + { prstart, prstart + prseq.length() - 1 }, 3, 1); + } + } + else + { + if (isEmblCdna) + { + // TODO: Add a DbRef back to the parent EMBL sequence with the exon + // map + + // make a new feature annotating the coding contig + } + else + { + map = new jalview.datamodel.Mapping(product, exon, + new int[] + { prstart, prstart + prseq.length() - 1 }, 3, 1); + // reconstruct the EMBLCDS entry + DBRefEntry pcdnaref = new DBRefEntry(); + pcdnaref.setAccessionId(prid); + pcdnaref.setSource(DBRefSource.EMBLCDS); + pcdnaref.setVersion(getVersion()); // same as parent EMBL version. + jalview.util.MapList mp = new jalview.util.MapList(new int[] + { 1+(prstart-1)*3, 1+(prstart-1)*3 + (prseq.length()-1)*3 }, new int[] { prstart, prstart+prseq.length() - 1 }, 3, 1); + pcdnaref.setMap(new Mapping(mp)); + if (product!=null) + product.addDBRef(pcdnaref); + + } + } + // add cds feature to dna seq - this may include the stop codon + for (int xint = 0; exon != null && xint < exon.length; xint += 2) + { + SequenceFeature sf = new SequenceFeature(); + sf.setBegin(exon[xint]); + sf.setEnd(exon[xint + 1]); + sf.setType(feature.getName()); + sf.setFeatureGroup(sourceDb); + sf.setDescription("Exon " + (1 + xint) + " for protein '" + + prname + "' EMBLCDS:" + prid); + sf.setValue(FeatureProperties.EXONPOS, new Integer(1 + xint)); + sf.setValue(FeatureProperties.EXONPRODUCT, prname); + if (vals != null && vals.size() > 0) + { + Enumeration kv = vals.elements(); + while (kv.hasMoreElements()) + { + Object key = kv.nextElement(); + if (key != null) + sf.setValue(key.toString(), vals.get(key)); + } + } + dna.addSequenceFeature(sf); + } + } + // add dbRefs to sequence + if (feature.dbRefs != null && feature.dbRefs.size() > 0) + { + for (Iterator dbr = feature.dbRefs.iterator(); dbr.hasNext();) + { + DBRefEntry ref = (DBRefEntry) dbr.next(); + ref.setSource(jalview.util.DBRefUtils.getCanonicalName(ref + .getSource())); + // Hard code the kind of protein product accessions that EMBL cite + if (ref.getSource().equals( + jalview.datamodel.DBRefSource.UNIPROT)) + { + ref.setMap(map); + /*if (map.getTo()!=null) + { + map.getTo().setName(map.getTo().getName()+"|"+ref.getSource()+"|"+ref.getAccessionId()); + }*/ + } + if (product != null) + { + DBRefEntry pref = new DBRefEntry(ref.getSource(), ref + .getVersion(), ref.getAccessionId()); + pref.setMap(null); // reference is direct + product.addDBRef(pref); + // Add converse mapping reference + if (map != null) + { + Mapping pmap = new Mapping(dna, map.getMap().getInverse()); + pref = new DBRefEntry(sourceDb, getVersion(), this + .getAccession()); + pref.setMap(pmap); + if (map.getTo()!=null) + { + map.getTo().addDBRef(pref); + } + } + } + dna.addDBRef(ref); + } + } + } } diff --git a/src/jalview/datamodel/xdb/embl/EmblFeatureLocations.java b/src/jalview/datamodel/xdb/embl/EmblFeatureLocations.java index a9fe76e..0341ec3 100644 --- a/src/jalview/datamodel/xdb/embl/EmblFeatureLocations.java +++ b/src/jalview/datamodel/xdb/embl/EmblFeatureLocations.java @@ -78,7 +78,7 @@ public class EmblFeatureLocations { } } } - } + } else if (locationType.equalsIgnoreCase("join")) { for (Enumeration le=locElements.elements();le.hasMoreElements();) { EmblFeatureLocElement loce = (EmblFeatureLocElement) le.nextElement(); @@ -92,10 +92,13 @@ public class EmblFeatureLocations { } } return se; - } + } else if (locationType!=null) { - jalview.bin.Cache.log.error("EmbleFeatureLocations.getElementRanges cannot deal with locationType=='"+locationType+"'"); + if (jalview.bin.Cache.log!=null) + jalview.bin.Cache.log.error("EmbleFeatureLocations.getElementRanges cannot deal with locationType=='"+locationType+"'"); + else + System.err.println("EmbleFeatureLocations.getElementRanges cannot deal with locationType=='"+locationType+"'"); } // trim range if necessary. if (se!=null && sepos!=se.length)