src/jalview/datamodel/xdb/embl/EmblEntry.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.datamodel.xdb.embl;
  22
  23 import jalview.analysis.SequenceIdMatcher;
  24 import jalview.datamodel.DBRefEntry;
  25 import jalview.datamodel.DBRefSource;
  26 import jalview.datamodel.FeatureProperties;
  27 import jalview.datamodel.Mapping;
  28 import jalview.datamodel.Sequence;
  29 import jalview.datamodel.SequenceFeature;
  30 import jalview.datamodel.SequenceI;
  31 import jalview.util.DBRefUtils;
  32 import jalview.util.MapList;
  33 import jalview.util.MappingUtils;
  34 import jalview.util.StringUtils;
  35
  36 import java.util.Hashtable;
  37 import java.util.List;
  38 import java.util.Map;
  39 import java.util.Map.Entry;
  40 import java.util.Vector;
  41 import java.util.regex.Pattern;
  42
  43 /**
  44  * Data model for one entry returned from an EMBL query, as marshalled by a
  45  * Castor binding file
  46  *
  47  * For example:
  48  * http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=ena_sequence&id=J03321
  49  * &format=emblxml
  50  *
  51  * @see embl_mapping.xml
  52  */
  53 public class EmblEntry
  54 {
  55   private static final Pattern SPACE_PATTERN = Pattern.compile(" ");
  56
  57   String accession;
  58
  59   String version;
  60
  61   String taxDivision;
  62
  63   String desc;
  64
  65   String rCreated;
  66
  67   String rLastUpdated;
  68
  69   String lastUpdated;
  70
  71   Vector<String> keywords;
  72
  73   Vector<DBRefEntry> dbRefs;
  74
  75   Vector<EmblFeature> features;
  76
  77   EmblSequence sequence;
  78
  79   /**
  80    * @return the accession
  81    */
  82   public String getAccession()
  83   {
  84     return accession;
  85   }
  86
  87   /**
  88    * @param accession
  89    *          the accession to set
  90    */
  91   public void setAccession(String accession)
  92   {
  93     this.accession = accession;
  94   }
  95
  96   /**
  97    * @return the dbRefs
  98    */
  99   public Vector<DBRefEntry> getDbRefs()
 100   {
 101     return dbRefs;
 102   }
 103
 104   /**
 105    * @param dbRefs
 106    *          the dbRefs to set
 107    */
 108   public void setDbRefs(Vector<DBRefEntry> dbRefs)
 109   {
 110     this.dbRefs = dbRefs;
 111   }
 112
 113   /**
 114    * @return the desc
 115    */
 116   public String getDesc()
 117   {
 118     return desc;
 119   }
 120
 121   /**
 122    * @param desc
 123    *          the desc to set
 124    */
 125   public void setDesc(String desc)
 126   {
 127     this.desc = desc;
 128   }
 129
 130   /**
 131    * @return the features
 132    */
 133   public Vector<EmblFeature> getFeatures()
 134   {
 135     return features;
 136   }
 137
 138   /**
 139    * @param features
 140    *          the features to set
 141    */
 142   public void setFeatures(Vector<EmblFeature> features)
 143   {
 144     this.features = features;
 145   }
 146
 147   /**
 148    * @return the keywords
 149    */
 150   public Vector<String> getKeywords()
 151   {
 152     return keywords;
 153   }
 154
 155   /**
 156    * @param keywords
 157    *          the keywords to set
 158    */
 159   public void setKeywords(Vector<String> keywords)
 160   {
 161     this.keywords = keywords;
 162   }
 163
 164   /**
 165    * @return the lastUpdated
 166    */
 167   public String getLastUpdated()
 168   {
 169     return lastUpdated;
 170   }
 171
 172   /**
 173    * @param lastUpdated
 174    *          the lastUpdated to set
 175    */
 176   public void setLastUpdated(String lastUpdated)
 177   {
 178     this.lastUpdated = lastUpdated;
 179   }
 180
 181   /**
 182    * @return the releaseCreated
 183    */
 184   public String getRCreated()
 185   {
 186     return rCreated;
 187   }
 188
 189   /**
 190    * @param releaseCreated
 191    *          the releaseCreated to set
 192    */
 193   public void setRCreated(String releaseCreated)
 194   {
 195     this.rCreated = releaseCreated;
 196   }
 197
 198   /**
 199    * @return the releaseLastUpdated
 200    */
 201   public String getRLastUpdated()
 202   {
 203     return rLastUpdated;
 204   }
 205
 206   /**
 207    * @param releaseLastUpdated
 208    *          the releaseLastUpdated to set
 209    */
 210   public void setRLastUpdated(String releaseLastUpdated)
 211   {
 212     this.rLastUpdated = releaseLastUpdated;
 213   }
 214
 215   /**
 216    * @return the sequence
 217    */
 218   public EmblSequence getSequence()
 219   {
 220     return sequence;
 221   }
 222
 223   /**
 224    * @param sequence
 225    *          the sequence to set
 226    */
 227   public void setSequence(EmblSequence sequence)
 228   {
 229     this.sequence = sequence;
 230   }
 231
 232   /**
 233    * @return the taxDivision
 234    */
 235   public String getTaxDivision()
 236   {
 237     return taxDivision;
 238   }
 239
 240   /**
 241    * @param taxDivision
 242    *          the taxDivision to set
 243    */
 244   public void setTaxDivision(String taxDivision)
 245   {
 246     this.taxDivision = taxDivision;
 247   }
 248
 249   /**
 250    * @return the version
 251    */
 252   public String getVersion()
 253   {
 254     return version;
 255   }
 256
 257   /**
 258    * @param version
 259    *          the version to set
 260    */
 261   public void setVersion(String version)
 262   {
 263     this.version = version;
 264   }
 265
 266   /**
 267    * Recover annotated sequences from EMBL file
 268    *
 269    * @param sourceDb
 270    * @param peptides
 271    *          a list of protein products found so far (to add to)
 272    * @return dna dataset sequence with DBRefs and features
 273    */
 274   public SequenceI getSequence(String sourceDb, List<SequenceI> peptides)
 275   {
 276     SequenceI dna = new Sequence(sourceDb + "|" + accession,
 277             sequence.getSequence());
 278     dna.setDescription(desc);
 279     DBRefEntry retrievedref = new DBRefEntry(sourceDb, version, accession);
 280     dna.addDBRef(retrievedref);
 281     // add map to indicate the sequence is a valid coordinate frame for the
 282     // dbref
 283     retrievedref.setMap(new Mapping(null, new int[] { 1, dna.getLength() },
 284             new int[] { 1, dna.getLength() }, 1, 1));
 285     // TODO: transform EMBL Database refs to canonical form
 286     if (dbRefs != null)
 287     {
 288       for (DBRefEntry dbref : dbRefs)
 289       {
 290         dna.addDBRef(dbref);
 291       }
 292     }
 293
 294     try
 295     {
 296       for (EmblFeature feature : features)
 297       {
 298         if (feature.dbRefs != null)
 299         {
 300           for (DBRefEntry dbref : feature.dbRefs)
 301           {
 302             dna.addDBRef(dbref);
 303           }
 304         }
 305         if (FeatureProperties.isCodingFeature(sourceDb, feature.getName()))
 306         {
 307           parseCodingFeature(feature, sourceDb, dna, peptides);
 308         }
 309       }
 310     } catch (Exception e)
 311     {
 312       System.err.println("EMBL Record Features parsing error!");
 313       System.err
 314               .println("Please report the following to help@jalview.org :");
 315       System.err.println("EMBL Record " + accession);
 316       System.err.println("Resulted in exception: " + e.getMessage());
 317       e.printStackTrace(System.err);
 318     }
 319
 320     return dna;
 321   }
 322
 323   /**
 324    * Extracts coding region and product from a CDS feature and properly decorate
 325    * it with annotations.
 326    *
 327    * @param feature
 328    *          coding feature
 329    * @param sourceDb
 330    *          source database for the EMBLXML
 331    * @param dna
 332    *          parent dna sequence for this record
 333    * @param peptides
 334    *          list of protein product sequences for Embl entry
 335    */
 336   void parseCodingFeature(EmblFeature feature, String sourceDb,
 337           SequenceI dna, List<SequenceI> peptides)
 338   {
 339     boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
 340
 341     int[] exon = getCdsRanges(feature);
 342
 343     String prseq = null;
 344     String prname = "";
 345     String prid = null;
 346     Map<String, String> vals = new Hashtable<String, String>();
 347     SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
 348
 349     /*
 350      * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
 351      * (phase is required for CDS features in GFF3 format)
 352      */
 353     int codonStart = 1;
 354
 355     /*
 356      * parse qualifiers, saving protein translation, protein id,
 357      * codon start position, product (name), and 'other values'
 358      */
 359     if (feature.getQualifiers() != null)
 360     {
 361       for (Qualifier q : feature.getQualifiers())
 362       {
 363         String qname = q.getName();
 364         if (qname.equals("translation"))
 365         {
 366           // remove all spaces (precompiled String.replaceAll(" ", ""))
 367           prseq = SPACE_PATTERN.matcher(q.getValues()[0]).replaceAll("");
 368         }
 369         else if (qname.equals("protein_id"))
 370         {
 371           prid = q.getValues()[0];
 372         }
 373         else if (qname.equals("codon_start"))
 374         {
 375           try
 376           {
 377             codonStart = Integer.parseInt(q.getValues()[0]);
 378           } catch (NumberFormatException e)
 379           {
 380             System.err.println("Invalid codon_start in XML for "
 381                     + accession + ": " + e.getMessage());
 382           }
 383         }
 384         else if (qname.equals("product"))
 385         {
 386           // sometimes name is returned e.g. for V00488
 387           prname = q.getValues()[0];
 388         }
 389         else
 390         {
 391           // throw anything else into the additional properties hash
 392           String[] qvals = q.getValues();
 393           if (qvals != null)
 394           {
 395             String commaSeparated = StringUtils.arrayToSeparatorList(qvals,
 396                     ",");
 397             vals.put(qname, commaSeparated);
 398           }
 399         }
 400       }
 401     }
 402
 403     // SequenceI product = null;
 404     DBRefEntry protEMBLCDS = null;
 405     exon = MappingUtils.removeStartPositions(codonStart - 1, exon);
 406     boolean noProteinDbref = true;
 407
 408     SequenceI product = null;
 409     Mapping map = null;
 410     if (prseq != null && prname != null && prid != null)
 411     {
 412       /*
 413        * look for product in peptides list, if not found, add it
 414        */
 415       product = matcher.findIdMatch(prid);
 416       if (product == null)
 417       {
 418         product = new Sequence(prid, prseq, 1, prseq.length());
 419         product.setDescription(((prname.length() == 0) ? "Protein Product from "
 420                 + sourceDb
 421                 : prname));
 422         peptides.add(product);
 423         matcher.add(product);
 424       }
 425
 426       // we have everything - create the mapping and perhaps the protein
 427       // sequence
 428       if (exon == null || exon.length == 0)
 429       {
 430         System.err
 431                 .println("Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
 432                         + sourceDb + ":" + getAccession() + ")");
 433         if (prseq.length() * 3 == (1 - codonStart + dna.getSequence().length))
 434         {
 435           System.err
 436                   .println("Not allowing for additional stop codon at end of cDNA fragment... !");
 437           // this might occur for CDS sequences where no features are
 438           // marked.
 439           exon = new int[] { dna.getStart() + (codonStart - 1), dna.getEnd() };
 440           map = new Mapping(product, exon, new int[] { 1, prseq.length() },
 441                   3, 1);
 442         }
 443         if ((prseq.length() + 1) * 3 == (1 - codonStart + dna.getSequence().length))
 444         {
 445           System.err
 446                   .println("Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
 447           exon = new int[] { dna.getStart() + (codonStart - 1),
 448               dna.getEnd() - 3 };
 449           map = new Mapping(product, exon, new int[] { 1, prseq.length() },
 450                   3, 1);
 451         }
 452       }
 453       else
 454       {
 455         // Trim the exon mapping if necessary - the given product may only be a
 456         // fragment of a larger protein. (EMBL:AY043181 is an example)
 457
 458         if (isEmblCdna)
 459         {
 460           // TODO: Add a DbRef back to the parent EMBL sequence with the exon
 461           // map
 462           // if given a dataset reference, search dataset for parent EMBL
 463           // sequence if it exists and set its map
 464           // make a new feature annotating the coding contig
 465         }
 466         else
 467         {
 468           // final product length truncation check
 469           // TODO should from range include stop codon even if not in protein
 470           // in order to include stop codon in CDS sequence (as done for
 471           // Ensembl)?
 472           int[] cdsRanges = adjustForProteinLength(prseq.length(),
 473                   exon);
 474           map = new Mapping(product, cdsRanges, new int[] { 1, prseq.length() }, 3, 1);
 475           // reconstruct the EMBLCDS entry
 476           // TODO: this is only necessary when there codon annotation is
 477           // complete (I think JBPNote)
 478           DBRefEntry pcdnaref = new DBRefEntry();
 479           pcdnaref.setAccessionId(prid);
 480           pcdnaref.setSource(DBRefSource.EMBLCDS);
 481           pcdnaref.setVersion(getVersion()); // same as parent EMBL version.
 482           MapList mp = new MapList(new int[] { 1, prseq.length() },
 483                   new int[] { 1 + (codonStart - 1),
 484                       (codonStart - 1) + 3 * prseq.length() }, 1, 3);
 485           pcdnaref.setMap(new Mapping(mp));
 486           if (product != null)
 487           {
 488             product.addDBRef(pcdnaref);
 489             protEMBLCDS = new DBRefEntry(pcdnaref);
 490             protEMBLCDS.setSource(DBRefSource.EMBLCDSProduct);
 491             product.addDBRef(protEMBLCDS);
 492           }
 493         }
 494       }
 495       // add cds feature to dna seq - this may include the stop codon
 496       for (int xint = 0; exon != null && xint < exon.length; xint += 2)
 497       {
 498         SequenceFeature sf = makeCdsFeature(exon, xint, prname, prid, vals,
 499                 codonStart);
 500         sf.setType(feature.getName()); // "CDS"
 501         sf.setFeatureGroup(sourceDb);
 502         dna.addSequenceFeature(sf);
 503       }
 504     }
 505
 506     /*
 507      * add dbRefs to sequence, and mappings for Uniprot xrefs
 508      */
 509     if (feature.dbRefs != null)
 510     {
 511       boolean mappingUsed = false;
 512       for (DBRefEntry ref : feature.dbRefs)
 513       {
 514         ref.setSource(DBRefUtils.getCanonicalName(ref.getSource()));
 515         if (ref.getSource().equals(DBRefSource.UNIPROT))
 516         {
 517           String proteinSeqName = DBRefSource.UNIPROT + "|"
 518                   + ref.getAccessionId();
 519           if (map != null && map.getTo() != null)
 520           {
 521             if (mappingUsed)
 522             {
 523               /*
 524                * two or more Uniprot xrefs for the same CDS -
 525                * each needs a distinct Mapping (as to a different sequence)
 526                */
 527               map = new Mapping(map);
 528             }
 529             mappingUsed = true;
 530
 531             /*
 532              * try to locate the protein mapped to (possibly by a
 533              * previous CDS feature)
 534              */
 535             SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName);
 536             if (proteinSeq == null)
 537             {
 538               proteinSeq = new Sequence(proteinSeqName,
 539                       product
 540                       .getSequenceAsString());
 541               matcher.add(proteinSeq);
 542               peptides.add(proteinSeq);
 543             }
 544             map.setTo(proteinSeq);
 545             map.getTo().addDBRef(
 546                     new DBRefEntry(ref.getSource(), ref.getVersion(), ref
 547                             .getAccessionId()));
 548             ref.setMap(map);
 549           }
 550           noProteinDbref = false;
 551         }
 552         if (product != null)
 553         {
 554           DBRefEntry pref = new DBRefEntry(ref.getSource(),
 555                   ref.getVersion(), ref.getAccessionId());
 556           pref.setMap(null); // reference is direct
 557           product.addDBRef(pref);
 558           // Add converse mapping reference
 559           if (map != null)
 560           {
 561             Mapping pmap = new Mapping(dna, map.getMap().getInverse());
 562             pref = new DBRefEntry(sourceDb, getVersion(),
 563                     this.getAccession());
 564             pref.setMap(pmap);
 565             if (map.getTo() != null)
 566             {
 567               map.getTo().addDBRef(pref);
 568             }
 569           }
 570         }
 571         dna.addDBRef(ref);
 572       }
 573       if (noProteinDbref && product != null)
 574       {
 575         // add protein coding reference to dna sequence so xref matches
 576         if (protEMBLCDS == null)
 577         {
 578           protEMBLCDS = new DBRefEntry();
 579           protEMBLCDS.setAccessionId(prid);
 580           protEMBLCDS.setSource(DBRefSource.EMBLCDSProduct);
 581           protEMBLCDS.setVersion(getVersion());
 582           protEMBLCDS
 583                   .setMap(new Mapping(product, map.getMap().getInverse()));
 584         }
 585         product.addDBRef(protEMBLCDS);
 586
 587         // Add converse mapping reference
 588         if (map != null)
 589         {
 590           Mapping pmap = new Mapping(product, protEMBLCDS.getMap().getMap()
 591                   .getInverse());
 592           DBRefEntry ncMap = new DBRefEntry(protEMBLCDS);
 593           ncMap.setMap(pmap);
 594           if (map.getTo() != null)
 595           {
 596             dna.addDBRef(ncMap);
 597           }
 598         }
 599       }
 600     }
 601   }
 602
 603   /**
 604    * Helper method to construct a SequenceFeature for one cds range
 605    *
 606    * @param exons
 607    *          array of cds [start, end, ...] positions
 608    * @param exonStartIndex
 609    *          offset into the exons array
 610    * @param proteinName
 611    * @param proteinAccessionId
 612    * @param vals
 613    *          map of 'miscellaneous values' for feature
 614    * @param codonStart
 615    *          codon start position for CDS (1/2/3, normally 1)
 616    * @return
 617    */
 618   protected SequenceFeature makeCdsFeature(int[] exons, int exonStartIndex,
 619           String proteinName, String proteinAccessionId,
 620           Map<String, String> vals, int codonStart)
 621   {
 622     int exonNumber = exonStartIndex / 2 + 1;
 623     SequenceFeature sf = new SequenceFeature();
 624     sf.setBegin(Math.min(exons[exonStartIndex], exons[exonStartIndex + 1]));
 625     sf.setEnd(Math.max(exons[exonStartIndex], exons[exonStartIndex + 1]));
 626     sf.setDescription(String.format(
 627             "Exon %d for protein '%s' EMBLCDS:%s", exonNumber, proteinName,
 628             proteinAccessionId));
 629     sf.setPhase(String.valueOf(codonStart - 1));
 630     sf.setStrand(exons[exonStartIndex] <= exons[exonStartIndex + 1] ? "+" : "-");
 631     sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 632     sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
 633     if (!vals.isEmpty())
 634     {
 635       StringBuilder sb = new StringBuilder();
 636       boolean first = true;
 637       for (Entry<String, String> val : vals.entrySet())
 638       {
 639         if (!first)
 640         {
 641           sb.append(";");
 642         }
 643         sb.append(val.getKey()).append("=").append(val.getValue());
 644         first = false;
 645         sf.setValue(val.getKey(), val.getValue());
 646       }
 647       sf.setAttributes(sb.toString());
 648     }
 649     return sf;
 650   }
 651
 652   /**
 653    * Returns the CDS positions as a list of [start, end, start, end...]
 654    * positions. If on the reverse strand, these will be in descending order.
 655    *
 656    * @param feature
 657    * @return
 658    */
 659   protected int[] getCdsRanges(EmblFeature feature)
 660   {
 661     if (feature.locations == null)
 662     {
 663       return new int[] {};
 664     }
 665     int cdsBoundaryCount = 0; // count of all start/stop locations
 666     int[][] cdsLocations = new int[feature.locations.size()][];
 667     int locationNumber = 0;
 668     for (EmblFeatureLocations loc : feature.locations)
 669     {
 670       int[] locationRanges = loc.getElementRanges(accession);
 671       cdsLocations[locationNumber++] = locationRanges;
 672       cdsBoundaryCount += locationRanges.length;
 673     }
 674     int[] cdsRanges = new int[cdsBoundaryCount];
 675     int copyTo = 0;
 676     for (int[] ranges : cdsLocations)
 677     {
 678       System.arraycopy(ranges, 0, cdsRanges, copyTo, ranges.length);
 679       copyTo += ranges.length;
 680     }
 681     return cdsRanges;
 682
 683   }
 684
 685   /**
 686    * truncate the last exon interval to the prlength'th codon
 687    *
 688    * @param prlength
 689    * @param exon
 690    * @return new exon
 691    */
 692   private int[] adjustForProteinLength(int prlength, int[] exon)
 693   {
 694
 695     int origxon[], sxpos = -1, endxon = 0, cdslength = prlength * 3;
 696     // first adjust range for codon start attribute
 697     if (prlength >= 1 && exon != null)
 698     {
 699       origxon = new int[exon.length];
 700       System.arraycopy(exon, 0, origxon, 0, exon.length);
 701       int cdspos = 0;
 702       for (int x = 0; x < exon.length && sxpos == -1; x += 2)
 703       {
 704         cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
 705         if (cdslength <= cdspos)
 706         {
 707           // advanced beyond last codon.
 708           sxpos = x;
 709           if (cdslength != cdspos)
 710           {
 711             // System.err
 712             // .println("Truncating final exon interval on region by "
 713             // + (cdspos - cdslength));
 714           }
 715           // locate the new end boundary of final exon as endxon
 716           endxon = exon[x + 1] - cdspos + cdslength;
 717           break;
 718         }
 719       }
 720
 721       if (sxpos != -1)
 722       {
 723         // and trim the exon interval set if necessary
 724         int[] nxon = new int[sxpos + 2];
 725         System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 726         nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 727                                   // set
 728         exon = nxon;
 729       }
 730     }
 731     return exon;
 732   }
 733 }