src/jalview/datamodel/xdb/embl/EmblEntry.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.datamodel.xdb.embl;
  22
  23 import jalview.analysis.SequenceIdMatcher;
  24 import jalview.datamodel.DBRefEntry;
  25 import jalview.datamodel.DBRefSource;
  26 import jalview.datamodel.FeatureProperties;
  27 import jalview.datamodel.Mapping;
  28 import jalview.datamodel.Sequence;
  29 import jalview.datamodel.SequenceFeature;
  30 import jalview.datamodel.SequenceI;
  31 import jalview.util.DBRefUtils;
  32 import jalview.util.MapList;
  33 import jalview.util.MappingUtils;
  34 import jalview.util.StringUtils;
  35
  36 import java.util.Hashtable;
  37 import java.util.List;
  38 import java.util.Map;
  39 import java.util.Map.Entry;
  40 import java.util.Vector;
  41 import java.util.regex.Pattern;
  42
  43 /**
  44  * Data model for one entry returned from an EMBL query, as marshalled by a
  45  * Castor binding file
  46  *
  47  * For example:
  48  * http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=ena_sequence&id=J03321
  49  * &format=emblxml
  50  *
  51  * @see embl_mapping.xml
  52  */
  53 public class EmblEntry
  54 {
  55   private static final Pattern SPACE_PATTERN = Pattern.compile(" ");
  56
  57   String accession;
  58
  59   String version;
  60
  61   String taxDivision;
  62
  63   String desc;
  64
  65   String rCreated;
  66
  67   String rLastUpdated;
  68
  69   String lastUpdated;
  70
  71   Vector<String> keywords;
  72
  73   Vector<DBRefEntry> dbRefs;
  74
  75   Vector<EmblFeature> features;
  76
  77   EmblSequence sequence;
  78
  79   /**
  80    * @return the accession
  81    */
  82   public String getAccession()
  83   {
  84     return accession;
  85   }
  86
  87   /**
  88    * @param accession
  89    *          the accession to set
  90    */
  91   public void setAccession(String accession)
  92   {
  93     this.accession = accession;
  94   }
  95
  96   /**
  97    * @return the dbRefs
  98    */
  99   public Vector<DBRefEntry> getDbRefs()
 100   {
 101     return dbRefs;
 102   }
 103
 104   /**
 105    * @param dbRefs
 106    *          the dbRefs to set
 107    */
 108   public void setDbRefs(Vector<DBRefEntry> dbRefs)
 109   {
 110     this.dbRefs = dbRefs;
 111   }
 112
 113   /**
 114    * @return the desc
 115    */
 116   public String getDesc()
 117   {
 118     return desc;
 119   }
 120
 121   /**
 122    * @param desc
 123    *          the desc to set
 124    */
 125   public void setDesc(String desc)
 126   {
 127     this.desc = desc;
 128   }
 129
 130   /**
 131    * @return the features
 132    */
 133   public Vector<EmblFeature> getFeatures()
 134   {
 135     return features;
 136   }
 137
 138   /**
 139    * @param features
 140    *          the features to set
 141    */
 142   public void setFeatures(Vector<EmblFeature> features)
 143   {
 144     this.features = features;
 145   }
 146
 147   /**
 148    * @return the keywords
 149    */
 150   public Vector<String> getKeywords()
 151   {
 152     return keywords;
 153   }
 154
 155   /**
 156    * @param keywords
 157    *          the keywords to set
 158    */
 159   public void setKeywords(Vector<String> keywords)
 160   {
 161     this.keywords = keywords;
 162   }
 163
 164   /**
 165    * @return the lastUpdated
 166    */
 167   public String getLastUpdated()
 168   {
 169     return lastUpdated;
 170   }
 171
 172   /**
 173    * @param lastUpdated
 174    *          the lastUpdated to set
 175    */
 176   public void setLastUpdated(String lastUpdated)
 177   {
 178     this.lastUpdated = lastUpdated;
 179   }
 180
 181   /**
 182    * @return the releaseCreated
 183    */
 184   public String getRCreated()
 185   {
 186     return rCreated;
 187   }
 188
 189   /**
 190    * @param releaseCreated
 191    *          the releaseCreated to set
 192    */
 193   public void setRCreated(String releaseCreated)
 194   {
 195     this.rCreated = releaseCreated;
 196   }
 197
 198   /**
 199    * @return the releaseLastUpdated
 200    */
 201   public String getRLastUpdated()
 202   {
 203     return rLastUpdated;
 204   }
 205
 206   /**
 207    * @param releaseLastUpdated
 208    *          the releaseLastUpdated to set
 209    */
 210   public void setRLastUpdated(String releaseLastUpdated)
 211   {
 212     this.rLastUpdated = releaseLastUpdated;
 213   }
 214
 215   /**
 216    * @return the sequence
 217    */
 218   public EmblSequence getSequence()
 219   {
 220     return sequence;
 221   }
 222
 223   /**
 224    * @param sequence
 225    *          the sequence to set
 226    */
 227   public void setSequence(EmblSequence sequence)
 228   {
 229     this.sequence = sequence;
 230   }
 231
 232   /**
 233    * @return the taxDivision
 234    */
 235   public String getTaxDivision()
 236   {
 237     return taxDivision;
 238   }
 239
 240   /**
 241    * @param taxDivision
 242    *          the taxDivision to set
 243    */
 244   public void setTaxDivision(String taxDivision)
 245   {
 246     this.taxDivision = taxDivision;
 247   }
 248
 249   /**
 250    * @return the version
 251    */
 252   public String getVersion()
 253   {
 254     return version;
 255   }
 256
 257   /**
 258    * @param version
 259    *          the version to set
 260    */
 261   public void setVersion(String version)
 262   {
 263     this.version = version;
 264   }
 265
 266   /**
 267    * Recover annotated sequences from EMBL file
 268    *
 269    * @param sourceDb
 270    * @param peptides
 271    *          a list of protein products found so far (to add to)
 272    * @return dna dataset sequence with DBRefs and features
 273    */
 274   public SequenceI getSequence(String sourceDb, List<SequenceI> peptides)
 275   {
 276     SequenceI dna = new Sequence(sourceDb + "|" + accession,
 277             sequence.getSequence());
 278     dna.setDescription(desc);
 279     DBRefEntry retrievedref = new DBRefEntry(sourceDb, version, accession);
 280     dna.addDBRef(retrievedref);
 281     // add map to indicate the sequence is a valid coordinate frame for the
 282     // dbref
 283     retrievedref.setMap(new Mapping(null, new int[] { 1, dna.getLength() },
 284             new int[] { 1, dna.getLength() }, 1, 1));
 285     // TODO: transform EMBL Database refs to canonical form
 286     if (dbRefs != null)
 287     {
 288       for (DBRefEntry dbref : dbRefs)
 289       {
 290         dna.addDBRef(dbref);
 291       }
 292     }
 293
 294     try
 295     {
 296       for (EmblFeature feature : features)
 297       {
 298         if (feature.dbRefs != null)
 299         {
 300           for (DBRefEntry dbref : feature.dbRefs)
 301           {
 302             dna.addDBRef(dbref);
 303           }
 304         }
 305         if (FeatureProperties.isCodingFeature(sourceDb, feature.getName()))
 306         {
 307           parseCodingFeature(feature, sourceDb, dna, peptides);
 308         }
 309       }
 310     } catch (Exception e)
 311     {
 312       System.err.println("EMBL Record Features parsing error!");
 313       System.err
 314               .println("Please report the following to help@jalview.org :");
 315       System.err.println("EMBL Record " + accession);
 316       System.err.println("Resulted in exception: " + e.getMessage());
 317       e.printStackTrace(System.err);
 318     }
 319
 320     return dna;
 321   }
 322
 323   /**
 324    * Extracts coding region and product from a CDS feature and properly decorate
 325    * it with annotations.
 326    *
 327    * @param feature
 328    *          coding feature
 329    * @param sourceDb
 330    *          source database for the EMBLXML
 331    * @param dna
 332    *          parent dna sequence for this record
 333    * @param peptides
 334    *          list of protein product sequences for Embl entry
 335    */
 336   void parseCodingFeature(EmblFeature feature, String sourceDb,
 337           SequenceI dna, List<SequenceI> peptides)
 338   {
 339     boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
 340
 341     int[] exon = getCdsRanges(feature);
 342
 343     String prseq = null;
 344     String prname = "";
 345     String prid = null;
 346     Map<String, String> vals = new Hashtable<String, String>();
 347     SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
 348
 349     /*
 350      * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
 351      * (phase is required for CDS features in GFF3 format)
 352      */
 353     int codonStart = 1;
 354
 355     /*
 356      * parse qualifiers, saving protein translation, protein id,
 357      * codon start position, product (name), and 'other values'
 358      */
 359     if (feature.getQualifiers() != null)
 360     {
 361       for (Qualifier q : feature.getQualifiers())
 362       {
 363         String qname = q.getName();
 364         if (qname.equals("translation"))
 365         {
 366           // remove all spaces (precompiled String.replaceAll(" ", ""))
 367           prseq = SPACE_PATTERN.matcher(q.getValues()[0]).replaceAll("");
 368         }
 369         else if (qname.equals("protein_id"))
 370         {
 371           prid = q.getValues()[0];
 372         }
 373         else if (qname.equals("codon_start"))
 374         {
 375           try
 376           {
 377             codonStart = Integer.parseInt(q.getValues()[0]);
 378           } catch (NumberFormatException e)
 379           {
 380             System.err.println("Invalid codon_start in XML for "
 381                     + accession + ": " + e.getMessage());
 382           }
 383         }
 384         else if (qname.equals("product"))
 385         {
 386           // sometimes name is returned e.g. for V00488
 387           prname = q.getValues()[0];
 388         }
 389         else
 390         {
 391           // throw anything else into the additional properties hash
 392           String[] qvals = q.getValues();
 393           if (qvals != null)
 394           {
 395             String commaSeparated = StringUtils.arrayToSeparatorList(qvals,
 396                     ",");
 397             vals.put(qname, commaSeparated);
 398           }
 399         }
 400       }
 401     }
 402
 403     // SequenceI product = null;
 404     DBRefEntry protEMBLCDS = null;
 405     exon = MappingUtils.removeStartPositions(codonStart - 1, exon);
 406     boolean noProteinDbref = true;
 407
 408     SequenceI product = null;
 409     Mapping map = null;
 410     if (prseq != null && prname != null && prid != null)
 411     {
 412       /*
 413        * look for product in peptides list, if not found, add it
 414        */
 415       product = matcher.findIdMatch(prid);
 416       if (product == null)
 417       {
 418         product = new Sequence(prid, prseq, 1, prseq.length());
 419         product.setDescription(((prname.length() == 0) ? "Protein Product from "
 420                 + sourceDb
 421                 : prname));
 422         peptides.add(product);
 423         matcher.add(product);
 424       }
 425
 426       // we have everything - create the mapping and perhaps the protein
 427       // sequence
 428       if (exon == null || exon.length == 0)
 429       {
 430         System.err
 431                 .println("Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
 432                         + sourceDb + ":" + getAccession() + ")");
 433         if (prseq.length() * 3 == (1 - codonStart + dna.getSequence().length))
 434         {
 435           System.err
 436                   .println("Not allowing for additional stop codon at end of cDNA fragment... !");
 437           // this might occur for CDS sequences where no features are
 438           // marked.
 439           exon = new int[] { dna.getStart() + (codonStart - 1), dna.getEnd() };
 440           map = new Mapping(product, exon, new int[] { 1, prseq.length() },
 441                   3, 1);
 442         }
 443         if ((prseq.length() + 1) * 3 == (1 - codonStart + dna.getSequence().length))
 444         {
 445           System.err
 446                   .println("Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
 447           exon = new int[] { dna.getStart() + (codonStart - 1),
 448               dna.getEnd() - 3 };
 449           map = new Mapping(product, exon, new int[] { 1, prseq.length() },
 450                   3, 1);
 451         }
 452       }
 453       else
 454       {
 455         // Trim the exon mapping if necessary - the given product may only be a
 456         // fragment of a larger protein. (EMBL:AY043181 is an example)
 457
 458         if (isEmblCdna)
 459         {
 460           // TODO: Add a DbRef back to the parent EMBL sequence with the exon
 461           // map
 462           // if given a dataset reference, search dataset for parent EMBL
 463           // sequence if it exists and set its map
 464           // make a new feature annotating the coding contig
 465         }
 466         else
 467         {
 468           // final product length truncation check
 469           // TODO should from range include stop codon even if not in protein
 470           // in order to include stop codon in CDS sequence (as done for
 471           // Ensembl)?
 472           int[] cdsRanges = adjustForProteinLength(prseq.length(),
 473                   exon);
 474           map = new Mapping(product, cdsRanges, new int[] { 1, prseq.length() }, 3, 1);
 475           // reconstruct the EMBLCDS entry
 476           // TODO: this is only necessary when there codon annotation is
 477           // complete (I think JBPNote)
 478           DBRefEntry pcdnaref = new DBRefEntry();
 479           pcdnaref.setAccessionId(prid);
 480           pcdnaref.setSource(DBRefSource.EMBLCDS);
 481           pcdnaref.setVersion(getVersion()); // same as parent EMBL version.
 482           MapList mp = new MapList(new int[] { 1, prseq.length() },
 483                   new int[] { 1 + (codonStart - 1),
 484                       (codonStart - 1) + 3 * prseq.length() }, 1, 3);
 485           pcdnaref.setMap(new Mapping(mp));
 486           if (product != null)
 487           {
 488             product.addDBRef(pcdnaref);
 489             protEMBLCDS = new DBRefEntry(pcdnaref);
 490             protEMBLCDS.setSource(DBRefSource.EMBLCDSProduct);
 491             product.addDBRef(protEMBLCDS);
 492           }
 493         }
 494       }
 495       // add cds feature to dna seq - this may include the stop codon
 496       for (int xint = 0; exon != null && xint < exon.length; xint += 2)
 497       {
 498         SequenceFeature sf = makeCdsFeature(exon, xint, prname, prid, vals,
 499                 codonStart);
 500         sf.setType(feature.getName()); // "CDS"
 501         sf.setFeatureGroup(sourceDb);
 502         dna.addSequenceFeature(sf);
 503       }
 504     }
 505     // add dbRefs to sequence
 506     if (feature.dbRefs != null)
 507     {
 508       boolean productMapped = false;
 509       for (DBRefEntry ref : feature.dbRefs)
 510       {
 511         ref.setSource(DBRefUtils.getCanonicalName(ref.getSource()));
 512         // Hard code the kind of protein product accessions that EMBL cite
 513         if (ref.getSource().equals(DBRefSource.UNIPROT))
 514         {
 515           String refSeqName = DBRefSource.UNIPROT + "|"
 516                   + ref.getAccessionId();
 517           ref.setMap(map);
 518           if (map != null && map.getTo() != null)
 519           {
 520             // if (!productMapped)
 521             // {
 522             // map.getTo().setName(refSeqName);
 523             // map.getTo().addDBRef(
 524             // new DBRefEntry(ref.getSource(), ref.getVersion(), ref
 525             // .getAccessionId())); // don't copy map over.
 526             // // if (map.getTo().getName().startsWith(prid))
 527             // productMapped = true;
 528             // }
 529             // else
 530             // {
 531               /*
 532                * an alternate UNIPROT product for CDS - same mapping
 533                * but to a sequence with a different name
 534                */
 535               SequenceI newSeq = matcher.findIdMatch(refSeqName);
 536               if (newSeq == null)
 537               {
 538                 newSeq = new Sequence(refSeqName, map.getTo()
 539                       .getSequenceAsString());
 540                 matcher.add(newSeq);
 541                 peptides.add(newSeq);
 542               }
 543               Mapping newMap = new Mapping(newSeq, map.getMap());
 544               ref.setMap(newMap);
 545             // }
 546           }
 547           noProteinDbref = false;
 548         }
 549         if (product != null)
 550         {
 551           DBRefEntry pref = new DBRefEntry(ref.getSource(),
 552                   ref.getVersion(), ref.getAccessionId());
 553           pref.setMap(null); // reference is direct
 554           product.addDBRef(pref);
 555           // Add converse mapping reference
 556           if (map != null)
 557           {
 558             Mapping pmap = new Mapping(dna, map.getMap().getInverse());
 559             pref = new DBRefEntry(sourceDb, getVersion(),
 560                     this.getAccession());
 561             pref.setMap(pmap);
 562             if (map.getTo() != null)
 563             {
 564               map.getTo().addDBRef(pref);
 565             }
 566           }
 567         }
 568         dna.addDBRef(ref);
 569       }
 570       if (noProteinDbref && product != null)
 571       {
 572         // add protein coding reference to dna sequence so xref matches
 573         if (protEMBLCDS == null)
 574         {
 575           protEMBLCDS = new DBRefEntry();
 576           protEMBLCDS.setAccessionId(prid);
 577           protEMBLCDS.setSource(DBRefSource.EMBLCDSProduct);
 578           protEMBLCDS.setVersion(getVersion());
 579           protEMBLCDS
 580                   .setMap(new Mapping(product, map.getMap().getInverse()));
 581         }
 582         product.addDBRef(protEMBLCDS);
 583
 584         // Add converse mapping reference
 585         if (map != null)
 586         {
 587           Mapping pmap = new Mapping(product, protEMBLCDS.getMap().getMap()
 588                   .getInverse());
 589           DBRefEntry ncMap = new DBRefEntry(protEMBLCDS);
 590           ncMap.setMap(pmap);
 591           if (map.getTo() != null)
 592           {
 593             dna.addDBRef(ncMap);
 594           }
 595         }
 596       }
 597     }
 598   }
 599
 600   /**
 601    * Helper method to construct a SequenceFeature for one cds range
 602    *
 603    * @param exons
 604    *          array of cds [start, end, ...] positions
 605    * @param exonStartIndex
 606    *          offset into the exons array
 607    * @param proteinName
 608    * @param proteinAccessionId
 609    * @param vals
 610    *          map of 'miscellaneous values' for feature
 611    * @param codonStart
 612    *          codon start position for CDS (1/2/3, normally 1)
 613    * @return
 614    */
 615   protected SequenceFeature makeCdsFeature(int[] exons, int exonStartIndex,
 616           String proteinName, String proteinAccessionId,
 617           Map<String, String> vals, int codonStart)
 618   {
 619     int exonNumber = exonStartIndex / 2 + 1;
 620     SequenceFeature sf = new SequenceFeature();
 621     sf.setBegin(Math.min(exons[exonStartIndex], exons[exonStartIndex + 1]));
 622     sf.setEnd(Math.max(exons[exonStartIndex], exons[exonStartIndex + 1]));
 623     sf.setDescription(String.format(
 624             "Exon %d for protein '%s' EMBLCDS:%s", exonNumber, proteinName,
 625             proteinAccessionId));
 626     sf.setPhase(String.valueOf(codonStart - 1));
 627     sf.setStrand(exons[exonStartIndex] <= exons[exonStartIndex + 1] ? "+" : "-");
 628     sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 629     sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
 630     if (!vals.isEmpty())
 631     {
 632       StringBuilder sb = new StringBuilder();
 633       boolean first = true;
 634       for (Entry<String, String> val : vals.entrySet())
 635       {
 636         if (!first)
 637         {
 638           sb.append(";");
 639         }
 640         sb.append(val.getKey()).append("=").append(val.getValue());
 641         first = false;
 642         sf.setValue(val.getKey(), val.getValue());
 643       }
 644       sf.setAttributes(sb.toString());
 645     }
 646     return sf;
 647   }
 648
 649   /**
 650    * Returns the CDS positions as a list of [start, end, start, end...]
 651    * positions. If on the reverse strand, these will be in descending order.
 652    *
 653    * @param feature
 654    * @return
 655    */
 656   protected int[] getCdsRanges(EmblFeature feature)
 657   {
 658     if (feature.locations == null)
 659     {
 660       return new int[] {};
 661     }
 662     int cdsBoundaryCount = 0; // count of all start/stop locations
 663     int[][] cdsLocations = new int[feature.locations.size()][];
 664     int locationNumber = 0;
 665     for (EmblFeatureLocations loc : feature.locations)
 666     {
 667       int[] locationRanges = loc.getElementRanges(accession);
 668       cdsLocations[locationNumber++] = locationRanges;
 669       cdsBoundaryCount += locationRanges.length;
 670     }
 671     int[] cdsRanges = new int[cdsBoundaryCount];
 672     int copyTo = 0;
 673     for (int[] ranges : cdsLocations)
 674     {
 675       System.arraycopy(ranges, 0, cdsRanges, copyTo, ranges.length);
 676       copyTo += ranges.length;
 677     }
 678     return cdsRanges;
 679
 680   }
 681
 682   /**
 683    * truncate the last exon interval to the prlength'th codon
 684    *
 685    * @param prlength
 686    * @param exon
 687    * @return new exon
 688    */
 689   private int[] adjustForProteinLength(int prlength, int[] exon)
 690   {
 691
 692     int origxon[], sxpos = -1, endxon = 0, cdslength = prlength * 3;
 693     // first adjust range for codon start attribute
 694     if (prlength >= 1 && exon != null)
 695     {
 696       origxon = new int[exon.length];
 697       System.arraycopy(exon, 0, origxon, 0, exon.length);
 698       int cdspos = 0;
 699       for (int x = 0; x < exon.length && sxpos == -1; x += 2)
 700       {
 701         cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
 702         if (cdslength <= cdspos)
 703         {
 704           // advanced beyond last codon.
 705           sxpos = x;
 706           if (cdslength != cdspos)
 707           {
 708             System.err
 709                     .println("Truncating final exon interval on region by "
 710                             + (cdspos - cdslength));
 711           }
 712           // locate the new end boundary of final exon as endxon
 713           endxon = exon[x + 1] - cdspos + cdslength;
 714           break;
 715         }
 716       }
 717
 718       if (sxpos != -1)
 719       {
 720         // and trim the exon interval set if necessary
 721         int[] nxon = new int[sxpos + 2];
 722         System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 723         nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 724                                   // set
 725         exon = nxon;
 726       }
 727     }
 728     return exon;
 729   }
 730 }