src/jalview/datamodel/xdb/embl/EmblEntry.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.datamodel.xdb.embl;
  22
  23 import jalview.analysis.SequenceIdMatcher;
  24 import jalview.datamodel.DBRefEntry;
  25 import jalview.datamodel.DBRefSource;
  26 import jalview.datamodel.FeatureProperties;
  27 import jalview.datamodel.Mapping;
  28 import jalview.datamodel.Sequence;
  29 import jalview.datamodel.SequenceFeature;
  30 import jalview.datamodel.SequenceI;
  31 import jalview.util.DBRefUtils;
  32 import jalview.util.MapList;
  33 import jalview.util.MappingUtils;
  34 import jalview.util.StringUtils;
  35
  36 import java.util.Arrays;
  37 import java.util.Hashtable;
  38 import java.util.List;
  39 import java.util.Map;
  40 import java.util.Map.Entry;
  41 import java.util.Vector;
  42 import java.util.regex.Pattern;
  43
  44 /**
  45  * Data model for one entry returned from an EMBL query, as marshalled by a
  46  * Castor binding file
  47  *
  48  * For example:
  49  * http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=ena_sequence&id=J03321
  50  * &format=emblxml
  51  *
  52  * @see embl_mapping.xml
  53  */
  54 public class EmblEntry
  55 {
  56   private static final Pattern SPACE_PATTERN = Pattern.compile(" ");
  57
  58   String accession;
  59
  60   String version;
  61
  62   String taxDivision;
  63
  64   String desc;
  65
  66   String rCreated;
  67
  68   String rLastUpdated;
  69
  70   String lastUpdated;
  71
  72   Vector<String> keywords;
  73
  74   Vector<DBRefEntry> dbRefs;
  75
  76   Vector<EmblFeature> features;
  77
  78   EmblSequence sequence;
  79
  80   /**
  81    * @return the accession
  82    */
  83   public String getAccession()
  84   {
  85     return accession;
  86   }
  87
  88   /**
  89    * @param accession
  90    *          the accession to set
  91    */
  92   public void setAccession(String accession)
  93   {
  94     this.accession = accession;
  95   }
  96
  97   /**
  98    * @return the dbRefs
  99    */
 100   public Vector<DBRefEntry> getDbRefs()
 101   {
 102     return dbRefs;
 103   }
 104
 105   /**
 106    * @param dbRefs
 107    *          the dbRefs to set
 108    */
 109   public void setDbRefs(Vector<DBRefEntry> dbRefs)
 110   {
 111     this.dbRefs = dbRefs;
 112   }
 113
 114   /**
 115    * @return the desc
 116    */
 117   public String getDesc()
 118   {
 119     return desc;
 120   }
 121
 122   /**
 123    * @param desc
 124    *          the desc to set
 125    */
 126   public void setDesc(String desc)
 127   {
 128     this.desc = desc;
 129   }
 130
 131   /**
 132    * @return the features
 133    */
 134   public Vector<EmblFeature> getFeatures()
 135   {
 136     return features;
 137   }
 138
 139   /**
 140    * @param features
 141    *          the features to set
 142    */
 143   public void setFeatures(Vector<EmblFeature> features)
 144   {
 145     this.features = features;
 146   }
 147
 148   /**
 149    * @return the keywords
 150    */
 151   public Vector<String> getKeywords()
 152   {
 153     return keywords;
 154   }
 155
 156   /**
 157    * @param keywords
 158    *          the keywords to set
 159    */
 160   public void setKeywords(Vector<String> keywords)
 161   {
 162     this.keywords = keywords;
 163   }
 164
 165   /**
 166    * @return the lastUpdated
 167    */
 168   public String getLastUpdated()
 169   {
 170     return lastUpdated;
 171   }
 172
 173   /**
 174    * @param lastUpdated
 175    *          the lastUpdated to set
 176    */
 177   public void setLastUpdated(String lastUpdated)
 178   {
 179     this.lastUpdated = lastUpdated;
 180   }
 181
 182   /**
 183    * @return the releaseCreated
 184    */
 185   public String getRCreated()
 186   {
 187     return rCreated;
 188   }
 189
 190   /**
 191    * @param releaseCreated
 192    *          the releaseCreated to set
 193    */
 194   public void setRCreated(String releaseCreated)
 195   {
 196     this.rCreated = releaseCreated;
 197   }
 198
 199   /**
 200    * @return the releaseLastUpdated
 201    */
 202   public String getRLastUpdated()
 203   {
 204     return rLastUpdated;
 205   }
 206
 207   /**
 208    * @param releaseLastUpdated
 209    *          the releaseLastUpdated to set
 210    */
 211   public void setRLastUpdated(String releaseLastUpdated)
 212   {
 213     this.rLastUpdated = releaseLastUpdated;
 214   }
 215
 216   /**
 217    * @return the sequence
 218    */
 219   public EmblSequence getSequence()
 220   {
 221     return sequence;
 222   }
 223
 224   /**
 225    * @param sequence
 226    *          the sequence to set
 227    */
 228   public void setSequence(EmblSequence sequence)
 229   {
 230     this.sequence = sequence;
 231   }
 232
 233   /**
 234    * @return the taxDivision
 235    */
 236   public String getTaxDivision()
 237   {
 238     return taxDivision;
 239   }
 240
 241   /**
 242    * @param taxDivision
 243    *          the taxDivision to set
 244    */
 245   public void setTaxDivision(String taxDivision)
 246   {
 247     this.taxDivision = taxDivision;
 248   }
 249
 250   /**
 251    * @return the version
 252    */
 253   public String getVersion()
 254   {
 255     return version;
 256   }
 257
 258   /**
 259    * @param version
 260    *          the version to set
 261    */
 262   public void setVersion(String version)
 263   {
 264     this.version = version;
 265   }
 266
 267   /**
 268    * Recover annotated sequences from EMBL file
 269    *
 270    * @param sourceDb
 271    * @param peptides
 272    *          a list of protein products found so far (to add to)
 273    * @return dna dataset sequence with DBRefs and features
 274    */
 275   public SequenceI getSequence(String sourceDb, List<SequenceI> peptides)
 276   {
 277     SequenceI dna = new Sequence(sourceDb + "|" + accession,
 278             sequence.getSequence());
 279     dna.setDescription(desc);
 280     DBRefEntry retrievedref = new DBRefEntry(sourceDb, version, accession);
 281     dna.addDBRef(retrievedref);
 282     // add map to indicate the sequence is a valid coordinate frame for the
 283     // dbref
 284     retrievedref.setMap(new Mapping(null, new int[] { 1, dna.getLength() },
 285             new int[] { 1, dna.getLength() }, 1, 1));
 286     // TODO: transform EMBL Database refs to canonical form
 287     if (dbRefs != null)
 288     {
 289       for (DBRefEntry dbref : dbRefs)
 290       {
 291         dna.addDBRef(dbref);
 292       }
 293     }
 294
 295     try
 296     {
 297       for (EmblFeature feature : features)
 298       {
 299         if (feature.dbRefs != null)
 300         {
 301           for (DBRefEntry dbref : feature.dbRefs)
 302           {
 303             dna.addDBRef(dbref);
 304           }
 305         }
 306         if (FeatureProperties.isCodingFeature(sourceDb, feature.getName()))
 307         {
 308           parseCodingFeature(feature, sourceDb, dna, peptides);
 309         }
 310       }
 311     } catch (Exception e)
 312     {
 313       System.err.println("EMBL Record Features parsing error!");
 314       System.err
 315               .println("Please report the following to help@jalview.org :");
 316       System.err.println("EMBL Record " + accession);
 317       System.err.println("Resulted in exception: " + e.getMessage());
 318       e.printStackTrace(System.err);
 319     }
 320
 321     return dna;
 322   }
 323
 324   /**
 325    * Extracts coding region and product from a CDS feature and properly decorate
 326    * it with annotations.
 327    *
 328    * @param feature
 329    *          coding feature
 330    * @param sourceDb
 331    *          source database for the EMBLXML
 332    * @param dna
 333    *          parent dna sequence for this record
 334    * @param peptides
 335    *          list of protein product sequences for Embl entry
 336    */
 337   void parseCodingFeature(EmblFeature feature, String sourceDb,
 338           SequenceI dna, List<SequenceI> peptides)
 339   {
 340     boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
 341
 342     int[] exon = getCdsRanges(feature);
 343
 344     String prseq = null;
 345     String prname = "";
 346     String prid = null;
 347     Map<String, String> vals = new Hashtable<String, String>();
 348     SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
 349
 350     /*
 351      * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
 352      * (phase is required for CDS features in GFF3 format)
 353      */
 354     int codonStart = 1;
 355
 356     /*
 357      * parse qualifiers, saving protein translation, protein id,
 358      * codon start position, product (name), and 'other values'
 359      */
 360     if (feature.getQualifiers() != null)
 361     {
 362       for (Qualifier q : feature.getQualifiers())
 363       {
 364         String qname = q.getName();
 365         if (qname.equals("translation"))
 366         {
 367           // remove all spaces (precompiled String.replaceAll(" ", ""))
 368           prseq = SPACE_PATTERN.matcher(q.getValues()[0]).replaceAll("");
 369         }
 370         else if (qname.equals("protein_id"))
 371         {
 372           prid = q.getValues()[0];
 373         }
 374         else if (qname.equals("codon_start"))
 375         {
 376           try
 377           {
 378             codonStart = Integer.parseInt(q.getValues()[0]);
 379           } catch (NumberFormatException e)
 380           {
 381             System.err.println("Invalid codon_start in XML for "
 382                     + accession + ": " + e.getMessage());
 383           }
 384         }
 385         else if (qname.equals("product"))
 386         {
 387           // sometimes name is returned e.g. for V00488
 388           prname = q.getValues()[0];
 389         }
 390         else
 391         {
 392           // throw anything else into the additional properties hash
 393           String[] qvals = q.getValues();
 394           if (qvals != null)
 395           {
 396             String commaSeparated = StringUtils.arrayToSeparatorList(qvals,
 397                     ",");
 398             vals.put(qname, commaSeparated);
 399           }
 400         }
 401       }
 402     }
 403
 404     DBRefEntry protEMBLCDS = null;
 405     exon = MappingUtils.removeStartPositions(codonStart - 1, exon);
 406     boolean noProteinDbref = true;
 407
 408     SequenceI product = null;
 409     Mapping map = null;
 410     if (prseq != null && prname != null && prid != null)
 411     {
 412       /*
 413        * look for product in peptides list, if not found, add it
 414        */
 415       product = matcher.findIdMatch(prid);
 416       if (product == null)
 417       {
 418         product = new Sequence(prid, prseq, 1, prseq.length());
 419         product.setDescription(((prname.length() == 0) ? "Protein Product from "
 420                 + sourceDb
 421                 : prname));
 422         peptides.add(product);
 423         matcher.add(product);
 424       }
 425
 426       // we have everything - create the mapping and perhaps the protein
 427       // sequence
 428       if (exon == null || exon.length == 0)
 429       {
 430         System.err
 431                 .println("Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
 432                         + sourceDb + ":" + getAccession() + ")");
 433         if (prseq.length() * 3 == (1 - codonStart + dna.getSequence().length))
 434         {
 435           System.err
 436                   .println("Not allowing for additional stop codon at end of cDNA fragment... !");
 437           // this might occur for CDS sequences where no features are
 438           // marked.
 439           exon = new int[] { dna.getStart() + (codonStart - 1),
 440               dna.getEnd() };
 441           map = new Mapping(product, exon, new int[] { 1, prseq.length() },
 442                   3, 1);
 443         }
 444         if ((prseq.length() + 1) * 3 == (1 - codonStart + dna.getSequence().length))
 445         {
 446           System.err
 447                   .println("Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
 448           exon = new int[] { dna.getStart() + (codonStart - 1),
 449               dna.getEnd() - 3 };
 450           map = new Mapping(product, exon, new int[] { 1, prseq.length() },
 451                   3, 1);
 452         }
 453       }
 454       else
 455       {
 456         // Trim the exon mapping if necessary - the given product may only be a
 457         // fragment of a larger protein. (EMBL:AY043181 is an example)
 458
 459         if (isEmblCdna)
 460         {
 461           // TODO: Add a DbRef back to the parent EMBL sequence with the exon
 462           // map
 463           // if given a dataset reference, search dataset for parent EMBL
 464           // sequence if it exists and set its map
 465           // make a new feature annotating the coding contig
 466         }
 467         else
 468         {
 469           // final product length truncation check
 470           // TODO should from range include stop codon even if not in protein
 471           // in order to include stop codon in CDS sequence (as done for
 472           // Ensembl)?
 473           int[] cdsRanges = adjustForProteinLength(prseq.length(), exon);
 474           map = new Mapping(product, cdsRanges, new int[] { 1,
 475               prseq.length() }, 3, 1);
 476           // reconstruct the EMBLCDS entry
 477           // TODO: this is only necessary when there codon annotation is
 478           // complete (I think JBPNote)
 479           DBRefEntry pcdnaref = new DBRefEntry();
 480           pcdnaref.setAccessionId(prid);
 481           pcdnaref.setSource(DBRefSource.EMBLCDS);
 482           pcdnaref.setVersion(getVersion()); // same as parent EMBL version.
 483           MapList mp = new MapList(new int[] { 1, prseq.length() },
 484                   new int[] { 1 + (codonStart - 1),
 485                       (codonStart - 1) + 3 * prseq.length() }, 1, 3);
 486           pcdnaref.setMap(new Mapping(mp));
 487           if (product != null)
 488           {
 489             product.addDBRef(pcdnaref);
 490             protEMBLCDS = new DBRefEntry(pcdnaref);
 491             protEMBLCDS.setSource(DBRefSource.EMBLCDSProduct);
 492             product.addDBRef(protEMBLCDS);
 493           }
 494         }
 495       }
 496       // add cds feature to dna seq - this may include the stop codon
 497       for (int xint = 0; exon != null && xint < exon.length; xint += 2)
 498       {
 499         SequenceFeature sf = makeCdsFeature(exon, xint, prname, prid, vals,
 500                 codonStart);
 501         sf.setType(feature.getName()); // "CDS"
 502         sf.setFeatureGroup(sourceDb);
 503         dna.addSequenceFeature(sf);
 504       }
 505     }
 506
 507     /*
 508      * add dbRefs to sequence, and mappings for Uniprot xrefs
 509      */
 510     if (feature.dbRefs != null)
 511     {
 512       boolean mappingUsed = false;
 513       for (DBRefEntry ref : feature.dbRefs)
 514       {
 515         ref.setSource(DBRefUtils.getCanonicalName(ref.getSource()));
 516         if (ref.getSource().equals(DBRefSource.UNIPROT))
 517         {
 518           String proteinSeqName = DBRefSource.UNIPROT + "|"
 519                   + ref.getAccessionId();
 520           if (map != null && map.getTo() != null)
 521           {
 522             if (mappingUsed)
 523             {
 524               /*
 525                * two or more Uniprot xrefs for the same CDS -
 526                * each needs a distinct Mapping (as to a different sequence)
 527                */
 528               map = new Mapping(map);
 529             }
 530             mappingUsed = true;
 531
 532             /*
 533              * try to locate the protein mapped to (possibly by a
 534              * previous CDS feature)
 535              */
 536             SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName);
 537             if (proteinSeq == null)
 538             {
 539               proteinSeq = new Sequence(proteinSeqName,
 540                       product.getSequenceAsString());
 541               matcher.add(proteinSeq);
 542               peptides.add(proteinSeq);
 543             }
 544             map.setTo(proteinSeq);
 545             map.getTo().addDBRef(
 546                     new DBRefEntry(ref.getSource(), ref.getVersion(), ref
 547                             .getAccessionId()));
 548             ref.setMap(map);
 549           }
 550           noProteinDbref = false;
 551         }
 552         if (product != null)
 553         {
 554           DBRefEntry pref = new DBRefEntry(ref.getSource(),
 555                   ref.getVersion(), ref.getAccessionId());
 556           pref.setMap(null); // reference is direct
 557           product.addDBRef(pref);
 558           // Add converse mapping reference
 559           if (map != null)
 560           {
 561             Mapping pmap = new Mapping(dna, map.getMap().getInverse());
 562             pref = new DBRefEntry(sourceDb, getVersion(),
 563                     this.getAccession());
 564             pref.setMap(pmap);
 565             if (map.getTo() != null)
 566             {
 567               map.getTo().addDBRef(pref);
 568             }
 569           }
 570         }
 571         dna.addDBRef(ref);
 572       }
 573       if (noProteinDbref && product != null)
 574       {
 575         // add protein coding reference to dna sequence so xref matches
 576         if (protEMBLCDS == null)
 577         {
 578           protEMBLCDS = new DBRefEntry();
 579           protEMBLCDS.setAccessionId(prid);
 580           protEMBLCDS.setSource(DBRefSource.EMBLCDSProduct);
 581           protEMBLCDS.setVersion(getVersion());
 582           protEMBLCDS
 583                   .setMap(new Mapping(product, map.getMap().getInverse()));
 584         }
 585         product.addDBRef(protEMBLCDS);
 586
 587         // Add converse mapping reference
 588         if (map != null)
 589         {
 590           Mapping pmap = new Mapping(product, protEMBLCDS.getMap().getMap()
 591                   .getInverse());
 592           DBRefEntry ncMap = new DBRefEntry(protEMBLCDS);
 593           ncMap.setMap(pmap);
 594           if (map.getTo() != null)
 595           {
 596             dna.addDBRef(ncMap);
 597           }
 598         }
 599       }
 600     }
 601   }
 602
 603   /**
 604    * Helper method to construct a SequenceFeature for one cds range
 605    *
 606    * @param exons
 607    *          array of cds [start, end, ...] positions
 608    * @param exonStartIndex
 609    *          offset into the exons array
 610    * @param proteinName
 611    * @param proteinAccessionId
 612    * @param vals
 613    *          map of 'miscellaneous values' for feature
 614    * @param codonStart
 615    *          codon start position for CDS (1/2/3, normally 1)
 616    * @return
 617    */
 618   protected SequenceFeature makeCdsFeature(int[] exons, int exonStartIndex,
 619           String proteinName, String proteinAccessionId,
 620           Map<String, String> vals, int codonStart)
 621   {
 622     int exonNumber = exonStartIndex / 2 + 1;
 623     SequenceFeature sf = new SequenceFeature();
 624     sf.setBegin(Math.min(exons[exonStartIndex], exons[exonStartIndex + 1]));
 625     sf.setEnd(Math.max(exons[exonStartIndex], exons[exonStartIndex + 1]));
 626     sf.setDescription(String.format("Exon %d for protein '%s' EMBLCDS:%s",
 627             exonNumber, proteinName, proteinAccessionId));
 628     sf.setPhase(String.valueOf(codonStart - 1));
 629     sf.setStrand(exons[exonStartIndex] <= exons[exonStartIndex + 1] ? "+"
 630             : "-");
 631     sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 632     sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
 633     if (!vals.isEmpty())
 634     {
 635       StringBuilder sb = new StringBuilder();
 636       boolean first = true;
 637       for (Entry<String, String> val : vals.entrySet())
 638       {
 639         if (!first)
 640         {
 641           sb.append(";");
 642         }
 643         sb.append(val.getKey()).append("=").append(val.getValue());
 644         first = false;
 645         sf.setValue(val.getKey(), val.getValue());
 646       }
 647       sf.setAttributes(sb.toString());
 648     }
 649     return sf;
 650   }
 651
 652   /**
 653    * Returns the CDS positions as a list of [start, end, start, end...]
 654    * positions. If on the reverse strand, these will be in descending order.
 655    *
 656    * @param feature
 657    * @return
 658    */
 659   protected int[] getCdsRanges(EmblFeature feature)
 660   {
 661     if (feature.locations == null)
 662     {
 663       return new int[] {};
 664     }
 665     int cdsBoundaryCount = 0; // count of all start/stop locations
 666     int[][] cdsLocations = new int[feature.locations.size()][];
 667     int locationNumber = 0;
 668     for (EmblFeatureLocations loc : feature.locations)
 669     {
 670       int[] locationRanges = loc.getElementRanges(accession);
 671       cdsLocations[locationNumber++] = locationRanges;
 672       cdsBoundaryCount += locationRanges.length;
 673     }
 674     int[] cdsRanges = new int[cdsBoundaryCount];
 675     int copyTo = 0;
 676     for (int[] ranges : cdsLocations)
 677     {
 678       System.arraycopy(ranges, 0, cdsRanges, copyTo, ranges.length);
 679       copyTo += ranges.length;
 680     }
 681     return cdsRanges;
 682
 683   }
 684
 685   /**
 686    * truncate the last exon interval to the prlength'th codon
 687    *
 688    * @param prlength
 689    * @param exon
 690    * @return new exon
 691    */
 692   static int[] adjustForProteinLength(int prlength, int[] exon)
 693   {
 694     if (prlength <= 0 || exon == null)
 695     {
 696       return exon;
 697     }
 698     int desiredCdsLength = prlength * 3;
 699     int exonLength = MappingUtils.getLength(Arrays.asList(exon));
 700
 701     /*
 702      * assuming here exon might include stop codon in addition to protein codons
 703      */
 704     if (desiredCdsLength == exonLength
 705             || desiredCdsLength == exonLength - 3)
 706     {
 707       return exon;
 708     }
 709
 710     int origxon[];
 711     int sxpos = -1;
 712     int endxon = 0;
 713     origxon = new int[exon.length];
 714     System.arraycopy(exon, 0, origxon, 0, exon.length);
 715     int cdspos = 0;
 716     for (int x = 0; x < exon.length; x += 2)
 717     {
 718       cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
 719       if (desiredCdsLength <= cdspos)
 720       {
 721         // advanced beyond last codon.
 722         sxpos = x;
 723         if (desiredCdsLength != cdspos)
 724         {
 725           // System.err
 726           // .println("Truncating final exon interval on region by "
 727           // + (cdspos - cdslength));
 728         }
 729
 730         /*
 731          * shrink the final exon - reduce end position if forward
 732          * strand, increase it if reverse
 733          */
 734         if (exon[x + 1] >= exon[x])
 735         {
 736           endxon = exon[x + 1] - cdspos + desiredCdsLength;
 737         }
 738         else
 739         {
 740           endxon = exon[x + 1] + cdspos - desiredCdsLength;
 741         }
 742         break;
 743       }
 744     }
 745
 746     if (sxpos != -1)
 747     {
 748       // and trim the exon interval set if necessary
 749       int[] nxon = new int[sxpos + 2];
 750       System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 751       nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 752                                 // set
 753       exon = nxon;
 754     }
 755     return exon;
 756   }
 757 }