src/jalview/datamodel/xdb/embl/EmblEntry.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer (Version 2.6)
   3  * Copyright (C) 2010 J Procter, AM Waterhouse, G Barton, M Clamp, S Searle
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
  10  *
  11  * Jalview is distributed in the hope that it will be useful, but
  12  * WITHOUT ANY WARRANTY; without even the implied warranty
  13  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  14  * PURPOSE.  See the GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  17  */
  18 package jalview.datamodel.xdb.embl;
  19
  20 import jalview.datamodel.DBRefEntry;
  21 import jalview.datamodel.DBRefSource;
  22 import jalview.datamodel.FeatureProperties;
  23 import jalview.datamodel.Mapping;
  24 import jalview.datamodel.Sequence;
  25 import jalview.datamodel.SequenceFeature;
  26 import jalview.datamodel.SequenceI;
  27
  28 import java.util.Enumeration;
  29 import java.util.Hashtable;
  30 import java.util.Iterator;
  31 import java.util.Vector;
  32
  33 public class EmblEntry
  34 {
  /**
   * Accession of this record; used to name the nucleotide sequence, as the
   * accession id of its database reference, and when resolving feature
   * locations.
   */
  String accession;

  /** Record version; used as the version of database references built here. */
  String version;

  /** NOTE(review): presumably the EMBL taxonomic division - not read in this class. */
  String taxDivision;

  /** Description text; copied to the nucleotide sequence's description. */
  String desc;

  /** Release in which the record was created (per the accessor documentation). */
  String rCreated;

  /** Release in which the record was last updated (per the accessor documentation). */
  String rLastUpdated;

  /** NOTE(review): presumably the date of the last update - not read in this class. */
  String lastUpdated;

  /** NOTE(review): element type is not visible from this class - confirm against the mapping. */
  Vector keywords;

  /** NOTE(review): element type is not visible from this class - confirm against the mapping. */
  Vector refs;

  /**
   * Record-level database references; elements are cast to DBRefEntry when
   * they are copied to the nucleotide sequence.
   */
  Vector dbRefs;

  /** Feature table; elements are cast to EmblFeature by getSequences. */
  Vector features;

  /** Sequence data of the record; its getSequence() supplies the residues. */
  EmblSequence sequence;
  58
  59   /**
  60    * @return the accession
  61    */
  62   public String getAccession()
  63   {
  64     return accession;
  65   }
  66
  67   /**
  68    * @param accession
  69    *          the accession to set
  70    */
  71   public void setAccession(String accession)
  72   {
  73     this.accession = accession;
  74   }
  75
  76   /**
  77    * @return the dbRefs
  78    */
  79   public Vector getDbRefs()
  80   {
  81     return dbRefs;
  82   }
  83
  84   /**
  85    * @param dbRefs
  86    *          the dbRefs to set
  87    */
  88   public void setDbRefs(Vector dbRefs)
  89   {
  90     this.dbRefs = dbRefs;
  91   }
  92
  93   /**
  94    * @return the desc
  95    */
  96   public String getDesc()
  97   {
  98     return desc;
  99   }
 100
 101   /**
 102    * @param desc
 103    *          the desc to set
 104    */
 105   public void setDesc(String desc)
 106   {
 107     this.desc = desc;
 108   }
 109
 110   /**
 111    * @return the features
 112    */
 113   public Vector getFeatures()
 114   {
 115     return features;
 116   }
 117
 118   /**
 119    * @param features
 120    *          the features to set
 121    */
 122   public void setFeatures(Vector features)
 123   {
 124     this.features = features;
 125   }
 126
 127   /**
 128    * @return the keywords
 129    */
 130   public Vector getKeywords()
 131   {
 132     return keywords;
 133   }
 134
 135   /**
 136    * @param keywords
 137    *          the keywords to set
 138    */
 139   public void setKeywords(Vector keywords)
 140   {
 141     this.keywords = keywords;
 142   }
 143
 144   /**
 145    * @return the lastUpdated
 146    */
 147   public String getLastUpdated()
 148   {
 149     return lastUpdated;
 150   }
 151
 152   /**
 153    * @param lastUpdated
 154    *          the lastUpdated to set
 155    */
 156   public void setLastUpdated(String lastUpdated)
 157   {
 158     this.lastUpdated = lastUpdated;
 159   }
 160
 161   /**
 162    * @return the refs
 163    */
 164   public Vector getRefs()
 165   {
 166     return refs;
 167   }
 168
 169   /**
 170    * @param refs
 171    *          the refs to set
 172    */
 173   public void setRefs(Vector refs)
 174   {
 175     this.refs = refs;
 176   }
 177
 178   /**
 179    * @return the releaseCreated
 180    */
 181   public String getRCreated()
 182   {
 183     return rCreated;
 184   }
 185
 186   /**
 187    * @param releaseCreated
 188    *          the releaseCreated to set
 189    */
 190   public void setRcreated(String releaseCreated)
 191   {
 192     this.rCreated = releaseCreated;
 193   }
 194
 195   /**
 196    * @return the releaseLastUpdated
 197    */
 198   public String getRLastUpdated()
 199   {
 200     return rLastUpdated;
 201   }
 202
 203   /**
 204    * @param releaseLastUpdated
 205    *          the releaseLastUpdated to set
 206    */
 207   public void setRLastUpdated(String releaseLastUpdated)
 208   {
 209     this.rLastUpdated = releaseLastUpdated;
 210   }
 211
 212   /**
 213    * @return the sequence
 214    */
 215   public EmblSequence getSequence()
 216   {
 217     return sequence;
 218   }
 219
 220   /**
 221    * @param sequence
 222    *          the sequence to set
 223    */
 224   public void setSequence(EmblSequence sequence)
 225   {
 226     this.sequence = sequence;
 227   }
 228
 229   /**
 230    * @return the taxDivision
 231    */
 232   public String getTaxDivision()
 233   {
 234     return taxDivision;
 235   }
 236
 237   /**
 238    * @param taxDivision
 239    *          the taxDivision to set
 240    */
 241   public void setTaxDivision(String taxDivision)
 242   {
 243     this.taxDivision = taxDivision;
 244   }
 245
 246   /**
 247    * @return the version
 248    */
 249   public String getVersion()
 250   {
 251     return version;
 252   }
 253
 254   /**
 255    * @param version
 256    *          the version to set
 257    */
 258   public void setVersion(String version)
 259   {
 260     this.version = version;
 261   }
 262
  /*
   * EMBL Feature support is limited. The text below is included for the benefit
   * of any developer working on improving EMBL feature import in Jalview.
   *
   * Extract from the EMBL feature table specification, see
   * http://www.embl-ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.html
   *
   * 3.5 Location
   *
   * 3.5.1 Purpose
 269    *
 270    * The location indicates the region of the presented sequence which
 271    * corresponds to a feature.
 272    *
   * 3.5.2 Format and conventions
   *
   * The location contains at least one sequence location descriptor and may
   * contain one or more operators with one or more sequence location
   * descriptors. Base numbers refer to the numbering in the entry. This
   * numbering designates the first base (5' end) of the presented sequence as
   * base 1. Base locations beyond the range of the presented sequence may not
   * be used in location descriptors, the only exception being location in a
   * remote entry (see 3.5.2.1, e).
 280    *
 281    * Location operators and descriptors are discussed in more detail below.
 282    *
 283    * 3.5.2.1 Location descriptors
 284    *
 285    * The location descriptor can be one of the following: (a) a single base
 286    * number (b) a site between two indicated adjoining bases (c) a single base
 287    * chosen from within a specified range of bases (not allowed for new entries)
 288    * (d) the base numbers delimiting a sequence span (e) a remote entry
 289    * identifier followed by a local location descriptor (i.e., a-d)
 290    *
   * A site between two adjoining nucleotides, such as endonucleolytic cleavage
   * site, is indicated by listing the two points separated by a caret (^). The
   * permitted formats for this descriptor are n^n+1 (for example 55^56), or,
   * for circular molecules, n^1, where "n" is the full length of the molecule,
   * i.e. 1000^1 for circular molecule with length 1000.
 296    *
 297    * A single base chosen from a range of bases is indicated by the first base
 298    * number and the last base number of the range separated by a single period
 299    * (e.g., '12.21' indicates a single base taken from between the indicated
 300    * points). From October 2006 the usage of this descriptor is restricted : it
 301    * is illegal to use "a single base from a range" (c) either on its own or in
 302    * combination with the "sequence span" (d) descriptor for newly created
 303    * entries. The existing entries where such descriptors exist are going to be
 304    * retrofitted.
 305    *
 306    * Sequence spans are indicated by the starting base number and the ending
 307    * base number separated by two periods (e.g., '34..456'). The '<' and '>'
 308    * symbols may be used with the starting and ending base numbers to indicate
 309    * that an end point is beyond the specified base number. The starting and
 310    * ending base positions can be represented as distinct base numbers
 311    * ('34..456') or a site between two indicated adjoining bases.
 312    *
 313    * A location in a remote entry (not the entry to which the feature table
 314    * belongs) can be specified by giving the accession-number and sequence
 315    * version of the remote entry, followed by a colon ":", followed by a
 316    * location descriptor which applies to that entry's sequence (i.e.
 317    * J12345.1:1..15, see also examples below)
 318    *
 319    * 3.5.2.2 Operators
 320    *
 321    * The location operator is a prefix that specifies what must be done to the
 322    * indicated sequence to find or construct the location corresponding to the
 323    * feature. A list of operators is given below with their definitions and most
 324    * common format.
 325    *
 326    * complement(location) Find the complement of the presented sequence in the
 327    * span specified by " location" (i.e., read the complement of the presented
 328    * strand in its 5'-to-3' direction)
 329    *
 330    * join(location,location, ... location) The indicated elements should be
 331    * joined (placed end-to-end) to form one contiguous sequence
 332    *
 333    * order(location,location, ... location) The elements can be found in the
 334    * specified order (5' to 3' direction), but nothing is implied about the
 335    * reasonableness about joining them
 336    *
 337    * Note : location operator "complement" can be used in combination with
 338    * either " join" or "order" within the same location; combinations of "join"
 339    * and "order" within the same location (nested operators) are illegal.
 340    *
 341    *
 342    *
 343    * 3.5.3 Location examples
 344    *
 345    * The following is a list of common location descriptors with their meanings:
 346    *
 347    * Location Description
 348    *
 349    * 467 Points to a single base in the presented sequence
 350    *
 351    * 340..565 Points to a continuous range of bases bounded by and including the
 352    * starting and ending bases
 353    *
 354    * <345..500 Indicates that the exact lower boundary point of a feature is
 355    * unknown. The location begins at some base previous to the first base
 356    * specified (which need not be contained in the presented sequence) and
 357    * continues to and includes the ending base
 358    *
 359    * <1..888 The feature starts before the first sequenced base and continues to
 360    * and includes base 888
 361    *
 362    * 1..>888 The feature starts at the first sequenced base and continues beyond
 363    * base 888
 364    *
 365    * 102.110 Indicates that the exact location is unknown but that it is one of
 366    * the bases between bases 102 and 110, inclusive
 367    *
 368    * 123^124 Points to a site between bases 123 and 124
 369    *
 370    * join(12..78,134..202) Regions 12 to 78 and 134 to 202 should be joined to
 371    * form one contiguous sequence
 372    *
 373    *
 374    * complement(34..126) Start at the base complementary to 126 and finish at
 375    * the base complementary to base 34 (the feature is on the strand
 376    * complementary to the presented strand)
 377    *
 378    *
 379    * complement(join(2691..4571,4918..5163)) Joins regions 2691 to 4571 and 4918
 380    * to 5163, then complements the joined segments (the feature is on the strand
 381    * complementary to the presented strand)
 382    *
 383    * join(complement(4918..5163),complement(2691..4571)) Complements regions
 384    * 4918 to 5163 and 2691 to 4571, then joins the complemented segments (the
 385    * feature is on the strand complementary to the presented strand)
 386    *
 387    * J00194.1:100..202 Points to bases 100 to 202, inclusive, in the entry (in
 388    * this database) with primary accession number 'J00194'
 389    *
 390    * join(1..100,J00194.1:100..202) Joins region 1..100 of the existing entry
 391    * with the region 100..202 of remote entry J00194
 392    */
 393   /**
 394    * Recover annotated sequences from EMBL file
 395    *
 396    * @param noNa
 397    *          don't return nucleic acid sequences
 398    * @param sourceDb
 399    *          TODO
 400    * @param noProtein
 401    *          don't return any translated protein sequences marked in features
 402    * @return dataset sequences with DBRefs and features - DNA always comes first
 403    */
 404   public jalview.datamodel.SequenceI[] getSequences(boolean noNa,
 405           boolean noPeptide, String sourceDb)
 406   { // TODO: ensure emblEntry.getSequences behaves correctly for returning all
 407     // cases of noNa and noPeptide
 408     Vector seqs = new Vector();
 409     Sequence dna = null;
 410     if (!noNa)
 411     {
 412       // In theory we still need to create this if noNa is set to avoid a null
 413       // pointer exception
 414       dna = new Sequence(sourceDb + "|" + accession, sequence.getSequence());
 415       dna.setDescription(desc);
 416       DBRefEntry retrievedref = new DBRefEntry(sourceDb, version, accession);
 417       dna.addDBRef(retrievedref);
 418       // add map to indicate the sequence is a valid coordinate frame for the
 419       // dbref
 420       retrievedref.setMap(new Mapping(null, new int[]
 421       { 1, dna.getLength() }, new int[]
 422       { 1, dna.getLength() }, 1, 1));
 423       // TODO: transform EMBL Database refs to canonical form
 424       if (dbRefs != null)
 425         for (Iterator i = dbRefs.iterator(); i.hasNext(); dna
 426                 .addDBRef((DBRefEntry) i.next()))
 427           ;
 428     }
 429     try
 430     {
 431       for (Iterator i = features.iterator(); i.hasNext();)
 432       {
 433         EmblFeature feature = (EmblFeature) i.next();
 434         if (!noNa)
 435         {
 436           if (feature.dbRefs != null && feature.dbRefs.size() > 0)
 437           {
 438             for (Iterator dbr = feature.dbRefs.iterator(); dbr.hasNext(); dna
 439                     .addDBRef((DBRefEntry) dbr.next()))
 440               ;
 441           }
 442         }
 443         if (FeatureProperties.isCodingFeature(sourceDb, feature.getName()))
 444         {
 445           parseCodingFeature(feature, sourceDb, seqs, dna, noPeptide);
 446         }
 447         else
 448         {
 449           // General feature type.
 450           if (!noNa)
 451           {
 452             if (feature.dbRefs != null && feature.dbRefs.size() > 0)
 453             {
 454               for (Iterator dbr = feature.dbRefs.iterator(); dbr.hasNext(); dna
 455                       .addDBRef((DBRefEntry) dbr.next()))
 456                 ;
 457             }
 458           }
 459         }
 460       }
 461     } catch (Exception e)
 462     {
 463       System.err.println("EMBL Record Features parsing error!");
 464       System.err
 465               .println("Please report the following to help@jalview.org :");
 466       System.err.println("EMBL Record " + accession);
 467       System.err.println("Resulted in exception: " + e.getMessage());
 468       e.printStackTrace(System.err);
 469     }
 470     if (!noNa && dna != null)
 471     {
 472       seqs.add(dna);
 473     }
 474     SequenceI[] sqs = new SequenceI[seqs.size()];
 475     for (int i = 0, j = seqs.size(); i < j; i++)
 476     {
 477       sqs[i] = (SequenceI) seqs.elementAt(i);
 478       seqs.set(i, null);
 479     }
 480     return sqs;
 481   }
 482
 483   /**
 484    * attempt to extract coding region and product from a feature and properly
 485    * decorate it with annotations.
 486    *
 487    * @param feature
 488    *          coding feature
 489    * @param sourceDb
 490    *          source database for the EMBLXML
 491    * @param seqs
 492    *          place where sequences go
 493    * @param dna
 494    *          parent dna sequence for this record
 495    * @param noPeptide
 496    *          flag for generation of Peptide sequence objects
 497    */
 498   private void parseCodingFeature(EmblFeature feature, String sourceDb,
 499           Vector seqs, Sequence dna, boolean noPeptide)
 500   {
 501     boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
 502     // extract coding region(s)
 503     jalview.datamodel.Mapping map = null;
 504     int[] exon = null;
 505     if (feature.locations != null && feature.locations.size() > 0)
 506     {
 507       for (Enumeration locs = feature.locations.elements(); locs
 508               .hasMoreElements();)
 509       {
 510         EmblFeatureLocations loc = (EmblFeatureLocations) locs
 511                 .nextElement();
 512         int[] se = loc.getElementRanges(accession);
 513         if (exon == null)
 514         {
 515           exon = se;
 516         }
 517         else
 518         {
 519           int[] t = new int[exon.length + se.length];
 520           System.arraycopy(exon, 0, t, 0, exon.length);
 521           System.arraycopy(se, 0, t, exon.length, se.length);
 522           exon = t;
 523         }
 524       }
 525     }
 526     String prseq = null;
 527     String prname = new String();
 528     String prid = null;
 529     Hashtable vals = new Hashtable();
 530     int prstart = 1;
 531     // get qualifiers
 532     if (feature.getQualifiers() != null
 533             && feature.getQualifiers().size() > 0)
 534     {
 535       for (Iterator quals = feature.getQualifiers().iterator(); quals
 536               .hasNext();)
 537       {
 538         Qualifier q = (Qualifier) quals.next();
 539         if (q.getName().equals("translation"))
 540         {
 541           StringBuffer prsq = new StringBuffer(q.getValues()[0]);
 542           int p = prsq.indexOf(" ");
 543           while (p > -1)
 544           {
 545             prsq.deleteCharAt(p);
 546             p = prsq.indexOf(" ", p);
 547           }
 548           prseq = prsq.toString();
 549           prsq = null;
 550
 551         }
 552         else if (q.getName().equals("protein_id"))
 553         {
 554           prid = q.getValues()[0];
 555         }
 556         else if (q.getName().equals("codon_start"))
 557         {
 558           prstart = Integer.parseInt(q.getValues()[0]);
 559         }
 560         else if (q.getName().equals("product"))
 561         {
 562           prname = q.getValues()[0];
 563         }
 564         else
 565         {
 566           // throw anything else into the additional properties hash
 567           String[] s = q.getValues();
 568           StringBuffer sb = new StringBuffer();
 569           if (s != null)
 570           {
 571             for (int i = 0; i < s.length; i++)
 572             {
 573               sb.append(s[i]);
 574               sb.append("\n");
 575             }
 576           }
 577           vals.put(q.getName(), sb.toString());
 578         }
 579       }
 580     }
 581     Sequence product = null;
 582     exon = adjustForPrStart(prstart, exon);
 583
 584     if (prseq != null && prname != null && prid != null)
 585     {
 586       // extract proteins.
 587       product = new Sequence(prid, prseq, 1, prseq.length());
 588       product.setDescription(((prname.length() == 0) ? "Protein Product from "
 589               + sourceDb
 590               : prname));
 591       if (!noPeptide)
 592       {
 593         // Protein is also added to vector of sequences returned
 594         seqs.add(product);
 595       }
 596       // we have everything - create the mapping and perhaps the protein
 597       // sequence
 598       if (exon == null || exon.length == 0)
 599       {
 600         System.err
 601                 .println("Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
 602                         + sourceDb + ":" + getAccession() + ")");
 603         if (prseq.length() * 3 == (1 - prstart + dna.getSequence().length))
 604         {
 605           System.err
 606                   .println("Not allowing for additional stop codon at end of cDNA fragment... !");
 607           // this might occur for CDS sequences where no features are
 608           // marked.
 609           exon = new int[]
 610           { dna.getStart() + (prstart - 1), dna.getEnd() };
 611           map = new jalview.datamodel.Mapping(product, exon, new int[]
 612           { 1, prseq.length() }, 3, 1);
 613         }
 614         if ((prseq.length() + 1) * 3 == (1 - prstart + dna.getSequence().length))
 615         {
 616           System.err
 617                   .println("Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
 618           exon = new int[]
 619           { dna.getStart() + (prstart - 1), dna.getEnd() - 3 };
 620           map = new jalview.datamodel.Mapping(product, exon, new int[]
 621           { 1, prseq.length() }, 3, 1);
 622         }
 623       }
 624       else
 625       {
 626         // Trim the exon mapping if necessary - the given product may only be a
 627         // fragment of a larger protein. (EMBL:AY043181 is an example)
 628
 629         if (isEmblCdna)
 630         {
 631           // TODO: Add a DbRef back to the parent EMBL sequence with the exon
 632           // map
 633           // if given a dataset reference, search dataset for parent EMBL
 634           // sequence if it exists and set its map
 635           // make a new feature annotating the coding contig
 636         }
 637         else
 638         {
 639           // final product length trunctation check
 640
 641           map = new jalview.datamodel.Mapping(product,
 642                   adjustForProteinLength(prseq.length(), exon), new int[]
 643                   { 1, prseq.length() }, 3, 1);
 644           // reconstruct the EMBLCDS entry
 645           // TODO: this is only necessary when there codon annotation is
 646           // complete (I think JBPNote)
 647           DBRefEntry pcdnaref = new DBRefEntry();
 648           pcdnaref.setAccessionId(prid);
 649           pcdnaref.setSource(DBRefSource.EMBLCDS);
 650           pcdnaref.setVersion(getVersion()); // same as parent EMBL version.
 651           jalview.util.MapList mp = new jalview.util.MapList(new int[]
 652           { 1, prseq.length() }, new int[]
 653           { 1 + (prstart - 1), (prstart - 1) + 3 * prseq.length() }, 1, 3);
 654           // { 1 + (prstart - 1) * 3,
 655           // 1 + (prstart - 1) * 3 + prseq.length() * 3 - 1 }, new int[]
 656           // { 1prstart, prstart + prseq.length() - 1 }, 3, 1);
 657           pcdnaref.setMap(new Mapping(mp));
 658           if (product != null)
 659             product.addDBRef(pcdnaref);
 660
 661         }
 662       }
 663       // add cds feature to dna seq - this may include the stop codon
 664       for (int xint = 0; exon != null && xint < exon.length; xint += 2)
 665       {
 666         SequenceFeature sf = new SequenceFeature();
 667         sf.setBegin(exon[xint]);
 668         sf.setEnd(exon[xint + 1]);
 669         sf.setType(feature.getName());
 670         sf.setFeatureGroup(sourceDb);
 671         sf.setDescription("Exon " + (1 + (int) (xint / 2))
 672                 + " for protein '" + prname + "' EMBLCDS:" + prid);
 673         sf.setValue(FeatureProperties.EXONPOS, new Integer(1 + xint));
 674         sf.setValue(FeatureProperties.EXONPRODUCT, prname);
 675         if (vals != null && vals.size() > 0)
 676         {
 677           Enumeration kv = vals.elements();
 678           while (kv.hasMoreElements())
 679           {
 680             Object key = kv.nextElement();
 681             if (key != null)
 682               sf.setValue(key.toString(), vals.get(key));
 683           }
 684         }
 685         dna.addSequenceFeature(sf);
 686       }
 687     }
 688     // add dbRefs to sequence
 689     if (feature.dbRefs != null && feature.dbRefs.size() > 0)
 690     {
 691       for (Iterator dbr = feature.dbRefs.iterator(); dbr.hasNext();)
 692       {
 693         DBRefEntry ref = (DBRefEntry) dbr.next();
 694         ref.setSource(jalview.util.DBRefUtils.getCanonicalName(ref
 695                 .getSource()));
 696         // Hard code the kind of protein product accessions that EMBL cite
 697         if (ref.getSource().equals(jalview.datamodel.DBRefSource.UNIPROT))
 698         {
 699           ref.setMap(map);
 700           if (map != null && map.getTo() != null)
 701           {
 702             map.getTo().addDBRef(
 703                     new DBRefEntry(ref.getSource(), ref.getVersion(), ref
 704                             .getAccessionId())); // don't copy map over.
 705             if (map.getTo().getName().indexOf(prid) == 0)
 706             {
 707               map.getTo().setName(
 708                       jalview.datamodel.DBRefSource.UNIPROT + "|"
 709                               + ref.getAccessionId());
 710             }
 711           }
 712         }
 713         if (product != null)
 714         {
 715           DBRefEntry pref = new DBRefEntry(ref.getSource(),
 716                   ref.getVersion(), ref.getAccessionId());
 717           pref.setMap(null); // reference is direct
 718           product.addDBRef(pref);
 719           // Add converse mapping reference
 720           if (map != null)
 721           {
 722             Mapping pmap = new Mapping(dna, map.getMap().getInverse());
 723             pref = new DBRefEntry(sourceDb, getVersion(),
 724                     this.getAccession());
 725             pref.setMap(pmap);
 726             if (map.getTo() != null)
 727             {
 728               map.getTo().addDBRef(pref);
 729             }
 730           }
 731         }
 732         dna.addDBRef(ref);
 733       }
 734     }
 735   }
 736
 737   private int[] adjustForPrStart(int prstart, int[] exon)
 738   {
 739
 740     int origxon[], sxpos = -1;
 741     int sxstart, sxstop; // unnecessary variables used for debugging
 742     // first adjust range for codon start attribute
 743     if (prstart > 1)
 744     {
 745       origxon = new int[exon.length];
 746       System.arraycopy(exon, 0, origxon, 0, exon.length);
 747       int cdspos = 0;
 748       for (int x = 0; x < exon.length && sxpos == -1; x += 2)
 749       {
 750         cdspos += exon[x + 1] - exon[x] + 1;
 751         if (prstart <= cdspos)
 752         {
 753           sxpos = x;
 754           sxstart = exon[x];
 755           sxstop = exon[x + 1];
 756           // and adjust start boundary of first exon.
 757           exon[x] = exon[x + 1] - cdspos + prstart;
 758           break;
 759         }
 760       }
 761
 762       if (sxpos > 0)
 763       {
 764         int[] nxon = new int[exon.length - sxpos];
 765         System.arraycopy(exon, sxpos, nxon, 0, exon.length - sxpos);
 766         exon = nxon;
 767       }
 768     }
 769     return exon;
 770   }
 771
 772   /**
 773    * truncate the last exon interval to the prlength'th codon
 774    *
 775    * @param prlength
 776    * @param exon
 777    * @return new exon
 778    */
 779   private int[] adjustForProteinLength(int prlength, int[] exon)
 780   {
 781
 782     int origxon[], sxpos = -1, endxon = 0, cdslength = prlength * 3;
 783     int sxstart, sxstop; // unnecessary variables used for debugging
 784     // first adjust range for codon start attribute
 785     if (prlength >= 1 && exon != null)
 786     {
 787       origxon = new int[exon.length];
 788       System.arraycopy(exon, 0, origxon, 0, exon.length);
 789       int cdspos = 0;
 790       for (int x = 0; x < exon.length && sxpos == -1; x += 2)
 791       {
 792         cdspos += exon[x + 1] - exon[x] + 1;
 793         if (cdslength <= cdspos)
 794         {
 795           // advanced beyond last codon.
 796           sxpos = x;
 797           sxstart = exon[x];
 798           sxstop = exon[x + 1];
 799           if (cdslength != cdspos)
 800           {
 801             System.err
 802                     .println("Truncating final exon interval on region by "
 803                             + (cdspos - cdslength));
 804           }
 805           // locate the new end boundary of final exon as endxon
 806           endxon = exon[x + 1] - cdspos + cdslength;
 807           break;
 808         }
 809       }
 810
 811       if (sxpos != -1)
 812       {
 813         // and trim the exon interval set if necessary
 814         int[] nxon = new int[sxpos + 2];
 815         System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 816         nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 817                                   // set
 818         exon = nxon;
 819       }
 820     }
 821     return exon;
 822   }
 823 }