src/jalview/ws/dbsources/EmblXmlSource.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.ws.dbsources;
  22
  23 import jalview.analysis.SequenceIdMatcher;
  24 import jalview.bin.Cache;
  25 import jalview.datamodel.Alignment;
  26 import jalview.datamodel.AlignmentI;
  27 import jalview.datamodel.DBRefEntry;
  28 import jalview.datamodel.DBRefSource;
  29 import jalview.datamodel.FeatureProperties;
  30 import jalview.datamodel.Mapping;
  31 import jalview.datamodel.Sequence;
  32 import jalview.datamodel.SequenceFeature;
  33 import jalview.datamodel.SequenceI;
  34 import jalview.util.DBRefUtils;
  35 import jalview.util.DnaUtils;
  36 import jalview.util.MapList;
  37 import jalview.util.MappingUtils;
  38 import jalview.util.MessageManager;
  39 import jalview.ws.ebi.EBIFetchClient;
  40 import jalview.xml.binding.embl.EntryType;
  41 import jalview.xml.binding.embl.EntryType.Feature;
  42 import jalview.xml.binding.embl.EntryType.Feature.Qualifier;
  43 import jalview.xml.binding.embl.XrefType;
  44
  45 import java.io.File;
  46 import java.io.FileInputStream;
  47 import java.io.InputStream;
  48 import java.text.ParseException;
  49 import java.util.ArrayList;
  50 import java.util.Arrays;
  51 import java.util.Hashtable;
  52 import java.util.List;
  53 import java.util.Map;
  54 import java.util.Map.Entry;
  55
  56 import javax.xml.bind.JAXBContext;
  57 import javax.xml.bind.JAXBException;
  58 import javax.xml.stream.FactoryConfigurationError;
  59 import javax.xml.stream.XMLInputFactory;
  60 import javax.xml.stream.XMLStreamException;
  61 import javax.xml.stream.XMLStreamReader;
  62
  63 public abstract class EmblXmlSource extends EbiFileRetrievedProxy
  64 {
  65   /*
  66    * JAL-1856 Embl returns this text for query not found
  67    */
  68   private static final String EMBL_NOT_FOUND_REPLY = "ERROR 12 No entries found.";
  69
  70   public EmblXmlSource()
  71   {
  72     super();
  73   }
  74
  75   /**
  76    * Retrieves and parses an emblxml file, and returns an alignment containing
  77    * the parsed sequences, or null if none were found
  78    *
  79    * @param emprefx
  80    *          "EMBL" or "EMBLCDS" - anything else will not retrieve emblxml
  81    * @param query
  82    * @return
  83    * @throws Exception
  84    */
  85   protected AlignmentI getEmblSequenceRecords(String emprefx, String query)
  86           throws Exception
  87   {
  88     startQuery();
  89     EBIFetchClient dbFetch = new EBIFetchClient();
  90     File reply;
  91     try
  92     {
  93       reply = dbFetch.fetchDataAsFile(
  94               emprefx.toLowerCase() + ":" + query.trim(), "display=xml",
  95               "xml");
  96     } catch (Exception e)
  97     {
  98       stopQuery();
  99       throw new Exception(MessageManager.formatMessage(
 100               "exception.ebiembl_retrieval_failed_on", new String[]
 101               { emprefx.toLowerCase(), query.trim() }), e);
 102     }
 103     return getEmblSequenceRecords(emprefx, query, reply);
 104   }
 105
 106   /**
 107    * parse an emblxml file stored locally
 108    *
 109    * @param emprefx
 110    *          either EMBL or EMBLCDS strings are allowed - anything else will
 111    *          not retrieve emblxml
 112    * @param query
 113    * @param file
 114    *          the EMBL XML file containing the results of a query
 115    * @return
 116    * @throws Exception
 117    */
 118   protected AlignmentI getEmblSequenceRecords(String emprefx, String query,
 119           File reply) throws Exception
 120   {
 121     List<EntryType> entries = null;
 122     if (reply != null && reply.exists())
 123     {
 124       file = reply.getAbsolutePath();
 125       if (reply.length() > EMBL_NOT_FOUND_REPLY.length())
 126       {
 127         InputStream is = new FileInputStream(reply);
 128         entries = getEmblEntries(is);
 129       }
 130     }
 131
 132     /*
 133      * invalid accession gets a reply with no <entry> elements, text content of
 134      * EmbFile reads something like (e.g.) this ungrammatical phrase
 135      * Entry: <acc> display type is either not supported or entry is not found.
 136      */
 137     AlignmentI al = null;
 138     List<SequenceI> seqs = new ArrayList<>();
 139     List<SequenceI> peptides = new ArrayList<>();
 140     if (entries != null)
 141     {
 142       for (EntryType entry : entries)
 143       {
 144         SequenceI seq = getSequence(emprefx, entry, peptides);
 145         if (seq != null)
 146         {
 147           seqs.add(seq.deriveSequence());
 148           // place DBReferences on dataset and refer
 149         }
 150       }
 151       if (!seqs.isEmpty())
 152       {
 153         al = new Alignment(seqs.toArray(new SequenceI[seqs.size()]));
 154       }
 155       else
 156       {
 157         System.out.println(
 158                 "No record found for '" + emprefx + ":" + query + "'");
 159       }
 160     }
 161
 162     stopQuery();
 163     return al;
 164   }
 165
 166   /**
 167    * Reads the XML reply from file and unmarshals it to Java objects. Answers a
 168    * (possibly empty) list of <code>EntryType</code> objects.
 169    *
 170    * is
 171    *
 172    * @return
 173    */
 174   List<EntryType> getEmblEntries(InputStream is)
 175   {
 176     List<EntryType> entries = new ArrayList<>();
 177     try
 178     {
 179       JAXBContext jc = JAXBContext.newInstance("jalview.xml.binding.embl");
 180       XMLStreamReader streamReader = XMLInputFactory.newInstance()
 181               .createXMLStreamReader(is);
 182       javax.xml.bind.Unmarshaller um = jc.createUnmarshaller();
 183       jalview.xml.binding.embl.ROOT root = (jalview.xml.binding.embl.ROOT) um
 184               .unmarshal(streamReader);
 185
 186       /*
 187        * document root contains either "entry" or "entrySet"
 188        */
 189       if (root == null)
 190       {
 191         return entries;
 192       }
 193       if (root.getEntrySet() != null)
 194       {
 195         entries = root.getEntrySet().getEntry();
 196       }
 197       else if (root.getEntry() != null)
 198       {
 199         entries.add(root.getEntry());
 200       }
 201     } catch (JAXBException | XMLStreamException
 202             | FactoryConfigurationError e)
 203     {
 204       e.printStackTrace();
 205     }
 206     return entries;
 207   }
 208
 209   /**
 210    * A helper method to parse XML data and construct a sequence, with any
 211    * available database references and features
 212    *
 213    * @param emprefx
 214    * @param entry
 215    * @param peptides
 216    * @return
 217    */
 218   SequenceI getSequence(String sourceDb, EntryType entry,
 219           List<SequenceI> peptides)
 220   {
 221     String seqString = entry.getSequence();
 222     if (seqString == null)
 223     {
 224       return null;
 225     }
 226     seqString = seqString.replace(" ", "").replace("\n", "").replace("\t",
 227             "");
 228     String accession = entry.getAccession();
 229     SequenceI dna = new Sequence(sourceDb + "|" + accession, seqString);
 230
 231     dna.setDescription(entry.getDescription());
 232     String sequenceVersion = String.valueOf(entry.getVersion().intValue());
 233     DBRefEntry selfRref = new DBRefEntry(sourceDb, sequenceVersion,
 234             accession);
 235     dna.addDBRef(selfRref);
 236     selfRref.setMap(
 237             new Mapping(null, new int[]
 238             { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1,
 239                     1));
 240
 241     /*
 242      * add db references
 243      */
 244     List<XrefType> xrefs = entry.getXref();
 245     if (xrefs != null)
 246     {
 247       for (XrefType xref : xrefs)
 248       {
 249         String acc = xref.getId();
 250         String source = DBRefUtils.getCanonicalName(xref.getDb());
 251         String version = xref.getSecondaryId();
 252         if (version == null || "".equals(version))
 253         {
 254           version = "0";
 255         }
 256         dna.addDBRef(new DBRefEntry(source, version, acc));
 257       }
 258     }
 259
 260     SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
 261     try
 262     {
 263       List<Feature> features = entry.getFeature();
 264       if (features != null)
 265       {
 266         for (Feature feature : features)
 267         {
 268           if (FeatureProperties.isCodingFeature(sourceDb,
 269                   feature.getName()))
 270           {
 271             parseCodingFeature(entry, feature, sourceDb, dna, peptides,
 272                     matcher);
 273           }
 274         }
 275       }
 276     } catch (Exception e)
 277     {
 278       System.err.println("EMBL Record Features parsing error!");
 279       System.err
 280               .println("Please report the following to help@jalview.org :");
 281       System.err.println("EMBL Record " + accession);
 282       System.err.println("Resulted in exception: " + e.getMessage());
 283       e.printStackTrace(System.err);
 284     }
 285
 286     return dna;
 287   }
 288
 289   /**
 290    * Extracts coding region and product from a CDS feature and decorates it with
 291    * annotations
 292    *
 293    * @param entry
 294    * @param feature
 295    * @param sourceDb
 296    * @param dna
 297    * @param peptides
 298    * @param matcher
 299    */
 300   void parseCodingFeature(EntryType entry, Feature feature, String sourceDb,
 301           SequenceI dna, List<SequenceI> peptides,
 302           SequenceIdMatcher matcher)
 303   {
 304     final boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
 305     final String accession = entry.getAccession();
 306     final String sequenceVersion = entry.getVersion().toString();
 307
 308     int[] exons = getCdsRanges(entry.getAccession(), feature);
 309
 310     String translation = null;
 311     String proteinName = "";
 312     String proteinId = null;
 313     Map<String, String> vals = new Hashtable<>();
 314
 315     /*
 316      * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
 317      * (phase is required for CDS features in GFF3 format)
 318      */
 319     int codonStart = 1;
 320
 321     /*
 322      * parse qualifiers, saving protein translation, protein id,
 323      * codon start position, product (name), and 'other values'
 324      */
 325     if (feature.getQualifier() != null)
 326     {
 327       for (Qualifier q : feature.getQualifier())
 328       {
 329         String qname = q.getName();
 330         String value = q.getValue();
 331         value = value == null ? ""
 332                 : value.trim().replace(" ", "").replace("\n", "")
 333                         .replace("\t", "");
 334         if (qname.equals("translation"))
 335         {
 336           translation = value;
 337         }
 338         else if (qname.equals("protein_id"))
 339         {
 340           proteinId = value;
 341         }
 342         else if (qname.equals("codon_start"))
 343         {
 344           try
 345           {
 346             codonStart = Integer.parseInt(value.trim());
 347           } catch (NumberFormatException e)
 348           {
 349             System.err.println("Invalid codon_start in XML for "
 350                     + entry.getAccession() + ": " + e.getMessage());
 351           }
 352         }
 353         else if (qname.equals("product"))
 354         {
 355           // sometimes name is returned e.g. for V00488
 356           proteinName = value;
 357         }
 358         else
 359         {
 360           // throw anything else into the additional properties hash
 361           if (!"".equals(value))
 362           {
 363             vals.put(qname, value);
 364           }
 365         }
 366       }
 367     }
 368
 369     DBRefEntry proteinToEmblProteinRef = null;
 370     exons = MappingUtils.removeStartPositions(codonStart - 1, exons);
 371
 372     SequenceI product = null;
 373     Mapping dnaToProteinMapping = null;
 374     if (translation != null && proteinName != null && proteinId != null)
 375     {
 376       int translationLength = translation.length();
 377
 378       /*
 379        * look for product in peptides list, if not found, add it
 380        */
 381       product = matcher.findIdMatch(proteinId);
 382       if (product == null)
 383       {
 384         product = new Sequence(proteinId, translation, 1,
 385                 translationLength);
 386         product.setDescription(((proteinName.length() == 0)
 387                 ? "Protein Product from " + sourceDb
 388                 : proteinName));
 389         peptides.add(product);
 390         matcher.add(product);
 391       }
 392
 393       // we have everything - create the mapping and perhaps the protein
 394       // sequence
 395       if (exons == null || exons.length == 0)
 396       {
 397         /*
 398          * workaround until we handle dna location for CDS sequence
 399          * e.g. location="X53828.1:60..1058" correctly
 400          */
 401         System.err.println(
 402                 "Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
 403                         + sourceDb + ":" + entry.getAccession() + ")");
 404         int dnaLength = dna.getLength();
 405         if (translationLength * 3 == (1 - codonStart + dnaLength))
 406         {
 407           System.err.println(
 408                   "Not allowing for additional stop codon at end of cDNA fragment... !");
 409           // this might occur for CDS sequences where no features are marked
 410           exons = new int[] { dna.getStart() + (codonStart - 1),
 411               dna.getEnd() };
 412           dnaToProteinMapping = new Mapping(product, exons,
 413                   new int[]
 414                   { 1, translationLength }, 3, 1);
 415         }
 416         if ((translationLength + 1) * 3 == (1 - codonStart + dnaLength))
 417         {
 418           System.err.println(
 419                   "Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
 420           exons = new int[] { dna.getStart() + (codonStart - 1),
 421               dna.getEnd() - 3 };
 422           dnaToProteinMapping = new Mapping(product, exons,
 423                   new int[]
 424                   { 1, translationLength }, 3, 1);
 425         }
 426       }
 427       else
 428       {
 429         // Trim the exon mapping if necessary - the given product may only be a
 430         // fragment of a larger protein. (EMBL:AY043181 is an example)
 431
 432         if (isEmblCdna)
 433         {
 434           // TODO: Add a DbRef back to the parent EMBL sequence with the exon
 435           // map
 436           // if given a dataset reference, search dataset for parent EMBL
 437           // sequence if it exists and set its map
 438           // make a new feature annotating the coding contig
 439         }
 440         else
 441         {
 442           // final product length truncation check
 443           int[] cdsRanges = adjustForProteinLength(translationLength,
 444                   exons);
 445           dnaToProteinMapping = new Mapping(product, cdsRanges,
 446                   new int[]
 447                   { 1, translationLength }, 3, 1);
 448           if (product != null)
 449           {
 450             /*
 451              * make xref with mapping from protein to EMBL dna
 452              */
 453             DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL,
 454                     sequenceVersion, proteinId,
 455                     new Mapping(dnaToProteinMapping.getMap().getInverse()));
 456             product.addDBRef(proteinToEmblRef);
 457
 458             /*
 459              * make xref from protein to EMBLCDS; we assume here that the
 460              * CDS sequence version is same as dna sequence (?!)
 461              */
 462             MapList proteinToCdsMapList = new MapList(
 463                     new int[]
 464                     { 1, translationLength },
 465                     new int[]
 466                     { 1 + (codonStart - 1),
 467                         (codonStart - 1) + 3 * translationLength },
 468                     1, 3);
 469             DBRefEntry proteinToEmblCdsRef = new DBRefEntry(
 470                     DBRefSource.EMBLCDS, sequenceVersion, proteinId,
 471                     new Mapping(proteinToCdsMapList));
 472             product.addDBRef(proteinToEmblCdsRef);
 473
 474             /*
 475              * make 'direct' xref from protein to EMBLCDSPROTEIN
 476              */
 477             proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef);
 478             proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct);
 479             proteinToEmblProteinRef.setMap(null);
 480             product.addDBRef(proteinToEmblProteinRef);
 481           }
 482         }
 483       }
 484
 485       /*
 486        * add cds features to dna sequence
 487        */
 488       String cds = feature.getName(); // "CDS"
 489       for (int xint = 0; exons != null
 490               && xint < exons.length - 1; xint += 2)
 491       {
 492         int exonStart = exons[xint];
 493         int exonEnd = exons[xint + 1];
 494         int begin = Math.min(exonStart, exonEnd);
 495         int end = Math.max(exonStart, exonEnd);
 496         int exonNumber = xint / 2 + 1;
 497         String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s",
 498                 exonNumber, proteinName, proteinId);
 499
 500         SequenceFeature sf = makeCdsFeature(cds, desc, begin, end, sourceDb,
 501                 vals);
 502
 503         sf.setEnaLocation(feature.getLocation());
 504         boolean forwardStrand = exonStart <= exonEnd;
 505         sf.setStrand(forwardStrand ? "+" : "-");
 506         sf.setPhase(String.valueOf(codonStart - 1));
 507         sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 508         sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
 509
 510         dna.addSequenceFeature(sf);
 511       }
 512     }
 513
 514     /*
 515      * add feature dbRefs to sequence, and mappings for Uniprot xrefs
 516      */
 517     boolean hasUniprotDbref = false;
 518     List<XrefType> xrefs = feature.getXref();
 519     if (xrefs != null)
 520     {
 521       boolean mappingUsed = false;
 522       for (XrefType xref : xrefs)
 523       {
 524         /*
 525          * ensure UniProtKB/Swiss-Prot converted to UNIPROT
 526          */
 527         String source = DBRefUtils.getCanonicalName(xref.getDb());
 528         String version = xref.getSecondaryId();
 529         if (version == null || "".equals(version))
 530         {
 531           version = "0";
 532         }
 533         DBRefEntry dbref = new DBRefEntry(source, version, xref.getId());
 534         DBRefEntry proteinDbRef = new DBRefEntry(source, version,
 535                 dbref.getAccessionId());
 536         if (source.equals(DBRefSource.UNIPROT))
 537         {
 538           String proteinSeqName = DBRefSource.UNIPROT + "|"
 539                   + dbref.getAccessionId();
 540           if (dnaToProteinMapping != null
 541                   && dnaToProteinMapping.getTo() != null)
 542           {
 543             if (mappingUsed)
 544             {
 545               /*
 546                * two or more Uniprot xrefs for the same CDS -
 547                * each needs a distinct Mapping (as to a different sequence)
 548                */
 549               dnaToProteinMapping = new Mapping(dnaToProteinMapping);
 550             }
 551             mappingUsed = true;
 552
 553             /*
 554              * try to locate the protein mapped to (possibly by a
 555              * previous CDS feature); if not found, construct it from
 556              * the EMBL translation
 557              */
 558             SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName);
 559             if (proteinSeq == null)
 560             {
 561               proteinSeq = new Sequence(proteinSeqName,
 562                       product.getSequenceAsString());
 563               matcher.add(proteinSeq);
 564               peptides.add(proteinSeq);
 565             }
 566             dnaToProteinMapping.setTo(proteinSeq);
 567             dnaToProteinMapping.setMappedFromId(proteinId);
 568             proteinSeq.addDBRef(proteinDbRef);
 569             dbref.setMap(dnaToProteinMapping);
 570           }
 571           hasUniprotDbref = true;
 572         }
 573         if (product != null)
 574         {
 575           /*
 576            * copy feature dbref to our protein product
 577            */
 578           DBRefEntry pref = proteinDbRef;
 579           pref.setMap(null); // reference is direct
 580           product.addDBRef(pref);
 581           // Add converse mapping reference
 582           if (dnaToProteinMapping != null)
 583           {
 584             Mapping pmap = new Mapping(dna,
 585                     dnaToProteinMapping.getMap().getInverse());
 586             pref = new DBRefEntry(sourceDb, sequenceVersion, accession);
 587             pref.setMap(pmap);
 588             if (dnaToProteinMapping.getTo() != null)
 589             {
 590               dnaToProteinMapping.getTo().addDBRef(pref);
 591             }
 592           }
 593         }
 594         dna.addDBRef(dbref);
 595       }
 596     }
 597
 598     /*
 599      * if we have a product (translation) but no explicit Uniprot dbref
 600      * (example: EMBL AAFI02000057 protein_id EAL65544.1)
 601      * then construct mappings to an assumed EMBLCDSPROTEIN accession
 602      */
 603     if (!hasUniprotDbref && product != null)
 604     {
 605       if (proteinToEmblProteinRef == null)
 606       {
 607         // assuming CDSPROTEIN sequence version = dna version (?!)
 608         proteinToEmblProteinRef = new DBRefEntry(DBRefSource.EMBLCDSProduct,
 609                 sequenceVersion, proteinId);
 610       }
 611       product.addDBRef(proteinToEmblProteinRef);
 612
 613       if (dnaToProteinMapping != null
 614               && dnaToProteinMapping.getTo() != null)
 615       {
 616         DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
 617                 DBRefSource.EMBLCDSProduct, sequenceVersion,
 618                 proteinId);
 619         dnaToEmblProteinRef.setMap(dnaToProteinMapping);
 620         dnaToProteinMapping.setMappedFromId(proteinId);
 621         dna.addDBRef(dnaToEmblProteinRef);
 622       }
 623     }
 624   }
 625
 626   @Override
 627   public boolean isDnaCoding()
 628   {
 629     return true;
 630   }
 631
 632   /**
 633    * Returns the CDS positions as a single array of [start, end, start, end...]
 634    * positions. If on the reverse strand, these will be in descending order.
 635    *
 636    * @param accession
 637    * @param feature
 638    * @return
 639    */
 640   protected int[] getCdsRanges(String accession, Feature feature)
 641   {
 642     String location = feature.getLocation();
 643     if (location == null)
 644     {
 645       return new int[] {};
 646     }
 647
 648     try
 649     {
 650       List<int[]> ranges = DnaUtils.parseLocation(location);
 651       return listToArray(ranges);
 652     } catch (ParseException e)
 653     {
 654       Cache.log.warn(
 655               String.format("Not parsing inexact CDS location %s in ENA %s",
 656                       location, accession));
 657       return new int[] {};
 658     }
 659   }
 660
 661   /**
 662    * Converts a list of [start, end] ranges to a single array of [start, end,
 663    * start, end ...]
 664    *
 665    * @param ranges
 666    * @return
 667    */
 668   int[] listToArray(List<int[]> ranges)
 669   {
 670     int[] result = new int[ranges.size() * 2];
 671     int i = 0;
 672     for (int[] range : ranges)
 673     {
 674       result[i++] = range[0];
 675       result[i++] = range[1];
 676     }
 677     return result;
 678   }
 679
 680   /**
 681    * Helper method to construct a SequenceFeature for one cds range
 682    *
 683    * @param type
 684    *          feature type ("CDS")
 685    * @param desc
 686    *          description
 687    * @param begin
 688    *          start position
 689    * @param end
 690    *          end position
 691    * @param group
 692    *          feature group
 693    * @param vals
 694    *          map of 'miscellaneous values' for feature
 695    * @return
 696    */
 697   protected SequenceFeature makeCdsFeature(String type, String desc,
 698           int begin, int end, String group, Map<String, String> vals)
 699   {
 700     SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group);
 701     if (!vals.isEmpty())
 702     {
 703       for (Entry<String, String> val : vals.entrySet())
 704       {
 705         sf.setValue(val.getKey(), val.getValue());
 706       }
 707     }
 708     return sf;
 709   }
 710
 711   /**
 712    * Truncates (if necessary) the exon intervals to match 3 times the length of
 713    * the protein; also accepts 3 bases longer (for stop codon not included in
 714    * protein)
 715    *
 716    * @param proteinLength
 717    * @param exon
 718    *          an array of [start, end, start, end...] intervals
 719    * @return the same array (if unchanged) or a truncated copy
 720    */
 721   static int[] adjustForProteinLength(int proteinLength, int[] exon)
 722   {
 723     if (proteinLength <= 0 || exon == null)
 724     {
 725       return exon;
 726     }
 727     int expectedCdsLength = proteinLength * 3;
 728     int exonLength = MappingUtils.getLength(Arrays.asList(exon));
 729
 730     /*
 731      * if exon length matches protein, or is shorter, or longer by the
 732      * length of a stop codon (3 bases), then leave it unchanged
 733      */
 734     if (expectedCdsLength >= exonLength
 735             || expectedCdsLength == exonLength - 3)
 736     {
 737       return exon;
 738     }
 739
 740     int origxon[];
 741     int sxpos = -1;
 742     int endxon = 0;
 743     origxon = new int[exon.length];
 744     System.arraycopy(exon, 0, origxon, 0, exon.length);
 745     int cdspos = 0;
 746     for (int x = 0; x < exon.length; x += 2)
 747     {
 748       cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
 749       if (expectedCdsLength <= cdspos)
 750       {
 751         // advanced beyond last codon.
 752         sxpos = x;
 753         if (expectedCdsLength != cdspos)
 754         {
 755           // System.err
 756           // .println("Truncating final exon interval on region by "
 757           // + (cdspos - cdslength));
 758         }
 759
 760         /*
 761          * shrink the final exon - reduce end position if forward
 762          * strand, increase it if reverse
 763          */
 764         if (exon[x + 1] >= exon[x])
 765         {
 766           endxon = exon[x + 1] - cdspos + expectedCdsLength;
 767         }
 768         else
 769         {
 770           endxon = exon[x + 1] + cdspos - expectedCdsLength;
 771         }
 772         break;
 773       }
 774     }
 775
 776     if (sxpos != -1)
 777     {
 778       // and trim the exon interval set if necessary
 779       int[] nxon = new int[sxpos + 2];
 780       System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 781       nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 782                                 // set
 783       exon = nxon;
 784     }
 785     return exon;
 786   }
 787
 788 }