src/jalview/ws/dbsources/EmblXmlSource.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.ws.dbsources;
  22
  23 import jalview.analysis.SequenceIdMatcher;
  24 import jalview.bin.Cache;
  25 import jalview.datamodel.Alignment;
  26 import jalview.datamodel.AlignmentI;
  27 import jalview.datamodel.DBRefEntry;
  28 import jalview.datamodel.DBRefSource;
  29 import jalview.datamodel.FeatureProperties;
  30 import jalview.datamodel.Mapping;
  31 import jalview.datamodel.Sequence;
  32 import jalview.datamodel.SequenceFeature;
  33 import jalview.datamodel.SequenceI;
  34 import jalview.util.DBRefUtils;
  35 import jalview.util.DnaUtils;
  36 import jalview.util.MapList;
  37 import jalview.util.MappingUtils;
  38 import jalview.util.MessageManager;
  39 import jalview.ws.ebi.EBIFetchClient;
  40 import jalview.xml.binding.embl.EntryType;
  41 import jalview.xml.binding.embl.EntryType.Feature;
  42 import jalview.xml.binding.embl.EntryType.Feature.Qualifier;
  43 import jalview.xml.binding.embl.XrefType;
  44
  45 import java.io.File;
  46 import java.io.FileInputStream;
  47 import java.io.InputStream;
  48 import java.text.ParseException;
  49 import java.util.ArrayList;
  50 import java.util.Arrays;
  51 import java.util.Hashtable;
  52 import java.util.List;
  53 import java.util.Map;
  54 import java.util.Map.Entry;
  55 import java.util.regex.Pattern;
  56
  57 import javax.xml.bind.JAXBContext;
  58 import javax.xml.bind.JAXBException;
  59 import javax.xml.stream.FactoryConfigurationError;
  60 import javax.xml.stream.XMLInputFactory;
  61 import javax.xml.stream.XMLStreamException;
  62 import javax.xml.stream.XMLStreamReader;
  63
  64 public abstract class EmblXmlSource extends EbiFileRetrievedProxy
  65 {
  66   /*
  67    * JAL-1856 Embl returns this text for query not found
  68    */
  69   private static final String EMBL_NOT_FOUND_REPLY = "ERROR 12 No entries found.";
  70
  71   private static final Pattern SPACE_PATTERN = Pattern.compile(" ");
  72
  73   public EmblXmlSource()
  74   {
  75     super();
  76   }
  77
  78   /**
  79    * Retrieves and parses an emblxml file, and returns an alignment containing
  80    * the parsed sequences, or null if none were found
  81    *
  82    * @param emprefx
  83    *          "EMBL" or "EMBLCDS" - anything else will not retrieve emblxml
  84    * @param query
  85    * @return
  86    * @throws Exception
  87    */
  88   protected AlignmentI getEmblSequenceRecords(String emprefx, String query)
  89           throws Exception
  90   {
  91     startQuery();
  92     EBIFetchClient dbFetch = new EBIFetchClient();
  93     File reply;
  94     try
  95     {
  96       reply = dbFetch.fetchDataAsFile(
  97               emprefx.toLowerCase() + ":" + query.trim(), "display=xml",
  98               "xml");
  99     } catch (Exception e)
 100     {
 101       stopQuery();
 102       throw new Exception(MessageManager.formatMessage(
 103               "exception.ebiembl_retrieval_failed_on", new String[]
 104               { emprefx.toLowerCase(), query.trim() }), e);
 105     }
 106     return getEmblSequenceRecords(emprefx, query, reply);
 107   }
 108
 109   /**
 110    * parse an emblxml file stored locally
 111    *
 112    * @param emprefx
 113    *          either EMBL or EMBLCDS strings are allowed - anything else will
 114    *          not retrieve emblxml
 115    * @param query
 116    * @param file
 117    *          the EMBL XML file containing the results of a query
 118    * @return
 119    * @throws Exception
 120    */
 121   protected AlignmentI getEmblSequenceRecords(String emprefx, String query,
 122           File reply) throws Exception
 123   {
 124     List<EntryType> entries = null;
 125     if (reply != null && reply.exists())
 126     {
 127       file = reply.getAbsolutePath();
 128       if (reply.length() > EMBL_NOT_FOUND_REPLY.length())
 129       {
 130         InputStream is = new FileInputStream(reply);
 131         entries = getEmblEntries(is);
 132       }
 133     }
 134
 135     /*
 136      * invalid accession gets a reply with no <entry> elements, text content of
 137      * EmbFile reads something like (e.g.) this ungrammatical phrase
 138      * Entry: <acc> display type is either not supported or entry is not found.
 139      */
 140     AlignmentI al = null;
 141     List<SequenceI> seqs = new ArrayList<>();
 142     List<SequenceI> peptides = new ArrayList<>();
 143     if (entries != null)
 144     {
 145       for (EntryType entry : entries)
 146       {
 147         SequenceI seq = getSequence(emprefx, entry, peptides);
 148         if (seq != null)
 149         {
 150           seqs.add(seq.deriveSequence());
 151           // place DBReferences on dataset and refer
 152         }
 153       }
 154       if (!seqs.isEmpty())
 155       {
 156         al = new Alignment(seqs.toArray(new SequenceI[seqs.size()]));
 157       }
 158       else
 159       {
 160         System.out.println(
 161                 "No record found for '" + emprefx + ":" + query + "'");
 162       }
 163     }
 164
 165     stopQuery();
 166     return al;
 167   }
 168
 169   /**
 170    * Reads the XML reply from file and unmarshals it to Java objects. Answers a
 171    * (possibly empty) list of <code>EntryType</code> objects.
 172    *
 173    * is
 174    *
 175    * @return
 176    */
 177   List<EntryType> getEmblEntries(InputStream is)
 178   {
 179     List<EntryType> entries = new ArrayList<>();
 180     try
 181     {
 182       JAXBContext jc = JAXBContext.newInstance("jalview.xml.binding.embl");
 183       XMLStreamReader streamReader = XMLInputFactory.newInstance()
 184               .createXMLStreamReader(is);
 185       javax.xml.bind.Unmarshaller um = jc.createUnmarshaller();
 186       jalview.xml.binding.embl.ROOT root = (jalview.xml.binding.embl.ROOT) um
 187               .unmarshal(streamReader);
 188
 189       /*
 190        * document root contains either "entry" or "entrySet"
 191        */
 192       if (root == null)
 193       {
 194         return entries;
 195       }
 196       if (root.getEntrySet() != null)
 197       {
 198         entries = root.getEntrySet().getEntry();
 199       }
 200       else if (root.getEntry() != null)
 201       {
 202         entries.add(root.getEntry());
 203       }
 204     } catch (JAXBException | XMLStreamException
 205             | FactoryConfigurationError e)
 206     {
 207       e.printStackTrace();
 208     }
 209     return entries;
 210   }
 211
 212   /**
 213    * A helper method to parse XML data and construct a sequence, with any
 214    * available database references and features
 215    *
 216    * @param emprefx
 217    * @param entry
 218    * @param peptides
 219    * @return
 220    */
 221   SequenceI getSequence(String sourceDb, EntryType entry,
 222           List<SequenceI> peptides)
 223   {
 224     String seqString = entry.getSequence();
 225     if (seqString == null)
 226     {
 227       return null;
 228     }
 229     seqString = seqString.replace(" ", "").replace("\n", "").replace("\t",
 230             "");
 231     String accession = entry.getAccession();
 232     SequenceI dna = new Sequence(sourceDb + "|" + accession, seqString);
 233
 234     dna.setDescription(entry.getDescription());
 235     String sequenceVersion = String.valueOf(entry.getVersion().intValue());
 236     DBRefEntry selfRref = new DBRefEntry(sourceDb, sequenceVersion,
 237             accession);
 238     dna.addDBRef(selfRref);
 239     selfRref.setMap(
 240             new Mapping(null, new int[]
 241             { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1,
 242                     1));
 243
 244     /*
 245      * add db references
 246      */
 247     List<XrefType> xrefs = entry.getXref();
 248     if (xrefs != null)
 249     {
 250       for (XrefType xref : xrefs)
 251       {
 252         String acc = xref.getId();
 253         String source = DBRefUtils.getCanonicalName(xref.getDb());
 254         String version = xref.getSecondaryId();
 255         if (version == null || "".equals(version))
 256         {
 257           version = "0";
 258         }
 259         dna.addDBRef(new DBRefEntry(source, version, acc));
 260       }
 261     }
 262
 263     SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
 264     try
 265     {
 266       List<Feature> features = entry.getFeature();
 267       if (features != null)
 268       {
 269         for (Feature feature : features)
 270         {
 271           if (FeatureProperties.isCodingFeature(sourceDb,
 272                   feature.getName()))
 273           {
 274             parseCodingFeature(entry, feature, sourceDb, dna, peptides,
 275                     matcher);
 276           }
 277         }
 278       }
 279     } catch (Exception e)
 280     {
 281       System.err.println("EMBL Record Features parsing error!");
 282       System.err
 283               .println("Please report the following to help@jalview.org :");
 284       System.err.println("EMBL Record " + accession);
 285       System.err.println("Resulted in exception: " + e.getMessage());
 286       e.printStackTrace(System.err);
 287     }
 288
 289     return dna;
 290   }
 291
 292   /**
 293    * Extracts coding region and product from a CDS feature and decorates it with
 294    * annotations
 295    *
 296    * @param entry
 297    * @param feature
 298    * @param sourceDb
 299    * @param dna
 300    * @param peptides
 301    * @param matcher
 302    */
 303   void parseCodingFeature(EntryType entry, Feature feature, String sourceDb,
 304           SequenceI dna, List<SequenceI> peptides,
 305           SequenceIdMatcher matcher)
 306   {
 307     final boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
 308     final String accession = entry.getAccession();
 309     final String sequenceVersion = entry.getVersion().toString();
 310
 311     int[] exons = getCdsRanges(entry.getAccession(), feature);
 312
 313     String translation = null;
 314     String proteinName = "";
 315     String proteinId = null;
 316     Map<String, String> vals = new Hashtable<>();
 317
 318     /*
 319      * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
 320      * (phase is required for CDS features in GFF3 format)
 321      */
 322     int codonStart = 1;
 323
 324     /*
 325      * parse qualifiers, saving protein translation, protein id,
 326      * codon start position, product (name), and 'other values'
 327      */
 328     if (feature.getQualifier() != null)
 329     {
 330       for (Qualifier q : feature.getQualifier())
 331       {
 332         String qname = q.getName();
 333         String value = q.getValue();
 334         value = value == null ? ""
 335                 : value.trim().replace(" ", "").replace("\n", "")
 336                         .replace("\t", "");
 337         if (qname.equals("translation"))
 338         {
 339           translation = value;
 340         }
 341         else if (qname.equals("protein_id"))
 342         {
 343           proteinId = value;
 344         }
 345         else if (qname.equals("codon_start"))
 346         {
 347           try
 348           {
 349             codonStart = Integer.parseInt(value.trim());
 350           } catch (NumberFormatException e)
 351           {
 352             System.err.println("Invalid codon_start in XML for "
 353                     + entry.getAccession() + ": " + e.getMessage());
 354           }
 355         }
 356         else if (qname.equals("product"))
 357         {
 358           // sometimes name is returned e.g. for V00488
 359           proteinName = value;
 360         }
 361         else
 362         {
 363           // throw anything else into the additional properties hash
 364           if (!"".equals(value))
 365           {
 366             vals.put(qname, value);
 367           }
 368         }
 369       }
 370     }
 371
 372     DBRefEntry proteinToEmblProteinRef = null;
 373     exons = MappingUtils.removeStartPositions(codonStart - 1, exons);
 374
 375     SequenceI product = null;
 376     Mapping dnaToProteinMapping = null;
 377     if (translation != null && proteinName != null && proteinId != null)
 378     {
 379       int translationLength = translation.length();
 380
 381       /*
 382        * look for product in peptides list, if not found, add it
 383        */
 384       product = matcher.findIdMatch(proteinId);
 385       if (product == null)
 386       {
 387         product = new Sequence(proteinId, translation, 1,
 388                 translationLength);
 389         product.setDescription(((proteinName.length() == 0)
 390                 ? "Protein Product from " + sourceDb
 391                 : proteinName));
 392         peptides.add(product);
 393         matcher.add(product);
 394       }
 395
 396       // we have everything - create the mapping and perhaps the protein
 397       // sequence
 398       if (exons == null || exons.length == 0)
 399       {
 400         /*
 401          * workaround until we handle dna location for CDS sequence
 402          * e.g. location="X53828.1:60..1058" correctly
 403          */
 404         System.err.println(
 405                 "Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
 406                         + sourceDb + ":" + entry.getAccession() + ")");
 407         int dnaLength = dna.getLength();
 408         if (translationLength * 3 == (1 - codonStart + dnaLength))
 409         {
 410           System.err.println(
 411                   "Not allowing for additional stop codon at end of cDNA fragment... !");
 412           // this might occur for CDS sequences where no features are marked
 413           exons = new int[] { dna.getStart() + (codonStart - 1),
 414               dna.getEnd() };
 415           dnaToProteinMapping = new Mapping(product, exons,
 416                   new int[]
 417                   { 1, translationLength }, 3, 1);
 418         }
 419         if ((translationLength + 1) * 3 == (1 - codonStart + dnaLength))
 420         {
 421           System.err.println(
 422                   "Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
 423           exons = new int[] { dna.getStart() + (codonStart - 1),
 424               dna.getEnd() - 3 };
 425           dnaToProteinMapping = new Mapping(product, exons,
 426                   new int[]
 427                   { 1, translationLength }, 3, 1);
 428         }
 429       }
 430       else
 431       {
 432         // Trim the exon mapping if necessary - the given product may only be a
 433         // fragment of a larger protein. (EMBL:AY043181 is an example)
 434
 435         if (isEmblCdna)
 436         {
 437           // TODO: Add a DbRef back to the parent EMBL sequence with the exon
 438           // map
 439           // if given a dataset reference, search dataset for parent EMBL
 440           // sequence if it exists and set its map
 441           // make a new feature annotating the coding contig
 442         }
 443         else
 444         {
 445           // final product length truncation check
 446           int[] cdsRanges = adjustForProteinLength(translationLength,
 447                   exons);
 448           dnaToProteinMapping = new Mapping(product, cdsRanges,
 449                   new int[]
 450                   { 1, translationLength }, 3, 1);
 451           if (product != null)
 452           {
 453             /*
 454              * make xref with mapping from protein to EMBL dna
 455              */
 456             DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL,
 457                     sequenceVersion, proteinId,
 458                     new Mapping(dnaToProteinMapping.getMap().getInverse()));
 459             product.addDBRef(proteinToEmblRef);
 460
 461             /*
 462              * make xref from protein to EMBLCDS; we assume here that the
 463              * CDS sequence version is same as dna sequence (?!)
 464              */
 465             MapList proteinToCdsMapList = new MapList(
 466                     new int[]
 467                     { 1, translationLength },
 468                     new int[]
 469                     { 1 + (codonStart - 1),
 470                         (codonStart - 1) + 3 * translationLength },
 471                     1, 3);
 472             DBRefEntry proteinToEmblCdsRef = new DBRefEntry(
 473                     DBRefSource.EMBLCDS, sequenceVersion, proteinId,
 474                     new Mapping(proteinToCdsMapList));
 475             product.addDBRef(proteinToEmblCdsRef);
 476
 477             /*
 478              * make 'direct' xref from protein to EMBLCDSPROTEIN
 479              */
 480             proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef);
 481             proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct);
 482             proteinToEmblProteinRef.setMap(null);
 483             product.addDBRef(proteinToEmblProteinRef);
 484           }
 485         }
 486       }
 487
 488       /*
 489        * add cds features to dna sequence
 490        */
 491       String cds = feature.getName(); // "CDS"
 492       for (int xint = 0; exons != null
 493               && xint < exons.length - 1; xint += 2)
 494       {
 495         int exonStart = exons[xint];
 496         int exonEnd = exons[xint + 1];
 497         int begin = Math.min(exonStart, exonEnd);
 498         int end = Math.max(exonStart, exonEnd);
 499         int exonNumber = xint / 2 + 1;
 500         String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s",
 501                 exonNumber, proteinName, proteinId);
 502
 503         SequenceFeature sf = makeCdsFeature(cds, desc, begin, end, sourceDb,
 504                 vals);
 505
 506         sf.setEnaLocation(feature.getLocation());
 507         boolean forwardStrand = exonStart <= exonEnd;
 508         sf.setStrand(forwardStrand ? "+" : "-");
 509         sf.setPhase(String.valueOf(codonStart - 1));
 510         sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 511         sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
 512
 513         dna.addSequenceFeature(sf);
 514       }
 515     }
 516
 517     /*
 518      * add feature dbRefs to sequence, and mappings for Uniprot xrefs
 519      */
 520     boolean hasUniprotDbref = false;
 521     List<XrefType> xrefs = feature.getXref();
 522     if (xrefs != null)
 523     {
 524       boolean mappingUsed = false;
 525       for (XrefType xref : xrefs)
 526       {
 527         /*
 528          * ensure UniProtKB/Swiss-Prot converted to UNIPROT
 529          */
 530         String source = DBRefUtils.getCanonicalName(xref.getDb());
 531         String version = xref.getSecondaryId();
 532         if (version == null || "".equals(version))
 533         {
 534           version = "0";
 535         }
 536         DBRefEntry dbref = new DBRefEntry(source, version, xref.getId());
 537         DBRefEntry proteinDbRef = new DBRefEntry(source, version,
 538                 dbref.getAccessionId());
 539         if (source.equals(DBRefSource.UNIPROT))
 540         {
 541           String proteinSeqName = DBRefSource.UNIPROT + "|"
 542                   + dbref.getAccessionId();
 543           if (dnaToProteinMapping != null
 544                   && dnaToProteinMapping.getTo() != null)
 545           {
 546             if (mappingUsed)
 547             {
 548               /*
 549                * two or more Uniprot xrefs for the same CDS -
 550                * each needs a distinct Mapping (as to a different sequence)
 551                */
 552               dnaToProteinMapping = new Mapping(dnaToProteinMapping);
 553             }
 554             mappingUsed = true;
 555
 556             /*
 557              * try to locate the protein mapped to (possibly by a
 558              * previous CDS feature); if not found, construct it from
 559              * the EMBL translation
 560              */
 561             SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName);
 562             if (proteinSeq == null)
 563             {
 564               proteinSeq = new Sequence(proteinSeqName,
 565                       product.getSequenceAsString());
 566               matcher.add(proteinSeq);
 567               peptides.add(proteinSeq);
 568             }
 569             dnaToProteinMapping.setTo(proteinSeq);
 570             dnaToProteinMapping.setMappedFromId(proteinId);
 571             proteinSeq.addDBRef(proteinDbRef);
 572             dbref.setMap(dnaToProteinMapping);
 573           }
 574           hasUniprotDbref = true;
 575         }
 576         if (product != null)
 577         {
 578           /*
 579            * copy feature dbref to our protein product
 580            */
 581           DBRefEntry pref = proteinDbRef;
 582           pref.setMap(null); // reference is direct
 583           product.addDBRef(pref);
 584           // Add converse mapping reference
 585           if (dnaToProteinMapping != null)
 586           {
 587             Mapping pmap = new Mapping(dna,
 588                     dnaToProteinMapping.getMap().getInverse());
 589             pref = new DBRefEntry(sourceDb, sequenceVersion, accession);
 590             pref.setMap(pmap);
 591             if (dnaToProteinMapping.getTo() != null)
 592             {
 593               dnaToProteinMapping.getTo().addDBRef(pref);
 594             }
 595           }
 596         }
 597         dna.addDBRef(dbref);
 598       }
 599     }
 600
 601     /*
 602      * if we have a product (translation) but no explicit Uniprot dbref
 603      * (example: EMBL AAFI02000057 protein_id EAL65544.1)
 604      * then construct mappings to an assumed EMBLCDSPROTEIN accession
 605      */
 606     if (!hasUniprotDbref && product != null)
 607     {
 608       if (proteinToEmblProteinRef == null)
 609       {
 610         // assuming CDSPROTEIN sequence version = dna version (?!)
 611         proteinToEmblProteinRef = new DBRefEntry(DBRefSource.EMBLCDSProduct,
 612                 sequenceVersion, proteinId);
 613       }
 614       product.addDBRef(proteinToEmblProteinRef);
 615
 616       if (dnaToProteinMapping != null
 617               && dnaToProteinMapping.getTo() != null)
 618       {
 619         DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
 620                 DBRefSource.EMBLCDSProduct, sequenceVersion,
 621                 proteinId);
 622         dnaToEmblProteinRef.setMap(dnaToProteinMapping);
 623         dnaToProteinMapping.setMappedFromId(proteinId);
 624         dna.addDBRef(dnaToEmblProteinRef);
 625       }
 626     }
 627   }
 628
 629   @Override
 630   public boolean isDnaCoding()
 631   {
 632     return true;
 633   }
 634
 635   /**
 636    * Returns the CDS positions as a single array of [start, end, start, end...]
 637    * positions. If on the reverse strand, these will be in descending order.
 638    *
 639    * @param accession
 640    * @param feature
 641    * @return
 642    */
 643   protected int[] getCdsRanges(String accession, Feature feature)
 644   {
 645     String location = feature.getLocation();
 646     if (location == null)
 647     {
 648       return new int[] {};
 649     }
 650
 651     try
 652     {
 653       List<int[]> ranges = DnaUtils.parseLocation(location);
 654       return listToArray(ranges);
 655     } catch (ParseException e)
 656     {
 657       Cache.log.warn(
 658               String.format("Not parsing inexact CDS location %s in ENA %s",
 659                       location, accession));
 660       return new int[] {};
 661     }
 662   }
 663
 664   /**
 665    * Converts a list of [start, end] ranges to a single array of [start, end,
 666    * start, end ...]
 667    *
 668    * @param ranges
 669    * @return
 670    */
 671   int[] listToArray(List<int[]> ranges)
 672   {
 673     int[] result = new int[ranges.size() * 2];
 674     int i = 0;
 675     for (int[] range : ranges)
 676     {
 677       result[i++] = range[0];
 678       result[i++] = range[1];
 679     }
 680     return result;
 681   }
 682
 683   /**
 684    * Helper method to construct a SequenceFeature for one cds range
 685    *
 686    * @param type
 687    *          feature type ("CDS")
 688    * @param desc
 689    *          description
 690    * @param begin
 691    *          start position
 692    * @param end
 693    *          end position
 694    * @param group
 695    *          feature group
 696    * @param vals
 697    *          map of 'miscellaneous values' for feature
 698    * @return
 699    */
 700   protected SequenceFeature makeCdsFeature(String type, String desc,
 701           int begin, int end, String group, Map<String, String> vals)
 702   {
 703     SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group);
 704     if (!vals.isEmpty())
 705     {
 706       StringBuilder sb = new StringBuilder();
 707       boolean first = true;
 708       for (Entry<String, String> val : vals.entrySet())
 709       {
 710         if (!first)
 711         {
 712           sb.append(";");
 713         }
 714         sb.append(val.getKey()).append("=").append(val.getValue());
 715         first = false;
 716         sf.setValue(val.getKey(), val.getValue());
 717       }
 718       sf.setAttributes(sb.toString());
 719     }
 720     return sf;
 721   }
 722
 723   /**
 724    * Truncates (if necessary) the exon intervals to match 3 times the length of
 725    * the protein; also accepts 3 bases longer (for stop codon not included in
 726    * protein)
 727    *
 728    * @param proteinLength
 729    * @param exon
 730    *          an array of [start, end, start, end...] intervals
 731    * @return the same array (if unchanged) or a truncated copy
 732    */
 733   static int[] adjustForProteinLength(int proteinLength, int[] exon)
 734   {
 735     if (proteinLength <= 0 || exon == null)
 736     {
 737       return exon;
 738     }
 739     int expectedCdsLength = proteinLength * 3;
 740     int exonLength = MappingUtils.getLength(Arrays.asList(exon));
 741
 742     /*
 743      * if exon length matches protein, or is shorter, or longer by the
 744      * length of a stop codon (3 bases), then leave it unchanged
 745      */
 746     if (expectedCdsLength >= exonLength
 747             || expectedCdsLength == exonLength - 3)
 748     {
 749       return exon;
 750     }
 751
 752     int origxon[];
 753     int sxpos = -1;
 754     int endxon = 0;
 755     origxon = new int[exon.length];
 756     System.arraycopy(exon, 0, origxon, 0, exon.length);
 757     int cdspos = 0;
 758     for (int x = 0; x < exon.length; x += 2)
 759     {
 760       cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
 761       if (expectedCdsLength <= cdspos)
 762       {
 763         // advanced beyond last codon.
 764         sxpos = x;
 765         if (expectedCdsLength != cdspos)
 766         {
 767           // System.err
 768           // .println("Truncating final exon interval on region by "
 769           // + (cdspos - cdslength));
 770         }
 771
 772         /*
 773          * shrink the final exon - reduce end position if forward
 774          * strand, increase it if reverse
 775          */
 776         if (exon[x + 1] >= exon[x])
 777         {
 778           endxon = exon[x + 1] - cdspos + expectedCdsLength;
 779         }
 780         else
 781         {
 782           endxon = exon[x + 1] + cdspos - expectedCdsLength;
 783         }
 784         break;
 785       }
 786     }
 787
 788     if (sxpos != -1)
 789     {
 790       // and trim the exon interval set if necessary
 791       int[] nxon = new int[sxpos + 2];
 792       System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 793       nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 794                                 // set
 795       exon = nxon;
 796     }
 797     return exon;
 798   }
 799
 800 }