src/jalview/io/EmblFlatFile.java

   1 package jalview.io;
   2
   3 import java.io.IOException;
   4 import java.text.ParseException;
   5 import java.util.ArrayList;
   6 import java.util.Arrays;
   7 import java.util.HashMap;
   8 import java.util.Hashtable;
   9 import java.util.List;
  10 import java.util.Map;
  11 import java.util.Map.Entry;
  12 import java.util.TreeMap;
  13
  14 import jalview.bin.Cache;
  15 import jalview.datamodel.DBRefEntry;
  16 import jalview.datamodel.DBRefSource;
  17 import jalview.datamodel.FeatureProperties;
  18 import jalview.datamodel.Mapping;
  19 import jalview.datamodel.Sequence;
  20 import jalview.datamodel.SequenceFeature;
  21 import jalview.datamodel.SequenceI;
  22 import jalview.util.DBRefUtils;
  23 import jalview.util.DnaUtils;
  24 import jalview.util.MapList;
  25 import jalview.util.MappingUtils;
  26
  27 /**
  28  * A class that provides selective parsing of the EMBL flatfile format.
  29  * <p>
  30  * The initial implementation is limited to extracting fields used by Jalview
  31  * after fetching an EMBL or EMBLCDS entry:
  32  *
  33  * <pre>
  34  * accession, version, sequence, xref
  35  * and (for CDS feature) location, protein_id, product, codon_start, translation
  36  * </pre>
  37  *
  38  * For a complete parser, it may be best to adopt that provided in
  39  * https://github.com/enasequence/sequencetools/tree/master/src/main/java/uk/ac/ebi/embl/flatfile
  40  * (but note this has a dependency on the Apache Commons library)
  41  *
  42  * @author gmcarstairs
  43  * @see ftp://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/doc/usrman.txt
  44  * @see ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html
  45  */
  46 public class EmblFlatFile extends AlignFile // FileParse
  47 {
  48   private static final String QUOTE = "\"";
  49
  50   private static final String DOUBLED_QUOTE = QUOTE + QUOTE;
  51
  52   /**
  53    * when true, interpret the mol_type 'source' feature attribute
  54    * and generate an RNA sequence from the DNA record
  55    */
  56   private boolean produceRna=true;
  57   /**
  58    * A data bean class to hold values parsed from one CDS Feature (FT)
  59    */
  60   class CdsData
  61   {
  62     String translation; // from CDS feature /translation
  63
  64     String cdsLocation; // CDS /location raw value
  65
  66     int codonStart = 1; // from CDS /codon_start
  67
  68     String proteinName; // from CDS /product; used for protein description
  69
  70     String proteinId; // from CDS /protein_id
  71
  72     List<DBRefEntry> xrefs = new ArrayList<>(); // from CDS /db_xref qualifiers
  73
  74     Map<String, String> cdsProps = new Hashtable<>(); // CDS other qualifiers
  75   }
  76
  77   private static final String WHITESPACE = "\\s+";
  78
  79   private String sourceDb;
  80
  81   /*
  82    * values parsed from the EMBL flatfile record
  83    */
  84   private String accession; // from ID (first token)
  85
  86   private String version; // from ID (second token)
  87
  88   private String description; // from (first) DE line
  89
  90   private int length = 128; // from ID (7th token), with usable default
  91
  92   private List<DBRefEntry> dbrefs; // from DR
  93
  94   private boolean sequenceStringIsRNA=false;
  95   private String sequenceString; // from SQ lines
  96
  97   /*
  98    * parsed CDS data fields, keyed by protein_id
  99    */
 100   private Map<String, CdsData> cds;
 101
 102   /**
 103    * Constructor
 104    *
 105    * @param fp
 106    * @param sourceId
 107    * @throws IOException
 108    */
 109   public EmblFlatFile(FileParse fp, String sourceId) throws IOException
 110   {
 111     super(false, fp); // don't parse immediately
 112     this.sourceDb = sourceId;
 113     dbrefs = new ArrayList<>();
 114
 115     /*
 116      * using TreeMap gives CDS sequences in alphabetical, so readable, order
 117      */
 118     cds = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
 119   }
 120
 121   /**
 122    * Parses the flatfile, and if successful, saves as an annotated sequence
 123    * which may be retrieved by calling {@code getSequence()}
 124    *
 125    * @throws IOException
 126    */
 127   @Override
 128   public void parse() throws IOException
 129   {
 130     String line = nextLine();
 131     while (line != null)
 132     {
 133       if (line.startsWith("ID"))
 134       {
 135         line = parseID(line);
 136       }
 137       else if (line.startsWith("DE"))
 138       {
 139         line = parseDE(line);
 140       }
 141       else if (line.startsWith("DR"))
 142       {
 143         line = parseDR(line);
 144       }
 145       else if (line.startsWith("SQ"))
 146       {
 147         line = parseSQ();
 148       }
 149       else if (line.startsWith("FT"))
 150       {
 151         line = parseFT(line);
 152       }
 153       else
 154       {
 155         line = nextLine();
 156       }
 157     }
 158     buildSequence();
 159   }
 160
 161   /**
 162    * Extracts and saves the primary accession and version (SV value) from an ID
 163    * line, or null if not found. Returns the next line after the one processed.
 164    *
 165    * @param line
 166    * @throws IOException
 167    */
 168   String parseID(String line) throws IOException
 169   {
 170     String[] tokens = line.substring(2).split(";");
 171
 172     /*
 173      * first is primary accession
 174      */
 175     String token = tokens[0].trim();
 176     if (!token.isEmpty())
 177     {
 178       this.accession = token;
 179     }
 180
 181     /*
 182      * second token is 'SV versionNo'
 183      */
 184     if (tokens.length > 1)
 185     {
 186       token = tokens[1].trim();
 187       if (token.startsWith("SV"))
 188       {
 189         String[] bits = token.trim().split(WHITESPACE);
 190         this.version = bits[bits.length - 1];
 191       }
 192     }
 193
 194     /*
 195      * seventh token is 'length BP'
 196      */
 197     if (tokens.length > 6)
 198     {
 199       token = tokens[6].trim();
 200       String[] bits = token.trim().split(WHITESPACE);
 201       try
 202       {
 203         this.length = Integer.valueOf(bits[0]);
 204       } catch (NumberFormatException e)
 205       {
 206         Cache.log.error("bad length read in flatfile, line: " + line);
 207       }
 208     }
 209
 210     return nextLine();
 211   }
 212
 213   /**
 214    * Reads sequence description from the first DE line found. Any trailing
 215    * period is discarded. If there are multiple DE lines, only the first (short
 216    * description) is read, the rest are ignored.
 217    *
 218    * @param line
 219    * @return
 220    * @throws IOException
 221    */
 222   String parseDE(String line) throws IOException
 223   {
 224     String desc = line.substring(2).trim();
 225     if (desc.endsWith("."))
 226     {
 227       desc = desc.substring(0, desc.length() - 1);
 228     }
 229     this.description = desc;
 230
 231     /*
 232      * pass over any additional DE lines
 233      */
 234     while ((line = nextLine()) != null)
 235     {
 236       if (!line.startsWith("DE"))
 237       {
 238         break;
 239       }
 240     }
 241
 242     return line;
 243   }
 244
 245   /**
 246    * Processes one DR line and saves as a DBRefEntry cross-reference. Returns
 247    * the line following the line processed.
 248    *
 249    * @param line
 250    * @throws IOException
 251    */
 252   String parseDR(String line) throws IOException
 253   {
 254     String[] tokens = line.substring(2).split(";");
 255     if (tokens.length > 1)
 256     {
 257       /*
 258        * ensure UniProtKB/Swiss-Prot converted to UNIPROT
 259        */
 260       String db = tokens[0].trim();
 261       db = DBRefUtils.getCanonicalName(db);
 262       String acc = tokens[1].trim();
 263       if (acc.endsWith("."))
 264       {
 265         acc = acc.substring(0, acc.length() - 1);
 266       }
 267       String version = "0";
 268       if (tokens.length > 2)
 269       {
 270         String secondaryId = tokens[2].trim();
 271         if (!secondaryId.isEmpty())
 272         {
 273           // todo: is this right? secondary id is not a version number
 274           // version = secondaryId;
 275         }
 276       }
 277       this.dbrefs.add(new DBRefEntry(db, version, acc));
 278     }
 279
 280     return nextLine();
 281   }
 282
 283   /**
 284    * Reads and saves the sequence, read from the lines following the SQ line.
 285    * Whitespace and position counters are discarded. Returns the next line
 286    * following the sequence data (the next line that doesn't start with
 287    * whitespace).
 288    *
 289    * @throws IOException
 290    */
 291   String parseSQ() throws IOException
 292   {
 293     StringBuilder sb = new StringBuilder(this.length);
 294     String line = nextLine();
 295     while (line != null && line.startsWith(" "))
 296     {
 297       line = line.trim();
 298       String[] blocks = line.split(WHITESPACE);
 299
 300       /*
 301        * omit the last block (position counter) on each line
 302        */
 303       for (int i = 0; i < blocks.length - 1; i++)
 304       {
 305         sb.append(blocks[i]);
 306       }
 307       line = nextLine();
 308     }
 309     this.sequenceString = sb.toString();
 310
 311     return line;
 312   }
 313
 314   /**
 315    * Processes an FT line. If it declares a feature type of interest (currently,
 316    * only CDS is processed), processes all of the associated lines (feature
 317    * qualifiers), and returns the next line after that, otherwise simply returns
 318    * the next line.
 319    *
 320    * @param line
 321    * @return
 322    * @throws IOException
 323    */
 324   String parseFT(String line) throws IOException
 325   {
 326     String[] tokens = line.split(WHITESPACE);
 327     if (tokens.length < 3 || (!"CDS".equals(tokens[1]) && !"source".equals(tokens[1])))
 328     {
 329       return nextLine();
 330     }
 331
 332     if (tokens[1].equals("source"))
 333     {
 334       return parseSourceQualifiers(tokens);
 335     }
 336     /*
 337      * parse location - which may be over more than one line e.g. EAW51554
 338      */
 339     CdsData data = new CdsData();
 340     StringBuilder sb = new StringBuilder().append(tokens[2]);
 341     line = parseFeatureQualifier(sb, "CDS");
 342     data.cdsLocation = sb.toString();
 343
 344     while (line != null)
 345     {
 346       if (!line.startsWith("FT    ")) // 4 spaces
 347       {
 348         // e.g. start of next feature "FT source..."
 349         break;
 350       }
 351
 352       /*
 353        * extract qualifier, e.g. FT    /protein_id="CAA37824.1"
 354        * - the value may extend over more than one line
 355        * - if the value has enclosing quotes, these are removed
 356        * - escaped double quotes ("") are reduced to a single character
 357        */
 358       int slashPos = line.indexOf('/');
 359       if (slashPos == -1)
 360       {
 361         Cache.log.error("Unexpected EMBL line ignored: " + line);
 362         line = nextLine();
 363         continue;
 364       }
 365       int eqPos = line.indexOf('=', slashPos + 1);
 366       if (eqPos == -1)
 367       {
 368         // can happen, e.g. /ribosomal_slippage
 369         // Cache.log.error("Unexpected EMBL line ignored: " + line);
 370         line = nextLine();
 371         continue;
 372       }
 373       String qualifier = line.substring(slashPos + 1, eqPos);
 374       String value = line.substring(eqPos + 1);
 375       value = removeQuotes(value);
 376       sb = new StringBuilder().append(value);
 377       line = parseFeatureQualifier(sb, qualifier);
 378       String featureValue = sb.toString();
 379
 380       if ("protein_id".equals(qualifier))
 381       {
 382         data.proteinId = featureValue;
 383       }
 384       else if ("codon_start".equals(qualifier))
 385       {
 386         try
 387         {
 388           data.codonStart = Integer.parseInt(featureValue.trim());
 389         } catch (NumberFormatException e)
 390         {
 391           Cache.log.error("Invalid codon_start in XML for " + this.accession
 392                   + ": " + e.getMessage());
 393         }
 394       }
 395       else if ("db_xref".equals(qualifier))
 396       {
 397         String[] parts = featureValue.split(":");
 398         if (parts.length == 2)
 399         {
 400           String db = parts[0].trim();
 401           db = DBRefUtils.getCanonicalName(db);
 402           DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim());
 403           data.xrefs.add(dbref);
 404         }
 405       }
 406       else if ("product".equals(qualifier))
 407       {
 408         data.proteinName = featureValue;
 409       }
 410       else if ("translation".equals(qualifier))
 411       {
 412         data.translation = featureValue;
 413       }
 414       else if (!"".equals(featureValue))
 415       {
 416         // throw anything else into the additional properties hash
 417         data.cdsProps.put(qualifier, featureValue);
 418       }
 419     }
 420
 421     if (data.proteinId != null)
 422     {
 423       this.cds.put(data.proteinId, data);
 424     }
 425     else
 426     {
 427       Cache.log.error("Ignoring CDS feature with no protein_id for "
 428               + sourceDb + ":" + accession);
 429     }
 430
 431     return line;
 432   }
 433
 434   /**
 435    * process attributes for 'source' until the next FT feature entry
 436    * only interested in 'mol_type'
 437    * @param tokens
 438    * @return
 439    * @throws IOException
 440    */
 441   private String parseSourceQualifiers(String[] tokens) throws IOException
 442   {
 443     if (!"source".equals(tokens[1]))
 444     {
 445       throw (new RuntimeException("Not given a source qualifier"));
 446     }
 447     // search for mol_type attribute
 448
 449     StringBuilder sb = new StringBuilder().append(tokens[2]); // extent of
 450                                                               // sequence
 451
 452     String line = parseFeatureQualifier(sb, "source");
 453     while (line != null)
 454     {
 455       if (!line.startsWith("FT    ")) // four spaces, end of this feature table
 456                                       // entry
 457       {
 458         return line;
 459       }
 460
 461       int p = line.indexOf("\\mol_type");
 462       int qs = line.indexOf("\"", p);
 463       int qe = line.indexOf("\"", qs + 1);
 464       String qualifier=line.substring(qs,qe).toLowerCase();
 465       if (qualifier.indexOf("rna") > -1)
 466       {
 467         sequenceStringIsRNA = true;
 468       }
 469       if (qualifier.indexOf("dna") > -1)
 470       {
 471         sequenceStringIsRNA = false;
 472       }
 473       line=parseFeatureQualifier(sb, "source");
 474     }
 475     return line;
 476   }
 477
 478   /**
 479    * Removes leading or trailing double quotes (") unless doubled, and changes
 480    * any 'escaped' (doubled) double quotes to single characters. As per the
 481    * Feature Table specification for Qualifiers, Free Text.
 482    *
 483    * @param value
 484    * @return
 485    */
 486   static String removeQuotes(String value)
 487   {
 488     if (value == null)
 489     {
 490       return null;
 491     }
 492     if (value.startsWith(QUOTE) && !value.startsWith(DOUBLED_QUOTE))
 493     {
 494       value = value.substring(1);
 495     }
 496     if (value.endsWith(QUOTE) && !value.endsWith(DOUBLED_QUOTE))
 497     {
 498       value = value.substring(0, value.length() - 1);
 499     }
 500     value = value.replace(DOUBLED_QUOTE, QUOTE);
 501     return value;
 502   }
 503
 504   /**
 505    * Reads the value of a feature (FT) qualifier from one or more lines of the
 506    * file, and returns the next line after that. Values are appended to the
 507    * string buffer, which should be already primed with the value read from the
 508    * first line for the qualifier (with any leading double quote removed).
 509    * Enclosing double quotes are removed, and escaped (repeated) double quotes
 510    * reduced to one only. For example for
 511    *
 512    * <pre>
 513    * FT      /note="gene_id=hCG28070.3
 514    * FT      ""foobar"" isoform=CRA_b"
 515    * the returned value is
 516    * gene_id=hCG28070.3 "foobar" isoform=CRA_b
 517    * </pre>
 518    *
 519    * Note the side-effect of this method, to advance data reading to the next
 520    * line after the feature qualifier.
 521    *
 522    * @param sb
 523    *          a string buffer primed with the first line of the value
 524    * @param qualifierName
 525    * @return
 526    * @throws IOException
 527    */
 528   String parseFeatureQualifier(StringBuilder sb, String qualifierName)
 529           throws IOException
 530   {
 531     String line;
 532     while ((line = nextLine()) != null)
 533     {
 534       if (!line.startsWith("FT    "))
 535       {
 536         break; // reached next feature or other input line
 537       }
 538       String[] tokens = line.split(WHITESPACE);
 539       if (tokens.length < 2)
 540       {
 541         Cache.log.error("Ignoring bad EMBL line for " + this.accession
 542                 + ": " + line);
 543         break;
 544       }
 545       if (tokens[1].startsWith("/"))
 546       {
 547         break; // next feature qualifier
 548       }
 549
 550       /*
 551        * heuristic rule: most multi-line value (e.g. /product) are text,
 552        * so add a space for word boundary at a new line; not for translation
 553        */
 554       if (!"translation".equals(qualifierName)
 555               && !"CDS".equals(qualifierName))
 556       {
 557         sb.append(" ");
 558       }
 559
 560       /*
 561        * remove trailing " and unescape doubled ""
 562        */
 563       String data = removeQuotes(tokens[1]);
 564       sb.append(data);
 565     }
 566
 567     return line;
 568   }
 569
 570   /**
 571    * Constructs and saves the sequence from parsed components
 572    */
 573   void buildSequence()
 574   {
 575     if (this.accession == null || this.sequenceString == null)
 576     {
 577       Cache.log.error("Failed to parse data from EMBL");
 578       return;
 579     }
 580
 581     String name = this.accession;
 582     if (this.sourceDb != null)
 583     {
 584       name = this.sourceDb + "|" + name;
 585     }
 586
 587     if (produceRna && sequenceStringIsRNA)
 588     {
 589       sequenceString = sequenceString.replace('T', 'U').replace('t', 'u');
 590     }
 591
 592     SequenceI seq = new Sequence(name, this.sequenceString);
 593     seq.setDescription(this.description);
 594
 595     /*
 596      * add a DBRef to itself
 597      */
 598     DBRefEntry selfRef = new DBRefEntry(sourceDb, version, accession);
 599     int[] startEnd = new int[] { 1, seq.getLength() };
 600     selfRef.setMap(new Mapping(null, startEnd, startEnd, 1, 1));
 601     seq.addDBRef(selfRef);
 602
 603     for (DBRefEntry dbref : this.dbrefs)
 604     {
 605       seq.addDBRef(dbref);
 606     }
 607
 608     processCDSFeatures(seq);
 609
 610     seq.deriveSequence();
 611
 612     addSequence(seq);
 613   }
 614
 615   /**
 616    * Process the CDS features, including generation of cross-references and
 617    * mappings to the protein products (translation)
 618    *
 619    * @param seq
 620    */
 621   protected void processCDSFeatures(SequenceI seq)
 622   {
 623     /*
 624      * record protein products found to avoid duplication i.e. >1 CDS with
 625      * the same /protein_id [though not sure I can find an example of this]
 626      */
 627     Map<String, SequenceI> proteins = new HashMap<>();
 628     for (CdsData data : cds.values())
 629     {
 630       processCDSFeature(seq, data, proteins);
 631     }
 632   }
 633
 634   /**
 635    * Processes data for one parsed CDS feature to
 636    * <ul>
 637    * <li>create a protein product sequence for the translation</li>
 638    * <li>create a cross-reference to protein with mapping from dna</li>
 639    * <li>add a CDS feature to the sequence for each CDS start-end range</li>
 640    * <li>add any CDS dbrefs to the sequence and to the protein product</li>
 641    * </ul>
 642    *
 643    * @param SequenceI
 644    *          dna
 645    * @param proteins
 646    *          map of protein products so far derived from CDS data
 647    */
 648   void processCDSFeature(SequenceI dna, CdsData data,
 649           Map<String, SequenceI> proteins)
 650   {
 651     /*
 652      * parse location into a list of [start, end, start, end] positions
 653      */
 654     int[] exons = getCdsRanges(this.accession, data.cdsLocation);
 655
 656     MapList maplist = buildMappingToProtein(dna, exons, data);
 657
 658     int exonNumber = 0;
 659
 660     for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
 661     {
 662       int exonStart = exons[xint];
 663       int exonEnd = exons[xint + 1];
 664       int begin = Math.min(exonStart, exonEnd);
 665       int end = Math.max(exonStart, exonEnd);
 666       exonNumber++;
 667       String desc = String.format("Exon %d for protein EMBLCDS:%s",
 668               exonNumber, data.proteinId);
 669
 670       SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end,
 671               this.sourceDb);
 672       for (Entry<String, String> val : data.cdsProps.entrySet())
 673       {
 674         sf.setValue(val.getKey(), val.getValue());
 675       }
 676
 677       sf.setEnaLocation(data.cdsLocation);
 678       boolean forwardStrand = exonStart <= exonEnd;
 679       sf.setStrand(forwardStrand ? "+" : "-");
 680       sf.setPhase(String.valueOf(data.codonStart - 1));
 681       sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 682       sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName);
 683
 684       dna.addSequenceFeature(sf);
 685     }
 686
 687     boolean hasUniprotDbref = false;
 688     for (DBRefEntry xref : data.xrefs)
 689     {
 690       dna.addDBRef(xref);
 691       if (xref.getSource().equals(DBRefSource.UNIPROT))
 692       {
 693         /*
 694          * construct (or find) the sequence for (data.protein_id, data.translation)
 695          */
 696         SequenceI protein = buildProteinProduct(dna, xref, data, proteins);
 697         Mapping map = new Mapping(protein, maplist);
 698         map.setMappedFromId(data.proteinId);
 699         xref.setMap(map);
 700
 701         /*
 702          * add DBRefs with mappings from dna to protein and the inverse
 703          */
 704         DBRefEntry db1 = new DBRefEntry(sourceDb, version, accession);
 705         db1.setMap(new Mapping(dna, maplist.getInverse()));
 706         protein.addDBRef(db1);
 707
 708         hasUniprotDbref = true;
 709       }
 710     }
 711
 712     /*
 713      * if we have a product (translation) but no explicit Uniprot dbref
 714      * (example: EMBL M19487 protein_id AAB02592.1)
 715      * then construct mappings to an assumed EMBLCDSPROTEIN accession
 716      */
 717     if (!hasUniprotDbref)
 718     {
 719       SequenceI protein = proteins.get(data.proteinId);
 720       if (protein == null)
 721       {
 722         protein = new Sequence(data.proteinId, data.translation);
 723         protein.setDescription(data.proteinName);
 724         proteins.put(data.proteinId, protein);
 725       }
 726       // assuming CDSPROTEIN sequence version = dna version (?!)
 727       DBRefEntry db1 = new DBRefEntry(DBRefSource.EMBLCDSProduct,
 728               this.version, data.proteinId);
 729       protein.addDBRef(db1);
 730
 731       DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
 732               DBRefSource.EMBLCDSProduct, this.version, data.proteinId);
 733       Mapping map = new Mapping(protein, maplist);
 734       map.setMappedFromId(data.proteinId);
 735       dnaToEmblProteinRef.setMap(map);
 736       dna.addDBRef(dnaToEmblProteinRef);
 737     }
 738
 739     /*
 740      * comment brought forward from EmblXmlSource, lines 447-451:
 741      * TODO: if retrieved from EMBLCDS, add a DBRef back to the parent EMBL
 742      * sequence with the exon  map; if given a dataset reference, search
 743      * dataset for parent EMBL sequence if it exists and set its map;
 744      * make a new feature annotating the coding contig
 745      */
 746   }
 747
 748   /**
 749    * Computes a mapping from CDS positions in DNA sequence to protein product
 750    * positions, with allowance for stop codon or incomplete start codon
 751    *
 752    * @param dna
 753    * @param exons
 754    * @param data
 755    * @return
 756    */
 757   MapList buildMappingToProtein(final SequenceI dna, final int[] exons,
 758           final CdsData data)
 759   {
 760     MapList dnaToProteinMapping = null;
 761     int peptideLength = data.translation.length();
 762
 763     int[] proteinRange = new int[] { 1, peptideLength };
 764     if (exons != null && exons.length > 0)
 765     {
 766       /*
 767        * We were able to parse 'location'; do a final
 768        * product length truncation check
 769        */
 770       int[] cdsRanges = adjustForProteinLength(peptideLength, exons);
 771       dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
 772     }
 773     else
 774     {
 775       /*
 776        * workaround until we handle all 'location' formats fully
 777        * e.g. X53828.1:60..1058 or <123..>289
 778        */
 779       Cache.log.error(String.format(
 780               "Implementation Notice: EMBLCDS location '%s'not properly supported yet"
 781                       + " - Making up the CDNA region of (%s:%s)... may be incorrect",
 782               data.cdsLocation, sourceDb, this.accession));
 783
 784       int completeCodonsLength = 1 - data.codonStart + dna.getLength();
 785       int mappedDnaEnd = dna.getEnd();
 786       if (peptideLength * 3 == completeCodonsLength)
 787       {
 788         // this might occur for CDS sequences where no features are marked
 789         Cache.log.warn("Assuming no stop codon at end of cDNA fragment");
 790         mappedDnaEnd = dna.getEnd();
 791       }
 792       else if ((peptideLength + 1) * 3 == completeCodonsLength)
 793       {
 794         Cache.log.warn("Assuming stop codon at end of cDNA fragment");
 795         mappedDnaEnd = dna.getEnd() - 3;
 796       }
 797
 798       if (mappedDnaEnd != -1)
 799       {
 800         int[] cdsRanges = new int[] {
 801             dna.getStart() + (data.codonStart - 1), mappedDnaEnd };
 802         dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
 803       }
 804     }
 805
 806     return dnaToProteinMapping;
 807   }
 808
 809   /**
 810    * Constructs a sequence for the protein product for the CDS data (if there is
 811    * one), and dbrefs with mappings from CDS to protein and the reverse
 812    *
 813    * @param dna
 814    * @param xref
 815    * @param data
 816    * @param proteins
 817    * @return
 818    */
 819   SequenceI buildProteinProduct(SequenceI dna, DBRefEntry xref,
 820           CdsData data, Map<String, SequenceI> proteins)
 821   {
 822     /*
 823      * check we have some data to work with
 824      */
 825     if (data.proteinId == null || data.translation == null)
 826     {
 827       return null;
 828     }
 829
 830     /*
 831      * Construct the protein sequence (if not already seen)
 832      */
 833     String proteinSeqName = xref.getSource() + "|" + xref.getAccessionId();
 834     SequenceI protein = proteins.get(proteinSeqName);
 835     if (protein == null)
 836     {
 837       protein = new Sequence(proteinSeqName, data.translation, 1,
 838               data.translation.length());
 839       protein.setDescription(data.proteinName != null ? data.proteinName
 840               : "Protein Product from " + sourceDb);
 841       proteins.put(proteinSeqName, protein);
 842     }
 843
 844     return protein;
 845   }
 846
 847   /**
 848    * Returns the CDS location as a single array of [start, end, start, end...]
 849    * positions. If on the reverse strand, these will be in descending order.
 850    *
 851    * @param accession
 852    * @param location
 853    * @return
 854    */
 855   protected int[] getCdsRanges(String accession, String location)
 856   {
 857     if (location == null)
 858     {
 859       return new int[] {};
 860     }
 861
 862     try
 863     {
 864       List<int[]> ranges = DnaUtils.parseLocation(location);
 865       return MappingUtils.listToArray(ranges);
 866     } catch (ParseException e)
 867     {
 868       Cache.log.warn(
 869               String.format("Not parsing inexact CDS location %s in ENA %s",
 870                       location, accession));
 871       return new int[] {};
 872     }
 873   }
 874
 875   /**
 876    * Output (print) is not implemented for EMBL flat file format
 877    */
 878   @Override
 879   public String print(SequenceI[] seqs, boolean jvsuffix)
 880   {
 881     return null;
 882   }
 883
 884   /**
 885    * Truncates (if necessary) the exon intervals to match 3 times the length of
 886    * the protein; also accepts 3 bases longer (for stop codon not included in
 887    * protein)
 888    *
 889    * @param proteinLength
 890    * @param exon
 891    *          an array of [start, end, start, end...] intervals
 892    * @return the same array (if unchanged) or a truncated copy
 893    */
 894   static int[] adjustForProteinLength(int proteinLength, int[] exon)
 895   {
 896     if (proteinLength <= 0 || exon == null)
 897     {
 898       return exon;
 899     }
 900     int expectedCdsLength = proteinLength * 3;
 901     int exonLength = MappingUtils.getLength(Arrays.asList(exon));
 902
 903     /*
 904      * if exon length matches protein, or is shorter, then leave it unchanged
 905      */
 906     if (expectedCdsLength >= exonLength)
 907     {
 908       return exon;
 909     }
 910
 911     int origxon[];
 912     int sxpos = -1;
 913     int endxon = 0;
 914     origxon = new int[exon.length];
 915     System.arraycopy(exon, 0, origxon, 0, exon.length);
 916     int cdspos = 0;
 917     for (int x = 0; x < exon.length; x += 2)
 918     {
 919       cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
 920       if (expectedCdsLength <= cdspos)
 921       {
 922         // advanced beyond last codon.
 923         sxpos = x;
 924         if (expectedCdsLength != cdspos)
 925         {
 926           // System.err
 927           // .println("Truncating final exon interval on region by "
 928           // + (cdspos - cdslength));
 929         }
 930
 931         /*
 932          * shrink the final exon - reduce end position if forward
 933          * strand, increase it if reverse
 934          */
 935         if (exon[x + 1] >= exon[x])
 936         {
 937           endxon = exon[x + 1] - cdspos + expectedCdsLength;
 938         }
 939         else
 940         {
 941           endxon = exon[x + 1] + cdspos - expectedCdsLength;
 942         }
 943         break;
 944       }
 945     }
 946
 947     if (sxpos != -1)
 948     {
 949       // and trim the exon interval set if necessary
 950       int[] nxon = new int[sxpos + 2];
 951       System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 952       nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 953                                 // set
 954       exon = nxon;
 955     }
 956     return exon;
 957   }
 958 }