src/jalview/ext/ensembl/EnsemblSeqProxy.java

   1 package jalview.ext.ensembl;
   2
   3 import jalview.analysis.AlignmentUtils;
   4 import jalview.datamodel.Alignment;
   5 import jalview.datamodel.AlignmentI;
   6 import jalview.datamodel.DBRefEntry;
   7 import jalview.datamodel.DBRefSource;
   8 import jalview.datamodel.Mapping;
   9 import jalview.datamodel.SequenceFeature;
  10 import jalview.datamodel.SequenceI;
  11 import jalview.exceptions.JalviewException;
  12 import jalview.io.FastaFile;
  13 import jalview.io.FileParse;
  14 import jalview.io.gff.SequenceOntologyFactory;
  15 import jalview.io.gff.SequenceOntologyI;
  16 import jalview.schemes.ResidueProperties;
  17 import jalview.util.DBRefUtils;
  18 import jalview.util.MapList;
  19 import jalview.util.MappingUtils;
  20 import jalview.util.StringUtils;
  21
  22 import java.io.IOException;
  23 import java.net.MalformedURLException;
  24 import java.net.URL;
  25 import java.util.ArrayList;
  26 import java.util.Arrays;
  27 import java.util.Collections;
  28 import java.util.Comparator;
  29 import java.util.LinkedHashMap;
  30 import java.util.List;
  31 import java.util.Map.Entry;
  32
  33 /**
  34  * Base class for Ensembl sequence fetchers
  35  *
  36  * @see http://rest.ensembl.org/documentation/info/sequence_id
  37  * @author gmcarstairs
  38  */
  39 public abstract class EnsemblSeqProxy extends EnsemblRestClient
  40 {
  41   private static final List<String> CROSS_REFERENCES = Arrays
  42           .asList(new String[] { "CCDS", "Uniprot/SWISSPROT" });
  43
  44   protected static final String CONSEQUENCE_TYPE = "consequence_type";
  45
  46   protected static final String PARENT = "Parent";
  47
  48   protected static final String ID = "ID";
  49
  50   protected static final String NAME = "Name";
  51
  /**
   * Enumeration of the values of the 'type' parameter to the /sequence REST
   * service. Each constant holds the (immutable) string to send as the
   * parameter value.
   */
  public enum EnsemblSeqType
  {
    /**
     * type=genomic to fetch full dna including introns
     */
    GENOMIC("genomic"),

    /**
     * type=cdna to fetch dna including UTRs
     */
    CDNA("cdna"),

    /**
     * type=cds to fetch coding dna excluding UTRs
     */
    CDS("cds"),

    /**
     * type=protein to fetch peptide product sequence
     */
    PROTEIN("protein");

    /*
     * the value of the 'type' parameter to fetch this version of
     * an Ensembl sequence; final, since enum constants are shared
     */
    private final String type;

    /**
     * Constructor
     * 
     * @param t
     *          the 'type' parameter value for this constant
     */
    EnsemblSeqType(String t)
    {
      this.type = t;
    }

    /**
     * Returns the 'type' parameter value for this constant
     * 
     * @return
     */
    public String getType()
    {
      return type;
    }

  }
  94
  95   /**
  96    * A comparator to sort ranges into ascending start position order
  97    */
  98   private class RangeSorter implements Comparator<int[]>
  99   {
 100     boolean forwards;
 101
 102     RangeSorter(boolean forward)
 103     {
 104       forwards = forward;
 105     }
 106
 107     @Override
 108     public int compare(int[] o1, int[] o2)
 109     {
 110       return (forwards ? 1 : -1) * Integer.compare(o1[0], o2[0]);
 111     }
 112
 113   }
 114
 115   /**
 116    * Default constructor (to use rest.ensembl.org)
 117    */
 118   public EnsemblSeqProxy()
 119   {
 120     super();
 121   }
 122
 123   /**
 124    * Constructor given the target domain to fetch data from
 125    */
 126   public EnsemblSeqProxy(String d)
 127   {
 128     super(d);
 129   }
 130
 131   /**
 132    * Makes the sequence queries to Ensembl's REST service and returns an
 133    * alignment consisting of the returned sequences.
 134    */
 135   @Override
 136   public AlignmentI getSequenceRecords(String query) throws Exception
 137   {
 138     // TODO use a String... query vararg instead?
 139
 140     // danger: accession separator used as a regex here, a string elsewhere
 141     // in this case it is ok (it is just a space), but (e.g.) '\' would not be
 142     List<String> allIds = Arrays.asList(query
 143             .split(getAccessionSeparator()));
 144     AlignmentI alignment = null;
 145     inProgress = true;
 146
 147     /*
 148      * execute queries, if necessary in batches of the
 149      * maximum allowed number of ids
 150      */
 151     int maxQueryCount = getMaximumQueryCount();
 152     for (int v = 0, vSize = allIds.size(); v < vSize; v += maxQueryCount)
 153     {
 154       int p = Math.min(vSize, v + maxQueryCount);
 155       List<String> ids = allIds.subList(v, p);
 156       try
 157       {
 158         alignment = fetchSequences(ids, alignment);
 159       } catch (Throwable r)
 160       {
 161         inProgress = false;
 162         String msg = "Aborting ID retrieval after " + v
 163                 + " chunks. Unexpected problem (" + r.getLocalizedMessage()
 164                 + ")";
 165         System.err.println(msg);
 166         break;
 167       }
 168     }
 169
 170     if (alignment == null)
 171     {
 172       return null;
 173     }
 174
 175     /*
 176      * fetch and transfer genomic sequence features,
 177      * fetch protein product and add as cross-reference
 178      */
 179     for (String accId : allIds)
 180     {
 181       addFeaturesAndProduct(accId, alignment);
 182     }
 183
 184     for (SequenceI seq : alignment.getSequences())
 185     {
 186       getCrossReferences(seq);
 187     }
 188
 189     return alignment;
 190   }
 191
 192   /**
 193    * Fetches Ensembl features using the /overlap REST endpoint, and adds them to
 194    * the sequence in the alignment. Also fetches the protein product, maps it
 195    * from the CDS features of the sequence, and saves it as a cross-reference of
 196    * the dna sequence.
 197    *
 198    * @param accId
 199    * @param alignment
 200    */
 201   protected void addFeaturesAndProduct(String accId, AlignmentI alignment)
 202   {
 203     if (alignment == null)
 204     {
 205       return;
 206     }
 207
 208     try
 209     {
 210       /*
 211        * get 'dummy' genomic sequence with exon, cds and variation features
 212        */
 213       SequenceI genomicSequence = null;
 214       EnsemblFeatures gffFetcher = new EnsemblFeatures(getDomain());
 215       EnsemblFeatureType[] features = getFeaturesToFetch();
 216       AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId,
 217               features);
 218       if (geneFeatures.getHeight() > 0)
 219       {
 220         genomicSequence = geneFeatures.getSequenceAt(0);
 221       }
 222       if (genomicSequence != null)
 223       {
 224         /*
 225          * transfer features to the query sequence
 226          */
 227         SequenceI querySeq = alignment.findName(accId);
 228         if (transferFeatures(accId, genomicSequence, querySeq))
 229         {
 230
 231           /*
 232            * fetch and map protein product, and add it as a cross-reference
 233            * of the retrieved sequence
 234            */
 235           addProteinProduct(querySeq);
 236         }
 237       }
 238     } catch (IOException e)
 239     {
 240       System.err.println("Error transferring Ensembl features: "
 241               + e.getMessage());
 242     }
 243   }
 244
 245   /**
 246    * Returns those sequence feature types to fetch from Ensembl. We may want
 247    * features either because they are of interest to the user, or as means to
 248    * identify the locations of the sequence on the genomic sequence (CDS
 249    * features identify CDS, exon features identify cDNA etc).
 250    *
 251    * @return
 252    */
 253   protected abstract EnsemblFeatureType[] getFeaturesToFetch();
 254
 255   /**
 256    * Fetches and maps the protein product, and adds it as a cross-reference of
 257    * the retrieved sequence
 258    */
 259   protected void addProteinProduct(SequenceI querySeq)
 260   {
 261     String accId = querySeq.getName();
 262     try
 263     {
 264       AlignmentI protein = new EnsemblProtein(getDomain())
 265               .getSequenceRecords(accId);
 266       if (protein == null || protein.getHeight() == 0)
 267       {
 268         System.out.println("Failed to retrieve protein for " + accId);
 269         return;
 270       }
 271       SequenceI proteinSeq = protein.getSequenceAt(0);
 272
 273       /*
 274        * need dataset sequences (to be the subject of mappings)
 275        */
 276       proteinSeq.createDatasetSequence();
 277       querySeq.createDatasetSequence();
 278
 279       MapList mapList = mapCdsToProtein(querySeq, proteinSeq);
 280       if (mapList != null)
 281       {
 282         // clunky: ensure Uniprot xref if we have one is on mapped sequence
 283         SequenceI ds = proteinSeq.getDatasetSequence();
 284         ds.setSourceDBRef(proteinSeq.getSourceDBRef());
 285         Mapping map = new Mapping(ds, mapList);
 286         DBRefEntry dbr = new DBRefEntry(getDbSource(), getDbVersion(),
 287                 accId, map);
 288         querySeq.getDatasetSequence().addDBRef(dbr);
 289
 290         /*
 291          * compute peptide variants from dna variants and add as
 292          * sequence features on the protein sequence ta-da
 293          */
 294         computeProteinFeatures(querySeq, proteinSeq, mapList);
 295       }
 296     } catch (Exception e)
 297     {
 298       System.err
 299               .println(String.format("Error retrieving protein for %s: %s",
 300                       accId, e.getMessage()));
 301     }
 302   }
 303
 304   /**
 305    * Get database xrefs from Ensembl, and attach them to the sequence
 306    *
 307    * @param seq
 308    */
 309   protected void getCrossReferences(SequenceI seq)
 310   {
 311     while (seq.getDatasetSequence() != null)
 312     {
 313       seq = seq.getDatasetSequence();
 314     }
 315
 316     EnsemblXref xrefFetcher = new EnsemblXref(getDomain());
 317     List<DBRefEntry> xrefs = xrefFetcher.getCrossReferences(seq.getName(),
 318             getCrossReferenceDatabases());
 319     for (DBRefEntry xref : xrefs)
 320     {
 321       seq.addDBRef(xref);
 322       /*
 323        * Save any Uniprot xref to be the reference for SIFTS mapping
 324        */
 325       if (DBRefSource.UNIPROT.equals(xref.getSource()))
 326       {
 327         seq.setSourceDBRef(xref);
 328       }
 329     }
 330   }
 331
 332   /**
 333    * Returns a list of database names to be used when fetching cross-references.
 334    *
 335    * @return
 336    */
 337   protected List<String> getCrossReferenceDatabases()
 338   {
 339     return CROSS_REFERENCES;
 340   }
 341
 342   /**
 343    * Returns a mapping from dna to protein by inspecting sequence features of
 344    * type "CDS" on the dna.
 345    *
 346    * @param dnaSeq
 347    * @param proteinSeq
 348    * @return
 349    */
 350   protected MapList mapCdsToProtein(SequenceI dnaSeq, SequenceI proteinSeq)
 351   {
 352     List<int[]> ranges = getCdsRanges(dnaSeq);
 353     int mappedDnaLength = MappingUtils.getLength(ranges);
 354
 355     int proteinLength = proteinSeq.getLength();
 356     int proteinEnd = proteinLength;
 357     int proteinStart = 1;
 358
 359     /*
 360      * incomplete start codon may mean X at start of peptide
 361      * we ignore both for mapping purposes
 362      */
 363     if (proteinSeq.getCharAt(0) == 'X')
 364     {
 365       proteinStart = 2;
 366       proteinLength--;
 367     }
 368     List<int[]> proteinRange = new ArrayList<int[]>();
 369
 370     /*
 371      * dna length should map to protein (or protein plus stop codon)
 372      */
 373     int codesForResidues = mappedDnaLength / 3;
 374     if (codesForResidues == (proteinLength + 1))
 375     {
 376       // assuming extra codon is for STOP and not in peptide
 377       codesForResidues--;
 378     }
 379     if (codesForResidues == proteinLength)
 380     {
 381       proteinRange.add(new int[] { proteinStart, proteinEnd });
 382       return new MapList(ranges, proteinRange, 3, 1);
 383     }
 384     return null;
 385   }
 386
 387   /**
 388    * Returns a list of CDS ranges found.
 389    *
 390    * No need to worry about reverse strand dna, here since the retrieved
 391    * sequence is as transcribed (reverse complement for reverse strand), i.e in
 392    * the same sense as the peptide.
 393    *
 394    * @param dnaSeq
 395    * @return
 396    */
 397   protected List<int[]> getCdsRanges(SequenceI dnaSeq)
 398   {
 399     List<int[]> result = new ArrayList<int[]>();
 400     SequenceFeature[] sfs = dnaSeq.getSequenceFeatures();
 401     if (sfs == null)
 402     {
 403       return result;
 404     }
 405     SequenceOntologyI so = SequenceOntologyFactory.getInstance();
 406     for (SequenceFeature sf : sfs)
 407     {
 408       /*
 409        * process a CDS feature (or a sub-type of CDS)
 410        */
 411       if (so.isA(sf.getType(), SequenceOntologyI.CDS))
 412       {
 413         int phase = 0;
 414         try {
 415           phase = Integer.parseInt(sf.getPhase());
 416         } catch (NumberFormatException e)
 417         {
 418           // ignore
 419         }
 420         /*
 421          * phase > 0 on first codon means 5' incomplete - skip to the start
 422          * of the next codon; example ENST00000496384
 423          */
 424         int begin = sf.getBegin();
 425         int end = sf.getEnd();
 426         if (result.isEmpty())
 427         {
 428           begin += phase;
 429           if (begin > end)
 430           {
 431             continue; // shouldn't happen?
 432           }
 433         }
 434         result.add(new int[] { begin, end });
 435       }
 436     }
 437     return result;
 438   }
 439
 440   /**
 441    * Fetches sequences for the list of accession ids and adds them to the
 442    * alignment. Returns the extended (or created) alignment.
 443    *
 444    * @param ids
 445    * @param alignment
 446    * @return
 447    * @throws JalviewException
 448    * @throws IOException
 449    */
 450   protected AlignmentI fetchSequences(List<String> ids, AlignmentI alignment)
 451           throws JalviewException, IOException
 452   {
 453     if (!isEnsemblAvailable())
 454     {
 455       inProgress = false;
 456       throw new JalviewException("ENSEMBL Rest API not available.");
 457     }
 458     FileParse fp = getSequenceReader(ids);
 459     FastaFile fr = new FastaFile(fp);
 460     if (fr.hasWarningMessage())
 461     {
 462       System.out.println(String.format(
 463               "Warning when retrieving %d ids %s\n%s", ids.size(),
 464               ids.toString(), fr.getWarningMessage()));
 465     }
 466     else if (fr.getSeqs().size() != ids.size())
 467     {
 468       System.out.println(String.format(
 469               "Only retrieved %d sequences for %d query strings", fr
 470                       .getSeqs().size(), ids.size()));
 471     }
 472
 473     if (fr.getSeqs().size() == 1 && fr.getSeqs().get(0).getLength() == 0)
 474     {
 475       /*
 476        * POST request has returned an empty FASTA file e.g. for invalid id
 477        */
 478       throw new IOException("No data returned for " + ids);
 479     }
 480
 481     if (fr.getSeqs().size() > 0)
 482     {
 483       AlignmentI seqal = new Alignment(
 484               fr.getSeqsAsArray());
 485       for (SequenceI sq:seqal.getSequences())
 486       {
 487         if (sq.getDescription() == null)
 488         {
 489           sq.setDescription(getDbName());
 490         }
 491         String name = sq.getName();
 492         if (ids.contains(name)
 493                 || ids.contains(name.replace("ENSP", "ENST")))
 494         {
 495           DBRefUtils.parseToDbRef(sq, DBRefSource.ENSEMBL, "0", name);
 496         }
 497       }
 498       if (alignment == null)
 499       {
 500         alignment = seqal;
 501       }
 502       else
 503       {
 504         alignment.append(seqal);
 505       }
 506     }
 507     return alignment;
 508   }
 509
 510   /**
 511    * Returns the URL for the REST call
 512    *
 513    * @return
 514    * @throws MalformedURLException
 515    */
 516   @Override
 517   protected URL getUrl(List<String> ids) throws MalformedURLException
 518   {
 519     /*
 520      * a single id is included in the URL path
 521      * multiple ids go in the POST body instead
 522      */
 523     StringBuffer urlstring = new StringBuffer(128);
 524     urlstring.append(getDomain() + "/sequence/id");
 525     if (ids.size() == 1)
 526     {
 527       urlstring.append("/").append(ids.get(0));
 528     }
 529     // @see https://github.com/Ensembl/ensembl-rest/wiki/Output-formats
 530     urlstring.append("?type=").append(getSourceEnsemblType().getType());
 531     urlstring.append(("&Accept=text/x-fasta"));
 532
 533     URL url = new URL(urlstring.toString());
 534     return url;
 535   }
 536
 537   /**
 538    * A sequence/id POST request currently allows up to 50 queries
 539    *
 540    * @see http://rest.ensembl.org/documentation/info/sequence_id_post
 541    */
 542   @Override
 543   public int getMaximumQueryCount()
 544   {
 545     return 50;
 546   }
 547
 548   @Override
 549   protected boolean useGetRequest()
 550   {
 551     return false;
 552   }
 553
 554   @Override
 555   protected String getRequestMimeType(boolean multipleIds)
 556   {
 557     return multipleIds ? "application/json" : "text/x-fasta";
 558   }
 559
 560   @Override
 561   protected String getResponseMimeType()
 562   {
 563     return "text/x-fasta";
 564   }
 565
 566   /**
 567    *
 568    * @return the configured sequence return type for this source
 569    */
 570   protected abstract EnsemblSeqType getSourceEnsemblType();
 571
 572   /**
 573    * Returns a list of [start, end] genomic ranges corresponding to the sequence
 574    * being retrieved.
 575    *
 576    * The correspondence between the frames of reference is made by locating
 577    * those features on the genomic sequence which identify the retrieved
 578    * sequence. Specifically
 579    * <ul>
 580    * <li>genomic sequence is identified by "transcript" features with
 581    * ID=transcript:transcriptId</li>
 582    * <li>cdna sequence is identified by "exon" features with
 583    * Parent=transcript:transcriptId</li>
 584    * <li>cds sequence is identified by "CDS" features with
 585    * Parent=transcript:transcriptId</li>
 586    * </ul>
 587    *
 588    * The returned ranges are sorted to run forwards (for positive strand) or
 589    * backwards (for negative strand). Aborts and returns null if both positive
 590    * and negative strand are found (this should not normally happen).
 591    *
 592    * @param sourceSequence
 593    * @param accId
 594    * @param start
 595    *          the start position of the sequence we are mapping to
 596    * @return
 597    */
 598   protected MapList getGenomicRangesFromFeatures(SequenceI sourceSequence,
 599           String accId, int start)
 600   {
 601     SequenceFeature[] sfs = sourceSequence.getSequenceFeatures();
 602     if (sfs == null)
 603     {
 604       return null;
 605     }
 606
 607     /*
 608      * generously initial size for number of cds regions
 609      * (worst case titin Q8WZ42 has c. 313 exons)
 610      */
 611     List<int[]> regions = new ArrayList<int[]>(100);
 612     int mappedLength = 0;
 613     int direction = 1; // forward
 614     boolean directionSet = false;
 615
 616     for (SequenceFeature sf : sfs)
 617     {
 618       /*
 619        * accept the target feature type or a specialisation of it
 620        * (e.g. coding_exon for exon)
 621        */
 622       if (identifiesSequence(sf, accId))
 623       {
 624         int strand = sf.getStrand();
 625         strand = strand == 0 ? 1 : strand; // treat unknown as forward
 626
 627         if (directionSet && strand != direction)
 628         {
 629           // abort - mix of forward and backward
 630           System.err.println("Error: forward and backward strand for "
 631                   + accId);
 632             return null;
 633           }
 634           direction = strand;
 635           directionSet = true;
 636
 637           /*
 638            * add to CDS ranges, semi-sorted forwards/backwards
 639            */
 640           if (strand < 0)
 641           {
 642             regions.add(0, new int[] { sf.getEnd(), sf.getBegin() });
 643           }
 644           else
 645           {
 646           regions.add(new int[] { sf.getBegin(), sf.getEnd() });
 647         }
 648         mappedLength += Math.abs(sf.getEnd() - sf.getBegin() + 1);
 649
 650         if (!isSpliceable())
 651         {
 652           /*
 653            * 'gene' sequence is contiguous so we can stop as soon as its
 654            * identifying feature has been found
 655            */
 656           break;
 657         }
 658       }
 659     }
 660
 661     if (regions.isEmpty())
 662     {
 663       System.out.println("Failed to identify target sequence for " + accId
 664               + " from genomic features");
 665       return null;
 666     }
 667
 668     /*
 669      * a final sort is needed since Ensembl returns CDS sorted within source
 670      * (havana / ensembl_havana)
 671      */
 672     Collections.sort(regions, new RangeSorter(direction == 1));
 673
 674     List<int[]> to = Arrays.asList(new int[] { start,
 675         start + mappedLength - 1 });
 676
 677     return new MapList(regions, to, 1, 1);
 678   }
 679
 680   /**
 681    * Answers true if the sequence being retrieved may occupy discontiguous
 682    * regions on the genomic sequence.
 683    */
 684   protected boolean isSpliceable()
 685   {
 686     return true;
 687   }
 688
 689   /**
 690    * Returns true if the sequence feature marks positions of the genomic
 691    * sequence feature which are within the sequence being retrieved. For
 692    * example, an 'exon' feature whose parent is the target transcript marks the
 693    * cdna positions of the transcript.
 694    *
 695    * @param sf
 696    * @param accId
 697    * @return
 698    */
 699   protected abstract boolean identifiesSequence(SequenceFeature sf,
 700           String accId);
 701
 702   /**
 703    * Transfers the sequence feature to the target sequence, locating its start
 704    * and end range based on the mapping. Features which do not overlap the
 705    * target sequence are ignored.
 706    *
 707    * @param sf
 708    * @param targetSequence
 709    * @param mapping
 710    *          mapping from the sequence feature's coordinates to the target
 711    *          sequence
 712    */
 713   protected void transferFeature(SequenceFeature sf,
 714           SequenceI targetSequence, MapList mapping)
 715   {
 716     int start = sf.getBegin();
 717     int end = sf.getEnd();
 718     int[] mappedRange = mapping.locateInTo(start, end);
 719
 720     if (mappedRange != null)
 721     {
 722       SequenceFeature copy = new SequenceFeature(sf);
 723       copy.setBegin(Math.min(mappedRange[0], mappedRange[1]));
 724       copy.setEnd(Math.max(mappedRange[0], mappedRange[1]));
 725       targetSequence.addSequenceFeature(copy);
 726
 727       /*
 728        * for sequence_variant, make an additional feature with consequence
 729        */
 730       // if (SequenceOntologyFactory.getInstance().isA(sf.getType(),
 731       // SequenceOntologyI.SEQUENCE_VARIANT))
 732       // {
 733       // String consequence = (String) sf.getValue(CONSEQUENCE_TYPE);
 734       // if (consequence != null)
 735       // {
 736       // SequenceFeature sf2 = new SequenceFeature("consequence",
 737       // consequence, copy.getBegin(), copy.getEnd(), 0f,
 738       // null);
 739       // targetSequence.addSequenceFeature(sf2);
 740       // }
 741       // }
 742     }
 743   }
 744
 745   /**
 746    * Transfers features from sourceSequence to targetSequence
 747    *
 748    * @param accessionId
 749    * @param sourceSequence
 750    * @param targetSequence
 751    * @return true if any features were transferred, else false
 752    */
 753   protected boolean transferFeatures(String accessionId,
 754           SequenceI sourceSequence, SequenceI targetSequence)
 755   {
 756     if (sourceSequence == null || targetSequence == null)
 757     {
 758       return false;
 759     }
 760
 761     // long start = System.currentTimeMillis();
 762     SequenceFeature[] sfs = sourceSequence.getSequenceFeatures();
 763     MapList mapping = getGenomicRangesFromFeatures(sourceSequence, accessionId,
 764             targetSequence.getStart());
 765     if (mapping == null)
 766     {
 767       return false;
 768     }
 769
 770     boolean result = transferFeatures(sfs, targetSequence, mapping,
 771             accessionId);
 772     // System.out.println("transferFeatures (" + (sfs.length) + " --> "
 773     // + targetSequence.getSequenceFeatures().length + ") to "
 774     // + targetSequence.getName()
 775     // + " took " + (System.currentTimeMillis() - start) + "ms");
 776     return result;
 777   }
 778
 779   /**
 780    * Transfer features to the target sequence. The start/end positions are
 781    * converted using the mapping. Features which do not overlap are ignored.
 782    * Features whose parent is not the specified identifier are also ignored.
 783    *
 784    * @param features
 785    * @param targetSequence
 786    * @param mapping
 787    * @param parentId
 788    * @return
 789    */
 790   protected boolean transferFeatures(SequenceFeature[] features,
 791           SequenceI targetSequence, MapList mapping, String parentId)
 792   {
 793     final boolean forwardStrand = mapping.isFromForwardStrand();
 794
 795     /*
 796      * sort features by start position (descending if reverse strand)
 797      * before transferring (in forwards order) to the target sequence
 798      */
 799     Arrays.sort(features, new Comparator<SequenceFeature>()
 800     {
 801       @Override
 802       public int compare(SequenceFeature o1, SequenceFeature o2)
 803       {
 804         int c = Integer.compare(o1.getBegin(), o2.getBegin());
 805         return forwardStrand ? c : -c;
 806       }
 807     });
 808
 809     boolean transferred = false;
 810     for (SequenceFeature sf : features)
 811     {
 812       if (retainFeature(sf, parentId))
 813       {
 814         transferFeature(sf, targetSequence, mapping);
 815         transferred = true;
 816       }
 817     }
 818     return transferred;
 819   }
 820
 821   /**
 822    * Answers true if the feature type is one we want to keep for the sequence.
 823    * Some features are only retrieved in order to identify the sequence range,
 824    * and may then be discarded as redundant information (e.g. "CDS" feature for
 825    * a CDS sequence).
 826    */
 827   @SuppressWarnings("unused")
 828   protected boolean retainFeature(SequenceFeature sf, String accessionId)
 829   {
 830     return true; // override as required
 831   }
 832
 833   /**
 834    * Answers true if the feature has a Parent which refers to the given
 835    * accession id, or if the feature has no parent. Answers false if the
 836    * feature's Parent is for a different accession id.
 837    *
 838    * @param sf
 839    * @param identifier
 840    * @return
 841    */
 842   protected boolean featureMayBelong(SequenceFeature sf, String identifier)
 843   {
 844     String parent = (String) sf.getValue(PARENT);
 845     // using contains to allow for prefix "gene:", "transcript:" etc
 846     if (parent != null && !parent.contains(identifier))
 847     {
 848       // this genomic feature belongs to a different transcript
 849       return false;
 850     }
 851     return true;
 852   }
 853
 854   @Override
 855   public String getDescription()
 856   {
 857     return "Ensembl " + getSourceEnsemblType().getType()
 858             + " sequence with variant features";
 859   }
 860
 861   /**
 862    * Returns a (possibly empty) list of features on the sequence which have the
 863    * specified sequence ontology type (or a sub-type of it), and the given
 864    * identifier as parent
 865    *
 866    * @param sequence
 867    * @param type
 868    * @param parentId
 869    * @return
 870    */
 871   protected List<SequenceFeature> findFeatures(SequenceI sequence,
 872           String type, String parentId)
 873   {
 874     List<SequenceFeature> result = new ArrayList<SequenceFeature>();
 875
 876     SequenceFeature[] sfs = sequence.getSequenceFeatures();
 877     if (sfs != null) {
 878       SequenceOntologyI so = SequenceOntologyFactory.getInstance();
 879       for (SequenceFeature sf :sfs) {
 880         if (so.isA(sf.getType(), type))
 881         {
 882           String parent = (String) sf.getValue(PARENT);
 883           if (parent.equals(parentId))
 884           {
 885             result.add(sf);
 886           }
 887         }
 888       }
 889     }
 890     return result;
 891   }
 892
 893   /**
 894    * Maps exon features from dna to protein, and computes variants in peptide
 895    * product generated by variants in dna, and adds them as sequence_variant
 896    * features on the protein sequence. Returns the number of variant features
 897    * added.
 898    *
 899    * @param dnaSeq
 900    * @param peptide
 901    * @param dnaToProtein
 902    */
 903   static int computeProteinFeatures(SequenceI dnaSeq,
 904           SequenceI peptide, MapList dnaToProtein)
 905   {
 906     while (dnaSeq.getDatasetSequence() != null)
 907     {
 908       dnaSeq = dnaSeq.getDatasetSequence();
 909     }
 910     while (peptide.getDatasetSequence() != null)
 911     {
 912       peptide = peptide.getDatasetSequence();
 913     }
 914
 915     AlignmentUtils.transferFeatures(dnaSeq, peptide, dnaToProtein,
 916             SequenceOntologyI.EXON);
 917
 918     LinkedHashMap<Integer, String[][]> variants = buildDnaVariantsMap(
 919             dnaSeq, dnaToProtein);
 920
 921     /*
 922      * scan codon variations, compute peptide variants and add to peptide sequence
 923      */
 924     int count = 0;
 925     for (Entry<Integer, String[][]> variant : variants.entrySet())
 926     {
 927       int peptidePos = variant.getKey();
 928       String[][] codonVariants = variant.getValue();
 929       String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); // 0-based
 930       List<String> peptideVariants = computePeptideVariants(codonVariants,
 931               residue);
 932       if (!peptideVariants.isEmpty())
 933       {
 934         String desc = StringUtils.listToDelimitedString(peptideVariants,
 935                 ", ");
 936         SequenceFeature sf = new SequenceFeature(
 937                 SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos,
 938                 peptidePos, 0f, null);
 939         peptide.addSequenceFeature(sf);
 940         count++;
 941       }
 942     }
 943
 944     /*
 945      * ugly sort to get sequence features in start position order
 946      * - would be better to store in Sequence as a TreeSet instead?
 947      */
 948     Arrays.sort(peptide.getSequenceFeatures(),
 949             new Comparator<SequenceFeature>()
 950             {
 951               @Override
 952               public int compare(SequenceFeature o1, SequenceFeature o2)
 953               {
 954                 int c = Integer.compare(o1.getBegin(), o2.getBegin());
 955                 return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd())
 956                         : c;
 957               }
 958             });
 959     return count;
 960   }
 961
 962   /**
 963    * Builds a map whose key is position in the protein sequence, and value is an
 964    * array of all variants for the coding codon positions
 965    *
 966    * @param dnaSeq
 967    * @param dnaToProtein
 968    * @return
 969    */
 970   static LinkedHashMap<Integer, String[][]> buildDnaVariantsMap(
 971           SequenceI dnaSeq, MapList dnaToProtein)
 972   {
 973     /*
 974      * map from peptide position to all variant features of the codon for it
 975      * LinkedHashMap ensures we add the peptide features in sequence order
 976      */
 977     LinkedHashMap<Integer, String[][]> variants = new LinkedHashMap<Integer, String[][]>();
 978     SequenceOntologyI so = SequenceOntologyFactory.getInstance();
 979
 980     SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures();
 981     if (dnaFeatures == null)
 982     {
 983       return variants;
 984     }
 985
 986     int dnaStart = dnaSeq.getStart();
 987     int[] lastCodon = null;
 988     int lastPeptidePostion = 0;
 989
 990     /*
 991      * build a map of codon variations for peptides
 992      */
 993     for (SequenceFeature sf : dnaFeatures)
 994     {
 995       int dnaCol = sf.getBegin();
 996       if (dnaCol != sf.getEnd())
 997       {
 998         // not handling multi-locus variant features
 999         continue;
1000       }
1001       if (so.isA(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT))
1002       {
1003         int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol);
1004         if (mapsTo == null)
1005         {
1006           // feature doesn't lie within coding region
1007           continue;
1008         }
1009         int peptidePosition = mapsTo[0];
1010         String[][] codonVariants = variants.get(peptidePosition);
1011         if (codonVariants == null)
1012         {
1013           codonVariants = new String[3][];
1014           variants.put(peptidePosition, codonVariants);
1015         }
1016
1017         /*
1018          * extract dna variants to a string array
1019          */
1020         String alls = (String) sf.getValue("alleles");
1021         if (alls == null)
1022         {
1023           continue;
1024         }
1025         String[] alleles = alls.split(",");
1026
1027         /*
1028          * get this peptides codon positions e.g. [3, 4, 5] or [4, 7, 10]
1029          */
1030         int[] codon = peptidePosition == lastPeptidePostion ? lastCodon
1031                 : MappingUtils.flattenRanges(dnaToProtein.locateInFrom(
1032                         peptidePosition, peptidePosition));
1033         lastPeptidePostion = peptidePosition;
1034         lastCodon = codon;
1035
1036         /*
1037          * save nucleotide (and this variant) for each codon position
1038          */
1039         for (int codonPos = 0; codonPos < 3; codonPos++)
1040         {
1041           String nucleotide = String.valueOf(dnaSeq
1042                   .getCharAt(codon[codonPos] - dnaStart));
1043           if (codon[codonPos] == dnaCol)
1044           {
1045             /*
1046              * record current dna base and its alleles
1047              */
1048             String[] dnaVariants = new String[alleles.length + 1];
1049             dnaVariants[0] = nucleotide;
1050             System.arraycopy(alleles, 0, dnaVariants, 1, alleles.length);
1051             codonVariants[codonPos] = dnaVariants;
1052           }
1053           else if (codonVariants[codonPos] == null)
1054           {
1055             /*
1056              * record current dna base only
1057              * (at least until we find any variation and overwrite it)
1058              */
1059             codonVariants[codonPos] = new String[] { nucleotide };
1060           }
1061         }
1062       }
1063     }
1064     return variants;
1065   }
1066
1067   /**
1068    * Returns a sorted, non-redundant list of all peptide translations generated
1069    * by the given dna variants, excluding the current residue value
1070    *
1071    * @param codonVariants
1072    *          an array of base values (acgtACGT) for codon positions 1, 2, 3
1073    * @param residue
1074    *          the current residue translation
1075    * @return
1076    */
1077   static List<String> computePeptideVariants(
1078           String[][] codonVariants, String residue)
1079   {
1080     List<String> result = new ArrayList<String>();
1081     for (String base1 : codonVariants[0])
1082     {
1083       for (String base2 : codonVariants[1])
1084       {
1085         for (String base3 : codonVariants[2])
1086         {
1087           String codon = base1 + base2 + base3;
1088           // TODO: report frameshift/insertion/deletion
1089           // and multiple-base variants?!
1090           String peptide = codon.contains("-") ? "-" : ResidueProperties
1091                   .codonTranslate(codon);
1092           if (peptide != null && !result.contains(peptide)
1093                   && !peptide.equalsIgnoreCase(residue))
1094           {
1095             result.add(peptide);
1096           }
1097         }
1098       }
1099     }
1100
1101     /*
1102      * sort alphabetically with STOP at the end
1103      */
1104     Collections.sort(result, new Comparator<String>()
1105     {
1106
1107       @Override
1108       public int compare(String o1, String o2)
1109       {
1110         if ("STOP".equals(o1))
1111         {
1112           return 1;
1113         }
1114         else if ("STOP".equals(o2))
1115         {
1116           return -1;
1117         }
1118         else
1119         {
1120           return o1.compareTo(o2);
1121         }
1122       }
1123     });
1124     return result;
1125   }
1126
1127   /**
1128    * Answers true if the feature type is either 'NMD_transcript_variant' or
1129    * 'transcript' or one of its sub-types in the Sequence Ontology. This is
1130    * needed because NMD_transcript_variant behaves like 'transcript' in Ensembl
1131    * although strictly speaking it is not (it is a sub-type of
1132    * sequence_variant).
1133    *
1134    * @param featureType
1135    * @return
1136    */
1137   public static boolean isTranscript(String featureType)
1138   {
1139     return SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(featureType)
1140             || SequenceOntologyFactory.getInstance().isA(featureType,
1141                     SequenceOntologyI.TRANSCRIPT);
1142   }
1143 }