src/jalview/analysis/AlignmentUtils.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis;
  22
  23 import java.util.ArrayList;
  24 import java.util.Arrays;
  25 import java.util.Collection;
  26 import java.util.Collections;
  27 import java.util.HashMap;
  28 import java.util.HashSet;
  29 import java.util.Iterator;
  30 import java.util.LinkedHashMap;
  31 import java.util.List;
  32 import java.util.Locale;
  33 import java.util.Map;
  34 import java.util.Map.Entry;
  35 import java.util.NoSuchElementException;
  36 import java.util.Set;
  37 import java.util.SortedMap;
  38 import java.util.TreeMap;
  39
  40 import jalview.api.AlignCalcWorkerI;
  41 import jalview.bin.Console;
  42 import jalview.commands.RemoveGapColCommand;
  43 import jalview.datamodel.AlignedCodon;
  44 import jalview.datamodel.AlignedCodonFrame;
  45 import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping;
  46 import jalview.datamodel.Alignment;
  47 import jalview.datamodel.AlignmentAnnotation;
  48 import jalview.datamodel.AlignmentI;
  49 import jalview.datamodel.ContactMatrixI;
  50 import jalview.datamodel.DBRefEntry;
  51 import jalview.datamodel.GeneLociI;
  52 import jalview.datamodel.IncompleteCodonException;
  53 import jalview.datamodel.Mapping;
  54 import jalview.datamodel.Sequence;
  55 import jalview.datamodel.SequenceFeature;
  56 import jalview.datamodel.SequenceGroup;
  57 import jalview.datamodel.SequenceI;
  58 import jalview.datamodel.features.SequenceFeatures;
  59 import jalview.gui.AlignmentPanel;
  60 import jalview.io.gff.SequenceOntologyI;
  61 import jalview.schemes.ResidueProperties;
  62 import jalview.util.Comparison;
  63 import jalview.util.DBRefUtils;
  64 import jalview.util.IntRangeComparator;
  65 import jalview.util.MapList;
  66 import jalview.util.MappingUtils;
  67 import jalview.workers.SecondaryStructureConsensusThread;
  68
  69 /**
  70  * grab bag of useful alignment manipulation operations Expect these to be
  71  * refactored elsewhere at some point.
  72  *
  73  * @author jimp
  74  *
  75  */
  76 public class AlignmentUtils
  77 {
  78   private static final int CODON_LENGTH = 3;
  79
  80   private static final String SEQUENCE_VARIANT = "sequence_variant:";
  81
  82
  83   private static final Map<String, String> SECONDARY_STRUCTURE_LABELS = new HashMap<>();
  84
  85   static {
  86       SECONDARY_STRUCTURE_LABELS.put("Secondary Structure", "3D Structures");
  87       SECONDARY_STRUCTURE_LABELS.put("jnetpred", "JPred");
  88       // Add other secondary structure labels here if needed
  89   }
  90
  91   private static final String SS_ANNOTATION_LABEL = "Secondary Structure";
  92
  93   /*
  94    * the 'id' attribute is provided for variant features fetched from
  95    * Ensembl using its REST service with JSON format
  96    */
  97   public static final String VARIANT_ID = "id";
  98
  99   /**
 100    * A data model to hold the 'normal' base value at a position, and an optional
 101    * sequence variant feature
 102    */
 103   static final class DnaVariant
 104   {
 105     final String base;
 106
 107     SequenceFeature variant;
 108
 109     DnaVariant(String nuc)
 110     {
 111       base = nuc;
 112       variant = null;
 113     }
 114
 115     DnaVariant(String nuc, SequenceFeature var)
 116     {
 117       base = nuc;
 118       variant = var;
 119     }
 120
 121     public String getSource()
 122     {
 123       return variant == null ? null : variant.getFeatureGroup();
 124     }
 125
 126     /**
 127      * toString for aid in the debugger only
 128      */
 129     @Override
 130     public String toString()
 131     {
 132       return base + ":" + (variant == null ? "" : variant.getDescription());
 133     }
 134   }
 135
 136   /**
 137    * given an existing alignment, create a new alignment including all, or up to
 138    * flankSize additional symbols from each sequence's dataset sequence
 139    *
 140    * @param core
 141    * @param flankSize
 142    * @return AlignmentI
 143    */
 144   public static AlignmentI expandContext(AlignmentI core, int flankSize)
 145   {
 146     List<SequenceI> sq = new ArrayList<>();
 147     int maxoffset = 0;
 148     for (SequenceI s : core.getSequences())
 149     {
 150       SequenceI newSeq = s.deriveSequence();
 151       final int newSeqStart = newSeq.getStart() - 1;
 152       if (newSeqStart > maxoffset
 153               && newSeq.getDatasetSequence().getStart() < s.getStart())
 154       {
 155         maxoffset = newSeqStart;
 156       }
 157       sq.add(newSeq);
 158     }
 159     if (flankSize > -1)
 160     {
 161       maxoffset = Math.min(maxoffset, flankSize);
 162     }
 163
 164     /*
 165      * now add offset left and right to create an expanded alignment
 166      */
 167     for (SequenceI s : sq)
 168     {
 169       SequenceI ds = s;
 170       while (ds.getDatasetSequence() != null)
 171       {
 172         ds = ds.getDatasetSequence();
 173       }
 174       int s_end = s.findPosition(s.getStart() + s.getLength());
 175       // find available flanking residues for sequence
 176       int ustream_ds = s.getStart() - ds.getStart();
 177       int dstream_ds = ds.getEnd() - s_end;
 178
 179       // build new flanked sequence
 180
 181       // compute gap padding to start of flanking sequence
 182       int offset = maxoffset - ustream_ds;
 183
 184       // padding is gapChar x ( maxoffset - min(ustream_ds, flank)
 185       if (flankSize >= 0)
 186       {
 187         if (flankSize < ustream_ds)
 188         {
 189           // take up to flankSize residues
 190           offset = maxoffset - flankSize;
 191           ustream_ds = flankSize;
 192         }
 193         if (flankSize <= dstream_ds)
 194         {
 195           dstream_ds = flankSize - 1;
 196         }
 197       }
 198       // TODO use Character.toLowerCase to avoid creating String objects?
 199       char[] upstream = new String(ds
 200               .getSequence(s.getStart() - 1 - ustream_ds, s.getStart() - 1))
 201                       .toLowerCase(Locale.ROOT).toCharArray();
 202       char[] downstream = new String(
 203               ds.getSequence(s_end - 1, s_end + dstream_ds))
 204                       .toLowerCase(Locale.ROOT).toCharArray();
 205       char[] coreseq = s.getSequence();
 206       char[] nseq = new char[offset + upstream.length + downstream.length
 207               + coreseq.length];
 208       char c = core.getGapCharacter();
 209
 210       int p = 0;
 211       for (; p < offset; p++)
 212       {
 213         nseq[p] = c;
 214       }
 215
 216       System.arraycopy(upstream, 0, nseq, p, upstream.length);
 217       System.arraycopy(coreseq, 0, nseq, p + upstream.length,
 218               coreseq.length);
 219       System.arraycopy(downstream, 0, nseq,
 220               p + coreseq.length + upstream.length, downstream.length);
 221       s.setSequence(new String(nseq));
 222       s.setStart(s.getStart() - ustream_ds);
 223       s.setEnd(s_end + downstream.length);
 224     }
 225     AlignmentI newAl = new jalview.datamodel.Alignment(
 226             sq.toArray(new SequenceI[0]));
 227     for (SequenceI s : sq)
 228     {
 229       if (s.getAnnotation() != null)
 230       {
 231         for (AlignmentAnnotation aa : s.getAnnotation())
 232         {
 233           aa.adjustForAlignment(); // JAL-1712 fix
 234           newAl.addAnnotation(aa);
 235         }
 236       }
 237     }
 238     newAl.setDataset(core.getDataset());
 239     return newAl;
 240   }
 241
 242   /**
 243    * Returns the index (zero-based position) of a sequence in an alignment, or
 244    * -1 if not found.
 245    *
 246    * @param al
 247    * @param seq
 248    * @return
 249    */
 250   public static int getSequenceIndex(AlignmentI al, SequenceI seq)
 251   {
 252     int result = -1;
 253     int pos = 0;
 254     for (SequenceI alSeq : al.getSequences())
 255     {
 256       if (alSeq == seq)
 257       {
 258         result = pos;
 259         break;
 260       }
 261       pos++;
 262     }
 263     return result;
 264   }
 265
 266   /**
 267    * Returns a map of lists of sequences in the alignment, keyed by sequence
 268    * name. For use in mapping between different alignment views of the same
 269    * sequences.
 270    *
 271    * @see jalview.datamodel.AlignmentI#getSequencesByName()
 272    */
 273   public static Map<String, List<SequenceI>> getSequencesByName(
 274           AlignmentI al)
 275   {
 276     Map<String, List<SequenceI>> theMap = new LinkedHashMap<>();
 277     for (SequenceI seq : al.getSequences())
 278     {
 279       String name = seq.getName();
 280       if (name != null)
 281       {
 282         List<SequenceI> seqs = theMap.get(name);
 283         if (seqs == null)
 284         {
 285           seqs = new ArrayList<>();
 286           theMap.put(name, seqs);
 287         }
 288         seqs.add(seq);
 289       }
 290     }
 291     return theMap;
 292   }
 293
 294   /**
 295    * Build mapping of protein to cDNA alignment. Mappings are made between
 296    * sequences where the cDNA translates to the protein sequence. Any new
 297    * mappings are added to the protein alignment. Returns true if any mappings
 298    * either already exist or were added, else false.
 299    *
 300    * @param proteinAlignment
 301    * @param cdnaAlignment
 302    * @return
 303    */
 304   public static boolean mapProteinAlignmentToCdna(
 305           final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment)
 306   {
 307     if (proteinAlignment == null || cdnaAlignment == null)
 308     {
 309       return false;
 310     }
 311
 312     Set<SequenceI> mappedDna = new HashSet<>();
 313     Set<SequenceI> mappedProtein = new HashSet<>();
 314
 315     /*
 316      * First pass - map sequences where cross-references exist. This include
 317      * 1-to-many mappings to support, for example, variant cDNA.
 318      */
 319     boolean mappingPerformed = mapProteinToCdna(proteinAlignment,
 320             cdnaAlignment, mappedDna, mappedProtein, true);
 321
 322     /*
 323      * Second pass - map sequences where no cross-references exist. This only
 324      * does 1-to-1 mappings and assumes corresponding sequences are in the same
 325      * order in the alignments.
 326      */
 327     mappingPerformed |= mapProteinToCdna(proteinAlignment, cdnaAlignment,
 328             mappedDna, mappedProtein, false);
 329     return mappingPerformed;
 330   }
 331
 332   /**
 333    * Make mappings between compatible sequences (where the cDNA translation
 334    * matches the protein).
 335    *
 336    * @param proteinAlignment
 337    * @param cdnaAlignment
 338    * @param mappedDna
 339    *          a set of mapped DNA sequences (to add to)
 340    * @param mappedProtein
 341    *          a set of mapped Protein sequences (to add to)
 342    * @param xrefsOnly
 343    *          if true, only map sequences where xrefs exist
 344    * @return
 345    */
 346   protected static boolean mapProteinToCdna(
 347           final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment,
 348           Set<SequenceI> mappedDna, Set<SequenceI> mappedProtein,
 349           boolean xrefsOnly)
 350   {
 351     boolean mappingExistsOrAdded = false;
 352     List<SequenceI> thisSeqs = proteinAlignment.getSequences();
 353     for (SequenceI aaSeq : thisSeqs)
 354     {
 355       boolean proteinMapped = false;
 356       AlignedCodonFrame acf = new AlignedCodonFrame();
 357
 358       for (SequenceI cdnaSeq : cdnaAlignment.getSequences())
 359       {
 360         /*
 361          * Always try to map if sequences have xref to each other; this supports
 362          * variant cDNA or alternative splicing for a protein sequence.
 363          *
 364          * If no xrefs, try to map progressively, assuming that alignments have
 365          * mappable sequences in corresponding order. These are not
 366          * many-to-many, as that would risk mixing species with similar cDNA
 367          * sequences.
 368          */
 369         if (xrefsOnly && !AlignmentUtils.haveCrossRef(aaSeq, cdnaSeq))
 370         {
 371           continue;
 372         }
 373
 374         /*
 375          * Don't map non-xrefd sequences more than once each. This heuristic
 376          * allows us to pair up similar sequences in ordered alignments.
 377          */
 378         if (!xrefsOnly && (mappedProtein.contains(aaSeq)
 379                 || mappedDna.contains(cdnaSeq)))
 380         {
 381           continue;
 382         }
 383         if (mappingExists(proteinAlignment.getCodonFrames(),
 384                 aaSeq.getDatasetSequence(), cdnaSeq.getDatasetSequence()))
 385         {
 386           mappingExistsOrAdded = true;
 387         }
 388         else
 389         {
 390           MapList map = mapCdnaToProtein(aaSeq, cdnaSeq);
 391           if (map != null)
 392           {
 393             acf.addMap(cdnaSeq, aaSeq, map);
 394             mappingExistsOrAdded = true;
 395             proteinMapped = true;
 396             mappedDna.add(cdnaSeq);
 397             mappedProtein.add(aaSeq);
 398           }
 399         }
 400       }
 401       if (proteinMapped)
 402       {
 403         proteinAlignment.addCodonFrame(acf);
 404       }
 405     }
 406     return mappingExistsOrAdded;
 407   }
 408
 409   /**
 410    * Answers true if the mappings include one between the given (dataset)
 411    * sequences.
 412    */
 413   protected static boolean mappingExists(List<AlignedCodonFrame> mappings,
 414           SequenceI aaSeq, SequenceI cdnaSeq)
 415   {
 416     if (mappings != null)
 417     {
 418       for (AlignedCodonFrame acf : mappings)
 419       {
 420         if (cdnaSeq == acf.getDnaForAaSeq(aaSeq))
 421         {
 422           return true;
 423         }
 424       }
 425     }
 426     return false;
 427   }
 428
 429   /**
 430    * Builds a mapping (if possible) of a cDNA to a protein sequence.
 431    * <ul>
 432    * <li>first checks if the cdna translates exactly to the protein
 433    * sequence</li>
 434    * <li>else checks for translation after removing a STOP codon</li>
 435    * <li>else checks for translation after removing a START codon</li>
 436    * <li>if that fails, inspect CDS features on the cDNA sequence</li>
 437    * </ul>
 438    * Returns null if no mapping is determined.
 439    *
 440    * @param proteinSeq
 441    *          the aligned protein sequence
 442    * @param cdnaSeq
 443    *          the aligned cdna sequence
 444    * @return
 445    */
 446   public static MapList mapCdnaToProtein(SequenceI proteinSeq,
 447           SequenceI cdnaSeq)
 448   {
 449     /*
 450      * Here we handle either dataset sequence set (desktop) or absent (applet).
 451      * Use only the char[] form of the sequence to avoid creating possibly large
 452      * String objects.
 453      */
 454     final SequenceI proteinDataset = proteinSeq.getDatasetSequence();
 455     char[] aaSeqChars = proteinDataset != null
 456             ? proteinDataset.getSequence()
 457             : proteinSeq.getSequence();
 458     final SequenceI cdnaDataset = cdnaSeq.getDatasetSequence();
 459     char[] cdnaSeqChars = cdnaDataset != null ? cdnaDataset.getSequence()
 460             : cdnaSeq.getSequence();
 461     if (aaSeqChars == null || cdnaSeqChars == null)
 462     {
 463       return null;
 464     }
 465
 466     /*
 467      * cdnaStart/End, proteinStartEnd are base 1 (for dataset sequence mapping)
 468      */
 469     final int mappedLength = CODON_LENGTH * aaSeqChars.length;
 470     int cdnaLength = cdnaSeqChars.length;
 471     int cdnaStart = cdnaSeq.getStart();
 472     int cdnaEnd = cdnaSeq.getEnd();
 473     final int proteinStart = proteinSeq.getStart();
 474     final int proteinEnd = proteinSeq.getEnd();
 475
 476     /*
 477      * If lengths don't match, try ignoring stop codon (if present)
 478      */
 479     if (cdnaLength != mappedLength && cdnaLength > 2)
 480     {
 481       String lastCodon = String.valueOf(cdnaSeqChars,
 482               cdnaLength - CODON_LENGTH, CODON_LENGTH)
 483               .toUpperCase(Locale.ROOT);
 484       for (String stop : ResidueProperties.STOP_CODONS)
 485       {
 486         if (lastCodon.equals(stop))
 487         {
 488           cdnaEnd -= CODON_LENGTH;
 489           cdnaLength -= CODON_LENGTH;
 490           break;
 491         }
 492       }
 493     }
 494
 495     /*
 496      * If lengths still don't match, try ignoring start codon.
 497      */
 498     int startOffset = 0;
 499     if (cdnaLength != mappedLength && cdnaLength > 2
 500             && String.valueOf(cdnaSeqChars, 0, CODON_LENGTH)
 501                     .toUpperCase(Locale.ROOT)
 502                     .equals(ResidueProperties.START))
 503     {
 504       startOffset += CODON_LENGTH;
 505       cdnaStart += CODON_LENGTH;
 506       cdnaLength -= CODON_LENGTH;
 507     }
 508
 509     if (translatesAs(cdnaSeqChars, startOffset, aaSeqChars))
 510     {
 511       /*
 512        * protein is translation of dna (+/- start/stop codons)
 513        */
 514       MapList map = new MapList(new int[] { cdnaStart, cdnaEnd },
 515               new int[]
 516               { proteinStart, proteinEnd }, CODON_LENGTH, 1);
 517       return map;
 518     }
 519
 520     /*
 521      * translation failed - try mapping CDS annotated regions of dna
 522      */
 523     return mapCdsToProtein(cdnaSeq, proteinSeq);
 524   }
 525
 526   /**
 527    * Test whether the given cdna sequence, starting at the given offset,
 528    * translates to the given amino acid sequence, using the standard translation
 529    * table. Designed to fail fast i.e. as soon as a mismatch position is found.
 530    *
 531    * @param cdnaSeqChars
 532    * @param cdnaStart
 533    * @param aaSeqChars
 534    * @return
 535    */
 536   protected static boolean translatesAs(char[] cdnaSeqChars, int cdnaStart,
 537           char[] aaSeqChars)
 538   {
 539     if (cdnaSeqChars == null || aaSeqChars == null)
 540     {
 541       return false;
 542     }
 543
 544     int aaPos = 0;
 545     int dnaPos = cdnaStart;
 546     for (; dnaPos < cdnaSeqChars.length - 2
 547             && aaPos < aaSeqChars.length; dnaPos += CODON_LENGTH, aaPos++)
 548     {
 549       String codon = String.valueOf(cdnaSeqChars, dnaPos, CODON_LENGTH);
 550       final String translated = ResidueProperties.codonTranslate(codon);
 551
 552       /*
 553        * allow * in protein to match untranslatable in dna
 554        */
 555       final char aaRes = aaSeqChars[aaPos];
 556       if ((translated == null || ResidueProperties.STOP.equals(translated))
 557               && aaRes == '*')
 558       {
 559         continue;
 560       }
 561       if (translated == null || !(aaRes == translated.charAt(0)))
 562       {
 563         // debug
 564         // jalview.bin.Console.outPrintln(("Mismatch at " + i + "/" + aaResidue
 565         // + ": "
 566         // + codon + "(" + translated + ") != " + aaRes));
 567         return false;
 568       }
 569     }
 570
 571     /*
 572      * check we matched all of the protein sequence
 573      */
 574     if (aaPos != aaSeqChars.length)
 575     {
 576       return false;
 577     }
 578
 579     /*
 580      * check we matched all of the dna except
 581      * for optional trailing STOP codon
 582      */
 583     if (dnaPos == cdnaSeqChars.length)
 584     {
 585       return true;
 586     }
 587     if (dnaPos == cdnaSeqChars.length - CODON_LENGTH)
 588     {
 589       String codon = String.valueOf(cdnaSeqChars, dnaPos, CODON_LENGTH);
 590       if (ResidueProperties.STOP
 591               .equals(ResidueProperties.codonTranslate(codon)))
 592       {
 593         return true;
 594       }
 595     }
 596     return false;
 597   }
 598
 599   /**
 600    * Align sequence 'seq' to match the alignment of a mapped sequence. Note this
 601    * currently assumes that we are aligning cDNA to match protein.
 602    *
 603    * @param seq
 604    *          the sequence to be realigned
 605    * @param al
 606    *          the alignment whose sequence alignment is to be 'copied'
 607    * @param gap
 608    *          character string represent a gap in the realigned sequence
 609    * @param preserveUnmappedGaps
 610    * @param preserveMappedGaps
 611    * @return true if the sequence was realigned, false if it could not be
 612    */
 613   public static boolean alignSequenceAs(SequenceI seq, AlignmentI al,
 614           String gap, boolean preserveMappedGaps,
 615           boolean preserveUnmappedGaps)
 616   {
 617     /*
 618      * Get any mappings from the source alignment to the target (dataset)
 619      * sequence.
 620      */
 621     // TODO there may be one AlignedCodonFrame per dataset sequence, or one with
 622     // all mappings. Would it help to constrain this?
 623     List<AlignedCodonFrame> mappings = al.getCodonFrame(seq);
 624     if (mappings == null || mappings.isEmpty())
 625     {
 626       return false;
 627     }
 628
 629     /*
 630      * Locate the aligned source sequence whose dataset sequence is mapped. We
 631      * just take the first match here (as we can't align like more than one
 632      * sequence).
 633      */
 634     SequenceI alignFrom = null;
 635     AlignedCodonFrame mapping = null;
 636     for (AlignedCodonFrame mp : mappings)
 637     {
 638       alignFrom = mp.findAlignedSequence(seq, al);
 639       if (alignFrom != null)
 640       {
 641         mapping = mp;
 642         break;
 643       }
 644     }
 645
 646     if (alignFrom == null)
 647     {
 648       return false;
 649     }
 650     alignSequenceAs(seq, alignFrom, mapping, gap, al.getGapCharacter(),
 651             preserveMappedGaps, preserveUnmappedGaps);
 652     return true;
 653   }
 654
 655   /**
 656    * Align sequence 'alignTo' the same way as 'alignFrom', using the mapping to
 657    * match residues and codons. Flags control whether existing gaps in unmapped
 658    * (intron) and mapped (exon) regions are preserved or not. Gaps between
 659    * intron and exon are only retained if both flags are set.
 660    *
 661    * @param alignTo
 662    * @param alignFrom
 663    * @param mapping
 664    * @param myGap
 665    * @param sourceGap
 666    * @param preserveUnmappedGaps
 667    * @param preserveMappedGaps
 668    */
 669   public static void alignSequenceAs(SequenceI alignTo, SequenceI alignFrom,
 670           AlignedCodonFrame mapping, String myGap, char sourceGap,
 671           boolean preserveMappedGaps, boolean preserveUnmappedGaps)
 672   {
 673     // TODO generalise to work for Protein-Protein, dna-dna, dna-protein
 674
 675     // aligned and dataset sequence positions, all base zero
 676     int thisSeqPos = 0;
 677     int sourceDsPos = 0;
 678
 679     int basesWritten = 0;
 680     char myGapChar = myGap.charAt(0);
 681     int ratio = myGap.length();
 682
 683     int fromOffset = alignFrom.getStart() - 1;
 684     int toOffset = alignTo.getStart() - 1;
 685     int sourceGapMappedLength = 0;
 686     boolean inExon = false;
 687     final int toLength = alignTo.getLength();
 688     final int fromLength = alignFrom.getLength();
 689     StringBuilder thisAligned = new StringBuilder(2 * toLength);
 690
 691     /*
 692      * Traverse the 'model' aligned sequence
 693      */
 694     for (int i = 0; i < fromLength; i++)
 695     {
 696       char sourceChar = alignFrom.getCharAt(i);
 697       if (sourceChar == sourceGap)
 698       {
 699         sourceGapMappedLength += ratio;
 700         continue;
 701       }
 702
 703       /*
 704        * Found a non-gap character. Locate its mapped region if any.
 705        */
 706       sourceDsPos++;
 707       // Note mapping positions are base 1, our sequence positions base 0
 708       int[] mappedPos = mapping.getMappedRegion(alignTo, alignFrom,
 709               sourceDsPos + fromOffset);
 710       if (mappedPos == null)
 711       {
 712         /*
 713          * unmapped position; treat like a gap
 714          */
 715         sourceGapMappedLength += ratio;
 716         // jalview.bin.Console.errPrintln("Can't align: no codon mapping to
 717         // residue "
 718         // + sourceDsPos + "(" + sourceChar + ")");
 719         // return;
 720         continue;
 721       }
 722
 723       int mappedCodonStart = mappedPos[0]; // position (1...) of codon start
 724       int mappedCodonEnd = mappedPos[mappedPos.length - 1]; // codon end pos
 725       StringBuilder trailingCopiedGap = new StringBuilder();
 726
 727       /*
 728        * Copy dna sequence up to and including this codon. Optionally, include
 729        * gaps before the codon starts (in introns) and/or after the codon starts
 730        * (in exons).
 731        *
 732        * Note this only works for 'linear' splicing, not reverse or interleaved.
 733        * But then 'align dna as protein' doesn't make much sense otherwise.
 734        */
 735       int intronLength = 0;
 736       while (basesWritten + toOffset < mappedCodonEnd
 737               && thisSeqPos < toLength)
 738       {
 739         final char c = alignTo.getCharAt(thisSeqPos++);
 740         if (c != myGapChar)
 741         {
 742           basesWritten++;
 743           int sourcePosition = basesWritten + toOffset;
 744           if (sourcePosition < mappedCodonStart)
 745           {
 746             /*
 747              * Found an unmapped (intron) base. First add in any preceding gaps
 748              * (if wanted).
 749              */
 750             if (preserveUnmappedGaps && trailingCopiedGap.length() > 0)
 751             {
 752               thisAligned.append(trailingCopiedGap.toString());
 753               intronLength += trailingCopiedGap.length();
 754               trailingCopiedGap = new StringBuilder();
 755             }
 756             intronLength++;
 757             inExon = false;
 758           }
 759           else
 760           {
 761             final boolean startOfCodon = sourcePosition == mappedCodonStart;
 762             int gapsToAdd = calculateGapsToInsert(preserveMappedGaps,
 763                     preserveUnmappedGaps, sourceGapMappedLength, inExon,
 764                     trailingCopiedGap.length(), intronLength, startOfCodon);
 765             for (int k = 0; k < gapsToAdd; k++)
 766             {
 767               thisAligned.append(myGapChar);
 768             }
 769             sourceGapMappedLength = 0;
 770             inExon = true;
 771           }
 772           thisAligned.append(c);
 773           trailingCopiedGap = new StringBuilder();
 774         }
 775         else
 776         {
 777           if (inExon && preserveMappedGaps)
 778           {
 779             trailingCopiedGap.append(myGapChar);
 780           }
 781           else if (!inExon && preserveUnmappedGaps)
 782           {
 783             trailingCopiedGap.append(myGapChar);
 784           }
 785         }
 786       }
 787     }
 788
 789     /*
 790      * At end of model aligned sequence. Copy any remaining target sequence, optionally
 791      * including (intron) gaps.
 792      */
 793     while (thisSeqPos < toLength)
 794     {
 795       final char c = alignTo.getCharAt(thisSeqPos++);
 796       if (c != myGapChar || preserveUnmappedGaps)
 797       {
 798         thisAligned.append(c);
 799       }
 800       sourceGapMappedLength--;
 801     }
 802
 803     /*
 804      * finally add gaps to pad for any trailing source gaps or
 805      * unmapped characters
 806      */
 807     if (preserveUnmappedGaps)
 808     {
 809       while (sourceGapMappedLength > 0)
 810       {
 811         thisAligned.append(myGapChar);
 812         sourceGapMappedLength--;
 813       }
 814     }
 815
 816     /*
 817      * All done aligning, set the aligned sequence.
 818      */
 819     alignTo.setSequence(new String(thisAligned));
 820   }
 821
 822   /**
 823    * Helper method to work out how many gaps to insert when realigning.
 824    *
 825    * @param preserveMappedGaps
 826    * @param preserveUnmappedGaps
 827    * @param sourceGapMappedLength
 828    * @param inExon
 829    * @param trailingCopiedGap
 830    * @param intronLength
 831    * @param startOfCodon
 832    * @return
 833    */
 834   protected static int calculateGapsToInsert(boolean preserveMappedGaps,
 835           boolean preserveUnmappedGaps, int sourceGapMappedLength,
 836           boolean inExon, int trailingGapLength, int intronLength,
 837           final boolean startOfCodon)
 838   {
 839     int gapsToAdd = 0;
 840     if (startOfCodon)
 841     {
 842       /*
 843        * Reached start of codon. Ignore trailing gaps in intron unless we are
 844        * preserving gaps in both exon and intron. Ignore them anyway if the
 845        * protein alignment introduces a gap at least as large as the intronic
 846        * region.
 847        */
 848       if (inExon && !preserveMappedGaps)
 849       {
 850         trailingGapLength = 0;
 851       }
 852       if (!inExon && !(preserveMappedGaps && preserveUnmappedGaps))
 853       {
 854         trailingGapLength = 0;
 855       }
 856       if (inExon)
 857       {
 858         gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength);
 859       }
 860       else
 861       {
 862         if (intronLength + trailingGapLength <= sourceGapMappedLength)
 863         {
 864           gapsToAdd = sourceGapMappedLength - intronLength;
 865         }
 866         else
 867         {
 868           gapsToAdd = Math.min(
 869                   intronLength + trailingGapLength - sourceGapMappedLength,
 870                   trailingGapLength);
 871         }
 872       }
 873     }
 874     else
 875     {
 876       /*
 877        * second or third base of codon; check for any gaps in dna
 878        */
 879       if (!preserveMappedGaps)
 880       {
 881         trailingGapLength = 0;
 882       }
 883       gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength);
 884     }
 885     return gapsToAdd;
 886   }
 887
 888   /**
 889    * Realigns the given protein to match the alignment of the dna, using codon
 890    * mappings to translate aligned codon positions to protein residues.
 891    *
 892    * @param protein
 893    *          the alignment whose sequences are realigned by this method
 894    * @param dna
 895    *          the dna alignment whose alignment we are 'copying'
 896    * @return the number of sequences that were realigned
 897    */
 898   public static int alignProteinAsDna(AlignmentI protein, AlignmentI dna)
 899   {
 900     if (protein.isNucleotide() || !dna.isNucleotide())
 901     {
 902       jalview.bin.Console
 903               .errPrintln("Wrong alignment type in alignProteinAsDna");
 904       return 0;
 905     }
 906     List<SequenceI> unmappedProtein = new ArrayList<>();
 907     Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons = buildCodonColumnsMap(
 908             protein, dna, unmappedProtein);
 909     return alignProteinAs(protein, alignedCodons, unmappedProtein);
 910   }
 911
 912   /**
 913    * Realigns the given dna to match the alignment of the protein, using codon
 914    * mappings to translate aligned peptide positions to codons.
 915    *
 916    * Always produces a padded CDS alignment.
 917    *
 918    * @param dna
 919    *          the alignment whose sequences are realigned by this method
 920    * @param protein
 921    *          the protein alignment whose alignment we are 'copying'
 922    * @return the number of sequences that were realigned
 923    */
 924   public static int alignCdsAsProtein(AlignmentI dna, AlignmentI protein)
 925   {
 926     if (protein.isNucleotide() || !dna.isNucleotide())
 927     {
 928       jalview.bin.Console
 929               .errPrintln("Wrong alignment type in alignProteinAsDna");
 930       return 0;
 931     }
 932     // todo: implement this
 933     List<AlignedCodonFrame> mappings = protein.getCodonFrames();
 934     int alignedCount = 0;
 935     int width = 0; // alignment width for padding CDS
 936     for (SequenceI dnaSeq : dna.getSequences())
 937     {
 938       if (alignCdsSequenceAsProtein(dnaSeq, protein, mappings,
 939               dna.getGapCharacter()))
 940       {
 941         alignedCount++;
 942       }
 943       width = Math.max(dnaSeq.getLength(), width);
 944     }
 945     int oldwidth;
 946     int diff;
 947     for (SequenceI dnaSeq : dna.getSequences())
 948     {
 949       oldwidth = dnaSeq.getLength();
 950       diff = width - oldwidth;
 951       if (diff > 0)
 952       {
 953         dnaSeq.insertCharAt(oldwidth, diff, dna.getGapCharacter());
 954       }
 955     }
 956     return alignedCount;
 957   }
 958
 959   /**
 960    * Helper method to align (if possible) the dna sequence to match the
 961    * alignment of a mapped protein sequence. This is currently limited to
 962    * handling coding sequence only.
 963    *
 964    * @param cdsSeq
 965    * @param protein
 966    * @param mappings
 967    * @param gapChar
 968    * @return
 969    */
 970   static boolean alignCdsSequenceAsProtein(SequenceI cdsSeq,
 971           AlignmentI protein, List<AlignedCodonFrame> mappings,
 972           char gapChar)
 973   {
 974     SequenceI cdsDss = cdsSeq.getDatasetSequence();
 975     if (cdsDss == null)
 976     {
 977       System.err
 978               .println("alignCdsSequenceAsProtein needs aligned sequence!");
 979       return false;
 980     }
 981
 982     List<AlignedCodonFrame> dnaMappings = MappingUtils
 983             .findMappingsForSequence(cdsSeq, mappings);
 984     for (AlignedCodonFrame mapping : dnaMappings)
 985     {
 986       SequenceI peptide = mapping.findAlignedSequence(cdsSeq, protein);
 987       if (peptide != null)
 988       {
 989         final int peptideLength = peptide.getLength();
 990         Mapping map = mapping.getMappingBetween(cdsSeq, peptide);
 991         if (map != null)
 992         {
 993           MapList mapList = map.getMap();
 994           if (map.getTo() == peptide.getDatasetSequence())
 995           {
 996             mapList = mapList.getInverse();
 997           }
 998           final int cdsLength = cdsDss.getLength();
 999           int mappedFromLength = MappingUtils
1000                   .getLength(mapList.getFromRanges());
1001           int mappedToLength = MappingUtils
1002                   .getLength(mapList.getToRanges());
1003           boolean addStopCodon = (cdsLength == mappedFromLength
1004                   * CODON_LENGTH + CODON_LENGTH)
1005                   || (peptide.getDatasetSequence()
1006                           .getLength() == mappedFromLength - 1);
1007           if (cdsLength != mappedToLength && !addStopCodon)
1008           {
1009             jalview.bin.Console.errPrintln(String.format(
1010                     "Can't align cds as protein (length mismatch %d/%d): %s",
1011                     cdsLength, mappedToLength, cdsSeq.getName()));
1012           }
1013
1014           /*
1015            * pre-fill the aligned cds sequence with gaps
1016            */
1017           char[] alignedCds = new char[peptideLength * CODON_LENGTH
1018                   + (addStopCodon ? CODON_LENGTH : 0)];
1019           Arrays.fill(alignedCds, gapChar);
1020
1021           /*
1022            * walk over the aligned peptide sequence and insert mapped
1023            * codons for residues in the aligned cds sequence
1024            */
1025           int copiedBases = 0;
1026           int cdsStart = cdsDss.getStart();
1027           int proteinPos = peptide.getStart() - 1;
1028           int cdsCol = 0;
1029
1030           for (int col = 0; col < peptideLength; col++)
1031           {
1032             char residue = peptide.getCharAt(col);
1033
1034             if (Comparison.isGap(residue))
1035             {
1036               cdsCol += CODON_LENGTH;
1037             }
1038             else
1039             {
1040               proteinPos++;
1041               int[] codon = mapList.locateInTo(proteinPos, proteinPos);
1042               if (codon == null)
1043               {
1044                 // e.g. incomplete start codon, X in peptide
1045                 cdsCol += CODON_LENGTH;
1046               }
1047               else
1048               {
1049                 for (int j = codon[0]; j <= codon[1]; j++)
1050                 {
1051                   char mappedBase = cdsDss.getCharAt(j - cdsStart);
1052                   alignedCds[cdsCol++] = mappedBase;
1053                   copiedBases++;
1054                 }
1055               }
1056             }
1057           }
1058
1059           /*
1060            * append stop codon if not mapped from protein,
1061            * closing it up to the end of the mapped sequence
1062            */
1063           if (copiedBases == cdsLength - CODON_LENGTH)
1064           {
1065             for (int i = alignedCds.length - 1; i >= 0; i--)
1066             {
1067               if (!Comparison.isGap(alignedCds[i]))
1068               {
1069                 cdsCol = i + 1; // gap just after end of sequence
1070                 break;
1071               }
1072             }
1073             for (int i = cdsLength - CODON_LENGTH; i < cdsLength; i++)
1074             {
1075               alignedCds[cdsCol++] = cdsDss.getCharAt(i);
1076             }
1077           }
1078           cdsSeq.setSequence(new String(alignedCds));
1079           return true;
1080         }
1081       }
1082     }
1083     return false;
1084   }
1085
1086   /**
1087    * Builds a map whose key is an aligned codon position (3 alignment column
1088    * numbers base 0), and whose value is a map from protein sequence to each
1089    * protein's peptide residue for that codon. The map generates an ordering of
1090    * the codons, and allows us to read off the peptides at each position in
1091    * order to assemble 'aligned' protein sequences.
1092    *
1093    * @param protein
1094    *          the protein alignment
1095    * @param dna
1096    *          the coding dna alignment
1097    * @param unmappedProtein
1098    *          any unmapped proteins are added to this list
1099    * @return
1100    */
1101   protected static Map<AlignedCodon, Map<SequenceI, AlignedCodon>> buildCodonColumnsMap(
1102           AlignmentI protein, AlignmentI dna,
1103           List<SequenceI> unmappedProtein)
1104   {
1105     /*
1106      * maintain a list of any proteins with no mappings - these will be
1107      * rendered 'as is' in the protein alignment as we can't align them
1108      */
1109     unmappedProtein.addAll(protein.getSequences());
1110
1111     List<AlignedCodonFrame> mappings = protein.getCodonFrames();
1112
1113     /*
1114      * Map will hold, for each aligned codon position e.g. [3, 5, 6], a map of
1115      * {dnaSequence, {proteinSequence, codonProduct}} at that position. The
1116      * comparator keeps the codon positions ordered.
1117      */
1118     Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons = new TreeMap<>(
1119             new CodonComparator());
1120
1121     for (SequenceI dnaSeq : dna.getSequences())
1122     {
1123       for (AlignedCodonFrame mapping : mappings)
1124       {
1125         SequenceI prot = mapping.findAlignedSequence(dnaSeq, protein);
1126         if (prot != null)
1127         {
1128           Mapping seqMap = mapping.getMappingForSequence(dnaSeq);
1129           addCodonPositions(dnaSeq, prot, protein.getGapCharacter(), seqMap,
1130                   alignedCodons);
1131           unmappedProtein.remove(prot);
1132         }
1133       }
1134     }
1135
1136     /*
1137      * Finally add any unmapped peptide start residues (e.g. for incomplete
1138      * codons) as if at the codon position before the second residue
1139      */
1140     // TODO resolve JAL-2022 so this fudge can be removed
1141     int mappedSequenceCount = protein.getHeight() - unmappedProtein.size();
1142     addUnmappedPeptideStarts(alignedCodons, mappedSequenceCount);
1143
1144     return alignedCodons;
1145   }
1146
1147   /**
1148    * Scans for any protein mapped from position 2 (meaning unmapped start
1149    * position e.g. an incomplete codon), and synthesizes a 'codon' for it at the
1150    * preceding position in the alignment
1151    *
1152    * @param alignedCodons
1153    *          the codon-to-peptide map
1154    * @param mappedSequenceCount
1155    *          the number of distinct sequences in the map
1156    */
1157   protected static void addUnmappedPeptideStarts(
1158           Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1159           int mappedSequenceCount)
1160   {
1161     // TODO delete this ugly hack once JAL-2022 is resolved
1162     // i.e. we can model startPhase > 0 (incomplete start codon)
1163
1164     List<SequenceI> sequencesChecked = new ArrayList<>();
1165     AlignedCodon lastCodon = null;
1166     Map<SequenceI, AlignedCodon> toAdd = new HashMap<>();
1167
1168     for (Entry<AlignedCodon, Map<SequenceI, AlignedCodon>> entry : alignedCodons
1169             .entrySet())
1170     {
1171       for (Entry<SequenceI, AlignedCodon> sequenceCodon : entry.getValue()
1172               .entrySet())
1173       {
1174         SequenceI seq = sequenceCodon.getKey();
1175         if (sequencesChecked.contains(seq))
1176         {
1177           continue;
1178         }
1179         sequencesChecked.add(seq);
1180         AlignedCodon codon = sequenceCodon.getValue();
1181         if (codon.peptideCol > 1)
1182         {
1183           jalview.bin.Console.errPrintln(
1184                   "Problem mapping protein with >1 unmapped start positions: "
1185                           + seq.getName());
1186         }
1187         else if (codon.peptideCol == 1)
1188         {
1189           /*
1190            * first position (peptideCol == 0) was unmapped - add it
1191            */
1192           if (lastCodon != null)
1193           {
1194             AlignedCodon firstPeptide = new AlignedCodon(lastCodon.pos1,
1195                     lastCodon.pos2, lastCodon.pos3,
1196                     String.valueOf(seq.getCharAt(0)), 0);
1197             toAdd.put(seq, firstPeptide);
1198           }
1199           else
1200           {
1201             /*
1202              * unmapped residue at start of alignment (no prior column) -
1203              * 'insert' at nominal codon [0, 0, 0]
1204              */
1205             AlignedCodon firstPeptide = new AlignedCodon(0, 0, 0,
1206                     String.valueOf(seq.getCharAt(0)), 0);
1207             toAdd.put(seq, firstPeptide);
1208           }
1209         }
1210         if (sequencesChecked.size() == mappedSequenceCount)
1211         {
1212           // no need to check past first mapped position in all sequences
1213           break;
1214         }
1215       }
1216       lastCodon = entry.getKey();
1217     }
1218
1219     /*
1220      * add any new codons safely after iterating over the map
1221      */
1222     for (Entry<SequenceI, AlignedCodon> startCodon : toAdd.entrySet())
1223     {
1224       addCodonToMap(alignedCodons, startCodon.getValue(),
1225               startCodon.getKey());
1226     }
1227   }
1228
1229   /**
1230    * Update the aligned protein sequences to match the codon alignments given in
1231    * the map.
1232    *
1233    * @param protein
1234    * @param alignedCodons
1235    *          an ordered map of codon positions (columns), with sequence/peptide
1236    *          values present in each column
1237    * @param unmappedProtein
1238    * @return
1239    */
1240   protected static int alignProteinAs(AlignmentI protein,
1241           Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1242           List<SequenceI> unmappedProtein)
1243   {
1244     /*
1245      * prefill peptide sequences with gaps
1246      */
1247     int alignedWidth = alignedCodons.size();
1248     char[] gaps = new char[alignedWidth];
1249     Arrays.fill(gaps, protein.getGapCharacter());
1250     Map<SequenceI, char[]> peptides = new HashMap<>();
1251     for (SequenceI seq : protein.getSequences())
1252     {
1253       if (!unmappedProtein.contains(seq))
1254       {
1255         peptides.put(seq, Arrays.copyOf(gaps, gaps.length));
1256       }
1257     }
1258
1259     /*
1260      * Traverse the codons left to right (as defined by CodonComparator)
1261      * and insert peptides in each column where the sequence is mapped.
1262      * This gives a peptide 'alignment' where residues are aligned if their
1263      * corresponding codons occupy the same columns in the cdna alignment.
1264      */
1265     int column = 0;
1266     for (AlignedCodon codon : alignedCodons.keySet())
1267     {
1268       final Map<SequenceI, AlignedCodon> columnResidues = alignedCodons
1269               .get(codon);
1270       for (Entry<SequenceI, AlignedCodon> entry : columnResidues.entrySet())
1271       {
1272         char residue = entry.getValue().product.charAt(0);
1273         peptides.get(entry.getKey())[column] = residue;
1274       }
1275       column++;
1276     }
1277
1278     /*
1279      * and finally set the constructed sequences
1280      */
1281     for (Entry<SequenceI, char[]> entry : peptides.entrySet())
1282     {
1283       entry.getKey().setSequence(new String(entry.getValue()));
1284     }
1285
1286     return 0;
1287   }
1288
1289   /**
1290    * Populate the map of aligned codons by traversing the given sequence
1291    * mapping, locating the aligned positions of mapped codons, and adding those
1292    * positions and their translation products to the map.
1293    *
1294    * @param dna
1295    *          the aligned sequence we are mapping from
1296    * @param protein
1297    *          the sequence to be aligned to the codons
1298    * @param gapChar
1299    *          the gap character in the dna sequence
1300    * @param seqMap
1301    *          a mapping to a sequence translation
1302    * @param alignedCodons
1303    *          the map we are building up
1304    */
1305   static void addCodonPositions(SequenceI dna, SequenceI protein,
1306           char gapChar, Mapping seqMap,
1307           Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons)
1308   {
1309     Iterator<AlignedCodon> codons = seqMap.getCodonIterator(dna, gapChar);
1310
1311     /*
1312      * add codon positions, and their peptide translations, to the alignment
1313      * map, while remembering the first codon mapped
1314      */
1315     while (codons.hasNext())
1316     {
1317       try
1318       {
1319         AlignedCodon codon = codons.next();
1320         addCodonToMap(alignedCodons, codon, protein);
1321       } catch (IncompleteCodonException e)
1322       {
1323         // possible incomplete trailing codon - ignore
1324       } catch (NoSuchElementException e)
1325       {
1326         // possibly peptide lacking STOP
1327       }
1328     }
1329   }
1330
1331   /**
1332    * Helper method to add a codon-to-peptide entry to the aligned codons map
1333    *
1334    * @param alignedCodons
1335    * @param codon
1336    * @param protein
1337    */
1338   protected static void addCodonToMap(
1339           Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1340           AlignedCodon codon, SequenceI protein)
1341   {
1342     Map<SequenceI, AlignedCodon> seqProduct = alignedCodons.get(codon);
1343     if (seqProduct == null)
1344     {
1345       seqProduct = new HashMap<>();
1346       alignedCodons.put(codon, seqProduct);
1347     }
1348     seqProduct.put(protein, codon);
1349   }
1350
1351   /**
1352    * Returns true if a cDNA/Protein mapping either exists, or could be made,
1353    * between at least one pair of sequences in the two alignments. Currently,
1354    * the logic is:
1355    * <ul>
1356    * <li>One alignment must be nucleotide, and the other protein</li>
1357    * <li>At least one pair of sequences must be already mapped, or mappable</li>
1358    * <li>Mappable means the nucleotide translation matches the protein
1359    * sequence</li>
1360    * <li>The translation may ignore start and stop codons if present in the
1361    * nucleotide</li>
1362    * </ul>
1363    *
1364    * @param al1
1365    * @param al2
1366    * @return
1367    */
1368   public static boolean isMappable(AlignmentI al1, AlignmentI al2)
1369   {
1370     if (al1 == null || al2 == null)
1371     {
1372       return false;
1373     }
1374
1375     /*
1376      * Require one nucleotide and one protein
1377      */
1378     if (al1.isNucleotide() == al2.isNucleotide())
1379     {
1380       return false;
1381     }
1382     AlignmentI dna = al1.isNucleotide() ? al1 : al2;
1383     AlignmentI protein = dna == al1 ? al2 : al1;
1384     List<AlignedCodonFrame> mappings = protein.getCodonFrames();
1385     for (SequenceI dnaSeq : dna.getSequences())
1386     {
1387       for (SequenceI proteinSeq : protein.getSequences())
1388       {
1389         if (isMappable(dnaSeq, proteinSeq, mappings))
1390         {
1391           return true;
1392         }
1393       }
1394     }
1395     return false;
1396   }
1397
1398   /**
1399    * Returns true if the dna sequence is mapped, or could be mapped, to the
1400    * protein sequence.
1401    *
1402    * @param dnaSeq
1403    * @param proteinSeq
1404    * @param mappings
1405    * @return
1406    */
1407   protected static boolean isMappable(SequenceI dnaSeq,
1408           SequenceI proteinSeq, List<AlignedCodonFrame> mappings)
1409   {
1410     if (dnaSeq == null || proteinSeq == null)
1411     {
1412       return false;
1413     }
1414
1415     SequenceI dnaDs = dnaSeq.getDatasetSequence() == null ? dnaSeq
1416             : dnaSeq.getDatasetSequence();
1417     SequenceI proteinDs = proteinSeq.getDatasetSequence() == null
1418             ? proteinSeq
1419             : proteinSeq.getDatasetSequence();
1420
1421     for (AlignedCodonFrame mapping : mappings)
1422     {
1423       if (proteinDs == mapping.getAaForDnaSeq(dnaDs))
1424       {
1425         /*
1426          * already mapped
1427          */
1428         return true;
1429       }
1430     }
1431
1432     /*
1433      * Just try to make a mapping (it is not yet stored), test whether
1434      * successful.
1435      */
1436     return mapCdnaToProtein(proteinDs, dnaDs) != null;
1437   }
1438
1439   /**
1440    * Finds any reference annotations associated with the sequences in
1441    * sequenceScope, that are not already added to the alignment, and adds them
1442    * to the 'candidates' map. Also populates a lookup table of annotation
1443    * labels, keyed by calcId, for use in constructing tooltips or the like.
1444    *
1445    * @param sequenceScope
1446    *          the sequences to scan for reference annotations
1447    * @param labelForCalcId
1448    *          (optional) map to populate with label for calcId
1449    * @param candidates
1450    *          map to populate with annotations for sequence
1451    * @param al
1452    *          the alignment to check for presence of annotations
1453    */
1454   public static void findAddableReferenceAnnotations(
1455           List<SequenceI> sequenceScope, Map<String, String> labelForCalcId,
1456           final Map<SequenceI, List<AlignmentAnnotation>> candidates,
1457           AlignmentI al)
1458   {
1459     if (sequenceScope == null)
1460     {
1461       return;
1462     }
1463
1464     /*
1465      * For each sequence in scope, make a list of any annotations on the
1466      * underlying dataset sequence which are not already on the alignment.
1467      *
1468      * Add to a map of { alignmentSequence, <List of annotations to add> }
1469      */
1470     for (SequenceI seq : sequenceScope)
1471     {
1472       SequenceI dataset = seq.getDatasetSequence();
1473       if (dataset == null)
1474       {
1475         continue;
1476       }
1477       AlignmentAnnotation[] datasetAnnotations = dataset.getAnnotation();
1478       if (datasetAnnotations == null)
1479       {
1480         continue;
1481       }
1482       final List<AlignmentAnnotation> result = new ArrayList<>();
1483       for (AlignmentAnnotation dsann : datasetAnnotations)
1484       {
1485         /*
1486          * Find matching annotations on the alignment. If none is found, then
1487          * add this annotation to the list of 'addable' annotations for this
1488          * sequence.
1489          */
1490         final Iterable<AlignmentAnnotation> matchedAlignmentAnnotations = al
1491                 .findAnnotations(seq, dsann.getCalcId(), dsann.label);
1492         boolean found = false;
1493         if (matchedAlignmentAnnotations != null)
1494         {
1495           for (AlignmentAnnotation matched : matchedAlignmentAnnotations)
1496           {
1497             if (dsann.description.equals(matched.description))
1498             {
1499               found = true;
1500               break;
1501             }
1502           }
1503         }
1504         if (!found)
1505         {
1506           result.add(dsann);
1507           if (labelForCalcId != null)
1508           {
1509             labelForCalcId.put(dsann.getCalcId(), dsann.label);
1510           }
1511         }
1512       }
1513       /*
1514        * Save any addable annotations for this sequence
1515        */
1516       if (!result.isEmpty())
1517       {
1518         candidates.put(seq, result);
1519       }
1520     }
1521   }
1522
1523   /**
1524    * Adds annotations to the top of the alignment annotations, in the same order
1525    * as their related sequences. If you already have an annotation and want to
1526    * add it to a sequence in an alignment use {@code addReferenceAnnotationTo}
1527    *
1528    * @param annotations
1529    *          the annotations to add
1530    * @param alignment
1531    *          the alignment to add them to
1532    * @param selectionGroup
1533    *          current selection group - may be null, if provided then any added
1534    *          annotation will be trimmed to just those columns in the selection
1535    *          group
1536    */
1537   public static void addReferenceAnnotations(
1538           Map<SequenceI, List<AlignmentAnnotation>> annotations,
1539           final AlignmentI alignment, final SequenceGroup selectionGroup)
1540   {
1541     for (SequenceI seq : annotations.keySet())
1542     {
1543       for (AlignmentAnnotation ann : annotations.get(seq))
1544       {
1545         addReferenceAnnotationTo(alignment, seq, ann, selectionGroup);
1546       }
1547     }
1548   }
1549
1550
1551   public static boolean isSSAnnotationPresent( Map<SequenceI, List<AlignmentAnnotation>> annotations) {
1552
1553     for (SequenceI seq : annotations.keySet())
1554     {
1555       for (AlignmentAnnotation ann : annotations.get(seq))
1556       {
1557         if(ann.getDescription(false).startsWith(SS_ANNOTATION_LABEL)) {
1558           return true;
1559         }
1560       }
1561     }
1562     return false;
1563   }
1564
1565   /**
1566    * Make a copy of a reference annotation {@code ann} and add it to an
1567    * alignment sequence {@code seq} in {@code alignment}, optionally limited to
1568    * the extent of {@code selectionGroup}
1569    *
1570    * @param alignment
1571    * @param seq
1572    * @param ann
1573    * @param selectionGroup
1574    *          current selection group - may be null, if provided then any added
1575    *          annotation will be trimmed to just those columns in the selection
1576    *          group
1577    * @return annotation added to {@code seq and {@code alignment}
1578    */
1579   public static AlignmentAnnotation addReferenceAnnotationTo(
1580           final AlignmentI alignment, final SequenceI seq,
1581           final AlignmentAnnotation ann, final SequenceGroup selectionGroup)
1582   {
1583     AlignmentAnnotation copyAnn = new AlignmentAnnotation(ann);
1584     int startRes = 0;
1585     int endRes = ann.annotations.length;
1586     if (selectionGroup != null)
1587     {
1588       startRes = -1 + Math.min(seq.getEnd(), Math.max(seq.getStart(),
1589               seq.findPosition(selectionGroup.getStartRes())));
1590       endRes = -1 + Math.min(seq.getEnd(),
1591               seq.findPosition(selectionGroup.getEndRes()));
1592
1593     }
1594     copyAnn.restrict(startRes, endRes + 0);
1595
1596     /*
1597      * Add to the sequence (sets copyAnn.datasetSequence), unless the
1598      * original annotation is already on the sequence.
1599      */
1600     if (!seq.hasAnnotation(ann))
1601     {
1602       ContactMatrixI cm = seq.getDatasetSequence().getContactMatrixFor(ann);
1603       if (cm != null)
1604       {
1605         seq.addContactListFor(copyAnn, cm);
1606       }
1607       seq.addAlignmentAnnotation(copyAnn);
1608     }
1609     // adjust for gaps
1610     copyAnn.adjustForAlignment();
1611     // add to the alignment and set visible
1612     alignment.addAnnotation(copyAnn);
1613     copyAnn.visible = true;
1614
1615     return copyAnn;
1616   }
1617
1618   /**
1619    * Set visibility of alignment annotations of specified types (labels), for
1620    * specified sequences. This supports controls like "Show all secondary
1621    * structure", "Hide all Temp factor", etc.
1622    *
1623    * @al the alignment to scan for annotations
1624    * @param types
1625    *          the types (labels) of annotations to be updated
1626    * @param forSequences
1627    *          if not null, only annotations linked to one of these sequences are
1628    *          in scope for update; if null, acts on all sequence annotations
1629    * @param anyType
1630    *          if this flag is true, 'types' is ignored (label not checked)
1631    * @param doShow
1632    *          if true, set visibility on, else set off
1633    */
1634   public static void showOrHideSequenceAnnotations(AlignmentI al,
1635           Collection<String> types, List<SequenceI> forSequences,
1636           boolean anyType, boolean doShow)
1637   {
1638     AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
1639     if (anns != null)
1640     {
1641       for (AlignmentAnnotation aa : anns)
1642       {
1643         if (anyType || types.contains(aa.label))
1644         {
1645           if ((aa.sequenceRef != null) && (forSequences == null
1646                   || forSequences.contains(aa.sequenceRef)))
1647           {
1648             aa.visible = doShow;
1649           }
1650         }
1651       }
1652     }
1653   }
1654
1655   public static AlignmentAnnotation getFirstSequenceAnnotationOfType(
1656           AlignmentI al, int graphType)
1657   {
1658     AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
1659     if (anns != null)
1660     {
1661       for (AlignmentAnnotation aa : anns)
1662       {
1663         if (aa.sequenceRef != null && aa.graph == graphType)
1664           return aa;
1665       }
1666     }
1667     return null;
1668   }
1669
1670   /**
1671    * Returns true if either sequence has a cross-reference to the other
1672    *
1673    * @param seq1
1674    * @param seq2
1675    * @return
1676    */
1677   public static boolean haveCrossRef(SequenceI seq1, SequenceI seq2)
1678   {
1679     // Note: moved here from class CrossRef as the latter class has dependencies
1680     // not availability to the applet's classpath
1681     return hasCrossRef(seq1, seq2) || hasCrossRef(seq2, seq1);
1682   }
1683
1684   /**
1685    * Returns true if seq1 has a cross-reference to seq2. Currently this assumes
1686    * that sequence name is structured as Source|AccessionId.
1687    *
1688    * @param seq1
1689    * @param seq2
1690    * @return
1691    */
1692   public static boolean hasCrossRef(SequenceI seq1, SequenceI seq2)
1693   {
1694     if (seq1 == null || seq2 == null)
1695     {
1696       return false;
1697     }
1698     String name = seq2.getName();
1699     final List<DBRefEntry> xrefs = seq1.getDBRefs();
1700     if (xrefs != null)
1701     {
1702       for (int ix = 0, nx = xrefs.size(); ix < nx; ix++)
1703       {
1704         DBRefEntry xref = xrefs.get(ix);
1705         String xrefName = xref.getSource() + "|" + xref.getAccessionId();
1706         // case-insensitive test, consistent with DBRefEntry.equalRef()
1707         if (xrefName.equalsIgnoreCase(name))
1708         {
1709           return true;
1710         }
1711       }
1712     }
1713     return false;
1714   }
1715
1716   /**
1717    * Constructs an alignment consisting of the mapped (CDS) regions in the given
1718    * nucleotide sequences, and updates mappings to match. The CDS sequences are
1719    * added to the original alignment's dataset, which is shared by the new
1720    * alignment. Mappings from nucleotide to CDS, and from CDS to protein, are
1721    * added to the alignment dataset.
1722    *
1723    * @param dna
1724    *          aligned nucleotide (dna or cds) sequences
1725    * @param dataset
1726    *          the alignment dataset the sequences belong to
1727    * @param products
1728    *          (optional) to restrict results to CDS that map to specified
1729    *          protein products
1730    * @return an alignment whose sequences are the cds-only parts of the dna
1731    *         sequences (or null if no mappings are found)
1732    */
1733   public static AlignmentI makeCdsAlignment(SequenceI[] dna,
1734           AlignmentI dataset, SequenceI[] products)
1735   {
1736     if (dataset == null || dataset.getDataset() != null)
1737     {
1738       throw new IllegalArgumentException(
1739               "IMPLEMENTATION ERROR: dataset.getDataset() must be null!");
1740     }
1741     List<SequenceI> foundSeqs = new ArrayList<>();
1742     List<SequenceI> cdsSeqs = new ArrayList<>();
1743     List<AlignedCodonFrame> mappings = dataset.getCodonFrames();
1744     HashSet<SequenceI> productSeqs = null;
1745     if (products != null)
1746     {
1747       productSeqs = new HashSet<>();
1748       for (SequenceI seq : products)
1749       {
1750         productSeqs.add(seq.getDatasetSequence() == null ? seq
1751                 : seq.getDatasetSequence());
1752       }
1753     }
1754
1755     /*
1756      * Construct CDS sequences from mappings on the alignment dataset.
1757      * The logic is:
1758      * - find the protein product(s) mapped to from each dna sequence
1759      * - if the mapping covers the whole dna sequence (give or take start/stop
1760      *   codon), take the dna as the CDS sequence
1761      * - else search dataset mappings for a suitable dna sequence, i.e. one
1762      *   whose whole sequence is mapped to the protein
1763      * - if no sequence found, construct one from the dna sequence and mapping
1764      *   (and add it to dataset so it is found if this is repeated)
1765      */
1766     for (SequenceI dnaSeq : dna)
1767     {
1768       SequenceI dnaDss = dnaSeq.getDatasetSequence() == null ? dnaSeq
1769               : dnaSeq.getDatasetSequence();
1770
1771       List<AlignedCodonFrame> seqMappings = MappingUtils
1772               .findMappingsForSequence(dnaSeq, mappings);
1773       for (AlignedCodonFrame mapping : seqMappings)
1774       {
1775         List<Mapping> mappingsFromSequence = mapping
1776                 .getMappingsFromSequence(dnaSeq);
1777
1778         for (Mapping aMapping : mappingsFromSequence)
1779         {
1780           MapList mapList = aMapping.getMap();
1781           if (mapList.getFromRatio() == 1)
1782           {
1783             /*
1784              * not a dna-to-protein mapping (likely dna-to-cds)
1785              */
1786             continue;
1787           }
1788
1789           /*
1790            * skip if mapping is not to one of the target set of proteins
1791            */
1792           SequenceI proteinProduct = aMapping.getTo();
1793           if (productSeqs != null && !productSeqs.contains(proteinProduct))
1794           {
1795             continue;
1796           }
1797
1798           /*
1799            * try to locate the CDS from the dataset mappings;
1800            * guard against duplicate results (for the case that protein has
1801            * dbrefs to both dna and cds sequences)
1802            */
1803           SequenceI cdsSeq = findCdsForProtein(mappings, dnaSeq,
1804                   seqMappings, aMapping);
1805           if (cdsSeq != null)
1806           {
1807             if (!foundSeqs.contains(cdsSeq))
1808             {
1809               foundSeqs.add(cdsSeq);
1810               SequenceI derivedSequence = cdsSeq.deriveSequence();
1811               cdsSeqs.add(derivedSequence);
1812               if (!dataset.getSequences().contains(cdsSeq))
1813               {
1814                 dataset.addSequence(cdsSeq);
1815               }
1816             }
1817             continue;
1818           }
1819
1820           /*
1821            * didn't find mapped CDS sequence - construct it and add
1822            * its dataset sequence to the dataset
1823            */
1824           cdsSeq = makeCdsSequence(dnaSeq.getDatasetSequence(), aMapping,
1825                   dataset).deriveSequence();
1826           // cdsSeq has a name constructed as CDS|<dbref>
1827           // <dbref> will be either the accession for the coding sequence,
1828           // marked in the /via/ dbref to the protein product accession
1829           // or it will be the original nucleotide accession.
1830           SequenceI cdsSeqDss = cdsSeq.getDatasetSequence();
1831
1832           cdsSeqs.add(cdsSeq);
1833
1834           /*
1835            * build the mapping from CDS to protein
1836            */
1837           List<int[]> cdsRange = Collections
1838                   .singletonList(new int[]
1839                   { cdsSeq.getStart(),
1840                       cdsSeq.getLength() + cdsSeq.getStart() - 1 });
1841           MapList cdsToProteinMap = new MapList(cdsRange,
1842                   mapList.getToRanges(), mapList.getFromRatio(),
1843                   mapList.getToRatio());
1844
1845           if (!dataset.getSequences().contains(cdsSeqDss))
1846           {
1847             /*
1848              * if this sequence is a newly created one, add it to the dataset
1849              * and made a CDS to protein mapping (if sequence already exists,
1850              * CDS-to-protein mapping _is_ the transcript-to-protein mapping)
1851              */
1852             dataset.addSequence(cdsSeqDss);
1853             AlignedCodonFrame cdsToProteinMapping = new AlignedCodonFrame();
1854             cdsToProteinMapping.addMap(cdsSeqDss, proteinProduct,
1855                     cdsToProteinMap);
1856
1857             /*
1858              * guard against duplicating the mapping if repeating this action
1859              */
1860             if (!mappings.contains(cdsToProteinMapping))
1861             {
1862               mappings.add(cdsToProteinMapping);
1863             }
1864           }
1865
1866           propagateDBRefsToCDS(cdsSeqDss, dnaSeq.getDatasetSequence(),
1867                   proteinProduct, aMapping);
1868           /*
1869            * add another mapping from original 'from' range to CDS
1870            */
1871           AlignedCodonFrame dnaToCdsMapping = new AlignedCodonFrame();
1872           final MapList dnaToCdsMap = new MapList(mapList.getFromRanges(),
1873                   cdsRange, 1, 1);
1874           dnaToCdsMapping.addMap(dnaSeq.getDatasetSequence(), cdsSeqDss,
1875                   dnaToCdsMap);
1876           if (!mappings.contains(dnaToCdsMapping))
1877           {
1878             mappings.add(dnaToCdsMapping);
1879           }
1880
1881           /*
1882            * transfer dna chromosomal loci (if known) to the CDS
1883            * sequence (via the mapping)
1884            */
1885           final MapList cdsToDnaMap = dnaToCdsMap.getInverse();
1886           transferGeneLoci(dnaSeq, cdsToDnaMap, cdsSeq);
1887
1888           /*
1889            * add DBRef with mapping from protein to CDS
1890            * (this enables Get Cross-References from protein alignment)
1891            * This is tricky because we can't have two DBRefs with the
1892            * same source and accession, so need a different accession for
1893            * the CDS from the dna sequence
1894            */
1895
1896           // specific use case:
1897           // Genomic contig ENSCHR:1, contains coding regions for ENSG01,
1898           // ENSG02, ENSG03, with transcripts and products similarly named.
1899           // cannot add distinct dbrefs mapping location on ENSCHR:1 to ENSG01
1900
1901           // JBPNote: ?? can't actually create an example that demonstrates we
1902           // need to
1903           // synthesize an xref.
1904
1905           List<DBRefEntry> primrefs = dnaDss.getPrimaryDBRefs();
1906           for (int ip = 0, np = primrefs.size(); ip < np; ip++)
1907           {
1908             DBRefEntry primRef = primrefs.get(ip);
1909             /*
1910              * create a cross-reference from CDS to the source sequence's
1911              * primary reference and vice versa
1912              */
1913             String source = primRef.getSource();
1914             String version = primRef.getVersion();
1915             DBRefEntry cdsCrossRef = new DBRefEntry(source,
1916                     source + ":" + version, primRef.getAccessionId());
1917             cdsCrossRef
1918                     .setMap(new Mapping(dnaDss, new MapList(cdsToDnaMap)));
1919             cdsSeqDss.addDBRef(cdsCrossRef);
1920
1921             dnaSeq.addDBRef(new DBRefEntry(source, version,
1922                     cdsSeq.getName(), new Mapping(cdsSeqDss, dnaToCdsMap)));
1923             // problem here is that the cross-reference is synthesized -
1924             // cdsSeq.getName() may be like 'CDS|dnaaccession' or
1925             // 'CDS|emblcdsacc'
1926             // assuming cds version same as dna ?!?
1927
1928             DBRefEntry proteinToCdsRef = new DBRefEntry(source, version,
1929                     cdsSeq.getName());
1930             //
1931             proteinToCdsRef.setMap(
1932                     new Mapping(cdsSeqDss, cdsToProteinMap.getInverse()));
1933             proteinProduct.addDBRef(proteinToCdsRef);
1934           }
1935           /*
1936            * transfer any features on dna that overlap the CDS
1937            */
1938           transferFeatures(dnaSeq, cdsSeq, dnaToCdsMap, null,
1939                   SequenceOntologyI.CDS);
1940         }
1941       }
1942     }
1943
1944     AlignmentI cds = new Alignment(
1945             cdsSeqs.toArray(new SequenceI[cdsSeqs.size()]));
1946     cds.setDataset(dataset);
1947
1948     return cds;
1949   }
1950
1951   /**
1952    * Tries to transfer gene loci (dbref to chromosome positions) from fromSeq to
1953    * toSeq, mediated by the given mapping between the sequences
1954    *
1955    * @param fromSeq
1956    * @param targetToFrom
1957    *          Map
1958    * @param targetSeq
1959    */
1960   protected static void transferGeneLoci(SequenceI fromSeq,
1961           MapList targetToFrom, SequenceI targetSeq)
1962   {
1963     if (targetSeq.getGeneLoci() != null)
1964     {
1965       // already have - don't override
1966       return;
1967     }
1968     GeneLociI fromLoci = fromSeq.getGeneLoci();
1969     if (fromLoci == null)
1970     {
1971       return;
1972     }
1973
1974     MapList newMap = targetToFrom.traverse(fromLoci.getMapping());
1975
1976     if (newMap != null)
1977     {
1978       targetSeq.setGeneLoci(fromLoci.getSpeciesId(),
1979               fromLoci.getAssemblyId(), fromLoci.getChromosomeId(), newMap);
1980     }
1981   }
1982
1983   /**
1984    * A helper method that finds a CDS sequence in the alignment dataset that is
1985    * mapped to the given protein sequence, and either is, or has a mapping from,
1986    * the given dna sequence.
1987    *
1988    * @param mappings
1989    *          set of all mappings on the dataset
1990    * @param dnaSeq
1991    *          a dna (or cds) sequence we are searching from
1992    * @param seqMappings
1993    *          the set of mappings involving dnaSeq
1994    * @param aMapping
1995    *          a transcript-to-peptide mapping
1996    * @return
1997    */
1998   static SequenceI findCdsForProtein(List<AlignedCodonFrame> mappings,
1999           SequenceI dnaSeq, List<AlignedCodonFrame> seqMappings,
2000           Mapping aMapping)
2001   {
2002     /*
2003      * TODO a better dna-cds-protein mapping data representation to allow easy
2004      * navigation; until then this clunky looping around lists of mappings
2005      */
2006     SequenceI seqDss = dnaSeq.getDatasetSequence() == null ? dnaSeq
2007             : dnaSeq.getDatasetSequence();
2008     SequenceI proteinProduct = aMapping.getTo();
2009
2010     /*
2011      * is this mapping from the whole dna sequence (i.e. CDS)?
2012      * allowing for possible stop codon on dna but not peptide
2013      */
2014     int mappedFromLength = MappingUtils
2015             .getLength(aMapping.getMap().getFromRanges());
2016     int dnaLength = seqDss.getLength();
2017     if (mappedFromLength == dnaLength
2018             || mappedFromLength == dnaLength - CODON_LENGTH)
2019     {
2020       /*
2021        * if sequence has CDS features, this is a transcript with no UTR
2022        * - do not take this as the CDS sequence! (JAL-2789)
2023        */
2024       if (seqDss.getFeatures().getFeaturesByOntology(SequenceOntologyI.CDS)
2025               .isEmpty())
2026       {
2027         return seqDss;
2028       }
2029     }
2030
2031     /*
2032      * looks like we found the dna-to-protein mapping; search for the
2033      * corresponding cds-to-protein mapping
2034      */
2035     List<AlignedCodonFrame> mappingsToPeptide = MappingUtils
2036             .findMappingsForSequence(proteinProduct, mappings);
2037     for (AlignedCodonFrame acf : mappingsToPeptide)
2038     {
2039       for (SequenceToSequenceMapping map : acf.getMappings())
2040       {
2041         Mapping mapping = map.getMapping();
2042         if (mapping != aMapping
2043                 && mapping.getMap().getFromRatio() == CODON_LENGTH
2044                 && proteinProduct == mapping.getTo()
2045                 && seqDss != map.getFromSeq())
2046         {
2047           mappedFromLength = MappingUtils
2048                   .getLength(mapping.getMap().getFromRanges());
2049           if (mappedFromLength == map.getFromSeq().getLength())
2050           {
2051             /*
2052             * found a 3:1 mapping to the protein product which covers
2053             * the whole dna sequence i.e. is from CDS; finally check the CDS
2054             * is mapped from the given dna start sequence
2055             */
2056             SequenceI cdsSeq = map.getFromSeq();
2057             // todo this test is weak if seqMappings contains multiple mappings;
2058             // we get away with it if transcript:cds relationship is 1:1
2059             List<AlignedCodonFrame> dnaToCdsMaps = MappingUtils
2060                     .findMappingsForSequence(cdsSeq, seqMappings);
2061             if (!dnaToCdsMaps.isEmpty())
2062             {
2063               return cdsSeq;
2064             }
2065           }
2066         }
2067       }
2068     }
2069     return null;
2070   }
2071
2072   /**
2073    * Helper method that makes a CDS sequence as defined by the mappings from the
2074    * given sequence i.e. extracts the 'mapped from' ranges (which may be on
2075    * forward or reverse strand).
2076    *
2077    * @param seq
2078    * @param mapping
2079    * @param dataset
2080    *          - existing dataset. We check for sequences that look like the CDS
2081    *          we are about to construct, if one exists already, then we will
2082    *          just return that one.
2083    * @return CDS sequence (as a dataset sequence)
2084    */
2085   static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping,
2086           AlignmentI dataset)
2087   {
2088     /*
2089      * construct CDS sequence name as "CDS|" with 'from id' held in the mapping
2090      * if set (e.g. EMBL protein_id), else sequence name appended
2091      */
2092     String mapFromId = mapping.getMappedFromId();
2093     final String seqId = "CDS|"
2094             + (mapFromId != null ? mapFromId : seq.getName());
2095
2096     SequenceI newSeq = null;
2097
2098     /*
2099      * construct CDS sequence by splicing mapped from ranges
2100      */
2101     char[] seqChars = seq.getSequence();
2102     List<int[]> fromRanges = mapping.getMap().getFromRanges();
2103     int cdsWidth = MappingUtils.getLength(fromRanges);
2104     char[] newSeqChars = new char[cdsWidth];
2105
2106     int newPos = 0;
2107     for (int[] range : fromRanges)
2108     {
2109       if (range[0] <= range[1])
2110       {
2111         // forward strand mapping - just copy the range
2112         int length = range[1] - range[0] + 1;
2113         System.arraycopy(seqChars, range[0] - 1, newSeqChars, newPos,
2114                 length);
2115         newPos += length;
2116       }
2117       else
2118       {
2119         // reverse strand mapping - copy and complement one by one
2120         for (int i = range[0]; i >= range[1]; i--)
2121         {
2122           newSeqChars[newPos++] = Dna.getComplement(seqChars[i - 1]);
2123         }
2124       }
2125
2126       newSeq = new Sequence(seqId, newSeqChars, 1, newPos);
2127     }
2128
2129     if (dataset != null)
2130     {
2131       SequenceI[] matches = dataset.findSequenceMatch(newSeq.getName());
2132       if (matches != null)
2133       {
2134         boolean matched = false;
2135         for (SequenceI mtch : matches)
2136         {
2137           if (mtch.getStart() != newSeq.getStart())
2138           {
2139             continue;
2140           }
2141           if (mtch.getEnd() != newSeq.getEnd())
2142           {
2143             continue;
2144           }
2145           if (!Arrays.equals(mtch.getSequence(), newSeq.getSequence()))
2146           {
2147             continue;
2148           }
2149           if (!matched)
2150           {
2151             matched = true;
2152             newSeq = mtch;
2153           }
2154           else
2155           {
2156             Console.error(
2157                     "JAL-2154 regression: warning - found (and ignored) a duplicate CDS sequence:"
2158                             + mtch.toString());
2159           }
2160         }
2161       }
2162     }
2163     // newSeq.setDescription(mapFromId);
2164
2165     return newSeq;
2166   }
2167
2168   /**
2169    * Adds any DBRefEntrys to cdsSeq from contig that have a Mapping congruent to
2170    * the given mapping.
2171    *
2172    * @param cdsSeq
2173    * @param contig
2174    * @param proteinProduct
2175    * @param mapping
2176    * @return list of DBRefEntrys added
2177    */
2178   protected static List<DBRefEntry> propagateDBRefsToCDS(SequenceI cdsSeq,
2179           SequenceI contig, SequenceI proteinProduct, Mapping mapping)
2180   {
2181
2182     // gather direct refs from contig congruent with mapping
2183     List<DBRefEntry> direct = new ArrayList<>();
2184     HashSet<String> directSources = new HashSet<>();
2185
2186     List<DBRefEntry> refs = contig.getDBRefs();
2187     if (refs != null)
2188     {
2189       for (int ib = 0, nb = refs.size(); ib < nb; ib++)
2190       {
2191         DBRefEntry dbr = refs.get(ib);
2192         MapList map;
2193         if (dbr.hasMap() && (map = dbr.getMap().getMap()).isTripletMap())
2194         {
2195           // check if map is the CDS mapping
2196           if (mapping.getMap().equals(map))
2197           {
2198             direct.add(dbr);
2199             directSources.add(dbr.getSource());
2200           }
2201         }
2202       }
2203     }
2204     List<DBRefEntry> onSource = DBRefUtils.selectRefs(
2205             proteinProduct.getDBRefs(),
2206             directSources.toArray(new String[0]));
2207     List<DBRefEntry> propagated = new ArrayList<>();
2208
2209     // and generate appropriate mappings
2210     for (int ic = 0, nc = direct.size(); ic < nc; ic++)
2211     {
2212       DBRefEntry cdsref = direct.get(ic);
2213       Mapping m = cdsref.getMap();
2214       // clone maplist and mapping
2215       MapList cdsposmap = new MapList(
2216               Arrays.asList(new int[][]
2217               { new int[] { cdsSeq.getStart(), cdsSeq.getEnd() } }),
2218               m.getMap().getToRanges(), 3, 1);
2219       Mapping cdsmap = new Mapping(m.getTo(), m.getMap());
2220
2221       // create dbref
2222       DBRefEntry newref = new DBRefEntry(cdsref.getSource(),
2223               cdsref.getVersion(), cdsref.getAccessionId(),
2224               new Mapping(cdsmap.getTo(), cdsposmap));
2225
2226       // and see if we can map to the protein product for this mapping.
2227       // onSource is the filtered set of accessions on protein that we are
2228       // tranferring, so we assume accession is the same.
2229       if (cdsmap.getTo() == null && onSource != null)
2230       {
2231         List<DBRefEntry> sourceRefs = DBRefUtils.searchRefs(onSource,
2232                 cdsref.getAccessionId());
2233         if (sourceRefs != null)
2234         {
2235           for (DBRefEntry srcref : sourceRefs)
2236           {
2237             if (srcref.getSource().equalsIgnoreCase(cdsref.getSource()))
2238             {
2239               // we have found a complementary dbref on the protein product, so
2240               // update mapping's getTo
2241               newref.getMap().setTo(proteinProduct);
2242             }
2243           }
2244         }
2245       }
2246       cdsSeq.addDBRef(newref);
2247       propagated.add(newref);
2248     }
2249     return propagated;
2250   }
2251
2252   /**
2253    * Transfers co-located features on 'fromSeq' to 'toSeq', adjusting the
2254    * feature start/end ranges, optionally omitting specified feature types.
2255    * Returns the number of features copied.
2256    *
2257    * @param fromSeq
2258    * @param toSeq
2259    * @param mapping
2260    *          the mapping from 'fromSeq' to 'toSeq'
2261    * @param select
2262    *          if not null, only features of this type are copied (including
2263    *          subtypes in the Sequence Ontology)
2264    * @param omitting
2265    */
2266   protected static int transferFeatures(SequenceI fromSeq, SequenceI toSeq,
2267           MapList mapping, String select, String... omitting)
2268   {
2269     SequenceI copyTo = toSeq;
2270     while (copyTo.getDatasetSequence() != null)
2271     {
2272       copyTo = copyTo.getDatasetSequence();
2273     }
2274     if (fromSeq == copyTo || fromSeq.getDatasetSequence() == copyTo)
2275     {
2276       return 0; // shared dataset sequence
2277     }
2278
2279     /*
2280      * get features, optionally restricted by an ontology term
2281      */
2282     List<SequenceFeature> sfs = select == null
2283             ? fromSeq.getFeatures().getPositionalFeatures()
2284             : fromSeq.getFeatures().getFeaturesByOntology(select);
2285
2286     int count = 0;
2287     for (SequenceFeature sf : sfs)
2288     {
2289       String type = sf.getType();
2290       boolean omit = false;
2291       for (String toOmit : omitting)
2292       {
2293         if (type.equals(toOmit))
2294         {
2295           omit = true;
2296         }
2297       }
2298       if (omit)
2299       {
2300         continue;
2301       }
2302
2303       /*
2304        * locate the mapped range - null if either start or end is
2305        * not mapped (no partial overlaps are calculated)
2306        */
2307       int start = sf.getBegin();
2308       int end = sf.getEnd();
2309       int[] mappedTo = mapping.locateInTo(start, end);
2310       /*
2311        * if whole exon range doesn't map, try interpreting it
2312        * as 5' or 3' exon overlapping the CDS range
2313        */
2314       if (mappedTo == null)
2315       {
2316         mappedTo = mapping.locateInTo(end, end);
2317         if (mappedTo != null)
2318         {
2319           /*
2320            * end of exon is in CDS range - 5' overlap
2321            * to a range from the start of the peptide
2322            */
2323           mappedTo[0] = 1;
2324         }
2325       }
2326       if (mappedTo == null)
2327       {
2328         mappedTo = mapping.locateInTo(start, start);
2329         if (mappedTo != null)
2330         {
2331           /*
2332            * start of exon is in CDS range - 3' overlap
2333            * to a range up to the end of the peptide
2334            */
2335           mappedTo[1] = toSeq.getLength();
2336         }
2337       }
2338       if (mappedTo != null)
2339       {
2340         int newBegin = Math.min(mappedTo[0], mappedTo[1]);
2341         int newEnd = Math.max(mappedTo[0], mappedTo[1]);
2342         SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd,
2343                 sf.getFeatureGroup(), sf.getScore());
2344         copyTo.addSequenceFeature(copy);
2345         count++;
2346       }
2347     }
2348     return count;
2349   }
2350
2351   /**
2352    * Returns a mapping from dna to protein by inspecting sequence features of
2353    * type "CDS" on the dna. A mapping is constructed if the total CDS feature
2354    * length is 3 times the peptide length (optionally after dropping a trailing
2355    * stop codon). This method does not check whether the CDS nucleotide sequence
2356    * translates to the peptide sequence.
2357    *
2358    * @param dnaSeq
2359    * @param proteinSeq
2360    * @return
2361    */
2362   public static MapList mapCdsToProtein(SequenceI dnaSeq,
2363           SequenceI proteinSeq)
2364   {
2365     List<int[]> ranges = findCdsPositions(dnaSeq);
2366     int mappedDnaLength = MappingUtils.getLength(ranges);
2367
2368     /*
2369      * if not a whole number of codons, truncate mapping
2370      */
2371     int codonRemainder = mappedDnaLength % CODON_LENGTH;
2372     if (codonRemainder > 0)
2373     {
2374       mappedDnaLength -= codonRemainder;
2375       MappingUtils.removeEndPositions(codonRemainder, ranges);
2376     }
2377
2378     int proteinLength = proteinSeq.getLength();
2379     int proteinStart = proteinSeq.getStart();
2380     int proteinEnd = proteinSeq.getEnd();
2381
2382     /*
2383      * incomplete start codon may mean X at start of peptide
2384      * we ignore both for mapping purposes
2385      */
2386     if (proteinSeq.getCharAt(0) == 'X')
2387     {
2388       // todo JAL-2022 support startPhase > 0
2389       proteinStart++;
2390       proteinLength--;
2391     }
2392     List<int[]> proteinRange = new ArrayList<>();
2393
2394     /*
2395      * dna length should map to protein (or protein plus stop codon)
2396      */
2397     int codesForResidues = mappedDnaLength / CODON_LENGTH;
2398     if (codesForResidues == (proteinLength + 1))
2399     {
2400       // assuming extra codon is for STOP and not in peptide
2401       // todo: check trailing codon is indeed a STOP codon
2402       codesForResidues--;
2403       mappedDnaLength -= CODON_LENGTH;
2404       MappingUtils.removeEndPositions(CODON_LENGTH, ranges);
2405     }
2406
2407     if (codesForResidues == proteinLength)
2408     {
2409       proteinRange.add(new int[] { proteinStart, proteinEnd });
2410       return new MapList(ranges, proteinRange, CODON_LENGTH, 1);
2411     }
2412     return null;
2413   }
2414
2415   /**
2416    * Returns a list of CDS ranges found (as sequence positions base 1), i.e. of
2417    * [start, end] positions of sequence features of type "CDS" (or a sub-type of
2418    * CDS in the Sequence Ontology). The ranges are sorted into ascending start
2419    * position order, so this method is only valid for linear CDS in the same
2420    * sense as the protein product.
2421    *
2422    * @param dnaSeq
2423    * @return
2424    */
2425   protected static List<int[]> findCdsPositions(SequenceI dnaSeq)
2426   {
2427     List<int[]> result = new ArrayList<>();
2428
2429     List<SequenceFeature> sfs = dnaSeq.getFeatures()
2430             .getFeaturesByOntology(SequenceOntologyI.CDS);
2431     if (sfs.isEmpty())
2432     {
2433       return result;
2434     }
2435     SequenceFeatures.sortFeatures(sfs, true);
2436
2437     for (SequenceFeature sf : sfs)
2438     {
2439       int phase = 0;
2440       try
2441       {
2442         String s = sf.getPhase();
2443         if (s != null)
2444         {
2445           phase = Integer.parseInt(s);
2446         }
2447       } catch (NumberFormatException e)
2448       {
2449         // leave as zero
2450       }
2451       /*
2452        * phase > 0 on first codon means 5' incomplete - skip to the start
2453        * of the next codon; example ENST00000496384
2454        */
2455       int begin = sf.getBegin();
2456       int end = sf.getEnd();
2457       if (result.isEmpty() && phase > 0)
2458       {
2459         begin += phase;
2460         if (begin > end)
2461         {
2462           // shouldn't happen!
2463           System.err
2464                   .println("Error: start phase extends beyond start CDS in "
2465                           + dnaSeq.getName());
2466         }
2467       }
2468       result.add(new int[] { begin, end });
2469     }
2470
2471     /*
2472      * Finally sort ranges by start position. This avoids a dependency on
2473      * keeping features in order on the sequence (if they are in order anyway,
2474      * the sort will have almost no work to do). The implicit assumption is CDS
2475      * ranges are assembled in order. Other cases should not use this method,
2476      * but instead construct an explicit mapping for CDS (e.g. EMBL parsing).
2477      */
2478     Collections.sort(result, IntRangeComparator.ASCENDING);
2479     return result;
2480   }
2481
2482   /**
2483    * Makes an alignment with a copy of the given sequences, adding in any
2484    * non-redundant sequences which are mapped to by the cross-referenced
2485    * sequences.
2486    *
2487    * @param seqs
2488    * @param xrefs
2489    * @param dataset
2490    *          the alignment dataset shared by the new copy
2491    * @return
2492    */
2493   public static AlignmentI makeCopyAlignment(SequenceI[] seqs,
2494           SequenceI[] xrefs, AlignmentI dataset)
2495   {
2496     AlignmentI copy = new Alignment(new Alignment(seqs));
2497     copy.setDataset(dataset);
2498     boolean isProtein = !copy.isNucleotide();
2499     SequenceIdMatcher matcher = new SequenceIdMatcher(seqs);
2500     if (xrefs != null)
2501     {
2502       // BH 2019.01.25 recoded to remove iterators
2503
2504       for (int ix = 0, nx = xrefs.length; ix < nx; ix++)
2505       {
2506         SequenceI xref = xrefs[ix];
2507         List<DBRefEntry> dbrefs = xref.getDBRefs();
2508         if (dbrefs != null)
2509         {
2510           for (int ir = 0, nir = dbrefs.size(); ir < nir; ir++)
2511           {
2512             DBRefEntry dbref = dbrefs.get(ir);
2513             Mapping map = dbref.getMap();
2514             SequenceI mto;
2515             if (map == null || (mto = map.getTo()) == null
2516                     || mto.isProtein() != isProtein)
2517             {
2518               continue;
2519             }
2520             SequenceI mappedTo = mto;
2521             SequenceI match = matcher.findIdMatch(mappedTo);
2522             if (match == null)
2523             {
2524               matcher.add(mappedTo);
2525               copy.addSequence(mappedTo);
2526             }
2527           }
2528         }
2529       }
2530     }
2531     return copy;
2532   }
2533
2534   /**
2535    * Try to align sequences in 'unaligned' to match the alignment of their
2536    * mapped regions in 'aligned'. For example, could use this to align CDS
2537    * sequences which are mapped to their parent cDNA sequences.
2538    *
2539    * This method handles 1:1 mappings (dna-to-dna or protein-to-protein). For
2540    * dna-to-protein or protein-to-dna use alternative methods.
2541    *
2542    * @param unaligned
2543    *          sequences to be aligned
2544    * @param aligned
2545    *          holds aligned sequences and their mappings
2546    * @return
2547    */
2548   public static int alignAs(AlignmentI unaligned, AlignmentI aligned)
2549   {
2550     /*
2551      * easy case - aligning a copy of aligned sequences
2552      */
2553     if (alignAsSameSequences(unaligned, aligned))
2554     {
2555       return unaligned.getHeight();
2556     }
2557
2558     /*
2559      * fancy case - aligning via mappings between sequences
2560      */
2561     List<SequenceI> unmapped = new ArrayList<>();
2562     Map<Integer, Map<SequenceI, Character>> columnMap = buildMappedColumnsMap(
2563             unaligned, aligned, unmapped);
2564     int width = columnMap.size();
2565     char gap = unaligned.getGapCharacter();
2566     int realignedCount = 0;
2567     // TODO: verify this loop scales sensibly for very wide/high alignments
2568
2569     for (SequenceI seq : unaligned.getSequences())
2570     {
2571       if (!unmapped.contains(seq))
2572       {
2573         char[] newSeq = new char[width];
2574         Arrays.fill(newSeq, gap); // JBPComment - doubt this is faster than the
2575                                   // Integer iteration below
2576         int newCol = 0;
2577         int lastCol = 0;
2578
2579         /*
2580          * traverse the map to find columns populated
2581          * by our sequence
2582          */
2583         for (Integer column : columnMap.keySet())
2584         {
2585           Character c = columnMap.get(column).get(seq);
2586           if (c != null)
2587           {
2588             /*
2589              * sequence has a character at this position
2590              *
2591              */
2592             newSeq[newCol] = c;
2593             lastCol = newCol;
2594           }
2595           newCol++;
2596         }
2597
2598         /*
2599          * trim trailing gaps
2600          */
2601         if (lastCol < width)
2602         {
2603           char[] tmp = new char[lastCol + 1];
2604           System.arraycopy(newSeq, 0, tmp, 0, lastCol + 1);
2605           newSeq = tmp;
2606         }
2607         // TODO: optimise SequenceI to avoid char[]->String->char[]
2608         seq.setSequence(String.valueOf(newSeq));
2609         realignedCount++;
2610       }
2611     }
2612     return realignedCount;
2613   }
2614
2615   /**
2616    * If unaligned and aligned sequences share the same dataset sequences, then
2617    * simply copies the aligned sequences to the unaligned sequences and returns
2618    * true; else returns false
2619    *
2620    * @param unaligned
2621    *          - sequences to be aligned based on aligned
2622    * @param aligned
2623    *          - 'guide' alignment containing sequences derived from same dataset
2624    *          as unaligned
2625    * @return
2626    */
2627   static boolean alignAsSameSequences(AlignmentI unaligned,
2628           AlignmentI aligned)
2629   {
2630     if (aligned.getDataset() == null || unaligned.getDataset() == null)
2631     {
2632       return false; // should only pass alignments with datasets here
2633     }
2634
2635     // map from dataset sequence to alignment sequence(s)
2636     Map<SequenceI, List<SequenceI>> alignedDatasets = new HashMap<>();
2637     for (SequenceI seq : aligned.getSequences())
2638     {
2639       SequenceI ds = seq.getDatasetSequence();
2640       if (alignedDatasets.get(ds) == null)
2641       {
2642         alignedDatasets.put(ds, new ArrayList<SequenceI>());
2643       }
2644       alignedDatasets.get(ds).add(seq);
2645     }
2646
2647     /*
2648      * first pass - check whether all sequences to be aligned share a
2649      * dataset sequence with an aligned sequence; also note the leftmost
2650      * ungapped column from which to copy
2651      */
2652     int leftmost = Integer.MAX_VALUE;
2653     for (SequenceI seq : unaligned.getSequences())
2654     {
2655       final SequenceI ds = seq.getDatasetSequence();
2656       if (!alignedDatasets.containsKey(ds))
2657       {
2658         return false;
2659       }
2660       SequenceI alignedSeq = alignedDatasets.get(ds).get(0);
2661       int startCol = alignedSeq.findIndex(seq.getStart()); // 1..
2662       leftmost = Math.min(leftmost, startCol);
2663     }
2664
2665     /*
2666      * second pass - copy aligned sequences;
2667      * heuristic rule: pair off sequences in order for the case where
2668      * more than one shares the same dataset sequence
2669      */
2670     final char gapCharacter = aligned.getGapCharacter();
2671     for (SequenceI seq : unaligned.getSequences())
2672     {
2673       List<SequenceI> alignedSequences = alignedDatasets
2674               .get(seq.getDatasetSequence());
2675       if (alignedSequences.isEmpty())
2676       {
2677         /*
2678          * defensive check - shouldn't happen! (JAL-3536)
2679          */
2680         continue;
2681       }
2682       SequenceI alignedSeq = alignedSequences.get(0);
2683
2684       /*
2685        * gap fill for leading (5') UTR if any
2686        */
2687       // TODO this copies intron columns - wrong!
2688       int startCol = alignedSeq.findIndex(seq.getStart()); // 1..
2689       int endCol = alignedSeq.findIndex(seq.getEnd());
2690       char[] seqchars = new char[endCol - leftmost + 1];
2691       Arrays.fill(seqchars, gapCharacter);
2692       char[] toCopy = alignedSeq.getSequence(startCol - 1, endCol);
2693       System.arraycopy(toCopy, 0, seqchars, startCol - leftmost,
2694               toCopy.length);
2695       seq.setSequence(String.valueOf(seqchars));
2696       if (alignedSequences.size() > 0)
2697       {
2698         // pop off aligned sequences (except the last one)
2699         alignedSequences.remove(0);
2700       }
2701     }
2702
2703     /*
2704      * finally remove gapped columns (e.g. introns)
2705      */
2706     new RemoveGapColCommand("", unaligned.getSequencesArray(), 0,
2707             unaligned.getWidth() - 1, unaligned);
2708
2709     return true;
2710   }
2711
2712   /**
2713    * Returns a map whose key is alignment column number (base 1), and whose
2714    * values are a map of sequence characters in that column.
2715    *
2716    * @param unaligned
2717    * @param aligned
2718    * @param unmapped
2719    * @return
2720    */
2721   static SortedMap<Integer, Map<SequenceI, Character>> buildMappedColumnsMap(
2722           AlignmentI unaligned, AlignmentI aligned,
2723           List<SequenceI> unmapped)
2724   {
2725     /*
2726      * Map will hold, for each aligned column position, a map of
2727      * {unalignedSequence, characterPerSequence} at that position.
2728      * TreeMap keeps the entries in ascending column order.
2729      */
2730     SortedMap<Integer, Map<SequenceI, Character>> map = new TreeMap<>();
2731
2732     /*
2733      * record any sequences that have no mapping so can't be realigned
2734      */
2735     unmapped.addAll(unaligned.getSequences());
2736
2737     List<AlignedCodonFrame> mappings = aligned.getCodonFrames();
2738
2739     for (SequenceI seq : unaligned.getSequences())
2740     {
2741       for (AlignedCodonFrame mapping : mappings)
2742       {
2743         SequenceI fromSeq = mapping.findAlignedSequence(seq, aligned);
2744         if (fromSeq != null)
2745         {
2746           Mapping seqMap = mapping.getMappingBetween(fromSeq, seq);
2747           if (addMappedPositions(seq, fromSeq, seqMap, map))
2748           {
2749             unmapped.remove(seq);
2750           }
2751         }
2752       }
2753     }
2754     return map;
2755   }
2756
2757   /**
2758    * Helper method that adds to a map the mapped column positions of a sequence.
2759    * <br>
2760    * For example if aaTT-Tg-gAAA is mapped to TTTAAA then the map should record
2761    * that columns 3,4,6,10,11,12 map to characters T,T,T,A,A,A of the mapped to
2762    * sequence.
2763    *
2764    * @param seq
2765    *          the sequence whose column positions we are recording
2766    * @param fromSeq
2767    *          a sequence that is mapped to the first sequence
2768    * @param seqMap
2769    *          the mapping from 'fromSeq' to 'seq'
2770    * @param map
2771    *          a map to add the column positions (in fromSeq) of the mapped
2772    *          positions of seq
2773    * @return
2774    */
2775   static boolean addMappedPositions(SequenceI seq, SequenceI fromSeq,
2776           Mapping seqMap, Map<Integer, Map<SequenceI, Character>> map)
2777   {
2778     if (seqMap == null)
2779     {
2780       return false;
2781     }
2782
2783     /*
2784      * invert mapping if it is from unaligned to aligned sequence
2785      */
2786     if (seqMap.getTo() == fromSeq.getDatasetSequence())
2787     {
2788       seqMap = new Mapping(seq.getDatasetSequence(),
2789               seqMap.getMap().getInverse());
2790     }
2791
2792     int toStart = seq.getStart();
2793
2794     /*
2795      * traverse [start, end, start, end...] ranges in fromSeq
2796      */
2797     for (int[] fromRange : seqMap.getMap().getFromRanges())
2798     {
2799       for (int i = 0; i < fromRange.length - 1; i += 2)
2800       {
2801         boolean forward = fromRange[i + 1] >= fromRange[i];
2802
2803         /*
2804          * find the range mapped to (sequence positions base 1)
2805          */
2806         int[] range = seqMap.locateMappedRange(fromRange[i],
2807                 fromRange[i + 1]);
2808         if (range == null)
2809         {
2810           jalview.bin.Console.errPrintln("Error in mapping " + seqMap
2811                   + " from " + fromSeq.getName());
2812           return false;
2813         }
2814         int fromCol = fromSeq.findIndex(fromRange[i]);
2815         int mappedCharPos = range[0];
2816
2817         /*
2818          * walk over the 'from' aligned sequence in forward or reverse
2819          * direction; when a non-gap is found, record the column position
2820          * of the next character of the mapped-to sequence; stop when all
2821          * the characters of the range have been counted
2822          */
2823         while (mappedCharPos <= range[1] && fromCol <= fromSeq.getLength()
2824                 && fromCol >= 0)
2825         {
2826           if (!Comparison.isGap(fromSeq.getCharAt(fromCol - 1)))
2827           {
2828             /*
2829              * mapped from sequence has a character in this column
2830              * record the column position for the mapped to character
2831              */
2832             Map<SequenceI, Character> seqsMap = map.get(fromCol);
2833             if (seqsMap == null)
2834             {
2835               seqsMap = new HashMap<>();
2836               map.put(fromCol, seqsMap);
2837             }
2838             seqsMap.put(seq, seq.getCharAt(mappedCharPos - toStart));
2839             mappedCharPos++;
2840           }
2841           fromCol += (forward ? 1 : -1);
2842         }
2843       }
2844     }
2845     return true;
2846   }
2847
2848   // strictly temporary hack until proper criteria for aligning protein to cds
2849   // are in place; this is so Ensembl -> fetch xrefs Uniprot aligns the Uniprot
2850   public static boolean looksLikeEnsembl(AlignmentI alignment)
2851   {
2852     for (SequenceI seq : alignment.getSequences())
2853     {
2854       String name = seq.getName();
2855       if (!name.startsWith("ENSG") && !name.startsWith("ENST"))
2856       {
2857         return false;
2858       }
2859     }
2860     return true;
2861   }
2862
2863
2864   public static List<String> getSecondaryStructureSources(AlignmentAnnotation[] annotations) {
2865
2866       List<String> ssSources = new ArrayList<>();
2867       Set<String> addedLabels = new HashSet<>(); // to keep track of added labels
2868
2869       for (AlignmentAnnotation annotation : annotations) {
2870           String label = annotation.label;
2871           if (SECONDARY_STRUCTURE_LABELS.containsKey(label) && !addedLabels.contains(label)) {
2872               ssSources.add(SECONDARY_STRUCTURE_LABELS.get(label));
2873               addedLabels.add(label); // Add the label to the set
2874           }
2875       }
2876
2877       return ssSources;
2878   }
2879
2880   public static boolean isSecondaryStructurePresent(AlignmentAnnotation[] annotations)
2881   {
2882     boolean ssPresent = false;
2883
2884     for (AlignmentAnnotation aa : annotations)
2885     {
2886       if(ssPresent) {
2887         break;
2888       }
2889
2890       if (SECONDARY_STRUCTURE_LABELS.containsKey(aa.label)) {
2891           ssPresent = true;
2892           break;
2893       }
2894     }
2895
2896     return ssPresent;
2897
2898   }
2899 }