+ * Tries to add chromosomal coordinates to any nucleotide sequence which does
+ * not already have them. Coordinates are retrieved from Ensembl given an
+ * Ensembl identifier, either on the sequence itself or on a peptide sequence
+ * it has a reference to.
+ *
+ * <pre>
+ * Example (human):
+ * - fetch EMBLCDS cross-references for Uniprot entry P30419
+ * - the EMBL sequences do not have xrefs to Ensembl
+ * - the Uniprot entry has xrefs to
+ * ENSP00000258960, ENSP00000468424, ENST00000258960, ENST00000592782
+ * - either of the transcript ids can be used to retrieve gene loci e.g.
+ * http://rest.ensembl.org/map/cds/ENST00000592782/1..100000
+ * Example (plant, via EnsemblGenomes):
+ * - fetch EMBLCDS cross-references for Uniprot entry Q43517 (FER1_SOLLC)
+ * - the Uniprot entry has an xref to ENSEMBLPLANTS Solyc10g044520.1.1
+ * - can retrieve gene loci with
+ * http://rest.ensemblgenomes.org/map/cds/Solyc10g044520.1.1/1..100000
+ * </pre>
+ *
+ * @param sequences
+ *          the sequences to try to add chromosomal coordinates to
+ */
+ public static void findGeneLoci(List<SequenceI> sequences)
+ {
+   /*
+    * a single map is shared by the lookups for all of the sequences
+    */
+   final Map<DBRefEntry, GeneLociI> lociByXref = new HashMap<>();
+
+   for (SequenceI sequence : sequences)
+   {
+     findGeneLoci(sequence, lociByXref);
+   }
+ }
+
+ /**
+  * Tries to find chromosomal coordinates for the sequence, by searching
+  * first its direct, and then its indirect, cross-references to Ensembl
+  * (any of the divisions reported by EnsemblInfo). Each candidate
+  * cross-reference is passed to fetchGeneLoci together with retrievedLoci,
+  * and the search stops at the first one for which that call answers true.
+  * <p>
+  * Does nothing if the sequence is null, is protein, already has gene loci,
+  * or has no cross-references.
+  *
+  * @param seq
+  *          the sequence to find gene loci for
+  * @param retrievedLoci
+  *          map of gene loci keyed by cross-reference, passed through
+  *          unchanged to fetchGeneLoci
+  */
+ static void findGeneLoci(SequenceI seq,
+         Map<DBRefEntry, GeneLociI> retrievedLoci)
+ {
+   /*
+    * only nucleotide sequences are of interest
+    */
+   if (seq == null || seq.isProtein())
+   {
+     return;
+   }
+
+   /*
+    * don't replace any existing chromosomal coordinates;
+    * and nothing to search if there are no cross-references
+    */
+   if (seq.getGeneLoci() != null || seq.getDBRefs() == null)
+   {
+     return;
+   }
+
+   Set<String> divisions = new EnsemblInfo().getDivisions();
+   String[] ensemblSources = divisions
+           .toArray(new String[divisions.size()]);
+
+   /*
+    * first look for direct dbrefs from sequence to Ensembl
+    */
+   if (fetchFromEnsemblRefs(seq, seq.getDBRefs(), ensemblSources,
+           retrievedLoci))
+   {
+     return;
+   }
+
+   /*
+    * else look for indirect dbrefs from sequence to Ensembl, i.e. those
+    * held on a sequence which one of this sequence's dbrefs maps to
+    */
+   for (DBRefEntry xref : seq.getDBRefs())
+   {
+     if (xref.getMap() == null || xref.getMap().getTo() == null)
+     {
+       continue;
+     }
+     DBRefEntry[] mappedToRefs = xref.getMap().getTo().getDBRefs();
+     if (fetchFromEnsemblRefs(seq, mappedToRefs, ensemblSources,
+             retrievedLoci))
+     {
+       return;
+     }
+   }
+ }
+
+ /**
+  * Picks out of refs the entries chosen by DBRefUtils.selectRefs for the
+  * given sources, and calls fetchGeneLoci for each in turn until one call
+  * answers true. Answers true if that happened, or false if no entry was
+  * selected or every call answered false.
+  *
+  * @param seq
+  *          the sequence to find gene loci for
+  * @param refs
+  *          the cross-references to choose from
+  * @param ensemblSources
+  *          the source names to select by
+  * @param retrievedLoci
+  *          passed through to fetchGeneLoci
+  * @return
+  */
+ private static boolean fetchFromEnsemblRefs(SequenceI seq,
+         DBRefEntry[] refs, String[] ensemblSources,
+         Map<DBRefEntry, GeneLociI> retrievedLoci)
+ {
+   DBRefEntry[] ensemblRefs = DBRefUtils.selectRefs(refs, ensemblSources);
+   if (ensemblRefs == null)
+   {
+     return false;
+   }
+   for (DBRefEntry ensemblRef : ensemblRefs)
+   {
+     if (fetchGeneLoci(seq, ensemblRef, retrievedLoci))
+     {
+       return true;
+     }
+   }
+   return false;
+ }
+
+ /**
+  * Finds chromosomal coordinates for the Ensembl (or EnsemblGenomes)
+  * identifier in dbref and, if found, adds them to the sequence.
+  * <ul>
+  * <li>cross-references with no accession, or with an Ensembl protein
+  * ("ENSP") accession, are ignored</li>
+  * <li>if loci are already held in retrievedLoci for this dbref, and their
+  * mapped-from length matches the sequence length, they are reused without
+  * calling the REST service</li>
+  * <li>otherwise a CDS mapping is requested and, failing that, a cDNA
+  * mapping; a result is only accepted if its mapped-from length matches
+  * the sequence length</li>
+  * </ul>
+  * Accepted loci are set on the sequence and stored in retrievedLoci
+  * against dbref. Answers true if successful, else false.
+  *
+  * @param seq
+  *          the sequence to add gene loci to
+  * @param dbref
+  *          cross-reference supplying the Ensembl division (source) and
+  *          identifier (accession)
+  * @param retrievedLoci
+  *          gene loci found so far, keyed by cross-reference
+  * @return true if gene loci were added to the sequence
+  */
+ static boolean fetchGeneLoci(SequenceI seq, DBRefEntry dbref,
+         Map<DBRefEntry, GeneLociI> retrievedLoci)
+ {
+   String accession = dbref.getAccessionId();
+   String division = dbref.getSource();
+
+   /*
+    * hack: ignore cross-references to Ensembl protein ids
+    * (can't fetch chromosomal mapping for these)
+    * todo: is there an equivalent in EnsemblGenomes?
+    */
+   if (accession == null || accession.startsWith("ENSP"))
+   {
+     return false;
+   }
+
+   /*
+    * reuse loci previously retrieved for this cross-reference, if any;
+    * the length check still applies, so a sequence of a different length
+    * falls through to a fresh retrieval
+    */
+   GeneLociI known = retrievedLoci == null ? null
+           : retrievedLoci.get(dbref);
+   if (applyLociIfLengthMatches(seq, dbref, known, retrievedLoci))
+   {
+     return true;
+   }
+
+   EnsemblMap mapper = new EnsemblMap();
+
+   /*
+    * try CDS mapping first
+    */
+   GeneLociI cdsLoci = mapper.getCdsMapping(division, accession, 1,
+           seq.getLength());
+   if (applyLociIfLengthMatches(seq, dbref, cdsLoci, retrievedLoci))
+   {
+     return true;
+   }
+
+   /*
+    * else try CDNA mapping
+    */
+   GeneLociI cdnaLoci = mapper.getCdnaMapping(division, accession, 1,
+           seq.getLength());
+   return applyLociIfLengthMatches(seq, dbref, cdnaLoci, retrievedLoci);
+ }
+
+ /**
+  * If geneLoci is not null, and the total length of its map's 'from' ranges
+  * equals the sequence length, sets the loci on the sequence, stores them
+  * in retrievedLoci against dbref, and answers true. Otherwise changes
+  * nothing and answers false.
+  *
+  * @param seq
+  * @param dbref
+  * @param geneLoci
+  *          candidate loci (may be null)
+  * @param retrievedLoci
+  * @return
+  */
+ private static boolean applyLociIfLengthMatches(SequenceI seq,
+         DBRefEntry dbref, GeneLociI geneLoci,
+         Map<DBRefEntry, GeneLociI> retrievedLoci)
+ {
+   if (geneLoci == null)
+   {
+     return false;
+   }
+   MapList map = geneLoci.getMap();
+   int mappedFromLength = MappingUtils.getLength(map.getFromRanges());
+   if (mappedFromLength != seq.getLength())
+   {
+     return false;
+   }
+   seq.setGeneLoci(geneLoci.getSpeciesId(), geneLoci.getAssemblyId(),
+           geneLoci.getChromosomeId(), map);
+   retrievedLoci.put(dbref, geneLoci);
+   return true;
+ }
+
+ /**
+  * Makes the alignment that accompanies the cross-referenced sequences.
+  * <ul>
+  * <li>if dna is true, a CDS alignment is made from the selection, the
+  * dataset and the sequences of xrefsAlignment; if it is empty a dialog is
+  * shown and null is returned. The CDS alignment is aligned as the given
+  * alignment if the source is Ensembl, or the alignment looks like
+  * Ensembl.</li>
+  * <li>otherwise a copy alignment is made from the selection, the sequences
+  * of xrefs, and the dataset</li>
+  * </ul>
+  * The gap character of the alignment frame's viewport is applied to the
+  * result, and the dataset's codon frames are registered with the structure
+  * selection manager. Answers null if the result has no sequences.
+  *
+  * @param alignment
+  *          the alignment to align a CDS alignment to
+  * @param dataset
+  *          supplies the codon frames to register, and is passed to the
+  *          alignment factory methods
+  * @param dna
+  *          true to make a CDS alignment, false to make a plain copy
+  * @param xrefs
+  *          supplies the sequences used when dna is false
+  * @param xrefsAlignment
+  *          supplies the sequences used when dna is true, and may be
+  *          realigned to the result
+  * @return the new alignment, or null if none could be made
+  */
+ protected AlignmentI copyAlignmentForSplitFrame(AlignmentI alignment,
+         AlignmentI dataset, boolean dna, AlignmentI xrefs,
+         AlignmentI xrefsAlignment)
+ {
+   AlignmentI copy;
+   boolean alignedAsCdna = false;
+
+   if (!dna)
+   {
+     copy = AlignmentUtils.makeCopyAlignment(sel,
+             xrefs.getSequencesArray(), dataset);
+   }
+   else
+   {
+     copy = AlignmentUtils.makeCdsAlignment(sel, dataset,
+             xrefsAlignment.getSequencesArray());
+     if (copy.getHeight() == 0)
+     {
+       /*
+        * tell the user, and log, that no CDS could be mapped
+        */
+       JvOptionPane.showMessageDialog(alignFrame,
+               MessageManager.getString("label.cant_map_cds"),
+               MessageManager.getString("label.operation_failed"),
+               JvOptionPane.OK_OPTION);
+       System.err.println("Failed to make CDS alignment");
+       return null;
+     }
+
+     /*
+      * pending getting Embl transcripts to 'align',
+      * we are only doing this for Ensembl
+      */
+     // TODO proper criteria for 'can align as cdna'
+     if (DBRefSource.ENSEMBL.equalsIgnoreCase(source)
+             || AlignmentUtils.looksLikeEnsembl(alignment))
+     {
+       copy.alignAs(alignment);
+       alignedAsCdna = true;
+     }
+   }
+
+   copy.setGapCharacter(alignFrame.viewport.getGapCharacter());
+
+   /*
+    * register any new mappings for sequence mouseover etc
+    * (will not duplicate any previously registered mappings)
+    */
+   StructureSelectionManager
+           .getStructureSelectionManager(Desktop.instance)
+           .registerMappings(dataset.getCodonFrames());
+
+   if (copy.getHeight() <= 0)
+   {
+     System.err.println(
+             "No Sequences generated for xRef type " + source);
+     return null;
+   }
+
+   if (dna)
+   {
+     if (alignedAsCdna)
+     {
+       /*
+        * align protein to dna
+        */
+       xrefsAlignment.alignAs(copy);
+     }
+     else if (DBRefSource.ENSEMBL.equalsIgnoreCase(source))
+     {
+       /*
+        * align cdna to protein - currently only if
+        * fetching and aligning Ensembl transcripts!
+        */
+       // TODO: generalise for other sources of locus/transcript/cds data
+       // NOTE(review): looks unreachable - when dna is true and source is
+       // Ensembl, alignedAsCdna has already been set above; confirm intent
+       copy.alignAs(xrefsAlignment);
+     }
+   }
+
+   return copy;
+ }
+
+ /**