X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fanalysis%2FCrossRef.java;h=629850511fefc57400d34c9e07753c938e442c3e;hb=797df64fa2a0a30773d0f48f5494d4155e5a8be3;hp=f475ecb4cc6c9fa5b96446ab44e562a42d2eb9b0;hpb=6906bad46d9a65df68c979fc2afe7a1c73cbcec5;p=jalview.git diff --git a/src/jalview/analysis/CrossRef.java b/src/jalview/analysis/CrossRef.java index f475ecb..6298505 100644 --- a/src/jalview/analysis/CrossRef.java +++ b/src/jalview/analysis/CrossRef.java @@ -1,3 +1,20 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer (Version 2.7) + * Copyright (C) 2011 J Procter, AM Waterhouse, J Engelhardt, LM Lui, G Barton, M Clamp, S Searle + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with Jalview. If not, see . + */ package jalview.analysis; import java.util.Enumeration; @@ -11,8 +28,8 @@ import jalview.datamodel.DBRefSource; import jalview.datamodel.DBRefEntry; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceI; -import jalview.ws.ASequenceFetcher; import jalview.ws.SequenceFetcher; +import jalview.ws.seqfetcher.ASequenceFetcher; /** * Functions for cross-referencing sequence databases. user must first specify @@ -39,20 +56,22 @@ public class CrossRef else { rfs = jalview.util.DBRefUtils.selectRefs(rfs, - DBRefSource.DNACODINGDBS); // could attempt to find other cross refs and return here - ie PDB xrefs (not dna, not protein seq) + DBRefSource.DNACODINGDBS); // could attempt to find other cross + // refs and return here - ie PDB xrefs + // (not dna, not protein seq) } return rfs; } - public static Hashtable classifyDbRefs(DBRefEntry[] rfs) { Hashtable classes = new Hashtable(); - classes.put(DBRefSource.PROTEINDBS, jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS)); - classes.put(DBRefSource.DNACODINGDBS, jalview.util.DBRefUtils.selectRefs(rfs, - DBRefSource.DNACODINGDBS)); - classes.put(DBRefSource.DOMAINDBS, jalview.util.DBRefUtils.selectRefs(rfs, - DBRefSource.DOMAINDBS)); + classes.put(DBRefSource.PROTEINDBS, + jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.PROTEINDBS)); + classes.put(DBRefSource.DNACODINGDBS, jalview.util.DBRefUtils + .selectRefs(rfs, DBRefSource.DNACODINGDBS)); + classes.put(DBRefSource.DOMAINDBS, + jalview.util.DBRefUtils.selectRefs(rfs, DBRefSource.DOMAINDBS)); // classes.put(OTHER, ) return classes; } @@ -67,49 +86,58 @@ public class CrossRef { return findSequenceXrefTypes(dna, seqs, null); } + /** - * Indirect references are references from other sequences from the dataset to any of the direct - * DBRefEntrys on the given sequences. + * Indirect references are references from other sequences from the dataset to + * any of the direct DBRefEntrys on the given sequences. + * * @param dna * true if seqs are DNA seqs * @param seqs * @return a list of sequence database cross reference source types */ - public static String[] findSequenceXrefTypes(boolean dna, SequenceI[] seqs, AlignmentI dataset) + public static String[] findSequenceXrefTypes(boolean dna, + SequenceI[] seqs, AlignmentI dataset) { String[] dbrefs = null; Vector refs = new Vector(); for (int s = 0; s < seqs.length; s++) { - SequenceI dss = seqs[s]; - while (dss.getDatasetSequence()!=null) - { - dss = dss.getDatasetSequence(); - } - DBRefEntry[] rfs = findXDbRefs(dna, dss.getDBRef()); - for (int r = 0; rfs != null && r < rfs.length; r++) + if (seqs[s] != null) { - if (!refs.contains(rfs[r].getSource())) + + SequenceI dss = seqs[s]; + while (dss.getDatasetSequence() != null) { - refs.addElement(rfs[r].getSource()); + dss = dss.getDatasetSequence(); } - } - if (dataset!=null) - { - // search for references to this sequence's direct references. - DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seqs[s].getDBRef()); - Vector rseqs = new Vector(); - CrossRef.searchDatasetXrefs(seqs[s], !dna, lrfs, dataset, rseqs, null); // don't need to specify codon frame for mapping here - Enumeration lr = rseqs.elements(); - while (lr.hasMoreElements()) + DBRefEntry[] rfs = findXDbRefs(dna, dss.getDBRef()); + for (int r = 0; rfs != null && r < rfs.length; r++) { - SequenceI rs = (SequenceI) lr.nextElement(); - DBRefEntry[] xrs = findXDbRefs(dna, rs.getDBRef()); - for (int r=0; rfs != null && r < rfs.length; r++) + if (!refs.contains(rfs[r].getSource())) { - if (!refs.contains(rfs[r].getSource())) + refs.addElement(rfs[r].getSource()); + } + } + if (dataset != null) + { + // search for references to this sequence's direct references. + DBRefEntry[] lrfs = CrossRef + .findXDbRefs(!dna, seqs[s].getDBRef()); + Vector rseqs = new Vector(); + CrossRef.searchDatasetXrefs(seqs[s], !dna, lrfs, dataset, rseqs, + null); // don't need to specify codon frame for mapping here + Enumeration lr = rseqs.elements(); + while (lr.hasMoreElements()) + { + SequenceI rs = (SequenceI) lr.nextElement(); + DBRefEntry[] xrs = findXDbRefs(dna, rs.getDBRef()); + for (int r = 0; rfs != null && r < rfs.length; r++) { - refs.addElement(rfs[r].getSource()); + if (!refs.contains(rfs[r].getSource())) + { + refs.addElement(rfs[r].getSource()); + } } } } @@ -152,7 +180,9 @@ public class CrossRef { if (cdna[c].getSource().equals(DBRefSource.EMBLCDS)) { - // retrieve CDS dataset sequences + System.err + .println("TODO: unimplemented sequence retrieval for coding region sequence."); + // TODO: retrieve CDS dataset sequences // need global dataset sequence retriever/resolver to reuse refs // and construct Mapping entry. // insert gaps in CDS according to peptide gaps. @@ -196,24 +226,32 @@ public class CrossRef { Vector rseqs = new Vector(); Alignment ral = null; - AlignedCodonFrame cf=new AlignedCodonFrame(0); // nominal width + AlignedCodonFrame cf = new AlignedCodonFrame(0); // nominal width for (int s = 0; s < seqs.length; s++) { SequenceI dss = seqs[s]; - while (dss.getDatasetSequence()!=null) + while (dss.getDatasetSequence() != null) { dss = dss.getDatasetSequence(); } boolean found = false; DBRefEntry[] xrfs = CrossRef.findXDbRefs(dna, dss.getDBRef()); - if ((xrfs == null || xrfs.length == 0) && dataset!=null) + if ((xrfs == null || xrfs.length == 0) && dataset != null) { System.out.println("Attempting to find ds Xrefs refs."); - DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seqs[s].getDBRef()); // less ambiguous would be a 'find primary dbRefEntry' method. + DBRefEntry[] lrfs = CrossRef.findXDbRefs(!dna, seqs[s].getDBRef()); // less + // ambiguous + // would + // be a + // 'find + // primary + // dbRefEntry' + // method. // filter for desired source xref here - found = CrossRef.searchDatasetXrefs(dss, !dna, lrfs, dataset, rseqs, cf); + found = CrossRef.searchDatasetXrefs(dss, !dna, lrfs, dataset, + rseqs, cf); } - for (int r = 0; xrfs!=null && r < xrfs.length; r++) + for (int r = 0; xrfs != null && r < xrfs.length; r++) { if (source != null && !source.equals(xrfs[r].getSource())) continue; @@ -223,14 +261,17 @@ public class CrossRef { Sequence rsq = new Sequence(xrfs[r].getMap().getTo()); rseqs.addElement(rsq); - if (xrfs[r].getMap().getMap().getFromRatio()!=xrfs[r].getMap().getMap().getToRatio()) + if (xrfs[r].getMap().getMap().getFromRatio() != xrfs[r] + .getMap().getMap().getToRatio()) { // get sense of map correct for adding to product alignment. if (dna) { // map is from dna seq to a protein product cf.addMap(dss, rsq, xrfs[r].getMap().getMap()); - } else { + } + else + { // map should be from protein seq to its coding dna cf.addMap(rsq, dss, xrfs[r].getMap().getMap().getInverse()); } @@ -238,13 +279,13 @@ public class CrossRef found = true; } } - else + if (!found) { // do a bit more work - search for sequences with references matching // xrefs on this sequence. if (dataset != null) { - found = searchDataset(dss, xrfs[r], dataset, rseqs, cf); + found |= searchDataset(dss, xrfs[r], dataset, rseqs, cf); // ,false,!dna); if (found) xrfs[r] = null; // we've recovered seqs for this one. } @@ -265,8 +306,10 @@ public class CrossRef for (int r = 0; r < xrfs.length; r++) { // filter out any irrelevant or irretrievable references - if (xrfs[r]==null || ((source != null && !source.equals(xrfs[r].getSource())) - || !sftch.isFetchable(xrfs[r].getSource()))) + if (xrfs[r] == null + || ((source != null && !source.equals(xrfs[r] + .getSource())) || !sftch.isFetchable(xrfs[r] + .getSource()))) { l--; xrfs[r] = null; @@ -275,7 +318,7 @@ public class CrossRef if (l > 0) { System.out - .println("Attempting to retrieve cross referenced sequences."); + .println("Attempting to retrieve cross referenced sequences."); DBRefEntry[] t = new DBRefEntry[l]; l = 0; for (int r = 0; r < xrfs.length; r++) @@ -286,18 +329,68 @@ public class CrossRef xrfs = t; try { - retrieved = sftch.getSequences(xrfs); + retrieved = sftch.getSequences(xrfs); // problem here is we don't + // know which of xrfs + // resulted in which + // retrieved element } catch (Exception e) { System.err - .println("Problem whilst retrieving cross references for Sequence : " - + seqs[s].getName()); + .println("Problem whilst retrieving cross references for Sequence : " + + seqs[s].getName()); e.printStackTrace(); } if (retrieved != null) { for (int rs = 0; rs < retrieved.length; rs++) { + // TODO: examine each sequence for 'redundancy' + jalview.datamodel.DBRefEntry[] dbr = retrieved[rs] + .getDBRef(); + if (dbr != null && dbr.length > 0) + { + for (int di = 0; di < dbr.length; di++) + { + // find any entry where we should put in the sequence being + // cross-referenced into the map + jalview.datamodel.Mapping map = dbr[di].getMap(); + if (map != null) + { + if (map.getTo() != null && map.getMap() != null) + { + // should search the local dataset to find any existing + // candidates for To ! + try + { + // compare ms with dss and replace with dss in mapping + // if map is congruent + SequenceI ms = map.getTo(); + int sf = map.getMap().getToLowest(); + int st = map.getMap().getToHighest(); + SequenceI mappedrg = ms.getSubSequence(sf, st); + SequenceI loc = dss.getSubSequence(sf, st); + if (mappedrg.getLength() > 0 + && mappedrg.getSequenceAsString().equals( + loc.getSequenceAsString())) + { + System.err + .println("Mapping updated for retrieved crossreference"); + // method to update all refs of existing To on + // retrieved sequence with dss and merge any props + // on To onto dss. + map.setTo(dss); + } + } catch (Exception e) + { + System.err + .println("Exception when consolidating Mapped sequence set..."); + e.printStackTrace(System.err); + } + } + } + } + } + retrieved[rs].updatePDBIds(); rseqs.addElement(retrieved[rs]); } } @@ -310,7 +403,7 @@ public class CrossRef SequenceI[] rsqs = new SequenceI[rseqs.size()]; rseqs.copyInto(rsqs); ral = new Alignment(rsqs); - if (cf!=null && cf.getProtMappings()!=null) + if (cf != null && cf.getProtMappings() != null) { ral.addCodonFrame(cf); } @@ -319,20 +412,24 @@ public class CrossRef } /** - * find references to lrfs in the cross-reference set of each sequence in dataset (that is not equal to sequenceI) - * Identifies matching DBRefEntry based on source and accession string only - Map and Version are nulled. + * find references to lrfs in the cross-reference set of each sequence in + * dataset (that is not equal to sequenceI) Identifies matching DBRefEntry + * based on source and accession string only - Map and Version are nulled. + * * @param sequenceI * @param lrfs * @param dataset * @param rseqs * @return true if matches were found. */ - private static boolean searchDatasetXrefs(SequenceI sequenceI, boolean dna, DBRefEntry[] lrfs, AlignmentI dataset, Vector rseqs, AlignedCodonFrame cf) + private static boolean searchDatasetXrefs(SequenceI sequenceI, + boolean dna, DBRefEntry[] lrfs, AlignmentI dataset, Vector rseqs, + AlignedCodonFrame cf) { - boolean found=false; - if (lrfs==null) + boolean found = false; + if (lrfs == null) return false; - for (int i=0;i