From 76df8bea6b86d373af576a25c9f156495c5f47ec Mon Sep 17 00:00:00 2001 From: jprocter Date: Wed, 6 Feb 2013 16:43:45 +0000 Subject: [PATCH] JAL-1265 basic support for parsing database accessions from Stockholm JAL-851 - guess the primary sequence accession based on RFAM or PFAM alignment accession code --- src/jalview/io/StockholmFile.java | 115 +++++++++++++++++++++++++++++++++---- 1 file changed, 104 insertions(+), 11 deletions(-) diff --git a/src/jalview/io/StockholmFile.java b/src/jalview/io/StockholmFile.java index ff2b4fa..02ab5c1 100644 --- a/src/jalview/io/StockholmFile.java +++ b/src/jalview/io/StockholmFile.java @@ -146,7 +146,7 @@ public class StockholmFile extends AlignFile // End of the alignment, pass stuff back this.noSeqs = seqs.size(); - String propety = null; + String seqdb,dbsource = null; Regex pf = new Regex("PF[0-9]{5}(.*)"); // Finds AC for Pfam Regex rf = new Regex("RF[0-9]{5}(.*)"); // Finds AC for Rfam if (getAlignmentProperty("AC") != null) @@ -154,11 +154,12 @@ public class StockholmFile extends AlignFile String dbType = getAlignmentProperty("AC").toString(); if (pf.search(dbType)) { - propety = "PFAM"; + // PFAM Alignment - so references are typically from Uniprot + dbsource = "PFAM"; } else if (rf.search(dbType)) { - propety = "RFAM"; + dbsource = "RFAM"; } } // logger.debug("Number of sequences: " + this.noSeqs); @@ -216,16 +217,19 @@ public class StockholmFile extends AlignFile } } - if (accAnnotations != null && accAnnotations.containsKey("AC") - && propety != null) + if (accAnnotations != null && accAnnotations.containsKey("AC")) { - String dbr = (String) accAnnotations.get("AC"); - if (dbr != null) + if (dbsource != null) { - String src = propety; - String acn = dbr.toString(); - jalview.util.DBRefUtils.parseToDbRef(seqO, src, "0", acn); - } + String dbr = (String) accAnnotations.get("AC"); + if (dbr != null) + { + // we could get very clever here - but for now - just try to guess accession type from source of alignment plus structure of accession + guessDatabaseFor(seqO, dbr, dbsource); + + } + } + // else - do what ? add the data anyway and prompt the user to specify what references these are ? } Hashtable features = null; @@ -563,6 +567,95 @@ public class StockholmFile extends AlignFile } } + /** + * Demangle an accession string and guess the originating sequence database for a given sequence + * @param seqO sequence to be annotated + * @param dbr Accession string for sequence + * @param dbsource source database for alignment (PFAM or RFAM) + */ + private void guessDatabaseFor(Sequence seqO, String dbr, String dbsource) + { + DBRefEntry dbrf=null; + List dbrs=new ArrayList(); + String seqdb="Unknown",sdbac=""+dbr; + int st=-1,en=-1,p; + if ((st=sdbac.indexOf("/"))>-1) + { + String num,range=sdbac.substring(st+1); + sdbac = sdbac.substring(0,st); + if ((p=range.indexOf("-"))>-1) + { + p++; + if (p-1) + { + // strip of last subdomain + sdbac = sdbac.substring(0,sdbac.indexOf(".")); + dbrf = jalview.util.DBRefUtils.parseToDbRef(seqO, seqdb, dbsource, sdbac); + if (dbrf!=null) + { + dbrs.add(dbrf); + } + } + dbrf = jalview.util.DBRefUtils.parseToDbRef(seqO, dbsource, dbsource, dbr); + if (dbr!=null) + { + dbrs.add(dbrf); + } + } else { + seqdb = "EMBL"; // total guess - could be ENA, or something else these days + if (sdbac.indexOf(".")>-1) + { + // strip off last subdomain + sdbac = sdbac.substring(0,sdbac.indexOf(".")); + dbrf = jalview.util.DBRefUtils.parseToDbRef(seqO, seqdb, dbsource, sdbac); + if (dbrf!=null) + { + dbrs.add(dbrf); + } + } + + dbrf = jalview.util.DBRefUtils.parseToDbRef(seqO, dbsource, dbsource, dbr); + if (dbrf!=null) + { + dbrs.add(dbrf); + } + } + if (st!=-1 && en!=-1) + { + for (DBRefEntry d:dbrs) + { + jalview.util.MapList mp = new jalview.util.MapList(new int[] { seqO.getStart(),seqO.getEnd()},new int[] { st,en},1,1); + jalview.datamodel.Mapping mping = new Mapping(mp); + d.setMap(mping); + } + } + } + protected static AlignmentAnnotation parseAnnotationRow( Vector annotation, String label, String annots) { -- 1.7.10.2