2 // forester -- software libraries and applications
3 // for genomics and evolutionary biology research.
5 // Copyright (C) 2010 Christian M Zmasek
6 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26 package org.forester.ws.seqdb;
28 import java.util.ArrayList;
29 import java.util.List;
30 import java.util.regex.Matcher;
31 import java.util.regex.Pattern;
33 import org.forester.go.GoTerm;
34 import org.forester.phylogeny.data.Accession;
35 import org.forester.util.ForesterUtil;
37 public final class EbiDbEntry implements SequenceDatabaseEntry {
39 public static SequenceDatabaseEntry createInstanceFromPlainText( final List<String> lines ) {
40 final EbiDbEntry e = new EbiDbEntry();
41 for( final String line : lines ) {
42 if ( line.startsWith( "PA" ) ) {
43 e.setPA( SequenceDbWsTools.extractFrom( line, "PA" ) );
45 else if ( line.startsWith( "DE" ) ) {
46 e.setDe( SequenceDbWsTools.extractFrom( line, "DE" ) );
48 else if ( line.startsWith( "OS" ) ) {
49 if ( line.indexOf( "(" ) > 0 ) {
50 e.setOs( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
53 e.setOs( SequenceDbWsTools.extractFrom( line, "OS" ) );
56 else if ( line.startsWith( "OX" ) ) {
57 if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) {
58 e.setTaxId( SequenceDbWsTools.extractFromTo( line, "NCBI_TaxID=", ";" ) );
/**
 * Builds an entry from the lines of a flat-file record that uses the
 * keywords "ACCESSION", "DEFINITION", "ORGANISM" and "FEATURES".
 * <p>
 * NOTE(review): several statements of this method (closing braces, else
 * branches and the bodies of the feature-table checks) are not present in
 * this listing; the comments below describe only the statements that are.
 *
 * @param lines the record, one element per text line
 */
64 public static SequenceDatabaseEntry createInstanceFromPlainTextForRefSeq( final List<String> lines ) {
// Matches a line that begins with one or more upper-case letters.
65 final Pattern X_PATTERN = Pattern.compile( "^[A-Z]+" );
// Patterns for qualifier lines (/chromosome, /map, /gene, /db_xref); each
// captures the quoted value, or the part after the database prefix, in group 1.
66 final Pattern chromosome_PATTERN = Pattern.compile( "\\s+/chromosome=\"(\\w+)\"" );
// NOTE(review): the character class below has no quantifier, so group 1 can
// only ever be a single character; a '+' after the class is probably
// intended -- confirm.
67 final Pattern map_PATTERN = Pattern.compile( "\\s+/map=\"([\\w+\\.])\"" );
68 final Pattern gene_PATTERN = Pattern.compile( "\\s+/gene=\"(.+)\"" );
69 final Pattern mim_xref_PATTERN = Pattern.compile( "\\s+/db_xref=\"MIM:(\\d+)\"" );
70 final Pattern taxon_xref_PATTERN = Pattern.compile( "\\s+/db_xref=\"taxon:(\\d+)\"" );
// NOTE(review): "IP\\d+" requires digits directly after "IP", so it cannot
// match an identifier such as "IPR002475" (see the sample record further
// down in this file) -- probably "IPR\\d+" is intended; confirm.
72 final Pattern interpro_PATTERN = Pattern.compile( "\\s+/db_xref=\"InterPro:(IP\\d+)\"" );
73 final Pattern uniprot_PATTERN = Pattern.compile( "\\s+/db_xref=\"UniProtKB/TrEMBL:(\\w+)\"" );
76 final EbiDbEntry e = new EbiDbEntry();
// Accumulates the definition text, which may be spread over several lines.
77 final StringBuilder def = new StringBuilder();
// Parser state flags; presumably they record which section of the record
// the current line belongs to (their updates are not visible here).
78 boolean in_def = false;
79 boolean in_features = false;
80 boolean in_source = false;
81 boolean in_gene = false;
82 boolean in_cds = false;
83 boolean in_protein = false;
84 for( final String line : lines ) {
// ACCESSION line: the extracted value is stored via setPA.
86 if ( line.startsWith( "ACCESSION " ) ) {
87 e.setPA( SequenceDbWsTools.extractFrom( line, "ACCESSION" ) );
// DEFINITION line: the text is requested up to "[" if the line contains
// one, else up to "." if the line contains one; the unbounded extraction
// below is presumably the fallback branch.
90 else if ( line.startsWith( "DEFINITION " ) ) {
91 if ( line.indexOf( "[" ) > 0 ) {
92 def.append( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) );
94 else if ( line.indexOf( "." ) > 0 ) {
95 def.append( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) );
98 def.append( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) );
// ORGANISM line: stored via setOs, cut at "(" when the line contains one.
102 else if ( line.startsWith( " ORGANISM " ) ) {
103 if ( line.indexOf( "(" ) > 0 ) {
104 e.setOs( SequenceDbWsTools.extractFromTo( line, " ORGANISM", "(" ) );
107 e.setOs( SequenceDbWsTools.extractFrom( line, " ORGANISM" ) );
// Line starting with a space while in_def is set: treated as a continuation
// of the definition, using the same "[" / "." cut-off rules.
111 else if ( line.startsWith( " " ) && in_def ) {
113 if ( line.indexOf( "[" ) > 0 ) {
114 def.append( SequenceDbWsTools.extractTo( line, "[" ) );
116 else if ( line.indexOf( "." ) > 0 ) {
117 def.append( SequenceDbWsTools.extractTo( line, "." ) );
120 def.append( line.trim() );
// Line beginning with upper-case letters (what is done with it is not
// visible in this listing).
128 if ( X_PATTERN.matcher( line ).find() ) {
138 if ( line.startsWith( "FEATURES " ) ) {
// Checks for the individual feature keys; each is only considered while
// in_features is set.
143 if ( in_features && line.startsWith( " source " ) ) {
150 if ( in_features && line.startsWith( " gene " ) ) {
158 if ( in_features && line.startsWith( " CDS " ) ) {
165 if ( in_features && line.startsWith( " Protein " ) ) {
// Store the accumulated definition, trimmed, if anything was collected.
173 if ( def.length() > 0 ) {
174 e.setDe( def.toString().trim() );
178 // FIXME actually this is NCBI entry
179 //http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/emb/AAR37336/
183 private String _tax_id;
186 private String _symbol;
187 private String _provider;
189 private ArrayList<Accession> _cross_references;
190 private String _gene_name;
191 // TODO PUBMED 15798186
193 // source /db_xref="taxon:9606"
197 // /db_xref="MIM:604739"
200 // /db_xref="MIM:604739"
201 // /db_xref="InterPro:IPR002475"
204 // /db_xref="UniProtKB/TrEMBL:Q5J7V1" <- reparse?
208 LOCUS NM_184234 2881 bp mRNA linear PRI 16-JUN-2013
209 DEFINITION Homo sapiens RNA binding motif protein 39 (RBM39), transcript
212 VERSION NM_184234.2 GI:336176061
214 SOURCE Homo sapiens (human)
215 ORGANISM Homo sapiens
216 Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
217 Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
218 Catarrhini; Hominidae; Homo.
219 REFERENCE 1 (bases 1 to 2881)
220 AUTHORS Sillars-Hardebol,A.H., Carvalho,B., Belien,J.A., de Wit,M.,
221 Delis-van Diemen,P.M., Tijssen,M., van de Wiel,M.A., Ponten,F.,
222 Meijer,G.A. and Fijneman,R.J.
223 TITLE CSE1L, DIDO1 and RBM39 in colorectal adenoma to carcinoma
225 JOURNAL Cell Oncol (Dordr) 35 (4), 293-300 (2012)
227 REMARK GeneRIF: Data show that CSE1L, DIDO1 and RBM39 mRNA expression
228 levels correlated with chromosome 20q DNA copy number status.
229 REFERENCE 2 (bases 1 to 2881)
230 AUTHORS Huang,G., Zhou,Z., Wang,H. and Kleinerman,E.S.
231 TITLE CAPER-alpha alternative splicing regulates the expression of
232 vascular endothelial growth factor(1)(6)(5) in Ewing sarcoma cells
233 JOURNAL Cancer 118 (8), 2106-2116 (2012)
235 REMARK GeneRIF: Increased VEGF(165) expression is secondary to the
236 down-regulation of CAPER-alpha by EWS/FLI-1. CAPER-alpha mediates
237 alternative splicing and controls the shift from VEGF(189) to
239 REFERENCE 3 (bases 1 to 2881)
240 AUTHORS Han,B., Stockwin,L.H., Hancock,C., Yu,S.X., Hollingshead,M.G. and
242 TITLE Proteomic analysis of nuclei isolated from cancer cell lines
243 treated with indenoisoquinoline NSC 724998, a novel topoisomerase I
245 JOURNAL J. Proteome Res. 9 (8), 4016-4027 (2010)
247 REMARK Erratum:[J Proteome Res. 2011 Apr 1;10(4):2128]
248 REFERENCE 4 (bases 1 to 2881)
249 AUTHORS Zhang,J.Y., Looi,K.S. and Tan,E.M.
250 TITLE Identification of tumor-associated antigens as diagnostic and
251 predictive biomarkers in cancer
252 JOURNAL Methods Mol. Biol. 520, 1-10 (2009)
254 REFERENCE 5 (bases 1 to 2881)
255 AUTHORS Dutta,J., Fan,G. and Gelinas,C.
256 TITLE CAPERalpha is a novel Rel-TAD-interacting factor that inhibits
257 lymphocyte transformation by the potent Rel/NF-kappaB oncoprotein
259 JOURNAL J. Virol. 82 (21), 10792-10802 (2008)
261 REMARK GeneRIF: this study identifies CAPERalpha (RNA binding motif
262 protein 39) as a new transcriptional coregulator for v-Rel and
263 reveals an important role in modulating Rel's oncogenic activity.
264 REFERENCE 6 (bases 1 to 2881)
265 AUTHORS Cazalla,D., Newton,K. and Caceres,J.F.
266 TITLE A novel SR-related protein is required for the second step of
268 JOURNAL Mol. Cell. Biol. 25 (8), 2969-2980 (2005)
270 REFERENCE 7 (bases 1 to 2881)
271 AUTHORS Dowhan,D.H., Hong,E.P., Auboeuf,D., Dennis,A.P., Wilson,M.M.,
272 Berget,S.M. and O'Malley,B.W.
273 TITLE Steroid hormone receptor coactivation and alternative RNA splicing
274 by U2AF65-related proteins CAPERalpha and CAPERbeta
275 JOURNAL Mol. Cell 17 (3), 429-439 (2005)
277 REFERENCE 8 (bases 1 to 2881)
278 AUTHORS Sun,N.N., Fastje,C.D., Wong,S.S., Sheppard,P.R., Macdonald,S.J.,
279 Ridenour,G., Hyde,J.D. and Witten,M.L.
280 TITLE Dose-dependent transcriptome changes by metal ores on a human acute
281 lymphoblastic leukemia cell line
282 JOURNAL Toxicol Ind Health 19 (7-10), 157-163 (2003)
284 REMARK GeneRIF: 10 genes were down-regulated following treatment of the
285 T-ALL cells with 0.15 and 1.5 microg/mL of metal ores at 72 h
286 REFERENCE 9 (bases 1 to 2881)
287 AUTHORS Jung,D.J., Na,S.Y., Na,D.S. and Lee,J.W.
288 TITLE Molecular cloning and characterization of CAPER, a novel
289 coactivator of activating protein-1 and estrogen receptors
290 JOURNAL J. Biol. Chem. 277 (2), 1229-1234 (2002)
292 REMARK GeneRIF: This paper describes the mouse gene.
293 REFERENCE 10 (bases 1 to 2881)
294 AUTHORS Imai,H., Chan,E.K., Kiyosawa,K., Fu,X.D. and Tan,E.M.
295 TITLE Novel nuclear autoantigen with splicing factor motifs identified
296 with antibody from hepatocellular carcinoma
297 JOURNAL J. Clin. Invest. 92 (5), 2419-2426 (1993)
299 COMMENT REVIEWED REFSEQ: This record has been curated by NCBI staff. The
300 reference sequence was derived from DC346351.1, BC141835.1 and
302 On Jun 16, 2011 this sequence version replaced gi:35493810.
304 Summary: This gene encodes a member of the U2AF65 family of
305 proteins. The encoded protein is found in the nucleus, where it
306 co-localizes with core spliceosomal proteins. It has been shown to
307 play a role in both steroid hormone receptor-mediated transcription
308 and alternative splicing, and it is also a transcriptional
309 coregulator of the viral oncoprotein v-Rel. Multiple transcript
310 variants have been observed for this gene. A related pseudogene has
311 been identified on chromosome X. [provided by RefSeq, Aug 2011].
313 Transcript Variant: This variant (1) encodes the longest isoform
314 (a, also called CC1.4).
316 Publication Note: This RefSeq record includes a subset of the
317 publications that are available for this gene. Please see the Gene
318 record to access additional publications.
320 ##Evidence-Data-START##
321 Transcript exon combination :: BC141835.1, L10911.1 [ECO:0000332]
322 RNAseq introns :: mixed/partial sample support
323 ERS025081, ERS025082 [ECO:0000350]
324 ##Evidence-Data-END##
325 COMPLETENESS: complete on the 3' end.
326 PRIMARY REFSEQ_SPAN PRIMARY_IDENTIFIER PRIMARY_SPAN COMP
327 1-578 DC346351.1 3-580
328 579-2872 BC141835.1 429-2722
329 2873-2881 C75555.1 1-9 c
330 FEATURES Location/Qualifiers
332 /organism="Homo sapiens"
334 /db_xref="taxon:9606"
339 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
340 /note="RNA binding motif protein 39"
341 /db_xref="GeneID:9584"
342 /db_xref="HGNC:15923"
343 /db_xref="HPRD:09201"
344 /db_xref="MIM:604739"
347 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
348 /inference="alignment:Splign:1.39.8"
351 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
352 /standard_name="REN58946"
353 /db_xref="UniSTS:383746"
354 misc_feature 221..223
356 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
357 /note="upstream in-frame stop codon"
360 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
361 /standard_name="G64285"
362 /db_xref="UniSTS:158667"
365 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
366 /inference="alignment:Splign:1.39.8"
369 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
370 /note="isoform a is encoded by transcript variant 1;
371 coactivator of activating protein-1 and estrogen
372 receptors; functional spliceosome-associated protein 59;
373 RNA-binding region (RNP1, RRM) containing 2;
374 hepatocellular carcinoma protein 1; splicing factor HCC1"
376 /product="RNA-binding protein 39 isoform a"
377 /protein_id="NP_909122.1"
378 /db_xref="GI:35493811"
379 /db_xref="CCDS:CCDS13266.1"
380 /db_xref="GeneID:9584"
381 /db_xref="HGNC:15923"
382 /db_xref="HPRD:09201"
383 /db_xref="MIM:604739"
384 /translation="MADDIDIEAMLEAPYKKDENKLSSANGHEERSKKRKKSKSRSRS
385 HERKRSKSKERKRSRDRERKKSKSRERKRSRSKERRRSRSRSRDRRFRGRYRSPYSGP
386 KFNSAIRGKIGLPHSIKLSRRRSRSKSPFRKDKSPVREPIDNLTPEERDARTVFCMQL
387 AARIRPRDLEEFFSTVGKVRDVRMISDRNSRRSKGIAYVEFVDVSSVPLAIGLTGQRV
388 LGVPIIVQASQAEKNRAAAMANNLQKGSAGPMRLYVGSLHFNITEDMLRGIFEPFGRI
389 ESIQLMMDSETGRSKGYGFITFSDSECAKKALEQLNGFELAGRPMKVGHVTERTDASS
390 ASSFLDSDELERTGIDLGTTGRLQLMARLAEGTGLQIPPAAQQALQMSGSLAFGAVAE
391 FSFVIDLQTRLSQQTEASALAAAASVQPLATQCFQLSNMFNPQTEEEVGWDTEIKDDV
392 IEECNKHGGVIHIYVDKNSAQGNVYVKCPSIAAAIAAVNALHGRWFAGKMITAAYVPL
393 PTYHNLFPDSMTATQLLVPSRR"
394 misc_feature 413..415
396 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
397 /experiment="experimental evidence, no additional details
399 /note="N-acetylalanine; propagated from
400 UniProtKB/Swiss-Prot (Q14498.2); acetylation site"
404 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
405 /inference="alignment:Splign:1.39.8"
409 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
410 /inference="alignment:Splign:1.39.8"
413 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
414 /standard_name="REN58786"
415 /db_xref="UniSTS:383586"
418 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
419 /standard_name="D19S1033"
420 /db_xref="UniSTS:154759"
423 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
424 /standard_name="REN58785"
425 /db_xref="UniSTS:383585"
427 polyA_signal 2851..2856
429 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
432 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
434 1 atttggagct tggggcagct tctcgcgaga gcccgtgctg agggctctgt gaggccccgt
435 61 gtgtttgtgt gtgtgtatgt gtgctggtga atgtgagtac agggaagcag cggccgccat
436 121 ttcagggagc ttgtcgacgc tgtcgcaggg gtggatcctg agctgccgaa gccgccgtcc
437 181 tgctctcccg cgtgggcttc tctaattcca ttgttttttt tagattctct cgggcctagc
438 241 cgtccttgga acccgatatt cgggctgggc ggttccgcgg cctgggccta ggggcttaac
443 private EbiDbEntry() {
446 private void addCrossReference( final Accession accession ) {
447 if ( _cross_references == null ) {
448 _cross_references = new ArrayList<Accession>();
450 System.out.println( "XREF ADDED: " + accession );
451 _cross_references.add( accession );
455 public Object clone() throws CloneNotSupportedException {
456 throw new CloneNotSupportedException();
460 public String getAccession() {
465 public List<Accession> getCrossReferences() {
466 return _cross_references;
470 public String getGeneName() {
475 public List<GoTerm> getGoTerms() {
480 public String getProvider() {
485 public String getSequenceName() {
490 public String getSequenceSymbol() {
495 public String getTaxonomyIdentifier() {
502 public String getTaxonomyScientificName() {
507 public boolean isEmpty() {
508 return ( ForesterUtil.isEmpty( getAccession() ) && ForesterUtil.isEmpty( getSequenceName() )
509 && ForesterUtil.isEmpty( getTaxonomyScientificName() )
510 && ForesterUtil.isEmpty( getTaxonomyIdentifier() ) && ForesterUtil.isEmpty( getSequenceSymbol() ) );
513 private void setDe( final String rec_name ) {
519 private void setGeneName( final String gene_name ) {
520 if ( _gene_name == null ) {
521 _gene_name = gene_name;
525 private void setOs( final String os ) {
531 private void setPA( final String pa ) {
537 public void setProvider( final String provider ) {
538 _provider = provider;
541 private void setTaxId( final String tax_id ) {
542 if ( _tax_id == null ) {