2 // forester -- software libraries and applications
3 // for genomics and evolutionary biology research.
5 // Copyright (C) 2010 Christian M Zmasek
6 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26 package org.forester.ws.seqdb;
28 import java.util.List;
29 import java.util.SortedSet;
30 import java.util.TreeSet;
31 import java.util.regex.Matcher;
32 import java.util.regex.Pattern;
34 import org.forester.go.GoTerm;
35 import org.forester.phylogeny.data.Accession;
36 import org.forester.phylogeny.data.Annotation;
37 import org.forester.util.ForesterUtil;
39 public final class EbiDbEntry implements SequenceDatabaseEntry {
41 // public static SequenceDatabaseEntry createInstanceFromPlainText( final List<String> lines ) {
42 // final EbiDbEntry e = new EbiDbEntry();
43 // for( final String line : lines ) {
44 // if ( line.startsWith( "PA" ) ) {
45 // e.setPA( SequenceDbWsTools.extractFrom( line, "PA" ) );
47 // else if ( line.startsWith( "DE" ) ) {
48 // e.setDe( SequenceDbWsTools.extractFrom( line, "DE" ) );
50 // else if ( line.startsWith( "OS" ) ) {
51 // if ( line.indexOf( "(" ) > 0 ) {
52 // e.setOs( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
55 // e.setOs( SequenceDbWsTools.extractFrom( line, "OS" ) );
58 // else if ( line.startsWith( "OX" ) ) {
59 // if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) {
60 // e.setTaxId( SequenceDbWsTools.extractFromTo( line, "NCBI_TaxID=", ";" ) );
// Builds a new EbiDbEntry from the lines of a plain-text (flat-file) sequence record.
// Header lines are recognised either by full keyword (ACCESSION, DEFINITION, ORGANISM,
// FEATURES) or by two-letter line code (ID, DE, OS, FT); qualifiers inside the feature
// table are picked out with the regular expressions compiled below.
66 public static SequenceDatabaseEntry createInstanceFromPlainTextForRefSeq( final List<String> lines ) {
// NOTE(review): every pattern below is recompiled on each call although none depends on
// the input; they could be hoisted into constants.
// Matches a line that begins with one or more upper-case letters.
67 final Pattern X_PATTERN = Pattern.compile( "^[A-Z]+" );
// NOTE(review): chromosome_PATTERN and map_PATTERN appear unused in the statements below -- confirm.
68 final Pattern chromosome_PATTERN = Pattern.compile( "\\s+/chromosome=\"(\\w+)\"" );
// NOTE(review): the capture group is a bare character class without a quantifier, so it can
// only capture exactly one character between the quotes; "+" after the class looks intended.
69 final Pattern map_PATTERN = Pattern.compile( "\\s+/map=\"([\\w+\\.])\"" );
70 final Pattern gene_PATTERN = Pattern.compile( "\\s+/gene=\"(.+)\"" );
// The db_xref patterns each capture the identifier that follows the database prefix.
71 final Pattern mim_PATTERN = Pattern.compile( "\\s+/db_xref=\"MIM:(\\d+)\"" );
72 final Pattern taxon_PATTERN = Pattern.compile( "\\s+/db_xref=\"taxon:(\\d+)\"" );
73 final Pattern interpro_PATTERN = Pattern.compile( "\\s+/db_xref=\"InterPro:([A-Z0-9]+)\"" );
74 final Pattern uniprot_PATTERN = Pattern.compile( "\\s+/db_xref=\"UniProtKB/[A-Za-z-]*:(\\w+)\"" );
75 final Pattern hgnc_PATTERN = Pattern.compile( "\\s+/db_xref=\"[A-Z:]*HGNC:(\\d+)\"" );
76 final Pattern geneid_PATTERN = Pattern.compile( "\\s+/db_xref=\"GeneID:(\\d+)\"" );
77 final Pattern pdb_PATTERN = Pattern.compile( "\\s+/db_xref=\"PDB:([A-Z0-9]+)\"" );
78 final Pattern ec_PATTERN = Pattern.compile( "\\s+/EC_number=\"([\\.\\-\\d]+)\"" );
// Only matches a quoted product made of 1 to 10 word characters; a product value containing
// spaces or hyphens does not match.
79 final Pattern product_PATTERN = Pattern.compile( "\\s+/product=\"(\\w{1,10})\"" );
80 final EbiDbEntry e = new EbiDbEntry();
// Accumulates the definition text, which may be spread over several lines.
81 final StringBuilder def = new StringBuilder();
// Parser state: whether the previous line belonged to the definition, whether the feature
// table has been reached, and which kind of feature the current line belongs to.
82 boolean in_definition = false;
83 boolean in_features = false;
84 boolean in_source = false;
85 boolean in_gene = false;
86 boolean in_cds = false;
87 boolean in_mrna = false;
88 boolean in_protein = false;
89 for( final String line : lines ) {
// Accession: everything after the ACCESSION keyword ...
90 if ( line.startsWith( "ACCESSION " ) ) {
91 e.setAccession( SequenceDbWsTools.extractFrom( line, "ACCESSION" ) );
92 in_definition = false;
// ... or, for an ID line, the text between "ID" and the first ";".
94 else if ( line.startsWith( "ID " ) ) {
95 e.setAccession( SequenceDbWsTools.extractFromTo( line, "ID", ";" ) );
96 in_definition = false;
// First line of the definition, introduced by either DEFINITION or DE.
98 else if ( line.startsWith( "DEFINITION " ) || ( line.startsWith( "DE " ) ) ) {
// "definiton" (sic) distinguishes the DEFINITION keyword from the DE code.
99 boolean definiton = false;
100 if ( line.startsWith( "DEFINITION " ) ) {
// The definition is cut at the first "[" if the line has one ...
103 if ( line.indexOf( "[" ) > 0 ) {
105 x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) ) );
108 x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "[" ) ) );
// ... otherwise at the first "." ...
111 else if ( line.indexOf( "." ) > 0 ) {
113 x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) ) );
116 x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "." ) ) );
// ... and taken whole when neither delimiter is present.
121 x( def, ( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) ) );
124 x( def, ( SequenceDbWsTools.extractFrom( line, "DE" ) ) );
// Remember that following lines may continue the definition.
128 in_definition = true;
// Scientific name from an ORGANISM line; anything from the first "(" onward is dropped.
131 else if ( line.startsWith( " ORGANISM " ) ) {
132 if ( line.indexOf( "(" ) > 0 ) {
133 e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, " ORGANISM", "(" ) );
136 e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, " ORGANISM" ) );
// Same handling for the two-letter OS code.
140 else if ( line.startsWith( "OS " ) ) {
141 if ( line.indexOf( "(" ) > 0 ) {
142 e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
145 e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, "OS" ) );
// Continuation of the definition: a line starting with a blank while in_definition is set.
148 else if ( line.startsWith( " " ) && in_definition ) {
// Appended directly to the buffer (not through x()), cut at "[" or "." like the first line.
150 if ( line.indexOf( "[" ) > 0 ) {
151 def.append( SequenceDbWsTools.extractTo( line, "[" ) );
153 else if ( line.indexOf( "." ) > 0 ) {
154 def.append( SequenceDbWsTools.extractTo( line, "." ) );
157 def.append( line.trim() );
161 in_definition = false;
// A line that starts with upper-case letters and is not an FT line.
// NOTE(review): presumably marks a new top-level section -- confirm what is reset here.
163 if ( !line.startsWith( "FT " ) && X_PATTERN.matcher( line ).find() ) {
// Start of the feature table.
172 if ( line.startsWith( "FEATURES " ) || line.startsWith( "FT " ) ) {
// Inside the feature table, the feature key at the start of a line (source, gene, CDS,
// Protein, mRNA) is tested to decide which in_* context applies.
175 if ( in_features && ( line.startsWith( " source " ) || line.startsWith( "FT source " ) ) ) {
182 if ( in_features && ( line.startsWith( " gene " ) || line.startsWith( "FT gene " ) ) ) {
189 if ( in_features && ( line.startsWith( " CDS " ) || line.startsWith( "FT CDS " ) ) ) {
196 if ( in_features && ( line.startsWith( " Protein " ) || line.startsWith( "FT Protein " ) ) ) {
203 if ( in_features && ( line.startsWith( " mRNA " ) || line.startsWith( "FT mRNA " ) ) ) {
// Taxonomy identifier taken from a taxon db_xref qualifier.
211 final Matcher m = taxon_PATTERN.matcher( line );
213 e.setTaxId( m.group( 1 ) );
// HGNC and GeneID cross-references are only looked for within CDS or gene features.
216 if ( in_cds || in_gene ) {
217 final Matcher hgnc = hgnc_PATTERN.matcher( line );
219 e.addCrossReference( new Accession( hgnc.group( 1 ), "hgnc" ) );
221 final Matcher geneid = geneid_PATTERN.matcher( line );
222 if ( geneid.find() ) {
223 e.addCrossReference( new Accession( geneid.group( 1 ), "geneid" ) );
// The remaining qualifiers are looked for within Protein, CDS, gene or mRNA features.
226 if ( in_protein || in_cds || in_gene || in_mrna ) {
// EC number is stored as an annotation with the reference "EC".
227 final Matcher ec = ec_PATTERN.matcher( line );
229 e.addAnnotation( new Annotation( "EC", ec.group( 1 ) ) );
231 final Matcher gene = gene_PATTERN.matcher( line );
233 e.setGeneName( gene.group( 1 ) );
235 final Matcher uniprot = uniprot_PATTERN.matcher( line );
236 if ( uniprot.find() ) {
237 e.addCrossReference( new Accession( uniprot.group( 1 ), "uniprot" ) );
239 final Matcher interpro = interpro_PATTERN.matcher( line );
240 if ( interpro.find() ) {
241 e.addCrossReference( new Accession( interpro.group( 1 ), "interpro" ) );
243 final Matcher mim = mim_PATTERN.matcher( line );
245 e.addCrossReference( new Accession( mim.group( 1 ), "mim" ) );
// A short, single-word product value becomes the sequence symbol.
247 final Matcher product = product_PATTERN.matcher( line );
248 if ( product.find() ) {
249 e.setSequenceSymbol( product.group( 1 ) );
251 final Matcher pdb = pdb_PATTERN.matcher( line );
253 e.addCrossReference( new Accession( pdb.group( 1 ), "pdb" ) );
// After all lines are consumed, the collected definition (if any) becomes the sequence name.
257 if ( def.length() > 0 ) {
258 e.setSequenceName( def.toString().trim() );
// Appends the trimmed text of s to sb. When sb already holds text, the guarded branch runs
// first (presumably to insert a separator between fragments -- TODO confirm).
263 private static void x( final StringBuilder sb, final String s ) {
264 if ( sb.length() > 0 ) {
267 sb.append( s.trim() );
269 // FIXME actually this is an NCBI entry
270 //http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/emb/AAR37336/
// Taxonomy identifier; setTaxId checks that it is still null before acting.
274 private String _tax_id;
275 private String _symbol;
// Provider label, assigned through the public setProvider.
276 private String _provider;
// Created lazily by addCrossReference on first use.
277 private SortedSet<Accession> _cross_references;
// Created lazily by addAnnotation on first use.
278 private SortedSet<Annotation> _annotations;
// Assigned by setGeneName only while still null, so the first value wins.
279 private String _gene_name;
281 // TODO PUBMED 15798186
283 // source /db_xref="taxon:9606"
287 // /db_xref="MIM:604739"
290 // /db_xref="MIM:604739"
291 // /db_xref="InterPro:IPR002475"
293 // /db_xref="UniProtKB/TrEMBL:Q5J7V1" <- reparse?
297 LOCUS NM_184234 2881 bp mRNA linear PRI 16-JUN-2013
298 DEFINITION Homo sapiens RNA binding motif protein 39 (RBM39), transcript
301 VERSION NM_184234.2 GI:336176061
303 SOURCE Homo sapiens (human)
304 ORGANISM Homo sapiens
305 Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
306 Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
307 Catarrhini; Hominidae; Homo.
308 REFERENCE 1 (bases 1 to 2881)
309 AUTHORS Sillars-Hardebol,A.H., Carvalho,B., Belien,J.A., de Wit,M.,
310 Delis-van Diemen,P.M., Tijssen,M., van de Wiel,M.A., Ponten,F.,
311 Meijer,G.A. and Fijneman,R.J.
312 TITLE CSE1L, DIDO1 and RBM39 in colorectal adenoma to carcinoma
314 JOURNAL Cell Oncol (Dordr) 35 (4), 293-300 (2012)
316 REMARK GeneRIF: Data show that CSE1L, DIDO1 and RBM39 mRNA expression
317 levels correlated with chromosome 20q DNA copy number status.
318 REFERENCE 2 (bases 1 to 2881)
319 AUTHORS Huang,G., Zhou,Z., Wang,H. and Kleinerman,E.S.
320 TITLE CAPER-alpha alternative splicing regulates the expression of
321 vascular endothelial growth factor(1)(6)(5) in Ewing sarcoma cells
322 JOURNAL Cancer 118 (8), 2106-2116 (2012)
324 REMARK GeneRIF: Increased VEGF(165) expression is secondary to the
325 down-regulation of CAPER-alpha by EWS/FLI-1. CAPER-alpha mediates
326 alternative splicing and controls the shift from VEGF(189) to
328 REFERENCE 3 (bases 1 to 2881)
329 AUTHORS Han,B., Stockwin,L.H., Hancock,C., Yu,S.X., Hollingshead,M.G. and
331 TITLE Proteomic analysis of nuclei isolated from cancer cell lines
332 treated with indenoisoquinoline NSC 724998, a novel topoisomerase I
334 JOURNAL J. Proteome Res. 9 (8), 4016-4027 (2010)
336 REMARK Erratum:[J Proteome Res. 2011 Apr 1;10(4):2128]
337 REFERENCE 4 (bases 1 to 2881)
338 AUTHORS Zhang,J.Y., Looi,K.S. and Tan,E.M.
339 TITLE Identification of tumor-associated antigens as diagnostic and
340 predictive biomarkers in cancer
341 JOURNAL Methods Mol. Biol. 520, 1-10 (2009)
343 REFERENCE 5 (bases 1 to 2881)
344 AUTHORS Dutta,J., Fan,G. and Gelinas,C.
345 TITLE CAPERalpha is a novel Rel-TAD-interacting factor that inhibits
346 lymphocyte transformation by the potent Rel/NF-kappaB oncoprotein
348 JOURNAL J. Virol. 82 (21), 10792-10802 (2008)
350 REMARK GeneRIF: this study identifies CAPERalpha (RNA binding motif
351 protein 39) as a new transcriptional coregulator for v-Rel and
352 reveals an important role in modulating Rel's oncogenic activity.
353 REFERENCE 6 (bases 1 to 2881)
354 AUTHORS Cazalla,D., Newton,K. and Caceres,J.F.
355 TITLE A novel SR-related protein is required for the second step of
357 JOURNAL Mol. Cell. Biol. 25 (8), 2969-2980 (2005)
359 REFERENCE 7 (bases 1 to 2881)
360 AUTHORS Dowhan,D.H., Hong,E.P., Auboeuf,D., Dennis,A.P., Wilson,M.M.,
361 Berget,S.M. and O'Malley,B.W.
362 TITLE Steroid hormone receptor coactivation and alternative RNA splicing
363 by U2AF65-related proteins CAPERalpha and CAPERbeta
364 JOURNAL Mol. Cell 17 (3), 429-439 (2005)
366 REFERENCE 8 (bases 1 to 2881)
367 AUTHORS Sun,N.N., Fastje,C.D., Wong,S.S., Sheppard,P.R., Macdonald,S.J.,
368 Ridenour,G., Hyde,J.D. and Witten,M.L.
369 TITLE Dose-dependent transcriptome changes by metal ores on a human acute
370 lymphoblastic leukemia cell line
371 JOURNAL Toxicol Ind Health 19 (7-10), 157-163 (2003)
373 REMARK GeneRIF: 10 genes were down-regulated following treatment of the
374 T-ALL cells with 0.15 and 1.5 microg/mL of metal ores at 72 h
375 REFERENCE 9 (bases 1 to 2881)
376 AUTHORS Jung,D.J., Na,S.Y., Na,D.S. and Lee,J.W.
377 TITLE Molecular cloning and characterization of CAPER, a novel
378 coactivator of activating protein-1 and estrogen receptors
379 JOURNAL J. Biol. Chem. 277 (2), 1229-1234 (2002)
381 REMARK GeneRIF: This paper describes the mouse gene.
382 REFERENCE 10 (bases 1 to 2881)
383 AUTHORS Imai,H., Chan,E.K., Kiyosawa,K., Fu,X.D. and Tan,E.M.
384 TITLE Novel nuclear autoantigen with splicing factor motifs identified
385 with antibody from hepatocellular carcinoma
386 JOURNAL J. Clin. Invest. 92 (5), 2419-2426 (1993)
388 COMMENT REVIEWED REFSEQ: This record has been curated by NCBI staff. The
389 reference sequence was derived from DC346351.1, BC141835.1 and
391 On Jun 16, 2011 this sequence version replaced gi:35493810.
393 Summary: This gene encodes a member of the U2AF65 family of
394 proteins. The encoded protein is found in the nucleus, where it
395 co-localizes with core spliceosomal proteins. It has been shown to
396 play a role in both steroid hormone receptor-mediated transcription
397 and alternative splicing, and it is also a transcriptional
398 coregulator of the viral oncoprotein v-Rel. Multiple transcript
399 variants have been observed for this gene. A related pseudogene has
400 been identified on chromosome X. [provided by RefSeq, Aug 2011].
402 Transcript Variant: This variant (1) encodes the longest isoform
403 (a, also called CC1.4).
405 Publication Note: This RefSeq record includes a subset of the
406 publications that are available for this gene. Please see the Gene
407 record to access additional publications.
409 ##Evidence-Data-START##
410 Transcript exon combination :: BC141835.1, L10911.1 [ECO:0000332]
411 RNAseq introns :: mixed/partial sample support
412 ERS025081, ERS025082 [ECO:0000350]
413 ##Evidence-Data-END##
414 COMPLETENESS: complete on the 3' end.
415 PRIMARY REFSEQ_SPAN PRIMARY_IDENTIFIER PRIMARY_SPAN COMP
416 1-578 DC346351.1 3-580
417 579-2872 BC141835.1 429-2722
418 2873-2881 C75555.1 1-9 c
419 FEATURES Location/Qualifiers
421 /organism="Homo sapiens"
423 /db_xref="taxon:9606"
428 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
429 /note="RNA binding motif protein 39"
430 /db_xref="GeneID:9584"
431 /db_xref="HGNC:15923"
432 /db_xref="HPRD:09201"
433 /db_xref="MIM:604739"
436 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
437 /inference="alignment:Splign:1.39.8"
440 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
441 /standard_name="REN58946"
442 /db_xref="UniSTS:383746"
443 misc_feature 221..223
445 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
446 /note="upstream in-frame stop codon"
449 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
450 /standard_name="G64285"
451 /db_xref="UniSTS:158667"
454 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
455 /inference="alignment:Splign:1.39.8"
458 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
459 /note="isoform a is encoded by transcript variant 1;
460 coactivator of activating protein-1 and estrogen
461 receptors; functional spliceosome-associated protein 59;
462 RNA-binding region (RNP1, RRM) containing 2;
463 hepatocellular carcinoma protein 1; splicing factor HCC1"
465 /product="RNA-binding protein 39 isoform a"
466 /protein_id="NP_909122.1"
467 /db_xref="GI:35493811"
468 /db_xref="CCDS:CCDS13266.1"
469 /db_xref="GeneID:9584"
470 /db_xref="HGNC:15923"
471 /db_xref="HPRD:09201"
472 /db_xref="MIM:604739"
473 /translation="MADDIDIEAMLEAPYKKDENKLSSANGHEERSKKRKKSKSRSRS
474 HERKRSKSKERKRSRDRERKKSKSRERKRSRSKERRRSRSRSRDRRFRGRYRSPYSGP
475 KFNSAIRGKIGLPHSIKLSRRRSRSKSPFRKDKSPVREPIDNLTPEERDARTVFCMQL
476 AARIRPRDLEEFFSTVGKVRDVRMISDRNSRRSKGIAYVEFVDVSSVPLAIGLTGQRV
477 LGVPIIVQASQAEKNRAAAMANNLQKGSAGPMRLYVGSLHFNITEDMLRGIFEPFGRI
478 ESIQLMMDSETGRSKGYGFITFSDSECAKKALEQLNGFELAGRPMKVGHVTERTDASS
479 ASSFLDSDELERTGIDLGTTGRLQLMARLAEGTGLQIPPAAQQALQMSGSLAFGAVAE
480 FSFVIDLQTRLSQQTEASALAAAASVQPLATQCFQLSNMFNPQTEEEVGWDTEIKDDV
481 IEECNKHGGVIHIYVDKNSAQGNVYVKCPSIAAAIAAVNALHGRWFAGKMITAAYVPL
482 PTYHNLFPDSMTATQLLVPSRR"
483 misc_feature 413..415
485 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
486 /experiment="experimental evidence, no additional details
488 /note="N-acetylalanine; propagated from
489 UniProtKB/Swiss-Prot (Q14498.2); acetylation site"
493 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
494 /inference="alignment:Splign:1.39.8"
498 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
499 /inference="alignment:Splign:1.39.8"
502 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
503 /standard_name="REN58786"
504 /db_xref="UniSTS:383586"
507 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
508 /standard_name="D19S1033"
509 /db_xref="UniSTS:154759"
512 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
513 /standard_name="REN58785"
514 /db_xref="UniSTS:383585"
516 polyA_signal 2851..2856
518 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
521 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
523 1 atttggagct tggggcagct tctcgcgaga gcccgtgctg agggctctgt gaggccccgt
524 61 gtgtttgtgt gtgtgtatgt gtgctggtga atgtgagtac agggaagcag cggccgccat
525 121 ttcagggagc ttgtcgacgc tgtcgcaggg gtggatcctg agctgccgaa gccgccgtcc
526 181 tgctctcccg cgtgggcttc tctaattcca ttgttttttt tagattctct cgggcctagc
527 241 cgtccttgga acccgatatt cgggctgggc ggttccgcgg cctgggccta ggggcttaac
532 private EbiDbEntry() {
// Adds a cross-reference to this entry, creating the backing sorted set on first use.
535 private void addCrossReference( final Accession accession ) {
536 if ( _cross_references == null ) {
537 _cross_references = new TreeSet<Accession>();
// NOTE(review): writes a line to standard output for every cross-reference added; this looks
// like leftover debugging output -- confirm whether it should be removed.
539 System.out.println( "XREF ADDED: " + accession );
540 _cross_references.add( accession );
// Cloning is not supported: this method always throws CloneNotSupportedException.
544 public Object clone() throws CloneNotSupportedException {
545 throw new CloneNotSupportedException();
549 public String getAccession() {
// Returns the internal cross-reference set itself (no copy is made). Because the set is only
// created in addCrossReference, this is presumably null when nothing was ever added -- callers
// should check; confirm the constructor does not initialise it.
554 public SortedSet<Accession> getCrossReferences() {
555 return _cross_references;
559 public String getGeneName() {
564 public SortedSet<GoTerm> getGoTerms() {
569 public String getProvider() {
574 public String getSequenceName() {
579 public String getSequenceSymbol() {
583 private void setSequenceSymbol( final String symbol ) {
588 public String getTaxonomyIdentifier() {
593 public String getTaxonomyScientificName() {
// True only when accession, sequence name, taxonomy scientific name, taxonomy identifier and
// sequence symbol are all empty according to ForesterUtil.isEmpty. Gene name, provider,
// cross-references and annotations are not consulted.
598 public boolean isEmpty() {
599 return ( ForesterUtil.isEmpty( getAccession() ) && ForesterUtil.isEmpty( getSequenceName() )
600 && ForesterUtil.isEmpty( getTaxonomyScientificName() )
601 && ForesterUtil.isEmpty( getTaxonomyIdentifier() ) && ForesterUtil.isEmpty( getSequenceSymbol() ) );
604 private void setSequenceName( final String rec_name ) {
// Stores the gene name only if none has been stored yet; later calls leave the first value in place.
610 private void setGeneName( final String gene_name ) {
611 if ( _gene_name == null ) {
612 _gene_name = gene_name;
616 private void setTaxonomyScientificName( final String os ) {
622 private void setAccession( final String pa ) {
// Stores the provider label exactly as given, overwriting any previous value.
628 public void setProvider( final String provider ) {
629 _provider = provider;
// Acts only while no taxonomy identifier has been stored yet (guarded by the null check).
632 private void setTaxId( final String tax_id ) {
633 if ( _tax_id == null ) {
639 public SortedSet<Annotation> getAnnotations() {
// Adds an annotation to this entry, creating the backing sorted set on first use.
643 private void addAnnotation( final Annotation annotation ) {
644 if ( _annotations == null ) {
645 _annotations = new TreeSet<Annotation>();
647 _annotations.add( annotation );