2 // forester -- software libraries and applications
3 // for genomics and evolutionary biology research.
5 // Copyright (C) 2010 Christian M Zmasek
6 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26 package org.forester.ws.seqdb;
28 import java.util.List;
29 import java.util.SortedSet;
30 import java.util.TreeSet;
31 import java.util.regex.Matcher;
32 import java.util.regex.Pattern;
34 import org.forester.go.GoTerm;
35 import org.forester.phylogeny.data.Accession;
36 import org.forester.phylogeny.data.Annotation;
37 import org.forester.util.ForesterUtil;
39 public final class EbiDbEntry implements SequenceDatabaseEntry {
41 private SortedSet<Annotation> _annotations;
42 private String _chromosome;
43 private SortedSet<Accession> _cross_references;
45 private String _gene_name;
48 // FIXME actually this is NCBI entry
49 //http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/emb/AAR37336/
51 private String _provider;
52 private String _symbol;
53 private String _tax_id;
55 // TODO PUBMED 15798186
57 // source /db_xref="taxon:9606"
61 // /db_xref="MIM:604739"
64 // /db_xref="MIM:604739"
65 // /db_xref="InterPro:IPR002475"
67 // /db_xref="UniProtKB/TrEMBL:Q5J7V1" <- reparse?
71 LOCUS NM_184234 2881 bp mRNA linear PRI 16-JUN-2013
72 DEFINITION Homo sapiens RNA binding motif protein 39 (RBM39), transcript
75 VERSION NM_184234.2 GI:336176061
77 SOURCE Homo sapiens (human)
79 Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
80 Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
81 Catarrhini; Hominidae; Homo.
82 REFERENCE 1 (bases 1 to 2881)
83 AUTHORS Sillars-Hardebol,A.H., Carvalho,B., Belien,J.A., de Wit,M.,
84 Delis-van Diemen,P.M., Tijssen,M., van de Wiel,M.A., Ponten,F.,
85 Meijer,G.A. and Fijneman,R.J.
86 TITLE CSE1L, DIDO1 and RBM39 in colorectal adenoma to carcinoma
88 JOURNAL Cell Oncol (Dordr) 35 (4), 293-300 (2012)
90 REMARK GeneRIF: Data show that CSE1L, DIDO1 and RBM39 mRNA expression
91 levels correlated with chromosome 20q DNA copy number status.
92 REFERENCE 2 (bases 1 to 2881)
93 AUTHORS Huang,G., Zhou,Z., Wang,H. and Kleinerman,E.S.
94 TITLE CAPER-alpha alternative splicing regulates the expression of
95 vascular endothelial growth factor(1)(6)(5) in Ewing sarcoma cells
96 JOURNAL Cancer 118 (8), 2106-2116 (2012)
98 REMARK GeneRIF: Increased VEGF(165) expression is secondary to the
99 down-regulation of CAPER-alpha by EWS/FLI-1. CAPER-alpha mediates
100 alternative splicing and controls the shift from VEGF(189) to
102 REFERENCE 3 (bases 1 to 2881)
103 AUTHORS Han,B., Stockwin,L.H., Hancock,C., Yu,S.X., Hollingshead,M.G. and
105 TITLE Proteomic analysis of nuclei isolated from cancer cell lines
106 treated with indenoisoquinoline NSC 724998, a novel topoisomerase I
108 JOURNAL J. Proteome Res. 9 (8), 4016-4027 (2010)
110 REMARK Erratum:[J Proteome Res. 2011 Apr 1;10(4):2128]
111 REFERENCE 4 (bases 1 to 2881)
112 AUTHORS Zhang,J.Y., Looi,K.S. and Tan,E.M.
113 TITLE Identification of tumor-associated antigens as diagnostic and
114 predictive biomarkers in cancer
115 JOURNAL Methods Mol. Biol. 520, 1-10 (2009)
117 REFERENCE 5 (bases 1 to 2881)
118 AUTHORS Dutta,J., Fan,G. and Gelinas,C.
119 TITLE CAPERalpha is a novel Rel-TAD-interacting factor that inhibits
120 lymphocyte transformation by the potent Rel/NF-kappaB oncoprotein
122 JOURNAL J. Virol. 82 (21), 10792-10802 (2008)
124 REMARK GeneRIF: this study identifies CAPERalpha (RNA binding motif
125 protein 39) as a new transcriptional coregulator for v-Rel and
126 reveals an important role in modulating Rel's oncogenic activity.
127 REFERENCE 6 (bases 1 to 2881)
128 AUTHORS Cazalla,D., Newton,K. and Caceres,J.F.
129 TITLE A novel SR-related protein is required for the second step of
131 JOURNAL Mol. Cell. Biol. 25 (8), 2969-2980 (2005)
133 REFERENCE 7 (bases 1 to 2881)
134 AUTHORS Dowhan,D.H., Hong,E.P., Auboeuf,D., Dennis,A.P., Wilson,M.M.,
135 Berget,S.M. and O'Malley,B.W.
136 TITLE Steroid hormone receptor coactivation and alternative RNA splicing
137 by U2AF65-related proteins CAPERalpha and CAPERbeta
138 JOURNAL Mol. Cell 17 (3), 429-439 (2005)
140 REFERENCE 8 (bases 1 to 2881)
141 AUTHORS Sun,N.N., Fastje,C.D., Wong,S.S., Sheppard,P.R., Macdonald,S.J.,
142 Ridenour,G., Hyde,J.D. and Witten,M.L.
143 TITLE Dose-dependent transcriptome changes by metal ores on a human acute
144 lymphoblastic leukemia cell line
145 JOURNAL Toxicol Ind Health 19 (7-10), 157-163 (2003)
147 REMARK GeneRIF: 10 genes were down-regulated following treatment of the
148 T-ALL cells with 0.15 and 1.5 microg/mL of metal ores at 72 h
149 REFERENCE 9 (bases 1 to 2881)
150 AUTHORS Jung,D.J., Na,S.Y., Na,D.S. and Lee,J.W.
151 TITLE Molecular cloning and characterization of CAPER, a novel
152 coactivator of activating protein-1 and estrogen receptors
153 JOURNAL J. Biol. Chem. 277 (2), 1229-1234 (2002)
155 REMARK GeneRIF: This paper describes the mouse gene.
156 REFERENCE 10 (bases 1 to 2881)
157 AUTHORS Imai,H., Chan,E.K., Kiyosawa,K., Fu,X.D. and Tan,E.M.
158 TITLE Novel nuclear autoantigen with splicing factor motifs identified
159 with antibody from hepatocellular carcinoma
160 JOURNAL J. Clin. Invest. 92 (5), 2419-2426 (1993)
162 COMMENT REVIEWED REFSEQ: This record has been curated by NCBI staff. The
163 reference sequence was derived from DC346351.1, BC141835.1 and
165 On Jun 16, 2011 this sequence version replaced gi:35493810.
167 Summary: This gene encodes a member of the U2AF65 family of
168 proteins. The encoded protein is found in the nucleus, where it
169 co-localizes with core spliceosomal proteins. It has been shown to
170 play a role in both steroid hormone receptor-mediated transcription
171 and alternative splicing, and it is also a transcriptional
172 coregulator of the viral oncoprotein v-Rel. Multiple transcript
173 variants have been observed for this gene. A related pseudogene has
174 been identified on chromosome X. [provided by RefSeq, Aug 2011].
176 Transcript Variant: This variant (1) encodes the longest isoform
177 (a, also called CC1.4).
179 Publication Note: This RefSeq record includes a subset of the
180 publications that are available for this gene. Please see the Gene
181 record to access additional publications.
183 ##Evidence-Data-START##
184 Transcript exon combination :: BC141835.1, L10911.1 [ECO:0000332]
185 RNAseq introns :: mixed/partial sample support
186 ERS025081, ERS025082 [ECO:0000350]
187 ##Evidence-Data-END##
188 COMPLETENESS: complete on the 3' end.
189 PRIMARY REFSEQ_SPAN PRIMARY_IDENTIFIER PRIMARY_SPAN COMP
190 1-578 DC346351.1 3-580
191 579-2872 BC141835.1 429-2722
192 2873-2881 C75555.1 1-9 c
193 FEATURES Location/Qualifiers
195 /organism="Homo sapiens"
197 /db_xref="taxon:9606"
202 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
203 /note="RNA binding motif protein 39"
204 /db_xref="GeneID:9584"
205 /db_xref="HGNC:15923"
206 /db_xref="HPRD:09201"
207 /db_xref="MIM:604739"
210 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
211 /inference="alignment:Splign:1.39.8"
214 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
215 /standard_name="REN58946"
216 /db_xref="UniSTS:383746"
217 misc_feature 221..223
219 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
220 /note="upstream in-frame stop codon"
223 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
224 /standard_name="G64285"
225 /db_xref="UniSTS:158667"
228 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
229 /inference="alignment:Splign:1.39.8"
232 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
233 /note="isoform a is encoded by transcript variant 1;
234 coactivator of activating protein-1 and estrogen
235 receptors; functional spliceosome-associated protein 59;
236 RNA-binding region (RNP1, RRM) containing 2;
237 hepatocellular carcinoma protein 1; splicing factor HCC1"
239 /product="RNA-binding protein 39 isoform a"
240 /protein_id="NP_909122.1"
241 /db_xref="GI:35493811"
242 /db_xref="CCDS:CCDS13266.1"
243 /db_xref="GeneID:9584"
244 /db_xref="HGNC:15923"
245 /db_xref="HPRD:09201"
246 /db_xref="MIM:604739"
247 /translation="MADDIDIEAMLEAPYKKDENKLSSANGHEERSKKRKKSKSRSRS
248 HERKRSKSKERKRSRDRERKKSKSRERKRSRSKERRRSRSRSRDRRFRGRYRSPYSGP
249 KFNSAIRGKIGLPHSIKLSRRRSRSKSPFRKDKSPVREPIDNLTPEERDARTVFCMQL
250 AARIRPRDLEEFFSTVGKVRDVRMISDRNSRRSKGIAYVEFVDVSSVPLAIGLTGQRV
251 LGVPIIVQASQAEKNRAAAMANNLQKGSAGPMRLYVGSLHFNITEDMLRGIFEPFGRI
252 ESIQLMMDSETGRSKGYGFITFSDSECAKKALEQLNGFELAGRPMKVGHVTERTDASS
253 ASSFLDSDELERTGIDLGTTGRLQLMARLAEGTGLQIPPAAQQALQMSGSLAFGAVAE
254 FSFVIDLQTRLSQQTEASALAAAASVQPLATQCFQLSNMFNPQTEEEVGWDTEIKDDV
255 IEECNKHGGVIHIYVDKNSAQGNVYVKCPSIAAAIAAVNALHGRWFAGKMITAAYVPL
256 PTYHNLFPDSMTATQLLVPSRR"
257 misc_feature 413..415
259 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
260 /experiment="experimental evidence, no additional details
262 /note="N-acetylalanine; propagated from
263 UniProtKB/Swiss-Prot (Q14498.2); acetylation site"
267 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
268 /inference="alignment:Splign:1.39.8"
272 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
273 /inference="alignment:Splign:1.39.8"
276 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
277 /standard_name="REN58786"
278 /db_xref="UniSTS:383586"
281 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
282 /standard_name="D19S1033"
283 /db_xref="UniSTS:154759"
286 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
287 /standard_name="REN58785"
288 /db_xref="UniSTS:383585"
290 polyA_signal 2851..2856
292 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
295 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
297 1 atttggagct tggggcagct tctcgcgaga gcccgtgctg agggctctgt gaggccccgt
298 61 gtgtttgtgt gtgtgtatgt gtgctggtga atgtgagtac agggaagcag cggccgccat
299 121 ttcagggagc ttgtcgacgc tgtcgcaggg gtggatcctg agctgccgaa gccgccgtcc
300 181 tgctctcccg cgtgggcttc tctaattcca ttgttttttt tagattctct cgggcctagc
301 241 cgtccttgga acccgatatt cgggctgggc ggttccgcgg cctgggccta ggggcttaac
306 private EbiDbEntry() {
310 public Object clone() throws CloneNotSupportedException {
311 throw new CloneNotSupportedException();
315 public String getAccession() {
320 public SortedSet<Annotation> getAnnotations() {
325 public String getChromosome() {
330 public SortedSet<Accession> getCrossReferences() {
331 return _cross_references;
335 public String getGeneName() {
340 public SortedSet<GoTerm> getGoTerms() {
345 public String getMap() {
350 public String getProvider() {
355 public String getSequenceName() {
360 public String getSequenceSymbol() {
365 public String getTaxonomyIdentifier() {
370 public String getTaxonomyScientificName() {
375 public boolean isEmpty() {
376 return ( ForesterUtil.isEmpty( getAccession() ) && ForesterUtil.isEmpty( getSequenceName() )
377 && ForesterUtil.isEmpty( getTaxonomyScientificName() )
378 && ForesterUtil.isEmpty( getTaxonomyIdentifier() ) && ForesterUtil.isEmpty( getSequenceSymbol() ) );
381 public void setProvider( final String provider ) {
382 _provider = provider;
385 private void addAnnotation( final Annotation annotation ) {
386 if ( _annotations == null ) {
387 _annotations = new TreeSet<Annotation>();
389 _annotations.add( annotation );
392 private void addCrossReference( final Accession accession ) {
393 if ( _cross_references == null ) {
394 _cross_references = new TreeSet<Accession>();
396 System.out.println( "XREF ADDED: " + accession );
397 _cross_references.add( accession );
400 private void setAccession( final String pa ) {
406 private void setChromosome( final String chromosome ) {
407 _chromosome = chromosome;
410 private void setGeneName( final String gene_name ) {
411 if ( _gene_name == null ) {
412 _gene_name = gene_name;
416 private void setMap( final String map ) {
420 private void setSequenceName( final String rec_name ) {
426 private void setSequenceSymbol( final String symbol ) {
430 private void setTaxId( final String tax_id ) {
431 if ( _tax_id == null ) {
436 private void setTaxonomyScientificName( final String os ) {
442 // public static SequenceDatabaseEntry createInstanceFromPlainText( final List<String> lines ) {
443 // final EbiDbEntry e = new EbiDbEntry();
444 // for( final String line : lines ) {
445 // if ( line.startsWith( "PA" ) ) {
446 // e.setPA( SequenceDbWsTools.extractFrom( line, "PA" ) );
448 // else if ( line.startsWith( "DE" ) ) {
449 // e.setDe( SequenceDbWsTools.extractFrom( line, "DE" ) );
451 // else if ( line.startsWith( "OS" ) ) {
452 // if ( line.indexOf( "(" ) > 0 ) {
453 // e.setOs( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
456 // e.setOs( SequenceDbWsTools.extractFrom( line, "OS" ) );
459 // else if ( line.startsWith( "OX" ) ) {
460 // if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) {
461 // e.setTaxId( SequenceDbWsTools.extractFromTo( line, "NCBI_TaxID=", ";" ) );
467 public static SequenceDatabaseEntry createInstanceFromPlainTextForRefSeq( final List<String> lines ) {
468 final Pattern X_PATTERN = Pattern.compile( "^[A-Z]+" );
469 final Pattern chromosome_PATTERN = Pattern.compile( "\\s+/chromosome=\"(\\w+)\"" );
470 final Pattern map_PATTERN = Pattern.compile( "\\s+/map=\"([\\w+\\.])\"" );
471 final Pattern gene_PATTERN = Pattern.compile( "\\s+/gene=\"(.+)\"" );
472 final Pattern mim_PATTERN = Pattern.compile( "\\s+/db_xref=\"MIM:(\\d+)\"" );
473 final Pattern taxon_PATTERN = Pattern.compile( "\\s+/db_xref=\"taxon:(\\d+)\"" );
474 final Pattern interpro_PATTERN = Pattern.compile( "\\s+/db_xref=\"InterPro:([A-Z0-9]+)\"" );
475 final Pattern uniprot_PATTERN = Pattern.compile( "\\s+/db_xref=\"UniProtKB/[A-Za-z-]*:(\\w+)\"" );
476 final Pattern hgnc_PATTERN = Pattern.compile( "\\s+/db_xref=\"[A-Z:]*HGNC:(\\d+)\"" );
477 final Pattern geneid_PATTERN = Pattern.compile( "\\s+/db_xref=\"GeneID:(\\d+)\"" );
478 final Pattern pdb_PATTERN = Pattern.compile( "\\s+/db_xref=\"PDB:([A-Z0-9]+)\"" );
479 final Pattern ec_PATTERN = Pattern.compile( "\\s+/EC_number=\"([\\.\\-\\d]+)\"" );
480 final Pattern product_PATTERN = Pattern.compile( "\\s+/product=\"(\\w{1,10})\"" );
481 final EbiDbEntry e = new EbiDbEntry();
482 final StringBuilder def = new StringBuilder();
483 boolean in_definition = false;
484 boolean in_features = false;
485 boolean in_source = false;
486 boolean in_gene = false;
487 boolean in_cds = false;
488 boolean in_mrna = false;
489 boolean in_protein = false;
490 for( final String line : lines ) {
491 if ( line.startsWith( "ACCESSION " ) ) {
492 e.setAccession( SequenceDbWsTools.extractFrom( line, "ACCESSION" ) );
493 in_definition = false;
495 else if ( line.startsWith( "ID " ) ) {
496 e.setAccession( SequenceDbWsTools.extractFromTo( line, "ID", ";" ) );
497 in_definition = false;
499 else if ( line.startsWith( "DEFINITION " ) || ( line.startsWith( "DE " ) ) ) {
500 boolean definiton = false;
501 if ( line.startsWith( "DEFINITION " ) ) {
504 if ( line.indexOf( "[" ) > 0 ) {
506 x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) ) );
509 x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "[" ) ) );
512 else if ( line.indexOf( "." ) > 0 ) {
514 x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) ) );
517 x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "." ) ) );
522 x( def, ( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) ) );
525 x( def, ( SequenceDbWsTools.extractFrom( line, "DE" ) ) );
529 in_definition = true;
532 else if ( line.startsWith( " ORGANISM " ) ) {
533 if ( line.indexOf( "(" ) > 0 ) {
534 e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, " ORGANISM", "(" ) );
537 e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, " ORGANISM" ) );
541 else if ( line.startsWith( "OS " ) ) {
542 if ( line.indexOf( "(" ) > 0 ) {
543 e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
546 e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, "OS" ) );
549 else if ( line.startsWith( " " ) && in_definition ) {
551 if ( line.indexOf( "[" ) > 0 ) {
552 def.append( SequenceDbWsTools.extractTo( line, "[" ) );
554 else if ( line.indexOf( "." ) > 0 ) {
555 def.append( SequenceDbWsTools.extractTo( line, "." ) );
558 def.append( line.trim() );
562 in_definition = false;
564 if ( !line.startsWith( "FT " ) && X_PATTERN.matcher( line ).find() ) {
573 if ( line.startsWith( "FEATURES " ) || line.startsWith( "FT " ) ) {
576 if ( in_features && ( line.startsWith( " source " ) || line.startsWith( "FT source " ) ) ) {
583 if ( in_features && ( line.startsWith( " gene " ) || line.startsWith( "FT gene " ) ) ) {
590 if ( in_features && ( line.startsWith( " CDS " ) || line.startsWith( "FT CDS " ) ) ) {
597 if ( in_features && ( line.startsWith( " Protein " ) || line.startsWith( "FT Protein " ) ) ) {
604 if ( in_features && ( line.startsWith( " mRNA " ) || line.startsWith( "FT mRNA " ) ) ) {
612 final Matcher ti = taxon_PATTERN.matcher( line );
614 e.setTaxId( ti.group( 1 ) );
616 final Matcher chr = chromosome_PATTERN.matcher( line );
618 e.setChromosome( chr.group( 1 ) );
620 final Matcher map = map_PATTERN.matcher( line );
622 e.setMap( map.group( 1 ) );
625 if ( in_cds || in_gene ) {
626 final Matcher hgnc = hgnc_PATTERN.matcher( line );
628 e.addCrossReference( new Accession( hgnc.group( 1 ), "hgnc" ) );
630 final Matcher geneid = geneid_PATTERN.matcher( line );
631 if ( geneid.find() ) {
632 e.addCrossReference( new Accession( geneid.group( 1 ), "geneid" ) );
635 if ( in_protein || in_cds || in_gene || in_mrna ) {
636 final Matcher ec = ec_PATTERN.matcher( line );
638 e.addAnnotation( new Annotation( "EC", ec.group( 1 ) ) );
640 final Matcher gene = gene_PATTERN.matcher( line );
642 e.setGeneName( gene.group( 1 ) );
644 final Matcher uniprot = uniprot_PATTERN.matcher( line );
645 if ( uniprot.find() ) {
646 e.addCrossReference( new Accession( uniprot.group( 1 ), "uniprot" ) );
648 final Matcher interpro = interpro_PATTERN.matcher( line );
649 if ( interpro.find() ) {
650 e.addCrossReference( new Accession( interpro.group( 1 ), "interpro" ) );
652 final Matcher mim = mim_PATTERN.matcher( line );
654 e.addCrossReference( new Accession( mim.group( 1 ), "mim" ) );
656 final Matcher product = product_PATTERN.matcher( line );
657 if ( product.find() ) {
658 e.setSequenceSymbol( product.group( 1 ) );
660 final Matcher pdb = pdb_PATTERN.matcher( line );
662 e.addCrossReference( new Accession( pdb.group( 1 ), "pdb" ) );
666 if ( def.length() > 0 ) {
667 e.setSequenceName( def.toString().trim() );
672 private static void x( final StringBuilder sb, final String s ) {
673 if ( sb.length() > 0 ) {
676 sb.append( s.trim() );