2 // forester -- software libraries and applications
3 // for genomics and evolutionary biology research.
5 // Copyright (C) 2010 Christian M Zmasek
6 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26 package org.forester.ws.seqdb;
28 import java.util.ArrayList;
29 import java.util.List;
30 import java.util.regex.Pattern;
32 import org.forester.go.GoTerm;
33 import org.forester.phylogeny.data.Accession;
34 import org.forester.util.ForesterUtil;
36 public final class EbiDbEntry implements SequenceDatabaseEntry {
// Builds an EbiDbEntry from the lines of a plain-text record by dispatching on
// the prefix each line starts with:
//   "PA" -> setPA, "DE" -> setDe, "OS" -> setOs, "OX" -> setTaxId.
// The actual text extraction is delegated to SequenceDbWsTools.extractFrom /
// extractFromTo (defined elsewhere; presumably "text after the tag" and
// "text between the two markers" -- confirm against SequenceDbWsTools).
38 public static SequenceDatabaseEntry createInstanceFromPlainText( final List<String> lines ) {
39 final EbiDbEntry e = new EbiDbEntry();
40 for( final String line : lines ) {
41 if ( line.startsWith( "PA" ) ) {
42 e.setPA( SequenceDbWsTools.extractFrom( line, "PA" ) );
44 else if ( line.startsWith( "DE" ) ) {
45 e.setDe( SequenceDbWsTools.extractFrom( line, "DE" ) );
47 else if ( line.startsWith( "OS" ) ) {
// When the OS line contains "(" (at any index > 0), only the part up to the
// parenthesis is passed on; otherwise the whole remainder of the line is used.
48 if ( line.indexOf( "(" ) > 0 ) {
49 e.setOs( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
52 e.setOs( SequenceDbWsTools.extractFrom( line, "OS" ) );
55 else if ( line.startsWith( "OX" ) ) {
// The taxonomy id is only read when the OX line carries an "NCBI_TaxID=" key;
// the value is taken from that key up to the following ";".
56 if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) {
57 e.setTaxId( SequenceDbWsTools.extractFromTo( line, "NCBI_TaxID=", ";" ) );
// Builds an EbiDbEntry from the lines of a GenBank/RefSeq flat-file record.
// Top-level keywords (ACCESSION, DEFINITION, ORGANISM, FEATURES) are recognized
// by line prefix; a DEFINITION may continue on following indented lines and is
// accumulated in a StringBuilder before being stored via setDe.
64 public static SequenceDatabaseEntry createInstanceFromPlainTextForRefSeq( final List<String> lines ) {
// Matches a line that starts with an upper-case keyword (a new top-level section).
65 final Pattern X_PATTERN = Pattern.compile( "^[A-Z]+" );
// Feature-table qualifier patterns; group 1 is the qualifier value.
66 final Pattern chromosome_PATTERN = Pattern.compile( "\\s+/chromosome=\"(\\w+)\"" );
// Fixed: the character class had no quantifier, so only one-character map
// values could match (e.g. "20q11.22" never did).
67 final Pattern map_PATTERN = Pattern.compile( "\\s+/map=\"([\\w+\\.]+)\"" );
68 final Pattern gene_PATTERN = Pattern.compile( "\\s+/gene=\"(.+)\"" );
69 final Pattern mim_xref_PATTERN = Pattern.compile( "\\s+/db_xref=\"MIM:(\\d+)\"" );
70 final Pattern taxon_xref_PATTERN = Pattern.compile( "\\s+/db_xref=\"taxon:(\\d+)\"" );
// Fixed: InterPro accessions have the form "IPR" + digits (e.g. IPR002475);
// the previous "IP\\d+" could never match them.
71 final Pattern interpro_PATTERN = Pattern.compile( "\\s+/db_xref=\"InterPro:(IPR\\d+)\"" );
72 final Pattern uniprot_PATTERN = Pattern.compile( "\\s+/db_xref=\"UniProtKB/TrEMBL:(\\w+)\"" );
73 final EbiDbEntry e = new EbiDbEntry();
// Collects the (possibly multi-line) DEFINITION text.
74 final StringBuilder def = new StringBuilder();
// Parser state: which section / feature of the record the current line belongs to.
75 boolean in_def = false;
76 boolean in_features = false;
77 boolean in_source = false;
78 boolean in_gene = false;
79 boolean in_cds = false;
80 boolean in_protein = false;
81 for( final String line : lines ) {
82 if ( line.startsWith( "ACCESSION " ) ) {
83 e.setPA( SequenceDbWsTools.extractFrom( line, "ACCESSION" ) );
86 else if ( line.startsWith( "DEFINITION " ) ) {
// The definition is cut at the first "[" if there is one, else at the first ".",
// else the whole remainder of the line is taken.
87 if ( line.indexOf( "[" ) > 0 ) {
88 def.append( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) );
90 else if ( line.indexOf( "." ) > 0 ) {
91 def.append( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) );
94 def.append( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) );
98 else if ( line.startsWith( " ORGANISM " ) ) {
// As for the definition: drop a trailing parenthesized part when present.
99 if ( line.indexOf( "(" ) > 0 ) {
100 e.setOs( SequenceDbWsTools.extractFromTo( line, " ORGANISM", "(" ) );
103 e.setOs( SequenceDbWsTools.extractFrom( line, " ORGANISM" ) );
// Indented line while a definition is open: continuation of the DEFINITION text,
// cut at "[" or "." with the same precedence as on the first line.
107 else if ( line.startsWith( " " ) && in_def ) {
109 if ( line.indexOf( "[" ) > 0 ) {
110 def.append( SequenceDbWsTools.extractTo( line, "[" ) );
112 else if ( line.indexOf( "." ) > 0 ) {
113 def.append( SequenceDbWsTools.extractTo( line, "." ) );
116 def.append( line.trim() );
// A line starting with an upper-case keyword begins a new top-level section.
122 if ( X_PATTERN.matcher( line ).find() ) {
130 if ( line.startsWith( "FEATURES " ) ) {
// Inside the feature table, track which feature the following qualifiers belong to.
133 if ( in_features && line.startsWith( " source " ) ) {
139 if ( in_features && line.startsWith( " gene " ) ) {
145 if ( in_features && line.startsWith( " CDS " ) ) {
151 if ( in_features && line.startsWith( " Protein " ) ) {
// Store the accumulated definition only if any text was collected.
158 if ( def.length() > 0 ) {
159 e.setDe( def.toString().trim() );
163 // FIXME: this is actually an NCBI entry
164 //http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/emb/AAR37336/
// Taxonomy identifier; written by setTaxId.
168 private String _tax_id;
// NOTE(review): no writer for _symbol is visible in this part of the file -- confirm where it is set.
169 private String _symbol;
// Provider; written by setProvider.
170 private String _provider;
// Cross references; created on first use in addCrossReference, exposed by getCrossReferences.
171 private ArrayList<Accession> _cross_references;
// Gene name; written by setGeneName, which keeps the first value it receives.
172 private String _gene_name;
174 // TODO PUBMED 15798186
176 // source /db_xref="taxon:9606"
180 // /db_xref="MIM:604739"
183 // /db_xref="MIM:604739"
184 // /db_xref="InterPro:IPR002475"
186 // /db_xref="UniProtKB/TrEMBL:Q5J7V1" <- reparse?
190 LOCUS NM_184234 2881 bp mRNA linear PRI 16-JUN-2013
191 DEFINITION Homo sapiens RNA binding motif protein 39 (RBM39), transcript
194 VERSION NM_184234.2 GI:336176061
196 SOURCE Homo sapiens (human)
197 ORGANISM Homo sapiens
198 Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
199 Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
200 Catarrhini; Hominidae; Homo.
201 REFERENCE 1 (bases 1 to 2881)
202 AUTHORS Sillars-Hardebol,A.H., Carvalho,B., Belien,J.A., de Wit,M.,
203 Delis-van Diemen,P.M., Tijssen,M., van de Wiel,M.A., Ponten,F.,
204 Meijer,G.A. and Fijneman,R.J.
205 TITLE CSE1L, DIDO1 and RBM39 in colorectal adenoma to carcinoma
207 JOURNAL Cell Oncol (Dordr) 35 (4), 293-300 (2012)
209 REMARK GeneRIF: Data show that CSE1L, DIDO1 and RBM39 mRNA expression
210 levels correlated with chromosome 20q DNA copy number status.
211 REFERENCE 2 (bases 1 to 2881)
212 AUTHORS Huang,G., Zhou,Z., Wang,H. and Kleinerman,E.S.
213 TITLE CAPER-alpha alternative splicing regulates the expression of
214 vascular endothelial growth factor(1)(6)(5) in Ewing sarcoma cells
215 JOURNAL Cancer 118 (8), 2106-2116 (2012)
217 REMARK GeneRIF: Increased VEGF(165) expression is secondary to the
218 down-regulation of CAPER-alpha by EWS/FLI-1. CAPER-alpha mediates
219 alternative splicing and controls the shift from VEGF(189) to
221 REFERENCE 3 (bases 1 to 2881)
222 AUTHORS Han,B., Stockwin,L.H., Hancock,C., Yu,S.X., Hollingshead,M.G. and
224 TITLE Proteomic analysis of nuclei isolated from cancer cell lines
225 treated with indenoisoquinoline NSC 724998, a novel topoisomerase I
227 JOURNAL J. Proteome Res. 9 (8), 4016-4027 (2010)
229 REMARK Erratum:[J Proteome Res. 2011 Apr 1;10(4):2128]
230 REFERENCE 4 (bases 1 to 2881)
231 AUTHORS Zhang,J.Y., Looi,K.S. and Tan,E.M.
232 TITLE Identification of tumor-associated antigens as diagnostic and
233 predictive biomarkers in cancer
234 JOURNAL Methods Mol. Biol. 520, 1-10 (2009)
236 REFERENCE 5 (bases 1 to 2881)
237 AUTHORS Dutta,J., Fan,G. and Gelinas,C.
238 TITLE CAPERalpha is a novel Rel-TAD-interacting factor that inhibits
239 lymphocyte transformation by the potent Rel/NF-kappaB oncoprotein
241 JOURNAL J. Virol. 82 (21), 10792-10802 (2008)
243 REMARK GeneRIF: this study identifies CAPERalpha (RNA binding motif
244 protein 39) as a new transcriptional coregulator for v-Rel and
245 reveals an important role in modulating Rel's oncogenic activity.
246 REFERENCE 6 (bases 1 to 2881)
247 AUTHORS Cazalla,D., Newton,K. and Caceres,J.F.
248 TITLE A novel SR-related protein is required for the second step of
250 JOURNAL Mol. Cell. Biol. 25 (8), 2969-2980 (2005)
252 REFERENCE 7 (bases 1 to 2881)
253 AUTHORS Dowhan,D.H., Hong,E.P., Auboeuf,D., Dennis,A.P., Wilson,M.M.,
254 Berget,S.M. and O'Malley,B.W.
255 TITLE Steroid hormone receptor coactivation and alternative RNA splicing
256 by U2AF65-related proteins CAPERalpha and CAPERbeta
257 JOURNAL Mol. Cell 17 (3), 429-439 (2005)
259 REFERENCE 8 (bases 1 to 2881)
260 AUTHORS Sun,N.N., Fastje,C.D., Wong,S.S., Sheppard,P.R., Macdonald,S.J.,
261 Ridenour,G., Hyde,J.D. and Witten,M.L.
262 TITLE Dose-dependent transcriptome changes by metal ores on a human acute
263 lymphoblastic leukemia cell line
264 JOURNAL Toxicol Ind Health 19 (7-10), 157-163 (2003)
266 REMARK GeneRIF: 10 genes were down-regulated following treatment of the
267 T-ALL cells with 0.15 and 1.5 microg/mL of metal ores at 72 h
268 REFERENCE 9 (bases 1 to 2881)
269 AUTHORS Jung,D.J., Na,S.Y., Na,D.S. and Lee,J.W.
270 TITLE Molecular cloning and characterization of CAPER, a novel
271 coactivator of activating protein-1 and estrogen receptors
272 JOURNAL J. Biol. Chem. 277 (2), 1229-1234 (2002)
274 REMARK GeneRIF: This paper describes the mouse gene.
275 REFERENCE 10 (bases 1 to 2881)
276 AUTHORS Imai,H., Chan,E.K., Kiyosawa,K., Fu,X.D. and Tan,E.M.
277 TITLE Novel nuclear autoantigen with splicing factor motifs identified
278 with antibody from hepatocellular carcinoma
279 JOURNAL J. Clin. Invest. 92 (5), 2419-2426 (1993)
281 COMMENT REVIEWED REFSEQ: This record has been curated by NCBI staff. The
282 reference sequence was derived from DC346351.1, BC141835.1 and
284 On Jun 16, 2011 this sequence version replaced gi:35493810.
286 Summary: This gene encodes a member of the U2AF65 family of
287 proteins. The encoded protein is found in the nucleus, where it
288 co-localizes with core spliceosomal proteins. It has been shown to
289 play a role in both steroid hormone receptor-mediated transcription
290 and alternative splicing, and it is also a transcriptional
291 coregulator of the viral oncoprotein v-Rel. Multiple transcript
292 variants have been observed for this gene. A related pseudogene has
293 been identified on chromosome X. [provided by RefSeq, Aug 2011].
295 Transcript Variant: This variant (1) encodes the longest isoform
296 (a, also called CC1.4).
298 Publication Note: This RefSeq record includes a subset of the
299 publications that are available for this gene. Please see the Gene
300 record to access additional publications.
302 ##Evidence-Data-START##
303 Transcript exon combination :: BC141835.1, L10911.1 [ECO:0000332]
304 RNAseq introns :: mixed/partial sample support
305 ERS025081, ERS025082 [ECO:0000350]
306 ##Evidence-Data-END##
307 COMPLETENESS: complete on the 3' end.
308 PRIMARY REFSEQ_SPAN PRIMARY_IDENTIFIER PRIMARY_SPAN COMP
309 1-578 DC346351.1 3-580
310 579-2872 BC141835.1 429-2722
311 2873-2881 C75555.1 1-9 c
312 FEATURES Location/Qualifiers
314 /organism="Homo sapiens"
316 /db_xref="taxon:9606"
321 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
322 /note="RNA binding motif protein 39"
323 /db_xref="GeneID:9584"
324 /db_xref="HGNC:15923"
325 /db_xref="HPRD:09201"
326 /db_xref="MIM:604739"
329 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
330 /inference="alignment:Splign:1.39.8"
333 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
334 /standard_name="REN58946"
335 /db_xref="UniSTS:383746"
336 misc_feature 221..223
338 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
339 /note="upstream in-frame stop codon"
342 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
343 /standard_name="G64285"
344 /db_xref="UniSTS:158667"
347 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
348 /inference="alignment:Splign:1.39.8"
351 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
352 /note="isoform a is encoded by transcript variant 1;
353 coactivator of activating protein-1 and estrogen
354 receptors; functional spliceosome-associated protein 59;
355 RNA-binding region (RNP1, RRM) containing 2;
356 hepatocellular carcinoma protein 1; splicing factor HCC1"
358 /product="RNA-binding protein 39 isoform a"
359 /protein_id="NP_909122.1"
360 /db_xref="GI:35493811"
361 /db_xref="CCDS:CCDS13266.1"
362 /db_xref="GeneID:9584"
363 /db_xref="HGNC:15923"
364 /db_xref="HPRD:09201"
365 /db_xref="MIM:604739"
366 /translation="MADDIDIEAMLEAPYKKDENKLSSANGHEERSKKRKKSKSRSRS
367 HERKRSKSKERKRSRDRERKKSKSRERKRSRSKERRRSRSRSRDRRFRGRYRSPYSGP
368 KFNSAIRGKIGLPHSIKLSRRRSRSKSPFRKDKSPVREPIDNLTPEERDARTVFCMQL
369 AARIRPRDLEEFFSTVGKVRDVRMISDRNSRRSKGIAYVEFVDVSSVPLAIGLTGQRV
370 LGVPIIVQASQAEKNRAAAMANNLQKGSAGPMRLYVGSLHFNITEDMLRGIFEPFGRI
371 ESIQLMMDSETGRSKGYGFITFSDSECAKKALEQLNGFELAGRPMKVGHVTERTDASS
372 ASSFLDSDELERTGIDLGTTGRLQLMARLAEGTGLQIPPAAQQALQMSGSLAFGAVAE
373 FSFVIDLQTRLSQQTEASALAAAASVQPLATQCFQLSNMFNPQTEEEVGWDTEIKDDV
374 IEECNKHGGVIHIYVDKNSAQGNVYVKCPSIAAAIAAVNALHGRWFAGKMITAAYVPL
375 PTYHNLFPDSMTATQLLVPSRR"
376 misc_feature 413..415
378 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
379 /experiment="experimental evidence, no additional details
381 /note="N-acetylalanine; propagated from
382 UniProtKB/Swiss-Prot (Q14498.2); acetylation site"
386 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
387 /inference="alignment:Splign:1.39.8"
391 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
392 /inference="alignment:Splign:1.39.8"
395 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
396 /standard_name="REN58786"
397 /db_xref="UniSTS:383586"
400 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
401 /standard_name="D19S1033"
402 /db_xref="UniSTS:154759"
405 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
406 /standard_name="REN58785"
407 /db_xref="UniSTS:383585"
409 polyA_signal 2851..2856
411 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
414 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
416 1 atttggagct tggggcagct tctcgcgaga gcccgtgctg agggctctgt gaggccccgt
417 61 gtgtttgtgt gtgtgtatgt gtgctggtga atgtgagtac agggaagcag cggccgccat
418 121 ttcagggagc ttgtcgacgc tgtcgcaggg gtggatcctg agctgccgaa gccgccgtcc
419 181 tgctctcccg cgtgggcttc tctaattcca ttgttttttt tagattctct cgggcctagc
420 241 cgtccttgga acccgatatt cgggctgggc ggttccgcgg cctgggccta ggggcttaac
425 private EbiDbEntry() {
// Appends the given accession to this entry's cross references, creating the
// backing list on first use. (A leftover debug print to stdout was removed.)
428 private void addCrossReference( final Accession accession ) {
429 if ( _cross_references == null ) {
430 _cross_references = new ArrayList<Accession>();
433 _cross_references.add( accession );
// Cloning is not supported: this always throws CloneNotSupportedException.
437 public Object clone() throws CloneNotSupportedException {
438 throw new CloneNotSupportedException();
442 public String getAccession() {
// Returns the internal cross-reference list itself (no copy is made).
// NOTE(review): the only visible initialization of the list is in
// addCrossReference, so this appears to return null until a cross reference
// has been added -- confirm, and make callers null-safe if so.
447 public List<Accession> getCrossReferences() {
448 return _cross_references;
452 public String getGeneName() {
457 public List<GoTerm> getGoTerms() {
462 public String getProvider() {
467 public String getSequenceName() {
472 public String getSequenceSymbol() {
477 public String getTaxonomyIdentifier() {
482 public String getTaxonomyScientificName() {
// True only when accession, sequence name, taxonomy scientific name, taxonomy
// identifier and sequence symbol are ALL empty according to ForesterUtil.isEmpty.
// Gene name, provider and cross references are not consulted.
487 public boolean isEmpty() {
488 return ( ForesterUtil.isEmpty( getAccession() ) && ForesterUtil.isEmpty( getSequenceName() )
489 && ForesterUtil.isEmpty( getTaxonomyScientificName() )
490 && ForesterUtil.isEmpty( getTaxonomyIdentifier() ) && ForesterUtil.isEmpty( getSequenceSymbol() ) );
493 private void setDe( final String rec_name ) {
// Stores gene_name only while no gene name has been stored yet (field is null);
// later calls leave the first stored value in place.
499 private void setGeneName( final String gene_name ) {
500 if ( _gene_name == null ) {
501 _gene_name = gene_name;
505 private void setOs( final String os ) {
511 private void setPA( final String pa ) {
// Sets the provider, unconditionally replacing any previous value.
517 public void setProvider( final String provider ) {
518 _provider = provider;
521 private void setTaxId( final String tax_id ) {
522 if ( _tax_id == null ) {