2 // forester -- software libraries and applications
3 // for genomics and evolutionary biology research.
5 // Copyright (C) 2010 Christian M Zmasek
6 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26 package org.forester.ws.seqdb;
28 import java.util.ArrayList;
29 import java.util.List;
30 import java.util.regex.Matcher;
31 import java.util.regex.Pattern;
33 import org.forester.go.GoTerm;
34 import org.forester.phylogeny.data.Accession;
35 import org.forester.phylogeny.data.Annotation;
36 import org.forester.util.ForesterUtil;
/**
 * A SequenceDatabaseEntry whose fields are filled in by parsing the lines of a
 * plain-text database record. The constructor is private; instances are created by
 * the static createInstanceFromPlainText...() factory methods.
 */
38 public final class EbiDbEntry implements SequenceDatabaseEntry {
/**
 * Builds an entry from the lines of a plain-text record, dispatching on the line
 * prefixes "PA", "DE", "OS" and "OX".
 * <p>
 * NOTE(review): some lines of this method (closing braces, an "else" and the return
 * statement) are not visible in this excerpt; the comments describe only the
 * statements that are visible.
 *
 * @param lines the lines of the record
 * @return presumably the populated entry -- the return statement is not visible here
 */
40 public static SequenceDatabaseEntry createInstanceFromPlainText( final List<String> lines ) {
41 final EbiDbEntry e = new EbiDbEntry();
42 for( final String line : lines ) {
// "PA" line: the extracted value is handed to setPA().
43 if ( line.startsWith( "PA" ) ) {
44 e.setPA( SequenceDbWsTools.extractFrom( line, "PA" ) );
// "DE" line: the extracted value is handed to setDe().
46 else if ( line.startsWith( "DE" ) ) {
47 e.setDe( SequenceDbWsTools.extractFrom( line, "DE" ) );
// "OS" line: when the line contains "(", extraction is bounded by that "(".
49 else if ( line.startsWith( "OS" ) ) {
50 if ( line.indexOf( "(" ) > 0 ) {
51 e.setOs( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
// Presumably the "else" branch (the "else" itself is not visible): unbounded extraction.
54 e.setOs( SequenceDbWsTools.extractFrom( line, "OS" ) );
// "OX" line: the taxonomy id is extracted between "NCBI_TaxID=" and ";".
57 else if ( line.startsWith( "OX" ) ) {
58 if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) {
59 e.setTaxId( SequenceDbWsTools.extractFromTo( line, "NCBI_TaxID=", ";" ) );
/**
 * Builds an entry from the lines of a plain-text record, recognising both
 * RefSeq-style keywords ("ACCESSION", "DEFINITION", "ORGANISM", "FEATURES") and
 * two-letter line codes ("ID", "DE", "OS").
 * <p>
 * The definition may span several lines: after a DEFINITION/DE line, following lines
 * that start with a blank are appended to it until another kind of line is seen.
 * <p>
 * NOTE(review): many lines of this method (closing braces, "else" keywords, the bodies
 * of the feature-section checks and the return statement) are not visible in this
 * excerpt. They are left exactly as they are; only the three regular expressions
 * flagged below were changed.
 *
 * @param lines the lines of the record
 * @return presumably the populated entry -- the return statement is not visible here
 */
66 public static SequenceDatabaseEntry createInstanceFromPlainTextForRefSeq( final List<String> lines ) {
// Matches a line that begins with one or more upper-case letters.
67 final Pattern X_PATTERN = Pattern.compile( "^[A-Z]+" );
68 final Pattern chromosome_PATTERN = Pattern.compile( "\\s+/chromosome=\"(\\w+)\"" );
// Fixed: the character class needs a "+" quantifier, otherwise only one-character
// map values could ever match.
69 final Pattern map_PATTERN = Pattern.compile( "\\s+/map=\"([\\w+\\.]+)\"" );
70 final Pattern gene_PATTERN = Pattern.compile( "\\s+/gene=\"(.+)\"" );
71 final Pattern mim_xref_PATTERN = Pattern.compile( "\\s+/db_xref=\"MIM:(\\d+)\"" );
72 final Pattern taxon_xref_PATTERN = Pattern.compile( "\\s+/db_xref=\"taxon:(\\d+)\"" );
// Fixed: InterPro identifiers look like "IPR002475"; the former "IP\\d+" could not
// match them because "R" is not a digit.
73 final Pattern interpro_PATTERN = Pattern.compile( "\\s+/db_xref=\"InterPro:(IPR\\d+)\"" );
74 final Pattern uniprot_PATTERN = Pattern.compile( "\\s+/db_xref=\"UniProtKB/TrEMBL:(\\w+)\"" );
// Fixed: added the capturing group that m.group( 1 ) below relies on; without it
// Matcher.group( 1 ) throws IndexOutOfBoundsException.
75 final Pattern ec_PATTERN = Pattern.compile( "\\s+/EC_number=\"([\\.\\-\\d]+)\"" );
76 final EbiDbEntry e = new EbiDbEntry();
// Accumulates the (possibly multi-line) definition text.
77 final StringBuilder def = new StringBuilder();
// Parser state: inside a definition continuation / inside the FEATURES section and
// its sub-sections.
78 boolean in_definition = false;
79 boolean in_features = false;
80 boolean in_source = false;
81 boolean in_gene = false;
82 boolean in_cds = false;
83 boolean in_protein = false;
84 for( final String line : lines ) {
// Accession: taken from an "ACCESSION " line or from an "ID " line (up to ";").
85 if ( line.startsWith( "ACCESSION " ) ) {
86 e.setPA( SequenceDbWsTools.extractFrom( line, "ACCESSION" ) );
87 in_definition = false;
89 else if ( line.startsWith( "ID " ) ) {
90 e.setPA( SequenceDbWsTools.extractFromTo( line, "ID", ";" ) );
91 in_definition = false;
// First line of the definition. Extraction is bounded by "[" if present, otherwise
// by "." if present, otherwise it is unbounded. Each case exists in a "DEFINITION"
// and a "DE" flavour.
93 else if ( line.startsWith( "DEFINITION " ) || ( line.startsWith( "DE " ) ) ) {
// NOTE(review): "definiton" is misspelled; it is left unchanged because its other
// uses are on lines that are not visible in this excerpt.
94 boolean definiton = false;
95 if ( line.startsWith( "DEFINITION " ) ) {
98 if ( line.indexOf( "[" ) > 0 ) {
100 x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) ) );
103 x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "[" ) ) );
106 else if ( line.indexOf( "." ) > 0 ) {
108 x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) ) );
111 x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "." ) ) );
116 x( def, ( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) ) );
119 x( def, ( SequenceDbWsTools.extractFrom( line, "DE" ) ) );
// Following lines that start with a blank are treated as continuations.
123 in_definition = true;
// Organism name: extraction is bounded by "(" when the line contains one.
126 else if ( line.startsWith( " ORGANISM " ) ) {
127 if ( line.indexOf( "(" ) > 0 ) {
128 e.setOs( SequenceDbWsTools.extractFromTo( line, " ORGANISM", "(" ) );
131 e.setOs( SequenceDbWsTools.extractFrom( line, " ORGANISM" ) );
135 else if ( line.startsWith( "OS " ) ) {
136 if ( line.indexOf( "(" ) > 0 ) {
137 e.setOs( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
140 e.setOs( SequenceDbWsTools.extractFrom( line, "OS" ) );
// Continuation line of the definition, bounded by "[" or "." like the first line.
143 else if ( line.startsWith( " " ) && in_definition ) {
145 if ( line.indexOf( "[" ) > 0 ) {
146 def.append( SequenceDbWsTools.extractTo( line, "[" ) );
148 else if ( line.indexOf( "." ) > 0 ) {
149 def.append( SequenceDbWsTools.extractTo( line, "." ) );
152 def.append( line.trim() );
156 in_definition = false;
// NOTE(review): the bodies of the following checks are not visible in this excerpt;
// presumably they maintain the in_* flags declared above -- confirm.
158 if ( X_PATTERN.matcher( line ).find() ) {
166 if ( line.startsWith( "FEATURES " ) ) {
169 if ( in_features && line.startsWith( " source " ) ) {
175 if ( in_features && line.startsWith( " gene " ) ) {
181 if ( in_features && line.startsWith( " CDS " ) ) {
187 if ( in_features && line.startsWith( " Protein " ) ) {
// Inside a Protein or CDS feature: record an /EC_number qualifier as an "EC"
// annotation. NOTE(review): the m.find() guard is presumably on the line not
// visible between these two statements.
193 if ( in_protein || in_cds ) {
194 final Matcher m = ec_PATTERN.matcher( line );
196 e.addAnnotation( new Annotation( "EC", m.group( 1 ) ) );
// Store the accumulated definition, if any, as the entry's description.
200 if ( def.length() > 0 ) {
201 e.setDe( def.toString().trim() );
/**
 * Appends the trimmed text s to sb.
 * NOTE(review): the statement executed when sb is already non-empty is on a line that
 * is not visible in this excerpt (presumably it appends a separator) -- confirm.
 *
 * @param sb the builder to append to
 * @param s the text to append after trimming
 */
206 private static void x( final StringBuilder sb, final String s ) {
207 if ( sb.length() > 0 ) {
210 sb.append( s.trim() );
212 // FIXME: actually, this is an NCBI entry
213 // http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/emb/AAR37336/
// Taxonomy identifier; setTaxId() checks that it is still null before touching it.
217 private String _tax_id;
// Sequence symbol. NOTE(review): no assignment to this field is visible in this excerpt.
218 private String _symbol;
// Provider, assigned by the public setProvider().
219 private String _provider;
// Cross references; the list is created by addCrossReference() on first use.
220 private List<Accession> _cross_references;
// Annotations; the list is created by addAnnotation() on first use.
221 private List<Annotation> _annotations;
// Gene name; setGeneName() assigns it only while it is still null.
222 private String _gene_name;
224 // TODO PUBMED 15798186
226 // source /db_xref="taxon:9606"
230 // /db_xref="MIM:604739"
233 // /db_xref="MIM:604739"
234 // /db_xref="InterPro:IPR002475"
236 // /db_xref="UniProtKB/TrEMBL:Q5J7V1" <- reparse?
240 LOCUS NM_184234 2881 bp mRNA linear PRI 16-JUN-2013
241 DEFINITION Homo sapiens RNA binding motif protein 39 (RBM39), transcript
244 VERSION NM_184234.2 GI:336176061
246 SOURCE Homo sapiens (human)
247 ORGANISM Homo sapiens
248 Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
249 Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
250 Catarrhini; Hominidae; Homo.
251 REFERENCE 1 (bases 1 to 2881)
252 AUTHORS Sillars-Hardebol,A.H., Carvalho,B., Belien,J.A., de Wit,M.,
253 Delis-van Diemen,P.M., Tijssen,M., van de Wiel,M.A., Ponten,F.,
254 Meijer,G.A. and Fijneman,R.J.
255 TITLE CSE1L, DIDO1 and RBM39 in colorectal adenoma to carcinoma
257 JOURNAL Cell Oncol (Dordr) 35 (4), 293-300 (2012)
259 REMARK GeneRIF: Data show that CSE1L, DIDO1 and RBM39 mRNA expression
260 levels correlated with chromosome 20q DNA copy number status.
261 REFERENCE 2 (bases 1 to 2881)
262 AUTHORS Huang,G., Zhou,Z., Wang,H. and Kleinerman,E.S.
263 TITLE CAPER-alpha alternative splicing regulates the expression of
264 vascular endothelial growth factor(1)(6)(5) in Ewing sarcoma cells
265 JOURNAL Cancer 118 (8), 2106-2116 (2012)
267 REMARK GeneRIF: Increased VEGF(165) expression is secondary to the
268 down-regulation of CAPER-alpha by EWS/FLI-1. CAPER-alpha mediates
269 alternative splicing and controls the shift from VEGF(189) to
271 REFERENCE 3 (bases 1 to 2881)
272 AUTHORS Han,B., Stockwin,L.H., Hancock,C., Yu,S.X., Hollingshead,M.G. and
274 TITLE Proteomic analysis of nuclei isolated from cancer cell lines
275 treated with indenoisoquinoline NSC 724998, a novel topoisomerase I
277 JOURNAL J. Proteome Res. 9 (8), 4016-4027 (2010)
279 REMARK Erratum:[J Proteome Res. 2011 Apr 1;10(4):2128]
280 REFERENCE 4 (bases 1 to 2881)
281 AUTHORS Zhang,J.Y., Looi,K.S. and Tan,E.M.
282 TITLE Identification of tumor-associated antigens as diagnostic and
283 predictive biomarkers in cancer
284 JOURNAL Methods Mol. Biol. 520, 1-10 (2009)
286 REFERENCE 5 (bases 1 to 2881)
287 AUTHORS Dutta,J., Fan,G. and Gelinas,C.
288 TITLE CAPERalpha is a novel Rel-TAD-interacting factor that inhibits
289 lymphocyte transformation by the potent Rel/NF-kappaB oncoprotein
291 JOURNAL J. Virol. 82 (21), 10792-10802 (2008)
293 REMARK GeneRIF: this study identifies CAPERalpha (RNA binding motif
294 protein 39) as a new transcriptional coregulator for v-Rel and
295 reveals an important role in modulating Rel's oncogenic activity.
296 REFERENCE 6 (bases 1 to 2881)
297 AUTHORS Cazalla,D., Newton,K. and Caceres,J.F.
298 TITLE A novel SR-related protein is required for the second step of
300 JOURNAL Mol. Cell. Biol. 25 (8), 2969-2980 (2005)
302 REFERENCE 7 (bases 1 to 2881)
303 AUTHORS Dowhan,D.H., Hong,E.P., Auboeuf,D., Dennis,A.P., Wilson,M.M.,
304 Berget,S.M. and O'Malley,B.W.
305 TITLE Steroid hormone receptor coactivation and alternative RNA splicing
306 by U2AF65-related proteins CAPERalpha and CAPERbeta
307 JOURNAL Mol. Cell 17 (3), 429-439 (2005)
309 REFERENCE 8 (bases 1 to 2881)
310 AUTHORS Sun,N.N., Fastje,C.D., Wong,S.S., Sheppard,P.R., Macdonald,S.J.,
311 Ridenour,G., Hyde,J.D. and Witten,M.L.
312 TITLE Dose-dependent transcriptome changes by metal ores on a human acute
313 lymphoblastic leukemia cell line
314 JOURNAL Toxicol Ind Health 19 (7-10), 157-163 (2003)
316 REMARK GeneRIF: 10 genes were down-regulated following treatment of the
317 T-ALL cells with 0.15 and 1.5 microg/mL of metal ores at 72 h
318 REFERENCE 9 (bases 1 to 2881)
319 AUTHORS Jung,D.J., Na,S.Y., Na,D.S. and Lee,J.W.
320 TITLE Molecular cloning and characterization of CAPER, a novel
321 coactivator of activating protein-1 and estrogen receptors
322 JOURNAL J. Biol. Chem. 277 (2), 1229-1234 (2002)
324 REMARK GeneRIF: This paper describes the mouse gene.
325 REFERENCE 10 (bases 1 to 2881)
326 AUTHORS Imai,H., Chan,E.K., Kiyosawa,K., Fu,X.D. and Tan,E.M.
327 TITLE Novel nuclear autoantigen with splicing factor motifs identified
328 with antibody from hepatocellular carcinoma
329 JOURNAL J. Clin. Invest. 92 (5), 2419-2426 (1993)
331 COMMENT REVIEWED REFSEQ: This record has been curated by NCBI staff. The
332 reference sequence was derived from DC346351.1, BC141835.1 and
334 On Jun 16, 2011 this sequence version replaced gi:35493810.
336 Summary: This gene encodes a member of the U2AF65 family of
337 proteins. The encoded protein is found in the nucleus, where it
338 co-localizes with core spliceosomal proteins. It has been shown to
339 play a role in both steroid hormone receptor-mediated transcription
340 and alternative splicing, and it is also a transcriptional
341 coregulator of the viral oncoprotein v-Rel. Multiple transcript
342 variants have been observed for this gene. A related pseudogene has
343 been identified on chromosome X. [provided by RefSeq, Aug 2011].
345 Transcript Variant: This variant (1) encodes the longest isoform
346 (a, also called CC1.4).
348 Publication Note: This RefSeq record includes a subset of the
349 publications that are available for this gene. Please see the Gene
350 record to access additional publications.
352 ##Evidence-Data-START##
353 Transcript exon combination :: BC141835.1, L10911.1 [ECO:0000332]
354 RNAseq introns :: mixed/partial sample support
355 ERS025081, ERS025082 [ECO:0000350]
356 ##Evidence-Data-END##
357 COMPLETENESS: complete on the 3' end.
358 PRIMARY REFSEQ_SPAN PRIMARY_IDENTIFIER PRIMARY_SPAN COMP
359 1-578 DC346351.1 3-580
360 579-2872 BC141835.1 429-2722
361 2873-2881 C75555.1 1-9 c
362 FEATURES Location/Qualifiers
364 /organism="Homo sapiens"
366 /db_xref="taxon:9606"
371 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
372 /note="RNA binding motif protein 39"
373 /db_xref="GeneID:9584"
374 /db_xref="HGNC:15923"
375 /db_xref="HPRD:09201"
376 /db_xref="MIM:604739"
379 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
380 /inference="alignment:Splign:1.39.8"
383 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
384 /standard_name="REN58946"
385 /db_xref="UniSTS:383746"
386 misc_feature 221..223
388 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
389 /note="upstream in-frame stop codon"
392 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
393 /standard_name="G64285"
394 /db_xref="UniSTS:158667"
397 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
398 /inference="alignment:Splign:1.39.8"
401 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
402 /note="isoform a is encoded by transcript variant 1;
403 coactivator of activating protein-1 and estrogen
404 receptors; functional spliceosome-associated protein 59;
405 RNA-binding region (RNP1, RRM) containing 2;
406 hepatocellular carcinoma protein 1; splicing factor HCC1"
408 /product="RNA-binding protein 39 isoform a"
409 /protein_id="NP_909122.1"
410 /db_xref="GI:35493811"
411 /db_xref="CCDS:CCDS13266.1"
412 /db_xref="GeneID:9584"
413 /db_xref="HGNC:15923"
414 /db_xref="HPRD:09201"
415 /db_xref="MIM:604739"
416 /translation="MADDIDIEAMLEAPYKKDENKLSSANGHEERSKKRKKSKSRSRS
417 HERKRSKSKERKRSRDRERKKSKSRERKRSRSKERRRSRSRSRDRRFRGRYRSPYSGP
418 KFNSAIRGKIGLPHSIKLSRRRSRSKSPFRKDKSPVREPIDNLTPEERDARTVFCMQL
419 AARIRPRDLEEFFSTVGKVRDVRMISDRNSRRSKGIAYVEFVDVSSVPLAIGLTGQRV
420 LGVPIIVQASQAEKNRAAAMANNLQKGSAGPMRLYVGSLHFNITEDMLRGIFEPFGRI
421 ESIQLMMDSETGRSKGYGFITFSDSECAKKALEQLNGFELAGRPMKVGHVTERTDASS
422 ASSFLDSDELERTGIDLGTTGRLQLMARLAEGTGLQIPPAAQQALQMSGSLAFGAVAE
423 FSFVIDLQTRLSQQTEASALAAAASVQPLATQCFQLSNMFNPQTEEEVGWDTEIKDDV
424 IEECNKHGGVIHIYVDKNSAQGNVYVKCPSIAAAIAAVNALHGRWFAGKMITAAYVPL
425 PTYHNLFPDSMTATQLLVPSRR"
426 misc_feature 413..415
428 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
429 /experiment="experimental evidence, no additional details
431 /note="N-acetylalanine; propagated from
432 UniProtKB/Swiss-Prot (Q14498.2); acetylation site"
436 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
437 /inference="alignment:Splign:1.39.8"
441 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
442 /inference="alignment:Splign:1.39.8"
445 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
446 /standard_name="REN58786"
447 /db_xref="UniSTS:383586"
450 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
451 /standard_name="D19S1033"
452 /db_xref="UniSTS:154759"
455 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
456 /standard_name="REN58785"
457 /db_xref="UniSTS:383585"
459 polyA_signal 2851..2856
461 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
464 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
466 1 atttggagct tggggcagct tctcgcgaga gcccgtgctg agggctctgt gaggccccgt
467 61 gtgtttgtgt gtgtgtatgt gtgctggtga atgtgagtac agggaagcag cggccgccat
468 121 ttcagggagc ttgtcgacgc tgtcgcaggg gtggatcctg agctgccgaa gccgccgtcc
469 181 tgctctcccg cgtgggcttc tctaattcca ttgttttttt tagattctct cgggcctagc
470 241 cgtccttgga acccgatatt cgggctgggc ggttccgcgg cctgggccta ggggcttaac
// Private constructor: instances are created by the static createInstanceFrom...() factories.
475 private EbiDbEntry() {
/**
 * Adds a cross reference, creating the backing list on first use.
 * The former debug print to System.out was removed: parsing code in a library should
 * not write to standard output.
 *
 * @param accession the cross reference to add
 */
478 private void addCrossReference( final Accession accession ) {
479 if ( _cross_references == null ) {
480 _cross_references = new ArrayList<Accession>();
483 _cross_references.add( accession );
// Cloning is not supported: this method always throws CloneNotSupportedException.
487 public Object clone() throws CloneNotSupportedException {
488 throw new CloneNotSupportedException();
// Accessor for the accession. NOTE(review): the method body is not visible in this excerpt.
492 public String getAccession() {
// Returns the internal cross-reference list itself (not a copy). The list is only
// created by addCrossReference(), so callers should be prepared for null.
497 public List<Accession> getCrossReferences() {
498 return _cross_references;
// Accessor for the gene name. NOTE(review): body not visible here; presumably returns _gene_name -- confirm.
502 public String getGeneName() {
// Accessor for the GO terms. NOTE(review): the method body is not visible in this excerpt.
507 public List<GoTerm> getGoTerms() {
// Accessor for the provider. NOTE(review): body not visible here; presumably returns _provider -- confirm.
512 public String getProvider() {
// Accessor for the sequence name. NOTE(review): the method body is not visible in this excerpt.
517 public String getSequenceName() {
// Accessor for the sequence symbol. NOTE(review): body not visible here; presumably returns _symbol -- confirm.
522 public String getSequenceSymbol() {
// Accessor for the taxonomy identifier. NOTE(review): body not visible here; presumably returns _tax_id -- confirm.
527 public String getTaxonomyIdentifier() {
// Accessor for the taxonomy scientific name. NOTE(review): the method body is not visible in this excerpt.
532 public String getTaxonomyScientificName() {
/**
 * Reports whether this entry carries no data.
 * Only the accession, sequence name, taxonomy scientific name, taxonomy identifier and
 * sequence symbol are checked (each via ForesterUtil.isEmpty()); gene name, provider,
 * cross references and annotations do not take part in the check.
 *
 * @return true if all five checked values are empty
 */
537 public boolean isEmpty() {
538 return ( ForesterUtil.isEmpty( getAccession() ) && ForesterUtil.isEmpty( getSequenceName() )
539 && ForesterUtil.isEmpty( getTaxonomyScientificName() )
540 && ForesterUtil.isEmpty( getTaxonomyIdentifier() ) && ForesterUtil.isEmpty( getSequenceSymbol() ) );
// Setter fed by the "DE"/"DEFINITION" parsing in the factory methods.
// NOTE(review): the method body is not visible in this excerpt.
543 private void setDe( final String rec_name ) {
// Sets the gene name only if none has been set yet, so the first value wins.
549 private void setGeneName( final String gene_name ) {
550 if ( _gene_name == null ) {
551 _gene_name = gene_name;
// Setter fed by the "OS"/"ORGANISM" parsing in the factory methods.
// NOTE(review): the method body is not visible in this excerpt.
555 private void setOs( final String os ) {
// Setter fed by the "PA"/"ACCESSION"/"ID" parsing in the factory methods.
// NOTE(review): the method body is not visible in this excerpt.
561 private void setPA( final String pa ) {
// Sets the provider; unlike setGeneName(), the value is overwritten unconditionally.
567 public void setProvider( final String provider ) {
568 _provider = provider;
// Acts only while no taxonomy id has been set yet.
// NOTE(review): the assignment inside the null check is on a line not visible here.
571 private void setTaxId( final String tax_id ) {
572 if ( _tax_id == null ) {
// Accessor for the annotations. NOTE(review): body not visible here; presumably returns _annotations -- confirm.
578 public List<Annotation> getAnnotations() {
// Adds an annotation, creating the backing list on first use.
582 private void addAnnotation( final Annotation annotation ) {
583 if ( _annotations == null ) {
584 _annotations = new ArrayList<Annotation>();
586 _annotations.add( annotation );