2 // forester -- software libraries and applications
3 // for genomics and evolutionary biology research.
5 // Copyright (C) 2010 Christian M Zmasek
6 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26 package org.forester.ws.seqdb;
28 import java.util.ArrayList;
29 import java.util.List;
30 import java.util.regex.Matcher;
31 import java.util.regex.Pattern;
33 import org.forester.go.GoTerm;
34 import org.forester.phylogeny.data.Accession;
35 import org.forester.phylogeny.data.Annotation;
36 import org.forester.util.ForesterUtil;
38 public final class EbiDbEntry implements SequenceDatabaseEntry {
40 // public static SequenceDatabaseEntry createInstanceFromPlainText( final List<String> lines ) {
41 // final EbiDbEntry e = new EbiDbEntry();
42 // for( final String line : lines ) {
43 // if ( line.startsWith( "PA" ) ) {
44 // e.setPA( SequenceDbWsTools.extractFrom( line, "PA" ) );
46 // else if ( line.startsWith( "DE" ) ) {
47 // e.setDe( SequenceDbWsTools.extractFrom( line, "DE" ) );
49 // else if ( line.startsWith( "OS" ) ) {
50 // if ( line.indexOf( "(" ) > 0 ) {
51 // e.setOs( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
54 // e.setOs( SequenceDbWsTools.extractFrom( line, "OS" ) );
57 // else if ( line.startsWith( "OX" ) ) {
58 // if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) {
59 // e.setTaxId( SequenceDbWsTools.extractFromTo( line, "NCBI_TaxID=", ";" ) );
/**
 * Parses the lines of a plain-text sequence record into an entry.
 * <p>
 * Line prefixes of two flavours are recognised side by side: the long
 * keywords ACCESSION / DEFINITION / ORGANISM / FEATURES and the two-letter
 * codes ID / DE / OS / FT. Definition text is accumulated in {@code def}
 * (continuation lines are appended while {@code in_definition} is set) and
 * stored as the sequence name at the end. Inside the feature table the
 * taxon cross-reference, EC numbers and the gene name are pulled out with
 * the qualifier patterns declared at the top of the method.
 *
 * @param lines the record, one list element per text line
 * @return NOTE(review): the return statement is not part of this listing;
 *         presumably the entry {@code e} populated here
 */
65 public static SequenceDatabaseEntry createInstanceFromPlainTextForRefSeq( final List<String> lines ) {
// Matches a line that starts with upper-case letters (a new top-level keyword).
66 final Pattern X_PATTERN = Pattern.compile( "^[A-Z]+" );
// Feature-table qualifier patterns; group 1 is the quoted value.
// NOTE(review): chromosome, map, MIM, InterPro and UniProt patterns have no
// visible use in this listing, and all patterns are recompiled on every call.
67 final Pattern chromosome_PATTERN = Pattern.compile( "\\s+/chromosome=\"(\\w+)\"" );
// The quantifier belongs outside the character class; inside it, "+" was a
// literal and the group could only ever capture a single character.
68 final Pattern map_PATTERN = Pattern.compile( "\\s+/map=\"([\\w+\\.]+)\"" );
69 final Pattern gene_PATTERN = Pattern.compile( "\\s+/gene=\"(.+)\"" );
70 final Pattern mim_xref_PATTERN = Pattern.compile( "\\s+/db_xref=\"MIM:(\\d+)\"" );
71 final Pattern taxon_xref_PATTERN = Pattern.compile( "\\s+/db_xref=\"taxon:(\\d+)\"" );
// InterPro accessions look like IPR002475; the former "IP" followed by digits
// could never match because of the "R".
72 final Pattern interpro_PATTERN = Pattern.compile( "\\s+/db_xref=\"InterPro:(IPR\\d+)\"" );
73 final Pattern uniprot_PATTERN = Pattern.compile( "\\s+/db_xref=\"UniProtKB/TrEMBL:(\\w+)\"" );
74 final Pattern ec_PATTERN = Pattern.compile( "\\s+/EC_number=\"([\\.\\-\\d]+)\"" );
75 final EbiDbEntry e = new EbiDbEntry();
// Accumulates the (possibly multi-line) definition text.
76 final StringBuilder def = new StringBuilder();
// Parser state: which section / feature the current line belongs to.
77 boolean in_definition = false;
78 boolean in_features = false;
79 boolean in_source = false;
80 boolean in_gene = false;
81 boolean in_cds = false;
82 boolean in_protein = false;
83 for( final String line : lines ) {
84 if ( line.startsWith( "ACCESSION " ) ) {
85 e.setAccession( SequenceDbWsTools.extractFrom( line, "ACCESSION" ) );
86 in_definition = false;
88 else if ( line.startsWith( "ID " ) ) {
89 e.setAccession( SequenceDbWsTools.extractFromTo( line, "ID", ";" ) );
90 in_definition = false;
92 else if ( line.startsWith( "DEFINITION " ) || ( line.startsWith( "DE " ) ) ) {
// Distinguishes the DEFINITION keyword from the DE code for the extraction calls below.
93 boolean definiton = false;
94 if ( line.startsWith( "DEFINITION " ) ) {
// The definition is cut at the first "[" if present, otherwise at the first ".".
97 if ( line.indexOf( "[" ) > 0 ) {
99 x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) ) );
102 x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "[" ) ) );
105 else if ( line.indexOf( "." ) > 0 ) {
107 x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) ) );
110 x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "." ) ) );
115 x( def, ( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) ) );
118 x( def, ( SequenceDbWsTools.extractFrom( line, "DE" ) ) );
122 in_definition = true;
125 else if ( line.startsWith( " ORGANISM " ) ) {
// Anything from "(" onwards is dropped from the scientific name.
126 if ( line.indexOf( "(" ) > 0 ) {
127 e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, " ORGANISM", "(" ) );
130 e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, " ORGANISM" ) );
134 else if ( line.startsWith( "OS " ) ) {
135 if ( line.indexOf( "(" ) > 0 ) {
136 e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
139 e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, "OS" ) );
// Continuation line of a definition that started on an earlier line.
142 else if ( line.startsWith( " " ) && in_definition ) {
144 if ( line.indexOf( "[" ) > 0 ) {
145 def.append( SequenceDbWsTools.extractTo( line, "[" ) );
147 else if ( line.indexOf( "." ) > 0 ) {
148 def.append( SequenceDbWsTools.extractTo( line, "." ) );
151 def.append( line.trim() );
155 in_definition = false;
// A line that begins with upper-case letters and is not an "FT " line.
157 if ( !line.startsWith( "FT " ) && X_PATTERN.matcher( line ).find() ) {
165 if ( line.startsWith( "FEATURES " ) || line.startsWith( "FT " ) ) {
// Feature keys are only honoured once the feature table has been entered.
168 if ( in_features && ( line.startsWith( " source " ) || line.startsWith( "FT source " ) ) ) {
174 if ( in_features && ( line.startsWith( " gene " ) || line.startsWith( "FT gene " ) ) ) {
180 if ( in_features && ( line.startsWith( " CDS " ) || line.startsWith( "FT CDS " ) ) ) {
186 if ( in_features && ( line.startsWith( " Protein " ) || line.startsWith( "FT Protein " ) ) ) {
193 final Matcher m = taxon_xref_PATTERN.matcher( line );
195 e.setTaxId( m.group( 1 ) );
// EC numbers are collected as annotations from CDS and Protein features.
198 if ( in_protein || in_cds ) {
199 final Matcher m = ec_PATTERN.matcher( line );
201 e.addAnnotation( new Annotation( "EC", m.group( 1 ) ) );
// setGeneName keeps only the first gene name it is given.
204 if ( in_protein || in_cds || in_gene ) {
205 final Matcher m = gene_PATTERN.matcher( line );
207 e.setGeneName( m.group( 1 ) );
// The accumulated definition becomes the sequence name.
211 if ( def.length() > 0 ) {
212 e.setSequenceName( def.toString().trim() );
/**
 * Appends the trimmed fragment {@code s} to {@code sb}.
 * <p>
 * NOTE(review): the statement executed when {@code sb} is already non-empty is
 * not part of this listing; presumably it inserts a separator before the
 * fragment - confirm against the full source.
 *
 * @param sb the builder collecting definition text
 * @param s  the fragment to add; it is trimmed before being appended
 */
217 private static void x( final StringBuilder sb, final String s ) {
218 if ( sb.length() > 0 ) {
221 sb.append( s.trim() );
223 // FIXME: this is actually an NCBI entry
224 //http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/emb/AAR37336/
// Taxonomy identifier; setTaxId guards it with a null check.
228 private String _tax_id;
// NOTE(review): no assignment to this field is visible in this listing.
229 private String _symbol;
// Written by setProvider.
230 private String _provider;
// No initializer here; the list is created on first use by addCrossReference.
231 private List<Accession> _cross_references;
// No initializer here; the list is created on first use by addAnnotation.
232 private List<Annotation> _annotations;
// Assigned by setGeneName only while it is still null.
233 private String _gene_name;
235 // TODO PUBMED 15798186
237 // source /db_xref="taxon:9606"
241 // /db_xref="MIM:604739"
244 // /db_xref="MIM:604739"
245 // /db_xref="InterPro:IPR002475"
247 // /db_xref="UniProtKB/TrEMBL:Q5J7V1" <- reparse?
251 LOCUS NM_184234 2881 bp mRNA linear PRI 16-JUN-2013
252 DEFINITION Homo sapiens RNA binding motif protein 39 (RBM39), transcript
255 VERSION NM_184234.2 GI:336176061
257 SOURCE Homo sapiens (human)
258 ORGANISM Homo sapiens
259 Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
260 Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
261 Catarrhini; Hominidae; Homo.
262 REFERENCE 1 (bases 1 to 2881)
263 AUTHORS Sillars-Hardebol,A.H., Carvalho,B., Belien,J.A., de Wit,M.,
264 Delis-van Diemen,P.M., Tijssen,M., van de Wiel,M.A., Ponten,F.,
265 Meijer,G.A. and Fijneman,R.J.
266 TITLE CSE1L, DIDO1 and RBM39 in colorectal adenoma to carcinoma
268 JOURNAL Cell Oncol (Dordr) 35 (4), 293-300 (2012)
270 REMARK GeneRIF: Data show that CSE1L, DIDO1 and RBM39 mRNA expression
271 levels correlated with chromosome 20q DNA copy number status.
272 REFERENCE 2 (bases 1 to 2881)
273 AUTHORS Huang,G., Zhou,Z., Wang,H. and Kleinerman,E.S.
274 TITLE CAPER-alpha alternative splicing regulates the expression of
275 vascular endothelial growth factor(1)(6)(5) in Ewing sarcoma cells
276 JOURNAL Cancer 118 (8), 2106-2116 (2012)
278 REMARK GeneRIF: Increased VEGF(165) expression is secondary to the
279 down-regulation of CAPER-alpha by EWS/FLI-1. CAPER-alpha mediates
280 alternative splicing and controls the shift from VEGF(189) to
282 REFERENCE 3 (bases 1 to 2881)
283 AUTHORS Han,B., Stockwin,L.H., Hancock,C., Yu,S.X., Hollingshead,M.G. and
285 TITLE Proteomic analysis of nuclei isolated from cancer cell lines
286 treated with indenoisoquinoline NSC 724998, a novel topoisomerase I
288 JOURNAL J. Proteome Res. 9 (8), 4016-4027 (2010)
290 REMARK Erratum:[J Proteome Res. 2011 Apr 1;10(4):2128]
291 REFERENCE 4 (bases 1 to 2881)
292 AUTHORS Zhang,J.Y., Looi,K.S. and Tan,E.M.
293 TITLE Identification of tumor-associated antigens as diagnostic and
294 predictive biomarkers in cancer
295 JOURNAL Methods Mol. Biol. 520, 1-10 (2009)
297 REFERENCE 5 (bases 1 to 2881)
298 AUTHORS Dutta,J., Fan,G. and Gelinas,C.
299 TITLE CAPERalpha is a novel Rel-TAD-interacting factor that inhibits
300 lymphocyte transformation by the potent Rel/NF-kappaB oncoprotein
302 JOURNAL J. Virol. 82 (21), 10792-10802 (2008)
304 REMARK GeneRIF: this study identifies CAPERalpha (RNA binding motif
305 protein 39) as a new transcriptional coregulator for v-Rel and
306 reveals an important role in modulating Rel's oncogenic activity.
307 REFERENCE 6 (bases 1 to 2881)
308 AUTHORS Cazalla,D., Newton,K. and Caceres,J.F.
309 TITLE A novel SR-related protein is required for the second step of
311 JOURNAL Mol. Cell. Biol. 25 (8), 2969-2980 (2005)
313 REFERENCE 7 (bases 1 to 2881)
314 AUTHORS Dowhan,D.H., Hong,E.P., Auboeuf,D., Dennis,A.P., Wilson,M.M.,
315 Berget,S.M. and O'Malley,B.W.
316 TITLE Steroid hormone receptor coactivation and alternative RNA splicing
317 by U2AF65-related proteins CAPERalpha and CAPERbeta
318 JOURNAL Mol. Cell 17 (3), 429-439 (2005)
320 REFERENCE 8 (bases 1 to 2881)
321 AUTHORS Sun,N.N., Fastje,C.D., Wong,S.S., Sheppard,P.R., Macdonald,S.J.,
322 Ridenour,G., Hyde,J.D. and Witten,M.L.
323 TITLE Dose-dependent transcriptome changes by metal ores on a human acute
324 lymphoblastic leukemia cell line
325 JOURNAL Toxicol Ind Health 19 (7-10), 157-163 (2003)
327 REMARK GeneRIF: 10 genes were down-regulated following treatment of the
328 T-ALL cells with 0.15 and 1.5 microg/mL of metal ores at 72 h
329 REFERENCE 9 (bases 1 to 2881)
330 AUTHORS Jung,D.J., Na,S.Y., Na,D.S. and Lee,J.W.
331 TITLE Molecular cloning and characterization of CAPER, a novel
332 coactivator of activating protein-1 and estrogen receptors
333 JOURNAL J. Biol. Chem. 277 (2), 1229-1234 (2002)
335 REMARK GeneRIF: This paper describes the mouse gene.
336 REFERENCE 10 (bases 1 to 2881)
337 AUTHORS Imai,H., Chan,E.K., Kiyosawa,K., Fu,X.D. and Tan,E.M.
338 TITLE Novel nuclear autoantigen with splicing factor motifs identified
339 with antibody from hepatocellular carcinoma
340 JOURNAL J. Clin. Invest. 92 (5), 2419-2426 (1993)
342 COMMENT REVIEWED REFSEQ: This record has been curated by NCBI staff. The
343 reference sequence was derived from DC346351.1, BC141835.1 and
345 On Jun 16, 2011 this sequence version replaced gi:35493810.
347 Summary: This gene encodes a member of the U2AF65 family of
348 proteins. The encoded protein is found in the nucleus, where it
349 co-localizes with core spliceosomal proteins. It has been shown to
350 play a role in both steroid hormone receptor-mediated transcription
351 and alternative splicing, and it is also a transcriptional
352 coregulator of the viral oncoprotein v-Rel. Multiple transcript
353 variants have been observed for this gene. A related pseudogene has
354 been identified on chromosome X. [provided by RefSeq, Aug 2011].
356 Transcript Variant: This variant (1) encodes the longest isoform
357 (a, also called CC1.4).
359 Publication Note: This RefSeq record includes a subset of the
360 publications that are available for this gene. Please see the Gene
361 record to access additional publications.
363 ##Evidence-Data-START##
364 Transcript exon combination :: BC141835.1, L10911.1 [ECO:0000332]
365 RNAseq introns :: mixed/partial sample support
366 ERS025081, ERS025082 [ECO:0000350]
367 ##Evidence-Data-END##
368 COMPLETENESS: complete on the 3' end.
369 PRIMARY REFSEQ_SPAN PRIMARY_IDENTIFIER PRIMARY_SPAN COMP
370 1-578 DC346351.1 3-580
371 579-2872 BC141835.1 429-2722
372 2873-2881 C75555.1 1-9 c
373 FEATURES Location/Qualifiers
375 /organism="Homo sapiens"
377 /db_xref="taxon:9606"
382 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
383 /note="RNA binding motif protein 39"
384 /db_xref="GeneID:9584"
385 /db_xref="HGNC:15923"
386 /db_xref="HPRD:09201"
387 /db_xref="MIM:604739"
390 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
391 /inference="alignment:Splign:1.39.8"
394 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
395 /standard_name="REN58946"
396 /db_xref="UniSTS:383746"
397 misc_feature 221..223
399 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
400 /note="upstream in-frame stop codon"
403 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
404 /standard_name="G64285"
405 /db_xref="UniSTS:158667"
408 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
409 /inference="alignment:Splign:1.39.8"
412 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
413 /note="isoform a is encoded by transcript variant 1;
414 coactivator of activating protein-1 and estrogen
415 receptors; functional spliceosome-associated protein 59;
416 RNA-binding region (RNP1, RRM) containing 2;
417 hepatocellular carcinoma protein 1; splicing factor HCC1"
419 /product="RNA-binding protein 39 isoform a"
420 /protein_id="NP_909122.1"
421 /db_xref="GI:35493811"
422 /db_xref="CCDS:CCDS13266.1"
423 /db_xref="GeneID:9584"
424 /db_xref="HGNC:15923"
425 /db_xref="HPRD:09201"
426 /db_xref="MIM:604739"
427 /translation="MADDIDIEAMLEAPYKKDENKLSSANGHEERSKKRKKSKSRSRS
428 HERKRSKSKERKRSRDRERKKSKSRERKRSRSKERRRSRSRSRDRRFRGRYRSPYSGP
429 KFNSAIRGKIGLPHSIKLSRRRSRSKSPFRKDKSPVREPIDNLTPEERDARTVFCMQL
430 AARIRPRDLEEFFSTVGKVRDVRMISDRNSRRSKGIAYVEFVDVSSVPLAIGLTGQRV
431 LGVPIIVQASQAEKNRAAAMANNLQKGSAGPMRLYVGSLHFNITEDMLRGIFEPFGRI
432 ESIQLMMDSETGRSKGYGFITFSDSECAKKALEQLNGFELAGRPMKVGHVTERTDASS
433 ASSFLDSDELERTGIDLGTTGRLQLMARLAEGTGLQIPPAAQQALQMSGSLAFGAVAE
434 FSFVIDLQTRLSQQTEASALAAAASVQPLATQCFQLSNMFNPQTEEEVGWDTEIKDDV
435 IEECNKHGGVIHIYVDKNSAQGNVYVKCPSIAAAIAAVNALHGRWFAGKMITAAYVPL
436 PTYHNLFPDSMTATQLLVPSRR"
437 misc_feature 413..415
439 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
440 /experiment="experimental evidence, no additional details
442 /note="N-acetylalanine; propagated from
443 UniProtKB/Swiss-Prot (Q14498.2); acetylation site"
447 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
448 /inference="alignment:Splign:1.39.8"
452 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
453 /inference="alignment:Splign:1.39.8"
456 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
457 /standard_name="REN58786"
458 /db_xref="UniSTS:383586"
461 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
462 /standard_name="D19S1033"
463 /db_xref="UniSTS:154759"
466 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
467 /standard_name="REN58785"
468 /db_xref="UniSTS:383585"
470 polyA_signal 2851..2856
472 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
475 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
477 1 atttggagct tggggcagct tctcgcgaga gcccgtgctg agggctctgt gaggccccgt
478 61 gtgtttgtgt gtgtgtatgt gtgctggtga atgtgagtac agggaagcag cggccgccat
479 121 ttcagggagc ttgtcgacgc tgtcgcaggg gtggatcctg agctgccgaa gccgccgtcc
480 181 tgctctcccg cgtgggcttc tctaattcca ttgttttttt tagattctct cgggcctagc
481 241 cgtccttgga acccgatatt cgggctgggc ggttccgcgg cctgggccta ggggcttaac
// Private constructor: instances are created inside this class, e.g. by
// createInstanceFromPlainTextForRefSeq.
486 private EbiDbEntry() {
/**
 * Records a cross-reference, creating the backing list on first use.
 *
 * @param accession the cross-reference to store
 */
489 private void addCrossReference( final Accession accession ) {
// Lazy allocation: the field has no initializer.
490 if ( _cross_references == null ) {
491 _cross_references = new ArrayList<Accession>();
494 _cross_references.add( accession );
/**
 * Cloning is not supported.
 *
 * @throws CloneNotSupportedException always
 */
498 public Object clone() throws CloneNotSupportedException {
499 throw new CloneNotSupportedException();
503 public String getAccession() {
/**
 * Returns the cross-references collected for this entry.
 * <p>
 * The internal list itself is returned, not a copy. The field has no
 * initializer and is only allocated by addCrossReference, so callers should
 * be prepared for {@code null} when nothing was added.
 *
 * @return the backing list of cross-references, possibly {@code null}
 */
508 public List<Accession> getCrossReferences() {
509 return _cross_references;
513 public String getGeneName() {
518 public List<GoTerm> getGoTerms() {
523 public String getProvider() {
528 public String getSequenceName() {
533 public String getSequenceSymbol() {
538 public String getTaxonomyIdentifier() {
543 public String getTaxonomyScientificName() {
/**
 * Reports whether this entry carries no identifying data.
 * <p>
 * Only accession, sequence name, taxonomy scientific name, taxonomy
 * identifier and sequence symbol are consulted; gene name, provider,
 * annotations and cross-references play no part in the result.
 *
 * @return {@code true} if all five values are empty according to
 *         {@code ForesterUtil.isEmpty}
 */
548 public boolean isEmpty() {
549 return ( ForesterUtil.isEmpty( getAccession() ) && ForesterUtil.isEmpty( getSequenceName() )
550 && ForesterUtil.isEmpty( getTaxonomyScientificName() )
551 && ForesterUtil.isEmpty( getTaxonomyIdentifier() ) && ForesterUtil.isEmpty( getSequenceSymbol() ) );
554 private void setSequenceName( final String rec_name ) {
/**
 * Stores the gene name unless one has already been stored; later calls
 * leave the first value in place.
 *
 * @param gene_name the gene name to record
 */
560 private void setGeneName( final String gene_name ) {
561 if ( _gene_name == null ) {
562 _gene_name = gene_name;
566 private void setTaxonomyScientificName( final String os ) {
572 private void setAccession( final String pa ) {
/**
 * Sets the provider of this entry, replacing any previous value.
 *
 * @param provider the provider to record
 */
578 public void setProvider( final String provider ) {
579 _provider = provider;
/**
 * Handles a taxonomy identifier found while parsing.
 * <p>
 * NOTE(review): only the null guard is visible in this listing; presumably
 * the guarded statement assigns {@code tax_id} to {@code _tax_id}, so that
 * the first identifier wins - confirm against the full source.
 *
 * @param tax_id the taxonomy identifier
 */
582 private void setTaxId( final String tax_id ) {
583 if ( _tax_id == null ) {
589 public List<Annotation> getAnnotations() {
/**
 * Appends an annotation, creating the backing list on first use.
 *
 * @param annotation the annotation to store
 */
593 private void addAnnotation( final Annotation annotation ) {
// Lazy allocation: the field has no initializer.
594 if ( _annotations == null ) {
595 _annotations = new ArrayList<Annotation>();
597 _annotations.add( annotation );