2 // forester -- software libraries and applications
3 // for genomics and evolutionary biology research.
5 // Copyright (C) 2010 Christian M Zmasek
6 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26 package org.forester.ws.seqdb;
28 import java.util.List;
29 import java.util.SortedSet;
30 import java.util.TreeSet;
31 import java.util.regex.Matcher;
32 import java.util.regex.Pattern;
34 import org.forester.go.GoTerm;
35 import org.forester.phylogeny.data.Accession;
36 import org.forester.phylogeny.data.Annotation;
37 import org.forester.util.ForesterUtil;
39 public final class EbiDbEntry implements SequenceDatabaseEntry {
41 // public static SequenceDatabaseEntry createInstanceFromPlainText( final List<String> lines ) {
42 // final EbiDbEntry e = new EbiDbEntry();
43 // for( final String line : lines ) {
44 // if ( line.startsWith( "PA" ) ) {
45 // e.setPA( SequenceDbWsTools.extractFrom( line, "PA" ) );
47 // else if ( line.startsWith( "DE" ) ) {
48 // e.setDe( SequenceDbWsTools.extractFrom( line, "DE" ) );
50 // else if ( line.startsWith( "OS" ) ) {
51 // if ( line.indexOf( "(" ) > 0 ) {
52 // e.setOs( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
55 // e.setOs( SequenceDbWsTools.extractFrom( line, "OS" ) );
58 // else if ( line.startsWith( "OX" ) ) {
59 // if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) {
60 // e.setTaxId( SequenceDbWsTools.extractFromTo( line, "NCBI_TaxID=", ";" ) );
/**
 * Builds an entry from the text lines of a sequence database flat-file record.
 * <p>
 * Two line vocabularies are recognised side by side: keyword lines
 * ("ACCESSION ", "DEFINITION ", " ORGANISM ", "FEATURES ") and two-letter
 * prefixed lines ("ID ", "DE ", "OS ", "FT "). From the visible statements the
 * method collects the accession, the definition text (used as the sequence name),
 * the scientific name, the taxonomy id, the gene name, a sequence symbol taken
 * from the product qualifier, an "EC" annotation, and cross-references tagged
 * "hgnc", "geneid", "uniprot", "interpro" and "mim".
 * <p>
 * NOTE(review): several lines of this method (closing braces, else branches,
 * the find() guards of some matchers, the updates of the in_* flags and the
 * return statement) are absent from this listing; the comments below describe
 * only the statements that are visible.
 *
 * @param lines the record, one element per text line
 * @return presumably the populated entry {@code e} -- the return statement is not visible here; confirm
 */
66 public static SequenceDatabaseEntry createInstanceFromPlainTextForRefSeq( final List<String> lines ) {
// X_PATTERN finds lines that begin with upper-case letters; it is used further down
// together with a test that excludes "FT " lines.
67 final Pattern X_PATTERN = Pattern.compile( "^[A-Z]+" );
// Qualifier patterns: capture group 1 is the quoted value.
// NOTE(review): chromosome_PATTERN and map_PATTERN are not referenced by any visible line.
68 final Pattern chromosome_PATTERN = Pattern.compile( "\\s+/chromosome=\"(\\w+)\"" );
// NOTE(review): the character class in map_PATTERN has no quantifier, so group 1 can only
// ever be a single character ("+" inside the brackets is a literal) -- likely meant [\\w\\.]+; confirm.
69 final Pattern map_PATTERN = Pattern.compile( "\\s+/map=\"([\\w+\\.])\"" );
70 final Pattern gene_PATTERN = Pattern.compile( "\\s+/gene=\"(.+)\"" );
71 final Pattern mim_PATTERN = Pattern.compile( "\\s+/db_xref=\"MIM:(\\d+)\"" );
72 final Pattern taxon_PATTERN = Pattern.compile( "\\s+/db_xref=\"taxon:(\\d+)\"" );
73 final Pattern interpro_PATTERN = Pattern.compile( "\\s+/db_xref=\"InterPro:([A-Z0-9]+)\"" );
74 final Pattern uniprot_PATTERN = Pattern.compile( "\\s+/db_xref=\"UniProtKB/TrEMBL:(\\w+)\"" );
75 final Pattern hgnc_PATTERN = Pattern.compile( "\\s+/db_xref=\"HGNC:(\\d+)\"" );
76 final Pattern geneid_PATTERN = Pattern.compile( "\\s+/db_xref=\"GeneID:(\\d+)\"" );
77 final Pattern ec_PATTERN = Pattern.compile( "\\s+/EC_number=\"([\\.\\-\\d]+)\"" );
// The product value is only matched when it is a single word of 1 to 10 word characters.
78 final Pattern product_PATTERN = Pattern.compile( "\\s+/product=\"(\\w{1,10})\"" );
79 final EbiDbEntry e = new EbiDbEntry();
// Accumulates the definition text, which may span several lines.
80 final StringBuilder def = new StringBuilder();
// Parser state: in_definition marks that continuation lines belong to the definition;
// the remaining flags track the current position inside the feature table.
81 boolean in_definition = false;
82 boolean in_features = false;
83 boolean in_source = false;
84 boolean in_gene = false;
85 boolean in_cds = false;
86 boolean in_mrna = false;
87 boolean in_protein = false;
88 for( final String line : lines ) {
// Accession: everything after "ACCESSION", or the text between "ID" and ";".
89 if ( line.startsWith( "ACCESSION " ) ) {
90 e.setAccession( SequenceDbWsTools.extractFrom( line, "ACCESSION" ) );
91 in_definition = false;
93 else if ( line.startsWith( "ID " ) ) {
94 e.setAccession( SequenceDbWsTools.extractFromTo( line, "ID", ";" ) );
95 in_definition = false;
// First line of the definition, in either vocabulary.
97 else if ( line.startsWith( "DEFINITION " ) || ( line.startsWith( "DE " ) ) ) {
98 boolean definiton = false;
99 if ( line.startsWith( "DEFINITION " ) ) {
// The text is cut at "[" when the line contains one past index 0, otherwise at ".",
// otherwise the rest of the line is taken. Each pair of calls below differs only in
// the prefix ("DEFINITION" or "DE") that is passed on.
102 if ( line.indexOf( "[" ) > 0 ) {
104 x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) ) );
107 x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "[" ) ) );
110 else if ( line.indexOf( "." ) > 0 ) {
112 x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) ) );
115 x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "." ) ) );
120 x( def, ( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) ) );
123 x( def, ( SequenceDbWsTools.extractFrom( line, "DE" ) ) );
127 in_definition = true;
// Scientific name: text after the prefix, cut before "(" when the line contains one.
130 else if ( line.startsWith( " ORGANISM " ) ) {
131 if ( line.indexOf( "(" ) > 0 ) {
132 e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, " ORGANISM", "(" ) );
135 e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, " ORGANISM" ) );
139 else if ( line.startsWith( "OS " ) ) {
140 if ( line.indexOf( "(" ) > 0 ) {
141 e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
144 e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, "OS" ) );
// Continuation line of a definition: appended directly to def (not through x).
147 else if ( line.startsWith( " " ) && in_definition ) {
149 if ( line.indexOf( "[" ) > 0 ) {
150 def.append( SequenceDbWsTools.extractTo( line, "[" ) );
152 else if ( line.indexOf( "." ) > 0 ) {
153 def.append( SequenceDbWsTools.extractTo( line, "." ) );
156 def.append( line.trim() );
160 in_definition = false;
// Section tracking for the feature table. The bodies of the following conditions are
// not visible here; presumably they update the in_* flags declared above -- confirm.
162 if ( !line.startsWith( "FT " ) && X_PATTERN.matcher( line ).find() ) {
171 if ( line.startsWith( "FEATURES " ) || line.startsWith( "FT " ) ) {
174 if ( in_features && ( line.startsWith( " source " ) || line.startsWith( "FT source " ) ) ) {
181 if ( in_features && ( line.startsWith( " gene " ) || line.startsWith( "FT gene " ) ) ) {
188 if ( in_features && ( line.startsWith( " CDS " ) || line.startsWith( "FT CDS " ) ) ) {
195 if ( in_features && ( line.startsWith( " Protein " ) || line.startsWith( "FT Protein " ) ) ) {
202 if ( in_features && ( line.startsWith( " mRNA " ) || line.startsWith( "FT mRNA " ) ) ) {
// Taxonomy id from a taxon db_xref qualifier.
210 final Matcher m = taxon_PATTERN.matcher( line );
212 e.setTaxId( m.group( 1 ) );
// HGNC and GeneID cross-references are only looked for inside CDS or gene features.
215 if ( in_cds || in_gene ) {
216 final Matcher hgnc = hgnc_PATTERN.matcher( line );
218 e.addCrossReference( new Accession( hgnc.group( 1 ), "hgnc" ) );
220 final Matcher geneid = geneid_PATTERN.matcher( line );
221 if ( geneid.find() ) {
222 e.addCrossReference( new Accession( geneid.group( 1 ), "geneid" ) );
// The remaining qualifiers are looked for inside Protein, CDS, gene or mRNA features.
225 if ( in_protein || in_cds || in_gene || in_mrna ) {
226 final Matcher ec = ec_PATTERN.matcher( line );
228 e.addAnnotation( new Annotation( "EC", ec.group( 1 ) ) );
230 final Matcher gene = gene_PATTERN.matcher( line );
232 e.setGeneName( gene.group( 1 ) );
234 final Matcher uniprot = uniprot_PATTERN.matcher( line );
235 if ( uniprot.find() ) {
236 e.addCrossReference( new Accession( uniprot.group( 1 ), "uniprot" ) );
238 final Matcher interpro = interpro_PATTERN.matcher( line );
239 if ( interpro.find() ) {
240 e.addCrossReference( new Accession( interpro.group( 1 ), "interpro" ) );
242 final Matcher mim = mim_PATTERN.matcher( line );
244 e.addCrossReference( new Accession( mim.group( 1 ), "mim" ) );
246 final Matcher product = product_PATTERN.matcher( line );
247 if ( product.find() ) {
248 e.setSequenceSymbol( product.group( 1 ) );
// The accumulated definition, trimmed, becomes the sequence name.
252 if ( def.length() > 0 ) {
253 e.setSequenceName( def.toString().trim() );
// Appends the trimmed form of s to sb; used while assembling the definition text.
258 private static void x( final StringBuilder sb, final String s ) {
// NOTE(review): the body of this branch is not visible in this listing; presumably it
// inserts a separator when sb already holds text -- confirm.
259 if ( sb.length() > 0 ) {
262 sb.append( s.trim() );
264 // FIXME: this is actually an NCBI entry
265 //http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/emb/AAR37336/
// Taxonomy identifier; setTaxId tests it against null before acting.
269 private String _tax_id;
// NOTE(review): no visible statement reads or writes _symbol; presumably it backs
// getSequenceSymbol/setSequenceSymbol -- confirm.
270 private String _symbol;
// Assigned by setProvider.
271 private String _provider;
// Cross-references; has no initializer and is assigned a TreeSet in addCrossReference,
// returned as-is by getCrossReferences.
272 private SortedSet<Accession> _cross_references;
// Annotations; has no initializer and is assigned a TreeSet in addAnnotation.
273 private SortedSet<Annotation> _annotations;
// Gene name; setGeneName assigns it only while it is still null.
274 private String _gene_name;
276 // TODO PUBMED 15798186
278 // source /db_xref="taxon:9606"
282 // /db_xref="MIM:604739"
285 // /db_xref="MIM:604739"
286 // /db_xref="InterPro:IPR002475"
288 // /db_xref="UniProtKB/TrEMBL:Q5J7V1" <- reparse?
292 LOCUS NM_184234 2881 bp mRNA linear PRI 16-JUN-2013
293 DEFINITION Homo sapiens RNA binding motif protein 39 (RBM39), transcript
296 VERSION NM_184234.2 GI:336176061
298 SOURCE Homo sapiens (human)
299 ORGANISM Homo sapiens
300 Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
301 Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
302 Catarrhini; Hominidae; Homo.
303 REFERENCE 1 (bases 1 to 2881)
304 AUTHORS Sillars-Hardebol,A.H., Carvalho,B., Belien,J.A., de Wit,M.,
305 Delis-van Diemen,P.M., Tijssen,M., van de Wiel,M.A., Ponten,F.,
306 Meijer,G.A. and Fijneman,R.J.
307 TITLE CSE1L, DIDO1 and RBM39 in colorectal adenoma to carcinoma
309 JOURNAL Cell Oncol (Dordr) 35 (4), 293-300 (2012)
311 REMARK GeneRIF: Data show that CSE1L, DIDO1 and RBM39 mRNA expression
312 levels correlated with chromosome 20q DNA copy number status.
313 REFERENCE 2 (bases 1 to 2881)
314 AUTHORS Huang,G., Zhou,Z., Wang,H. and Kleinerman,E.S.
315 TITLE CAPER-alpha alternative splicing regulates the expression of
316 vascular endothelial growth factor(1)(6)(5) in Ewing sarcoma cells
317 JOURNAL Cancer 118 (8), 2106-2116 (2012)
319 REMARK GeneRIF: Increased VEGF(165) expression is secondary to the
320 down-regulation of CAPER-alpha by EWS/FLI-1. CAPER-alpha mediates
321 alternative splicing and controls the shift from VEGF(189) to
323 REFERENCE 3 (bases 1 to 2881)
324 AUTHORS Han,B., Stockwin,L.H., Hancock,C., Yu,S.X., Hollingshead,M.G. and
326 TITLE Proteomic analysis of nuclei isolated from cancer cell lines
327 treated with indenoisoquinoline NSC 724998, a novel topoisomerase I
329 JOURNAL J. Proteome Res. 9 (8), 4016-4027 (2010)
331 REMARK Erratum:[J Proteome Res. 2011 Apr 1;10(4):2128]
332 REFERENCE 4 (bases 1 to 2881)
333 AUTHORS Zhang,J.Y., Looi,K.S. and Tan,E.M.
334 TITLE Identification of tumor-associated antigens as diagnostic and
335 predictive biomarkers in cancer
336 JOURNAL Methods Mol. Biol. 520, 1-10 (2009)
338 REFERENCE 5 (bases 1 to 2881)
339 AUTHORS Dutta,J., Fan,G. and Gelinas,C.
340 TITLE CAPERalpha is a novel Rel-TAD-interacting factor that inhibits
341 lymphocyte transformation by the potent Rel/NF-kappaB oncoprotein
343 JOURNAL J. Virol. 82 (21), 10792-10802 (2008)
345 REMARK GeneRIF: this study identifies CAPERalpha (RNA binding motif
346 protein 39) as a new transcriptional coregulator for v-Rel and
347 reveals an important role in modulating Rel's oncogenic activity.
348 REFERENCE 6 (bases 1 to 2881)
349 AUTHORS Cazalla,D., Newton,K. and Caceres,J.F.
350 TITLE A novel SR-related protein is required for the second step of
352 JOURNAL Mol. Cell. Biol. 25 (8), 2969-2980 (2005)
354 REFERENCE 7 (bases 1 to 2881)
355 AUTHORS Dowhan,D.H., Hong,E.P., Auboeuf,D., Dennis,A.P., Wilson,M.M.,
356 Berget,S.M. and O'Malley,B.W.
357 TITLE Steroid hormone receptor coactivation and alternative RNA splicing
358 by U2AF65-related proteins CAPERalpha and CAPERbeta
359 JOURNAL Mol. Cell 17 (3), 429-439 (2005)
361 REFERENCE 8 (bases 1 to 2881)
362 AUTHORS Sun,N.N., Fastje,C.D., Wong,S.S., Sheppard,P.R., Macdonald,S.J.,
363 Ridenour,G., Hyde,J.D. and Witten,M.L.
364 TITLE Dose-dependent transcriptome changes by metal ores on a human acute
365 lymphoblastic leukemia cell line
366 JOURNAL Toxicol Ind Health 19 (7-10), 157-163 (2003)
368 REMARK GeneRIF: 10 genes were down-regulated following treatment of the
369 T-ALL cells with 0.15 and 1.5 microg/mL of metal ores at 72 h
370 REFERENCE 9 (bases 1 to 2881)
371 AUTHORS Jung,D.J., Na,S.Y., Na,D.S. and Lee,J.W.
372 TITLE Molecular cloning and characterization of CAPER, a novel
373 coactivator of activating protein-1 and estrogen receptors
374 JOURNAL J. Biol. Chem. 277 (2), 1229-1234 (2002)
376 REMARK GeneRIF: This paper describes the mouse gene.
377 REFERENCE 10 (bases 1 to 2881)
378 AUTHORS Imai,H., Chan,E.K., Kiyosawa,K., Fu,X.D. and Tan,E.M.
379 TITLE Novel nuclear autoantigen with splicing factor motifs identified
380 with antibody from hepatocellular carcinoma
381 JOURNAL J. Clin. Invest. 92 (5), 2419-2426 (1993)
383 COMMENT REVIEWED REFSEQ: This record has been curated by NCBI staff. The
384 reference sequence was derived from DC346351.1, BC141835.1 and
386 On Jun 16, 2011 this sequence version replaced gi:35493810.
388 Summary: This gene encodes a member of the U2AF65 family of
389 proteins. The encoded protein is found in the nucleus, where it
390 co-localizes with core spliceosomal proteins. It has been shown to
391 play a role in both steroid hormone receptor-mediated transcription
392 and alternative splicing, and it is also a transcriptional
393 coregulator of the viral oncoprotein v-Rel. Multiple transcript
394 variants have been observed for this gene. A related pseudogene has
395 been identified on chromosome X. [provided by RefSeq, Aug 2011].
397 Transcript Variant: This variant (1) encodes the longest isoform
398 (a, also called CC1.4).
400 Publication Note: This RefSeq record includes a subset of the
401 publications that are available for this gene. Please see the Gene
402 record to access additional publications.
404 ##Evidence-Data-START##
405 Transcript exon combination :: BC141835.1, L10911.1 [ECO:0000332]
406 RNAseq introns :: mixed/partial sample support
407 ERS025081, ERS025082 [ECO:0000350]
408 ##Evidence-Data-END##
409 COMPLETENESS: complete on the 3' end.
410 PRIMARY REFSEQ_SPAN PRIMARY_IDENTIFIER PRIMARY_SPAN COMP
411 1-578 DC346351.1 3-580
412 579-2872 BC141835.1 429-2722
413 2873-2881 C75555.1 1-9 c
414 FEATURES Location/Qualifiers
416 /organism="Homo sapiens"
418 /db_xref="taxon:9606"
423 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
424 /note="RNA binding motif protein 39"
425 /db_xref="GeneID:9584"
426 /db_xref="HGNC:15923"
427 /db_xref="HPRD:09201"
428 /db_xref="MIM:604739"
431 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
432 /inference="alignment:Splign:1.39.8"
435 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
436 /standard_name="REN58946"
437 /db_xref="UniSTS:383746"
438 misc_feature 221..223
440 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
441 /note="upstream in-frame stop codon"
444 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
445 /standard_name="G64285"
446 /db_xref="UniSTS:158667"
449 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
450 /inference="alignment:Splign:1.39.8"
453 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
454 /note="isoform a is encoded by transcript variant 1;
455 coactivator of activating protein-1 and estrogen
456 receptors; functional spliceosome-associated protein 59;
457 RNA-binding region (RNP1, RRM) containing 2;
458 hepatocellular carcinoma protein 1; splicing factor HCC1"
460 /product="RNA-binding protein 39 isoform a"
461 /protein_id="NP_909122.1"
462 /db_xref="GI:35493811"
463 /db_xref="CCDS:CCDS13266.1"
464 /db_xref="GeneID:9584"
465 /db_xref="HGNC:15923"
466 /db_xref="HPRD:09201"
467 /db_xref="MIM:604739"
468 /translation="MADDIDIEAMLEAPYKKDENKLSSANGHEERSKKRKKSKSRSRS
469 HERKRSKSKERKRSRDRERKKSKSRERKRSRSKERRRSRSRSRDRRFRGRYRSPYSGP
470 KFNSAIRGKIGLPHSIKLSRRRSRSKSPFRKDKSPVREPIDNLTPEERDARTVFCMQL
471 AARIRPRDLEEFFSTVGKVRDVRMISDRNSRRSKGIAYVEFVDVSSVPLAIGLTGQRV
472 LGVPIIVQASQAEKNRAAAMANNLQKGSAGPMRLYVGSLHFNITEDMLRGIFEPFGRI
473 ESIQLMMDSETGRSKGYGFITFSDSECAKKALEQLNGFELAGRPMKVGHVTERTDASS
474 ASSFLDSDELERTGIDLGTTGRLQLMARLAEGTGLQIPPAAQQALQMSGSLAFGAVAE
475 FSFVIDLQTRLSQQTEASALAAAASVQPLATQCFQLSNMFNPQTEEEVGWDTEIKDDV
476 IEECNKHGGVIHIYVDKNSAQGNVYVKCPSIAAAIAAVNALHGRWFAGKMITAAYVPL
477 PTYHNLFPDSMTATQLLVPSRR"
478 misc_feature 413..415
480 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
481 /experiment="experimental evidence, no additional details
483 /note="N-acetylalanine; propagated from
484 UniProtKB/Swiss-Prot (Q14498.2); acetylation site"
488 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
489 /inference="alignment:Splign:1.39.8"
493 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
494 /inference="alignment:Splign:1.39.8"
497 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
498 /standard_name="REN58786"
499 /db_xref="UniSTS:383586"
502 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
503 /standard_name="D19S1033"
504 /db_xref="UniSTS:154759"
507 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
508 /standard_name="REN58785"
509 /db_xref="UniSTS:383585"
511 polyA_signal 2851..2856
513 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
516 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
518 1 atttggagct tggggcagct tctcgcgaga gcccgtgctg agggctctgt gaggccccgt
519 61 gtgtttgtgt gtgtgtatgt gtgctggtga atgtgagtac agggaagcag cggccgccat
520 121 ttcagggagc ttgtcgacgc tgtcgcaggg gtggatcctg agctgccgaa gccgccgtcc
521 181 tgctctcccg cgtgggcttc tctaattcca ttgttttttt tagattctct cgggcctagc
522 241 cgtccttgga acccgatatt cgggctgggc ggttccgcgg cctgggccta ggggcttaac
527 private EbiDbEntry() {
530 private void addCrossReference( final Accession accession ) {
531 if ( _cross_references == null ) {
532 _cross_references = new TreeSet<Accession>();
534 System.out.println( "XREF ADDED: " + accession );
535 _cross_references.add( accession );
539 public Object clone() throws CloneNotSupportedException {
540 throw new CloneNotSupportedException();
544 public String getAccession() {
549 public SortedSet<Accession> getCrossReferences() {
550 return _cross_references;
554 public String getGeneName() {
559 public SortedSet<GoTerm> getGoTerms() {
564 public String getProvider() {
569 public String getSequenceName() {
574 public String getSequenceSymbol() {
578 private void setSequenceSymbol( String symbol ) {
583 public String getTaxonomyIdentifier() {
588 public String getTaxonomyScientificName() {
593 public boolean isEmpty() {
594 return ( ForesterUtil.isEmpty( getAccession() ) && ForesterUtil.isEmpty( getSequenceName() )
595 && ForesterUtil.isEmpty( getTaxonomyScientificName() )
596 && ForesterUtil.isEmpty( getTaxonomyIdentifier() ) && ForesterUtil.isEmpty( getSequenceSymbol() ) );
599 private void setSequenceName( final String rec_name ) {
605 private void setGeneName( final String gene_name ) {
606 if ( _gene_name == null ) {
607 _gene_name = gene_name;
611 private void setTaxonomyScientificName( final String os ) {
617 private void setAccession( final String pa ) {
623 public void setProvider( final String provider ) {
624 _provider = provider;
627 private void setTaxId( final String tax_id ) {
628 if ( _tax_id == null ) {
634 public SortedSet<Annotation> getAnnotations() {
638 private void addAnnotation( final Annotation annotation ) {
639 if ( _annotations == null ) {
640 _annotations = new TreeSet<Annotation>();
642 _annotations.add( annotation );