2 // forester -- software libraries and applications
3 // for genomics and evolutionary biology research.
5 // Copyright (C) 2010 Christian M Zmasek
6 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26 package org.forester.ws.seqdb;
28 import java.util.List;
29 import java.util.SortedSet;
30 import java.util.TreeSet;
31 import java.util.regex.Matcher;
32 import java.util.regex.Pattern;
34 import org.forester.go.GoTerm;
35 import org.forester.phylogeny.data.Accession;
36 import org.forester.phylogeny.data.Annotation;
37 import org.forester.util.ForesterUtil;
39 public final class EbiDbEntry implements SequenceDatabaseEntry {
41 // public static SequenceDatabaseEntry createInstanceFromPlainText( final List<String> lines ) {
42 // final EbiDbEntry e = new EbiDbEntry();
43 // for( final String line : lines ) {
44 // if ( line.startsWith( "PA" ) ) {
45 // e.setPA( SequenceDbWsTools.extractFrom( line, "PA" ) );
47 // else if ( line.startsWith( "DE" ) ) {
48 // e.setDe( SequenceDbWsTools.extractFrom( line, "DE" ) );
50 // else if ( line.startsWith( "OS" ) ) {
51 // if ( line.indexOf( "(" ) > 0 ) {
52 // e.setOs( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
55 // e.setOs( SequenceDbWsTools.extractFrom( line, "OS" ) );
58 // else if ( line.startsWith( "OX" ) ) {
59 // if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) {
60 // e.setTaxId( SequenceDbWsTools.extractFromTo( line, "NCBI_TaxID=", ";" ) );
/**
 * Builds an entry from the lines of a plain-text sequence record.
 * <p>
 * Both long keywords (ACCESSION, DEFINITION, ORGANISM, FEATURES) and
 * two-letter line codes (ID, DE, OS, FT) are recognised. Accession,
 * definition and organism name are taken from header lines; taxon id, gene
 * name, EC number, product symbol and database cross-references are matched
 * with regular expressions against qualifier lines of the feature table.
 * <p>
 * NOTE(review): every pattern below is recompiled on each call; they look
 * like candidates for {@code static final} constants.
 *
 * @param lines the lines of one record; the section flags are updated as the
 *              lines are visited, so their order matters
 * @return presumably the {@code EbiDbEntry} populated below -- confirm
 */
66 public static SequenceDatabaseEntry createInstanceFromPlainTextForRefSeq( final List<String> lines ) {
// Matches a line that starts with one or more upper-case letters.
67 final Pattern X_PATTERN = Pattern.compile( "^[A-Z]+" );
// NOTE(review): chromosome_PATTERN and map_PATTERN do not appear to be used by this method -- confirm.
68 final Pattern chromosome_PATTERN = Pattern.compile( "\\s+/chromosome=\"(\\w+)\"" );
// NOTE(review): the character class [\w+\.] has no quantifier, so only a
// one-character map value can match; "([\\w+\\.]+)" was probably intended.
69 final Pattern map_PATTERN = Pattern.compile( "\\s+/map=\"([\\w+\\.])\"" );
70 final Pattern gene_PATTERN = Pattern.compile( "\\s+/gene=\"(.+)\"" );
71 final Pattern mim_PATTERN = Pattern.compile( "\\s+/db_xref=\"MIM:(\\d+)\"" );
72 final Pattern taxon_PATTERN = Pattern.compile( "\\s+/db_xref=\"taxon:(\\d+)\"" );
73 final Pattern interpro_PATTERN = Pattern.compile( "\\s+/db_xref=\"InterPro:([A-Z0-9]+)\"" );
74 final Pattern uniprot_PATTERN = Pattern.compile( "\\s+/db_xref=\"UniProtKB/[A-Za-z-]*:(\\w+)\"" );
75 final Pattern hgnc_PATTERN = Pattern.compile( "\\s+/db_xref=\"[A-Z:]*HGNC:(\\d+)\"" );
76 final Pattern geneid_PATTERN = Pattern.compile( "\\s+/db_xref=\"GeneID:(\\d+)\"" );
77 final Pattern pdb_PATTERN = Pattern.compile( "\\s+/db_xref=\"PDB:([A-Z0-9]+)\"" );
80 final Pattern ec_PATTERN = Pattern.compile( "\\s+/EC_number=\"([\\.\\-\\d]+)\"" );
// Only a product value consisting of 1 to 10 word characters matches;
// longer or multi-word product names are not captured.
81 final Pattern product_PATTERN = Pattern.compile( "\\s+/product=\"(\\w{1,10})\"" );
82 final EbiDbEntry e = new EbiDbEntry();
// Accumulates the definition text, which may span several lines.
83 final StringBuilder def = new StringBuilder();
// State flags: whether a definition is being continued, whether the feature
// table has been reached, and which kind of feature is currently being read.
84 boolean in_definition = false;
85 boolean in_features = false;
86 boolean in_source = false;
87 boolean in_gene = false;
88 boolean in_cds = false;
89 boolean in_mrna = false;
90 boolean in_protein = false;
91 for( final String line : lines ) {
92 if ( line.startsWith( "ACCESSION " ) ) {
93 e.setAccession( SequenceDbWsTools.extractFrom( line, "ACCESSION" ) );
94 in_definition = false;
96 else if ( line.startsWith( "ID " ) ) {
// For ID lines the accession is taken up to the first ';'.
97 e.setAccession( SequenceDbWsTools.extractFromTo( line, "ID", ";" ) );
98 in_definition = false;
100 else if ( line.startsWith( "DEFINITION " ) || ( line.startsWith( "DE " ) ) ) {
// First line of a definition: the text after the keyword is cut at the
// first '[' when one is present, otherwise at the first '.', otherwise it
// is taken whole, and then handed to x() for appending to def.
101 boolean definiton = false;
102 if ( line.startsWith( "DEFINITION " ) ) {
105 if ( line.indexOf( "[" ) > 0 ) {
107 x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) ) );
110 x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "[" ) ) );
113 else if ( line.indexOf( "." ) > 0 ) {
115 x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) ) );
118 x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "." ) ) );
123 x( def, ( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) ) );
126 x( def, ( SequenceDbWsTools.extractFrom( line, "DE" ) ) );
130 in_definition = true;
133 else if ( line.startsWith( " ORGANISM " ) ) {
// The organism name is cut at the first '(' when the line contains one.
134 if ( line.indexOf( "(" ) > 0 ) {
135 e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, " ORGANISM", "(" ) );
138 e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, " ORGANISM" ) );
142 else if ( line.startsWith( "OS " ) ) {
143 if ( line.indexOf( "(" ) > 0 ) {
144 e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
147 e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, "OS" ) );
150 else if ( line.startsWith( " " ) && in_definition ) {
// Continuation line of a definition: same '[' / '.' cut-off rule as above.
// NOTE(review): these appends go straight to def rather than through x() -- confirm that is intended.
152 if ( line.indexOf( "[" ) > 0 ) {
153 def.append( SequenceDbWsTools.extractTo( line, "[" ) );
155 else if ( line.indexOf( "." ) > 0 ) {
156 def.append( SequenceDbWsTools.extractTo( line, "." ) );
159 def.append( line.trim() );
163 in_definition = false;
// A line that is not an "FT " line and starts with upper-case letters.
165 if ( !line.startsWith( "FT " ) && X_PATTERN.matcher( line ).find() ) {
174 if ( line.startsWith( "FEATURES " ) || line.startsWith( "FT " ) ) {
// Feature keys are only tested once the feature table has been reached.
177 if ( in_features && ( line.startsWith( " source " ) || line.startsWith( "FT source " ) ) ) {
184 if ( in_features && ( line.startsWith( " gene " ) || line.startsWith( "FT gene " ) ) ) {
191 if ( in_features && ( line.startsWith( " CDS " ) || line.startsWith( "FT CDS " ) ) ) {
198 if ( in_features && ( line.startsWith( " Protein " ) || line.startsWith( "FT Protein " ) ) ) {
205 if ( in_features && ( line.startsWith( " mRNA " ) || line.startsWith( "FT mRNA " ) ) ) {
213 final Matcher m = taxon_PATTERN.matcher( line );
215 e.setTaxId( m.group( 1 ) );
// HGNC and GeneID cross-references are looked for while in a CDS or gene feature.
218 if ( in_cds || in_gene ) {
219 final Matcher hgnc = hgnc_PATTERN.matcher( line );
221 e.addCrossReference( new Accession( hgnc.group( 1 ), "hgnc" ) );
223 final Matcher geneid = geneid_PATTERN.matcher( line );
224 if ( geneid.find() ) {
225 e.addCrossReference( new Accession( geneid.group( 1 ), "geneid" ) );
// The remaining qualifiers are looked for while in a Protein, CDS, gene or mRNA feature.
228 if ( in_protein || in_cds || in_gene || in_mrna ) {
229 final Matcher ec = ec_PATTERN.matcher( line );
231 e.addAnnotation( new Annotation( "EC", ec.group( 1 ) ) );
233 final Matcher gene = gene_PATTERN.matcher( line );
// setGeneName keeps only the first value it is given.
235 e.setGeneName( gene.group( 1 ) );
237 final Matcher uniprot = uniprot_PATTERN.matcher( line );
238 if ( uniprot.find() ) {
239 e.addCrossReference( new Accession( uniprot.group( 1 ), "uniprot" ) );
241 final Matcher interpro = interpro_PATTERN.matcher( line );
242 if ( interpro.find() ) {
243 e.addCrossReference( new Accession( interpro.group( 1 ), "interpro" ) );
245 final Matcher mim = mim_PATTERN.matcher( line );
247 e.addCrossReference( new Accession( mim.group( 1 ), "mim" ) );
249 final Matcher product = product_PATTERN.matcher( line );
250 if ( product.find() ) {
251 e.setSequenceSymbol( product.group( 1 ) );
253 final Matcher pdb = pdb_PATTERN.matcher( line );
255 e.addCrossReference( new Accession( pdb.group( 1 ), "pdb" ) );
// The sequence name is set only when some definition text was collected.
259 if ( def.length() > 0 ) {
260 e.setSequenceName( def.toString().trim() );
// Appends the trimmed text of s to sb.
// NOTE(review): the statement guarded by "sb.length() > 0" presumably inserts
// a separator before every fragment except the first -- confirm.
// NOTE(review): the name "x" says nothing about the purpose; consider renaming.
265 private static void x( final StringBuilder sb, final String s ) {
266 if ( sb.length() > 0 ) {
269 sb.append( s.trim() );
271 // FIXME actually this is NCBI entry
272 //http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/emb/AAR37336/
// Taxonomy identifier; setTaxId acts only while this is still null.
276 private String _tax_id;
// Presumably the sequence symbol handled by setSequenceSymbol / getSequenceSymbol -- confirm.
277 private String _symbol;
// Provider name, assigned by setProvider.
278 private String _provider;
// Cross-references; created lazily by addCrossReference, so null until the first one is added.
279 private SortedSet<Accession> _cross_references;
// Annotations; created lazily by addAnnotation, so null until the first one is added.
280 private SortedSet<Annotation> _annotations;
// Gene name; setGeneName assigns it only while it is still null (first value wins).
281 private String _gene_name;
283 // TODO PUBMED 15798186
285 // source /db_xref="taxon:9606"
289 // /db_xref="MIM:604739"
292 // /db_xref="MIM:604739"
293 // /db_xref="InterPro:IPR002475"
295 // /db_xref="UniProtKB/TrEMBL:Q5J7V1" <- reparse?
299 LOCUS NM_184234 2881 bp mRNA linear PRI 16-JUN-2013
300 DEFINITION Homo sapiens RNA binding motif protein 39 (RBM39), transcript
303 VERSION NM_184234.2 GI:336176061
305 SOURCE Homo sapiens (human)
306 ORGANISM Homo sapiens
307 Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
308 Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
309 Catarrhini; Hominidae; Homo.
310 REFERENCE 1 (bases 1 to 2881)
311 AUTHORS Sillars-Hardebol,A.H., Carvalho,B., Belien,J.A., de Wit,M.,
312 Delis-van Diemen,P.M., Tijssen,M., van de Wiel,M.A., Ponten,F.,
313 Meijer,G.A. and Fijneman,R.J.
314 TITLE CSE1L, DIDO1 and RBM39 in colorectal adenoma to carcinoma
316 JOURNAL Cell Oncol (Dordr) 35 (4), 293-300 (2012)
318 REMARK GeneRIF: Data show that CSE1L, DIDO1 and RBM39 mRNA expression
319 levels correlated with chromosome 20q DNA copy number status.
320 REFERENCE 2 (bases 1 to 2881)
321 AUTHORS Huang,G., Zhou,Z., Wang,H. and Kleinerman,E.S.
322 TITLE CAPER-alpha alternative splicing regulates the expression of
323 vascular endothelial growth factor(1)(6)(5) in Ewing sarcoma cells
324 JOURNAL Cancer 118 (8), 2106-2116 (2012)
326 REMARK GeneRIF: Increased VEGF(165) expression is secondary to the
327 down-regulation of CAPER-alpha by EWS/FLI-1. CAPER-alpha mediates
328 alternative splicing and controls the shift from VEGF(189) to
330 REFERENCE 3 (bases 1 to 2881)
331 AUTHORS Han,B., Stockwin,L.H., Hancock,C., Yu,S.X., Hollingshead,M.G. and
333 TITLE Proteomic analysis of nuclei isolated from cancer cell lines
334 treated with indenoisoquinoline NSC 724998, a novel topoisomerase I
336 JOURNAL J. Proteome Res. 9 (8), 4016-4027 (2010)
338 REMARK Erratum:[J Proteome Res. 2011 Apr 1;10(4):2128]
339 REFERENCE 4 (bases 1 to 2881)
340 AUTHORS Zhang,J.Y., Looi,K.S. and Tan,E.M.
341 TITLE Identification of tumor-associated antigens as diagnostic and
342 predictive biomarkers in cancer
343 JOURNAL Methods Mol. Biol. 520, 1-10 (2009)
345 REFERENCE 5 (bases 1 to 2881)
346 AUTHORS Dutta,J., Fan,G. and Gelinas,C.
347 TITLE CAPERalpha is a novel Rel-TAD-interacting factor that inhibits
348 lymphocyte transformation by the potent Rel/NF-kappaB oncoprotein
350 JOURNAL J. Virol. 82 (21), 10792-10802 (2008)
352 REMARK GeneRIF: this study identifies CAPERalpha (RNA binding motif
353 protein 39) as a new transcriptional coregulator for v-Rel and
354 reveals an important role in modulating Rel's oncogenic activity.
355 REFERENCE 6 (bases 1 to 2881)
356 AUTHORS Cazalla,D., Newton,K. and Caceres,J.F.
357 TITLE A novel SR-related protein is required for the second step of
359 JOURNAL Mol. Cell. Biol. 25 (8), 2969-2980 (2005)
361 REFERENCE 7 (bases 1 to 2881)
362 AUTHORS Dowhan,D.H., Hong,E.P., Auboeuf,D., Dennis,A.P., Wilson,M.M.,
363 Berget,S.M. and O'Malley,B.W.
364 TITLE Steroid hormone receptor coactivation and alternative RNA splicing
365 by U2AF65-related proteins CAPERalpha and CAPERbeta
366 JOURNAL Mol. Cell 17 (3), 429-439 (2005)
368 REFERENCE 8 (bases 1 to 2881)
369 AUTHORS Sun,N.N., Fastje,C.D., Wong,S.S., Sheppard,P.R., Macdonald,S.J.,
370 Ridenour,G., Hyde,J.D. and Witten,M.L.
371 TITLE Dose-dependent transcriptome changes by metal ores on a human acute
372 lymphoblastic leukemia cell line
373 JOURNAL Toxicol Ind Health 19 (7-10), 157-163 (2003)
375 REMARK GeneRIF: 10 genes were down-regulated following treatment of the
376 T-ALL cells with 0.15 and 1.5 microg/mL of metal ores at 72 h
377 REFERENCE 9 (bases 1 to 2881)
378 AUTHORS Jung,D.J., Na,S.Y., Na,D.S. and Lee,J.W.
379 TITLE Molecular cloning and characterization of CAPER, a novel
380 coactivator of activating protein-1 and estrogen receptors
381 JOURNAL J. Biol. Chem. 277 (2), 1229-1234 (2002)
383 REMARK GeneRIF: This paper describes the mouse gene.
384 REFERENCE 10 (bases 1 to 2881)
385 AUTHORS Imai,H., Chan,E.K., Kiyosawa,K., Fu,X.D. and Tan,E.M.
386 TITLE Novel nuclear autoantigen with splicing factor motifs identified
387 with antibody from hepatocellular carcinoma
388 JOURNAL J. Clin. Invest. 92 (5), 2419-2426 (1993)
390 COMMENT REVIEWED REFSEQ: This record has been curated by NCBI staff. The
391 reference sequence was derived from DC346351.1, BC141835.1 and
393 On Jun 16, 2011 this sequence version replaced gi:35493810.
395 Summary: This gene encodes a member of the U2AF65 family of
396 proteins. The encoded protein is found in the nucleus, where it
397 co-localizes with core spliceosomal proteins. It has been shown to
398 play a role in both steroid hormone receptor-mediated transcription
399 and alternative splicing, and it is also a transcriptional
400 coregulator of the viral oncoprotein v-Rel. Multiple transcript
401 variants have been observed for this gene. A related pseudogene has
402 been identified on chromosome X. [provided by RefSeq, Aug 2011].
404 Transcript Variant: This variant (1) encodes the longest isoform
405 (a, also called CC1.4).
407 Publication Note: This RefSeq record includes a subset of the
408 publications that are available for this gene. Please see the Gene
409 record to access additional publications.
411 ##Evidence-Data-START##
412 Transcript exon combination :: BC141835.1, L10911.1 [ECO:0000332]
413 RNAseq introns :: mixed/partial sample support
414 ERS025081, ERS025082 [ECO:0000350]
415 ##Evidence-Data-END##
416 COMPLETENESS: complete on the 3' end.
417 PRIMARY REFSEQ_SPAN PRIMARY_IDENTIFIER PRIMARY_SPAN COMP
418 1-578 DC346351.1 3-580
419 579-2872 BC141835.1 429-2722
420 2873-2881 C75555.1 1-9 c
421 FEATURES Location/Qualifiers
423 /organism="Homo sapiens"
425 /db_xref="taxon:9606"
430 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
431 /note="RNA binding motif protein 39"
432 /db_xref="GeneID:9584"
433 /db_xref="HGNC:15923"
434 /db_xref="HPRD:09201"
435 /db_xref="MIM:604739"
438 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
439 /inference="alignment:Splign:1.39.8"
442 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
443 /standard_name="REN58946"
444 /db_xref="UniSTS:383746"
445 misc_feature 221..223
447 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
448 /note="upstream in-frame stop codon"
451 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
452 /standard_name="G64285"
453 /db_xref="UniSTS:158667"
456 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
457 /inference="alignment:Splign:1.39.8"
460 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
461 /note="isoform a is encoded by transcript variant 1;
462 coactivator of activating protein-1 and estrogen
463 receptors; functional spliceosome-associated protein 59;
464 RNA-binding region (RNP1, RRM) containing 2;
465 hepatocellular carcinoma protein 1; splicing factor HCC1"
467 /product="RNA-binding protein 39 isoform a"
468 /protein_id="NP_909122.1"
469 /db_xref="GI:35493811"
470 /db_xref="CCDS:CCDS13266.1"
471 /db_xref="GeneID:9584"
472 /db_xref="HGNC:15923"
473 /db_xref="HPRD:09201"
474 /db_xref="MIM:604739"
475 /translation="MADDIDIEAMLEAPYKKDENKLSSANGHEERSKKRKKSKSRSRS
476 HERKRSKSKERKRSRDRERKKSKSRERKRSRSKERRRSRSRSRDRRFRGRYRSPYSGP
477 KFNSAIRGKIGLPHSIKLSRRRSRSKSPFRKDKSPVREPIDNLTPEERDARTVFCMQL
478 AARIRPRDLEEFFSTVGKVRDVRMISDRNSRRSKGIAYVEFVDVSSVPLAIGLTGQRV
479 LGVPIIVQASQAEKNRAAAMANNLQKGSAGPMRLYVGSLHFNITEDMLRGIFEPFGRI
480 ESIQLMMDSETGRSKGYGFITFSDSECAKKALEQLNGFELAGRPMKVGHVTERTDASS
481 ASSFLDSDELERTGIDLGTTGRLQLMARLAEGTGLQIPPAAQQALQMSGSLAFGAVAE
482 FSFVIDLQTRLSQQTEASALAAAASVQPLATQCFQLSNMFNPQTEEEVGWDTEIKDDV
483 IEECNKHGGVIHIYVDKNSAQGNVYVKCPSIAAAIAAVNALHGRWFAGKMITAAYVPL
484 PTYHNLFPDSMTATQLLVPSRR"
485 misc_feature 413..415
487 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
488 /experiment="experimental evidence, no additional details
490 /note="N-acetylalanine; propagated from
491 UniProtKB/Swiss-Prot (Q14498.2); acetylation site"
495 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
496 /inference="alignment:Splign:1.39.8"
500 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
501 /inference="alignment:Splign:1.39.8"
504 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
505 /standard_name="REN58786"
506 /db_xref="UniSTS:383586"
509 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
510 /standard_name="D19S1033"
511 /db_xref="UniSTS:154759"
514 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
515 /standard_name="REN58785"
516 /db_xref="UniSTS:383585"
518 polyA_signal 2851..2856
520 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
523 /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
525 1 atttggagct tggggcagct tctcgcgaga gcccgtgctg agggctctgt gaggccccgt
526 61 gtgtttgtgt gtgtgtatgt gtgctggtga atgtgagtac agggaagcag cggccgccat
527 121 ttcagggagc ttgtcgacgc tgtcgcaggg gtggatcctg agctgccgaa gccgccgtcc
528 181 tgctctcccg cgtgggcttc tctaattcca ttgttttttt tagattctct cgggcctagc
529 241 cgtccttgga acccgatatt cgggctgggc ggttccgcgg cctgggccta ggggcttaac
// Private constructor: instances are created inside this class, e.g. by
// createInstanceFromPlainTextForRefSeq.
534 private EbiDbEntry() {
537 private void addCrossReference( final Accession accession ) {
538 if ( _cross_references == null ) {
539 _cross_references = new TreeSet<Accession>();
541 System.out.println( "XREF ADDED: " + accession );
542 _cross_references.add( accession );
// Cloning is not supported: this method always throws CloneNotSupportedException.
546 public Object clone() throws CloneNotSupportedException {
547 throw new CloneNotSupportedException();
551 public String getAccession() {
// Returns the internal cross-reference set itself (no copy is made).
// The set is created lazily by addCrossReference, so the result is null
// when no cross-reference has been added.
556 public SortedSet<Accession> getCrossReferences() {
557 return _cross_references;
561 public String getGeneName() {
566 public SortedSet<GoTerm> getGoTerms() {
571 public String getProvider() {
576 public String getSequenceName() {
581 public String getSequenceSymbol() {
585 private void setSequenceSymbol( String symbol ) {
590 public String getTaxonomyIdentifier() {
595 public String getTaxonomyScientificName() {
600 public boolean isEmpty() {
601 return ( ForesterUtil.isEmpty( getAccession() ) && ForesterUtil.isEmpty( getSequenceName() )
602 && ForesterUtil.isEmpty( getTaxonomyScientificName() )
603 && ForesterUtil.isEmpty( getTaxonomyIdentifier() ) && ForesterUtil.isEmpty( getSequenceSymbol() ) );
606 private void setSequenceName( final String rec_name ) {
612 private void setGeneName( final String gene_name ) {
613 if ( _gene_name == null ) {
614 _gene_name = gene_name;
618 private void setTaxonomyScientificName( final String os ) {
624 private void setAccession( final String pa ) {
// Sets the provider. Unlike setGeneName there is no null guard here, so any
// previously stored value is overwritten.
630 public void setProvider( final String provider ) {
631 _provider = provider;
634 private void setTaxId( final String tax_id ) {
635 if ( _tax_id == null ) {
641 public SortedSet<Annotation> getAnnotations() {
645 private void addAnnotation( final Annotation annotation ) {
646 if ( _annotations == null ) {
647 _annotations = new TreeSet<Annotation>();
649 _annotations.add( annotation );