X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fws%2Fseqdb%2FEbiDbEntry.java;h=24e0bb53f9718ca6b5f3d6c35fd5dc4e18df85b8;hb=fbb7c0a322111e5221773fed19591da29296efb5;hp=8c2ce9af78358a74f64f47af7fbba09de581ce98;hpb=10297bd8b8a4b4ab198a17a42fc6ff24ae2ed49b;p=jalview.git diff --git a/forester/java/src/org/forester/ws/seqdb/EbiDbEntry.java b/forester/java/src/org/forester/ws/seqdb/EbiDbEntry.java index 8c2ce9a..24e0bb5 100644 --- a/forester/java/src/org/forester/ws/seqdb/EbiDbEntry.java +++ b/forester/java/src/org/forester/ws/seqdb/EbiDbEntry.java @@ -38,7 +38,23 @@ import org.forester.sequence.MolecularSequence; import org.forester.util.ForesterUtil; public final class EbiDbEntry implements SequenceDatabaseEntry { - + + private final static boolean DEBUG = false; + + private final static Pattern LETTERS_PATTERN = Pattern.compile( "^[A-Z]+" ); + private final static Pattern chromosome_PATTERN = Pattern.compile( "\\s+/chromosome=\"(\\w+)\"" ); + private final static Pattern map_PATTERN = Pattern.compile( "\\s+/map=\"([\\w\\.]+)\"" ); + private final static Pattern gene_PATTERN = Pattern.compile( "\\s+/gene=\"(.+)\"" ); + private final static Pattern mim_PATTERN = Pattern.compile( "\\s+/db_xref=\"MIM:(\\d+)\"" ); + private final static Pattern taxon_PATTERN = Pattern.compile( "\\s+/db_xref=\"taxon:(\\d+)\"" ); + private final static Pattern interpro_PATTERN = Pattern.compile( "\\s+/db_xref=\"InterPro:([A-Z0-9]+)\"" ); + private final static Pattern uniprot_PATTERN = Pattern.compile( "\\s+/db_xref=\"UniProtKB/[A-Za-z-]*:(\\w+)\"" ); + private final static Pattern hgnc_PATTERN = Pattern.compile( "\\s+/db_xref=\"[A-Z:]*HGNC:(\\d+)\"" ); + private final static Pattern geneid_PATTERN = Pattern.compile( "\\s+/db_xref=\"GeneID:(\\d+)\"" ); + private final static Pattern pdb_PATTERN = Pattern.compile( "\\s+/db_xref=\"PDB:([A-Z0-9]+)\"" ); + private final static Pattern ec_PATTERN = Pattern.compile( "\\s+/EC_number=\"([\\.\\-\\d]+)\"" ); + private final static Pattern product_PATTERN = Pattern.compile( "\\s+/product=\"(\\w{1,10})\"" ); + private SortedSet _annotations; private String _chromosome; private SortedSet _cross_references; @@ -46,264 +62,11 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { private String _gene_name; private String _map; private String _os; - // FIXME actually this is NCBI entry - //http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/emb/AAR37336/ private String _pa; private String _provider; private String _symbol; private String _tax_id; - - // TODO PUBMED 15798186 - //TODO (FEATURES) - // source /db_xref="taxon:9606" - // gene 1..2881 - // /gene="RBM39" - // - // /db_xref="MIM:604739" - // CDS - // /gene="RBM39" - // /db_xref="MIM:604739" - // /db_xref="InterPro:IPR002475" - // /product="Bcl-2" - // /db_xref="UniProtKB/TrEMBL:Q5J7V1" <- reparse? - // - // Protein - /* - LOCUS NM_184234 2881 bp mRNA linear PRI 16-JUN-2013 - DEFINITION Homo sapiens RNA binding motif protein 39 (RBM39), transcript - variant 1, mRNA. - ACCESSION NM_184234 - VERSION NM_184234.2 GI:336176061 - KEYWORDS RefSeq. - SOURCE Homo sapiens (human) - ORGANISM Homo sapiens - Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; - Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; - Catarrhini; Hominidae; Homo. - REFERENCE 1 (bases 1 to 2881) - AUTHORS Sillars-Hardebol,A.H., Carvalho,B., Belien,J.A., de Wit,M., - Delis-van Diemen,P.M., Tijssen,M., van de Wiel,M.A., Ponten,F., - Meijer,G.A. and Fijneman,R.J. - TITLE CSE1L, DIDO1 and RBM39 in colorectal adenoma to carcinoma - progression - JOURNAL Cell Oncol (Dordr) 35 (4), 293-300 (2012) - PUBMED 22711543 - REMARK GeneRIF: Data show that CSE1L, DIDO1 and RBM39 mRNA expression - levels correlated with chromosome 20q DNA copy number status. - REFERENCE 2 (bases 1 to 2881) - AUTHORS Huang,G., Zhou,Z., Wang,H. and Kleinerman,E.S. - TITLE CAPER-alpha alternative splicing regulates the expression of - vascular endothelial growth factor(1)(6)(5) in Ewing sarcoma cells - JOURNAL Cancer 118 (8), 2106-2116 (2012) - PUBMED 22009261 - REMARK GeneRIF: Increased VEGF(165) expression is secondary to the - down-regulation of CAPER-alpha by EWS/FLI-1. CAPER-alpha mediates - alternative splicing and controls the shift from VEGF(189) to - VEGF(165) . - REFERENCE 3 (bases 1 to 2881) - AUTHORS Han,B., Stockwin,L.H., Hancock,C., Yu,S.X., Hollingshead,M.G. and - Newton,D.L. - TITLE Proteomic analysis of nuclei isolated from cancer cell lines - treated with indenoisoquinoline NSC 724998, a novel topoisomerase I - inhibitor - JOURNAL J. Proteome Res. 9 (8), 4016-4027 (2010) - PUBMED 20515076 - REMARK Erratum:[J Proteome Res. 2011 Apr 1;10(4):2128] - REFERENCE 4 (bases 1 to 2881) - AUTHORS Zhang,J.Y., Looi,K.S. and Tan,E.M. - TITLE Identification of tumor-associated antigens as diagnostic and - predictive biomarkers in cancer - JOURNAL Methods Mol. Biol. 520, 1-10 (2009) - PUBMED 19381943 - REFERENCE 5 (bases 1 to 2881) - AUTHORS Dutta,J., Fan,G. and Gelinas,C. - TITLE CAPERalpha is a novel Rel-TAD-interacting factor that inhibits - lymphocyte transformation by the potent Rel/NF-kappaB oncoprotein - v-Rel - JOURNAL J. Virol. 82 (21), 10792-10802 (2008) - PUBMED 18753212 - REMARK GeneRIF: this study identifies CAPERalpha (RNA binding motif - protein 39) as a new transcriptional coregulator for v-Rel and - reveals an important role in modulating Rel's oncogenic activity. - REFERENCE 6 (bases 1 to 2881) - AUTHORS Cazalla,D., Newton,K. and Caceres,J.F. - TITLE A novel SR-related protein is required for the second step of - Pre-mRNA splicing - JOURNAL Mol. Cell. Biol. 25 (8), 2969-2980 (2005) - PUBMED 15798186 - REFERENCE 7 (bases 1 to 2881) - AUTHORS Dowhan,D.H., Hong,E.P., Auboeuf,D., Dennis,A.P., Wilson,M.M., - Berget,S.M. and O'Malley,B.W. - TITLE Steroid hormone receptor coactivation and alternative RNA splicing - by U2AF65-related proteins CAPERalpha and CAPERbeta - JOURNAL Mol. Cell 17 (3), 429-439 (2005) - PUBMED 15694343 - REFERENCE 8 (bases 1 to 2881) - AUTHORS Sun,N.N., Fastje,C.D., Wong,S.S., Sheppard,P.R., Macdonald,S.J., - Ridenour,G., Hyde,J.D. and Witten,M.L. - TITLE Dose-dependent transcriptome changes by metal ores on a human acute - lymphoblastic leukemia cell line - JOURNAL Toxicol Ind Health 19 (7-10), 157-163 (2003) - PUBMED 15747776 - REMARK GeneRIF: 10 genes were down-regulated following treatment of the - T-ALL cells with 0.15 and 1.5 microg/mL of metal ores at 72 h - REFERENCE 9 (bases 1 to 2881) - AUTHORS Jung,D.J., Na,S.Y., Na,D.S. and Lee,J.W. - TITLE Molecular cloning and characterization of CAPER, a novel - coactivator of activating protein-1 and estrogen receptors - JOURNAL J. Biol. Chem. 277 (2), 1229-1234 (2002) - PUBMED 11704680 - REMARK GeneRIF: This paper describes the mouse gene. - REFERENCE 10 (bases 1 to 2881) - AUTHORS Imai,H., Chan,E.K., Kiyosawa,K., Fu,X.D. and Tan,E.M. - TITLE Novel nuclear autoantigen with splicing factor motifs identified - with antibody from hepatocellular carcinoma - JOURNAL J. Clin. Invest. 92 (5), 2419-2426 (1993) - PUBMED 8227358 - COMMENT REVIEWED REFSEQ: This record has been curated by NCBI staff. The - reference sequence was derived from DC346351.1, BC141835.1 and - C75555.1. - On Jun 16, 2011 this sequence version replaced gi:35493810. - - Summary: This gene encodes a member of the U2AF65 family of - proteins. The encoded protein is found in the nucleus, where it - co-localizes with core spliceosomal proteins. It has been shown to - play a role in both steroid hormone receptor-mediated transcription - and alternative splicing, and it is also a transcriptional - coregulator of the viral oncoprotein v-Rel. Multiple transcript - variants have been observed for this gene. A related pseudogene has - been identified on chromosome X. [provided by RefSeq, Aug 2011]. - - Transcript Variant: This variant (1) encodes the longest isoform - (a, also called CC1.4). - - Publication Note: This RefSeq record includes a subset of the - publications that are available for this gene. Please see the Gene - record to access additional publications. - - ##Evidence-Data-START## - Transcript exon combination :: BC141835.1, L10911.1 [ECO:0000332] - RNAseq introns :: mixed/partial sample support - ERS025081, ERS025082 [ECO:0000350] - ##Evidence-Data-END## - COMPLETENESS: complete on the 3' end. - PRIMARY REFSEQ_SPAN PRIMARY_IDENTIFIER PRIMARY_SPAN COMP - 1-578 DC346351.1 3-580 - 579-2872 BC141835.1 429-2722 - 2873-2881 C75555.1 1-9 c - FEATURES Location/Qualifiers - source 1..2881 - /organism="Homo sapiens" - /mol_type="mRNA" - /db_xref="taxon:9606" - /chromosome="20" - /map="20q11.22" - gene 1..2881 - /gene="RBM39" - /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2" - /note="RNA binding motif protein 39" - /db_xref="GeneID:9584" - /db_xref="HGNC:15923" - /db_xref="HPRD:09201" - /db_xref="MIM:604739" - exon 1..396 - /gene="RBM39" - /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2" - /inference="alignment:Splign:1.39.8" - STS 35..262 - /gene="RBM39" - /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2" - /standard_name="REN58946" - /db_xref="UniSTS:383746" - misc_feature 221..223 - /gene="RBM39" - /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2" - /note="upstream in-frame stop codon" - STS 299..453 - /gene="RBM39" - /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2" - /standard_name="G64285" - /db_xref="UniSTS:158667" - exon 397..460 - /gene="RBM39" - /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2" - /inference="alignment:Splign:1.39.8" - CDS 410..2002 - /gene="RBM39" - /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2" - /note="isoform a is encoded by transcript variant 1; - coactivator of activating protein-1 and estrogen - receptors; functional spliceosome-associated protein 59; - RNA-binding region (RNP1, RRM) containing 2; - hepatocellular carcinoma protein 1; splicing factor HCC1" - /codon_start=1 - /product="RNA-binding protein 39 isoform a" - /protein_id="NP_909122.1" - /db_xref="GI:35493811" - /db_xref="CCDS:CCDS13266.1" - /db_xref="GeneID:9584" - /db_xref="HGNC:15923" - /db_xref="HPRD:09201" - /db_xref="MIM:604739" - /translation="MADDIDIEAMLEAPYKKDENKLSSANGHEERSKKRKKSKSRSRS - HERKRSKSKERKRSRDRERKKSKSRERKRSRSKERRRSRSRSRDRRFRGRYRSPYSGP - KFNSAIRGKIGLPHSIKLSRRRSRSKSPFRKDKSPVREPIDNLTPEERDARTVFCMQL - AARIRPRDLEEFFSTVGKVRDVRMISDRNSRRSKGIAYVEFVDVSSVPLAIGLTGQRV - LGVPIIVQASQAEKNRAAAMANNLQKGSAGPMRLYVGSLHFNITEDMLRGIFEPFGRI - ESIQLMMDSETGRSKGYGFITFSDSECAKKALEQLNGFELAGRPMKVGHVTERTDASS - ASSFLDSDELERTGIDLGTTGRLQLMARLAEGTGLQIPPAAQQALQMSGSLAFGAVAE - FSFVIDLQTRLSQQTEASALAAAASVQPLATQCFQLSNMFNPQTEEEVGWDTEIKDDV - IEECNKHGGVIHIYVDKNSAQGNVYVKCPSIAAAIAAVNALHGRWFAGKMITAAYVPL - PTYHNLFPDSMTATQLLVPSRR" - misc_feature 413..415 - /gene="RBM39" - /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2" - /experiment="experimental evidence, no additional details - recorded" - /note="N-acetylalanine; propagated from - UniProtKB/Swiss-Prot (Q14498.2); acetylation site" - - exon 461..510 - /gene="RBM39" - /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2" - /inference="alignment:Splign:1.39.8" - - exon 1902..2874 - /gene="RBM39" - /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2" - /inference="alignment:Splign:1.39.8" - STS 1956..2182 - /gene="RBM39" - /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2" - /standard_name="REN58786" - /db_xref="UniSTS:383586" - STS 2104..2148 - /gene="RBM39" - /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2" - /standard_name="D19S1033" - /db_xref="UniSTS:154759" - STS 2145..2400 - /gene="RBM39" - /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2" - /standard_name="REN58785" - /db_xref="UniSTS:383585" - - polyA_signal 2851..2856 - /gene="RBM39" - /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2" - polyA_site 2874 - /gene="RBM39" - /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2" - ORIGIN - 1 atttggagct tggggcagct tctcgcgaga gcccgtgctg agggctctgt gaggccccgt - 61 gtgtttgtgt gtgtgtatgt gtgctggtga atgtgagtac agggaagcag cggccgccat - 121 ttcagggagc ttgtcgacgc tgtcgcaggg gtggatcctg agctgccgaa gccgccgtcc - 181 tgctctcccg cgtgggcttc tctaattcca ttgttttttt tagattctct cgggcctagc - 241 cgtccttgga acccgatatt cgggctgggc ggttccgcgg cctgggccta ggggcttaac - - - - */ + private EbiDbEntry() { } @@ -379,10 +142,12 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { && ForesterUtil.isEmpty( getTaxonomyIdentifier() ) && ForesterUtil.isEmpty( getSequenceSymbol() ) ); } - public void setProvider( final String provider ) { - _provider = provider; - } + @Override + public MolecularSequence getMolecularSequence() { + // TODO Auto-generated method stub + return null; + } private void addAnnotation( final Annotation annotation ) { if ( _annotations == null ) { _annotations = new TreeSet(); @@ -394,7 +159,9 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { if ( _cross_references == null ) { _cross_references = new TreeSet(); } - System.out.println( "XREF ADDED: " + accession ); + if ( DEBUG ) { + System.out.println( "XREF ADDED: " + accession ); + } _cross_references.add( accession ); } @@ -439,46 +206,16 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { _os = os; } } - - // public static SequenceDatabaseEntry createInstanceFromPlainText( final List lines ) { - // final EbiDbEntry e = new EbiDbEntry(); - // for( final String line : lines ) { - // if ( line.startsWith( "PA" ) ) { - // e.setPA( SequenceDbWsTools.extractFrom( line, "PA" ) ); - // } - // else if ( line.startsWith( "DE" ) ) { - // e.setDe( SequenceDbWsTools.extractFrom( line, "DE" ) ); - // } - // else if ( line.startsWith( "OS" ) ) { - // if ( line.indexOf( "(" ) > 0 ) { - // e.setOs( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) ); - // } - // else { - // e.setOs( SequenceDbWsTools.extractFrom( line, "OS" ) ); - // } - // } - // else if ( line.startsWith( "OX" ) ) { - // if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) { - // e.setTaxId( SequenceDbWsTools.extractFromTo( line, "NCBI_TaxID=", ";" ) ); - // } - // } - // } - // return e; - // } - public static SequenceDatabaseEntry createInstanceFromPlainTextForRefSeq( final List lines ) { - final Pattern X_PATTERN = Pattern.compile( "^[A-Z]+" ); - final Pattern chromosome_PATTERN = Pattern.compile( "\\s+/chromosome=\"(\\w+)\"" ); - final Pattern map_PATTERN = Pattern.compile( "\\s+/map=\"([\\w+\\.])\"" ); - final Pattern gene_PATTERN = Pattern.compile( "\\s+/gene=\"(.+)\"" ); - final Pattern mim_PATTERN = Pattern.compile( "\\s+/db_xref=\"MIM:(\\d+)\"" ); - final Pattern taxon_PATTERN = Pattern.compile( "\\s+/db_xref=\"taxon:(\\d+)\"" ); - final Pattern interpro_PATTERN = Pattern.compile( "\\s+/db_xref=\"InterPro:([A-Z0-9]+)\"" ); - final Pattern uniprot_PATTERN = Pattern.compile( "\\s+/db_xref=\"UniProtKB/[A-Za-z-]*:(\\w+)\"" ); - final Pattern hgnc_PATTERN = Pattern.compile( "\\s+/db_xref=\"[A-Z:]*HGNC:(\\d+)\"" ); - final Pattern geneid_PATTERN = Pattern.compile( "\\s+/db_xref=\"GeneID:(\\d+)\"" ); - final Pattern pdb_PATTERN = Pattern.compile( "\\s+/db_xref=\"PDB:([A-Z0-9]+)\"" ); - final Pattern ec_PATTERN = Pattern.compile( "\\s+/EC_number=\"([\\.\\-\\d]+)\"" ); - final Pattern product_PATTERN = Pattern.compile( "\\s+/product=\"(\\w{1,10})\"" ); + + private static void append( final StringBuilder sb, final String s ) { + if ( sb.length() > 0 ) { + sb.append( " " ); + } + sb.append( s.trim() ); + } + + public final static SequenceDatabaseEntry createInstance( final List lines ) { + final EbiDbEntry e = new EbiDbEntry(); final StringBuilder def = new StringBuilder(); boolean in_definition = false; @@ -504,26 +241,26 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { } if ( line.indexOf( "[" ) > 0 ) { if ( definiton ) { - x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) ) ); + append( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) ) ); } else { - x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "[" ) ) ); + append( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "[" ) ) ); } } else if ( line.indexOf( "." ) > 0 ) { if ( definiton ) { - x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) ) ); + append( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) ) ); } else { - x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "." ) ) ); + append( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "." ) ) ); } } else { if ( definiton ) { - x( def, ( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) ) ); + append( def, ( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) ) ); } else { - x( def, ( SequenceDbWsTools.extractFrom( line, "DE" ) ) ); + append( def, ( SequenceDbWsTools.extractFrom( line, "DE" ) ) ); } } if ( definiton ) { @@ -537,7 +274,6 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { else { e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, " ORGANISM" ) ); } - // in_def = false; } else if ( line.startsWith( "OS " ) ) { if ( line.indexOf( "(" ) > 0 ) { @@ -562,14 +298,13 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { else { in_definition = false; } - if ( !line.startsWith( "FT " ) && X_PATTERN.matcher( line ).find() ) { + if ( !line.startsWith( "FT " ) && LETTERS_PATTERN.matcher( line ).find() ) { in_features = false; in_source = false; in_gene = false; in_cds = false; in_mrna = false; in_protein = false; - // in_def = false; } if ( line.startsWith( "FEATURES " ) || line.startsWith( "FT " ) ) { in_features = true; @@ -670,16 +405,4 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { return e; } - private static void x( final StringBuilder sb, final String s ) { - if ( sb.length() > 0 ) { - sb.append( " " ); - } - sb.append( s.trim() ); - } - - @Override - public MolecularSequence getMolecularSequence() { - // TODO Auto-generated method stub - return null; - } }