forester/java/src/org/forester/ws/seqdb/EbiDbEntry.java

   1 // $Id:
   2 // forester -- software libraries and applications
   3 // for genomics and evolutionary biology research.
   4 //
   5 // Copyright (C) 2010 Christian M Zmasek
   6 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
   7 // All rights reserved
   8 //
   9 // This library is free software; you can redistribute it and/or
  10 // modify it under the terms of the GNU Lesser General Public
  11 // License as published by the Free Software Foundation; either
  12 // version 2.1 of the License, or (at your option) any later version.
  13 //
  14 // This library is distributed in the hope that it will be useful,
  15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17 // Lesser General Public License for more details.
  18 //
  19 // You should have received a copy of the GNU Lesser General Public
  20 // License along with this library; if not, write to the Free Software
  21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
  22 //
  23 // Contact: phylosoft @ gmail . com
  24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
  25
  26 package org.forester.ws.seqdb;
  27
  28 import java.util.List;
  29 import java.util.SortedSet;
  30 import java.util.TreeSet;
  31 import java.util.regex.Matcher;
  32 import java.util.regex.Pattern;
  33
  34 import org.forester.go.GoTerm;
  35 import org.forester.phylogeny.data.Accession;
  36 import org.forester.phylogeny.data.Annotation;
  37 import org.forester.sequence.MolecularSequence;
  38 import org.forester.util.ForesterUtil;
  39
  40 public final class EbiDbEntry implements SequenceDatabaseEntry {
  41
  42     private SortedSet<Annotation> _annotations;
  43     private String                _chromosome;
  44     private SortedSet<Accession>  _cross_references;
  45     private String                _de;
  46     private String                _gene_name;
  47     private String                _map;
  48     private String                _os;
  49     // FIXME actually this is NCBI entry
  50     //http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/emb/AAR37336/
  51     private String                _pa;
  52     private String                _provider;
  53     private String                _symbol;
  54     private String                _tax_id;
  55
  56     // TODO  PUBMED   15798186
  57     //TODO  (FEATURES)
  58     // source /db_xref="taxon:9606"
  59     // gene            1..2881
  60     // /gene="RBM39"
  61     //
  62     // /db_xref="MIM:604739"
  63     // CDS
  64     // /gene="RBM39"
  65     // /db_xref="MIM:604739"
  66     // /db_xref="InterPro:IPR002475"
  67     // /product="Bcl-2"
  68     // /db_xref="UniProtKB/TrEMBL:Q5J7V1" <- reparse?
  69     //
  70     // Protein
  71     /*
  72     LOCUS       NM_184234               2881 bp    mRNA    linear   PRI 16-JUN-2013
  73     DEFINITION  Homo sapiens RNA binding motif protein 39 (RBM39), transcript
  74             variant 1, mRNA.
  75     ACCESSION   NM_184234
  76     VERSION     NM_184234.2  GI:336176061
  77     KEYWORDS    RefSeq.
  78     SOURCE      Homo sapiens (human)
  79     ORGANISM  Homo sapiens
  80             Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
  81             Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
  82             Catarrhini; Hominidae; Homo.
  83     REFERENCE   1  (bases 1 to 2881)
  84     AUTHORS   Sillars-Hardebol,A.H., Carvalho,B., Belien,J.A., de Wit,M.,
  85             Delis-van Diemen,P.M., Tijssen,M., van de Wiel,M.A., Ponten,F.,
  86             Meijer,G.A. and Fijneman,R.J.
  87     TITLE     CSE1L, DIDO1 and RBM39 in colorectal adenoma to carcinoma
  88             progression
  89     JOURNAL   Cell Oncol (Dordr) 35 (4), 293-300 (2012)
  90     PUBMED   22711543
  91     REMARK    GeneRIF: Data show that CSE1L, DIDO1 and RBM39 mRNA expression
  92             levels correlated with chromosome 20q DNA copy number status.
  93     REFERENCE   2  (bases 1 to 2881)
  94     AUTHORS   Huang,G., Zhou,Z., Wang,H. and Kleinerman,E.S.
  95     TITLE     CAPER-alpha alternative splicing regulates the expression of
  96             vascular endothelial growth factor(1)(6)(5) in Ewing sarcoma cells
  97     JOURNAL   Cancer 118 (8), 2106-2116 (2012)
  98     PUBMED   22009261
  99     REMARK    GeneRIF: Increased VEGF(165) expression is secondary to the
 100             down-regulation of CAPER-alpha by EWS/FLI-1. CAPER-alpha mediates
 101             alternative splicing and controls the shift from VEGF(189) to
 102             VEGF(165) .
 103     REFERENCE   3  (bases 1 to 2881)
 104     AUTHORS   Han,B., Stockwin,L.H., Hancock,C., Yu,S.X., Hollingshead,M.G. and
 105             Newton,D.L.
 106     TITLE     Proteomic analysis of nuclei isolated from cancer cell lines
 107             treated with indenoisoquinoline NSC 724998, a novel topoisomerase I
 108             inhibitor
 109     JOURNAL   J. Proteome Res. 9 (8), 4016-4027 (2010)
 110     PUBMED   20515076
 111     REMARK    Erratum:[J Proteome Res. 2011 Apr 1;10(4):2128]
 112     REFERENCE   4  (bases 1 to 2881)
 113     AUTHORS   Zhang,J.Y., Looi,K.S. and Tan,E.M.
 114     TITLE     Identification of tumor-associated antigens as diagnostic and
 115             predictive biomarkers in cancer
 116     JOURNAL   Methods Mol. Biol. 520, 1-10 (2009)
 117     PUBMED   19381943
 118     REFERENCE   5  (bases 1 to 2881)
 119     AUTHORS   Dutta,J., Fan,G. and Gelinas,C.
 120     TITLE     CAPERalpha is a novel Rel-TAD-interacting factor that inhibits
 121             lymphocyte transformation by the potent Rel/NF-kappaB oncoprotein
 122             v-Rel
 123     JOURNAL   J. Virol. 82 (21), 10792-10802 (2008)
 124     PUBMED   18753212
 125     REMARK    GeneRIF: this study identifies CAPERalpha (RNA binding motif
 126             protein 39) as a new transcriptional coregulator for v-Rel and
 127             reveals an important role in modulating Rel's oncogenic activity.
 128     REFERENCE   6  (bases 1 to 2881)
 129     AUTHORS   Cazalla,D., Newton,K. and Caceres,J.F.
 130     TITLE     A novel SR-related protein is required for the second step of
 131             Pre-mRNA splicing
 132     JOURNAL   Mol. Cell. Biol. 25 (8), 2969-2980 (2005)
 133     PUBMED   15798186
 134     REFERENCE   7  (bases 1 to 2881)
 135     AUTHORS   Dowhan,D.H., Hong,E.P., Auboeuf,D., Dennis,A.P., Wilson,M.M.,
 136             Berget,S.M. and O'Malley,B.W.
 137     TITLE     Steroid hormone receptor coactivation and alternative RNA splicing
 138             by U2AF65-related proteins CAPERalpha and CAPERbeta
 139     JOURNAL   Mol. Cell 17 (3), 429-439 (2005)
 140     PUBMED   15694343
 141     REFERENCE   8  (bases 1 to 2881)
 142     AUTHORS   Sun,N.N., Fastje,C.D., Wong,S.S., Sheppard,P.R., Macdonald,S.J.,
 143             Ridenour,G., Hyde,J.D. and Witten,M.L.
 144     TITLE     Dose-dependent transcriptome changes by metal ores on a human acute
 145             lymphoblastic leukemia cell line
 146     JOURNAL   Toxicol Ind Health 19 (7-10), 157-163 (2003)
 147     PUBMED   15747776
 148     REMARK    GeneRIF: 10 genes were down-regulated following treatment of the
 149             T-ALL cells with 0.15 and 1.5 microg/mL of metal ores at 72 h
 150     REFERENCE   9  (bases 1 to 2881)
 151     AUTHORS   Jung,D.J., Na,S.Y., Na,D.S. and Lee,J.W.
 152     TITLE     Molecular cloning and characterization of CAPER, a novel
 153             coactivator of activating protein-1 and estrogen receptors
 154     JOURNAL   J. Biol. Chem. 277 (2), 1229-1234 (2002)
 155     PUBMED   11704680
 156     REMARK    GeneRIF: This paper describes the mouse gene.
 157     REFERENCE   10 (bases 1 to 2881)
 158     AUTHORS   Imai,H., Chan,E.K., Kiyosawa,K., Fu,X.D. and Tan,E.M.
 159     TITLE     Novel nuclear autoantigen with splicing factor motifs identified
 160             with antibody from hepatocellular carcinoma
 161     JOURNAL   J. Clin. Invest. 92 (5), 2419-2426 (1993)
 162     PUBMED   8227358
 163     COMMENT     REVIEWED REFSEQ: This record has been curated by NCBI staff. The
 164             reference sequence was derived from DC346351.1, BC141835.1 and
 165             C75555.1.
 166             On Jun 16, 2011 this sequence version replaced gi:35493810.
 167
 168             Summary: This gene encodes a member of the U2AF65 family of
 169             proteins. The encoded protein is found in the nucleus, where it
 170             co-localizes with core spliceosomal proteins. It has been shown to
 171             play a role in both steroid hormone receptor-mediated transcription
 172             and alternative splicing, and it is also a transcriptional
 173             coregulator of the viral oncoprotein v-Rel. Multiple transcript
 174             variants have been observed for this gene. A related pseudogene has
 175             been identified on chromosome X. [provided by RefSeq, Aug 2011].
 176
 177             Transcript Variant: This variant (1) encodes the longest isoform
 178             (a, also called CC1.4).
 179
 180             Publication Note:  This RefSeq record includes a subset of the
 181             publications that are available for this gene. Please see the Gene
 182             record to access additional publications.
 183
 184             ##Evidence-Data-START##
 185             Transcript exon combination :: BC141835.1, L10911.1 [ECO:0000332]
 186             RNAseq introns              :: mixed/partial sample support
 187                                            ERS025081, ERS025082 [ECO:0000350]
 188             ##Evidence-Data-END##
 189             COMPLETENESS: complete on the 3' end.
 190     PRIMARY     REFSEQ_SPAN         PRIMARY_IDENTIFIER PRIMARY_SPAN        COMP
 191             1-578               DC346351.1         3-580
 192             579-2872            BC141835.1         429-2722
 193             2873-2881           C75555.1           1-9                 c
 194     FEATURES             Location/Qualifiers
 195      source          1..2881
 196                      /organism="Homo sapiens"
 197                      /mol_type="mRNA"
 198                      /db_xref="taxon:9606"
 199                      /chromosome="20"
 200                      /map="20q11.22"
 201      gene            1..2881
 202                      /gene="RBM39"
 203                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 204                      /note="RNA binding motif protein 39"
 205                      /db_xref="GeneID:9584"
 206                      /db_xref="HGNC:15923"
 207                      /db_xref="HPRD:09201"
 208                      /db_xref="MIM:604739"
 209      exon            1..396
 210                      /gene="RBM39"
 211                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 212                      /inference="alignment:Splign:1.39.8"
 213      STS             35..262
 214                      /gene="RBM39"
 215                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 216                      /standard_name="REN58946"
 217                      /db_xref="UniSTS:383746"
 218      misc_feature    221..223
 219                      /gene="RBM39"
 220                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 221                      /note="upstream in-frame stop codon"
 222      STS             299..453
 223                      /gene="RBM39"
 224                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 225                      /standard_name="G64285"
 226                      /db_xref="UniSTS:158667"
 227      exon            397..460
 228                      /gene="RBM39"
 229                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 230                      /inference="alignment:Splign:1.39.8"
 231      CDS             410..2002
 232                      /gene="RBM39"
 233                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 234                      /note="isoform a is encoded by transcript variant 1;
 235                      coactivator of activating protein-1 and estrogen
 236                      receptors; functional spliceosome-associated protein 59;
 237                      RNA-binding region (RNP1, RRM) containing 2;
 238                      hepatocellular carcinoma protein 1; splicing factor HCC1"
 239                      /codon_start=1
 240                      /product="RNA-binding protein 39 isoform a"
 241                      /protein_id="NP_909122.1"
 242                      /db_xref="GI:35493811"
 243                      /db_xref="CCDS:CCDS13266.1"
 244                      /db_xref="GeneID:9584"
 245                      /db_xref="HGNC:15923"
 246                      /db_xref="HPRD:09201"
 247                      /db_xref="MIM:604739"
 248                      /translation="MADDIDIEAMLEAPYKKDENKLSSANGHEERSKKRKKSKSRSRS
 249                      HERKRSKSKERKRSRDRERKKSKSRERKRSRSKERRRSRSRSRDRRFRGRYRSPYSGP
 250                      KFNSAIRGKIGLPHSIKLSRRRSRSKSPFRKDKSPVREPIDNLTPEERDARTVFCMQL
 251                      AARIRPRDLEEFFSTVGKVRDVRMISDRNSRRSKGIAYVEFVDVSSVPLAIGLTGQRV
 252                      LGVPIIVQASQAEKNRAAAMANNLQKGSAGPMRLYVGSLHFNITEDMLRGIFEPFGRI
 253                      ESIQLMMDSETGRSKGYGFITFSDSECAKKALEQLNGFELAGRPMKVGHVTERTDASS
 254                      ASSFLDSDELERTGIDLGTTGRLQLMARLAEGTGLQIPPAAQQALQMSGSLAFGAVAE
 255                      FSFVIDLQTRLSQQTEASALAAAASVQPLATQCFQLSNMFNPQTEEEVGWDTEIKDDV
 256                      IEECNKHGGVIHIYVDKNSAQGNVYVKCPSIAAAIAAVNALHGRWFAGKMITAAYVPL
 257                      PTYHNLFPDSMTATQLLVPSRR"
 258      misc_feature    413..415
 259                      /gene="RBM39"
 260                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 261                      /experiment="experimental evidence, no additional details
 262                      recorded"
 263                      /note="N-acetylalanine; propagated from
 264                      UniProtKB/Swiss-Prot (Q14498.2); acetylation site"
 265
 266      exon            461..510
 267                      /gene="RBM39"
 268                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 269                      /inference="alignment:Splign:1.39.8"
 270
 271      exon            1902..2874
 272                      /gene="RBM39"
 273                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 274                      /inference="alignment:Splign:1.39.8"
 275      STS             1956..2182
 276                      /gene="RBM39"
 277                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 278                      /standard_name="REN58786"
 279                      /db_xref="UniSTS:383586"
 280      STS             2104..2148
 281                      /gene="RBM39"
 282                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 283                      /standard_name="D19S1033"
 284                      /db_xref="UniSTS:154759"
 285      STS             2145..2400
 286                      /gene="RBM39"
 287                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 288                      /standard_name="REN58785"
 289                      /db_xref="UniSTS:383585"
 290
 291      polyA_signal    2851..2856
 292                      /gene="RBM39"
 293                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 294      polyA_site      2874
 295                      /gene="RBM39"
 296                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 297     ORIGIN
 298         1 atttggagct tggggcagct tctcgcgaga gcccgtgctg agggctctgt gaggccccgt
 299        61 gtgtttgtgt gtgtgtatgt gtgctggtga atgtgagtac agggaagcag cggccgccat
 300       121 ttcagggagc ttgtcgacgc tgtcgcaggg gtggatcctg agctgccgaa gccgccgtcc
 301       181 tgctctcccg cgtgggcttc tctaattcca ttgttttttt tagattctct cgggcctagc
 302       241 cgtccttgga acccgatatt cgggctgggc ggttccgcgg cctgggccta ggggcttaac
 303
 304
 305
 306      */
 307     private EbiDbEntry() {
 308     }
 309
 310     @Override
 311     public Object clone() throws CloneNotSupportedException {
 312         throw new CloneNotSupportedException();
 313     }
 314
 315     @Override
 316     public String getAccession() {
 317         return _pa;
 318     }
 319
 320     @Override
 321     public SortedSet<Annotation> getAnnotations() {
 322         return _annotations;
 323     }
 324
 325     @Override
 326     public String getChromosome() {
 327         return _chromosome;
 328     }
 329
 330     @Override
 331     public SortedSet<Accession> getCrossReferences() {
 332         return _cross_references;
 333     }
 334
 335     @Override
 336     public String getGeneName() {
 337         return _gene_name;
 338     }
 339
 340     @Override
 341     public SortedSet<GoTerm> getGoTerms() {
 342         return null;
 343     }
 344
 345     @Override
 346     public String getMap() {
 347         return _map;
 348     }
 349
 350     @Override
 351     public String getProvider() {
 352         return _provider;
 353     }
 354
 355     @Override
 356     public String getSequenceName() {
 357         return _de;
 358     }
 359
 360     @Override
 361     public String getSequenceSymbol() {
 362         return _symbol;
 363     }
 364
 365     @Override
 366     public String getTaxonomyIdentifier() {
 367         return _tax_id;
 368     }
 369
 370     @Override
 371     public String getTaxonomyScientificName() {
 372         return _os;
 373     }
 374
 375     @Override
 376     public boolean isEmpty() {
 377         return ( ForesterUtil.isEmpty( getAccession() ) && ForesterUtil.isEmpty( getSequenceName() )
 378                 && ForesterUtil.isEmpty( getTaxonomyScientificName() )
 379                 && ForesterUtil.isEmpty( getTaxonomyIdentifier() ) && ForesterUtil.isEmpty( getSequenceSymbol() ) );
 380     }
 381
 382     public void setProvider( final String provider ) {
 383         _provider = provider;
 384     }
 385
 386     private void addAnnotation( final Annotation annotation ) {
 387         if ( _annotations == null ) {
 388             _annotations = new TreeSet<Annotation>();
 389         }
 390         _annotations.add( annotation );
 391     }
 392
 393     private void addCrossReference( final Accession accession ) {
 394         if ( _cross_references == null ) {
 395             _cross_references = new TreeSet<Accession>();
 396         }
 397         System.out.println( "XREF ADDED: " + accession );
 398         _cross_references.add( accession );
 399     }
 400
 401     private void setAccession( final String pa ) {
 402         if ( _pa == null ) {
 403             _pa = pa;
 404         }
 405     }
 406
 407     private void setChromosome( final String chromosome ) {
 408         _chromosome = chromosome;
 409     }
 410
 411     private void setGeneName( final String gene_name ) {
 412         if ( _gene_name == null ) {
 413             _gene_name = gene_name;
 414         }
 415     }
 416
 417     private void setMap( final String map ) {
 418         _map = map;
 419     }
 420
 421     private void setSequenceName( final String rec_name ) {
 422         if ( _de == null ) {
 423             _de = rec_name;
 424         }
 425     }
 426
 427     private void setSequenceSymbol( final String symbol ) {
 428         _symbol = symbol;
 429     }
 430
 431     private void setTaxId( final String tax_id ) {
 432         if ( _tax_id == null ) {
 433             _tax_id = tax_id;
 434         }
 435     }
 436
 437     private void setTaxonomyScientificName( final String os ) {
 438         if ( _os == null ) {
 439             _os = os;
 440         }
 441     }
 442
 443     //    public static SequenceDatabaseEntry createInstanceFromPlainText( final List<String> lines ) {
 444     //        final EbiDbEntry e = new EbiDbEntry();
 445     //        for( final String line : lines ) {
 446     //            if ( line.startsWith( "PA" ) ) {
 447     //                e.setPA( SequenceDbWsTools.extractFrom( line, "PA" ) );
 448     //            }
 449     //            else if ( line.startsWith( "DE" ) ) {
 450     //                e.setDe( SequenceDbWsTools.extractFrom( line, "DE" ) );
 451     //            }
 452     //            else if ( line.startsWith( "OS" ) ) {
 453     //                if ( line.indexOf( "(" ) > 0 ) {
 454     //                    e.setOs( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
 455     //                }
 456     //                else {
 457     //                    e.setOs( SequenceDbWsTools.extractFrom( line, "OS" ) );
 458     //                }
 459     //            }
 460     //            else if ( line.startsWith( "OX" ) ) {
 461     //                if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) {
 462     //                    e.setTaxId( SequenceDbWsTools.extractFromTo( line, "NCBI_TaxID=", ";" ) );
 463     //                }
 464     //            }
 465     //        }
 466     //        return e;
 467     //    }
 468     public static SequenceDatabaseEntry createInstanceFromPlainTextForRefSeq( final List<String> lines ) {
 469         final Pattern X_PATTERN = Pattern.compile( "^[A-Z]+" );
 470         final Pattern chromosome_PATTERN = Pattern.compile( "\\s+/chromosome=\"(\\w+)\"" );
 471         final Pattern map_PATTERN = Pattern.compile( "\\s+/map=\"([\\w+\\.])\"" );
 472         final Pattern gene_PATTERN = Pattern.compile( "\\s+/gene=\"(.+)\"" );
 473         final Pattern mim_PATTERN = Pattern.compile( "\\s+/db_xref=\"MIM:(\\d+)\"" );
 474         final Pattern taxon_PATTERN = Pattern.compile( "\\s+/db_xref=\"taxon:(\\d+)\"" );
 475         final Pattern interpro_PATTERN = Pattern.compile( "\\s+/db_xref=\"InterPro:([A-Z0-9]+)\"" );
 476         final Pattern uniprot_PATTERN = Pattern.compile( "\\s+/db_xref=\"UniProtKB/[A-Za-z-]*:(\\w+)\"" );
 477         final Pattern hgnc_PATTERN = Pattern.compile( "\\s+/db_xref=\"[A-Z:]*HGNC:(\\d+)\"" );
 478         final Pattern geneid_PATTERN = Pattern.compile( "\\s+/db_xref=\"GeneID:(\\d+)\"" );
 479         final Pattern pdb_PATTERN = Pattern.compile( "\\s+/db_xref=\"PDB:([A-Z0-9]+)\"" );
 480         final Pattern ec_PATTERN = Pattern.compile( "\\s+/EC_number=\"([\\.\\-\\d]+)\"" );
 481         final Pattern product_PATTERN = Pattern.compile( "\\s+/product=\"(\\w{1,10})\"" );
 482         final EbiDbEntry e = new EbiDbEntry();
 483         final StringBuilder def = new StringBuilder();
 484         boolean in_definition = false;
 485         boolean in_features = false;
 486         boolean in_source = false;
 487         boolean in_gene = false;
 488         boolean in_cds = false;
 489         boolean in_mrna = false;
 490         boolean in_protein = false;
 491         for( final String line : lines ) {
 492             if ( line.startsWith( "ACCESSION " ) ) {
 493                 e.setAccession( SequenceDbWsTools.extractFrom( line, "ACCESSION" ) );
 494                 in_definition = false;
 495             }
 496             else if ( line.startsWith( "ID " ) ) {
 497                 e.setAccession( SequenceDbWsTools.extractFromTo( line, "ID", ";" ) );
 498                 in_definition = false;
 499             }
 500             else if ( line.startsWith( "DEFINITION " ) || ( line.startsWith( "DE " ) ) ) {
 501                 boolean definiton = false;
 502                 if ( line.startsWith( "DEFINITION " ) ) {
 503                     definiton = true;
 504                 }
 505                 if ( line.indexOf( "[" ) > 0 ) {
 506                     if ( definiton ) {
 507                         x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) ) );
 508                     }
 509                     else {
 510                         x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "[" ) ) );
 511                     }
 512                 }
 513                 else if ( line.indexOf( "." ) > 0 ) {
 514                     if ( definiton ) {
 515                         x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) ) );
 516                     }
 517                     else {
 518                         x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "." ) ) );
 519                     }
 520                 }
 521                 else {
 522                     if ( definiton ) {
 523                         x( def, ( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) ) );
 524                     }
 525                     else {
 526                         x( def, ( SequenceDbWsTools.extractFrom( line, "DE" ) ) );
 527                     }
 528                 }
 529                 if ( definiton ) {
 530                     in_definition = true;
 531                 }
 532             }
 533             else if ( line.startsWith( "  ORGANISM " ) ) {
 534                 if ( line.indexOf( "(" ) > 0 ) {
 535                     e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, "  ORGANISM", "(" ) );
 536                 }
 537                 else {
 538                     e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, "  ORGANISM" ) );
 539                 }
 540                 //  in_def = false;
 541             }
 542             else if ( line.startsWith( "OS " ) ) {
 543                 if ( line.indexOf( "(" ) > 0 ) {
 544                     e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
 545                 }
 546                 else {
 547                     e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, "OS" ) );
 548                 }
 549             }
 550             else if ( line.startsWith( " " ) && in_definition ) {
 551                 def.append( " " );
 552                 if ( line.indexOf( "[" ) > 0 ) {
 553                     def.append( SequenceDbWsTools.extractTo( line, "[" ) );
 554                 }
 555                 else if ( line.indexOf( "." ) > 0 ) {
 556                     def.append( SequenceDbWsTools.extractTo( line, "." ) );
 557                 }
 558                 else {
 559                     def.append( line.trim() );
 560                 }
 561             }
 562             else {
 563                 in_definition = false;
 564             }
 565             if ( !line.startsWith( "FT " ) && X_PATTERN.matcher( line ).find() ) {
 566                 in_features = false;
 567                 in_source = false;
 568                 in_gene = false;
 569                 in_cds = false;
 570                 in_mrna = false;
 571                 in_protein = false;
 572                 // in_def = false;
 573             }
 574             if ( line.startsWith( "FEATURES " ) || line.startsWith( "FT " ) ) {
 575                 in_features = true;
 576             }
 577             if ( in_features && ( line.startsWith( "     source " ) || line.startsWith( "FT   source " ) ) ) {
 578                 in_source = true;
 579                 in_gene = false;
 580                 in_cds = false;
 581                 in_mrna = false;
 582                 in_protein = false;
 583             }
 584             if ( in_features && ( line.startsWith( "     gene " ) || line.startsWith( "FT   gene " ) ) ) {
 585                 in_source = false;
 586                 in_gene = true;
 587                 in_cds = false;
 588                 in_mrna = false;
 589                 in_protein = false;
 590             }
 591             if ( in_features && ( line.startsWith( "     CDS " ) || line.startsWith( "FT   CDS " ) ) ) {
 592                 in_source = false;
 593                 in_gene = false;
 594                 in_cds = true;
 595                 in_mrna = false;
 596                 in_protein = false;
 597             }
 598             if ( in_features && ( line.startsWith( "     Protein " ) || line.startsWith( "FT   Protein " ) ) ) {
 599                 in_source = false;
 600                 in_gene = false;
 601                 in_cds = false;
 602                 in_mrna = false;
 603                 in_protein = true;
 604             }
 605             if ( in_features && ( line.startsWith( "     mRNA " ) || line.startsWith( "FT   mRNA " ) ) ) {
 606                 in_source = false;
 607                 in_gene = false;
 608                 in_cds = false;
 609                 in_mrna = true;
 610                 in_protein = false;
 611             }
 612             if ( in_source ) {
 613                 final Matcher ti = taxon_PATTERN.matcher( line );
 614                 if ( ti.find() ) {
 615                     e.setTaxId( ti.group( 1 ) );
 616                 }
 617                 final Matcher chr = chromosome_PATTERN.matcher( line );
 618                 if ( chr.find() ) {
 619                     e.setChromosome( chr.group( 1 ) );
 620                 }
 621                 final Matcher map = map_PATTERN.matcher( line );
 622                 if ( map.find() ) {
 623                     e.setMap( map.group( 1 ) );
 624                 }
 625             }
 626             if ( in_cds || in_gene ) {
 627                 final Matcher hgnc = hgnc_PATTERN.matcher( line );
 628                 if ( hgnc.find() ) {
 629                     e.addCrossReference( new Accession( hgnc.group( 1 ), "hgnc" ) );
 630                 }
 631                 final Matcher geneid = geneid_PATTERN.matcher( line );
 632                 if ( geneid.find() ) {
 633                     e.addCrossReference( new Accession( geneid.group( 1 ), "geneid" ) );
 634                 }
 635             }
 636             if ( in_protein || in_cds || in_gene || in_mrna ) {
 637                 final Matcher ec = ec_PATTERN.matcher( line );
 638                 if ( ec.find() ) {
 639                     e.addAnnotation( new Annotation( "EC", ec.group( 1 ) ) );
 640                 }
 641                 final Matcher gene = gene_PATTERN.matcher( line );
 642                 if ( gene.find() ) {
 643                     e.setGeneName( gene.group( 1 ) );
 644                 }
 645                 final Matcher uniprot = uniprot_PATTERN.matcher( line );
 646                 if ( uniprot.find() ) {
 647                     e.addCrossReference( new Accession( uniprot.group( 1 ), "uniprot" ) );
 648                 }
 649                 final Matcher interpro = interpro_PATTERN.matcher( line );
 650                 if ( interpro.find() ) {
 651                     e.addCrossReference( new Accession( interpro.group( 1 ), "interpro" ) );
 652                 }
 653                 final Matcher mim = mim_PATTERN.matcher( line );
 654                 if ( mim.find() ) {
 655                     e.addCrossReference( new Accession( mim.group( 1 ), "mim" ) );
 656                 }
 657                 final Matcher product = product_PATTERN.matcher( line );
 658                 if ( product.find() ) {
 659                     e.setSequenceSymbol( product.group( 1 ) );
 660                 }
 661                 final Matcher pdb = pdb_PATTERN.matcher( line );
 662                 if ( pdb.find() ) {
 663                     e.addCrossReference( new Accession( pdb.group( 1 ), "pdb" ) );
 664                 }
 665             }
 666         }
 667         if ( def.length() > 0 ) {
 668             e.setSequenceName( def.toString().trim() );
 669         }
 670         return e;
 671     }
 672
 673     private static void x( final StringBuilder sb, final String s ) {
 674         if ( sb.length() > 0 ) {
 675             sb.append( " " );
 676         }
 677         sb.append( s.trim() );
 678     }
 679
 680     @Override
 681     public MolecularSequence getMolecularSequence() {
 682         // TODO Auto-generated method stub
 683         return null;
 684     }
 685 }