forester/java/src/org/forester/ws/seqdb/EbiDbEntry.java

   1 // $Id:
   2 // forester -- software libraries and applications
   3 // for genomics and evolutionary biology research.
   4 //
   5 // Copyright (C) 2010 Christian M Zmasek
   6 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
   7 // All rights reserved
   8 //
   9 // This library is free software; you can redistribute it and/or
  10 // modify it under the terms of the GNU Lesser General Public
  11 // License as published by the Free Software Foundation; either
  12 // version 2.1 of the License, or (at your option) any later version.
  13 //
  14 // This library is distributed in the hope that it will be useful,
  15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17 // Lesser General Public License for more details.
  18 //
  19 // You should have received a copy of the GNU Lesser General Public
  20 // License along with this library; if not, write to the Free Software
  21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
  22 //
  23 // Contact: phylosoft @ gmail . com
  24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
  25
  26 package org.forester.ws.seqdb;
  27
  28 import java.util.List;
  29 import java.util.SortedSet;
  30 import java.util.TreeSet;
  31 import java.util.regex.Matcher;
  32 import java.util.regex.Pattern;
  33
  34 import org.forester.go.GoTerm;
  35 import org.forester.phylogeny.data.Accession;
  36 import org.forester.phylogeny.data.Annotation;
  37 import org.forester.sequence.MolecularSequence;
  38 import org.forester.util.ForesterUtil;
  39
  40 public final class EbiDbEntry implements SequenceDatabaseEntry {
  41
  42     private final static boolean  DEBUG = false;
  43     private SortedSet<Annotation> _annotations;
  44     private String                _chromosome;
  45     private SortedSet<Accession>  _cross_references;
  46     private String                _de;
  47     private String                _gene_name;
  48     private String                _map;
  49     private String                _os;
  50     // FIXME actually this is NCBI entry
  51     //http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/emb/AAR37336/
  52     private String                _pa;
  53     private String                _provider;
  54     private String                _symbol;
  55     private String                _tax_id;
  56
  57     // TODO  PUBMED   15798186
  58     //TODO  (FEATURES)
  59     // source /db_xref="taxon:9606"
  60     // gene            1..2881
  61     // /gene="RBM39"
  62     //
  63     // /db_xref="MIM:604739"
  64     // CDS
  65     // /gene="RBM39"
  66     // /db_xref="MIM:604739"
  67     // /db_xref="InterPro:IPR002475"
  68     // /product="Bcl-2"
  69     // /db_xref="UniProtKB/TrEMBL:Q5J7V1" <- reparse?
  70     //
  71     // Protein
  72     /*
  73     LOCUS       NM_184234               2881 bp    mRNA    linear   PRI 16-JUN-2013
  74     DEFINITION  Homo sapiens RNA binding motif protein 39 (RBM39), transcript
  75             variant 1, mRNA.
  76     ACCESSION   NM_184234
  77     VERSION     NM_184234.2  GI:336176061
  78     KEYWORDS    RefSeq.
  79     SOURCE      Homo sapiens (human)
  80     ORGANISM  Homo sapiens
  81             Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
  82             Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
  83             Catarrhini; Hominidae; Homo.
  84     REFERENCE   1  (bases 1 to 2881)
  85     AUTHORS   Sillars-Hardebol,A.H., Carvalho,B., Belien,J.A., de Wit,M.,
  86             Delis-van Diemen,P.M., Tijssen,M., van de Wiel,M.A., Ponten,F.,
  87             Meijer,G.A. and Fijneman,R.J.
  88     TITLE     CSE1L, DIDO1 and RBM39 in colorectal adenoma to carcinoma
  89             progression
  90     JOURNAL   Cell Oncol (Dordr) 35 (4), 293-300 (2012)
  91     PUBMED   22711543
  92     REMARK    GeneRIF: Data show that CSE1L, DIDO1 and RBM39 mRNA expression
  93             levels correlated with chromosome 20q DNA copy number status.
  94     REFERENCE   2  (bases 1 to 2881)
  95     AUTHORS   Huang,G., Zhou,Z., Wang,H. and Kleinerman,E.S.
  96     TITLE     CAPER-alpha alternative splicing regulates the expression of
  97             vascular endothelial growth factor(1)(6)(5) in Ewing sarcoma cells
  98     JOURNAL   Cancer 118 (8), 2106-2116 (2012)
  99     PUBMED   22009261
 100     REMARK    GeneRIF: Increased VEGF(165) expression is secondary to the
 101             down-regulation of CAPER-alpha by EWS/FLI-1. CAPER-alpha mediates
 102             alternative splicing and controls the shift from VEGF(189) to
 103             VEGF(165) .
 104     REFERENCE   3  (bases 1 to 2881)
 105     AUTHORS   Han,B., Stockwin,L.H., Hancock,C., Yu,S.X., Hollingshead,M.G. and
 106             Newton,D.L.
 107     TITLE     Proteomic analysis of nuclei isolated from cancer cell lines
 108             treated with indenoisoquinoline NSC 724998, a novel topoisomerase I
 109             inhibitor
 110     JOURNAL   J. Proteome Res. 9 (8), 4016-4027 (2010)
 111     PUBMED   20515076
 112     REMARK    Erratum:[J Proteome Res. 2011 Apr 1;10(4):2128]
 113     REFERENCE   4  (bases 1 to 2881)
 114     AUTHORS   Zhang,J.Y., Looi,K.S. and Tan,E.M.
 115     TITLE     Identification of tumor-associated antigens as diagnostic and
 116             predictive biomarkers in cancer
 117     JOURNAL   Methods Mol. Biol. 520, 1-10 (2009)
 118     PUBMED   19381943
 119     REFERENCE   5  (bases 1 to 2881)
 120     AUTHORS   Dutta,J., Fan,G. and Gelinas,C.
 121     TITLE     CAPERalpha is a novel Rel-TAD-interacting factor that inhibits
 122             lymphocyte transformation by the potent Rel/NF-kappaB oncoprotein
 123             v-Rel
 124     JOURNAL   J. Virol. 82 (21), 10792-10802 (2008)
 125     PUBMED   18753212
 126     REMARK    GeneRIF: this study identifies CAPERalpha (RNA binding motif
 127             protein 39) as a new transcriptional coregulator for v-Rel and
 128             reveals an important role in modulating Rel's oncogenic activity.
 129     REFERENCE   6  (bases 1 to 2881)
 130     AUTHORS   Cazalla,D., Newton,K. and Caceres,J.F.
 131     TITLE     A novel SR-related protein is required for the second step of
 132             Pre-mRNA splicing
 133     JOURNAL   Mol. Cell. Biol. 25 (8), 2969-2980 (2005)
 134     PUBMED   15798186
 135     REFERENCE   7  (bases 1 to 2881)
 136     AUTHORS   Dowhan,D.H., Hong,E.P., Auboeuf,D., Dennis,A.P., Wilson,M.M.,
 137             Berget,S.M. and O'Malley,B.W.
 138     TITLE     Steroid hormone receptor coactivation and alternative RNA splicing
 139             by U2AF65-related proteins CAPERalpha and CAPERbeta
 140     JOURNAL   Mol. Cell 17 (3), 429-439 (2005)
 141     PUBMED   15694343
 142     REFERENCE   8  (bases 1 to 2881)
 143     AUTHORS   Sun,N.N., Fastje,C.D., Wong,S.S., Sheppard,P.R., Macdonald,S.J.,
 144             Ridenour,G., Hyde,J.D. and Witten,M.L.
 145     TITLE     Dose-dependent transcriptome changes by metal ores on a human acute
 146             lymphoblastic leukemia cell line
 147     JOURNAL   Toxicol Ind Health 19 (7-10), 157-163 (2003)
 148     PUBMED   15747776
 149     REMARK    GeneRIF: 10 genes were down-regulated following treatment of the
 150             T-ALL cells with 0.15 and 1.5 microg/mL of metal ores at 72 h
 151     REFERENCE   9  (bases 1 to 2881)
 152     AUTHORS   Jung,D.J., Na,S.Y., Na,D.S. and Lee,J.W.
 153     TITLE     Molecular cloning and characterization of CAPER, a novel
 154             coactivator of activating protein-1 and estrogen receptors
 155     JOURNAL   J. Biol. Chem. 277 (2), 1229-1234 (2002)
 156     PUBMED   11704680
 157     REMARK    GeneRIF: This paper describes the mouse gene.
 158     REFERENCE   10 (bases 1 to 2881)
 159     AUTHORS   Imai,H., Chan,E.K., Kiyosawa,K., Fu,X.D. and Tan,E.M.
 160     TITLE     Novel nuclear autoantigen with splicing factor motifs identified
 161             with antibody from hepatocellular carcinoma
 162     JOURNAL   J. Clin. Invest. 92 (5), 2419-2426 (1993)
 163     PUBMED   8227358
 164     COMMENT     REVIEWED REFSEQ: This record has been curated by NCBI staff. The
 165             reference sequence was derived from DC346351.1, BC141835.1 and
 166             C75555.1.
 167             On Jun 16, 2011 this sequence version replaced gi:35493810.
 168
 169             Summary: This gene encodes a member of the U2AF65 family of
 170             proteins. The encoded protein is found in the nucleus, where it
 171             co-localizes with core spliceosomal proteins. It has been shown to
 172             play a role in both steroid hormone receptor-mediated transcription
 173             and alternative splicing, and it is also a transcriptional
 174             coregulator of the viral oncoprotein v-Rel. Multiple transcript
 175             variants have been observed for this gene. A related pseudogene has
 176             been identified on chromosome X. [provided by RefSeq, Aug 2011].
 177
 178             Transcript Variant: This variant (1) encodes the longest isoform
 179             (a, also called CC1.4).
 180
 181             Publication Note:  This RefSeq record includes a subset of the
 182             publications that are available for this gene. Please see the Gene
 183             record to access additional publications.
 184
 185             ##Evidence-Data-START##
 186             Transcript exon combination :: BC141835.1, L10911.1 [ECO:0000332]
 187             RNAseq introns              :: mixed/partial sample support
 188                                            ERS025081, ERS025082 [ECO:0000350]
 189             ##Evidence-Data-END##
 190             COMPLETENESS: complete on the 3' end.
 191     PRIMARY     REFSEQ_SPAN         PRIMARY_IDENTIFIER PRIMARY_SPAN        COMP
 192             1-578               DC346351.1         3-580
 193             579-2872            BC141835.1         429-2722
 194             2873-2881           C75555.1           1-9                 c
 195     FEATURES             Location/Qualifiers
 196      source          1..2881
 197                      /organism="Homo sapiens"
 198                      /mol_type="mRNA"
 199                      /db_xref="taxon:9606"
 200                      /chromosome="20"
 201                      /map="20q11.22"
 202      gene            1..2881
 203                      /gene="RBM39"
 204                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 205                      /note="RNA binding motif protein 39"
 206                      /db_xref="GeneID:9584"
 207                      /db_xref="HGNC:15923"
 208                      /db_xref="HPRD:09201"
 209                      /db_xref="MIM:604739"
 210      exon            1..396
 211                      /gene="RBM39"
 212                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 213                      /inference="alignment:Splign:1.39.8"
 214      STS             35..262
 215                      /gene="RBM39"
 216                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 217                      /standard_name="REN58946"
 218                      /db_xref="UniSTS:383746"
 219      misc_feature    221..223
 220                      /gene="RBM39"
 221                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 222                      /note="upstream in-frame stop codon"
 223      STS             299..453
 224                      /gene="RBM39"
 225                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 226                      /standard_name="G64285"
 227                      /db_xref="UniSTS:158667"
 228      exon            397..460
 229                      /gene="RBM39"
 230                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 231                      /inference="alignment:Splign:1.39.8"
 232      CDS             410..2002
 233                      /gene="RBM39"
 234                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 235                      /note="isoform a is encoded by transcript variant 1;
 236                      coactivator of activating protein-1 and estrogen
 237                      receptors; functional spliceosome-associated protein 59;
 238                      RNA-binding region (RNP1, RRM) containing 2;
 239                      hepatocellular carcinoma protein 1; splicing factor HCC1"
 240                      /codon_start=1
 241                      /product="RNA-binding protein 39 isoform a"
 242                      /protein_id="NP_909122.1"
 243                      /db_xref="GI:35493811"
 244                      /db_xref="CCDS:CCDS13266.1"
 245                      /db_xref="GeneID:9584"
 246                      /db_xref="HGNC:15923"
 247                      /db_xref="HPRD:09201"
 248                      /db_xref="MIM:604739"
 249                      /translation="MADDIDIEAMLEAPYKKDENKLSSANGHEERSKKRKKSKSRSRS
 250                      HERKRSKSKERKRSRDRERKKSKSRERKRSRSKERRRSRSRSRDRRFRGRYRSPYSGP
 251                      KFNSAIRGKIGLPHSIKLSRRRSRSKSPFRKDKSPVREPIDNLTPEERDARTVFCMQL
 252                      AARIRPRDLEEFFSTVGKVRDVRMISDRNSRRSKGIAYVEFVDVSSVPLAIGLTGQRV
 253                      LGVPIIVQASQAEKNRAAAMANNLQKGSAGPMRLYVGSLHFNITEDMLRGIFEPFGRI
 254                      ESIQLMMDSETGRSKGYGFITFSDSECAKKALEQLNGFELAGRPMKVGHVTERTDASS
 255                      ASSFLDSDELERTGIDLGTTGRLQLMARLAEGTGLQIPPAAQQALQMSGSLAFGAVAE
 256                      FSFVIDLQTRLSQQTEASALAAAASVQPLATQCFQLSNMFNPQTEEEVGWDTEIKDDV
 257                      IEECNKHGGVIHIYVDKNSAQGNVYVKCPSIAAAIAAVNALHGRWFAGKMITAAYVPL
 258                      PTYHNLFPDSMTATQLLVPSRR"
 259      misc_feature    413..415
 260                      /gene="RBM39"
 261                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 262                      /experiment="experimental evidence, no additional details
 263                      recorded"
 264                      /note="N-acetylalanine; propagated from
 265                      UniProtKB/Swiss-Prot (Q14498.2); acetylation site"
 266
 267      exon            461..510
 268                      /gene="RBM39"
 269                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 270                      /inference="alignment:Splign:1.39.8"
 271
 272      exon            1902..2874
 273                      /gene="RBM39"
 274                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 275                      /inference="alignment:Splign:1.39.8"
 276      STS             1956..2182
 277                      /gene="RBM39"
 278                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 279                      /standard_name="REN58786"
 280                      /db_xref="UniSTS:383586"
 281      STS             2104..2148
 282                      /gene="RBM39"
 283                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 284                      /standard_name="D19S1033"
 285                      /db_xref="UniSTS:154759"
 286      STS             2145..2400
 287                      /gene="RBM39"
 288                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 289                      /standard_name="REN58785"
 290                      /db_xref="UniSTS:383585"
 291
 292      polyA_signal    2851..2856
 293                      /gene="RBM39"
 294                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 295      polyA_site      2874
 296                      /gene="RBM39"
 297                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 298     ORIGIN
 299         1 atttggagct tggggcagct tctcgcgaga gcccgtgctg agggctctgt gaggccccgt
 300        61 gtgtttgtgt gtgtgtatgt gtgctggtga atgtgagtac agggaagcag cggccgccat
 301       121 ttcagggagc ttgtcgacgc tgtcgcaggg gtggatcctg agctgccgaa gccgccgtcc
 302       181 tgctctcccg cgtgggcttc tctaattcca ttgttttttt tagattctct cgggcctagc
 303       241 cgtccttgga acccgatatt cgggctgggc ggttccgcgg cctgggccta ggggcttaac
 304
 305
 306
 307      */
 308     private EbiDbEntry() {
 309     }
 310
 311     @Override
 312     public Object clone() throws CloneNotSupportedException {
 313         throw new CloneNotSupportedException();
 314     }
 315
 316     @Override
 317     public String getAccession() {
 318         return _pa;
 319     }
 320
 321     @Override
 322     public SortedSet<Annotation> getAnnotations() {
 323         return _annotations;
 324     }
 325
 326     @Override
 327     public String getChromosome() {
 328         return _chromosome;
 329     }
 330
 331     @Override
 332     public SortedSet<Accession> getCrossReferences() {
 333         return _cross_references;
 334     }
 335
 336     @Override
 337     public String getGeneName() {
 338         return _gene_name;
 339     }
 340
 341     @Override
 342     public SortedSet<GoTerm> getGoTerms() {
 343         return null;
 344     }
 345
 346     @Override
 347     public String getMap() {
 348         return _map;
 349     }
 350
 351     @Override
 352     public String getProvider() {
 353         return _provider;
 354     }
 355
 356     @Override
 357     public String getSequenceName() {
 358         return _de;
 359     }
 360
 361     @Override
 362     public String getSequenceSymbol() {
 363         return _symbol;
 364     }
 365
 366     @Override
 367     public String getTaxonomyIdentifier() {
 368         return _tax_id;
 369     }
 370
 371     @Override
 372     public String getTaxonomyScientificName() {
 373         return _os;
 374     }
 375
 376     @Override
 377     public boolean isEmpty() {
 378         return ( ForesterUtil.isEmpty( getAccession() ) && ForesterUtil.isEmpty( getSequenceName() )
 379                 && ForesterUtil.isEmpty( getTaxonomyScientificName() )
 380                 && ForesterUtil.isEmpty( getTaxonomyIdentifier() ) && ForesterUtil.isEmpty( getSequenceSymbol() ) );
 381     }
 382
 383     public void setProvider( final String provider ) {
 384         _provider = provider;
 385     }
 386
 387     private void addAnnotation( final Annotation annotation ) {
 388         if ( _annotations == null ) {
 389             _annotations = new TreeSet<Annotation>();
 390         }
 391         _annotations.add( annotation );
 392     }
 393
 394     private void addCrossReference( final Accession accession ) {
 395         if ( _cross_references == null ) {
 396             _cross_references = new TreeSet<Accession>();
 397         }
 398         if ( DEBUG ) {
 399             System.out.println( "XREF ADDED: " + accession );
 400         }
 401         _cross_references.add( accession );
 402     }
 403
 404     private void setAccession( final String pa ) {
 405         if ( _pa == null ) {
 406             _pa = pa;
 407         }
 408     }
 409
 410     private void setChromosome( final String chromosome ) {
 411         _chromosome = chromosome;
 412     }
 413
 414     private void setGeneName( final String gene_name ) {
 415         if ( _gene_name == null ) {
 416             _gene_name = gene_name;
 417         }
 418     }
 419
 420     private void setMap( final String map ) {
 421         _map = map;
 422     }
 423
 424     private void setSequenceName( final String rec_name ) {
 425         if ( _de == null ) {
 426             _de = rec_name;
 427         }
 428     }
 429
 430     private void setSequenceSymbol( final String symbol ) {
 431         _symbol = symbol;
 432     }
 433
 434     private void setTaxId( final String tax_id ) {
 435         if ( _tax_id == null ) {
 436             _tax_id = tax_id;
 437         }
 438     }
 439
 440     private void setTaxonomyScientificName( final String os ) {
 441         if ( _os == null ) {
 442             _os = os;
 443         }
 444     }
 445
 446     //    public static SequenceDatabaseEntry createInstanceFromPlainText( final List<String> lines ) {
 447     //        final EbiDbEntry e = new EbiDbEntry();
 448     //        for( final String line : lines ) {
 449     //            if ( line.startsWith( "PA" ) ) {
 450     //                e.setPA( SequenceDbWsTools.extractFrom( line, "PA" ) );
 451     //            }
 452     //            else if ( line.startsWith( "DE" ) ) {
 453     //                e.setDe( SequenceDbWsTools.extractFrom( line, "DE" ) );
 454     //            }
 455     //            else if ( line.startsWith( "OS" ) ) {
 456     //                if ( line.indexOf( "(" ) > 0 ) {
 457     //                    e.setOs( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
 458     //                }
 459     //                else {
 460     //                    e.setOs( SequenceDbWsTools.extractFrom( line, "OS" ) );
 461     //                }
 462     //            }
 463     //            else if ( line.startsWith( "OX" ) ) {
 464     //                if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) {
 465     //                    e.setTaxId( SequenceDbWsTools.extractFromTo( line, "NCBI_TaxID=", ";" ) );
 466     //                }
 467     //            }
 468     //        }
 469     //        return e;
 470     //    }
 471     public static SequenceDatabaseEntry createInstanceFromPlainTextForRefSeq( final List<String> lines ) {
 472         final Pattern X_PATTERN = Pattern.compile( "^[A-Z]+" );
 473         final Pattern chromosome_PATTERN = Pattern.compile( "\\s+/chromosome=\"(\\w+)\"" );
 474         final Pattern map_PATTERN = Pattern.compile( "\\s+/map=\"([\\w+\\.])\"" );
 475         final Pattern gene_PATTERN = Pattern.compile( "\\s+/gene=\"(.+)\"" );
 476         final Pattern mim_PATTERN = Pattern.compile( "\\s+/db_xref=\"MIM:(\\d+)\"" );
 477         final Pattern taxon_PATTERN = Pattern.compile( "\\s+/db_xref=\"taxon:(\\d+)\"" );
 478         final Pattern interpro_PATTERN = Pattern.compile( "\\s+/db_xref=\"InterPro:([A-Z0-9]+)\"" );
 479         final Pattern uniprot_PATTERN = Pattern.compile( "\\s+/db_xref=\"UniProtKB/[A-Za-z-]*:(\\w+)\"" );
 480         final Pattern hgnc_PATTERN = Pattern.compile( "\\s+/db_xref=\"[A-Z:]*HGNC:(\\d+)\"" );
 481         final Pattern geneid_PATTERN = Pattern.compile( "\\s+/db_xref=\"GeneID:(\\d+)\"" );
 482         final Pattern pdb_PATTERN = Pattern.compile( "\\s+/db_xref=\"PDB:([A-Z0-9]+)\"" );
 483         final Pattern ec_PATTERN = Pattern.compile( "\\s+/EC_number=\"([\\.\\-\\d]+)\"" );
 484         final Pattern product_PATTERN = Pattern.compile( "\\s+/product=\"(\\w{1,10})\"" );
 485         final EbiDbEntry e = new EbiDbEntry();
 486         final StringBuilder def = new StringBuilder();
 487         boolean in_definition = false;
 488         boolean in_features = false;
 489         boolean in_source = false;
 490         boolean in_gene = false;
 491         boolean in_cds = false;
 492         boolean in_mrna = false;
 493         boolean in_protein = false;
 494         for( final String line : lines ) {
 495             if ( line.startsWith( "ACCESSION " ) ) {
 496                 e.setAccession( SequenceDbWsTools.extractFrom( line, "ACCESSION" ) );
 497                 in_definition = false;
 498             }
 499             else if ( line.startsWith( "ID " ) ) {
 500                 e.setAccession( SequenceDbWsTools.extractFromTo( line, "ID", ";" ) );
 501                 in_definition = false;
 502             }
 503             else if ( line.startsWith( "DEFINITION " ) || ( line.startsWith( "DE " ) ) ) {
 504                 boolean definiton = false;
 505                 if ( line.startsWith( "DEFINITION " ) ) {
 506                     definiton = true;
 507                 }
 508                 if ( line.indexOf( "[" ) > 0 ) {
 509                     if ( definiton ) {
 510                         x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) ) );
 511                     }
 512                     else {
 513                         x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "[" ) ) );
 514                     }
 515                 }
 516                 else if ( line.indexOf( "." ) > 0 ) {
 517                     if ( definiton ) {
 518                         x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) ) );
 519                     }
 520                     else {
 521                         x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "." ) ) );
 522                     }
 523                 }
 524                 else {
 525                     if ( definiton ) {
 526                         x( def, ( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) ) );
 527                     }
 528                     else {
 529                         x( def, ( SequenceDbWsTools.extractFrom( line, "DE" ) ) );
 530                     }
 531                 }
 532                 if ( definiton ) {
 533                     in_definition = true;
 534                 }
 535             }
 536             else if ( line.startsWith( "  ORGANISM " ) ) {
 537                 if ( line.indexOf( "(" ) > 0 ) {
 538                     e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, "  ORGANISM", "(" ) );
 539                 }
 540                 else {
 541                     e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, "  ORGANISM" ) );
 542                 }
 543                 //  in_def = false;
 544             }
 545             else if ( line.startsWith( "OS " ) ) {
 546                 if ( line.indexOf( "(" ) > 0 ) {
 547                     e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
 548                 }
 549                 else {
 550                     e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, "OS" ) );
 551                 }
 552             }
 553             else if ( line.startsWith( " " ) && in_definition ) {
 554                 def.append( " " );
 555                 if ( line.indexOf( "[" ) > 0 ) {
 556                     def.append( SequenceDbWsTools.extractTo( line, "[" ) );
 557                 }
 558                 else if ( line.indexOf( "." ) > 0 ) {
 559                     def.append( SequenceDbWsTools.extractTo( line, "." ) );
 560                 }
 561                 else {
 562                     def.append( line.trim() );
 563                 }
 564             }
 565             else {
 566                 in_definition = false;
 567             }
 568             if ( !line.startsWith( "FT " ) && X_PATTERN.matcher( line ).find() ) {
 569                 in_features = false;
 570                 in_source = false;
 571                 in_gene = false;
 572                 in_cds = false;
 573                 in_mrna = false;
 574                 in_protein = false;
 575                 // in_def = false;
 576             }
 577             if ( line.startsWith( "FEATURES " ) || line.startsWith( "FT " ) ) {
 578                 in_features = true;
 579             }
 580             if ( in_features && ( line.startsWith( "     source " ) || line.startsWith( "FT   source " ) ) ) {
 581                 in_source = true;
 582                 in_gene = false;
 583                 in_cds = false;
 584                 in_mrna = false;
 585                 in_protein = false;
 586             }
 587             if ( in_features && ( line.startsWith( "     gene " ) || line.startsWith( "FT   gene " ) ) ) {
 588                 in_source = false;
 589                 in_gene = true;
 590                 in_cds = false;
 591                 in_mrna = false;
 592                 in_protein = false;
 593             }
 594             if ( in_features && ( line.startsWith( "     CDS " ) || line.startsWith( "FT   CDS " ) ) ) {
 595                 in_source = false;
 596                 in_gene = false;
 597                 in_cds = true;
 598                 in_mrna = false;
 599                 in_protein = false;
 600             }
 601             if ( in_features && ( line.startsWith( "     Protein " ) || line.startsWith( "FT   Protein " ) ) ) {
 602                 in_source = false;
 603                 in_gene = false;
 604                 in_cds = false;
 605                 in_mrna = false;
 606                 in_protein = true;
 607             }
 608             if ( in_features && ( line.startsWith( "     mRNA " ) || line.startsWith( "FT   mRNA " ) ) ) {
 609                 in_source = false;
 610                 in_gene = false;
 611                 in_cds = false;
 612                 in_mrna = true;
 613                 in_protein = false;
 614             }
 615             if ( in_source ) {
 616                 final Matcher ti = taxon_PATTERN.matcher( line );
 617                 if ( ti.find() ) {
 618                     e.setTaxId( ti.group( 1 ) );
 619                 }
 620                 final Matcher chr = chromosome_PATTERN.matcher( line );
 621                 if ( chr.find() ) {
 622                     e.setChromosome( chr.group( 1 ) );
 623                 }
 624                 final Matcher map = map_PATTERN.matcher( line );
 625                 if ( map.find() ) {
 626                     e.setMap( map.group( 1 ) );
 627                 }
 628             }
 629             if ( in_cds || in_gene ) {
 630                 final Matcher hgnc = hgnc_PATTERN.matcher( line );
 631                 if ( hgnc.find() ) {
 632                     e.addCrossReference( new Accession( hgnc.group( 1 ), "hgnc" ) );
 633                 }
 634                 final Matcher geneid = geneid_PATTERN.matcher( line );
 635                 if ( geneid.find() ) {
 636                     e.addCrossReference( new Accession( geneid.group( 1 ), "geneid" ) );
 637                 }
 638             }
 639             if ( in_protein || in_cds || in_gene || in_mrna ) {
 640                 final Matcher ec = ec_PATTERN.matcher( line );
 641                 if ( ec.find() ) {
 642                     e.addAnnotation( new Annotation( "EC", ec.group( 1 ) ) );
 643                 }
 644                 final Matcher gene = gene_PATTERN.matcher( line );
 645                 if ( gene.find() ) {
 646                     e.setGeneName( gene.group( 1 ) );
 647                 }
 648                 final Matcher uniprot = uniprot_PATTERN.matcher( line );
 649                 if ( uniprot.find() ) {
 650                     e.addCrossReference( new Accession( uniprot.group( 1 ), "uniprot" ) );
 651                 }
 652                 final Matcher interpro = interpro_PATTERN.matcher( line );
 653                 if ( interpro.find() ) {
 654                     e.addCrossReference( new Accession( interpro.group( 1 ), "interpro" ) );
 655                 }
 656                 final Matcher mim = mim_PATTERN.matcher( line );
 657                 if ( mim.find() ) {
 658                     e.addCrossReference( new Accession( mim.group( 1 ), "mim" ) );
 659                 }
 660                 final Matcher product = product_PATTERN.matcher( line );
 661                 if ( product.find() ) {
 662                     e.setSequenceSymbol( product.group( 1 ) );
 663                 }
 664                 final Matcher pdb = pdb_PATTERN.matcher( line );
 665                 if ( pdb.find() ) {
 666                     e.addCrossReference( new Accession( pdb.group( 1 ), "pdb" ) );
 667                 }
 668             }
 669         }
 670         if ( def.length() > 0 ) {
 671             e.setSequenceName( def.toString().trim() );
 672         }
 673         return e;
 674     }
 675
 676     private static void x( final StringBuilder sb, final String s ) {
 677         if ( sb.length() > 0 ) {
 678             sb.append( " " );
 679         }
 680         sb.append( s.trim() );
 681     }
 682
 683     @Override
 684     public MolecularSequence getMolecularSequence() {
 685         // TODO Auto-generated method stub
 686         return null;
 687     }
 688 }