forester/java/src/org/forester/ws/seqdb/EbiDbEntry.java

   1 // $Id:
   2 // forester -- software libraries and applications
   3 // for genomics and evolutionary biology research.
   4 //
   5 // Copyright (C) 2010 Christian M Zmasek
   6 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
   7 // All rights reserved
   8 //
   9 // This library is free software; you can redistribute it and/or
  10 // modify it under the terms of the GNU Lesser General Public
  11 // License as published by the Free Software Foundation; either
  12 // version 2.1 of the License, or (at your option) any later version.
  13 //
  14 // This library is distributed in the hope that it will be useful,
  15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17 // Lesser General Public License for more details.
  18 //
  19 // You should have received a copy of the GNU Lesser General Public
  20 // License along with this library; if not, write to the Free Software
  21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
  22 //
  23 // Contact: phylosoft @ gmail . com
  24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
  25
  26 package org.forester.ws.seqdb;
  27
  28 import java.util.List;
  29 import java.util.SortedSet;
  30 import java.util.TreeSet;
  31 import java.util.regex.Matcher;
  32 import java.util.regex.Pattern;
  33
  34 import org.forester.go.GoTerm;
  35 import org.forester.phylogeny.data.Accession;
  36 import org.forester.phylogeny.data.Annotation;
  37 import org.forester.util.ForesterUtil;
  38
  39 public final class EbiDbEntry implements SequenceDatabaseEntry {
  40
  41     private SortedSet<Annotation> _annotations;
  42     private String                _chromosome;
  43     private SortedSet<Accession>  _cross_references;
  44     private String                _de;
  45     private String                _gene_name;
  46     private String                _map;
  47     private String                _os;
  48     // FIXME actually this is NCBI entry
  49     //http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/emb/AAR37336/
  50     private String                _pa;
  51     private String                _provider;
  52     private String                _symbol;
  53     private String                _tax_id;
  54
  55     // TODO  PUBMED   15798186
  56     //TODO  (FEATURES)
  57     // source /db_xref="taxon:9606"
  58     // gene            1..2881
  59     // /gene="RBM39"
  60     //
  61     // /db_xref="MIM:604739"
  62     // CDS
  63     // /gene="RBM39"
  64     // /db_xref="MIM:604739"
  65     // /db_xref="InterPro:IPR002475"
  66     // /product="Bcl-2"
  67     // /db_xref="UniProtKB/TrEMBL:Q5J7V1" <- reparse?
  68     //
  69     // Protein
  70     /*
  71     LOCUS       NM_184234               2881 bp    mRNA    linear   PRI 16-JUN-2013
  72     DEFINITION  Homo sapiens RNA binding motif protein 39 (RBM39), transcript
  73             variant 1, mRNA.
  74     ACCESSION   NM_184234
  75     VERSION     NM_184234.2  GI:336176061
  76     KEYWORDS    RefSeq.
  77     SOURCE      Homo sapiens (human)
  78     ORGANISM  Homo sapiens
  79             Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
  80             Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
  81             Catarrhini; Hominidae; Homo.
  82     REFERENCE   1  (bases 1 to 2881)
  83     AUTHORS   Sillars-Hardebol,A.H., Carvalho,B., Belien,J.A., de Wit,M.,
  84             Delis-van Diemen,P.M., Tijssen,M., van de Wiel,M.A., Ponten,F.,
  85             Meijer,G.A. and Fijneman,R.J.
  86     TITLE     CSE1L, DIDO1 and RBM39 in colorectal adenoma to carcinoma
  87             progression
  88     JOURNAL   Cell Oncol (Dordr) 35 (4), 293-300 (2012)
  89     PUBMED   22711543
  90     REMARK    GeneRIF: Data show that CSE1L, DIDO1 and RBM39 mRNA expression
  91             levels correlated with chromosome 20q DNA copy number status.
  92     REFERENCE   2  (bases 1 to 2881)
  93     AUTHORS   Huang,G., Zhou,Z., Wang,H. and Kleinerman,E.S.
  94     TITLE     CAPER-alpha alternative splicing regulates the expression of
  95             vascular endothelial growth factor(1)(6)(5) in Ewing sarcoma cells
  96     JOURNAL   Cancer 118 (8), 2106-2116 (2012)
  97     PUBMED   22009261
  98     REMARK    GeneRIF: Increased VEGF(165) expression is secondary to the
  99             down-regulation of CAPER-alpha by EWS/FLI-1. CAPER-alpha mediates
 100             alternative splicing and controls the shift from VEGF(189) to
 101             VEGF(165) .
 102     REFERENCE   3  (bases 1 to 2881)
 103     AUTHORS   Han,B., Stockwin,L.H., Hancock,C., Yu,S.X., Hollingshead,M.G. and
 104             Newton,D.L.
 105     TITLE     Proteomic analysis of nuclei isolated from cancer cell lines
 106             treated with indenoisoquinoline NSC 724998, a novel topoisomerase I
 107             inhibitor
 108     JOURNAL   J. Proteome Res. 9 (8), 4016-4027 (2010)
 109     PUBMED   20515076
 110     REMARK    Erratum:[J Proteome Res. 2011 Apr 1;10(4):2128]
 111     REFERENCE   4  (bases 1 to 2881)
 112     AUTHORS   Zhang,J.Y., Looi,K.S. and Tan,E.M.
 113     TITLE     Identification of tumor-associated antigens as diagnostic and
 114             predictive biomarkers in cancer
 115     JOURNAL   Methods Mol. Biol. 520, 1-10 (2009)
 116     PUBMED   19381943
 117     REFERENCE   5  (bases 1 to 2881)
 118     AUTHORS   Dutta,J., Fan,G. and Gelinas,C.
 119     TITLE     CAPERalpha is a novel Rel-TAD-interacting factor that inhibits
 120             lymphocyte transformation by the potent Rel/NF-kappaB oncoprotein
 121             v-Rel
 122     JOURNAL   J. Virol. 82 (21), 10792-10802 (2008)
 123     PUBMED   18753212
 124     REMARK    GeneRIF: this study identifies CAPERalpha (RNA binding motif
 125             protein 39) as a new transcriptional coregulator for v-Rel and
 126             reveals an important role in modulating Rel's oncogenic activity.
 127     REFERENCE   6  (bases 1 to 2881)
 128     AUTHORS   Cazalla,D., Newton,K. and Caceres,J.F.
 129     TITLE     A novel SR-related protein is required for the second step of
 130             Pre-mRNA splicing
 131     JOURNAL   Mol. Cell. Biol. 25 (8), 2969-2980 (2005)
 132     PUBMED   15798186
 133     REFERENCE   7  (bases 1 to 2881)
 134     AUTHORS   Dowhan,D.H., Hong,E.P., Auboeuf,D., Dennis,A.P., Wilson,M.M.,
 135             Berget,S.M. and O'Malley,B.W.
 136     TITLE     Steroid hormone receptor coactivation and alternative RNA splicing
 137             by U2AF65-related proteins CAPERalpha and CAPERbeta
 138     JOURNAL   Mol. Cell 17 (3), 429-439 (2005)
 139     PUBMED   15694343
 140     REFERENCE   8  (bases 1 to 2881)
 141     AUTHORS   Sun,N.N., Fastje,C.D., Wong,S.S., Sheppard,P.R., Macdonald,S.J.,
 142             Ridenour,G., Hyde,J.D. and Witten,M.L.
 143     TITLE     Dose-dependent transcriptome changes by metal ores on a human acute
 144             lymphoblastic leukemia cell line
 145     JOURNAL   Toxicol Ind Health 19 (7-10), 157-163 (2003)
 146     PUBMED   15747776
 147     REMARK    GeneRIF: 10 genes were down-regulated following treatment of the
 148             T-ALL cells with 0.15 and 1.5 microg/mL of metal ores at 72 h
 149     REFERENCE   9  (bases 1 to 2881)
 150     AUTHORS   Jung,D.J., Na,S.Y., Na,D.S. and Lee,J.W.
 151     TITLE     Molecular cloning and characterization of CAPER, a novel
 152             coactivator of activating protein-1 and estrogen receptors
 153     JOURNAL   J. Biol. Chem. 277 (2), 1229-1234 (2002)
 154     PUBMED   11704680
 155     REMARK    GeneRIF: This paper describes the mouse gene.
 156     REFERENCE   10 (bases 1 to 2881)
 157     AUTHORS   Imai,H., Chan,E.K., Kiyosawa,K., Fu,X.D. and Tan,E.M.
 158     TITLE     Novel nuclear autoantigen with splicing factor motifs identified
 159             with antibody from hepatocellular carcinoma
 160     JOURNAL   J. Clin. Invest. 92 (5), 2419-2426 (1993)
 161     PUBMED   8227358
 162     COMMENT     REVIEWED REFSEQ: This record has been curated by NCBI staff. The
 163             reference sequence was derived from DC346351.1, BC141835.1 and
 164             C75555.1.
 165             On Jun 16, 2011 this sequence version replaced gi:35493810.
 166
 167             Summary: This gene encodes a member of the U2AF65 family of
 168             proteins. The encoded protein is found in the nucleus, where it
 169             co-localizes with core spliceosomal proteins. It has been shown to
 170             play a role in both steroid hormone receptor-mediated transcription
 171             and alternative splicing, and it is also a transcriptional
 172             coregulator of the viral oncoprotein v-Rel. Multiple transcript
 173             variants have been observed for this gene. A related pseudogene has
 174             been identified on chromosome X. [provided by RefSeq, Aug 2011].
 175
 176             Transcript Variant: This variant (1) encodes the longest isoform
 177             (a, also called CC1.4).
 178
 179             Publication Note:  This RefSeq record includes a subset of the
 180             publications that are available for this gene. Please see the Gene
 181             record to access additional publications.
 182
 183             ##Evidence-Data-START##
 184             Transcript exon combination :: BC141835.1, L10911.1 [ECO:0000332]
 185             RNAseq introns              :: mixed/partial sample support
 186                                            ERS025081, ERS025082 [ECO:0000350]
 187             ##Evidence-Data-END##
 188             COMPLETENESS: complete on the 3' end.
 189     PRIMARY     REFSEQ_SPAN         PRIMARY_IDENTIFIER PRIMARY_SPAN        COMP
 190             1-578               DC346351.1         3-580
 191             579-2872            BC141835.1         429-2722
 192             2873-2881           C75555.1           1-9                 c
 193     FEATURES             Location/Qualifiers
 194      source          1..2881
 195                      /organism="Homo sapiens"
 196                      /mol_type="mRNA"
 197                      /db_xref="taxon:9606"
 198                      /chromosome="20"
 199                      /map="20q11.22"
 200      gene            1..2881
 201                      /gene="RBM39"
 202                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 203                      /note="RNA binding motif protein 39"
 204                      /db_xref="GeneID:9584"
 205                      /db_xref="HGNC:15923"
 206                      /db_xref="HPRD:09201"
 207                      /db_xref="MIM:604739"
 208      exon            1..396
 209                      /gene="RBM39"
 210                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 211                      /inference="alignment:Splign:1.39.8"
 212      STS             35..262
 213                      /gene="RBM39"
 214                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 215                      /standard_name="REN58946"
 216                      /db_xref="UniSTS:383746"
 217      misc_feature    221..223
 218                      /gene="RBM39"
 219                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 220                      /note="upstream in-frame stop codon"
 221      STS             299..453
 222                      /gene="RBM39"
 223                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 224                      /standard_name="G64285"
 225                      /db_xref="UniSTS:158667"
 226      exon            397..460
 227                      /gene="RBM39"
 228                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 229                      /inference="alignment:Splign:1.39.8"
 230      CDS             410..2002
 231                      /gene="RBM39"
 232                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 233                      /note="isoform a is encoded by transcript variant 1;
 234                      coactivator of activating protein-1 and estrogen
 235                      receptors; functional spliceosome-associated protein 59;
 236                      RNA-binding region (RNP1, RRM) containing 2;
 237                      hepatocellular carcinoma protein 1; splicing factor HCC1"
 238                      /codon_start=1
 239                      /product="RNA-binding protein 39 isoform a"
 240                      /protein_id="NP_909122.1"
 241                      /db_xref="GI:35493811"
 242                      /db_xref="CCDS:CCDS13266.1"
 243                      /db_xref="GeneID:9584"
 244                      /db_xref="HGNC:15923"
 245                      /db_xref="HPRD:09201"
 246                      /db_xref="MIM:604739"
 247                      /translation="MADDIDIEAMLEAPYKKDENKLSSANGHEERSKKRKKSKSRSRS
 248                      HERKRSKSKERKRSRDRERKKSKSRERKRSRSKERRRSRSRSRDRRFRGRYRSPYSGP
 249                      KFNSAIRGKIGLPHSIKLSRRRSRSKSPFRKDKSPVREPIDNLTPEERDARTVFCMQL
 250                      AARIRPRDLEEFFSTVGKVRDVRMISDRNSRRSKGIAYVEFVDVSSVPLAIGLTGQRV
 251                      LGVPIIVQASQAEKNRAAAMANNLQKGSAGPMRLYVGSLHFNITEDMLRGIFEPFGRI
 252                      ESIQLMMDSETGRSKGYGFITFSDSECAKKALEQLNGFELAGRPMKVGHVTERTDASS
 253                      ASSFLDSDELERTGIDLGTTGRLQLMARLAEGTGLQIPPAAQQALQMSGSLAFGAVAE
 254                      FSFVIDLQTRLSQQTEASALAAAASVQPLATQCFQLSNMFNPQTEEEVGWDTEIKDDV
 255                      IEECNKHGGVIHIYVDKNSAQGNVYVKCPSIAAAIAAVNALHGRWFAGKMITAAYVPL
 256                      PTYHNLFPDSMTATQLLVPSRR"
 257      misc_feature    413..415
 258                      /gene="RBM39"
 259                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 260                      /experiment="experimental evidence, no additional details
 261                      recorded"
 262                      /note="N-acetylalanine; propagated from
 263                      UniProtKB/Swiss-Prot (Q14498.2); acetylation site"
 264
 265      exon            461..510
 266                      /gene="RBM39"
 267                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 268                      /inference="alignment:Splign:1.39.8"
 269
 270      exon            1902..2874
 271                      /gene="RBM39"
 272                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 273                      /inference="alignment:Splign:1.39.8"
 274      STS             1956..2182
 275                      /gene="RBM39"
 276                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 277                      /standard_name="REN58786"
 278                      /db_xref="UniSTS:383586"
 279      STS             2104..2148
 280                      /gene="RBM39"
 281                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 282                      /standard_name="D19S1033"
 283                      /db_xref="UniSTS:154759"
 284      STS             2145..2400
 285                      /gene="RBM39"
 286                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 287                      /standard_name="REN58785"
 288                      /db_xref="UniSTS:383585"
 289
 290      polyA_signal    2851..2856
 291                      /gene="RBM39"
 292                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 293      polyA_site      2874
 294                      /gene="RBM39"
 295                      /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
 296     ORIGIN
 297         1 atttggagct tggggcagct tctcgcgaga gcccgtgctg agggctctgt gaggccccgt
 298        61 gtgtttgtgt gtgtgtatgt gtgctggtga atgtgagtac agggaagcag cggccgccat
 299       121 ttcagggagc ttgtcgacgc tgtcgcaggg gtggatcctg agctgccgaa gccgccgtcc
 300       181 tgctctcccg cgtgggcttc tctaattcca ttgttttttt tagattctct cgggcctagc
 301       241 cgtccttgga acccgatatt cgggctgggc ggttccgcgg cctgggccta ggggcttaac
 302
 303
 304
 305     */
 306     private EbiDbEntry() {
 307     }
 308
 309     @Override
 310     public Object clone() throws CloneNotSupportedException {
 311         throw new CloneNotSupportedException();
 312     }
 313
 314     @Override
 315     public String getAccession() {
 316         return _pa;
 317     }
 318
 319     @Override
 320     public SortedSet<Annotation> getAnnotations() {
 321         return _annotations;
 322     }
 323
 324     @Override
 325     public String getChromosome() {
 326         return _chromosome;
 327     }
 328
 329     @Override
 330     public SortedSet<Accession> getCrossReferences() {
 331         return _cross_references;
 332     }
 333
 334     @Override
 335     public String getGeneName() {
 336         return _gene_name;
 337     }
 338
 339     @Override
 340     public SortedSet<GoTerm> getGoTerms() {
 341         return null;
 342     }
 343
 344     @Override
 345     public String getMap() {
 346         return _map;
 347     }
 348
 349     @Override
 350     public String getProvider() {
 351         return _provider;
 352     }
 353
 354     @Override
 355     public String getSequenceName() {
 356         return _de;
 357     }
 358
 359     @Override
 360     public String getSequenceSymbol() {
 361         return _symbol;
 362     }
 363
 364     @Override
 365     public String getTaxonomyIdentifier() {
 366         return _tax_id;
 367     }
 368
 369     @Override
 370     public String getTaxonomyScientificName() {
 371         return _os;
 372     }
 373
 374     @Override
 375     public boolean isEmpty() {
 376         return ( ForesterUtil.isEmpty( getAccession() ) && ForesterUtil.isEmpty( getSequenceName() )
 377                 && ForesterUtil.isEmpty( getTaxonomyScientificName() )
 378                 && ForesterUtil.isEmpty( getTaxonomyIdentifier() ) && ForesterUtil.isEmpty( getSequenceSymbol() ) );
 379     }
 380
 381     public void setProvider( final String provider ) {
 382         _provider = provider;
 383     }
 384
 385     private void addAnnotation( final Annotation annotation ) {
 386         if ( _annotations == null ) {
 387             _annotations = new TreeSet<Annotation>();
 388         }
 389         _annotations.add( annotation );
 390     }
 391
 392     private void addCrossReference( final Accession accession ) {
 393         if ( _cross_references == null ) {
 394             _cross_references = new TreeSet<Accession>();
 395         }
 396         System.out.println( "XREF ADDED: " + accession );
 397         _cross_references.add( accession );
 398     }
 399
 400     private void setAccession( final String pa ) {
 401         if ( _pa == null ) {
 402             _pa = pa;
 403         }
 404     }
 405
 406     private void setChromosome( final String chromosome ) {
 407         _chromosome = chromosome;
 408     }
 409
 410     private void setGeneName( final String gene_name ) {
 411         if ( _gene_name == null ) {
 412             _gene_name = gene_name;
 413         }
 414     }
 415
 416     private void setMap( final String map ) {
 417         _map = map;
 418     }
 419
 420     private void setSequenceName( final String rec_name ) {
 421         if ( _de == null ) {
 422             _de = rec_name;
 423         }
 424     }
 425
 426     private void setSequenceSymbol( final String symbol ) {
 427         _symbol = symbol;
 428     }
 429
 430     private void setTaxId( final String tax_id ) {
 431         if ( _tax_id == null ) {
 432             _tax_id = tax_id;
 433         }
 434     }
 435
 436     private void setTaxonomyScientificName( final String os ) {
 437         if ( _os == null ) {
 438             _os = os;
 439         }
 440     }
 441
 442     //    public static SequenceDatabaseEntry createInstanceFromPlainText( final List<String> lines ) {
 443     //        final EbiDbEntry e = new EbiDbEntry();
 444     //        for( final String line : lines ) {
 445     //            if ( line.startsWith( "PA" ) ) {
 446     //                e.setPA( SequenceDbWsTools.extractFrom( line, "PA" ) );
 447     //            }
 448     //            else if ( line.startsWith( "DE" ) ) {
 449     //                e.setDe( SequenceDbWsTools.extractFrom( line, "DE" ) );
 450     //            }
 451     //            else if ( line.startsWith( "OS" ) ) {
 452     //                if ( line.indexOf( "(" ) > 0 ) {
 453     //                    e.setOs( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
 454     //                }
 455     //                else {
 456     //                    e.setOs( SequenceDbWsTools.extractFrom( line, "OS" ) );
 457     //                }
 458     //            }
 459     //            else if ( line.startsWith( "OX" ) ) {
 460     //                if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) {
 461     //                    e.setTaxId( SequenceDbWsTools.extractFromTo( line, "NCBI_TaxID=", ";" ) );
 462     //                }
 463     //            }
 464     //        }
 465     //        return e;
 466     //    }
 467     public static SequenceDatabaseEntry createInstanceFromPlainTextForRefSeq( final List<String> lines ) {
 468         final Pattern X_PATTERN = Pattern.compile( "^[A-Z]+" );
 469         final Pattern chromosome_PATTERN = Pattern.compile( "\\s+/chromosome=\"(\\w+)\"" );
 470         final Pattern map_PATTERN = Pattern.compile( "\\s+/map=\"([\\w+\\.])\"" );
 471         final Pattern gene_PATTERN = Pattern.compile( "\\s+/gene=\"(.+)\"" );
 472         final Pattern mim_PATTERN = Pattern.compile( "\\s+/db_xref=\"MIM:(\\d+)\"" );
 473         final Pattern taxon_PATTERN = Pattern.compile( "\\s+/db_xref=\"taxon:(\\d+)\"" );
 474         final Pattern interpro_PATTERN = Pattern.compile( "\\s+/db_xref=\"InterPro:([A-Z0-9]+)\"" );
 475         final Pattern uniprot_PATTERN = Pattern.compile( "\\s+/db_xref=\"UniProtKB/[A-Za-z-]*:(\\w+)\"" );
 476         final Pattern hgnc_PATTERN = Pattern.compile( "\\s+/db_xref=\"[A-Z:]*HGNC:(\\d+)\"" );
 477         final Pattern geneid_PATTERN = Pattern.compile( "\\s+/db_xref=\"GeneID:(\\d+)\"" );
 478         final Pattern pdb_PATTERN = Pattern.compile( "\\s+/db_xref=\"PDB:([A-Z0-9]+)\"" );
 479         final Pattern ec_PATTERN = Pattern.compile( "\\s+/EC_number=\"([\\.\\-\\d]+)\"" );
 480         final Pattern product_PATTERN = Pattern.compile( "\\s+/product=\"(\\w{1,10})\"" );
 481         final EbiDbEntry e = new EbiDbEntry();
 482         final StringBuilder def = new StringBuilder();
 483         boolean in_definition = false;
 484         boolean in_features = false;
 485         boolean in_source = false;
 486         boolean in_gene = false;
 487         boolean in_cds = false;
 488         boolean in_mrna = false;
 489         boolean in_protein = false;
 490         for( final String line : lines ) {
 491             if ( line.startsWith( "ACCESSION " ) ) {
 492                 e.setAccession( SequenceDbWsTools.extractFrom( line, "ACCESSION" ) );
 493                 in_definition = false;
 494             }
 495             else if ( line.startsWith( "ID " ) ) {
 496                 e.setAccession( SequenceDbWsTools.extractFromTo( line, "ID", ";" ) );
 497                 in_definition = false;
 498             }
 499             else if ( line.startsWith( "DEFINITION " ) || ( line.startsWith( "DE " ) ) ) {
 500                 boolean definiton = false;
 501                 if ( line.startsWith( "DEFINITION " ) ) {
 502                     definiton = true;
 503                 }
 504                 if ( line.indexOf( "[" ) > 0 ) {
 505                     if ( definiton ) {
 506                         x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) ) );
 507                     }
 508                     else {
 509                         x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "[" ) ) );
 510                     }
 511                 }
 512                 else if ( line.indexOf( "." ) > 0 ) {
 513                     if ( definiton ) {
 514                         x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) ) );
 515                     }
 516                     else {
 517                         x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "." ) ) );
 518                     }
 519                 }
 520                 else {
 521                     if ( definiton ) {
 522                         x( def, ( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) ) );
 523                     }
 524                     else {
 525                         x( def, ( SequenceDbWsTools.extractFrom( line, "DE" ) ) );
 526                     }
 527                 }
 528                 if ( definiton ) {
 529                     in_definition = true;
 530                 }
 531             }
 532             else if ( line.startsWith( "  ORGANISM " ) ) {
 533                 if ( line.indexOf( "(" ) > 0 ) {
 534                     e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, "  ORGANISM", "(" ) );
 535                 }
 536                 else {
 537                     e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, "  ORGANISM" ) );
 538                 }
 539                 //  in_def = false;
 540             }
 541             else if ( line.startsWith( "OS " ) ) {
 542                 if ( line.indexOf( "(" ) > 0 ) {
 543                     e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
 544                 }
 545                 else {
 546                     e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, "OS" ) );
 547                 }
 548             }
 549             else if ( line.startsWith( " " ) && in_definition ) {
 550                 def.append( " " );
 551                 if ( line.indexOf( "[" ) > 0 ) {
 552                     def.append( SequenceDbWsTools.extractTo( line, "[" ) );
 553                 }
 554                 else if ( line.indexOf( "." ) > 0 ) {
 555                     def.append( SequenceDbWsTools.extractTo( line, "." ) );
 556                 }
 557                 else {
 558                     def.append( line.trim() );
 559                 }
 560             }
 561             else {
 562                 in_definition = false;
 563             }
 564             if ( !line.startsWith( "FT " ) && X_PATTERN.matcher( line ).find() ) {
 565                 in_features = false;
 566                 in_source = false;
 567                 in_gene = false;
 568                 in_cds = false;
 569                 in_mrna = false;
 570                 in_protein = false;
 571                 // in_def = false;
 572             }
 573             if ( line.startsWith( "FEATURES " ) || line.startsWith( "FT " ) ) {
 574                 in_features = true;
 575             }
 576             if ( in_features && ( line.startsWith( "     source " ) || line.startsWith( "FT   source " ) ) ) {
 577                 in_source = true;
 578                 in_gene = false;
 579                 in_cds = false;
 580                 in_mrna = false;
 581                 in_protein = false;
 582             }
 583             if ( in_features && ( line.startsWith( "     gene " ) || line.startsWith( "FT   gene " ) ) ) {
 584                 in_source = false;
 585                 in_gene = true;
 586                 in_cds = false;
 587                 in_mrna = false;
 588                 in_protein = false;
 589             }
 590             if ( in_features && ( line.startsWith( "     CDS " ) || line.startsWith( "FT   CDS " ) ) ) {
 591                 in_source = false;
 592                 in_gene = false;
 593                 in_cds = true;
 594                 in_mrna = false;
 595                 in_protein = false;
 596             }
 597             if ( in_features && ( line.startsWith( "     Protein " ) || line.startsWith( "FT   Protein " ) ) ) {
 598                 in_source = false;
 599                 in_gene = false;
 600                 in_cds = false;
 601                 in_mrna = false;
 602                 in_protein = true;
 603             }
 604             if ( in_features && ( line.startsWith( "     mRNA " ) || line.startsWith( "FT   mRNA " ) ) ) {
 605                 in_source = false;
 606                 in_gene = false;
 607                 in_cds = false;
 608                 in_mrna = true;
 609                 in_protein = false;
 610             }
 611             if ( in_source ) {
 612                 final Matcher ti = taxon_PATTERN.matcher( line );
 613                 if ( ti.find() ) {
 614                     e.setTaxId( ti.group( 1 ) );
 615                 }
 616                 final Matcher chr = chromosome_PATTERN.matcher( line );
 617                 if ( chr.find() ) {
 618                     e.setChromosome( chr.group( 1 ) );
 619                 }
 620                 final Matcher map = map_PATTERN.matcher( line );
 621                 if ( map.find() ) {
 622                     e.setMap( map.group( 1 ) );
 623                 }
 624             }
 625             if ( in_cds || in_gene ) {
 626                 final Matcher hgnc = hgnc_PATTERN.matcher( line );
 627                 if ( hgnc.find() ) {
 628                     e.addCrossReference( new Accession( hgnc.group( 1 ), "hgnc" ) );
 629                 }
 630                 final Matcher geneid = geneid_PATTERN.matcher( line );
 631                 if ( geneid.find() ) {
 632                     e.addCrossReference( new Accession( geneid.group( 1 ), "geneid" ) );
 633                 }
 634             }
 635             if ( in_protein || in_cds || in_gene || in_mrna ) {
 636                 final Matcher ec = ec_PATTERN.matcher( line );
 637                 if ( ec.find() ) {
 638                     e.addAnnotation( new Annotation( "EC", ec.group( 1 ) ) );
 639                 }
 640                 final Matcher gene = gene_PATTERN.matcher( line );
 641                 if ( gene.find() ) {
 642                     e.setGeneName( gene.group( 1 ) );
 643                 }
 644                 final Matcher uniprot = uniprot_PATTERN.matcher( line );
 645                 if ( uniprot.find() ) {
 646                     e.addCrossReference( new Accession( uniprot.group( 1 ), "uniprot" ) );
 647                 }
 648                 final Matcher interpro = interpro_PATTERN.matcher( line );
 649                 if ( interpro.find() ) {
 650                     e.addCrossReference( new Accession( interpro.group( 1 ), "interpro" ) );
 651                 }
 652                 final Matcher mim = mim_PATTERN.matcher( line );
 653                 if ( mim.find() ) {
 654                     e.addCrossReference( new Accession( mim.group( 1 ), "mim" ) );
 655                 }
 656                 final Matcher product = product_PATTERN.matcher( line );
 657                 if ( product.find() ) {
 658                     e.setSequenceSymbol( product.group( 1 ) );
 659                 }
 660                 final Matcher pdb = pdb_PATTERN.matcher( line );
 661                 if ( pdb.find() ) {
 662                     e.addCrossReference( new Accession( pdb.group( 1 ), "pdb" ) );
 663                 }
 664             }
 665         }
 666         if ( def.length() > 0 ) {
 667             e.setSequenceName( def.toString().trim() );
 668         }
 669         return e;
 670     }
 671
 672     private static void x( final StringBuilder sb, final String s ) {
 673         if ( sb.length() > 0 ) {
 674             sb.append( " " );
 675         }
 676         sb.append( s.trim() );
 677     }
 678 }