2 // forester -- software libraries and applications
3 // for genomics and evolutionary biology research.
5 // Copyright (C) 2010 Christian M Zmasek
6 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26 package org.forester.ws.seqdb;
28 import java.util.List;
29 import java.util.SortedSet;
30 import java.util.TreeSet;
31 import java.util.regex.Matcher;
32 import java.util.regex.Pattern;
34 import org.forester.go.GoTerm;
35 import org.forester.phylogeny.data.Accession;
36 import org.forester.phylogeny.data.Annotation;
37 import org.forester.sequence.MolecularSequence;
38 import org.forester.util.ForesterUtil;
40 public final class EbiDbEntry implements SequenceDatabaseEntry {
// Debug switch, off by default.
42 private final static boolean DEBUG = false;
// Matches a run of one or more upper-case ASCII letters at the very start of the input.
44 private final static Pattern LETTERS_PATTERN = Pattern.compile( "^[A-Z]+" );
// Qualifier patterns. Each one requires leading whitespace followed by a
// qualifier of the form /name="value" and captures (part of) the quoted
// value in group 1.
// /chromosome="...": value made of word characters.
45 private final static Pattern chromosome_PATTERN = Pattern.compile( "\\s+/chromosome=\"(\\w+)\"" );
// /map="...": value made of word characters and dots.
46 private final static Pattern map_PATTERN = Pattern.compile( "\\s+/map=\"([\\w\\.]+)\"" );
// /gene="...": greedy (.+), so the capture runs to the last double quote on the line.
47 private final static Pattern gene_PATTERN = Pattern.compile( "\\s+/gene=\"(.+)\"" );
// /db_xref="MIM:<digits>"
48 private final static Pattern mim_PATTERN = Pattern.compile( "\\s+/db_xref=\"MIM:(\\d+)\"" );
// /db_xref="taxon:<digits>"
49 private final static Pattern taxon_PATTERN = Pattern.compile( "\\s+/db_xref=\"taxon:(\\d+)\"" );
// /db_xref="InterPro:<upper-case letters and digits>"
50 private final static Pattern interpro_PATTERN = Pattern.compile( "\\s+/db_xref=\"InterPro:([A-Z0-9]+)\"" );
// /db_xref="UniProtKB/<letters or hyphens>:<word characters>"; only the part after the colon is captured.
51 private final static Pattern uniprot_PATTERN = Pattern.compile( "\\s+/db_xref=\"UniProtKB/[A-Za-z-]*:(\\w+)\"" );
// /db_xref="HGNC:<digits>", tolerating an optional prefix of upper-case letters and colons before "HGNC:".
52 private final static Pattern hgnc_PATTERN = Pattern.compile( "\\s+/db_xref=\"[A-Z:]*HGNC:(\\d+)\"" );
// /db_xref="GeneID:<digits>"
53 private final static Pattern geneid_PATTERN = Pattern.compile( "\\s+/db_xref=\"GeneID:(\\d+)\"" );
// /db_xref="PDB:<upper-case letters and digits>"
54 private final static Pattern pdb_PATTERN = Pattern.compile( "\\s+/db_xref=\"PDB:([A-Z0-9]+)\"" );
// /EC_number="...": value made of digits, dots and hyphens.
55 private final static Pattern ec_PATTERN = Pattern.compile( "\\s+/EC_number=\"([\\.\\-\\d]+)\"" );
// /product="...": matches only when the whole value is 1 to 10 word characters,
// so longer or multi-word product names do not match.
56 private final static Pattern product_PATTERN = Pattern.compile( "\\s+/product=\"(\\w{1,10})\"" );
// Annotations of this entry; the set is created on the first call to addAnnotation.
58 private SortedSet<Annotation> _annotations;
// Chromosome name, assigned by setChromosome.
59 private String _chromosome;
// Cross-references of this entry; the set is created on the first call to addCrossReference.
60 private SortedSet<Accession> _cross_references;
// Gene name; setGeneName assigns it only while it is still null.
62 private String _gene_name;
66 private String _provider;
67 private String _symbol;
// Taxonomy identifier; setTaxId checks that it is still null before acting.
68 private String _tax_id;
70 private EbiDbEntry() {
// Cloning is not supported: this method always throws CloneNotSupportedException.
74 public Object clone() throws CloneNotSupportedException {
75 throw new CloneNotSupportedException();
79 public String getAccession() {
84 public SortedSet<Annotation> getAnnotations() {
89 public String getChromosome() {
// Returns the internal cross-reference set itself (no copy is made).
// NOTE(review): the set is only seen being created inside addCrossReference,
// so this presumably returns null when no cross-reference was added -- confirm.
94 public SortedSet<Accession> getCrossReferences() {
95 return _cross_references;
99 public String getGeneName() {
104 public SortedSet<GoTerm> getGoTerms() {
109 public String getMap() {
114 public String getProvider() {
119 public String getSequenceName() {
124 public String getSequenceSymbol() {
129 public String getTaxonomyIdentifier() {
134 public String getTaxonomyScientificName() {
// Reports whether this entry carries no identifying data: true only when the
// accession, sequence name, taxonomy scientific name, taxonomy identifier and
// sequence symbol are all empty according to ForesterUtil.isEmpty.
139 public boolean isEmpty() {
140 return ( ForesterUtil.isEmpty( getAccession() ) && ForesterUtil.isEmpty( getSequenceName() )
141 && ForesterUtil.isEmpty( getTaxonomyScientificName() )
142 && ForesterUtil.isEmpty( getTaxonomyIdentifier() ) && ForesterUtil.isEmpty( getSequenceSymbol() ) );
147 public MolecularSequence getMolecularSequence() {
148 // TODO Auto-generated method stub
// Adds an annotation to this entry, creating the backing TreeSet on first use.
151 private void addAnnotation( final Annotation annotation ) {
152 if ( _annotations == null ) {
153 _annotations = new TreeSet<Annotation>();
155 _annotations.add( annotation );
// Adds a cross-reference to this entry, creating the backing TreeSet on first use.
158 private void addCrossReference( final Accession accession ) {
159 if ( _cross_references == null ) {
160 _cross_references = new TreeSet<Accession>();
// Trace output written to standard out.
// NOTE(review): the line guarding this statement is not visible in this
// excerpt -- presumably it is conditional on DEBUG; confirm.
163 System.out.println( "XREF ADDED: " + accession );
165 _cross_references.add( accession );
168 private void setAccession( final String pa ) {
// Stores the chromosome name, replacing any previously stored value.
174 private void setChromosome( final String chromosome ) {
175 _chromosome = chromosome;
// Stores the gene name, but only if none has been stored yet: the first
// gene name supplied wins and later calls leave it unchanged.
178 private void setGeneName( final String gene_name ) {
179 if ( _gene_name == null ) {
180 _gene_name = gene_name;
184 private void setMap( final String map ) {
188 private void setSequenceName( final String rec_name ) {
194 private void setSequenceSymbol( final String symbol ) {
// Setter for the taxonomy identifier, guarded so that it only acts while
// _tax_id is still null.
// NOTE(review): the guarded statement is not visible in this excerpt --
// presumably it assigns tax_id to _tax_id; confirm.
198 private void setTaxId( final String tax_id ) {
199 if ( _tax_id == null ) {
204 private void setTaxonomyScientificName( final String os ) {
// Appends the trimmed form of s to sb.
// NOTE(review): what happens when sb already has content is on a line not
// visible in this excerpt -- presumably a separator is inserted first; confirm.
210 private static void append( final StringBuilder sb, final String s ) {
211 if ( sb.length() > 0 ) {
214 sb.append( s.trim() );
// Builds an EbiDbEntry from the text lines of one sequence-database record.
//
// Header lines are recognised by their prefix, in two spellings each:
// "ACCESSION"/"ID", "DEFINITION"/"DE", " ORGANISM"/"OS" and "FEATURES"/"FT".
// Feature lines are scanned with the class's regex patterns for taxon id,
// chromosome, map, gene name, EC number, product and database
// cross-references (HGNC, GeneID, UniProt, InterPro, MIM, PDB).
//
// lines: the record, one list element per line of text.
// NOTE(review): the return statement is not visible in this excerpt --
// presumably the populated entry e is returned; confirm.
217 public final static SequenceDatabaseEntry createInstance( final List<String> lines ) {
219 final EbiDbEntry e = new EbiDbEntry();
// Accumulates the (possibly multi-line) definition text; it becomes the sequence name.
220 final StringBuilder def = new StringBuilder();
// Parser state: whether the current line continues a definition, and which
// part of the feature section is being read.
221 boolean in_definition = false;
222 boolean in_features = false;
223 boolean in_source = false;
224 boolean in_gene = false;
225 boolean in_cds = false;
226 boolean in_mrna = false;
227 boolean in_protein = false;
228 for( final String line : lines ) {
// Accession, spelling 1: taken from an "ACCESSION" line.
229 if ( line.startsWith( "ACCESSION " ) ) {
230 e.setAccession( SequenceDbWsTools.extractFrom( line, "ACCESSION" ) );
231 in_definition = false;
// Accession, spelling 2: taken from an "ID" line, using ";" as the end marker.
233 else if ( line.startsWith( "ID " ) ) {
234 e.setAccession( SequenceDbWsTools.extractFromTo( line, "ID", ";" ) );
235 in_definition = false;
// First line of the definition. The text is cut at "[" if the line has one,
// otherwise at ".", otherwise the remainder of the line is used.
237 else if ( line.startsWith( "DEFINITION " ) || ( line.startsWith( "DE " ) ) ) {
238 boolean definiton = false;
239 if ( line.startsWith( "DEFINITION " ) ) {
242 if ( line.indexOf( "[" ) > 0 ) {
244 append( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) ) );
247 append( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "[" ) ) );
250 else if ( line.indexOf( "." ) > 0 ) {
252 append( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) ) );
255 append( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "." ) ) );
260 append( def, ( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) ) );
263 append( def, ( SequenceDbWsTools.extractFrom( line, "DE" ) ) );
// Following lines that start with a space are treated as continuations of the definition.
267 in_definition = true;
// Scientific name, spelling 1: an " ORGANISM" line; text from "(" onwards is dropped if present.
270 else if ( line.startsWith( " ORGANISM " ) ) {
271 if ( line.indexOf( "(" ) > 0 ) {
272 e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, " ORGANISM", "(" ) );
275 e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, " ORGANISM" ) );
// Scientific name, spelling 2: an "OS" line, handled the same way.
278 else if ( line.startsWith( "OS " ) ) {
279 if ( line.indexOf( "(" ) > 0 ) {
280 e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
283 e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, "OS" ) );
// Continuation line of the definition: same "[" / "." cut-off rules as on the first line.
286 else if ( line.startsWith( " " ) && in_definition ) {
288 if ( line.indexOf( "[" ) > 0 ) {
289 def.append( SequenceDbWsTools.extractTo( line, "[" ) );
291 else if ( line.indexOf( "." ) > 0 ) {
292 def.append( SequenceDbWsTools.extractTo( line, "." ) );
295 def.append( line.trim() );
299 in_definition = false;
// A line that begins with upper-case letters but is not an "FT " line.
// NOTE(review): the body of this branch is not visible in this excerpt --
// presumably it resets the feature-state flags; confirm.
301 if ( !line.startsWith( "FT " ) && LETTERS_PATTERN.matcher( line ).find() ) {
// Start of (or a line within) the feature section.
309 if ( line.startsWith( "FEATURES " ) || line.startsWith( "FT " ) ) {
// Feature-key lines (source, gene, CDS, Protein, mRNA), each in both spellings.
// NOTE(review): the bodies of these branches are not visible in this excerpt --
// presumably each one switches the in_* flags to the feature it names; confirm.
312 if ( in_features && ( line.startsWith( " source " ) || line.startsWith( "FT source " ) ) ) {
319 if ( in_features && ( line.startsWith( " gene " ) || line.startsWith( "FT gene " ) ) ) {
326 if ( in_features && ( line.startsWith( " CDS " ) || line.startsWith( "FT CDS " ) ) ) {
333 if ( in_features && ( line.startsWith( " Protein " ) || line.startsWith( "FT Protein " ) ) ) {
340 if ( in_features && ( line.startsWith( " mRNA " ) || line.startsWith( "FT mRNA " ) ) ) {
// Taxon id, chromosome and map qualifiers.
348 final Matcher ti = taxon_PATTERN.matcher( line );
350 e.setTaxId( ti.group( 1 ) );
352 final Matcher chr = chromosome_PATTERN.matcher( line );
354 e.setChromosome( chr.group( 1 ) );
356 final Matcher map = map_PATTERN.matcher( line );
358 e.setMap( map.group( 1 ) );
// HGNC and GeneID cross-references are looked for while inside a CDS or gene feature.
361 if ( in_cds || in_gene ) {
362 final Matcher hgnc = hgnc_PATTERN.matcher( line );
364 e.addCrossReference( new Accession( hgnc.group( 1 ), "hgnc" ) );
366 final Matcher geneid = geneid_PATTERN.matcher( line );
367 if ( geneid.find() ) {
368 e.addCrossReference( new Accession( geneid.group( 1 ), "geneid" ) );
// Qualifiers looked for while inside a Protein, CDS, gene or mRNA feature.
371 if ( in_protein || in_cds || in_gene || in_mrna ) {
// EC number, stored as an annotation with reference "EC".
372 final Matcher ec = ec_PATTERN.matcher( line );
374 e.addAnnotation( new Annotation( "EC", ec.group( 1 ) ) );
// Gene name; setGeneName keeps only the first one it is given.
376 final Matcher gene = gene_PATTERN.matcher( line );
378 e.setGeneName( gene.group( 1 ) );
// Further database cross-references, each tagged with its source name.
380 final Matcher uniprot = uniprot_PATTERN.matcher( line );
381 if ( uniprot.find() ) {
382 e.addCrossReference( new Accession( uniprot.group( 1 ), "uniprot" ) );
384 final Matcher interpro = interpro_PATTERN.matcher( line );
385 if ( interpro.find() ) {
386 e.addCrossReference( new Accession( interpro.group( 1 ), "interpro" ) );
388 final Matcher mim = mim_PATTERN.matcher( line );
390 e.addCrossReference( new Accession( mim.group( 1 ), "mim" ) );
// A matching /product value is used as the sequence symbol.
392 final Matcher product = product_PATTERN.matcher( line );
393 if ( product.find() ) {
394 e.setSequenceSymbol( product.group( 1 ) );
396 final Matcher pdb = pdb_PATTERN.matcher( line );
398 e.addCrossReference( new Accession( pdb.group( 1 ), "pdb" ) );
// The accumulated definition text, trimmed, becomes the sequence name.
402 if ( def.length() > 0 ) {
403 e.setSequenceName( def.toString().trim() );