+ /**
+  * Records the gene name of this entry. Once a non-null gene name has
+  * been stored, later calls leave it untouched.
+  *
+  * @param gene_name the gene name to store
+  */
+ private void setGeneName( final String gene_name ) {
+     if ( _gene_name != null ) {
+         // A gene name is already present; keep the first one seen.
+         return;
+     }
+     _gene_name = gene_name;
+ }
+
+ /**
+  * Stores the map value of this entry, replacing any previous value.
+  *
+  * @param map the map value to store
+  */
+ private void setMap( final String map ) {
+     this._map = map;
+ }
+
+ /**
+  * Records the sequence name (kept in the {@code _de} field). Once a
+  * non-null name has been stored, later calls leave it untouched.
+  *
+  * @param rec_name the sequence name to store
+  */
+ private void setSequenceName( final String rec_name ) {
+     if ( _de != null ) {
+         // A name is already present; keep the first one seen.
+         return;
+     }
+     _de = rec_name;
+ }
+
+ /**
+  * Stores the sequence symbol of this entry, replacing any previous value.
+  *
+  * @param symbol the sequence symbol to store
+  */
+ private void setSequenceSymbol( final String symbol ) {
+     this._symbol = symbol;
+ }
+
+ /**
+  * Records the taxonomy identifier of this entry. Once a non-null
+  * identifier has been stored, later calls leave it untouched.
+  *
+  * @param tax_id the taxonomy identifier to store
+  */
+ private void setTaxId( final String tax_id ) {
+     if ( _tax_id != null ) {
+         // An identifier is already present; keep the first one seen.
+         return;
+     }
+     _tax_id = tax_id;
+ }
+
+ /**
+  * Records the scientific name of the source organism (kept in the
+  * {@code _os} field). Once a non-null name has been stored, later calls
+  * leave it untouched.
+  *
+  * @param os the scientific name to store
+  */
+ private void setTaxonomyScientificName( final String os ) {
+     if ( _os != null ) {
+         // A scientific name is already present; keep the first one seen.
+         return;
+     }
+     _os = os;
+ }
+
+ private static void append( final StringBuilder sb, final String s ) {
+ if ( sb.length() > 0 ) {
+ sb.append( " " );
+ }
+ sb.append( s.trim() );
+ }
+
+ public final static SequenceDatabaseEntry createInstance( final List<String> lines ) {
+
+ final EbiDbEntry e = new EbiDbEntry();
+ final StringBuilder def = new StringBuilder();
+ boolean in_definition = false;
+ boolean in_features = false;
+ boolean in_source = false;
+ boolean in_gene = false;
+ boolean in_cds = false;
+ boolean in_mrna = false;
+ boolean in_protein = false;
+ for( final String line : lines ) {
+ if ( line.startsWith( "ACCESSION " ) ) {
+ e.setAccession( SequenceDbWsTools.extractFrom( line, "ACCESSION" ) );
+ in_definition = false;
+ }
+ else if ( line.startsWith( "ID " ) ) {
+ e.setAccession( SequenceDbWsTools.extractFromTo( line, "ID", ";" ) );
+ in_definition = false;
+ }
+ else if ( line.startsWith( "DEFINITION " ) || ( line.startsWith( "DE " ) ) ) {
+ boolean definiton = false;
+ if ( line.startsWith( "DEFINITION " ) ) {
+ definiton = true;
+ }
+ if ( line.indexOf( "[" ) > 0 ) {
+ if ( definiton ) {
+ append( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) ) );
+ }
+ else {
+ append( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "[" ) ) );
+ }
+ }
+ else if ( line.indexOf( "." ) > 0 ) {
+ if ( definiton ) {
+ append( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) ) );
+ }
+ else {
+ append( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "." ) ) );
+ }
+ }
+ else {
+ if ( definiton ) {
+ append( def, ( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) ) );
+ }
+ else {
+ append( def, ( SequenceDbWsTools.extractFrom( line, "DE" ) ) );
+ }
+ }
+ if ( definiton ) {
+ in_definition = true;
+ }
+ }
+ else if ( line.startsWith( " ORGANISM " ) ) {
+ if ( line.indexOf( "(" ) > 0 ) {
+ e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, " ORGANISM", "(" ) );
+ }
+ else {
+ e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, " ORGANISM" ) );
+ }
+ }
+ else if ( line.startsWith( "OS " ) ) {
+ if ( line.indexOf( "(" ) > 0 ) {
+ e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
+ }
+ else {
+ e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, "OS" ) );
+ }
+ }
+ else if ( line.startsWith( " " ) && in_definition ) {
+ def.append( " " );
+ if ( line.indexOf( "[" ) > 0 ) {
+ def.append( SequenceDbWsTools.extractTo( line, "[" ) );
+ }
+ else if ( line.indexOf( "." ) > 0 ) {
+ def.append( SequenceDbWsTools.extractTo( line, "." ) );
+ }
+ else {
+ def.append( line.trim() );
+ }
+ }
+ else {
+ in_definition = false;
+ }
+ if ( !line.startsWith( "FT " ) && LETTERS_PATTERN.matcher( line ).find() ) {
+ in_features = false;
+ in_source = false;
+ in_gene = false;
+ in_cds = false;
+ in_mrna = false;
+ in_protein = false;
+ }
+ if ( line.startsWith( "FEATURES " ) || line.startsWith( "FT " ) ) {
+ in_features = true;
+ }
+ if ( in_features && ( line.startsWith( " source " ) || line.startsWith( "FT source " ) ) ) {
+ in_source = true;
+ in_gene = false;
+ in_cds = false;
+ in_mrna = false;
+ in_protein = false;
+ }
+ if ( in_features && ( line.startsWith( " gene " ) || line.startsWith( "FT gene " ) ) ) {
+ in_source = false;
+ in_gene = true;
+ in_cds = false;
+ in_mrna = false;
+ in_protein = false;
+ }
+ if ( in_features && ( line.startsWith( " CDS " ) || line.startsWith( "FT CDS " ) ) ) {
+ in_source = false;
+ in_gene = false;
+ in_cds = true;
+ in_mrna = false;
+ in_protein = false;
+ }
+ if ( in_features && ( line.startsWith( " Protein " ) || line.startsWith( "FT Protein " ) ) ) {
+ in_source = false;
+ in_gene = false;
+ in_cds = false;
+ in_mrna = false;
+ in_protein = true;
+ }
+ if ( in_features && ( line.startsWith( " mRNA " ) || line.startsWith( "FT mRNA " ) ) ) {
+ in_source = false;
+ in_gene = false;
+ in_cds = false;
+ in_mrna = true;
+ in_protein = false;
+ }
+ if ( in_source ) {
+ final Matcher ti = taxon_PATTERN.matcher( line );
+ if ( ti.find() ) {
+ e.setTaxId( ti.group( 1 ) );
+ }
+ final Matcher chr = chromosome_PATTERN.matcher( line );
+ if ( chr.find() ) {
+ e.setChromosome( chr.group( 1 ) );
+ }
+ final Matcher map = map_PATTERN.matcher( line );
+ if ( map.find() ) {
+ e.setMap( map.group( 1 ) );
+ }
+ }
+ if ( in_cds || in_gene ) {
+ final Matcher hgnc = hgnc_PATTERN.matcher( line );
+ if ( hgnc.find() ) {
+ e.addCrossReference( new Accession( hgnc.group( 1 ), "hgnc" ) );
+ }
+ final Matcher geneid = geneid_PATTERN.matcher( line );
+ if ( geneid.find() ) {
+ e.addCrossReference( new Accession( geneid.group( 1 ), "geneid" ) );
+ }
+ }
+ if ( in_protein || in_cds || in_gene || in_mrna ) {
+ final Matcher ec = ec_PATTERN.matcher( line );
+ if ( ec.find() ) {
+ e.addAnnotation( new Annotation( "EC", ec.group( 1 ) ) );
+ }
+ final Matcher gene = gene_PATTERN.matcher( line );
+ if ( gene.find() ) {
+ e.setGeneName( gene.group( 1 ) );
+ }
+ final Matcher uniprot = uniprot_PATTERN.matcher( line );
+ if ( uniprot.find() ) {
+ e.addCrossReference( new Accession( uniprot.group( 1 ), "uniprot" ) );
+ }
+ final Matcher interpro = interpro_PATTERN.matcher( line );
+ if ( interpro.find() ) {
+ e.addCrossReference( new Accession( interpro.group( 1 ), "interpro" ) );
+ }
+ final Matcher mim = mim_PATTERN.matcher( line );
+ if ( mim.find() ) {
+ e.addCrossReference( new Accession( mim.group( 1 ), "mim" ) );
+ }
+ final Matcher product = product_PATTERN.matcher( line );
+ if ( product.find() ) {
+ e.setSequenceSymbol( product.group( 1 ) );
+ }
+ final Matcher pdb = pdb_PATTERN.matcher( line );
+ if ( pdb.find() ) {
+ e.addCrossReference( new Accession( pdb.group( 1 ), "pdb" ) );
+ }
+ }
+ }
+ if ( def.length() > 0 ) {
+ e.setSequenceName( def.toString().trim() );
+ }
+ return e;