From a1e88a27d4df0836cd947b7140b37bb7a0b16077 Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Tue, 8 Oct 2013 06:39:10 +0000 Subject: [PATCH] inprogress --- .../src/org/forester/phylogeny/data/Taxonomy.java | 10 ++- .../java/src/org/forester/ws/seqdb/EbiDbEntry.java | 72 +++++++------------- 2 files changed, 30 insertions(+), 52 deletions(-) diff --git a/forester/java/src/org/forester/phylogeny/data/Taxonomy.java b/forester/java/src/org/forester/phylogeny/data/Taxonomy.java index 48984f0..b354921 100644 --- a/forester/java/src/org/forester/phylogeny/data/Taxonomy.java +++ b/forester/java/src/org/forester/phylogeny/data/Taxonomy.java @@ -326,12 +326,10 @@ public class Taxonomy implements PhylogenyData, MultipleUris, Comparable lines ) { - final Pattern X_PATTERN = Pattern.compile( "^[A-Z]+" ); - final Pattern chromosome_PATTERN = Pattern.compile( "\\s+/chromosome=\"(\\w+)\"" ); - final Pattern map_PATTERN = Pattern.compile( "\\s+/map=\"([\\w+\\.])\"" ); - final Pattern gene_PATTERN = Pattern.compile( "\\s+/gene=\"(.+)\"" ); - final Pattern mim_xref_PATTERN = Pattern.compile( "\\s+/db_xref=\"MIM:(\\d+)\"" ); - final Pattern taxon_xref_PATTERN = Pattern.compile( "\\s+/db_xref=\"taxon:(\\d+)\"" ); - - final Pattern interpro_PATTERN = Pattern.compile( "\\s+/db_xref=\"InterPro:(IP\\d+)\"" ); - final Pattern uniprot_PATTERN = Pattern.compile( "\\s+/db_xref=\"UniProtKB/TrEMBL:(\\w+)\"" ); - - + final Pattern X_PATTERN = Pattern.compile( "^[A-Z]+" ); + final Pattern chromosome_PATTERN = Pattern.compile( "\\s+/chromosome=\"(\\w+)\"" ); + final Pattern map_PATTERN = Pattern.compile( "\\s+/map=\"([\\w+\\.])\"" ); + final Pattern gene_PATTERN = Pattern.compile( "\\s+/gene=\"(.+)\"" ); + final Pattern mim_xref_PATTERN = Pattern.compile( "\\s+/db_xref=\"MIM:(\\d+)\"" ); + final Pattern taxon_xref_PATTERN = Pattern.compile( "\\s+/db_xref=\"taxon:(\\d+)\"" ); + final Pattern interpro_PATTERN = Pattern.compile( "\\s+/db_xref=\"InterPro:(IP\\d+)\"" ); + final Pattern uniprot_PATTERN = Pattern.compile( "\\s+/db_xref=\"UniProtKB/TrEMBL:(\\w+)\"" ); final EbiDbEntry e = new EbiDbEntry(); final StringBuilder def = new StringBuilder(); boolean in_def = false; boolean in_features = false; boolean in_source = false; boolean in_gene = false; - boolean in_cds = false; - boolean in_protein = false; + boolean in_cds = false; + boolean in_protein = false; for( final String line : lines ) { - if ( line.startsWith( "ACCESSION " ) ) { e.setPA( SequenceDbWsTools.extractFrom( line, "ACCESSION" ) ); in_def = false; @@ -106,7 +102,7 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { else { e.setOs( SequenceDbWsTools.extractFrom( line, " ORGANISM" ) ); } - // in_def = false; + // in_def = false; } else if ( line.startsWith( " " ) && in_def ) { def.append( " " ); @@ -123,51 +119,40 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { else { in_def = false; } - - - if ( X_PATTERN.matcher( line ).find() ) { + if ( X_PATTERN.matcher( line ).find() ) { in_features = false; in_source = false; in_gene = false; in_cds = false; - in_protein = false; - // in_def = false; + in_protein = false; + // in_def = false; } - - if ( line.startsWith( "FEATURES " ) ) { in_features = true; - } - if ( in_features && line.startsWith( " source " ) ) { in_source = true; in_gene = false; - in_cds = false; - in_protein = false; + in_protein = false; } if ( in_features && line.startsWith( " gene " ) ) { - in_source = false; in_gene = true; - in_cds = false; - in_protein = false; + in_protein = false; } if ( in_features && line.startsWith( " CDS " ) ) { in_source = false; in_gene = false; - in_cds = true; - in_protein = false; + in_protein = false; } if ( in_features && line.startsWith( " Protein " ) ) { in_source = false; in_gene = false; - in_cds = false; - in_protein = true; + in_protein = true; } } if ( def.length() > 0 ) { @@ -177,17 +162,15 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { } // FIXME actually this is NCBI entry //http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/emb/AAR37336/ - private String _pa; - private String _de; - private String _os; - private String _tax_id; - - - private String _symbol; - private String _provider; - + private String _pa; + private String _de; + private String _os; + private String _tax_id; + private String _symbol; + private String _provider; private ArrayList _cross_references; private String _gene_name; + // TODO PUBMED 15798186 //TODO (FEATURES) // source /db_xref="taxon:9606" @@ -200,7 +183,6 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { // /db_xref="MIM:604739" // /db_xref="InterPro:IPR002475" // /product="Bcl-2" - // /db_xref="UniProtKB/TrEMBL:Q5J7V1" <- reparse? // // Protein @@ -496,8 +478,6 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { return _tax_id; } - - @Override public String getTaxonomyScientificName() { return _os; -- 1.7.10.2