From: cmzmasek@gmail.com Date: Tue, 19 Nov 2013 21:08:37 +0000 (+0000) Subject: inprogress X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=0307ef1e2fbfc7be8c562c8e5684d8434934591a;p=jalview.git inprogress --- diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index 2583a90..ae43b31 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -127,7 +127,7 @@ import org.forester.ws.wabi.TxSearch.TAX_RANK; @SuppressWarnings( "unused") public final class Test { - private final static boolean PERFORM_DB_TESTS = false; + private final static boolean PERFORM_DB_TESTS = true; private final static double ZERO_DIFF = 1.0E-9; private final static String PATH_TO_TEST_DATA = System.getProperty( "user.dir" ) + ForesterUtil.getFileSeparator() + "test_data" @@ -490,18 +490,6 @@ public final class Test { System.out.println( "failed." ); failed++; } - if ( PERFORM_DB_TESTS ) { - System.out.print( "Ebi Entry Retrieval: " ); - if ( Test.testEbiEntryRetrieval() ) { - System.out.println( "OK." ); - succeeded++; - } - else { - System.out.println( "failed." ); - failed++; - } - } - /////////////////////System.exit( 0 ); System.out.print( "UniProtKB id extraction: " ); if ( Test.testExtractUniProtKbProteinSeqIdentifier() ) { System.out.println( "OK." ); @@ -521,6 +509,18 @@ public final class Test { failed++; } if ( PERFORM_DB_TESTS ) { + System.out.print( "Ebi Entry Retrieval: " ); + if ( Test.testEbiEntryRetrieval() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + } + // System.exit( 0 ); + if ( PERFORM_DB_TESTS ) { System.out.print( "Sequence DB tools 2: " ); if ( testSequenceDbWsTools2() ) { System.out.println( "OK." ); @@ -532,6 +532,7 @@ public final class Test { System.exit( -1 ); } } + // System.exit( 0 ); System.out.print( "Hmmscan output parser: " ); if ( testHmmscanOutputParser() ) { System.out.println( "OK." ); @@ -11297,6 +11298,20 @@ public final class Test { System.out.println( acc.toString() ); return false; } + n.setName( "gi|71845847|1,4-alpha-glucan branching enzyme [Dechloromonas aromatica RCB]" ); + acc = SequenceDbWsTools.obtainSeqAccession( n ); + if ( ( acc == null ) || !acc.getSource().equals( Source.GI.toString() ) + || !acc.getValue().equals( "71845847" ) ) { + System.out.println( acc.toString() ); + return false; + } + n.setName( "gi|71845847|gb|AAZ45343.1| 1,4-alpha-glucan branching enzyme [Dechloromonas aromatica RCB]" ); + acc = SequenceDbWsTools.obtainSeqAccession( n ); + if ( ( acc == null ) || !acc.getSource().equals( Source.NCBI.toString() ) + || !acc.getValue().equals( "AAZ45343.1" ) ) { + System.out.println( acc.toString() ); + return false; + } } catch ( final Exception e ) { return false; @@ -11322,7 +11337,6 @@ public final class Test { } final PhylogenyNode n2 = new PhylogenyNode( "NM_001030253" ); SequenceDbWsTools.obtainSeqInformation( n2 ); - System.out.println( n2.toString() ); if ( !n2.getNodeData().getSequence().getName() .equals( "Danio rerio B-cell leukemia/lymphoma 2 (bcl2), mRNA" ) ) { return false; @@ -11338,7 +11352,6 @@ public final class Test { } final PhylogenyNode n3 = new PhylogenyNode( "NM_184234.2" ); SequenceDbWsTools.obtainSeqInformation( n3 ); - System.out.println( "n=" + n3.toString() ); if ( !n3.getNodeData().getSequence().getName() .equals( "Homo sapiens RNA binding motif protein 39 (RBM39), transcript variant 1, mRNA" ) ) { return false; @@ -11503,14 +11516,14 @@ public final class Test { System.out.println( entry4.getGeneName() ); return false; } - if ( !entry4.getChromosome().equals( "ras" ) ) { - System.out.println( entry4.getChromosome() ); - return false; - } - if ( !entry4.getMap().equals( "ras" ) ) { - System.out.println( entry4.getMap() ); - return false; - } + // if ( !entry4.getChromosome().equals( "ras" ) ) { + // System.out.println( entry4.getChromosome() ); + // return false; + // } + // if ( !entry4.getMap().equals( "ras" ) ) { + // System.out.println( entry4.getMap() ); + // return false; + // } //TODO FIXME gi... // //TODO fails: @@ -11518,6 +11531,22 @@ public final class Test { // if ( !entry5.getAccession().equals( "HM043801" ) ) { // return false; // } + final SequenceDatabaseEntry entry5 = SequenceDbWsTools.obtainEntry( "AAZ45343.1" ); + if ( !entry5.getAccession().equals( "AAZ45343" ) ) { + return false; + } + if ( !entry5.getTaxonomyScientificName().equals( "Dechloromonas aromatica RCB" ) ) { + System.out.println( entry5.getTaxonomyScientificName() ); + return false; + } + if ( !entry5.getSequenceName().equals( "Dechloromonas aromatica RCB 1,4-alpha-glucan branching enzyme" ) ) { + System.out.println( entry5.getSequenceName() ); + return false; + } + if ( !entry5.getTaxonomyIdentifier().equals( "159087" ) ) { + System.out.println( entry5.getTaxonomyIdentifier() ); + return false; + } } catch ( final IOException e ) { System.out.println(); diff --git a/forester/java/src/org/forester/ws/seqdb/EbiDbEntry.java b/forester/java/src/org/forester/ws/seqdb/EbiDbEntry.java index e32a2a4..00c52fc 100644 --- a/forester/java/src/org/forester/ws/seqdb/EbiDbEntry.java +++ b/forester/java/src/org/forester/ws/seqdb/EbiDbEntry.java @@ -38,273 +38,19 @@ import org.forester.util.ForesterUtil; public final class EbiDbEntry implements SequenceDatabaseEntry { - // public static SequenceDatabaseEntry createInstanceFromPlainText( final List lines ) { - // final EbiDbEntry e = new EbiDbEntry(); - // for( final String line : lines ) { - // if ( line.startsWith( "PA" ) ) { - // e.setPA( SequenceDbWsTools.extractFrom( line, "PA" ) ); - // } - // else if ( line.startsWith( "DE" ) ) { - // e.setDe( SequenceDbWsTools.extractFrom( line, "DE" ) ); - // } - // else if ( line.startsWith( "OS" ) ) { - // if ( line.indexOf( "(" ) > 0 ) { - // e.setOs( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) ); - // } - // else { - // e.setOs( SequenceDbWsTools.extractFrom( line, "OS" ) ); - // } - // } - // else if ( line.startsWith( "OX" ) ) { - // if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) { - // e.setTaxId( SequenceDbWsTools.extractFromTo( line, "NCBI_TaxID=", ";" ) ); - // } - // } - // } - // return e; - // } - public static SequenceDatabaseEntry createInstanceFromPlainTextForRefSeq( final List lines ) { - final Pattern X_PATTERN = Pattern.compile( "^[A-Z]+" ); - final Pattern chromosome_PATTERN = Pattern.compile( "\\s+/chromosome=\"(\\w+)\"" ); - final Pattern map_PATTERN = Pattern.compile( "\\s+/map=\"([\\w+\\.])\"" ); - final Pattern gene_PATTERN = Pattern.compile( "\\s+/gene=\"(.+)\"" ); - final Pattern mim_PATTERN = Pattern.compile( "\\s+/db_xref=\"MIM:(\\d+)\"" ); - final Pattern taxon_PATTERN = Pattern.compile( "\\s+/db_xref=\"taxon:(\\d+)\"" ); - final Pattern interpro_PATTERN = Pattern.compile( "\\s+/db_xref=\"InterPro:([A-Z0-9]+)\"" ); - final Pattern uniprot_PATTERN = Pattern.compile( "\\s+/db_xref=\"UniProtKB/[A-Za-z-]*:(\\w+)\"" ); - final Pattern hgnc_PATTERN = Pattern.compile( "\\s+/db_xref=\"[A-Z:]*HGNC:(\\d+)\"" ); - final Pattern geneid_PATTERN = Pattern.compile( "\\s+/db_xref=\"GeneID:(\\d+)\"" ); - final Pattern pdb_PATTERN = Pattern.compile( "\\s+/db_xref=\"PDB:([A-Z0-9]+)\"" ); - final Pattern ec_PATTERN = Pattern.compile( "\\s+/EC_number=\"([\\.\\-\\d]+)\"" ); - final Pattern product_PATTERN = Pattern.compile( "\\s+/product=\"(\\w{1,10})\"" ); - final EbiDbEntry e = new EbiDbEntry(); - final StringBuilder def = new StringBuilder(); - boolean in_definition = false; - boolean in_features = false; - boolean in_source = false; - boolean in_gene = false; - boolean in_cds = false; - boolean in_mrna = false; - boolean in_protein = false; - for( final String line : lines ) { - if ( line.startsWith( "ACCESSION " ) ) { - e.setAccession( SequenceDbWsTools.extractFrom( line, "ACCESSION" ) ); - in_definition = false; - } - else if ( line.startsWith( "ID " ) ) { - e.setAccession( SequenceDbWsTools.extractFromTo( line, "ID", ";" ) ); - in_definition = false; - } - else if ( line.startsWith( "DEFINITION " ) || ( line.startsWith( "DE " ) ) ) { - boolean definiton = false; - if ( line.startsWith( "DEFINITION " ) ) { - definiton = true; - } - if ( line.indexOf( "[" ) > 0 ) { - if ( definiton ) { - x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) ) ); - } - else { - x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "[" ) ) ); - } - } - else if ( line.indexOf( "." ) > 0 ) { - if ( definiton ) { - x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) ) ); - } - else { - x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "." ) ) ); - } - } - else { - if ( definiton ) { - x( def, ( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) ) ); - } - else { - x( def, ( SequenceDbWsTools.extractFrom( line, "DE" ) ) ); - } - } - if ( definiton ) { - in_definition = true; - } - } - else if ( line.startsWith( " ORGANISM " ) ) { - if ( line.indexOf( "(" ) > 0 ) { - e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, " ORGANISM", "(" ) ); - } - else { - e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, " ORGANISM" ) ); - } - // in_def = false; - } - else if ( line.startsWith( "OS " ) ) { - if ( line.indexOf( "(" ) > 0 ) { - e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) ); - } - else { - e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, "OS" ) ); - } - } - else if ( line.startsWith( " " ) && in_definition ) { - def.append( " " ); - if ( line.indexOf( "[" ) > 0 ) { - def.append( SequenceDbWsTools.extractTo( line, "[" ) ); - } - else if ( line.indexOf( "." ) > 0 ) { - def.append( SequenceDbWsTools.extractTo( line, "." ) ); - } - else { - def.append( line.trim() ); - } - } - else { - in_definition = false; - } - if ( !line.startsWith( "FT " ) && X_PATTERN.matcher( line ).find() ) { - in_features = false; - in_source = false; - in_gene = false; - in_cds = false; - in_mrna = false; - in_protein = false; - // in_def = false; - } - if ( line.startsWith( "FEATURES " ) || line.startsWith( "FT " ) ) { - in_features = true; - } - if ( in_features && ( line.startsWith( " source " ) || line.startsWith( "FT source " ) ) ) { - in_source = true; - in_gene = false; - in_cds = false; - in_mrna = false; - in_protein = false; - } - if ( in_features && ( line.startsWith( " gene " ) || line.startsWith( "FT gene " ) ) ) { - in_source = false; - in_gene = true; - in_cds = false; - in_mrna = false; - in_protein = false; - } - if ( in_features && ( line.startsWith( " CDS " ) || line.startsWith( "FT CDS " ) ) ) { - in_source = false; - in_gene = false; - in_cds = true; - in_mrna = false; - in_protein = false; - } - if ( in_features && ( line.startsWith( " Protein " ) || line.startsWith( "FT Protein " ) ) ) { - in_source = false; - in_gene = false; - in_cds = false; - in_mrna = false; - in_protein = true; - } - if ( in_features && ( line.startsWith( " mRNA " ) || line.startsWith( "FT mRNA " ) ) ) { - in_source = false; - in_gene = false; - in_cds = false; - in_mrna = true; - in_protein = false; - } - if ( in_source ) { - final Matcher ti = taxon_PATTERN.matcher( line ); - if ( ti.find() ) { - e.setTaxId( ti.group( 1 ) ); - } - final Matcher chr = chromosome_PATTERN.matcher( line ); - if ( chr.find() ) { - e.setChromosome( chr.group( 1 ) ); - } - final Matcher map = map_PATTERN.matcher( line ); - if ( map.find() ) { - e.setMap( map.group( 1 ) ); - } - } - if ( in_cds || in_gene ) { - final Matcher hgnc = hgnc_PATTERN.matcher( line ); - if ( hgnc.find() ) { - e.addCrossReference( new Accession( hgnc.group( 1 ), "hgnc" ) ); - } - final Matcher geneid = geneid_PATTERN.matcher( line ); - if ( geneid.find() ) { - e.addCrossReference( new Accession( geneid.group( 1 ), "geneid" ) ); - } - } - if ( in_protein || in_cds || in_gene || in_mrna ) { - final Matcher ec = ec_PATTERN.matcher( line ); - if ( ec.find() ) { - e.addAnnotation( new Annotation( "EC", ec.group( 1 ) ) ); - } - final Matcher gene = gene_PATTERN.matcher( line ); - if ( gene.find() ) { - e.setGeneName( gene.group( 1 ) ); - } - final Matcher uniprot = uniprot_PATTERN.matcher( line ); - if ( uniprot.find() ) { - e.addCrossReference( new Accession( uniprot.group( 1 ), "uniprot" ) ); - } - final Matcher interpro = interpro_PATTERN.matcher( line ); - if ( interpro.find() ) { - e.addCrossReference( new Accession( interpro.group( 1 ), "interpro" ) ); - } - final Matcher mim = mim_PATTERN.matcher( line ); - if ( mim.find() ) { - e.addCrossReference( new Accession( mim.group( 1 ), "mim" ) ); - } - final Matcher product = product_PATTERN.matcher( line ); - if ( product.find() ) { - e.setSequenceSymbol( product.group( 1 ) ); - } - final Matcher pdb = pdb_PATTERN.matcher( line ); - if ( pdb.find() ) { - e.addCrossReference( new Accession( pdb.group( 1 ), "pdb" ) ); - } - } - } - if ( def.length() > 0 ) { - e.setSequenceName( def.toString().trim() ); - } - return e; - } - private String _map; - private String _chromosome; - - private void setMap( final String map ) { - _map = map; - } - - private void setChromosome( final String chromosome ) { - _chromosome = chromosome; - } - - @Override - public String getMap() { - return _map; - } - - @Override - public String getChromosome() { - return _chromosome; - } - - private static void x( final StringBuilder sb, final String s ) { - if ( sb.length() > 0 ) { - sb.append( " " ); - } - sb.append( s.trim() ); - } + private SortedSet _annotations; + private String _chromosome; + private SortedSet _cross_references; + private String _de; + private String _gene_name; + private String _map; + private String _os; // FIXME actually this is NCBI entry //http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/emb/AAR37336/ private String _pa; - private String _de; - private String _os; - private String _tax_id; - private String _symbol; private String _provider; - private SortedSet _cross_references; - private SortedSet _annotations; - private String _gene_name; + private String _symbol; + private String _tax_id; // TODO PUBMED 15798186 //TODO (FEATURES) @@ -560,14 +306,6 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { private EbiDbEntry() { } - private void addCrossReference( final Accession accession ) { - if ( _cross_references == null ) { - _cross_references = new TreeSet(); - } - System.out.println( "XREF ADDED: " + accession ); - _cross_references.add( accession ); - } - @Override public Object clone() throws CloneNotSupportedException { throw new CloneNotSupportedException(); @@ -579,6 +317,16 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { } @Override + public SortedSet getAnnotations() { + return _annotations; + } + + @Override + public String getChromosome() { + return _chromosome; + } + + @Override public SortedSet getCrossReferences() { return _cross_references; } @@ -594,6 +342,11 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { } @Override + public String getMap() { + return _map; + } + + @Override public String getProvider() { return _provider; } @@ -608,10 +361,6 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { return _symbol; } - private void setSequenceSymbol( final String symbol ) { - _symbol = symbol; - } - @Override public String getTaxonomyIdentifier() { return _tax_id; @@ -629,10 +378,33 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { && ForesterUtil.isEmpty( getTaxonomyIdentifier() ) && ForesterUtil.isEmpty( getSequenceSymbol() ) ); } - private void setSequenceName( final String rec_name ) { - if ( _de == null ) { - _de = rec_name; + public void setProvider( final String provider ) { + _provider = provider; + } + + private void addAnnotation( final Annotation annotation ) { + if ( _annotations == null ) { + _annotations = new TreeSet(); } + _annotations.add( annotation ); + } + + private void addCrossReference( final Accession accession ) { + if ( _cross_references == null ) { + _cross_references = new TreeSet(); + } + System.out.println( "XREF ADDED: " + accession ); + _cross_references.add( accession ); + } + + private void setAccession( final String pa ) { + if ( _pa == null ) { + _pa = pa; + } + } + + private void setChromosome( final String chromosome ) { + _chromosome = chromosome; } private void setGeneName( final String gene_name ) { @@ -641,20 +413,18 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { } } - private void setTaxonomyScientificName( final String os ) { - if ( _os == null ) { - _os = os; - } + private void setMap( final String map ) { + _map = map; } - private void setAccession( final String pa ) { - if ( _pa == null ) { - _pa = pa; + private void setSequenceName( final String rec_name ) { + if ( _de == null ) { + _de = rec_name; } } - public void setProvider( final String provider ) { - _provider = provider; + private void setSequenceSymbol( final String symbol ) { + _symbol = symbol; } private void setTaxId( final String tax_id ) { @@ -663,15 +433,246 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { } } - @Override - public SortedSet getAnnotations() { - return _annotations; + private void setTaxonomyScientificName( final String os ) { + if ( _os == null ) { + _os = os; + } } - private void addAnnotation( final Annotation annotation ) { - if ( _annotations == null ) { - _annotations = new TreeSet(); + // public static SequenceDatabaseEntry createInstanceFromPlainText( final List lines ) { + // final EbiDbEntry e = new EbiDbEntry(); + // for( final String line : lines ) { + // if ( line.startsWith( "PA" ) ) { + // e.setPA( SequenceDbWsTools.extractFrom( line, "PA" ) ); + // } + // else if ( line.startsWith( "DE" ) ) { + // e.setDe( SequenceDbWsTools.extractFrom( line, "DE" ) ); + // } + // else if ( line.startsWith( "OS" ) ) { + // if ( line.indexOf( "(" ) > 0 ) { + // e.setOs( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) ); + // } + // else { + // e.setOs( SequenceDbWsTools.extractFrom( line, "OS" ) ); + // } + // } + // else if ( line.startsWith( "OX" ) ) { + // if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) { + // e.setTaxId( SequenceDbWsTools.extractFromTo( line, "NCBI_TaxID=", ";" ) ); + // } + // } + // } + // return e; + // } + public static SequenceDatabaseEntry createInstanceFromPlainTextForRefSeq( final List lines ) { + final Pattern X_PATTERN = Pattern.compile( "^[A-Z]+" ); + final Pattern chromosome_PATTERN = Pattern.compile( "\\s+/chromosome=\"(\\w+)\"" ); + final Pattern map_PATTERN = Pattern.compile( "\\s+/map=\"([\\w+\\.])\"" ); + final Pattern gene_PATTERN = Pattern.compile( "\\s+/gene=\"(.+)\"" ); + final Pattern mim_PATTERN = Pattern.compile( "\\s+/db_xref=\"MIM:(\\d+)\"" ); + final Pattern taxon_PATTERN = Pattern.compile( "\\s+/db_xref=\"taxon:(\\d+)\"" ); + final Pattern interpro_PATTERN = Pattern.compile( "\\s+/db_xref=\"InterPro:([A-Z0-9]+)\"" ); + final Pattern uniprot_PATTERN = Pattern.compile( "\\s+/db_xref=\"UniProtKB/[A-Za-z-]*:(\\w+)\"" ); + final Pattern hgnc_PATTERN = Pattern.compile( "\\s+/db_xref=\"[A-Z:]*HGNC:(\\d+)\"" ); + final Pattern geneid_PATTERN = Pattern.compile( "\\s+/db_xref=\"GeneID:(\\d+)\"" ); + final Pattern pdb_PATTERN = Pattern.compile( "\\s+/db_xref=\"PDB:([A-Z0-9]+)\"" ); + final Pattern ec_PATTERN = Pattern.compile( "\\s+/EC_number=\"([\\.\\-\\d]+)\"" ); + final Pattern product_PATTERN = Pattern.compile( "\\s+/product=\"(\\w{1,10})\"" ); + final EbiDbEntry e = new EbiDbEntry(); + final StringBuilder def = new StringBuilder(); + boolean in_definition = false; + boolean in_features = false; + boolean in_source = false; + boolean in_gene = false; + boolean in_cds = false; + boolean in_mrna = false; + boolean in_protein = false; + for( final String line : lines ) { + if ( line.startsWith( "ACCESSION " ) ) { + e.setAccession( SequenceDbWsTools.extractFrom( line, "ACCESSION" ) ); + in_definition = false; + } + else if ( line.startsWith( "ID " ) ) { + e.setAccession( SequenceDbWsTools.extractFromTo( line, "ID", ";" ) ); + in_definition = false; + } + else if ( line.startsWith( "DEFINITION " ) || ( line.startsWith( "DE " ) ) ) { + boolean definiton = false; + if ( line.startsWith( "DEFINITION " ) ) { + definiton = true; + } + if ( line.indexOf( "[" ) > 0 ) { + if ( definiton ) { + x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) ) ); + } + else { + x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "[" ) ) ); + } + } + else if ( line.indexOf( "." ) > 0 ) { + if ( definiton ) { + x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) ) ); + } + else { + x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "." ) ) ); + } + } + else { + if ( definiton ) { + x( def, ( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) ) ); + } + else { + x( def, ( SequenceDbWsTools.extractFrom( line, "DE" ) ) ); + } + } + if ( definiton ) { + in_definition = true; + } + } + else if ( line.startsWith( " ORGANISM " ) ) { + if ( line.indexOf( "(" ) > 0 ) { + e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, " ORGANISM", "(" ) ); + } + else { + e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, " ORGANISM" ) ); + } + // in_def = false; + } + else if ( line.startsWith( "OS " ) ) { + if ( line.indexOf( "(" ) > 0 ) { + e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) ); + } + else { + e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, "OS" ) ); + } + } + else if ( line.startsWith( " " ) && in_definition ) { + def.append( " " ); + if ( line.indexOf( "[" ) > 0 ) { + def.append( SequenceDbWsTools.extractTo( line, "[" ) ); + } + else if ( line.indexOf( "." ) > 0 ) { + def.append( SequenceDbWsTools.extractTo( line, "." ) ); + } + else { + def.append( line.trim() ); + } + } + else { + in_definition = false; + } + if ( !line.startsWith( "FT " ) && X_PATTERN.matcher( line ).find() ) { + in_features = false; + in_source = false; + in_gene = false; + in_cds = false; + in_mrna = false; + in_protein = false; + // in_def = false; + } + if ( line.startsWith( "FEATURES " ) || line.startsWith( "FT " ) ) { + in_features = true; + } + if ( in_features && ( line.startsWith( " source " ) || line.startsWith( "FT source " ) ) ) { + in_source = true; + in_gene = false; + in_cds = false; + in_mrna = false; + in_protein = false; + } + if ( in_features && ( line.startsWith( " gene " ) || line.startsWith( "FT gene " ) ) ) { + in_source = false; + in_gene = true; + in_cds = false; + in_mrna = false; + in_protein = false; + } + if ( in_features && ( line.startsWith( " CDS " ) || line.startsWith( "FT CDS " ) ) ) { + in_source = false; + in_gene = false; + in_cds = true; + in_mrna = false; + in_protein = false; + } + if ( in_features && ( line.startsWith( " Protein " ) || line.startsWith( "FT Protein " ) ) ) { + in_source = false; + in_gene = false; + in_cds = false; + in_mrna = false; + in_protein = true; + } + if ( in_features && ( line.startsWith( " mRNA " ) || line.startsWith( "FT mRNA " ) ) ) { + in_source = false; + in_gene = false; + in_cds = false; + in_mrna = true; + in_protein = false; + } + if ( in_source ) { + final Matcher ti = taxon_PATTERN.matcher( line ); + if ( ti.find() ) { + e.setTaxId( ti.group( 1 ) ); + } + final Matcher chr = chromosome_PATTERN.matcher( line ); + if ( chr.find() ) { + e.setChromosome( chr.group( 1 ) ); + } + final Matcher map = map_PATTERN.matcher( line ); + if ( map.find() ) { + e.setMap( map.group( 1 ) ); + } + } + if ( in_cds || in_gene ) { + final Matcher hgnc = hgnc_PATTERN.matcher( line ); + if ( hgnc.find() ) { + e.addCrossReference( new Accession( hgnc.group( 1 ), "hgnc" ) ); + } + final Matcher geneid = geneid_PATTERN.matcher( line ); + if ( geneid.find() ) { + e.addCrossReference( new Accession( geneid.group( 1 ), "geneid" ) ); + } + } + if ( in_protein || in_cds || in_gene || in_mrna ) { + final Matcher ec = ec_PATTERN.matcher( line ); + if ( ec.find() ) { + e.addAnnotation( new Annotation( "EC", ec.group( 1 ) ) ); + } + final Matcher gene = gene_PATTERN.matcher( line ); + if ( gene.find() ) { + e.setGeneName( gene.group( 1 ) ); + } + final Matcher uniprot = uniprot_PATTERN.matcher( line ); + if ( uniprot.find() ) { + e.addCrossReference( new Accession( uniprot.group( 1 ), "uniprot" ) ); + } + final Matcher interpro = interpro_PATTERN.matcher( line ); + if ( interpro.find() ) { + e.addCrossReference( new Accession( interpro.group( 1 ), "interpro" ) ); + } + final Matcher mim = mim_PATTERN.matcher( line ); + if ( mim.find() ) { + e.addCrossReference( new Accession( mim.group( 1 ), "mim" ) ); + } + final Matcher product = product_PATTERN.matcher( line ); + if ( product.find() ) { + e.setSequenceSymbol( product.group( 1 ) ); + } + final Matcher pdb = pdb_PATTERN.matcher( line ); + if ( pdb.find() ) { + e.addCrossReference( new Accession( pdb.group( 1 ), "pdb" ) ); + } + } } - _annotations.add( annotation ); + if ( def.length() > 0 ) { + e.setSequenceName( def.toString().trim() ); + } + return e; + } + + private static void x( final StringBuilder sb, final String s ) { + if ( sb.length() > 0 ) { + sb.append( " " ); + } + sb.append( s.trim() ); } } diff --git a/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java b/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java index 29376bd..d7cafd1 100644 --- a/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java +++ b/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java @@ -61,6 +61,7 @@ public final class SequenceDbWsTools { public final static String EMBL_DBS_REFSEQ_P = "refseqp"; public final static String EMBL_GENBANK = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=GENBANK&style=raw&id="; public final static String EMBL_REFSEQ = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=REFSEQ&style=raw&id="; + public final static String EMBL_EMBL = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=EMBL&style=raw&id="; private final static boolean DEBUG = true; private final static String URL_ENC = "UTF-8"; @@ -257,28 +258,24 @@ public final class SequenceDbWsTools { return result; } - public static List queryEmblDb( final Accession id, final int max_lines_to_return ) throws IOException { + public static List queryEmblDb( final Accession acc, final int max_lines_to_return ) throws IOException { final StringBuilder url_sb = new StringBuilder(); // url_sb.append( BASE_EMBL_DB_URL ); - if ( id.getSource().equals( Source.NCBI.toString() ) ) { + System.out.println( "source: " + acc.getSource() ); + if ( acc.getSource().equals( Source.NCBI.toString() ) ) { url_sb.append( EMBL_GENBANK ); //url_sb.append( '/' ); } - else if ( id.getSource().equals( Source.REFSEQ.toString() ) ) { + else if ( acc.getSource().equals( Source.REFSEQ.toString() ) ) { url_sb.append( EMBL_REFSEQ ); - // if ( id.getValue().toUpperCase().indexOf( 'P' ) == 1 ) { - // url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_P ); - // url_sb.append( '/' ); - // } - // else { - // url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_N ); - // url_sb.append( '/' ); - // } + } + else if ( acc.getSource().equals( Source.EMBL.toString() ) ) { + url_sb.append( EMBL_EMBL ); } else { - throw new IllegalArgumentException( "unable to handle source: " + id.getSource() ); + throw new IllegalArgumentException( "unable to handle source: " + acc.getSource() ); } - return queryDb( id.getValue(), max_lines_to_return, url_sb.toString() ); + return queryDb( acc.getValue(), max_lines_to_return, url_sb.toString() ); } public static List queryEmblDbForRefSeqEntry( final Accession id, final int max_lines_to_return ) @@ -330,20 +327,32 @@ public final class SequenceDbWsTools { // Eat this, and move to next. } } - else if ( acc.getSource().equals( Source.EMBL.toString() ) ) { + else if ( acc.getSource().equals( Source.REFSEQ.toString() ) ) { if ( DEBUG ) { - System.out.println( "embl: " + query ); + System.out.println( "refseq: " + query ); } try { - db_entry = obtainEmblEntry( new Accession( query ), lines_to_return ); + db_entry = obtainRefSeqEntryFromEmbl( new Accession( query ), lines_to_return ); } catch ( final FileNotFoundException e ) { // Eat this, and move to next. } } - else if ( acc.getSource().equals( Source.REFSEQ.toString() ) ) { + else if ( acc.getSource().equals( Source.EMBL.toString() ) || acc.getSource().equals( Source.NCBI.toString() ) + || acc.getSource().equals( Source.EMBL.toString() ) ) { if ( DEBUG ) { - System.out.println( "refseq: " + query ); + System.out.println( acc.toString() ); + } + try { + db_entry = obtainEmblEntry( acc, lines_to_return ); + } + catch ( final FileNotFoundException e ) { + // Eat this, and move to next. + } + } + else if ( acc.getSource().equals( Source.GI.toString() ) ) { + if ( DEBUG ) { + System.out.println( "gi: " + query ); } try { db_entry = obtainRefSeqEntryFromEmbl( new Accession( query ), lines_to_return );