From 22b24f8bfc6470aae914f5c97c826542f2697e77 Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Sat, 12 Oct 2013 03:37:19 +0000 Subject: [PATCH] inprogress --- forester/java/src/org/forester/test/Test.java | 62 ++++++++++++- .../java/src/org/forester/ws/seqdb/EbiDbEntry.java | 96 ++++++++++++++++---- .../forester/ws/seqdb/SequenceDatabaseEntry.java | 3 + .../org/forester/ws/seqdb/SequenceDbWsTools.java | 34 +++++-- .../src/org/forester/ws/seqdb/UniProtEntry.java | 7 +- 5 files changed, 171 insertions(+), 31 deletions(-) diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index 3de1c55..b073bb4 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -237,6 +237,18 @@ public final class Test { System.out.println( "failed." ); failed++; } + if ( PERFORM_DB_TESTS ) { + System.out.print( "Ebi Entry Retrieval: " ); + if ( Test.testEbiEntryRetrieval() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + } + System.exit( 0 ); System.out.print( "UniProtKB id extraction: " ); if ( Test.testExtractUniProtKbProteinSeqIdentifier() ) { System.out.println( "OK." ); @@ -267,7 +279,6 @@ public final class Test { System.exit( -1 ); } } - // System.exit( 0 ); System.out.print( "Hmmscan output parser: " ); if ( testHmmscanOutputParser() ) { System.out.println( "OK." ); @@ -829,8 +840,8 @@ public final class Test { System.out.println( "failed." ); failed++; } - System.out.print( "EMBL Entry Retrieval: " ); - if ( Test.testEmblEntryRetrieval() ) { + System.out.print( "Genbank accessor parsing: " ); + if ( Test.testGenbankAccessorParsing() ) { System.out.println( "OK." 
); succeeded++; } @@ -3362,7 +3373,7 @@ public final class Test { return true; } - private static boolean testEmblEntryRetrieval() { + private static boolean testGenbankAccessorParsing() { //The format for GenBank Accession numbers are: //Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals //Protein: 3 letters + 5 numerals @@ -11038,6 +11049,49 @@ public final class Test { return true; } + private static boolean testEbiEntryRetrieval() { + try { + final SequenceDatabaseEntry entry = SequenceDbWsTools + .obtainEmblEntry( new Accession( "AAK41263", Accession.Source.NCBI ) ); + if ( !entry.getAccession().equals( "AAK41263" ) ) { + System.out.println( entry.getAccession() ); + return false; + } + if ( !entry.getTaxonomyScientificName().equals( "Sulfolobus solfataricus P2" ) ) { + System.out.println( entry.getTaxonomyScientificName() ); + return false; + } + if ( !entry.getSequenceName() + .equals( "Sulfolobus solfataricus P2 Glycogen debranching enzyme, hypothetical (treX-like)" ) ) { + System.out.println( entry.getSequenceName() ); + return false; + } + // if ( !entry.getSequenceSymbol().equals( "mAspAT" ) ) { + // System.out.println( entry.getSequenceSymbol() ); + // return false; + // } + if ( !entry.getGeneName().equals( "treX-like" ) ) { + System.out.println( entry.getGeneName() ); + return false; + } + if ( !entry.getTaxonomyIdentifier().equals( "273057" ) ) { + System.out.println( entry.getTaxonomyIdentifier() ); + return false; + } + } + catch ( final IOException e ) { + System.out.println(); + System.out.println( "the following might be due to absence internet connection:" ); + e.printStackTrace( System.out ); + return true; + } + catch ( final Exception e ) { + e.printStackTrace(); + return false; + } + return true; + } + private static boolean testUniprotEntryRetrieval() { try { final SequenceDatabaseEntry entry = SequenceDbWsTools.obtainUniProtEntry( "P12345", 200 ); diff --git a/forester/java/src/org/forester/ws/seqdb/EbiDbEntry.java 
b/forester/java/src/org/forester/ws/seqdb/EbiDbEntry.java index 5f2cad2..c167259 100644 --- a/forester/java/src/org/forester/ws/seqdb/EbiDbEntry.java +++ b/forester/java/src/org/forester/ws/seqdb/EbiDbEntry.java @@ -27,10 +27,12 @@ package org.forester.ws.seqdb; import java.util.ArrayList; import java.util.List; +import java.util.regex.Matcher; import java.util.regex.Pattern; import org.forester.go.GoTerm; import org.forester.phylogeny.data.Accession; +import org.forester.phylogeny.data.Annotation; import org.forester.util.ForesterUtil; public final class EbiDbEntry implements SequenceDatabaseEntry { @@ -70,9 +72,10 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { final Pattern taxon_xref_PATTERN = Pattern.compile( "\\s+/db_xref=\"taxon:(\\d+)\"" ); final Pattern interpro_PATTERN = Pattern.compile( "\\s+/db_xref=\"InterPro:(IP\\d+)\"" ); final Pattern uniprot_PATTERN = Pattern.compile( "\\s+/db_xref=\"UniProtKB/TrEMBL:(\\w+)\"" ); + final Pattern ec_PATTERN = Pattern.compile( "\\s+/EC_number=\"([\\.\\-\\d]+)\"" ); /* group 1 captures the EC number itself; it is read via m.group( 1 ) when the annotation is added */ final EbiDbEntry e = new EbiDbEntry(); final StringBuilder def = new StringBuilder(); - boolean in_def = false; + boolean in_definition = false; boolean in_features = false; boolean in_source = false; boolean in_gene = false; @@ -81,19 +84,44 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { for( final String line : lines ) { if ( line.startsWith( "ACCESSION " ) ) { e.setPA( SequenceDbWsTools.extractFrom( line, "ACCESSION" ) ); - in_def = false; + in_definition = false; } - else if ( line.startsWith( "DEFINITION " ) ) { + else if ( line.startsWith( "ID " ) ) { + e.setPA( SequenceDbWsTools.extractFromTo( line, "ID", ";" ) ); + in_definition = false; + } + else if ( line.startsWith( "DEFINITION " ) || ( line.startsWith( "DE " ) ) ) { + boolean definiton = false; + if ( line.startsWith( "DEFINITION " ) ) { + definiton = true; + } if ( line.indexOf( "[" ) > 0 ) { - def.append( SequenceDbWsTools.extractFromTo( line, 
"DEFINITION", "[" ) ); + if ( definiton ) { + x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) ) ); + } + else { + x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "[" ) ) ); + } } else if ( line.indexOf( "." ) > 0 ) { - def.append( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) ); + if ( definiton ) { + x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) ) ); + } + else { + x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "." ) ) ); + } } else { - def.append( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) ); + if ( definiton ) { + x( def, ( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) ) ); + } + else { + x( def, ( SequenceDbWsTools.extractFrom( line, "DE" ) ) ); + } + } + if ( definiton ) { + in_definition = true; } - in_def = true; } else if ( line.startsWith( " ORGANISM " ) ) { if ( line.indexOf( "(" ) > 0 ) { @@ -104,7 +132,15 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { } // in_def = false; } - else if ( line.startsWith( " " ) && in_def ) { + else if ( line.startsWith( "OS " ) ) { + if ( line.indexOf( "(" ) > 0 ) { + e.setOs( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) ); + } + else { + e.setOs( SequenceDbWsTools.extractFrom( line, "OS" ) ); + } + } + else if ( line.startsWith( " " ) && in_definition ) { def.append( " " ); if ( line.indexOf( "[" ) > 0 ) { def.append( SequenceDbWsTools.extractTo( line, "[" ) ); @@ -117,7 +153,7 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { } } else { - in_def = false; + in_definition = false; } if ( X_PATTERN.matcher( line ).find() ) { in_features = false; @@ -154,22 +190,36 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { in_cds = false; in_protein = true; } + if ( in_protein || in_cds ) { + final Matcher m = ec_PATTERN.matcher( line ); + if ( m.find() ) { + e.addAnnotation( new Annotation( "EC", m.group( 1 ) ) ); + } + } } if ( def.length() > 0 ) { e.setDe( def.toString().trim() ); } 
return e; } + + private static void x( final StringBuilder sb, final String s ) { + if ( sb.length() > 0 ) { + sb.append( " " ); + } + sb.append( s.trim() ); + } // FIXME actually this is NCBI entry //http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/emb/AAR37336/ - private String _pa; - private String _de; - private String _os; - private String _tax_id; - private String _symbol; - private String _provider; - private ArrayList _cross_references; - private String _gene_name; + private String _pa; + private String _de; + private String _os; + private String _tax_id; + private String _symbol; + private String _provider; + private List _cross_references; + private List _annotations; + private String _gene_name; // TODO PUBMED 15798186 //TODO (FEATURES) @@ -523,4 +573,16 @@ public final class EbiDbEntry implements SequenceDatabaseEntry { _tax_id = tax_id; } } + + @Override + public List getAnnotations() { + return _annotations; + } + + private void addAnnotation( final Annotation annotation ) { + if ( _annotations == null ) { + _annotations = new ArrayList(); + } + _annotations.add( annotation ); + } } diff --git a/forester/java/src/org/forester/ws/seqdb/SequenceDatabaseEntry.java b/forester/java/src/org/forester/ws/seqdb/SequenceDatabaseEntry.java index 3a28d6a..b060d0d 100644 --- a/forester/java/src/org/forester/ws/seqdb/SequenceDatabaseEntry.java +++ b/forester/java/src/org/forester/ws/seqdb/SequenceDatabaseEntry.java @@ -29,6 +29,7 @@ import java.util.List; import org.forester.go.GoTerm; import org.forester.phylogeny.data.Accession; +import org.forester.phylogeny.data.Annotation; public interface SequenceDatabaseEntry { @@ -38,6 +39,8 @@ public interface SequenceDatabaseEntry { public List getGoTerms(); + public List getAnnotations(); + public String getProvider(); public String getSequenceName(); diff --git a/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java b/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java index cecef9f..209d284 100644 --- 
a/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java +++ b/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java @@ -55,8 +55,9 @@ import org.forester.util.SequenceAccessionTools; public final class SequenceDbWsTools { public final static String EMBL_REFSEQ = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=REFSEQ&style=raw&id="; + public final static String EMBL_GENBANK = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=GENBANK&style=raw&id="; public final static String BASE_UNIPROT_URL = "http://www.uniprot.org/"; - public final static String EMBL_DBS_EMBL = "embl"; + //public final static String EMBL_DBS_EMBL = "embl"; public final static String EMBL_DBS_REFSEQ_N = "refseqn"; public final static String EMBL_DBS_REFSEQ_P = "refseqp"; private final static boolean DEBUG = true; @@ -141,10 +142,14 @@ public final class SequenceDbWsTools { return null; } - public static SequenceDatabaseEntry obtainEmblEntry( final Accession id, final int max_lines_to_return ) + public static SequenceDatabaseEntry obtainEmblEntry( final Accession acc, final int max_lines_to_return ) throws IOException { - final List lines = queryEmblDb( id, max_lines_to_return ); - return EbiDbEntry.createInstanceFromPlainText( lines ); + final List lines = queryEmblDb( acc, max_lines_to_return ); + return EbiDbEntry.createInstanceFromPlainTextForRefSeq( lines ); + } + + public static SequenceDatabaseEntry obtainEmblEntry( final Accession acc ) throws IOException { + return obtainEmblEntry( acc, DEFAULT_LINES_TO_RETURN ); } public final static Accession obtainSeqAccession( final PhylogenyNode node ) { @@ -155,12 +160,16 @@ public final class SequenceDbWsTools { return acc; } - public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Accession id, final int max_lines_to_return ) + public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Accession acc, final int max_lines_to_return ) throws IOException { - final List lines = queryEmblDbForRefSeqEntry( id, 
max_lines_to_return ); + final List lines = queryEmblDbForRefSeqEntry( acc, max_lines_to_return ); return EbiDbEntry.createInstanceFromPlainTextForRefSeq( lines ); } + public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Accession acc ) throws IOException { + return obtainRefSeqEntryFromEmbl( acc, DEFAULT_LINES_TO_RETURN ); + } + public final static void obtainSeqInformation( final boolean allow_to_set_taxonomic_data, final int lines_to_return, final SortedSet not_found, @@ -206,6 +215,10 @@ public final class SequenceDbWsTools { return UniProtEntry.createInstanceFromPlainText( lines ); } + public static SequenceDatabaseEntry obtainUniProtEntry( final String query ) throws IOException { + return obtainUniProtEntry( query, DEFAULT_LINES_TO_RETURN ); + } + public static List queryDb( final String query, int max_lines_to_return, final String base_url ) throws IOException { if ( ForesterUtil.isEmpty( query ) ) { @@ -252,9 +265,9 @@ public final class SequenceDbWsTools { public static List queryEmblDb( final Accession id, final int max_lines_to_return ) throws IOException { final StringBuilder url_sb = new StringBuilder(); // url_sb.append( BASE_EMBL_DB_URL ); - if ( ForesterUtil.isEmpty( id.getSource() ) || ( id.getSource().equals( Source.NCBI.toString() ) ) ) { - url_sb.append( EMBL_DBS_EMBL ); - url_sb.append( '/' ); + if ( id.getSource().equals( Source.NCBI.toString() ) ) { + url_sb.append( EMBL_GENBANK ); + //url_sb.append( '/' ); } else if ( id.getSource().equals( Source.REFSEQ.toString() ) ) { url_sb.append( EMBL_REFSEQ ); @@ -267,6 +280,9 @@ public final class SequenceDbWsTools { // url_sb.append( '/' ); // } } + else { + throw new IllegalArgumentException( "unable to handle source: " + id.getSource() ); + } return queryDb( id.getValue(), max_lines_to_return, url_sb.toString() ); } diff --git a/forester/java/src/org/forester/ws/seqdb/UniProtEntry.java b/forester/java/src/org/forester/ws/seqdb/UniProtEntry.java index 4ba10de..1345523 100644 --- 
a/forester/java/src/org/forester/ws/seqdb/UniProtEntry.java +++ b/forester/java/src/org/forester/ws/seqdb/UniProtEntry.java @@ -34,6 +34,7 @@ import org.forester.go.BasicGoTerm; import org.forester.go.GoNameSpace; import org.forester.go.GoTerm; import org.forester.phylogeny.data.Accession; +import org.forester.phylogeny.data.Annotation; import org.forester.util.ForesterUtil; public final class UniProtEntry implements SequenceDatabaseEntry { @@ -124,7 +125,6 @@ public final class UniProtEntry implements SequenceDatabaseEntry { if ( _cross_references == null ) { _cross_references = new ArrayList(); } - System.out.println( "XREF ADDED: " + accession ); _cross_references.add( accession ); } @@ -288,4 +288,9 @@ public final class UniProtEntry implements SequenceDatabaseEntry { } return e; } + + @Override + public List getAnnotations() { + return null; + } } -- 1.7.10.2