System.out.println( "failed." );
failed++;
}
+ if ( PERFORM_DB_TESTS ) {
+ System.out.print( "Ebi Entry Retrieval: " );
+ if ( Test.testEbiEntryRetrieval() ) {
+ System.out.println( "OK." );
+ succeeded++;
+ }
+ else {
+ System.out.println( "failed." );
+ failed++;
+ }
+ }
+ System.exit( 0 );
System.out.print( "UniProtKB id extraction: " );
if ( Test.testExtractUniProtKbProteinSeqIdentifier() ) {
System.out.println( "OK." );
System.exit( -1 );
}
}
- // System.exit( 0 );
System.out.print( "Hmmscan output parser: " );
if ( testHmmscanOutputParser() ) {
System.out.println( "OK." );
System.out.println( "failed." );
failed++;
}
- System.out.print( "EMBL Entry Retrieval: " );
- if ( Test.testEmblEntryRetrieval() ) {
+ System.out.print( "Genbank accessor parsing: " );
+ if ( Test.testGenbankAccessorParsing() ) {
System.out.println( "OK." );
succeeded++;
}
return true;
}
- private static boolean testEmblEntryRetrieval() {
+ private static boolean testGenbankAccessorParsing() {
//The format for GenBank Accession numbers are:
//Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals
//Protein: 3 letters + 5 numerals
return true;
}
+ /**
+  * Retrieves the entry for accession "AAK41263" (source NCBI) through
+  * SequenceDbWsTools.obtainEmblEntry and compares its accession, taxonomy
+  * scientific name, sequence name, gene name and taxonomy identifier with
+  * fixed expected values. The first value that differs is printed and the
+  * test fails.
+  *
+  * @return false on the first mismatch or on any non-IO exception; true when
+  *         every check passes, and also when an IOException occurs (treated
+  *         as a possibly missing network connection rather than a failure)
+  */
+ private static boolean testEbiEntryRetrieval() {
+     try {
+         final Accession query = new Accession( "AAK41263", Accession.Source.NCBI );
+         final SequenceDatabaseEntry fetched = SequenceDbWsTools.obtainEmblEntry( query );
+         if ( !fetched.getAccession().equals( "AAK41263" ) ) {
+             System.out.println( fetched.getAccession() );
+             return false;
+         }
+         else if ( !fetched.getTaxonomyScientificName().equals( "Sulfolobus solfataricus P2" ) ) {
+             System.out.println( fetched.getTaxonomyScientificName() );
+             return false;
+         }
+         else if ( !fetched.getSequenceName()
+                 .equals( "Sulfolobus solfataricus P2 Glycogen debranching enzyme, hypothetical (treX-like)" ) ) {
+             System.out.println( fetched.getSequenceName() );
+             return false;
+         }
+         // The sequence symbol is deliberately not checked here.
+         else if ( !fetched.getGeneName().equals( "treX-like" ) ) {
+             System.out.println( fetched.getGeneName() );
+             return false;
+         }
+         else if ( !fetched.getTaxonomyIdentifier().equals( "273057" ) ) {
+             System.out.println( fetched.getTaxonomyIdentifier() );
+             return false;
+         }
+         return true;
+     }
+     catch ( final IOException io_problem ) {
+         // Network trouble is reported on stdout but does not fail the test.
+         System.out.println();
+         System.out.println( "the following might be due to absence internet connection:" );
+         io_problem.printStackTrace( System.out );
+         return true;
+     }
+     catch ( final Exception unexpected ) {
+         unexpected.printStackTrace();
+         return false;
+     }
+ }
+
private static boolean testUniprotEntryRetrieval() {
try {
final SequenceDatabaseEntry entry = SequenceDbWsTools.obtainUniProtEntry( "P12345", 200 );
import java.util.ArrayList;
import java.util.List;
+import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.forester.go.GoTerm;
import org.forester.phylogeny.data.Accession;
+import org.forester.phylogeny.data.Annotation;
import org.forester.util.ForesterUtil;
public final class EbiDbEntry implements SequenceDatabaseEntry {
final Pattern taxon_xref_PATTERN = Pattern.compile( "\\s+/db_xref=\"taxon:(\\d+)\"" );
final Pattern interpro_PATTERN = Pattern.compile( "\\s+/db_xref=\"InterPro:(IP\\d+)\"" );
final Pattern uniprot_PATTERN = Pattern.compile( "\\s+/db_xref=\"UniProtKB/TrEMBL:(\\w+)\"" );
+ // Matches an /EC_number="..." qualifier line; capturing group 1 holds the EC number
+ // itself (digits, dots and dashes), which is what the parser reads via m.group( 1 ).
+ final Pattern ec_PATTERN = Pattern.compile( "\\s+/EC_number=\"([\\.\\-\\d]+)\"" );
final EbiDbEntry e = new EbiDbEntry();
final StringBuilder def = new StringBuilder();
- boolean in_def = false;
+ boolean in_definition = false;
boolean in_features = false;
boolean in_source = false;
boolean in_gene = false;
for( final String line : lines ) {
if ( line.startsWith( "ACCESSION " ) ) {
e.setPA( SequenceDbWsTools.extractFrom( line, "ACCESSION" ) );
- in_def = false;
+ in_definition = false;
}
- else if ( line.startsWith( "DEFINITION " ) ) {
+ else if ( line.startsWith( "ID " ) ) {
+ e.setPA( SequenceDbWsTools.extractFromTo( line, "ID", ";" ) );
+ in_definition = false;
+ }
+ else if ( line.startsWith( "DEFINITION " ) || ( line.startsWith( "DE " ) ) ) {
+ boolean definiton = false;
+ if ( line.startsWith( "DEFINITION " ) ) {
+ definiton = true;
+ }
if ( line.indexOf( "[" ) > 0 ) {
- def.append( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) );
+ if ( definiton ) {
+ x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) ) );
+ }
+ else {
+ x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "[" ) ) );
+ }
}
else if ( line.indexOf( "." ) > 0 ) {
- def.append( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) );
+ if ( definiton ) {
+ x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) ) );
+ }
+ else {
+ x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "." ) ) );
+ }
}
else {
- def.append( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) );
+ if ( definiton ) {
+ x( def, ( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) ) );
+ }
+ else {
+ x( def, ( SequenceDbWsTools.extractFrom( line, "DE" ) ) );
+ }
+ }
+ if ( definiton ) {
+ in_definition = true;
}
- in_def = true;
}
else if ( line.startsWith( " ORGANISM " ) ) {
if ( line.indexOf( "(" ) > 0 ) {
}
// in_def = false;
}
- else if ( line.startsWith( " " ) && in_def ) {
+ else if ( line.startsWith( "OS " ) ) {
+ if ( line.indexOf( "(" ) > 0 ) {
+ e.setOs( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
+ }
+ else {
+ e.setOs( SequenceDbWsTools.extractFrom( line, "OS" ) );
+ }
+ }
+ else if ( line.startsWith( " " ) && in_definition ) {
def.append( " " );
if ( line.indexOf( "[" ) > 0 ) {
def.append( SequenceDbWsTools.extractTo( line, "[" ) );
}
}
else {
- in_def = false;
+ in_definition = false;
}
if ( X_PATTERN.matcher( line ).find() ) {
in_features = false;
in_cds = false;
in_protein = true;
}
+ if ( in_protein || in_cds ) {
+ final Matcher m = ec_PATTERN.matcher( line );
+ if ( m.find() ) {
+ e.addAnnotation( new Annotation( "EC", m.group( 1 ) ) );
+ }
+ }
}
if ( def.length() > 0 ) {
e.setDe( def.toString().trim() );
}
return e;
}
+
+ private static void x( final StringBuilder sb, final String s ) {
+ if ( sb.length() > 0 ) {
+ sb.append( " " );
+ }
+ sb.append( s.trim() );
+ }
// FIXME actually this is NCBI entry
//http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/emb/AAR37336/
- private String _pa;
- private String _de;
- private String _os;
- private String _tax_id;
- private String _symbol;
- private String _provider;
- private ArrayList<Accession> _cross_references;
- private String _gene_name;
+ private String _pa;
+ private String _de;
+ private String _os;
+ private String _tax_id;
+ private String _symbol;
+ private String _provider;
+ private List<Accession> _cross_references;
+ private List<Annotation> _annotations;
+ private String _gene_name;
// TODO PUBMED 15798186
//TODO (FEATURES)
_tax_id = tax_id;
}
}
+
+ @Override
+ public List<Annotation> getAnnotations() {
+ return _annotations; // may be null: the list is only created on the first addAnnotation call
+ }
+
+ /**
+  * Stores one annotation on this entry, creating the backing list on first use.
+  *
+  * @param annotation the annotation to store
+  */
+ private void addAnnotation( final Annotation annotation ) {
+     List<Annotation> target = _annotations;
+     if ( target == null ) {
+         target = new ArrayList<Annotation>();
+         _annotations = target;
+     }
+     target.add( annotation );
+ }
}
import org.forester.go.GoTerm;
import org.forester.phylogeny.data.Accession;
+import org.forester.phylogeny.data.Annotation;
public interface SequenceDatabaseEntry {
public List<GoTerm> getGoTerms();
+ public List<Annotation> getAnnotations();
+
public String getProvider();
public String getSequenceName();
public final class SequenceDbWsTools {
public final static String EMBL_REFSEQ = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=REFSEQ&style=raw&id=";
+ public final static String EMBL_GENBANK = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=GENBANK&style=raw&id=";
public final static String BASE_UNIPROT_URL = "http://www.uniprot.org/";
- public final static String EMBL_DBS_EMBL = "embl";
+ //public final static String EMBL_DBS_EMBL = "embl";
public final static String EMBL_DBS_REFSEQ_N = "refseqn";
public final static String EMBL_DBS_REFSEQ_P = "refseqp";
private final static boolean DEBUG = true;
return null;
}
- public static SequenceDatabaseEntry obtainEmblEntry( final Accession id, final int max_lines_to_return )
+ public static SequenceDatabaseEntry obtainEmblEntry( final Accession acc, final int max_lines_to_return )
throws IOException {
- final List<String> lines = queryEmblDb( id, max_lines_to_return );
- return EbiDbEntry.createInstanceFromPlainText( lines );
+ final List<String> lines = queryEmblDb( acc, max_lines_to_return ); // text lines returned by the EMBL query for this accession
+ return EbiDbEntry.createInstanceFromPlainTextForRefSeq( lines ); // entry is now built by the ...ForRefSeq factory instead of createInstanceFromPlainText
+ }
+
+ /**
+  * Convenience overload: obtains the entry for {@code acc} using
+  * DEFAULT_LINES_TO_RETURN as the line limit.
+  *
+  * @param acc the accession to look up
+  * @return the entry produced by the two-argument overload
+  * @throws IOException propagated from the two-argument overload
+  */
+ public static SequenceDatabaseEntry obtainEmblEntry( final Accession acc ) throws IOException {
+     final int line_limit = DEFAULT_LINES_TO_RETURN;
+     return obtainEmblEntry( acc, line_limit );
}
public final static Accession obtainSeqAccession( final PhylogenyNode node ) {
return acc;
}
- public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Accession id, final int max_lines_to_return )
+ public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Accession acc, final int max_lines_to_return )
throws IOException {
- final List<String> lines = queryEmblDbForRefSeqEntry( id, max_lines_to_return );
+ final List<String> lines = queryEmblDbForRefSeqEntry( acc, max_lines_to_return ); // text lines returned by the RefSeq query for this accession
return EbiDbEntry.createInstanceFromPlainTextForRefSeq( lines ); // build the entry from those lines
}
+ /**
+  * Convenience overload: obtains the RefSeq entry for {@code acc} using
+  * DEFAULT_LINES_TO_RETURN as the line limit.
+  *
+  * @param acc the accession to look up
+  * @return the entry produced by the two-argument overload
+  * @throws IOException propagated from the two-argument overload
+  */
+ public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Accession acc ) throws IOException {
+     final int line_limit = DEFAULT_LINES_TO_RETURN;
+     return obtainRefSeqEntryFromEmbl( acc, line_limit );
+ }
+
public final static void obtainSeqInformation( final boolean allow_to_set_taxonomic_data,
final int lines_to_return,
final SortedSet<String> not_found,
return UniProtEntry.createInstanceFromPlainText( lines );
}
+ /**
+  * Convenience overload: obtains the UniProt entry for {@code query} using
+  * DEFAULT_LINES_TO_RETURN as the line limit.
+  *
+  * @param query the query string handed to the two-argument overload
+  * @return the entry produced by the two-argument overload
+  * @throws IOException propagated from the two-argument overload
+  */
+ public static SequenceDatabaseEntry obtainUniProtEntry( final String query ) throws IOException {
+     final int line_limit = DEFAULT_LINES_TO_RETURN;
+     return obtainUniProtEntry( query, line_limit );
+ }
+
public static List<String> queryDb( final String query, int max_lines_to_return, final String base_url )
throws IOException {
if ( ForesterUtil.isEmpty( query ) ) {
public static List<String> queryEmblDb( final Accession id, final int max_lines_to_return ) throws IOException { // picks a base URL from the accession's source, then delegates to queryDb
final StringBuilder url_sb = new StringBuilder();
// url_sb.append( BASE_EMBL_DB_URL );
- if ( ForesterUtil.isEmpty( id.getSource() ) || ( id.getSource().equals( Source.NCBI.toString() ) ) ) {
- url_sb.append( EMBL_DBS_EMBL );
- url_sb.append( '/' );
+ if ( id.getSource().equals( Source.NCBI.toString() ) ) { // NOTE(review): the removed line tolerated an empty source; if getSource() can be null this now throws NullPointerException before the IllegalArgumentException branch is reached - confirm
+ url_sb.append( EMBL_GENBANK ); // NCBI accessions are fetched from the GENBANK dbfetch URL
+ //url_sb.append( '/' );
}
else if ( id.getSource().equals( Source.REFSEQ.toString() ) ) {
url_sb.append( EMBL_REFSEQ ); // RefSeq accessions are fetched from the REFSEQ dbfetch URL
// url_sb.append( '/' );
// }
}
+ else {
+ throw new IllegalArgumentException( "unable to handle source: " + id.getSource() ); // any other source is rejected
+ }
return queryDb( id.getValue(), max_lines_to_return, url_sb.toString() ); // the accession value is the query; the chosen URL is the base
}
import org.forester.go.GoNameSpace;
import org.forester.go.GoTerm;
import org.forester.phylogeny.data.Accession;
+import org.forester.phylogeny.data.Annotation;
import org.forester.util.ForesterUtil;
public final class UniProtEntry implements SequenceDatabaseEntry {
if ( _cross_references == null ) {
_cross_references = new ArrayList<Accession>();
}
- System.out.println( "XREF ADDED: " + accession );
_cross_references.add( accession );
}
}
return e;
}
+
+ @Override
+ public List<Annotation> getAnnotations() {
+ return null; // this implementation never supplies annotations; callers receive null, not an empty list
+ }
}