1 package org.forester.ws.uniprot;
3 import java.util.regex.Matcher;
4 import java.util.regex.Pattern;
7 public class DatabaseTools {
8 //The format for GenBank Accession numbers are:
9 //Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals
10 //Protein: 3 letters + 5 numerals
11 //http://www.ncbi.nlm.nih.gov/Sequin/acc.html
13 private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_1 = Pattern
14 .compile( "^.*[^a-zA-Z0-9]?([A-Z]\\d{5})[^a-zA-Z0-9]?" );
16 private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_2 = Pattern
17 .compile( "^.*[^a-zA-Z0-9]?([A-Z]{2}\\d{6})[^a-zA-Z0-9]?" );
19 private final static Pattern GENBANK_PROTEIN_AC_PATTERN = Pattern
20 .compile( "^.*[^a-zA-Z0-9]?([A-Z]{3}\\d{5})[^a-zA-Z0-9]?" );
24 private final static boolean DEBUG = false;
27 * Returns null if no match.
33 static public String parseGenbankAccessor( final String query ) {
34 Matcher m = GENBANK_NUCLEOTIDE_AC_PATTERN_1.matcher( query );
35 if ( m.lookingAt() ) {
39 m = GENBANK_NUCLEOTIDE_AC_PATTERN_2.matcher( query );
40 if ( m.lookingAt() ) {
44 m = GENBANK_PROTEIN_AC_PATTERN.matcher( query );
45 if ( m.lookingAt() ) {
55 static String extract( final String target, final String a, final String b ) {
56 final int i_a = target.indexOf( a );
57 final int i_b = target.indexOf( b );
58 if ( ( i_a < 0 ) || ( i_b < i_a ) ) {
59 throw new IllegalArgumentException( "attempt to extract from [" + target + "] between [" + a + "] and ["
62 return target.substring( i_a + a.length(), i_b ).trim();
67 static String extract( final String target, final String a ) {
68 final int i_a = target.indexOf( a );
69 return target.substring( i_a + a.length() ).trim();