2 package org.forester.ws.uniprot;
4 import java.util.regex.Matcher;
5 import java.util.regex.Pattern;
/**
 * Static helpers for recognising GenBank accession numbers in free text and
 * for cutting delimited fields out of database entry lines.
 */
public class DatabaseTools {

    // GenBank accession number formats
    // (see http://www.ncbi.nlm.nih.gov/Sequin/acc.html):
    //   Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals
    //   Protein:    3 letters + 5 numerals
    //
    // Every pattern captures the accession in group 1. The accession must be
    // preceded by the start of the input or a non-alphanumeric character and
    // followed by a non-alphanumeric character or the end of the input, so
    // that it is never picked out of the middle of a longer identifier.
    private static final Pattern   GENBANK_NUCLEOTIDE_AC_PATTERN_1 = Pattern
                                                                           .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d{5})(?:[^a-zA-Z0-9]|\\Z)" );
    private static final Pattern   GENBANK_NUCLEOTIDE_AC_PATTERN_2 = Pattern
                                                                           .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6})(?:[^a-zA-Z0-9]|\\Z)" );
    private static final Pattern   GENBANK_PROTEIN_AC_PATTERN      = Pattern
                                                                           .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5})(?:[^a-zA-Z0-9]|\\Z)" );
    // The order in which parseGenbankAccessor tries the patterns.
    private static final Pattern[] GENBANK_AC_PATTERNS             = { GENBANK_NUCLEOTIDE_AC_PATTERN_1,
            GENBANK_NUCLEOTIDE_AC_PATTERN_2, GENBANK_PROTEIN_AC_PATTERN };
    private static final boolean   DEBUG                           = false;

    /**
     * Looks for a GenBank accession number in the given text.
     * <p>
     * The 1+5 nucleotide format is tried first, then the 2+6 nucleotide
     * format, then the 3+5 protein format; the first format that matches
     * determines the result.
     *
     * @param query the text to search; may be null
     * @return the accession number found, or null if query is null or
     *         contains no match
     */
    public static String parseGenbankAccessor( final String query ) {
        if ( query == null ) {
            // Nothing to search: report "no match" instead of failing with a NullPointerException.
            return null;
        }
        for( final Pattern pattern : GENBANK_AC_PATTERNS ) {
            final Matcher m = pattern.matcher( query );
            if ( m.lookingAt() ) {
                return m.group( 1 );
            }
        }
        return null;
    }

    /**
     * Returns the trimmed text located between the first occurrence of a and
     * the first occurrence of b that follows it.
     *
     * @param target the text to extract from
     * @param a the start marker
     * @param b the end marker
     * @return the trimmed text between the two markers
     * @throws IllegalArgumentException if a does not occur in target, or b
     *         does not occur after a
     */
    static String extract( final String target, final String a, final String b ) {
        final int i_a = target.indexOf( a );
        final int start = i_a + a.length();
        // Look for the end marker only behind the start marker, so that an earlier
        // (or overlapping) occurrence of b cannot hide the one that terminates the field.
        final int i_b = ( i_a < 0 ) ? -1 : target.indexOf( b, start );
        if ( ( i_a < 0 ) || ( i_b < 0 ) ) {
            throw new IllegalArgumentException( "attempt to extract from [" + target + "] between [" + a + "] and ["
                    + b + "]" );
        }
        return target.substring( start, i_b ).trim();
    }

    /**
     * Returns the trimmed text that follows the first occurrence of a.
     *
     * @param target the text to extract from
     * @param a the start marker
     * @return the trimmed remainder of target after the marker
     * @throws IllegalArgumentException if a does not occur in target
     */
    static String extract( final String target, final String a ) {
        final int i_a = target.indexOf( a );
        if ( i_a < 0 ) {
            // Without this check a missing marker would silently yield an arbitrary tail of target.
            throw new IllegalArgumentException( "attempt to extract from [" + target + "] after [" + a + "]" );
        }
        return target.substring( i_a + a.length() ).trim();
    }
}