+
package org.forester.ws.uniprot;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-
public class DatabaseTools {
+
//The format for GenBank Accession numbers are:
//Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals
//Protein: 3 letters + 5 numerals
//http://www.ncbi.nlm.nih.gov/Sequin/acc.html
-
private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_1 = Pattern
- .compile( "^.*[^a-zA-Z0-9]?([A-Z]\\d{5})[^a-zA-Z0-9]?" );
-
+ .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d{5})(?:[^a-zA-Z0-9]|\\Z)" );
private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_2 = Pattern
- .compile( "^.*[^a-zA-Z0-9]?([A-Z]{2}\\d{6})[^a-zA-Z0-9]?" );
-
- private final static Pattern GENBANK_PROTEIN_AC_PATTERN = Pattern
- .compile( "^.*[^a-zA-Z0-9]?([A-Z]{3}\\d{5})[^a-zA-Z0-9]?" );
-
-
-
- private final static boolean DEBUG = false;
+ .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6})(?:[^a-zA-Z0-9]|\\Z)" );
+ private final static Pattern GENBANK_PROTEIN_AC_PATTERN = Pattern
+ .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5})(?:[^a-zA-Z0-9]|\\Z)" );
+ private final static boolean DEBUG = false;
/**
* Returns null if no match.
return m.group( 1 );
}
else {
- m = GENBANK_NUCLEOTIDE_AC_PATTERN_2.matcher( query );
+ m = GENBANK_NUCLEOTIDE_AC_PATTERN_2.matcher( query );
if ( m.lookingAt() ) {
return m.group( 1 );
- }
+ }
else {
m = GENBANK_PROTEIN_AC_PATTERN.matcher( query );
if ( m.lookingAt() ) {
return target.substring( i_a + a.length(), i_b ).trim();
}
-
-
static String extract( final String target, final String a ) {
final int i_a = target.indexOf( a );
return target.substring( i_a + a.length() ).trim();
}
-
}