// FORESTER -- software libraries and applications
// for evolutionary biology research and applications.
//
// Copyright (C) 2008-2009 Christian M. Zmasek
// Copyright (C) 2008-2009 Burnham Institute for Medical Research
// Copyright (C) 2000-2001 Washington University School of Medicine
// and Howard Hughes Medical Institute
// Copyright (C) 2003-2007 Ethalinda K.S. Cannon
// All rights reserved
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
//
// Contact: phylosoft @ gmail . com
// WWW: www.phylosoft.org/forester
\r
29 package org.forester.util;
\r
31 import java.util.regex.Matcher;
\r
32 import java.util.regex.Pattern;
\r
34 import org.forester.phylogeny.data.Identifier;
\r
36 public final class SequenceIdParser {
\r
38 // gb_ADF31344_1_segmented_worms_
\r
40 // gb_EHB07727_1_rodents_
\r
41 // dbj_BAF37827_1_turtles_
\r
42 // emb_CAA73223_1_primates_
\r
43 // lcl_91970_unknown_
\r
44 // mites|ref_XP_002434188_1
\r
45 // ref_XP_002434188_1_mites___ticks_
\r
46 // ref_NP_001121530_1_frogs___toads_
\r
47 //The format for GenBank Accession numbers are:
\r
48 //Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals
\r
49 //Protein: 3 letters + 5 numerals
\r
50 //http://www.ncbi.nlm.nih.gov/Sequin/acc.html
\r
51 private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_1 = Pattern
\r
52 .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d{5})(?:[^a-zA-Z0-9]|\\Z)" );
\r
53 private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_2 = Pattern
\r
54 .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6})(?:[^a-zA-Z0-9]|\\Z)" );
\r
55 private final static Pattern GENBANK_PROTEIN_AC_PATTERN = Pattern
\r
56 .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5})(?:[^a-zA-Z0-9]|\\Z)" );
\r
57 // RefSeq accession numbers can be distinguished from GenBank accessions
\r
58 // by their distinct prefix format of 2 characters followed by an
\r
59 // underscore character ('_'). For example, a RefSeq protein accession is NP_015325.
\r
60 private final static Pattern REFSEQ_PATTERN = Pattern
\r
61 .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}_\\d{6,})(?:[^a-zA-Z0-9]|\\Z)" );
\r
64 * Returns null if no match.
\r
67 public final static Identifier parse( final String s ) {
\r
68 String v = parseGenbankAccessor( s );
\r
69 if ( !ForesterUtil.isEmpty( v ) ) {
\r
70 return new Identifier( v, Identifier.NCBI );
\r
72 v = parseRefSeqAccessor( s );
\r
73 if ( !ForesterUtil.isEmpty( v ) ) {
\r
74 return new Identifier( v, Identifier.REFSEQ );
\r
79 public static boolean isProtein( final String query ) {
\r
80 return GENBANK_PROTEIN_AC_PATTERN.matcher( query ).lookingAt();
\r
84 * Returns null if no match.
\r
87 public static String parseGenbankAccessor( final String query ) {
\r
88 Matcher m = GENBANK_NUCLEOTIDE_AC_PATTERN_1.matcher( query );
\r
89 if ( m.lookingAt() ) {
\r
90 return m.group( 1 );
\r
93 m = GENBANK_NUCLEOTIDE_AC_PATTERN_2.matcher( query );
\r
94 if ( m.lookingAt() ) {
\r
95 return m.group( 1 );
\r
98 m = GENBANK_PROTEIN_AC_PATTERN.matcher( query );
\r
99 if ( m.lookingAt() ) {
\r
100 return m.group( 1 );
\r
110 * Returns null if no match.
\r
113 private final static String parseRefSeqAccessor( final String query ) {
\r
114 final Matcher m = REFSEQ_PATTERN.matcher( query );
\r
115 if ( m.lookingAt() ) {
\r
116 return m.group( 1 );
\r
121 private SequenceIdParser() {
\r
122 // Hiding the constructor.
\r