2 // FORESTER -- software libraries and applications
\r
3 // for evolutionary biology research and applications.
\r
5 // Copyright (C) 2008-2009 Christian M. Zmasek
\r
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
\r
7 // Copyright (C) 2000-2001 Washington University School of Medicine
\r
8 // and Howard Hughes Medical Institute
\r
9 // Copyright (C) 2003-2007 Ethalinda K.S. Cannon
\r
10 // All rights reserved
\r
12 // This library is free software; you can redistribute it and/or
\r
13 // modify it under the terms of the GNU Lesser General Public
\r
14 // License as published by the Free Software Foundation; either
\r
15 // version 2.1 of the License, or (at your option) any later version.
\r
17 // This library is distributed in the hope that it will be useful,
\r
18 // but WITHOUT ANY WARRANTY; without even the implied warranty of
\r
19 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
\r
20 // Lesser General Public License for more details.
\r
22 // You should have received a copy of the GNU Lesser General Public
\r
23 // License along with this library; if not, write to the Free Software
\r
24 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
\r
26 // Contact: phylosoft @ gmail . com
\r
27 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
\r
29 package org.forester.util;
\r
31 import java.util.regex.Matcher;
\r
32 import java.util.regex.Pattern;
\r
34 import org.forester.phylogeny.PhylogenyNode;
\r
35 import org.forester.phylogeny.data.Accession;
\r
36 import org.forester.phylogeny.data.Accession.Source;
\r
37 import org.forester.phylogeny.data.Sequence;
\r
39 public final class SequenceAccessionTools {
\r
41 //The format for GenBank Accession numbers are:
\r
42 //Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals
\r
43 //Protein: 3 letters + 5 numerals
\r
44 //http://www.ncbi.nlm.nih.gov/Sequin/acc.html
\r
45 public final static Pattern GENBANK_NUC_PATTERN_1 = Pattern
\r
46 .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );
\r
47 public final static Pattern GENBANK_NUC_PATTERN_2 = Pattern
\r
48 .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );
\r
49 public final static Pattern GENBANK_PROT_PATTERN = Pattern
\r
50 .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );
\r
51 public final static Pattern GI_PATTERN = Pattern
\r
52 .compile( "(?:\\b|_)(?:GI|gi)[|_=:](\\d+)(?:\\b|_)" );
\r
53 public final static String UNIPROT_KB_BASE_PATTERN_STR = "((?:[OPQ][0-9][A-Z0-9]{3}[0-9])|(?:[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}))";
\r
54 public final static Pattern UNIPROT_KB_PATTERN_0 = Pattern.compile( "(?:\\b|_)"
\r
55 + UNIPROT_KB_BASE_PATTERN_STR + "(?:\\b|_)" );
\r
56 public final static Pattern UNIPROT_KB_PATTERN_1 = Pattern.compile( "(?:\\b|_)(?:sp|tr)[\\.|\\-_=/\\\\]"
\r
57 + UNIPROT_KB_BASE_PATTERN_STR + "(?:\\b|_)" );
\r
58 public final static Pattern UNIPROT_KB_PATTERN_2 = Pattern
\r
59 .compile( "(?:\\b|_)(?:[A-Z0-9]{2,5}|"
\r
60 + UNIPROT_KB_BASE_PATTERN_STR
\r
61 + ")_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA)(?:\\b|_)" );
\r
62 public final static Pattern ENSEMBL_PATTERN = Pattern.compile( "(?:\\b|_)(ENS[A-Z]*[0-9]+)(?:\\b|_)" );
\r
63 // RefSeq accession numbers can be distinguished from GenBank accessions
\r
64 // by their distinct prefix format of 2 characters followed by an
\r
65 // underscore character ('_'). For example, a RefSeq protein accession is NP_015325.
\r
66 private final static Pattern REFSEQ_PATTERN = Pattern
\r
67 .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}_\\d{6,})(?:[^a-zA-Z0-9]|\\Z)" );
\r
69 private SequenceAccessionTools() {
\r
70 // Hiding the constructor.
\r
73 public final static boolean isProteinDbQuery( final String query ) {
\r
74 final String r1 = parseRefSeqAccessorFromString( query );
\r
75 if ( !ForesterUtil.isEmpty( r1 ) && ( r1.charAt( 1 ) == 'P' ) ) {
\r
78 final String r2 = parseUniProtAccessorFromString( query );
\r
79 if ( !ForesterUtil.isEmpty( r2 ) ) {
\r
82 return GENBANK_PROT_PATTERN.matcher( query ).lookingAt();
\r
85 public final static Accession obtainAccessorFromDataFields( final PhylogenyNode n ) {
\r
86 String a = obtainUniProtAccessorFromDataFields( n );
\r
87 if ( !ForesterUtil.isEmpty( a ) ) {
\r
88 return new Accession( a, Source.UNIPROT );
\r
90 a = obtainGenbankAccessorFromDataFields( n );
\r
91 if ( !ForesterUtil.isEmpty( a ) ) {
\r
92 return new Accession( a, Source.NCBI );
\r
94 a = obtainRefSeqAccessorFromDataFields( n );
\r
95 if ( !ForesterUtil.isEmpty( a ) ) {
\r
96 return new Accession( a, Source.REFSEQ );
\r
98 a = obtainGiNumberFromDataFields( n );
\r
99 if ( !ForesterUtil.isEmpty( a ) ) {
\r
100 return new Accession( a, Source.GI );
\r
105 public final static Accession obtainFromSeqAccession( final PhylogenyNode n ) {
\r
106 if ( n.getNodeData().isHasSequence() && ( n.getNodeData().getSequence().getAccession() != null )
\r
107 && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getAccession().getSource() )
\r
108 && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getAccession().getValue() ) ) {
\r
109 final String source = n.getNodeData().getSequence().getAccession().getSource().toLowerCase();
\r
110 final String value = n.getNodeData().getSequence().getAccession().getValue();
\r
111 if ( ( source.startsWith( "uniprot" ) || source.equals( "swissprot" ) || source.equals( "trembl" ) || source
\r
112 .equals( "sp" ) ) ) {
\r
113 return new Accession( value, Source.UNIPROT );
\r
115 else if ( source.equals( "embl" ) || source.equals( "ebi" ) ) {
\r
116 return new Accession( value, Source.EMBL );
\r
118 else if ( source.equals( "ncbi" ) || source.equals( "genbank" ) ) {
\r
119 return new Accession( value, Source.NCBI );
\r
121 else if ( source.equals( "refseq" ) ) {
\r
122 return new Accession( value, Source.REFSEQ );
\r
124 else if ( source.equals( "gi" ) ) {
\r
125 return new Accession( value, Source.GI );
\r
131 public final static String obtainGenbankAccessorFromDataFields( final PhylogenyNode n ) {
\r
133 if ( n.getNodeData().isHasSequence() ) {
\r
134 final Sequence seq = n.getNodeData().getSequence();
\r
135 if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) {
\r
136 a = parseGenbankAccessorFromString( seq.getSymbol() );
\r
138 if ( !ForesterUtil.isEmpty( seq.getGeneName() ) ) {
\r
139 a = parseGenbankAccessorFromString( seq.getGeneName() );
\r
141 if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( seq.getName() ) ) {
\r
142 a = parseGenbankAccessorFromString( seq.getName() );
\r
144 if ( ForesterUtil.isEmpty( a ) && ( n.getNodeData().getSequence().getAccession() != null )
\r
145 && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) {
\r
146 a = parseGenbankAccessorFromString( seq.getAccession().getValue() );
\r
149 if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( n.getName() ) ) {
\r
150 a = parseGenbankAccessorFromString( n.getName() );
\r
155 public final static String obtainGiNumberFromDataFields( final PhylogenyNode n ) {
\r
157 if ( n.getNodeData().isHasSequence() ) {
\r
158 final Sequence seq = n.getNodeData().getSequence();
\r
159 if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( seq.getName() ) ) {
\r
160 a = parseGInumberFromString( seq.getName() );
\r
162 if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( seq.getGeneName() ) ) {
\r
163 a = parseGInumberFromString( seq.getGeneName() );
\r
165 if ( ForesterUtil.isEmpty( a ) && ( n.getNodeData().getSequence().getAccession() != null )
\r
166 && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) {
\r
167 a = parseGInumberFromString( seq.getAccession().getValue() );
\r
170 if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( n.getName() ) ) {
\r
171 a = parseGInumberFromString( n.getName() );
\r
176 public final static String obtainRefSeqAccessorFromDataFields( final PhylogenyNode n ) {
\r
178 if ( n.getNodeData().isHasSequence() ) {
\r
179 final Sequence seq = n.getNodeData().getSequence();
\r
180 if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) {
\r
181 a = parseRefSeqAccessorFromString( seq.getSymbol() );
\r
183 if ( !ForesterUtil.isEmpty( seq.getGeneName() ) ) {
\r
184 a = parseRefSeqAccessorFromString( seq.getGeneName() );
\r
186 if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( seq.getName() ) ) {
\r
187 a = parseRefSeqAccessorFromString( seq.getName() );
\r
189 if ( ForesterUtil.isEmpty( a ) && ( n.getNodeData().getSequence().getAccession() != null )
\r
190 && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) {
\r
191 a = parseRefSeqAccessorFromString( seq.getAccession().getValue() );
\r
194 if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( n.getName() ) ) {
\r
195 a = parseRefSeqAccessorFromString( n.getName() );
\r
200 public final static String obtainUniProtAccessorFromDataFields( final PhylogenyNode n ) {
\r
202 if ( n.getNodeData().isHasSequence() ) {
\r
203 final Sequence seq = n.getNodeData().getSequence();
\r
204 if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) {
\r
205 a = SequenceAccessionTools.parseUniProtAccessorFromString( seq.getSymbol() );
\r
207 if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( seq.getName() ) ) {
\r
208 a = SequenceAccessionTools.parseUniProtAccessorFromString( seq.getName() );
\r
210 if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( seq.getGeneName() ) ) {
\r
211 a = SequenceAccessionTools.parseUniProtAccessorFromString( seq.getGeneName() );
\r
213 if ( ForesterUtil.isEmpty( a ) && ( n.getNodeData().getSequence().getAccession() != null )
\r
214 && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) {
\r
215 a = SequenceAccessionTools.parseUniProtAccessorFromString( seq.getAccession().getValue() );
\r
218 if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( n.getName() ) ) {
\r
219 a = SequenceAccessionTools.parseUniProtAccessorFromString( n.getName() );
\r
224 public final static Accession parseAccessorFromString( final String s ) {
\r
225 if ( !ForesterUtil.isEmpty( s ) ) {
\r
226 String v = parseUniProtAccessorFromString( s );
\r
227 if ( !ForesterUtil.isEmpty( v ) ) {
\r
228 return new Accession( v, Source.UNIPROT );
\r
230 v = parseGenbankAccessorFromString( s );
\r
231 if ( !ForesterUtil.isEmpty( v ) ) {
\r
232 return new Accession( v, Source.NCBI );
\r
234 v = parseRefSeqAccessorFromString( s );
\r
235 if ( !ForesterUtil.isEmpty( v ) ) {
\r
236 return new Accession( v, Source.REFSEQ );
\r
238 v = parseGInumberFromString( s );
\r
239 if ( !ForesterUtil.isEmpty( v ) ) {
\r
240 return new Accession( v, Source.GI );
\r
242 v = parseEnsemlAccessorFromString( s );
\r
243 if ( !ForesterUtil.isEmpty( v ) ) {
\r
244 return new Accession( v, Source.ENSEMBL );
\r
250 public final static String parseGenbankAccessorFromString( final String s ) {
\r
251 Matcher m = GENBANK_NUC_PATTERN_1.matcher( s );
\r
252 if ( m.lookingAt() ) {
\r
253 return m.group( 1 );
\r
256 m = GENBANK_NUC_PATTERN_2.matcher( s );
\r
257 if ( m.lookingAt() ) {
\r
258 return m.group( 1 );
\r
261 m = GENBANK_PROT_PATTERN.matcher( s );
\r
262 if ( m.lookingAt() ) {
\r
263 return m.group( 1 );
\r
272 public final static String parseGenbankProteinAccessorFromString( final String s ) {
\r
273 final Matcher m = GENBANK_PROT_PATTERN.matcher( s );
\r
274 if ( m.lookingAt() ) {
\r
275 return m.group( 1 );
\r
282 public final static String parseGInumberFromString( final String s ) {
\r
283 final Matcher m = GI_PATTERN.matcher( s );
\r
285 return m.group( 1 );
\r
290 public final static String parseEnsemlAccessorFromString( final String s ) {
\r
291 final Matcher m = ENSEMBL_PATTERN.matcher( s );
\r
293 return m.group( 1 );
\r
298 public final static String parseRefSeqAccessorFromString( final String s ) {
\r
299 final Matcher m = REFSEQ_PATTERN.matcher( s );
\r
300 if ( m.lookingAt() ) {
\r
301 return m.group( 1 );
\r
306 public final static String parseUniProtAccessorFromString( final String s ) {
\r
307 Matcher m = UNIPROT_KB_PATTERN_1.matcher( s );
\r
309 return m.group( 1 );
\r
311 m = UNIPROT_KB_PATTERN_2.matcher( s );
\r
315 m = UNIPROT_KB_PATTERN_0.matcher( s );
\r
317 return m.group( 1 );
\r